Diffstat (limited to 'sys/kern')
-rw-r--r--  sys/kern/Make.tags.inc | 97
-rw-r--r--  sys/kern/Makefile | 21
-rw-r--r--  sys/kern/bus_if.m | 672
-rw-r--r--  sys/kern/capabilities.conf | 754
-rw-r--r--  sys/kern/clock_if.m | 45
-rw-r--r--  sys/kern/cpufreq_if.m | 100
-rw-r--r--  sys/kern/device_if.m | 318
-rw-r--r--  sys/kern/dtio_kdtrace.c | 232
-rw-r--r--  sys/kern/genassym.sh | 69
-rw-r--r--  sys/kern/imgact_aout.c | 343
-rw-r--r--  sys/kern/imgact_elf.c | 2135
-rw-r--r--  sys/kern/imgact_elf32.c | 31
-rw-r--r--  sys/kern/imgact_elf64.c | 31
-rw-r--r--  sys/kern/imgact_gzip.c | 393
-rw-r--r--  sys/kern/imgact_shell.c | 258
-rw-r--r--  sys/kern/inflate.c | 1077
-rw-r--r--  sys/kern/init_main.c | 855
-rw-r--r--  sys/kern/init_sysent.c | 581
-rw-r--r--  sys/kern/kern_acct.c | 647
-rw-r--r--  sys/kern/kern_alq.c | 971
-rw-r--r--  sys/kern/kern_clock.c | 895
-rw-r--r--  sys/kern/kern_clocksource.c | 949
-rw-r--r--  sys/kern/kern_condvar.c | 456
-rw-r--r--  sys/kern/kern_conf.c | 1459
-rw-r--r--  sys/kern/kern_cons.c | 643
-rw-r--r--  sys/kern/kern_context.c | 129
-rw-r--r--  sys/kern/kern_cpu.c | 1063
-rw-r--r--  sys/kern/kern_cpuset.c | 1166
-rw-r--r--  sys/kern/kern_ctf.c | 340
-rw-r--r--  sys/kern/kern_descrip.c | 4016
-rw-r--r--  sys/kern/kern_dtrace.c | 117
-rw-r--r--  sys/kern/kern_environment.c | 626
-rw-r--r--  sys/kern/kern_et.c | 246
-rw-r--r--  sys/kern/kern_event.c | 2261
-rw-r--r--  sys/kern/kern_exec.c | 1496
-rw-r--r--  sys/kern/kern_exit.c | 1261
-rw-r--r--  sys/kern/kern_fail.c | 611
-rw-r--r--  sys/kern/kern_ffclock.c | 479
-rw-r--r--  sys/kern/kern_fork.c | 1052
-rw-r--r--  sys/kern/kern_gzio.c | 400
-rw-r--r--  sys/kern/kern_hhook.c | 521
-rw-r--r--  sys/kern/kern_idle.c | 86
-rw-r--r--  sys/kern/kern_intr.c | 1943
-rw-r--r--  sys/kern/kern_jail.c | 4677
-rw-r--r--  sys/kern/kern_khelp.c | 372
-rw-r--r--  sys/kern/kern_kthread.c | 466
-rw-r--r--  sys/kern/kern_ktr.c | 495
-rw-r--r--  sys/kern/kern_ktrace.c | 1269
-rw-r--r--  sys/kern/kern_linker.c | 2162
-rw-r--r--  sys/kern/kern_lock.c | 1505
-rw-r--r--  sys/kern/kern_lockf.c | 2545
-rw-r--r--  sys/kern/kern_lockstat.c | 64
-rw-r--r--  sys/kern/kern_loginclass.c | 238
-rw-r--r--  sys/kern/kern_malloc.c | 1100
-rw-r--r--  sys/kern/kern_mbuf.c | 694
-rw-r--r--  sys/kern/kern_mib.c | 542
-rw-r--r--  sys/kern/kern_module.c | 523
-rw-r--r--  sys/kern/kern_mtxpool.c | 218
-rw-r--r--  sys/kern/kern_mutex.c | 1009
-rw-r--r--  sys/kern/kern_ntptime.c | 1055
-rw-r--r--  sys/kern/kern_osd.c | 403
-rw-r--r--  sys/kern/kern_physio.c | 170
-rw-r--r--  sys/kern/kern_pmc.c | 345
-rw-r--r--  sys/kern/kern_poll.c | 567
-rw-r--r--  sys/kern/kern_priv.c | 185
-rw-r--r--  sys/kern/kern_proc.c | 2740
-rw-r--r--  sys/kern/kern_prot.c | 2222
-rw-r--r--  sys/kern/kern_racct.c | 1291
-rw-r--r--  sys/kern/kern_rangelock.c | 248
-rw-r--r--  sys/kern/kern_rctl.c | 1870
-rw-r--r--  sys/kern/kern_resource.c | 1434
-rw-r--r--  sys/kern/kern_rmlock.c | 831
-rw-r--r--  sys/kern/kern_rwlock.c | 1232
-rw-r--r--  sys/kern/kern_sdt.c | 51
-rw-r--r--  sys/kern/kern_sema.c | 176
-rw-r--r--  sys/kern/kern_sharedpage.c | 239
-rw-r--r--  sys/kern/kern_shutdown.c | 893
-rw-r--r--  sys/kern/kern_sig.c | 3469
-rw-r--r--  sys/kern/kern_switch.c | 513
-rw-r--r--  sys/kern/kern_sx.c | 1214
-rw-r--r--  sys/kern/kern_synch.c | 632
-rw-r--r--  sys/kern/kern_syscalls.c | 220
-rw-r--r--  sys/kern/kern_sysctl.c | 1656
-rw-r--r--  sys/kern/kern_tc.c | 2030
-rw-r--r--  sys/kern/kern_thr.c | 555
-rw-r--r--  sys/kern/kern_thread.c | 1054
-rw-r--r--  sys/kern/kern_time.c | 1648
-rw-r--r--  sys/kern/kern_timeout.c | 1433
-rw-r--r--  sys/kern/kern_umtx.c | 3918
-rw-r--r--  sys/kern/kern_uuid.c | 426
-rw-r--r--  sys/kern/kern_xxx.c | 471
-rw-r--r--  sys/kern/ksched.c | 292
-rw-r--r--  sys/kern/link_elf.c | 1605
-rw-r--r--  sys/kern/link_elf_obj.c | 1375
-rw-r--r--  sys/kern/linker_if.m | 145
-rw-r--r--  sys/kern/makesyscalls.sh | 653
-rw-r--r--  sys/kern/md4c.c | 288
-rw-r--r--  sys/kern/md5c.c | 340
-rw-r--r--  sys/kern/p1003_1b.c | 315
-rw-r--r--  sys/kern/posix4_mib.c | 183
-rw-r--r--  sys/kern/sched_4bsd.c | 1784
-rw-r--r--  sys/kern/sched_ule.c | 2911
-rw-r--r--  sys/kern/serdev_if.m | 94
-rw-r--r--  sys/kern/stack_protector.c | 31
-rw-r--r--  sys/kern/subr_acl_nfs4.c | 1417
-rw-r--r--  sys/kern/subr_acl_posix1e.c | 691
-rw-r--r--  sys/kern/subr_autoconf.c | 230
-rw-r--r--  sys/kern/subr_blist.c | 1095
-rw-r--r--  sys/kern/subr_bufring.c | 65
-rw-r--r--  sys/kern/subr_bus.c | 4885
-rw-r--r--  sys/kern/subr_bus_dma.c | 533
-rw-r--r--  sys/kern/subr_busdma_bufalloc.c | 174
-rw-r--r--  sys/kern/subr_capability.c | 298
-rw-r--r--  sys/kern/subr_clock.c | 225
-rw-r--r--  sys/kern/subr_counter.c | 107
-rw-r--r--  sys/kern/subr_devstat.c | 604
-rw-r--r--  sys/kern/subr_disk.c | 267
-rw-r--r--  sys/kern/subr_dummy_vdso_tc.c | 49
-rw-r--r--  sys/kern/subr_eventhandler.c | 280
-rw-r--r--  sys/kern/subr_fattime.c | 307
-rw-r--r--  sys/kern/subr_firmware.c | 537
-rw-r--r--  sys/kern/subr_hash.c | 128
-rw-r--r--  sys/kern/subr_hints.c | 463
-rw-r--r--  sys/kern/subr_kdb.c | 675
-rw-r--r--  sys/kern/subr_kobj.c | 348
-rw-r--r--  sys/kern/subr_lock.c | 649
-rw-r--r--  sys/kern/subr_log.c | 310
-rw-r--r--  sys/kern/subr_mbpool.c | 402
-rw-r--r--  sys/kern/subr_mchain.c | 554
-rw-r--r--  sys/kern/subr_module.c | 290
-rw-r--r--  sys/kern/subr_msgbuf.c | 418
-rw-r--r--  sys/kern/subr_param.c | 354
-rw-r--r--  sys/kern/subr_pcpu.c | 394
-rw-r--r--  sys/kern/subr_pctrie.c | 705
-rw-r--r--  sys/kern/subr_power.c | 122
-rw-r--r--  sys/kern/subr_prf.c | 1140
-rw-r--r--  sys/kern/subr_prof.c | 589
-rw-r--r--  sys/kern/subr_rman.c | 1160
-rw-r--r--  sys/kern/subr_rtc.c | 178
-rw-r--r--  sys/kern/subr_sbuf.c | 831
-rw-r--r--  sys/kern/subr_scanf.c | 641
-rw-r--r--  sys/kern/subr_sglist.c | 714
-rw-r--r--  sys/kern/subr_sleepqueue.c | 1236
-rw-r--r--  sys/kern/subr_smp.c | 787
-rw-r--r--  sys/kern/subr_stack.c | 277
-rw-r--r--  sys/kern/subr_syscall.c | 235
-rw-r--r--  sys/kern/subr_taskqueue.c | 634
-rw-r--r--  sys/kern/subr_trap.c | 303
-rw-r--r--  sys/kern/subr_turnstile.c | 1308
-rw-r--r--  sys/kern/subr_uio.c | 611
-rw-r--r--  sys/kern/subr_unit.c | 1015
-rw-r--r--  sys/kern/subr_vmem.c | 1487
-rw-r--r--  sys/kern/subr_witness.c | 2912
-rw-r--r--  sys/kern/sys_capability.c | 613
-rw-r--r--  sys/kern/sys_generic.c | 1815
-rw-r--r--  sys/kern/sys_pipe.c | 1834
-rw-r--r--  sys/kern/sys_procdesc.c | 535
-rw-r--r--  sys/kern/sys_process.c | 1242
-rw-r--r--  sys/kern/sys_socket.c | 297
-rw-r--r--  sys/kern/syscalls.c | 554
-rw-r--r--  sys/kern/syscalls.master | 982
-rw-r--r--  sys/kern/systrace_args.c | 10946
-rw-r--r--  sys/kern/sysv_ipc.c | 246
-rw-r--r--  sys/kern/sysv_msg.c | 1592
-rw-r--r--  sys/kern/sysv_sem.c | 1666
-rw-r--r--  sys/kern/sysv_shm.c | 1407
-rw-r--r--  sys/kern/tty.c | 2209
-rw-r--r--  sys/kern/tty_compat.c | 484
-rw-r--r--  sys/kern/tty_info.c | 313
-rw-r--r--  sys/kern/tty_inq.c | 489
-rw-r--r--  sys/kern/tty_outq.c | 339
-rw-r--r--  sys/kern/tty_pts.c | 858
-rw-r--r--  sys/kern/tty_tty.c | 94
-rw-r--r--  sys/kern/tty_ttydisc.c | 1268
-rw-r--r--  sys/kern/uipc_accf.c | 298
-rw-r--r--  sys/kern/uipc_cow.c | 182
-rw-r--r--  sys/kern/uipc_debug.c | 531
-rw-r--r--  sys/kern/uipc_domain.c | 523
-rw-r--r--  sys/kern/uipc_mbuf.c | 2182
-rw-r--r--  sys/kern/uipc_mbuf2.c | 453
-rw-r--r--  sys/kern/uipc_mqueue.c | 2883
-rw-r--r--  sys/kern/uipc_sem.c | 1111
-rw-r--r--  sys/kern/uipc_shm.c | 1033
-rw-r--r--  sys/kern/uipc_sockbuf.c | 1061
-rw-r--r--  sys/kern/uipc_socket.c | 3752
-rw-r--r--  sys/kern/uipc_syscalls.c | 2935
-rw-r--r--  sys/kern/uipc_usrreq.c | 2505
-rw-r--r--  sys/kern/vfs_acl.c | 562
-rw-r--r--  sys/kern/vfs_aio.c | 3069
-rw-r--r--  sys/kern/vfs_bio.c | 4602
-rw-r--r--  sys/kern/vfs_cache.c | 1486
-rw-r--r--  sys/kern/vfs_cluster.c | 1058
-rw-r--r--  sys/kern/vfs_default.c | 1269
-rw-r--r--  sys/kern/vfs_export.c | 493
-rw-r--r--  sys/kern/vfs_extattr.c | 765
-rw-r--r--  sys/kern/vfs_hash.c | 162
-rw-r--r--  sys/kern/vfs_init.c | 344
-rw-r--r--  sys/kern/vfs_lookup.c | 1254
-rw-r--r--  sys/kern/vfs_mount.c | 1949
-rw-r--r--  sys/kern/vfs_mountroot.c | 1041
-rw-r--r--  sys/kern/vfs_subr.c | 4775
-rw-r--r--  sys/kern/vfs_syscalls.c | 4729
-rw-r--r--  sys/kern/vfs_vnops.c | 2083
-rw-r--r--  sys/kern/vnode_if.src | 716
204 files changed, 208154 insertions, 0 deletions
diff --git a/sys/kern/Make.tags.inc b/sys/kern/Make.tags.inc
new file mode 100644
index 0000000..cb8a3ff
--- /dev/null
+++ b/sys/kern/Make.tags.inc
@@ -0,0 +1,97 @@
+# $FreeBSD$
+# @(#)Make.tags.inc 8.1 (Berkeley) 6/11/93
+
+SYS?= ${.CURDIR}/..
+
+# Common files for "make tags", included by the Makefile for each
+# architecture.
+
+# Put the /sys/sys include files at the end so that subroutine definitions
+# win when there is a struct tag with the same name (e.g., vmmeter). The
+# better solution would be for ctags to generate "struct vmmeter" tags.
+
+COMM= ${SYS}/dev/advansys/*.[ch] \
+ ${SYS}/dev/aha/*.[ch] \
+ ${SYS}/dev/aic7xxx/*.[ch] \
+ ${SYS}/dev/buslogic/*.[ch] \
+ ${SYS}/dev/dpt/*.[ch] \
+ ${SYS}/dev/en/*.[ch] \
+ ${SYS}/dev/iicbus/*.[ch] \
+ ${SYS}/dev/isp/*.[ch] \
+ ${SYS}/dev/pdq/*.[ch] \
+ ${SYS}/dev/ppbus/*.[ch] \
+ ${SYS}/dev/smbus/*.[ch] \
+ ${SYS}/dev/vx/*.[ch] \
+ ${SYS}/fs/cd9660/*.[ch] \
+ ${SYS}/fs/deadfs/*.[ch] \
+ ${SYS}/fs/devfs/*.[ch] \
+ ${SYS}/fs/fdescfs/*.[ch] \
+ ${SYS}/fs/fifofs/*.[ch] \
+ ${SYS}/fs/msdosfs/*.[ch] \
+ ${SYS}/fs/nullfs/*.[ch] \
+ ${SYS}/fs/procfs/*.[ch] \
+ ${SYS}/fs/smbfs/*.[ch] \
+ ${SYS}/fs/udf/*.[ch] \
+ ${SYS}/fs/unionfs/*.[ch] \
+ ${SYS}/geom/*.[ch] \
+ ${SYS}/kern/*.[ch] \
+ ${SYS}/net/*.[ch] \
+ ${SYS}/netatalk/*.[ch] \
+ ${SYS}/netinet/*.[ch] \
+ ${SYS}/netinet6/*.[ch] \
+ ${SYS}/netipsec/*.[ch] \
+ ${SYS}/netipx/*.[ch] \
+ ${SYS}/netnatm/*.[ch] \
+ ${SYS}/nfs/*.[ch] \
+ ${SYS}/nfsclient/*.[ch] \
+ ${SYS}/nfsserver/*.[ch] \
+ ${SYS}/pci/*.[ch] \
+ ${SYS}/ufs/ffs/*.[ch] \
+ ${SYS}/ufs/ufs/*.[ch] \
+ ${SYS}/vm/*.[ch] \
+ ${SYS}/sys/*.[ch]
+
+COMMDIR1= ${SYS}/conf \
+ ${SYS}/geom \
+ ${SYS}/kern \
+ ${SYS}/net \
+ ${SYS}/netatalk \
+ ${SYS}/netinet \
+ ${SYS}/netinet6 \
+ ${SYS}/netipsec \
+ ${SYS}/netipx \
+ ${SYS}/netnatm \
+ ${SYS}/nfs \
+ ${SYS}/pci \
+ ${SYS}/vm \
+ ${SYS}/sys
+
+COMMDIR2= ${SYS}/dev/advansys \
+ ${SYS}/dev/aha \
+ ${SYS}/dev/aic7xxx \
+ ${SYS}/dev/buslogic \
+ ${SYS}/dev/ccd \
+ ${SYS}/dev/dec \
+ ${SYS}/dev/dpt \
+ ${SYS}/dev/en \
+ ${SYS}/dev/hea \
+ ${SYS}/dev/hfa \
+ ${SYS}/dev/iicbus \
+ ${SYS}/dev/isp \
+ ${SYS}/dev/pdq \
+ ${SYS}/dev/ppbus \
+ ${SYS}/dev/smbus \
+ ${SYS}/dev/vn \
+ ${SYS}/dev/vx \
+ ${SYS}/fs/deadfs \
+ ${SYS}/fs/devfs \
+ ${SYS}/fs/fdescfs \
+ ${SYS}/fs/fifofs \
+ ${SYS}/fs/msdosfs \
+ ${SYS}/fs/nullfs \
+ ${SYS}/fs/procfs \
+ ${SYS}/fs/specfs \
+ ${SYS}/fs/unionfs \
+ ${SYS}/fs/cd9660 \
+ ${SYS}/ufs/ffs \
+ ${SYS}/ufs/ufs
diff --git a/sys/kern/Makefile b/sys/kern/Makefile
new file mode 100644
index 0000000..0721e82
--- /dev/null
+++ b/sys/kern/Makefile
@@ -0,0 +1,21 @@
+# @(#)Makefile 8.2 (Berkeley) 3/21/94
+# $FreeBSD$
+
+# Makefile for init_sysent
+
+all:
+ @echo "make sysent only"
+
+sysent: init_sysent.c syscalls.c ../sys/syscall.h ../sys/syscall.mk \
+../sys/sysproto.h
+
+init_sysent.c syscalls.c systrace_args.c ../sys/syscall.h \
+../sys/syscall.mk ../sys/sysproto.h: makesyscalls.sh syscalls.master \
+capabilities.conf
+ -mv -f init_sysent.c init_sysent.c.bak
+ -mv -f syscalls.c syscalls.c.bak
+ -mv -f systrace_args.c systrace_args.c.bak
+ -mv -f ../sys/syscall.h ../sys/syscall.h.bak
+ -mv -f ../sys/syscall.mk ../sys/syscall.mk.bak
+ -mv -f ../sys/sysproto.h ../sys/sysproto.h.bak
+ sh makesyscalls.sh syscalls.master
diff --git a/sys/kern/bus_if.m b/sys/kern/bus_if.m
new file mode 100644
index 0000000..b0ad611
--- /dev/null
+++ b/sys/kern/bus_if.m
@@ -0,0 +1,672 @@
+#-
+# Copyright (c) 1998-2004 Doug Rabson
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+
+/**
+ * @defgroup BUS bus - KObj methods for drivers of devices with children
+ * @brief A set of methods required by device drivers that support
+ * child devices.
+ * @{
+ */
+INTERFACE bus;
+
+#
+# Default implementations of some methods.
+#
+CODE {
+ static struct resource *
+ null_alloc_resource(device_t dev, device_t child,
+ int type, int *rid, u_long start, u_long end,
+ u_long count, u_int flags)
+ {
+ return (0);
+ }
+
+ static int
+ null_remap_intr(device_t bus, device_t dev, u_int irq)
+ {
+
+ if (dev != NULL)
+ return (BUS_REMAP_INTR(dev, NULL, irq));
+ return (ENXIO);
+ }
+
+ static device_t
+ null_add_child(device_t bus, int order, const char *name,
+ int unit)
+ {
+
+ panic("bus_add_child is not implemented");
+ }
+};
+
+/**
+ * @brief Print a description of a child device
+ *
+ * This is called from system code which prints out a description of a
+ * device. It should describe the attachment that the child has with
+ * the parent. For instance the TurboLaser bus prints which node the
+ * device is attached to. See bus_generic_print_child() for more
+ * information.
+ *
+ * @param _dev the device whose child is being printed
+ * @param _child the child device to describe
+ *
+ * @returns the number of characters output.
+ */
+METHOD int print_child {
+ device_t _dev;
+ device_t _child;
+} DEFAULT bus_generic_print_child;
+
+/**
+ * @brief Print a notification about an unprobed child device.
+ *
+ * Called for each child device that did not succeed in probing for a
+ * driver.
+ *
+ * @param _dev the device whose child was being probed
+ * @param _child the child device which failed to probe
+ */
+METHOD void probe_nomatch {
+ device_t _dev;
+ device_t _child;
+};
+
+/**
+ * @brief Read the value of a bus-specific attribute of a device
+ *
+ * This method, along with BUS_WRITE_IVAR() manages a bus-specific set
+ * of instance variables of a child device. The intention is that
+ * each different type of bus defines a set of appropriate instance
+ * variables (such as ports and irqs for ISA bus etc.)
+ *
+ * This information could be given to the child device as a struct but
+ * that makes it hard for a bus to add or remove variables without
+ * forcing an edit and recompile for all drivers which may not be
+ * possible for vendor supplied binary drivers.
+ *
+ * This method copies the value of an instance variable to the
+ * location specified by @p *_result.
+ *
+ * @param _dev the device whose child was being examined
+ * @param _child the child device whose instance variable is
+ * being read
+ * @param _index the instance variable to read
+ * @param _result a location to receive the instance variable
+ * value
+ *
+ * @retval 0 success
+ * @retval ENOENT no such instance variable is supported by @p
+ * _dev
+ */
+METHOD int read_ivar {
+ device_t _dev;
+ device_t _child;
+ int _index;
+ uintptr_t *_result;
+};
+
+/**
+ * @brief Write the value of a bus-specific attribute of a device
+ *
+ * This method sets the value of an instance variable to @p _value.
+ *
+ * @param _dev the device whose child was being updated
+ * @param _child the child device whose instance variable is
+ * being written
+ * @param _index the instance variable to write
+ * @param _value the value to write to that instance variable
+ *
+ * @retval 0 success
+ * @retval ENOENT no such instance variable is supported by @p
+ * _dev
+ * @retval EINVAL the instance variable was recognised but
+ * contains a read-only value
+ */
+METHOD int write_ivar {
+ device_t _dev;
+ device_t _child;
+ int _indx;
+ uintptr_t _value;
+};
+
+/**
+ * @brief Notify a bus that a child was deleted
+ *
+ * Called at the beginning of device_delete_child() to allow the parent
+ * to teardown any bus-specific state for the child.
+ *
+ * @param _dev the device whose child is being deleted
+ * @param _child the child device which is being deleted
+ */
+METHOD void child_deleted {
+ device_t _dev;
+ device_t _child;
+};
+
+/**
+ * @brief Notify a bus that a child was detached
+ *
+ * Called after the child's DEVICE_DETACH() method to allow the parent
+ * to reclaim any resources allocated on behalf of the child.
+ *
+ * @param _dev the device whose child changed state
+ * @param _child the child device which changed state
+ */
+METHOD void child_detached {
+ device_t _dev;
+ device_t _child;
+};
+
+/**
+ * @brief Notify a bus that a new driver was added
+ *
+ * Called when a new driver is added to the devclass which owns this
+ * bus. The generic implementation of this method attempts to probe and
+ * attach any un-matched children of the bus.
+ *
+ * @param _dev the device whose devclass had a new driver
+ * added to it
+ * @param _driver the new driver which was added
+ */
+METHOD void driver_added {
+ device_t _dev;
+ driver_t *_driver;
+} DEFAULT bus_generic_driver_added;
+
+/**
+ * @brief Create a new child device
+ *
+ * For busses which use drivers supporting DEVICE_IDENTIFY() to
+ * enumerate their devices, this method is used to create new
+ * device instances. The new device will be added after the last
+ * existing child with the same order.
+ *
+ * @param _dev the bus device which will be the parent of the
+ * new child device
+ * @param _order a value which is used to partially sort the
+ * children of @p _dev - devices created using
+ * lower values of @p _order appear first in @p
+ * _dev's list of children
+ * @param _name devclass name for new device or @c NULL if not
+ * specified
+ * @param _unit unit number for new device or @c -1 if not
+ * specified
+ */
+METHOD device_t add_child {
+ device_t _dev;
+ u_int _order;
+ const char *_name;
+ int _unit;
+} DEFAULT null_add_child;
+
+/**
+ * @brief Allocate a system resource
+ *
+ * This method is called by child devices of a bus to allocate resources.
+ * The types are defined in <machine/resource.h>; the meaning of the
+ * resource-ID field varies from bus to bus (but @p *rid == 0 is always
+ * valid if the resource type is). If a resource was allocated and the
+ * caller did not use the RF_ACTIVE flag to specify that it should be
+ * activated immediately, the caller is responsible for calling
+ * BUS_ACTIVATE_RESOURCE() when it actually uses the resource.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which is requesting an allocation
+ * @param _type the type of resource to allocate
+ * @param _rid a pointer to the resource identifier
+ * @param _start hint at the start of the resource range - pass
+ * @c 0UL for any start address
+ * @param _end hint at the end of the resource range - pass
+ * @c ~0UL for any end address
+ * @param _count hint at the size of range required - pass @c 1
+ * for any size
+ * @param _flags any extra flags to control the resource
+ * allocation - see @c RF_XXX flags in
+ * <sys/rman.h> for details
+ *
+ * @returns the resource which was allocated or @c NULL if no
+ * resource could be allocated
+ */
+METHOD struct resource * alloc_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int *_rid;
+ u_long _start;
+ u_long _end;
+ u_long _count;
+ u_int _flags;
+} DEFAULT null_alloc_resource;
+
+/**
+ * @brief Activate a resource
+ *
+ * Activate a resource previously allocated with
+ * BUS_ALLOC_RESOURCE(). This may for instance map a memory region
+ * into the kernel's virtual address space.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which allocated the resource
+ * @param _type the type of resource
+ * @param _rid the resource identifier
+ * @param _r the resource to activate
+ */
+METHOD int activate_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+ struct resource *_r;
+};
+
+/**
+ * @brief Deactivate a resource
+ *
+ * Deactivate a resource previously allocated with
+ * BUS_ALLOC_RESOURCE(). This may for instance unmap a memory region
+ * from the kernel's virtual address space.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which allocated the resource
+ * @param _type the type of resource
+ * @param _rid the resource identifier
+ * @param _r the resource to deactivate
+ */
+METHOD int deactivate_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+ struct resource *_r;
+};
+
+/**
+ * @brief Adjust a resource
+ *
+ * Adjust the start and/or end of a resource allocated by
+ * BUS_ALLOC_RESOURCE. At least part of the new address range must overlap
+ * with the existing address range. If successful, the resource's range
+ * will be adjusted to [start, end] on return.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which allocated the resource
+ * @param _type the type of resource
+ * @param _res the resource to adjust
+ * @param _start the new starting address of the resource range
+ * @param _end the new ending address of the resource range
+ */
+METHOD int adjust_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ struct resource *_res;
+ u_long _start;
+ u_long _end;
+};
+
+/**
+ * @brief Release a resource
+ *
+ * Free a resource allocated by BUS_ALLOC_RESOURCE(). The @p _rid
+ * value must be the same as the one returned by BUS_ALLOC_RESOURCE()
+ * (which is not necessarily the same as the one the client passed).
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which allocated the resource
+ * @param _type the type of resource
+ * @param _rid the resource identifier
+ * @param _r the resource to release
+ */
+METHOD int release_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+ struct resource *_res;
+};
+
+/**
+ * @brief Install an interrupt handler
+ *
+ * This method is used to associate an interrupt handler function with
+ * an irq resource. When the interrupt triggers, the function @p _intr
+ * will be called with the value of @p _arg as its single
+ * argument. The value returned in @p *_cookiep is used to cancel the
+ * interrupt handler - the caller should save this value to use in a
+ * future call to BUS_TEARDOWN_INTR().
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which allocated the resource
+ * @param _irq the resource representing the interrupt
+ * @param _flags a set of bits from enum intr_type specifying
+ * the class of interrupt
+ * @param _intr the function to call when the interrupt
+ * triggers
+ * @param _arg a value to use as the single argument in calls
+ * to @p _intr
+ * @param _cookiep a pointer to a location to receive a cookie
+ * value that may be used to remove the interrupt
+ * handler
+ */
+METHOD int setup_intr {
+ device_t _dev;
+ device_t _child;
+ struct resource *_irq;
+ int _flags;
+ driver_filter_t *_filter;
+ driver_intr_t *_intr;
+ void *_arg;
+ void **_cookiep;
+};
+
+/**
+ * @brief Uninstall an interrupt handler
+ *
+ * This method is used to disassociate an interrupt handler function
+ * with an irq resource. The value of @p _cookie must be the value
+ * returned from a previous call to BUS_SETUP_INTR().
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which allocated the resource
+ * @param _irq the resource representing the interrupt
+ * @param _cookie the cookie value returned when the interrupt
+ * was originally registered
+ */
+METHOD int teardown_intr {
+ device_t _dev;
+ device_t _child;
+ struct resource *_irq;
+ void *_cookie;
+};
+
+/**
+ * @brief Define a resource which can be allocated with
+ * BUS_ALLOC_RESOURCE().
+ *
+ * This method is used by some busses (typically ISA) to allow a
+ * driver to describe a resource range that it would like to
+ * allocate. The resource defined by @p _type and @p _rid is defined
+ * to start at @p _start and to include @p _count indices in its
+ * range.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which owns the resource
+ * @param _type the type of resource
+ * @param _rid the resource identifier
+ * @param _start the start of the resource range
+ * @param _count the size of the resource range
+ */
+METHOD int set_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+ u_long _start;
+ u_long _count;
+};
+
+/**
+ * @brief Describe a resource
+ *
+ * This method allows a driver to examine the range used for a given
+ * resource without actually allocating it.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which owns the resource
+ * @param _type the type of resource
+ * @param _rid the resource identifier
+ * @param _start the address of a location to receive the start
+ * index of the resource range
+ * @param _count the address of a location to receive the size
+ * of the resource range
+ */
+METHOD int get_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+ u_long *_startp;
+ u_long *_countp;
+};
+
+/**
+ * @brief Delete a resource.
+ *
+ * Use this to delete a resource (possibly one previously added with
+ * BUS_SET_RESOURCE()).
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which owns the resource
+ * @param _type the type of resource
+ * @param _rid the resource identifier
+ */
+METHOD void delete_resource {
+ device_t _dev;
+ device_t _child;
+ int _type;
+ int _rid;
+};
+
+/**
+ * @brief Return a struct resource_list.
+ *
+ * Used by drivers which use bus_generic_rl_alloc_resource() etc. to
+ * implement their resource handling. It should return the resource
+ * list of the given child device.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which owns the resource list
+ */
+METHOD struct resource_list * get_resource_list {
+ device_t _dev;
+ device_t _child;
+} DEFAULT bus_generic_get_resource_list;
+
+/**
+ * @brief Is the hardware described by @p _child still attached to the
+ * system?
+ *
+ * This method should return 0 if the device is not present. It
+ * should return -1 if it is present. Any errors encountered while
+ * making this determination should be returned as a normal errno
+ * value. Client drivers are to
+ * assume that the device is present, even if there is an error
+ * determining if it is there. Busses are to try to avoid returning
+ * errors, but newcard will return an error if the device fails to
+ * implement this method.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which is being examined
+ */
+METHOD int child_present {
+ device_t _dev;
+ device_t _child;
+} DEFAULT bus_generic_child_present;
+
+/**
+ * @brief Returns the pnp info for this device.
+ *
+ * Return it as a string. If the buffer is too small to hold the
+ * string, return EOVERFLOW.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which is being examined
+ * @param _buf the address of a buffer to receive the pnp
+ * string
+ * @param _buflen the size of the buffer pointed to by @p _buf
+ */
+METHOD int child_pnpinfo_str {
+ device_t _dev;
+ device_t _child;
+ char *_buf;
+ size_t _buflen;
+};
+
+/**
+ * @brief Returns the location for this device.
+ *
+ * Return it as a string. If the buffer is too small to hold the
+ * string, return EOVERFLOW.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which is being examined
+ * @param _buf the address of a buffer to receive the location
+ * string
+ * @param _buflen the size of the buffer pointed to by @p _buf
+ */
+METHOD int child_location_str {
+ device_t _dev;
+ device_t _child;
+ char *_buf;
+ size_t _buflen;
+};
+
+/**
+ * @brief Allow drivers to request that an interrupt be bound to a specific
+ * CPU.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which allocated the resource
+ * @param _irq the resource representing the interrupt
+ * @param _cpu the CPU to bind the interrupt to
+ */
+METHOD int bind_intr {
+ device_t _dev;
+ device_t _child;
+ struct resource *_irq;
+ int _cpu;
+} DEFAULT bus_generic_bind_intr;
+
+/**
+ * @brief Allow (bus) drivers to specify the trigger mode and polarity
+ * of the specified interrupt.
+ *
+ * @param _dev the bus device
+ * @param _irq the interrupt number to modify
+ * @param _trig the trigger mode required
+ * @param _pol the interrupt polarity required
+ */
+METHOD int config_intr {
+ device_t _dev;
+ int _irq;
+ enum intr_trigger _trig;
+ enum intr_polarity _pol;
+} DEFAULT bus_generic_config_intr;
+
+/**
+ * @brief Allow drivers to associate a description with an active
+ * interrupt handler.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device which allocated the resource
+ * @param _irq the resource representing the interrupt
+ * @param _cookie the cookie value returned when the interrupt
+ * was originally registered
+ * @param _descr the description to associate with the interrupt
+ */
+METHOD int describe_intr {
+ device_t _dev;
+ device_t _child;
+ struct resource *_irq;
+ void *_cookie;
+ const char *_descr;
+} DEFAULT bus_generic_describe_intr;
+
+/**
+ * @brief Notify a (bus) driver about a child that the hints mechanism
+ * believes it has discovered.
+ *
+ * The bus is responsible for then adding the child in the right order
+ * and discovering other things about the child. The bus driver is
+ * free to ignore this hint, to do special things, etc. It is all up
+ * to the bus driver to interpret.
+ *
+ * This method is only called in response to the parent bus asking for
+ * hinted devices to be enumerated.
+ *
+ * @param _dev the bus device
+ * @param _dname the name of the device w/o unit numbers
+ * @param _dunit the unit number of the device
+ */
+METHOD void hinted_child {
+ device_t _dev;
+ const char *_dname;
+ int _dunit;
+};
+
+/**
+ * @brief Returns bus_dma_tag_t for use w/ devices on the bus.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device to which the tag will belong
+ */
+METHOD bus_dma_tag_t get_dma_tag {
+ device_t _dev;
+ device_t _child;
+} DEFAULT bus_generic_get_dma_tag;
+
+/**
+ * @brief Allow the bus to determine the unit number of a device.
+ *
+ * @param _dev the parent device of @p _child
+ * @param _child the device whose unit is to be wired
+ * @param _name the name of the device's new devclass
+ * @param _unitp a pointer to the device's new unit value
+ */
+METHOD void hint_device_unit {
+ device_t _dev;
+ device_t _child;
+ const char *_name;
+ int *_unitp;
+};
+
+/**
+ * @brief Notify a bus that the bus pass level has been changed
+ *
+ * @param _dev the bus device
+ */
+METHOD void new_pass {
+ device_t _dev;
+} DEFAULT bus_generic_new_pass;
+
+/**
+ * @brief Notify a bus that specified child's IRQ should be remapped.
+ *
+ * @param _dev the bus device
+ * @param _child the child device
+ * @param _irq the irq number
+ */
+METHOD int remap_intr {
+ device_t _dev;
+ device_t _child;
+ u_int _irq;
+} DEFAULT null_remap_intr;
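
For readers unfamiliar with newbus, the sketch below shows how a bus driver typically supplies implementations for the methods declared above: it provides its own BUS_READ_IVAR() and falls back on the kernel's bus_generic_*() defaults for the rest. This is illustrative only and not part of the change above; the "foo" name, its instance variable, and the handler body are hypothetical, while DEVMETHOD(), DEVMETHOD_END and the bus_generic_*() helpers are the standard kernel glue.

/* Hypothetical example -- not part of this change. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>

#define	FOO_IVAR_PORT	0		/* made-up instance variable */

static int
foo_read_ivar(device_t dev, device_t child, int index, uintptr_t *result)
{
	/* BUS_READ_IVAR(): copy the requested ivar into *result. */
	switch (index) {
	case FOO_IVAR_PORT:
		*result = 0x300;	/* placeholder value */
		return (0);
	default:
		return (ENOENT);	/* unknown instance variable */
	}
}

/*
 * Fragment of the driver's method table: only bus_if.m entries are shown;
 * the device_if.m entries (probe, attach, ...) would sit alongside them.
 */
static device_method_t foo_bus_methods[] = {
	DEVMETHOD(bus_print_child,	bus_generic_print_child),
	DEVMETHOD(bus_read_ivar,	foo_read_ivar),
	DEVMETHOD(bus_alloc_resource,	bus_generic_alloc_resource),
	DEVMETHOD(bus_release_resource,	bus_generic_release_resource),
	DEVMETHOD(bus_activate_resource, bus_generic_activate_resource),
	DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
	DEVMETHOD(bus_setup_intr,	bus_generic_setup_intr),
	DEVMETHOD(bus_teardown_intr,	bus_generic_teardown_intr),

	DEVMETHOD_END
};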
diff --git a/sys/kern/capabilities.conf b/sys/kern/capabilities.conf
new file mode 100644
index 0000000..7f68668
--- /dev/null
+++ b/sys/kern/capabilities.conf
@@ -0,0 +1,754 @@
+##
+## Copyright (c) 2008-2010 Robert N. M. Watson
+## All rights reserved.
+##
+## This software was developed at the University of Cambridge Computer
+## Laboratory with support from a grant from Google, Inc.
+##
+## Redistribution and use in source and binary forms, with or without
+## modification, are permitted provided that the following conditions
+## are met:
+## 1. Redistributions of source code must retain the above copyright
+## notice, this list of conditions and the following disclaimer.
+## 2. Redistributions in binary form must reproduce the above copyright
+## notice, this list of conditions and the following disclaimer in the
+## documentation and/or other materials provided with the distribution.
+##
+## THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+## ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+## ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+## FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+## DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+## OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+## HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+## LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+## OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+## SUCH DAMAGE.
+##
+## List of system calls enabled in capability mode, one name per line.
+##
+## Notes:
+## - sys_exit(2), abort2(2) and close(2) are very important.
+## - Sorted alphabetically; please keep it that way.
+##
+## $FreeBSD$
+##
+
+##
+## Allow ACL and MAC label operations by file descriptor, subject to
+## capability rights. Allow MAC label operations on the current process but
+## we will need to scope __mac_get_pid(2).
+##
+__acl_aclcheck_fd
+__acl_delete_fd
+__acl_get_fd
+__acl_set_fd
+__mac_get_fd
+#__mac_get_pid
+__mac_get_proc
+__mac_set_fd
+__mac_set_proc
+
+##
+## Allow sysctl(2), as scoping is handled inside the call; this is a global
+## namespace, but there are several critical sysctls required for almost
+## anything to run, such as hw.pagesize. For now that policy lives in the
+## kernel for performance and simplicity, but perhaps it could move to a
+## proxying daemon in userspace.
+##
+__sysctl
+
+##
+## Allow umtx operations as these are scoped by address space.
+##
+## XXXRW: Need to check this very carefully.
+##
+_umtx_lock
+_umtx_op
+_umtx_unlock
+
+##
+## Allow process termination using abort2(2).
+##
+abort2
+
+##
+## Allow accept(2) since it doesn't manipulate namespaces directly; rather, it
+## relies on existing bindings on a socket, subject to capability rights.
+##
+accept
+accept4
+
+##
+## Allow AIO operations by file descriptor, subject to capability rights.
+##
+aio_cancel
+aio_error
+aio_fsync
+aio_read
+aio_return
+aio_suspend
+aio_waitcomplete
+aio_write
+
+##
+## audit(2) is a global operation, submitting to the global trail, but it is
+## controlled by privilege, and it might be useful to be able to submit
+## records from sandboxes. For now, disallow, but we may want to think about
+## providing some sort of proxy service for this.
+##
+#audit
+
+##
+## Allow bindat(2).
+##
+bindat
+
+##
+## Allow capability mode and capability system calls.
+##
+cap_enter
+cap_fcntls_get
+cap_fcntls_limit
+cap_getmode
+cap_ioctls_get
+cap_ioctls_limit
+__cap_rights_get
+cap_rights_limit
+
+##
+## Allow read-only clock operations.
+##
+clock_getres
+clock_gettime
+
+##
+## Always allow file descriptor close(2).
+##
+close
+closefrom
+
+##
+## Allow connectat(2).
+##
+connectat
+
+##
+## cpuset(2) and related calls require scoping by process, but should
+## eventually be allowed, at least in the current process case.
+##
+#cpuset
+#cpuset_getaffinity
+#cpuset_getid
+#cpuset_setaffinity
+#cpuset_setid
+
+##
+## Always allow dup(2) and dup2(2) manipulation of the file descriptor table.
+##
+dup
+dup2
+
+##
+## Allow extended attribute operations by file descriptor, subject to
+## capability rights.
+##
+extattr_delete_fd
+extattr_get_fd
+extattr_list_fd
+extattr_set_fd
+
+##
+## Allow changing file flags, mode, and owner by file descriptor, subject to
+## capability rights.
+##
+fchflags
+fchmod
+fchown
+
+##
+## For now, allow fcntl(2), subject to capability rights, but this probably
+## needs additional scoping.
+##
+fcntl
+
+##
+## Allow fexecve(2), subject to capability rights. We perform some scoping,
+## such as disallowing privilege escalation.
+##
+fexecve
+
+##
+## Allow flock(2), subject to capability rights.
+##
+flock
+
+##
+## Allow fork(2), even though it returns pids -- some applications seem to
+## prefer this interface.
+##
+fork
+
+##
+## Allow fpathconf(2), subject to capability rights.
+##
+fpathconf
+
+##
+## Allow various file descriptor-based I/O operations, subject to capability
+## rights.
+##
+freebsd6_ftruncate
+freebsd6_lseek
+freebsd6_mmap
+freebsd6_pread
+freebsd6_pwrite
+
+##
+## Allow querying file and file system state with fstat(2) and fstatfs(2),
+## subject to capability rights.
+##
+fstat
+fstatfs
+
+##
+## Allow further file descriptor-based I/O operations, subject to capability
+## rights.
+##
+fsync
+ftruncate
+
+##
+## Allow futimes(2), subject to capability rights.
+##
+futimes
+
+##
+## Allow querying process audit state, subject to normal access control.
+##
+getaudit
+getaudit_addr
+getauid
+
+##
+## Allow thread context management with getcontext(2).
+##
+getcontext
+
+##
+## Allow directory I/O on a file descriptor, subject to capability rights.
+## Originally we had separate capabilities for directory-specific read
+## operations, but on BSD we allow reading the raw directory data, so we just
+## rely on CAP_READ now.
+##
+getdents
+getdirentries
+
+##
+## Allow querying certain trivial global state.
+##
+getdomainname
+
+##
+## Allow querying current process credential state.
+##
+getegid
+geteuid
+
+##
+## Allow querying certain trivial global state.
+##
+gethostid
+gethostname
+
+##
+## Allow querying per-process timer.
+##
+getitimer
+
+##
+## Allow querying current process credential state.
+##
+getgid
+getgroups
+getlogin
+
+##
+## Allow querying certain trivial global state.
+##
+getpagesize
+getpeername
+
+##
+## Allow querying certain per-process scheduling, resource limit, and
+## credential state.
+##
+## XXXRW: getpgid(2) needs scoping. It's not clear if it's worth scoping
+## getppid(2). getpriority(2) needs scoping. getrusage(2) needs scoping.
+## getsid(2) needs scoping.
+##
+getpgid
+getpgrp
+getpid
+getppid
+getpriority
+getresgid
+getresuid
+getrlimit
+getrusage
+getsid
+
+##
+## Allow querying socket state, subject to capability rights.
+##
+## XXXRW: getsockopt(2) may need more attention.
+##
+getsockname
+getsockopt
+
+##
+## Allow querying the global clock.
+##
+gettimeofday
+
+##
+## Allow querying current process credential state.
+##
+getuid
+
+##
+## Allow ioctl(2), which applications will hopefully limit to only the
+## required commands using the cap_ioctls_limit(2) syscall.
+##
+ioctl
+
+##
+## Allow querying current process credential state.
+##
+issetugid
+
+##
+## Allow kevent(2), as we will authorize based on capability rights on the
+## target descriptor.
+##
+kevent
+
+##
+## Allow kill(2), as we allow the process to send signals only to itself.
+##
+kill
+
+##
+## Allow message queue operations on file descriptors, subject to capability
+## rights.
+##
+kmq_notify
+kmq_setattr
+kmq_timedreceive
+kmq_timedsend
+
+##
+## Allow kqueue(2); we will control its use.
+##
+kqueue
+
+##
+## Allow managing per-process timers.
+##
+ktimer_create
+ktimer_delete
+ktimer_getoverrun
+ktimer_gettime
+ktimer_settime
+
+##
+## We can't allow ktrace(2) because it relies on a global namespace, but we
+## might want to introduce an fktrace(2) of some sort.
+##
+#ktrace
+
+##
+## Allow AIO operations by file descriptor, subject to capability rights.
+##
+lio_listio
+
+##
+## Allow listen(2), subject to capability rights.
+##
+## XXXRW: One might argue this manipulates a global namespace.
+##
+listen
+
+##
+## Allow I/O-related file descriptors, subject to capability rights.
+##
+lseek
+
+##
+## Allow MAC label operations by file descriptor, subject to capability
+## rights.
+##
+mac_get_fd
+mac_set_fd
+
+##
+## Allow simple VM operations on the current process.
+##
+madvise
+mincore
+minherit
+mlock
+mlockall
+
+##
+## Allow memory mapping a file descriptor, and updating protections, subject
+## to capability rights.
+##
+mmap
+mprotect
+
+##
+## Allow simple VM operations on the current process.
+##
+msync
+munlock
+munlockall
+munmap
+
+##
+## Allow the current process to sleep.
+##
+nanosleep
+
+##
+## Allow querying the global clock.
+##
+ntp_gettime
+
+##
+## Allow AIO operations by file descriptor, subject to capability rights.
+##
+oaio_read
+oaio_write
+
+##
+## Allow simple VM operations on the current process.
+##
+obreak
+
+##
+## Allow AIO operations by file descriptor, subject to capability rights.
+##
+olio_listio
+
+##
+## Operations relative to directory capabilities.
+##
+chflagsat
+faccessat
+fchmodat
+fchownat
+fstatat
+futimesat
+linkat
+mkdirat
+mkfifoat
+mknodat
+openat
+readlinkat
+renameat
+symlinkat
+unlinkat
+
+##
+## Allow entry into open(2). This system call will fail, since access to the
+## global file namespace has been disallowed, but allowing entry into the
+## syscall means that an audit trail will be generated (which is also very
+## useful for debugging).
+##
+open
+
+##
+## Allow poll(2), which will be scoped by capability rights.
+##
+## XXXRW: Perhaps we don't need the OpenBSD version?
+## XXXRW: We don't yet do that scoping.
+##
+openbsd_poll
+
+##
+## Process descriptor-related system calls are allowed.
+##
+pdfork
+pdgetpid
+pdkill
+#pdwait4 # not yet implemented
+
+##
+## Allow pipe(2).
+##
+pipe
+pipe2
+
+##
+## Allow poll(2), which will be scoped by capability rights.
+## XXXRW: We don't yet do that scoping.
+##
+poll
+
+##
+## Allow I/O-related file descriptors, subject to capability rights.
+##
+pread
+preadv
+
+##
+## Allow access to profiling state on the current process.
+##
+profil
+
+##
+## Disallow ptrace(2) for now, but we do need debugging facilities in
+## capability mode, so we will want to revisit this, possibly by scoping its
+## operation.
+##
+#ptrace
+
+##
+## Allow I/O-related file descriptors, subject to capability rights.
+##
+pwrite
+pwritev
+read
+readv
+recv
+recvfrom
+recvmsg
+
+##
+## Allow real-time scheduling primitives to be used.
+##
+## XXXRW: These require scoping.
+##
+rtprio
+rtprio_thread
+
+##
+## Allow simple VM operations on the current process.
+##
+sbrk
+
+##
+## Allow querying trivial global scheduler state.
+##
+sched_get_priority_max
+sched_get_priority_min
+
+##
+## Allow various thread/process scheduler operations.
+##
+## XXXRW: Some of these require further scoping.
+##
+sched_getparam
+sched_getscheduler
+sched_rr_getinterval
+sched_setparam
+sched_setscheduler
+sched_yield
+
+##
+## Allow I/O-related file descriptors, subject to capability rights.
+##
+sctp_generic_recvmsg
+sctp_generic_sendmsg
+sctp_generic_sendmsg_iov
+sctp_peeloff
+
+##
+## Allow select(2), which will be scoped by capability rights.
+##
+## XXXRW: But is it?
+##
+select
+
+##
+## Allow I/O-related file descriptors, subject to capability rights. Use of
+## explicit addresses here is restricted by the system calls themselves.
+##
+send
+sendfile
+sendmsg
+sendto
+
+##
+## Allow setting per-process audit state, which is controlled separately by
+## privileges.
+##
+setaudit
+setaudit_addr
+setauid
+
+##
+## Allow setting thread context.
+##
+setcontext
+
+##
+## Allow setting current process credential state, which is controlled
+## separately by privilege.
+##
+setegid
+seteuid
+setgid
+
+##
+## Allow use of the process interval timer.
+##
+setitimer
+
+##
+## Allow setpriority(2).
+##
+## XXXRW: Requires scoping.
+##
+setpriority
+
+##
+## Allow setting current process credential state, which is controlled
+## separately by privilege.
+##
+setregid
+setresgid
+setresuid
+setreuid
+
+##
+## Allow setting process resource limits with setrlimit(2).
+##
+setrlimit
+
+##
+## Allow creating a new session with setsid(2).
+##
+setsid
+
+##
+## Allow setting socket options with setsockopt(2), subject to capability
+## rights.
+##
+## XXXRW: Might require scoping.
+##
+setsockopt
+
+##
+## Allow setting current process credential state, which is controlled
+## separately by privilege.
+##
+setuid
+
+##
+## shm_open(2) is scoped so as to allow only access to new anonymous objects.
+##
+shm_open
+
+##
+## Allow I/O-related file descriptors, subject to capability rights.
+##
+shutdown
+
+##
+## Allow signal control on current process.
+##
+sigaction
+sigaltstack
+sigblock
+sigpending
+sigprocmask
+sigqueue
+sigreturn
+sigsetmask
+sigstack
+sigsuspend
+sigtimedwait
+sigvec
+sigwaitinfo
+
+##
+## Allow creating new socket pairs with socket(2) and socketpair(2).
+##
+socket
+socketpair
+
+##
+## Allow simple VM operations on the current process.
+##
+## XXXRW: Kernel doesn't implement this, so drop?
+##
+sstk
+
+##
+## Do allow sync(2) for now, but possibly shouldn't.
+##
+sync
+
+##
+## Always allow process termination with sys_exit(2).
+##
+sys_exit
+
+##
+## sysarch(2) does rather diverse things, but is required on at least i386
+## in order to configure per-thread data. As such, it's scoped on each
+## architecture.
+##
+sysarch
+
+##
+## Allow thread operations operating only on current process.
+##
+thr_create
+thr_exit
+thr_kill
+
+##
+## Disallow thr_kill2(2), as it may operate beyond the current process.
+##
+## XXXRW: Requires scoping.
+##
+#thr_kill2
+
+##
+## Allow thread operations operating only on current process.
+##
+thr_new
+thr_self
+thr_set_name
+thr_suspend
+thr_wake
+
+##
+## Allow manipulation of the current process umask with umask(2).
+##
+umask
+
+##
+## Allow submitting of process trace entries with utrace(2).
+##
+utrace
+
+##
+## Allow generating UUIDs with uuidgen(2).
+##
+uuidgen
+
+##
+## Allow I/O-related file descriptors, subject to capability rights.
+##
+write
+writev
+
+##
+## Allow processes to yield(2).
+##
+yield
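
To make the policy encoded by the list above concrete, here is a small, hypothetical userland sketch (not part of the change above) of how a program is expected to behave once it calls cap_enter(2): descriptors acquired beforehand keep working through the *at() and fd-based calls listed above, while calls that name the global filesystem namespace, such as open(2) of an absolute path, fail with ECAPMODE. The file names and paths are made up; on newer FreeBSD the header is <sys/capsicum.h> rather than <sys/capability.h>.

/* Hypothetical example -- not part of this change. */
#include <sys/capability.h>	/* cap_enter(); <sys/capsicum.h> on newer systems */

#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	int dirfd, fd;

	/* Acquire resources before entering the sandbox. */
	dirfd = open("/tmp", O_RDONLY | O_DIRECTORY);
	if (dirfd == -1)
		err(1, "open(/tmp)");

	if (cap_enter() == -1)
		err(1, "cap_enter");

	/* openat(2) is on the list above, so this works relative to dirfd... */
	fd = openat(dirfd, "scratch", O_RDWR | O_CREAT, 0600);
	if (fd == -1)
		warn("openat");

	/* ...but open(2) of an absolute path is rejected in capability mode. */
	if (open("/etc/passwd", O_RDONLY) == -1)
		warn("open(/etc/passwd) rejected as expected");

	if (fd != -1)
		close(fd);
	close(dirfd);
	return (0);
}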
diff --git a/sys/kern/clock_if.m b/sys/kern/clock_if.m
new file mode 100644
index 0000000..cb1179a
--- /dev/null
+++ b/sys/kern/clock_if.m
@@ -0,0 +1,45 @@
+#-
+# Copyright (c) 2001 by Thomas Moestl <tmm@FreeBSD.org>.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# $FreeBSD$
+
+#include <sys/bus.h>
+#include <sys/time.h>
+
+INTERFACE clock;
+
+# Interface for clock drivers. This is inspired by the NetBSD device-independent
+# clock code (by Gordon W. Ross).
+
+# An EINVAL error return from this call signifies that the clock has an illegal
+# setting.
+METHOD int gettime {
+ device_t dev;
+ struct timespec *ts;
+};
+
+METHOD int settime {
+ device_t dev;
+ struct timespec *ts;
+};
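
As a sketch of how this interface is consumed (illustrative only, not part of the change above): a real-time clock driver implements the two methods, converting between struct timespec and the calendar representation with the clock_ct_to_ts()/clock_ts_to_ct() helpers from subr_clock.c, and announces itself via clock_register() from subr_rtc.c. The "myrtc" name and the fixed date are made up; a real driver would read and program the hardware.

/* Hypothetical example -- not part of this change. */
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/clock.h>
#include <sys/time.h>

static int
myrtc_gettime(device_t dev, struct timespec *ts)
{
	struct clocktime ct;

	/* A real driver would read these fields from the RTC hardware. */
	ct.nsec = 0;
	ct.sec = 0;
	ct.min = 0;
	ct.hour = 0;
	ct.day = 1;
	ct.mon = 1;
	ct.year = 2013;
	ct.dow = -1;			/* day of week unknown */

	/* clock_ct_to_ts() returns EINVAL for an illegal setting. */
	return (clock_ct_to_ts(&ct, ts));
}

static int
myrtc_settime(device_t dev, struct timespec *ts)
{
	struct clocktime ct;

	clock_ts_to_ct(ts, &ct);
	/* Program the RTC hardware from ct here. */
	return (0);
}

/*
 * In the driver's attach routine, after wiring the methods up with
 * DEVMETHOD(clock_gettime, myrtc_gettime) and DEVMETHOD(clock_settime,
 * myrtc_settime), the clock is registered with a 1 s resolution:
 *
 *	clock_register(dev, 1000000);	// resolution in microseconds
 */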
diff --git a/sys/kern/cpufreq_if.m b/sys/kern/cpufreq_if.m
new file mode 100644
index 0000000..8b1213e
--- /dev/null
+++ b/sys/kern/cpufreq_if.m
@@ -0,0 +1,100 @@
+#
+# Copyright (c) 2004 Nate Lawson
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/bus.h>
+
+INTERFACE cpufreq;
+
+HEADER {
+ struct cf_level;
+ struct cf_setting;
+};
+
+# cpufreq interface methods
+
+#
+# Set the current CPU frequency level.
+#
+METHOD int set {
+ device_t dev;
+ const struct cf_level *level;
+ int priority;
+};
+
+#
+# Get the current active level.
+#
+METHOD int get {
+ device_t dev;
+ struct cf_level *level;
+};
+
+#
+# Get the current possible levels, based on all drivers.
+#
+METHOD int levels {
+ device_t dev;
+ struct cf_level *levels;
+ int *count;
+};
+
+# Individual frequency driver methods
+
+#
+# Set an individual driver's setting.
+#
+METHOD int drv_set {
+ device_t dev;
+ const struct cf_setting *set;
+};
+
+#
+# Get an individual driver's setting.
+#
+METHOD int drv_get {
+ device_t dev;
+ struct cf_setting *set;
+};
+
+#
+# Get the settings supported by a driver.
+#
+METHOD int drv_settings {
+ device_t dev;
+ struct cf_setting *sets;
+ int *count;
+};
+
+#
+# Get an individual driver's type.
+#
+METHOD int drv_type {
+ device_t dev;
+ int *type;
+};
+
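A sketch of the per-driver half of this interface (illustrative only, not part of the change above): a hardware back-end such as est(4) or powernow(4) reports the settings it can program and their type, and the cpufreq framework in kern_cpu.c combines them into the levels exposed to the rest of the system. The "mycpufreq" name and the frequency values are made up; struct cf_setting, CPUFREQ_VAL_UNKNOWN and CPUFREQ_TYPE_ABSOLUTE come from <sys/cpu.h>.

/* Hypothetical example -- not part of this change. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cpu.h>

#include "cpufreq_if.h"

#define	MYCPUFREQ_NSETTINGS	3

static const int mycpufreq_mhz[MYCPUFREQ_NSETTINGS] = { 2400, 1800, 1200 };

/* CPUFREQ_DRV_SETTINGS(): report the settings this driver can program. */
static int
mycpufreq_settings(device_t dev, struct cf_setting *sets, int *count)
{
	int i;

	if (sets == NULL || count == NULL)
		return (EINVAL);
	if (*count < MYCPUFREQ_NSETTINGS)
		return (E2BIG);

	for (i = 0; i < MYCPUFREQ_NSETTINGS; i++) {
		/* Mark everything unknown, then fill in what we know. */
		memset(&sets[i], CPUFREQ_VAL_UNKNOWN, sizeof(sets[i]));
		sets[i].freq = mycpufreq_mhz[i];
		sets[i].dev = dev;
	}
	*count = MYCPUFREQ_NSETTINGS;
	return (0);
}

/* CPUFREQ_DRV_TYPE(): these are absolute frequencies, not relative steps. */
static int
mycpufreq_type(device_t dev, int *type)
{
	if (type == NULL)
		return (EINVAL);
	*type = CPUFREQ_TYPE_ABSOLUTE;
	return (0);
}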
diff --git a/sys/kern/device_if.m b/sys/kern/device_if.m
new file mode 100644
index 0000000..eb720eb
--- /dev/null
+++ b/sys/kern/device_if.m
@@ -0,0 +1,318 @@
+#-
+# Copyright (c) 1998-2004 Doug Rabson
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/bus.h>
+
+/**
+ * @defgroup DEVICE device - KObj methods for all device drivers
+ * @brief A basic set of methods required for all device drivers.
+ *
+ * The device interface is used to match devices to drivers during
+ * autoconfiguration and provides methods to allow drivers to handle
+ * system-wide events such as suspend, resume or shutdown.
+ * @{
+ */
+INTERFACE device;
+
+#
+# Default implementations of some methods.
+#
+CODE {
+ static int null_shutdown(device_t dev)
+ {
+ return 0;
+ }
+
+ static int null_suspend(device_t dev)
+ {
+ return 0;
+ }
+
+ static int null_resume(device_t dev)
+ {
+ return 0;
+ }
+
+ static int null_quiesce(device_t dev)
+ {
+ return EOPNOTSUPP;
+ }
+};
+
+/**
+ * @brief Probe to see if a device matches a driver.
+ *
+ * Users should not call this method directly. Normally, this
+ * is called via device_probe_and_attach() to select a driver by
+ * calling the DEVICE_PROBE() of all candidate drivers and attaching
+ * the winning driver (if any) to the device.
+ *
+ * This function is used to match devices to device drivers.
+ * Typically, the driver will examine the device to see if
+ * it is suitable for this driver. This might include checking
+ * the values of various device instance variables or reading
+ * hardware registers.
+ *
+ * In some cases, there may be more than one driver available
+ * which can be used for a device (for instance there might
+ * be a generic driver which works for a set of many types of
+ * device and a more specific driver which works for a subset
+ * of devices). Because of this, a driver should not assume
+ * that it will be the driver that attaches to the device even
+ * if it returns a success status from DEVICE_PROBE(). In particular,
+ * a driver must free any resources which it allocated during
+ * the probe before returning. The return value of DEVICE_PROBE()
+ * is used to elect which driver is used - the driver which returns
+ * the largest non-error value wins the election and attaches to
+ * the device. Common non-error values are described in the
+ * DEVICE_PROBE(9) manual page.
+ *
+ * If a driver matches the hardware, it should set the device
+ * description string using device_set_desc() or
+ * device_set_desc_copy(). This string is used to generate an
+ * informative message when DEVICE_ATTACH() is called.
+ *
+ * As a special case, if a driver returns zero, the driver election
+ * is cut short and that driver will attach to the device
+ * immediately. This should rarely be used.
+ *
+ * For example, a probe method for a PCI device driver might look
+ * like this:
+ *
+ * @code
+ * int
+ * foo_probe(device_t dev)
+ * {
+ * if (pci_get_vendor(dev) == FOOVENDOR &&
+ * pci_get_device(dev) == FOODEVICE) {
+ * device_set_desc(dev, "Foo device");
+ * return (BUS_PROBE_DEFAULT);
+ * }
+ * return (ENXIO);
+ * }
+ * @endcode
+ *
+ * To include this method in a device driver, use a line like this
+ * in the driver's method list:
+ *
+ * @code
+ * KOBJMETHOD(device_probe, foo_probe)
+ * @endcode
+ *
+ * @param dev the device to probe
+ *
+ * @retval 0 if this is the only possible driver for this
+ * device
+ * @retval negative if the driver can match this device - the
+ * least negative value is used to select the
+ * driver
+ * @retval ENXIO if the driver does not match the device
+ * @retval positive if some kind of error was detected during
+ * the probe, a regular unix error code should
+ * be returned to indicate the type of error
+ * @see DEVICE_ATTACH(), pci_get_vendor(), pci_get_device()
+ */
+METHOD int probe {
+ device_t dev;
+};
+
+/**
+ * @brief Allow a device driver to detect devices not otherwise enumerated.
+ *
+ * The DEVICE_IDENTIFY() method is used by some drivers (e.g. the ISA
+ * bus driver) to help populate the bus device with a useful set of
+ * child devices, normally by calling the BUS_ADD_CHILD() method of
+ * the parent device. For instance, the ISA bus driver uses several
+ * special drivers, including the isahint driver and the pnp driver to
+ * create child devices based on configuration hints and PnP bus
+ * probes respectively.
+ *
+ * Many bus drivers which support true plug-and-play do not need to
+ * use this method at all since child devices can be discovered
+ * automatically without help from child drivers.
+ *
+ * To include this method in a device driver, use a line like this
+ * in the driver's method list:
+ *
+ * @code
+ * KOBJMETHOD(device_identify, foo_identify)
+ * @endcode
+ *
+ * @param driver the driver whose identify method is being called
+ * @param parent the parent device to use when adding new children
+ */
+STATICMETHOD void identify {
+ driver_t *driver;
+ device_t parent;
+};
+
+/**
+ * @brief Attach a device to a device driver
+ *
+ * Normally only called via device_probe_and_attach(), this is called
+ * when a driver has succeeded in probing against a device.
+ * This method should initialise the hardware and allocate other
+ * system resources (e.g. devfs entries) as required.
+ *
+ * To include this method in a device driver, use a line like this
+ * in the driver's method list:
+ *
+ * @code
+ * KOBJMETHOD(device_attach, foo_attach)
+ * @endcode
+ *
+ * @param dev the device to probe
+ *
+ * @retval 0 success
+ * @retval non-zero if some kind of error was detected during
+ * the attach, a regular unix error code should
+ * be returned to indicate the type of error
+ * @see DEVICE_PROBE()
+ */
+METHOD int attach {
+ device_t dev;
+};
+
+/**
+ * @brief Detach a driver from a device.
+ *
+ * This can be called if the user is replacing the
+ * driver software or if a device is about to be physically removed
+ * from the system (e.g. for removable hardware such as USB or PCCARD).
+ *
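+ * For illustration, a detach method typically releases whatever was
+ * allocated during attach; the softc fields used below (irq_res,
+ * irq_rid and irq_cookie) are hypothetical:
+ *
+ * @code
+ * int
+ * foo_detach(device_t dev)
+ * {
+ * 	struct foo_softc *sc;
+ *
+ * 	sc = device_get_softc(dev);
+ * 	bus_teardown_intr(dev, sc->irq_res, sc->irq_cookie);
+ * 	bus_release_resource(dev, SYS_RES_IRQ, sc->irq_rid, sc->irq_res);
+ * 	return (0);
+ * }
+ * @endcode
+ *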
+ * To include this method in a device driver, use a line like this
+ * in the driver's method list:
+ *
+ * @code
+ * KOBJMETHOD(device_detach, foo_detach)
+ * @endcode
+ *
+ * @param dev the device to detach
+ *
+ * @retval 0 success
+ * @retval non-zero the detach could not be performed, e.g. if the
+ * driver does not support detaching.
+ *
+ * @see DEVICE_ATTACH()
+ */
+METHOD int detach {
+ device_t dev;
+};
+
+/**
+ * @brief Called during system shutdown.
+ *
+ * This method allows drivers to detect when the system is being shut down.
+ * Some drivers need to use this to place their hardware in a consistent
+ * state before rebooting the computer.
+ *
+ * To include this method in a device driver, use a line like this
+ * in the driver's method list:
+ *
+ * @code
+ * KOBJMETHOD(device_shutdown, foo_shutdown)
+ * @endcode
+ *
+ * @param dev	the device being shut down
+ */
+METHOD int shutdown {
+ device_t dev;
+} DEFAULT null_shutdown;
+
+/**
+ * @brief This is called by the power-management subsystem when a
+ * suspend has been requested by the user or by some automatic
+ * mechanism.
+ *
+ * This gives drivers a chance to veto the suspend or save their
+ * configuration before power is removed.
+ *
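+ * For example, a driver might veto the suspend while its hardware is
+ * still busy; the foo_softc flag used below is hypothetical:
+ *
+ * @code
+ * int
+ * foo_suspend(device_t dev)
+ * {
+ * 	struct foo_softc *sc;
+ *
+ * 	sc = device_get_softc(dev);
+ * 	if (sc->transfer_active)
+ * 		return (EBUSY);
+ * 	return (0);
+ * }
+ * @endcode
+ *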
+ * To include this method in a device driver, use a line like this in
+ * the driver's method list:
+ *
+ * @code
+ * KOBJMETHOD(device_suspend, foo_suspend)
+ * @endcode
+ *
+ * @param dev the device being suspended
+ *
+ * @retval 0 success
+ * @retval non-zero an error occurred while attempting to prepare the
+ * device for suspension
+ *
+ * @see DEVICE_RESUME()
+ */
+METHOD int suspend {
+ device_t dev;
+} DEFAULT null_suspend;
+
+/**
+ * @brief This is called when the system resumes after a suspend.
+ *
+ * To include this method in a device driver, use a line like this
+ * in the driver's method list:
+ *
+ * @code
+ * KOBJMETHOD(device_resume, foo_resume)
+ * @endcode
+ *
+ * @param dev the device being resumed
+ *
+ * @retval 0 success
+ * @retval non-zero an error occurred while attempting to restore the
+ * device from suspension
+ *
+ * @see DEVICE_SUSPEND()
+ */
+METHOD int resume {
+ device_t dev;
+} DEFAULT null_resume;
+
+/**
+ * @brief This is called when the driver is asked to quiesce itself.
+ *
+ * The driver should arrange for the orderly shutdown of this device
+ * and curtail any further access to it.  A detach request will
+ * usually follow, but it is not guaranteed.
+ *
+ * To include this method in a device driver, use a line like this
+ * in the driver's method list:
+ *
+ * @code
+ * KOBJMETHOD(device_quiesce, foo_quiesce)
+ * @endcode
+ *
+ * @param dev the device being quiesced
+ *
+ * @retval 0 success
+ * @retval non-zero an error occurred while attempting to quiesce the
+ * device
+ *
+ * @see DEVICE_DETACH()
+ */
+METHOD int quiesce {
+ device_t dev;
+} DEFAULT null_quiesce;
diff --git a/sys/kern/dtio_kdtrace.c b/sys/kern/dtio_kdtrace.c
new file mode 100644
index 0000000..3d6f416
--- /dev/null
+++ b/sys/kern/dtio_kdtrace.c
@@ -0,0 +1,232 @@
+/*-
+ * Copyright (c) 2012 Advanced Computing Technologies LLC
+ * Written by George Neville-Neil gnn@freebsd.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+
+#include <sys/dtrace.h>
+#include "../sys/dtrace_bsd.h"
+
+
+static int dtio_unload(void);
+static void dtio_getargdesc(void *, dtrace_id_t, void *,
+ dtrace_argdesc_t *);
+static void dtio_provide(void *, dtrace_probedesc_t *);
+static void dtio_destroy(void *, dtrace_id_t, void *);
+static void dtio_enable(void *, dtrace_id_t, void *);
+static void dtio_disable(void *, dtrace_id_t, void *);
+static void dtio_load(void *);
+
+static dtrace_pattr_t dtio_attr = {
+{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
+{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
+{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
+};
+
+static char *kernel = "kernel";
+
+/*
+ * Name strings.
+ */
+static char *dtio_start_str = "start";
+static char *dtio_done_str = "done";
+static char *dtio_wait_start_str = "wait-start";
+static char *dtio_wait_done_str = "wait-done";
+
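+/*
+ * Provider operations for the "io" provider.  Optional operations that
+ * this provider does not implement are left NULL.
+ */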
+static dtrace_pops_t dtio_pops = {
+ dtio_provide,
+ NULL,
+ dtio_enable,
+ dtio_disable,
+ NULL,
+ NULL,
+ dtio_getargdesc,
+ NULL,
+ NULL,
+ dtio_destroy
+};
+
+static dtrace_provider_id_t dtio_id;
+
+extern uint32_t dtio_start_id;
+extern uint32_t dtio_done_id;
+extern uint32_t dtio_wait_start_id;
+extern uint32_t dtio_wait_done_id;
+
+static void
+dtio_getargdesc(void *arg, dtrace_id_t id, void *parg,
+ dtrace_argdesc_t *desc)
+{
+ const char *p = NULL;
+
+ switch (desc->dtargd_ndx) {
+ case 0:
+ p = "struct bio *";
+ break;
+ case 1:
+ p = "struct devstat *";
+ break;
+ default:
+ desc->dtargd_ndx = DTRACE_ARGNONE;
+ }
+
+ if (p != NULL)
+ strlcpy(desc->dtargd_native, p, sizeof(desc->dtargd_native));
+}
+
+static void
+dtio_provide(void *arg, dtrace_probedesc_t *desc)
+{
+ if (desc != NULL)
+ return;
+
+ if (dtrace_probe_lookup(dtio_id, kernel, NULL,
+ dtio_start_str) == 0) {
+ dtio_start_id = dtrace_probe_create(dtio_id, kernel, NULL,
+ dtio_start_str, 0, NULL);
+ }
+ if (dtrace_probe_lookup(dtio_id, kernel, NULL, dtio_done_str) == 0) {
+ dtio_done_id = dtrace_probe_create(dtio_id, kernel, NULL,
+ dtio_done_str, 0, NULL);
+ }
+ if (dtrace_probe_lookup(dtio_id, kernel, NULL,
+ dtio_wait_start_str) == 0) {
+ dtio_wait_start_id = dtrace_probe_create(dtio_id, kernel,
+ NULL,
+ dtio_wait_start_str,
+ 0, NULL);
+ }
+ if (dtrace_probe_lookup(dtio_id, kernel, NULL,
+ dtio_wait_done_str) == 0) {
+ dtio_wait_done_id = dtrace_probe_create(dtio_id, kernel, NULL,
+ dtio_wait_done_str, 0, NULL);
+ }
+}
+
+static void
+dtio_destroy(void *arg, dtrace_id_t id, void *parg)
+{
+}
+
+static void
+dtio_enable(void *arg, dtrace_id_t id, void *parg)
+{
+ if (id == dtio_start_id)
+ dtrace_io_start_probe =
+ (dtrace_io_start_probe_func_t)dtrace_probe;
+ else if (id == dtio_done_id)
+ dtrace_io_done_probe =
+ (dtrace_io_done_probe_func_t)dtrace_probe;
+ else if (id == dtio_wait_start_id)
+ dtrace_io_wait_start_probe =
+ (dtrace_io_wait_start_probe_func_t)dtrace_probe;
+ else if (id == dtio_wait_done_id)
+ dtrace_io_wait_done_probe =
+ (dtrace_io_wait_done_probe_func_t)dtrace_probe;
+ else
+ printf("dtrace io provider: unknown ID\n");
+}
+
+static void
+dtio_disable(void *arg, dtrace_id_t id, void *parg)
+{
+ if (id == dtio_start_id)
+ dtrace_io_start_probe = NULL;
+ else if (id == dtio_done_id)
+ dtrace_io_done_probe = NULL;
+ else if (id == dtio_wait_start_id)
+ dtrace_io_wait_start_probe = NULL;
+ else if (id == dtio_wait_done_id)
+ dtrace_io_wait_done_probe = NULL;
+ else
+ printf("dtrace io provider: unknown ID\n");
+}
+
+static void
+dtio_load(void *dummy)
+{
+ if (dtrace_register("io", &dtio_attr, DTRACE_PRIV_USER, NULL,
+ &dtio_pops, NULL, &dtio_id) != 0)
+ return;
+}
+
+
+static int
+dtio_unload(void)
+{
+ dtrace_io_start_probe = NULL;
+ dtrace_io_done_probe = NULL;
+ dtrace_io_wait_start_probe = NULL;
+ dtrace_io_wait_done_probe = NULL;
+
+ return (dtrace_unregister(dtio_id));
+}
+
+static int
+dtio_modevent(module_t mod __unused, int type, void *data __unused)
+{
+ int error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ break;
+
+ case MOD_UNLOAD:
+ break;
+
+ case MOD_SHUTDOWN:
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return (error);
+}
+
+SYSINIT(dtio_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY,
+ dtio_load, NULL);
+SYSUNINIT(dtio_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY,
+ dtio_unload, NULL);
+
+DEV_MODULE(dtio, dtio_modevent, NULL);
+MODULE_VERSION(dtio, 1);
+MODULE_DEPEND(dtio, dtrace, 1, 1, 1);
+MODULE_DEPEND(dtio, opensolaris, 1, 1, 1);
diff --git a/sys/kern/genassym.sh b/sys/kern/genassym.sh
new file mode 100644
index 0000000..1cbc32b
--- /dev/null
+++ b/sys/kern/genassym.sh
@@ -0,0 +1,69 @@
+#!/bin/sh
+# $FreeBSD$
+
+usage()
+{
+ echo "usage: genassym [-o outfile] objfile"
+ exit 1
+}
+
+
+work()
+{
+ ${NM:='nm'} "$1" | ${AWK:='awk'} '
+ / C .*sign$/ {
+ sign = substr($1, length($1) - 3, 4)
+ sub("^0*", "", sign)
+ if (sign != "")
+ sign = "-"
+ }
+ / C .*w0$/ {
+ w0 = substr($1, length($1) - 3, 4)
+ }
+ / C .*w1$/ {
+ w1 = substr($1, length($1) - 3, 4)
+ }
+ / C .*w2$/ {
+ w2 = substr($1, length($1) - 3, 4)
+ }
+ / C .*w3$/ {
+ w3 = substr($1, length($1) - 3, 4)
+ w = w3 w2 w1 w0
+ sub("^0*", "", w)
+ if (w == "")
+ w = "0"
+ sub("w3$", "", $3)
+ # This still has minor problems representing INT_MIN, etc.
+ # E.g.,
+ # with 32-bit 2''s complement ints, this prints -0x80000000,
+ # which has the wrong type (unsigned int).
+ printf("#define\t%s\t%s0x%s\n", $3, sign, w)
+ } '
+}
+
+
+#
+# MAIN PROGRAM
+#
+use_outfile="no"
+while getopts "o:" option
+do
+ case "$option" in
+ o) outfile="$OPTARG"
+ use_outfile="yes";;
+ *) usage;;
+ esac
+done
+shift $(($OPTIND - 1))
+case $# in
+1) ;;
+*) usage;;
+esac
+
+if [ "$use_outfile" = "yes" ]
+then
+ work $1 3>"$outfile" >&3 3>&-
+else
+ work $1
+fi
+
diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c
new file mode 100644
index 0000000..3ae78de
--- /dev/null
+++ b/sys/kern/imgact_aout.c
@@ -0,0 +1,343 @@
+/*-
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/imgact_aout.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/syscall.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+
+#include <machine/frame.h>
+#include <machine/md_var.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_param.h>
+
+#ifdef __amd64__
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_util.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/ia32/ia32_signal.h>
+#endif
+
+static int exec_aout_imgact(struct image_params *imgp);
+static int aout_fixup(register_t **stack_base, struct image_params *imgp);
+
+#if defined(__i386__)
+struct sysentvec aout_sysvec = {
+ .sv_size = SYS_MAXSYSCALL,
+ .sv_table = sysent,
+ .sv_mask = 0,
+ .sv_sigsize = 0,
+ .sv_sigtbl = NULL,
+ .sv_errsize = 0,
+ .sv_errtbl = NULL,
+ .sv_transtrap = NULL,
+ .sv_fixup = aout_fixup,
+ .sv_sendsig = sendsig,
+ .sv_sigcode = sigcode,
+ .sv_szsigcode = &szsigcode,
+ .sv_prepsyscall = NULL,
+ .sv_name = "FreeBSD a.out",
+ .sv_coredump = NULL,
+ .sv_imgact_try = NULL,
+ .sv_minsigstksz = MINSIGSTKSZ,
+ .sv_pagesize = PAGE_SIZE,
+ .sv_minuser = VM_MIN_ADDRESS,
+ .sv_maxuser = VM_MAXUSER_ADDRESS,
+ .sv_usrstack = USRSTACK,
+ .sv_psstrings = PS_STRINGS,
+ .sv_stackprot = VM_PROT_ALL,
+ .sv_copyout_strings = exec_copyout_strings,
+ .sv_setregs = exec_setregs,
+ .sv_fixlimit = NULL,
+ .sv_maxssiz = NULL,
+ .sv_flags = SV_ABI_FREEBSD | SV_AOUT | SV_IA32 | SV_ILP32,
+ .sv_set_syscall_retval = cpu_set_syscall_retval,
+ .sv_fetch_syscall_args = cpu_fetch_syscall_args,
+ .sv_syscallnames = syscallnames,
+ .sv_schedtail = NULL,
+};
+
+#elif defined(__amd64__)
+
+#define AOUT32_USRSTACK 0xbfc00000
+#define AOUT32_PS_STRINGS \
+ (AOUT32_USRSTACK - sizeof(struct freebsd32_ps_strings))
+#define AOUT32_MINUSER FREEBSD32_MINUSER
+
+extern const char *freebsd32_syscallnames[];
+extern u_long ia32_maxssiz;
+
+struct sysentvec aout_sysvec = {
+ .sv_size = FREEBSD32_SYS_MAXSYSCALL,
+ .sv_table = freebsd32_sysent,
+ .sv_mask = 0,
+ .sv_sigsize = 0,
+ .sv_sigtbl = NULL,
+ .sv_errsize = 0,
+ .sv_errtbl = NULL,
+ .sv_transtrap = NULL,
+ .sv_fixup = aout_fixup,
+ .sv_sendsig = ia32_sendsig,
+ .sv_sigcode = ia32_sigcode,
+ .sv_szsigcode = &sz_ia32_sigcode,
+ .sv_prepsyscall = NULL,
+ .sv_name = "FreeBSD a.out",
+ .sv_coredump = NULL,
+ .sv_imgact_try = NULL,
+ .sv_minsigstksz = MINSIGSTKSZ,
+ .sv_pagesize = IA32_PAGE_SIZE,
+ .sv_minuser = AOUT32_MINUSER,
+ .sv_maxuser = AOUT32_USRSTACK,
+ .sv_usrstack = AOUT32_USRSTACK,
+ .sv_psstrings = AOUT32_PS_STRINGS,
+ .sv_stackprot = VM_PROT_ALL,
+ .sv_copyout_strings = freebsd32_copyout_strings,
+ .sv_setregs = ia32_setregs,
+ .sv_fixlimit = ia32_fixlimit,
+ .sv_maxssiz = &ia32_maxssiz,
+ .sv_flags = SV_ABI_FREEBSD | SV_AOUT | SV_IA32 | SV_ILP32,
+ .sv_set_syscall_retval = ia32_set_syscall_retval,
+ .sv_fetch_syscall_args = ia32_fetch_syscall_args,
+ .sv_syscallnames = freebsd32_syscallnames,
+};
+#else
+#error "Port me"
+#endif
+
+static int
+aout_fixup(register_t **stack_base, struct image_params *imgp)
+{
+
+ *(char **)stack_base -= sizeof(uint32_t);
+ return (suword32(*stack_base, imgp->args->argc));
+}
+
+static int
+exec_aout_imgact(struct image_params *imgp)
+{
+ const struct exec *a_out = (const struct exec *) imgp->image_header;
+ struct vmspace *vmspace;
+ vm_map_t map;
+ vm_object_t object;
+ vm_offset_t text_end, data_end;
+ unsigned long virtual_offset;
+ unsigned long file_offset;
+ unsigned long bss_size;
+ int error;
+
+ /*
+ * Linux and *BSD binaries look very much alike,
+ * only the machine id is different:
+ * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI.
+ * NetBSD is in network byte order.. ugh.
+ */
+ if (((a_out->a_midmag >> 16) & 0xff) != 0x86 &&
+ ((a_out->a_midmag >> 16) & 0xff) != 0 &&
+ ((((int)ntohl(a_out->a_midmag)) >> 16) & 0xff) != 0x86)
+ return -1;
+
+ /*
+ * Set file/virtual offset based on a.out variant.
+ * We do two cases: host byte order and network byte order
+ * (for NetBSD compatibility)
+ */
+ switch ((int)(a_out->a_midmag & 0xffff)) {
+ case ZMAGIC:
+ virtual_offset = 0;
+ if (a_out->a_text) {
+ file_offset = PAGE_SIZE;
+ } else {
+ /* Bill's "screwball mode" */
+ file_offset = 0;
+ }
+ break;
+ case QMAGIC:
+ virtual_offset = PAGE_SIZE;
+ file_offset = 0;
+ /* Pass PS_STRINGS for BSD/OS binaries only. */
+ if (N_GETMID(*a_out) == MID_ZERO)
+ imgp->ps_strings = aout_sysvec.sv_psstrings;
+ break;
+ default:
+ /* NetBSD compatibility */
+ switch ((int)(ntohl(a_out->a_midmag) & 0xffff)) {
+ case ZMAGIC:
+ case QMAGIC:
+ virtual_offset = PAGE_SIZE;
+ file_offset = 0;
+ break;
+ default:
+ return (-1);
+ }
+ }
+
+ bss_size = roundup(a_out->a_bss, PAGE_SIZE);
+
+ /*
+ * Check various fields in header for validity/bounds.
+ */
+	if (/* entry point must lie within the text region */
+ a_out->a_entry < virtual_offset ||
+ a_out->a_entry >= virtual_offset + a_out->a_text ||
+
+ /* text and data size must each be page rounded */
+ a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK
+
+#ifdef __amd64__
+ ||
+ /* overflows */
+ virtual_offset + a_out->a_text + a_out->a_data + bss_size > UINT_MAX
+#endif
+ )
+ return (-1);
+
+ /* text + data can't exceed file size */
+ if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
+ return (EFAULT);
+
+ /*
+ * text/data/bss must not exceed limits
+ */
+ PROC_LOCK(imgp->proc);
+ if (/* text can't exceed maximum text size */
+ a_out->a_text > maxtsiz ||
+
+ /* data + bss can't exceed rlimit */
+ a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA) ||
+ racct_set(imgp->proc, RACCT_DATA, a_out->a_data + bss_size) != 0) {
+ PROC_UNLOCK(imgp->proc);
+ return (ENOMEM);
+ }
+ PROC_UNLOCK(imgp->proc);
+
+ /*
+ * Avoid a possible deadlock if the current address space is destroyed
+ * and that address space maps the locked vnode. In the common case,
+ * the locked vnode's v_usecount is decremented but remains greater
+ * than zero. Consequently, the vnode lock is not needed by vrele().
+ * However, in cases where the vnode lock is external, such as nullfs,
+ * v_usecount may become zero.
+ */
+ VOP_UNLOCK(imgp->vp, 0);
+
+ /*
+ * Destroy old process VM and create a new one (with a new stack)
+ */
+ error = exec_new_vmspace(imgp, &aout_sysvec);
+
+ vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+ if (error)
+ return (error);
+
+ /*
+ * The vm space can be changed by exec_new_vmspace
+ */
+ vmspace = imgp->proc->p_vmspace;
+
+ object = imgp->object;
+ map = &vmspace->vm_map;
+ vm_map_lock(map);
+ vm_object_reference(object);
+
+ text_end = virtual_offset + a_out->a_text;
+ error = vm_map_insert(map, object,
+ file_offset,
+ virtual_offset, text_end,
+ VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
+ MAP_COPY_ON_WRITE | MAP_PREFAULT);
+ if (error) {
+ vm_map_unlock(map);
+ vm_object_deallocate(object);
+ return (error);
+ }
+ data_end = text_end + a_out->a_data;
+ if (a_out->a_data) {
+ vm_object_reference(object);
+ error = vm_map_insert(map, object,
+ file_offset + a_out->a_text,
+ text_end, data_end,
+ VM_PROT_ALL, VM_PROT_ALL,
+ MAP_COPY_ON_WRITE | MAP_PREFAULT);
+ if (error) {
+ vm_map_unlock(map);
+ vm_object_deallocate(object);
+ return (error);
+ }
+ }
+
+ if (bss_size) {
+ error = vm_map_insert(map, NULL, 0,
+ data_end, data_end + bss_size,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error) {
+ vm_map_unlock(map);
+ return (error);
+ }
+ }
+ vm_map_unlock(map);
+
+ /* Fill in process VM information */
+ vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT;
+ vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT;
+ vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset;
+ vmspace->vm_daddr = (caddr_t) (uintptr_t)
+ (virtual_offset + a_out->a_text);
+
+ /* Fill in image_params */
+ imgp->interpreted = 0;
+ imgp->entry_addr = a_out->a_entry;
+
+ imgp->proc->p_sysent = &aout_sysvec;
+
+ return (0);
+}
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ */
+static struct execsw aout_execsw = { exec_aout_imgact, "a.out" };
+EXEC_SET(aout, aout_execsw);
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
new file mode 100644
index 0000000..61a2aef
--- /dev/null
+++ b/sys/kern/imgact_elf.c
@@ -0,0 +1,2135 @@
+/*-
+ * Copyright (c) 2000 David O'Brien
+ * Copyright (c) 1995-1996 Søren Schmidt
+ * Copyright (c) 1996 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_compat.h"
+#include "opt_core.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/exec.h>
+#include <sys/fcntl.h>
+#include <sys/imgact.h>
+#include <sys/imgact_elf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mman.h>
+#include <sys/namei.h>
+#include <sys/pioctl.h>
+#include <sys/proc.h>
+#include <sys/procfs.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/sf_buf.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+#include <sys/signalvar.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/syscall.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/vnode.h>
+#include <sys/syslog.h>
+#include <sys/eventhandler.h>
+#include <sys/user.h>
+
+#include <net/zlib.h>
+
+#include <vm/vm.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+
+#include <machine/elf.h>
+#include <machine/md_var.h>
+
+#define ELF_NOTE_ROUNDSIZE 4
+#define OLD_EI_BRAND 8
+
+static int __elfN(check_header)(const Elf_Ehdr *hdr);
+static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp,
+ const char *interp, int interp_name_len, int32_t *osrel);
+static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
+ u_long *entry, size_t pagesize);
+static int __elfN(load_section)(struct image_params *imgp, vm_offset_t offset,
+ caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
+ size_t pagesize);
+static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp);
+static boolean_t __elfN(freebsd_trans_osrel)(const Elf_Note *note,
+ int32_t *osrel);
+static boolean_t kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel);
+static boolean_t __elfN(check_note)(struct image_params *imgp,
+ Elf_Brandnote *checknote, int32_t *osrel);
+static vm_prot_t __elfN(trans_prot)(Elf_Word);
+static Elf_Word __elfN(untrans_prot)(vm_prot_t);
+
+SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0,
+ "");
+
+#ifdef COMPRESS_USER_CORES
+static int compress_core(gzFile, char *, char *, unsigned int,
+ struct thread * td);
+#endif
+#define CORE_BUF_SIZE (16 * 1024)
+
+int __elfN(fallback_brand) = -1;
+SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
+ fallback_brand, CTLFLAG_RW, &__elfN(fallback_brand), 0,
+ __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort");
+TUNABLE_INT("kern.elf" __XSTRING(__ELF_WORD_SIZE) ".fallback_brand",
+ &__elfN(fallback_brand));
+
+static int elf_legacy_coredump = 0;
+SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW,
+ &elf_legacy_coredump, 0, "");
+
+int __elfN(nxstack) =
+#if defined(__amd64__) || defined(__powerpc64__) /* both 64 and 32 bit */
+ 1;
+#else
+ 0;
+#endif
+SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
+ nxstack, CTLFLAG_RW, &__elfN(nxstack), 0,
+ __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable non-executable stack");
+
+#if __ELF_WORD_SIZE == 32
+#if defined(__amd64__) || defined(__ia64__)
+int i386_read_exec = 0;
+SYSCTL_INT(_kern_elf32, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0,
+ "enable execution from readable segments");
+#endif
+#endif
+
+static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];
+
+#define trunc_page_ps(va, ps) ((va) & ~(ps - 1))
+#define round_page_ps(va, ps) (((va) + (ps - 1)) & ~(ps - 1))
+#define aligned(a, t) (trunc_page_ps((u_long)(a), sizeof(t)) == (u_long)(a))
+
+static const char FREEBSD_ABI_VENDOR[] = "FreeBSD";
+
+Elf_Brandnote __elfN(freebsd_brandnote) = {
+ .hdr.n_namesz = sizeof(FREEBSD_ABI_VENDOR),
+ .hdr.n_descsz = sizeof(int32_t),
+ .hdr.n_type = 1,
+ .vendor = FREEBSD_ABI_VENDOR,
+ .flags = BN_TRANSLATE_OSREL,
+ .trans_osrel = __elfN(freebsd_trans_osrel)
+};
+
+static boolean_t
+__elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel)
+{
+ uintptr_t p;
+
+ p = (uintptr_t)(note + 1);
+ p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);
+ *osrel = *(const int32_t *)(p);
+
+ return (TRUE);
+}
+
+static const char GNU_ABI_VENDOR[] = "GNU";
+static int GNU_KFREEBSD_ABI_DESC = 3;
+
+Elf_Brandnote __elfN(kfreebsd_brandnote) = {
+ .hdr.n_namesz = sizeof(GNU_ABI_VENDOR),
+ .hdr.n_descsz = 16, /* XXX at least 16 */
+ .hdr.n_type = 1,
+ .vendor = GNU_ABI_VENDOR,
+ .flags = BN_TRANSLATE_OSREL,
+ .trans_osrel = kfreebsd_trans_osrel
+};
+
+static boolean_t
+kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel)
+{
+ const Elf32_Word *desc;
+ uintptr_t p;
+
+ p = (uintptr_t)(note + 1);
+ p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);
+
+ desc = (const Elf32_Word *)p;
+ if (desc[0] != GNU_KFREEBSD_ABI_DESC)
+ return (FALSE);
+
+ /*
+	 * Debian GNU/kFreeBSD embeds the earliest compatible kernel version
+ * (__FreeBSD_version: <major><two digit minor>Rxx) in the LSB way.
+ */
+ *osrel = desc[1] * 100000 + desc[2] * 1000 + desc[3];
+
+ return (TRUE);
+}
+
+int
+__elfN(insert_brand_entry)(Elf_Brandinfo *entry)
+{
+ int i;
+
+ for (i = 0; i < MAX_BRANDS; i++) {
+ if (elf_brand_list[i] == NULL) {
+ elf_brand_list[i] = entry;
+ break;
+ }
+ }
+ if (i == MAX_BRANDS) {
+ printf("WARNING: %s: could not insert brandinfo entry: %p\n",
+ __func__, entry);
+ return (-1);
+ }
+ return (0);
+}
+
+int
+__elfN(remove_brand_entry)(Elf_Brandinfo *entry)
+{
+ int i;
+
+ for (i = 0; i < MAX_BRANDS; i++) {
+ if (elf_brand_list[i] == entry) {
+ elf_brand_list[i] = NULL;
+ break;
+ }
+ }
+ if (i == MAX_BRANDS)
+ return (-1);
+ return (0);
+}
+
+int
+__elfN(brand_inuse)(Elf_Brandinfo *entry)
+{
+ struct proc *p;
+ int rval = FALSE;
+
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_sysent == entry->sysvec) {
+ rval = TRUE;
+ break;
+ }
+ }
+ sx_sunlock(&allproc_lock);
+
+ return (rval);
+}
+
+static Elf_Brandinfo *
+__elfN(get_brandinfo)(struct image_params *imgp, const char *interp,
+ int interp_name_len, int32_t *osrel)
+{
+ const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
+ Elf_Brandinfo *bi;
+ boolean_t ret;
+ int i;
+
+ /*
+ * We support four types of branding -- (1) the ELF EI_OSABI field
+ * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string
+	 * branding within the ELF header, (3) the ELF interpreter path,
+	 * matched against each brand's `interp_path', and (4) the
+	 * ".note.ABI-tag" ELF section.
+ */
+
+ /* Look for an ".note.ABI-tag" ELF section */
+ for (i = 0; i < MAX_BRANDS; i++) {
+ bi = elf_brand_list[i];
+ if (bi == NULL)
+ continue;
+ if (hdr->e_machine == bi->machine && (bi->flags &
+ (BI_BRAND_NOTE|BI_BRAND_NOTE_MANDATORY)) != 0) {
+ ret = __elfN(check_note)(imgp, bi->brand_note, osrel);
+ if (ret)
+ return (bi);
+ }
+ }
+
+ /* If the executable has a brand, search for it in the brand list. */
+ for (i = 0; i < MAX_BRANDS; i++) {
+ bi = elf_brand_list[i];
+ if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY)
+ continue;
+ if (hdr->e_machine == bi->machine &&
+ (hdr->e_ident[EI_OSABI] == bi->brand ||
+ strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
+ bi->compat_3_brand, strlen(bi->compat_3_brand)) == 0))
+ return (bi);
+ }
+
+ /* Lacking a known brand, search for a recognized interpreter. */
+ if (interp != NULL) {
+ for (i = 0; i < MAX_BRANDS; i++) {
+ bi = elf_brand_list[i];
+ if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY)
+ continue;
+ if (hdr->e_machine == bi->machine &&
+ /* ELF image p_filesz includes terminating zero */
+ strlen(bi->interp_path) + 1 == interp_name_len &&
+ strncmp(interp, bi->interp_path, interp_name_len)
+ == 0)
+ return (bi);
+ }
+ }
+
+ /* Lacking a recognized interpreter, try the default brand */
+ for (i = 0; i < MAX_BRANDS; i++) {
+ bi = elf_brand_list[i];
+ if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY)
+ continue;
+ if (hdr->e_machine == bi->machine &&
+ __elfN(fallback_brand) == bi->brand)
+ return (bi);
+ }
+ return (NULL);
+}
+
+static int
+__elfN(check_header)(const Elf_Ehdr *hdr)
+{
+ Elf_Brandinfo *bi;
+ int i;
+
+ if (!IS_ELF(*hdr) ||
+ hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
+ hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
+ hdr->e_ident[EI_VERSION] != EV_CURRENT ||
+ hdr->e_phentsize != sizeof(Elf_Phdr) ||
+ hdr->e_version != ELF_TARG_VER)
+ return (ENOEXEC);
+
+ /*
+ * Make sure we have at least one brand for this machine.
+ */
+
+ for (i = 0; i < MAX_BRANDS; i++) {
+ bi = elf_brand_list[i];
+ if (bi != NULL && bi->machine == hdr->e_machine)
+ break;
+ }
+ if (i == MAX_BRANDS)
+ return (ENOEXEC);
+
+ return (0);
+}
+
+static int
+__elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
+ vm_offset_t start, vm_offset_t end, vm_prot_t prot)
+{
+ struct sf_buf *sf;
+ int error;
+ vm_offset_t off;
+
+ /*
+ * Create the page if it doesn't exist yet. Ignore errors.
+ */
+ vm_map_lock(map);
+ vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end),
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ vm_map_unlock(map);
+
+ /*
+ * Find the page from the underlying object.
+ */
+ if (object) {
+ sf = vm_imgact_map_page(object, offset);
+ if (sf == NULL)
+ return (KERN_FAILURE);
+ off = offset - trunc_page(offset);
+ error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start,
+ end - start);
+ vm_imgact_unmap_page(sf);
+ if (error) {
+ return (KERN_FAILURE);
+ }
+ }
+
+ return (KERN_SUCCESS);
+}
+
+static int
+__elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
+ vm_offset_t start, vm_offset_t end, vm_prot_t prot, int cow)
+{
+ struct sf_buf *sf;
+ vm_offset_t off;
+ vm_size_t sz;
+ int error, rv;
+
+ if (start != trunc_page(start)) {
+ rv = __elfN(map_partial)(map, object, offset, start,
+ round_page(start), prot);
+ if (rv)
+ return (rv);
+ offset += round_page(start) - start;
+ start = round_page(start);
+ }
+ if (end != round_page(end)) {
+ rv = __elfN(map_partial)(map, object, offset +
+ trunc_page(end) - start, trunc_page(end), end, prot);
+ if (rv)
+ return (rv);
+ end = trunc_page(end);
+ }
+ if (end > start) {
+ if (offset & PAGE_MASK) {
+ /*
+ * The mapping is not page aligned. This means we have
+ * to copy the data. Sigh.
+ */
+ rv = vm_map_find(map, NULL, 0, &start, end - start,
+ FALSE, prot | VM_PROT_WRITE, VM_PROT_ALL, 0);
+ if (rv)
+ return (rv);
+ if (object == NULL)
+ return (KERN_SUCCESS);
+ for (; start < end; start += sz) {
+ sf = vm_imgact_map_page(object, offset);
+ if (sf == NULL)
+ return (KERN_FAILURE);
+ off = offset - trunc_page(offset);
+ sz = end - start;
+ if (sz > PAGE_SIZE - off)
+ sz = PAGE_SIZE - off;
+ error = copyout((caddr_t)sf_buf_kva(sf) + off,
+ (caddr_t)start, sz);
+ vm_imgact_unmap_page(sf);
+ if (error) {
+ return (KERN_FAILURE);
+ }
+ offset += sz;
+ }
+ rv = KERN_SUCCESS;
+ } else {
+ vm_object_reference(object);
+ vm_map_lock(map);
+ rv = vm_map_insert(map, object, offset, start, end,
+ prot, VM_PROT_ALL, cow);
+ vm_map_unlock(map);
+ if (rv != KERN_SUCCESS)
+ vm_object_deallocate(object);
+ }
+ return (rv);
+ } else {
+ return (KERN_SUCCESS);
+ }
+}
+
+static int
+__elfN(load_section)(struct image_params *imgp, vm_offset_t offset,
+ caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
+ size_t pagesize)
+{
+ struct sf_buf *sf;
+ size_t map_len;
+ vm_map_t map;
+ vm_object_t object;
+ vm_offset_t map_addr;
+ int error, rv, cow;
+ size_t copy_len;
+ vm_offset_t file_addr;
+
+ /*
+ * It's necessary to fail if the filsz + offset taken from the
+ * header is greater than the actual file pager object's size.
+ * If we were to allow this, then the vm_map_find() below would
+ * walk right off the end of the file object and into the ether.
+ *
+ * While I'm here, might as well check for something else that
+ * is invalid: filsz cannot be greater than memsz.
+ */
+ if ((off_t)filsz + offset > imgp->attr->va_size || filsz > memsz) {
+ uprintf("elf_load_section: truncated ELF file\n");
+ return (ENOEXEC);
+ }
+
+ object = imgp->object;
+ map = &imgp->proc->p_vmspace->vm_map;
+ map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize);
+ file_addr = trunc_page_ps(offset, pagesize);
+
+ /*
+ * We have two choices. We can either clear the data in the last page
+ * of an oversized mapping, or we can start the anon mapping a page
+ * early and copy the initialized data into that first page. We
+ * choose the second..
+ */
+ if (memsz > filsz)
+ map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr;
+ else
+ map_len = round_page_ps(offset + filsz, pagesize) - file_addr;
+
+ if (map_len != 0) {
+ /* cow flags: don't dump readonly sections in core */
+ cow = MAP_COPY_ON_WRITE | MAP_PREFAULT |
+ (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP);
+
+ rv = __elfN(map_insert)(map,
+ object,
+ file_addr, /* file offset */
+ map_addr, /* virtual start */
+ map_addr + map_len,/* virtual end */
+ prot,
+ cow);
+ if (rv != KERN_SUCCESS)
+ return (EINVAL);
+
+ /* we can stop now if we've covered it all */
+ if (memsz == filsz) {
+ return (0);
+ }
+ }
+
+
+ /*
+ * We have to get the remaining bit of the file into the first part
+ * of the oversized map segment. This is normally because the .data
+ * segment in the file is extended to provide bss. It's a neat idea
+ * to try and save a page, but it's a pain in the behind to implement.
+ */
+ copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize);
+ map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize);
+ map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) -
+ map_addr;
+
+ /* This had damn well better be true! */
+ if (map_len != 0) {
+ rv = __elfN(map_insert)(map, NULL, 0, map_addr, map_addr +
+ map_len, VM_PROT_ALL, 0);
+ if (rv != KERN_SUCCESS) {
+ return (EINVAL);
+ }
+ }
+
+ if (copy_len != 0) {
+ vm_offset_t off;
+
+ sf = vm_imgact_map_page(object, offset + filsz);
+ if (sf == NULL)
+ return (EIO);
+
+ /* send the page fragment to user space */
+ off = trunc_page_ps(offset + filsz, pagesize) -
+ trunc_page(offset + filsz);
+ error = copyout((caddr_t)sf_buf_kva(sf) + off,
+ (caddr_t)map_addr, copy_len);
+ vm_imgact_unmap_page(sf);
+ if (error) {
+ return (error);
+ }
+ }
+
+ /*
+ * set it to the specified protection.
+ * XXX had better undo the damage from pasting over the cracks here!
+ */
+ vm_map_protect(map, trunc_page(map_addr), round_page(map_addr +
+ map_len), prot, FALSE);
+
+ return (0);
+}
+
+/*
+ * Load the file "file" into memory. It may be either a shared object
+ * or an executable.
+ *
+ * The "addr" reference parameter is in/out. On entry, it specifies
+ * the address where a shared object should be loaded. If the file is
+ * an executable, this value is ignored. On exit, "addr" specifies
+ * where the file was actually loaded.
+ *
+ * The "entry" reference parameter is out only. On exit, it specifies
+ * the entry point for the loaded file.
+ */
+static int
+__elfN(load_file)(struct proc *p, const char *file, u_long *addr,
+ u_long *entry, size_t pagesize)
+{
+ struct {
+ struct nameidata nd;
+ struct vattr attr;
+ struct image_params image_params;
+ } *tempdata;
+ const Elf_Ehdr *hdr = NULL;
+ const Elf_Phdr *phdr = NULL;
+ struct nameidata *nd;
+ struct vattr *attr;
+ struct image_params *imgp;
+ vm_prot_t prot;
+ u_long rbase;
+ u_long base_addr = 0;
+ int error, i, numsegs;
+
+#ifdef CAPABILITY_MODE
+ /*
+ * XXXJA: This check can go away once we are sufficiently confident
+ * that the checks in namei() are correct.
+ */
+ if (IN_CAPABILITY_MODE(curthread))
+ return (ECAPMODE);
+#endif
+
+ tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK);
+ nd = &tempdata->nd;
+ attr = &tempdata->attr;
+ imgp = &tempdata->image_params;
+
+ /*
+ * Initialize part of the common data
+ */
+ imgp->proc = p;
+ imgp->attr = attr;
+ imgp->firstpage = NULL;
+ imgp->image_header = NULL;
+ imgp->object = NULL;
+ imgp->execlabel = NULL;
+
+ NDINIT(nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, file, curthread);
+ if ((error = namei(nd)) != 0) {
+ nd->ni_vp = NULL;
+ goto fail;
+ }
+ NDFREE(nd, NDF_ONLY_PNBUF);
+ imgp->vp = nd->ni_vp;
+
+ /*
+ * Check permissions, modes, uid, etc on the file, and "open" it.
+ */
+ error = exec_check_permissions(imgp);
+ if (error)
+ goto fail;
+
+ error = exec_map_first_page(imgp);
+ if (error)
+ goto fail;
+
+ /*
+ * Also make certain that the interpreter stays the same, so set
+ * its VV_TEXT flag, too.
+ */
+ VOP_SET_TEXT(nd->ni_vp);
+
+ imgp->object = nd->ni_vp->v_object;
+
+ hdr = (const Elf_Ehdr *)imgp->image_header;
+ if ((error = __elfN(check_header)(hdr)) != 0)
+ goto fail;
+ if (hdr->e_type == ET_DYN)
+ rbase = *addr;
+ else if (hdr->e_type == ET_EXEC)
+ rbase = 0;
+ else {
+ error = ENOEXEC;
+ goto fail;
+ }
+
+ /* Only support headers that fit within first page for now */
+ if ((hdr->e_phoff > PAGE_SIZE) ||
+ (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) {
+ error = ENOEXEC;
+ goto fail;
+ }
+
+ phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
+ if (!aligned(phdr, Elf_Addr)) {
+ error = ENOEXEC;
+ goto fail;
+ }
+
+ for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) {
+ if (phdr[i].p_type == PT_LOAD && phdr[i].p_memsz != 0) {
+ /* Loadable segment */
+ prot = __elfN(trans_prot)(phdr[i].p_flags);
+ error = __elfN(load_section)(imgp, phdr[i].p_offset,
+ (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase,
+ phdr[i].p_memsz, phdr[i].p_filesz, prot, pagesize);
+ if (error != 0)
+ goto fail;
+ /*
+ * Establish the base address if this is the
+ * first segment.
+ */
+ if (numsegs == 0)
+ base_addr = trunc_page(phdr[i].p_vaddr +
+ rbase);
+ numsegs++;
+ }
+ }
+ *addr = base_addr;
+ *entry = (unsigned long)hdr->e_entry + rbase;
+
+fail:
+ if (imgp->firstpage)
+ exec_unmap_first_page(imgp);
+
+ if (nd->ni_vp)
+ vput(nd->ni_vp);
+
+ free(tempdata, M_TEMP);
+
+ return (error);
+}
+
+static int
+__CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
+{
+ const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
+ const Elf_Phdr *phdr;
+ Elf_Auxargs *elf_auxargs;
+ struct vmspace *vmspace;
+ vm_prot_t prot;
+ u_long text_size = 0, data_size = 0, total_size = 0;
+ u_long text_addr = 0, data_addr = 0;
+ u_long seg_size, seg_addr;
+ u_long addr, baddr, et_dyn_addr, entry = 0, proghdr = 0;
+ int32_t osrel = 0;
+ int error = 0, i, n, interp_name_len = 0;
+ const char *interp = NULL, *newinterp = NULL;
+ Elf_Brandinfo *brand_info;
+ char *path;
+ struct sysentvec *sv;
+
+ /*
+ * Do we have a valid ELF header ?
+ *
+ * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later
+ * if particular brand doesn't support it.
+ */
+ if (__elfN(check_header)(hdr) != 0 ||
+ (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN))
+ return (-1);
+
+ /*
+ * From here on down, we return an errno, not -1, as we've
+ * detected an ELF file.
+ */
+
+ if ((hdr->e_phoff > PAGE_SIZE) ||
+ (u_int)hdr->e_phentsize * hdr->e_phnum > PAGE_SIZE - hdr->e_phoff) {
+ /* Only support headers in first page for now */
+ return (ENOEXEC);
+ }
+ phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
+ if (!aligned(phdr, Elf_Addr))
+ return (ENOEXEC);
+ n = 0;
+ baddr = 0;
+ for (i = 0; i < hdr->e_phnum; i++) {
+ switch (phdr[i].p_type) {
+ case PT_LOAD:
+ if (n == 0)
+ baddr = phdr[i].p_vaddr;
+ n++;
+ break;
+ case PT_INTERP:
+ /* Path to interpreter */
+ if (phdr[i].p_filesz > MAXPATHLEN ||
+ phdr[i].p_offset > PAGE_SIZE ||
+ phdr[i].p_filesz > PAGE_SIZE - phdr[i].p_offset)
+ return (ENOEXEC);
+ interp = imgp->image_header + phdr[i].p_offset;
+ interp_name_len = phdr[i].p_filesz;
+ break;
+ case PT_GNU_STACK:
+ if (__elfN(nxstack))
+ imgp->stack_prot =
+ __elfN(trans_prot)(phdr[i].p_flags);
+ break;
+ }
+ }
+
+ brand_info = __elfN(get_brandinfo)(imgp, interp, interp_name_len,
+ &osrel);
+ if (brand_info == NULL) {
+ uprintf("ELF binary type \"%u\" not known.\n",
+ hdr->e_ident[EI_OSABI]);
+ return (ENOEXEC);
+ }
+ if (hdr->e_type == ET_DYN) {
+ if ((brand_info->flags & BI_CAN_EXEC_DYN) == 0)
+ return (ENOEXEC);
+ /*
+ * Honour the base load address from the dso if it is
+ * non-zero for some reason.
+ */
+ if (baddr == 0)
+ et_dyn_addr = ET_DYN_LOAD_ADDR;
+ else
+ et_dyn_addr = 0;
+ } else
+ et_dyn_addr = 0;
+ sv = brand_info->sysvec;
+ if (interp != NULL && brand_info->interp_newpath != NULL)
+ newinterp = brand_info->interp_newpath;
+
+ /*
+ * Avoid a possible deadlock if the current address space is destroyed
+ * and that address space maps the locked vnode. In the common case,
+ * the locked vnode's v_usecount is decremented but remains greater
+ * than zero. Consequently, the vnode lock is not needed by vrele().
+ * However, in cases where the vnode lock is external, such as nullfs,
+ * v_usecount may become zero.
+ *
+ * The VV_TEXT flag prevents modifications to the executable while
+ * the vnode is unlocked.
+ */
+ VOP_UNLOCK(imgp->vp, 0);
+
+ error = exec_new_vmspace(imgp, sv);
+ imgp->proc->p_sysent = sv;
+
+ vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+ if (error)
+ return (error);
+
+ for (i = 0; i < hdr->e_phnum; i++) {
+ switch (phdr[i].p_type) {
+ case PT_LOAD: /* Loadable segment */
+ if (phdr[i].p_memsz == 0)
+ break;
+ prot = __elfN(trans_prot)(phdr[i].p_flags);
+ error = __elfN(load_section)(imgp, phdr[i].p_offset,
+ (caddr_t)(uintptr_t)phdr[i].p_vaddr + et_dyn_addr,
+ phdr[i].p_memsz, phdr[i].p_filesz, prot,
+ sv->sv_pagesize);
+ if (error != 0)
+ return (error);
+
+ /*
+ * If this segment contains the program headers,
+ * remember their virtual address for the AT_PHDR
+ * aux entry. Static binaries don't usually include
+ * a PT_PHDR entry.
+ */
+ if (phdr[i].p_offset == 0 &&
+ hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize
+ <= phdr[i].p_filesz)
+ proghdr = phdr[i].p_vaddr + hdr->e_phoff +
+ et_dyn_addr;
+
+ seg_addr = trunc_page(phdr[i].p_vaddr + et_dyn_addr);
+ seg_size = round_page(phdr[i].p_memsz +
+ phdr[i].p_vaddr + et_dyn_addr - seg_addr);
+
+ /*
+ * Make the largest executable segment the official
+ * text segment and all others data.
+ *
+ * Note that obreak() assumes that data_addr +
+ * data_size == end of data load area, and the ELF
+ * file format expects segments to be sorted by
+ * address. If multiple data segments exist, the
+ * last one will be used.
+ */
+
+ if (phdr[i].p_flags & PF_X && text_size < seg_size) {
+ text_size = seg_size;
+ text_addr = seg_addr;
+ } else {
+ data_size = seg_size;
+ data_addr = seg_addr;
+ }
+ total_size += seg_size;
+ break;
+ case PT_PHDR: /* Program header table info */
+ proghdr = phdr[i].p_vaddr + et_dyn_addr;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (data_addr == 0 && data_size == 0) {
+ data_addr = text_addr;
+ data_size = text_size;
+ }
+
+ entry = (u_long)hdr->e_entry + et_dyn_addr;
+
+ /*
+ * Check limits. It should be safe to check the
+ * limits after loading the segments since we do
+ * not actually fault in all the segments pages.
+ */
+ PROC_LOCK(imgp->proc);
+ if (data_size > lim_cur(imgp->proc, RLIMIT_DATA) ||
+ text_size > maxtsiz ||
+ total_size > lim_cur(imgp->proc, RLIMIT_VMEM) ||
+ racct_set(imgp->proc, RACCT_DATA, data_size) != 0 ||
+ racct_set(imgp->proc, RACCT_VMEM, total_size) != 0) {
+ PROC_UNLOCK(imgp->proc);
+ return (ENOMEM);
+ }
+
+ vmspace = imgp->proc->p_vmspace;
+ vmspace->vm_tsize = text_size >> PAGE_SHIFT;
+ vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
+ vmspace->vm_dsize = data_size >> PAGE_SHIFT;
+ vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;
+
+ /*
+ * We load the dynamic linker where a userland call
+ * to mmap(0, ...) would put it. The rationale behind this
+ * calculation is that it leaves room for the heap to grow to
+ * its maximum allowed size.
+ */
+ addr = round_page((vm_offset_t)vmspace->vm_daddr + lim_max(imgp->proc,
+ RLIMIT_DATA));
+ PROC_UNLOCK(imgp->proc);
+
+ imgp->entry_addr = entry;
+
+ if (interp != NULL) {
+ int have_interp = FALSE;
+ VOP_UNLOCK(imgp->vp, 0);
+ if (brand_info->emul_path != NULL &&
+ brand_info->emul_path[0] != '\0') {
+ path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ snprintf(path, MAXPATHLEN, "%s%s",
+ brand_info->emul_path, interp);
+ error = __elfN(load_file)(imgp->proc, path, &addr,
+ &imgp->entry_addr, sv->sv_pagesize);
+ free(path, M_TEMP);
+ if (error == 0)
+ have_interp = TRUE;
+ }
+ if (!have_interp && newinterp != NULL) {
+ error = __elfN(load_file)(imgp->proc, newinterp, &addr,
+ &imgp->entry_addr, sv->sv_pagesize);
+ if (error == 0)
+ have_interp = TRUE;
+ }
+ if (!have_interp) {
+ error = __elfN(load_file)(imgp->proc, interp, &addr,
+ &imgp->entry_addr, sv->sv_pagesize);
+ }
+ vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+ if (error != 0) {
+ uprintf("ELF interpreter %s not found\n", interp);
+ return (error);
+ }
+ } else
+ addr = et_dyn_addr;
+
+ /*
+ * Construct auxargs table (used by the fixup routine)
+ */
+ elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
+ elf_auxargs->execfd = -1;
+ elf_auxargs->phdr = proghdr;
+ elf_auxargs->phent = hdr->e_phentsize;
+ elf_auxargs->phnum = hdr->e_phnum;
+ elf_auxargs->pagesz = PAGE_SIZE;
+ elf_auxargs->base = addr;
+ elf_auxargs->flags = 0;
+ elf_auxargs->entry = entry;
+
+ imgp->auxargs = elf_auxargs;
+ imgp->interpreted = 0;
+ imgp->reloc_base = addr;
+ imgp->proc->p_osrel = osrel;
+
+ return (error);
+}
+
+#define suword __CONCAT(suword, __ELF_WORD_SIZE)
+
+int
+__elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp)
+{
+ Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs;
+ Elf_Addr *base;
+ Elf_Addr *pos;
+
+ base = (Elf_Addr *)*stack_base;
+ pos = base + (imgp->args->argc + imgp->args->envc + 2);
+
+ if (args->execfd != -1)
+ AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
+ AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
+ AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
+ AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
+ AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
+ AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
+ AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
+ AUXARGS_ENTRY(pos, AT_BASE, args->base);
+ if (imgp->execpathp != 0)
+ AUXARGS_ENTRY(pos, AT_EXECPATH, imgp->execpathp);
+ AUXARGS_ENTRY(pos, AT_OSRELDATE, osreldate);
+ if (imgp->canary != 0) {
+ AUXARGS_ENTRY(pos, AT_CANARY, imgp->canary);
+ AUXARGS_ENTRY(pos, AT_CANARYLEN, imgp->canarylen);
+ }
+ AUXARGS_ENTRY(pos, AT_NCPUS, mp_ncpus);
+ if (imgp->pagesizes != 0) {
+ AUXARGS_ENTRY(pos, AT_PAGESIZES, imgp->pagesizes);
+ AUXARGS_ENTRY(pos, AT_PAGESIZESLEN, imgp->pagesizeslen);
+ }
+ if (imgp->sysent->sv_timekeep_base != 0) {
+ AUXARGS_ENTRY(pos, AT_TIMEKEEP,
+ imgp->sysent->sv_timekeep_base);
+ }
+ AUXARGS_ENTRY(pos, AT_STACKPROT, imgp->sysent->sv_shared_page_obj
+ != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
+ imgp->sysent->sv_stackprot);
+ AUXARGS_ENTRY(pos, AT_NULL, 0);
+
+ free(imgp->auxargs, M_TEMP);
+ imgp->auxargs = NULL;
+
+ base--;
+ suword(base, (long)imgp->args->argc);
+ *stack_base = (register_t *)base;
+ return (0);
+}
+
+/*
+ * Code for generating ELF core dumps.
+ */
+
+typedef void (*segment_callback)(vm_map_entry_t, void *);
+
+/* Closure for cb_put_phdr(). */
+struct phdr_closure {
+ Elf_Phdr *phdr; /* Program header to fill in */
+ Elf_Off offset; /* Offset of segment in core file */
+};
+
+/* Closure for cb_size_segment(). */
+struct sseg_closure {
+ int count; /* Count of writable segments. */
+ size_t size; /* Total size of all writable segments. */
+};
+
+typedef void (*outfunc_t)(void *, struct sbuf *, size_t *);
+
+struct note_info {
+ int type; /* Note type. */
+ outfunc_t outfunc; /* Output function. */
+ void *outarg; /* Argument for the output function. */
+ size_t outsize; /* Output size. */
+ TAILQ_ENTRY(note_info) link; /* Link to the next note info. */
+};
+
+TAILQ_HEAD(note_info_list, note_info);
+
+static void cb_put_phdr(vm_map_entry_t, void *);
+static void cb_size_segment(vm_map_entry_t, void *);
+static void each_writable_segment(struct thread *, segment_callback, void *);
+static int __elfN(corehdr)(struct thread *, struct vnode *, struct ucred *,
+ int, void *, size_t, struct note_info_list *, size_t, gzFile);
+static void __elfN(prepare_notes)(struct thread *, struct note_info_list *,
+ size_t *);
+static void __elfN(puthdr)(struct thread *, void *, size_t, int, size_t);
+static void __elfN(putnote)(struct note_info *, struct sbuf *);
+static size_t register_note(struct note_info_list *, int, outfunc_t, void *);
+static int sbuf_drain_core_output(void *, const char *, int);
+static int sbuf_drain_count(void *arg, const char *data, int len);
+
+static void __elfN(note_fpregset)(void *, struct sbuf *, size_t *);
+static void __elfN(note_prpsinfo)(void *, struct sbuf *, size_t *);
+static void __elfN(note_prstatus)(void *, struct sbuf *, size_t *);
+static void __elfN(note_threadmd)(void *, struct sbuf *, size_t *);
+static void __elfN(note_thrmisc)(void *, struct sbuf *, size_t *);
+static void __elfN(note_procstat_auxv)(void *, struct sbuf *, size_t *);
+static void __elfN(note_procstat_proc)(void *, struct sbuf *, size_t *);
+static void __elfN(note_procstat_psstrings)(void *, struct sbuf *, size_t *);
+static void note_procstat_files(void *, struct sbuf *, size_t *);
+static void note_procstat_groups(void *, struct sbuf *, size_t *);
+static void note_procstat_osrel(void *, struct sbuf *, size_t *);
+static void note_procstat_rlimit(void *, struct sbuf *, size_t *);
+static void note_procstat_umask(void *, struct sbuf *, size_t *);
+static void note_procstat_vmmap(void *, struct sbuf *, size_t *);
+
+#ifdef COMPRESS_USER_CORES
+extern int compress_user_cores;
+extern int compress_user_cores_gzlevel;
+#endif
+
+static int
+core_output(struct vnode *vp, void *base, size_t len, off_t offset,
+ struct ucred *active_cred, struct ucred *file_cred,
+    struct thread *td, char *core_buf, gzFile gzfile)
+{
+	int error;
+
+ if (gzfile) {
+#ifdef COMPRESS_USER_CORES
+ error = compress_core(gzfile, base, core_buf, len, td);
+#else
+ panic("shouldn't be here");
+#endif
+ } else {
+ error = vn_rdwr_inchunks(UIO_WRITE, vp, base, len, offset,
+ UIO_USERSPACE, IO_UNIT | IO_DIRECT, active_cred, file_cred,
+ NULL, td);
+ }
+ return (error);
+}
+
+/* Coredump output parameters for sbuf drain routine. */
+struct sbuf_drain_core_params {
+ off_t offset;
+ struct ucred *active_cred;
+ struct ucred *file_cred;
+ struct thread *td;
+ struct vnode *vp;
+#ifdef COMPRESS_USER_CORES
+ gzFile gzfile;
+#endif
+};
+
+/*
+ * Drain into a core file.
+ */
+static int
+sbuf_drain_core_output(void *arg, const char *data, int len)
+{
+ struct sbuf_drain_core_params *p;
+ int error, locked;
+
+ p = (struct sbuf_drain_core_params *)arg;
+
+ /*
+ * Some kern_proc out routines that print to this sbuf may
+ * call us with the process lock held. Draining with the
+ * non-sleepable lock held is unsafe. The lock is needed for
+ * those routines when dumping a live process. In our case we
+ * can safely release the lock before draining and acquire
+ * again after.
+ */
+ locked = PROC_LOCKED(p->td->td_proc);
+ if (locked)
+ PROC_UNLOCK(p->td->td_proc);
+#ifdef COMPRESS_USER_CORES
+ if (p->gzfile != Z_NULL)
+ error = compress_core(p->gzfile, NULL, __DECONST(char *, data),
+ len, p->td);
+ else
+#endif
+ error = vn_rdwr_inchunks(UIO_WRITE, p->vp,
+ __DECONST(void *, data), len, p->offset, UIO_SYSSPACE,
+ IO_UNIT | IO_DIRECT, p->active_cred, p->file_cred, NULL,
+ p->td);
+ if (locked)
+ PROC_LOCK(p->td->td_proc);
+ if (error != 0)
+ return (-error);
+ p->offset += len;
+ return (len);
+}
+
+/*
+ * Drain into a counter.
+ */
+static int
+sbuf_drain_count(void *arg, const char *data __unused, int len)
+{
+ size_t *sizep;
+
+ sizep = (size_t *)arg;
+ *sizep += len;
+ return (len);
+}
+
+int
+__elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags)
+{
+ struct ucred *cred = td->td_ucred;
+ int error = 0;
+ struct sseg_closure seginfo;
+ struct note_info_list notelst;
+ struct note_info *ninfo;
+ void *hdr;
+ size_t hdrsize, notesz, coresize;
+
+ gzFile gzfile = Z_NULL;
+ char *core_buf = NULL;
+#ifdef COMPRESS_USER_CORES
+ char gzopen_flags[8];
+ char *p;
+ int doing_compress = flags & IMGACT_CORE_COMPRESS;
+#endif
+
+ hdr = NULL;
+ TAILQ_INIT(&notelst);
+
+#ifdef COMPRESS_USER_CORES
+ if (doing_compress) {
+ p = gzopen_flags;
+ *p++ = 'w';
+ if (compress_user_cores_gzlevel >= 0 &&
+ compress_user_cores_gzlevel <= 9)
+ *p++ = '0' + compress_user_cores_gzlevel;
+ *p = 0;
+ gzfile = gz_open("", gzopen_flags, vp);
+ if (gzfile == Z_NULL) {
+ error = EFAULT;
+ goto done;
+ }
+ core_buf = malloc(CORE_BUF_SIZE, M_TEMP, M_WAITOK | M_ZERO);
+ if (!core_buf) {
+ error = ENOMEM;
+ goto done;
+ }
+ }
+#endif
+
+ /* Size the program segments. */
+ seginfo.count = 0;
+ seginfo.size = 0;
+ each_writable_segment(td, cb_size_segment, &seginfo);
+
+ /*
+ * Collect info about the core file header area.
+ */
+ hdrsize = sizeof(Elf_Ehdr) + sizeof(Elf_Phdr) * (1 + seginfo.count);
+ __elfN(prepare_notes)(td, &notelst, &notesz);
+ coresize = round_page(hdrsize + notesz) + seginfo.size;
+
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ error = racct_add(td->td_proc, RACCT_CORE, coresize);
+ PROC_UNLOCK(td->td_proc);
+ if (error != 0) {
+ error = EFAULT;
+ goto done;
+ }
+#endif
+ if (coresize >= limit) {
+ error = EFAULT;
+ goto done;
+ }
+
+ /*
+ * Allocate memory for building the header, fill it up,
+ * and write it out following the notes.
+ */
+ hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
+ if (hdr == NULL) {
+ error = EINVAL;
+ goto done;
+ }
+ error = __elfN(corehdr)(td, vp, cred, seginfo.count, hdr, hdrsize,
+ &notelst, notesz, gzfile);
+
+ /* Write the contents of all of the writable segments. */
+ if (error == 0) {
+ Elf_Phdr *php;
+ off_t offset;
+ int i;
+
+ php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
+ offset = round_page(hdrsize + notesz);
+ for (i = 0; i < seginfo.count; i++) {
+ error = core_output(vp, (caddr_t)(uintptr_t)php->p_vaddr,
+ php->p_filesz, offset, cred, NOCRED, curthread, core_buf, gzfile);
+ if (error != 0)
+ break;
+ offset += php->p_filesz;
+ php++;
+ }
+ }
+ if (error) {
+ log(LOG_WARNING,
+ "Failed to write core file for process %s (error %d)\n",
+ curproc->p_comm, error);
+ }
+
+done:
+#ifdef COMPRESS_USER_CORES
+ if (core_buf)
+ free(core_buf, M_TEMP);
+ if (gzfile)
+ gzclose(gzfile);
+#endif
+ while ((ninfo = TAILQ_FIRST(&notelst)) != NULL) {
+ TAILQ_REMOVE(&notelst, ninfo, link);
+ free(ninfo, M_TEMP);
+ }
+ if (hdr != NULL)
+ free(hdr, M_TEMP);
+
+ return (error);
+}
+
+/*
+ * A callback for each_writable_segment() to write out the segment's
+ * program header entry.
+ */
+static void
+cb_put_phdr(entry, closure)
+ vm_map_entry_t entry;
+ void *closure;
+{
+ struct phdr_closure *phc = (struct phdr_closure *)closure;
+ Elf_Phdr *phdr = phc->phdr;
+
+ phc->offset = round_page(phc->offset);
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_offset = phc->offset;
+ phdr->p_vaddr = entry->start;
+ phdr->p_paddr = 0;
+ phdr->p_filesz = phdr->p_memsz = entry->end - entry->start;
+ phdr->p_align = PAGE_SIZE;
+ phdr->p_flags = __elfN(untrans_prot)(entry->protection);
+
+ phc->offset += phdr->p_filesz;
+ phc->phdr++;
+}
+
+/*
+ * A callback for each_writable_segment() to gather information about
+ * the number of segments and their total size.
+ */
+static void
+cb_size_segment(entry, closure)
+ vm_map_entry_t entry;
+ void *closure;
+{
+ struct sseg_closure *ssc = (struct sseg_closure *)closure;
+
+ ssc->count++;
+ ssc->size += entry->end - entry->start;
+}
+
+/*
+ * For each writable segment in the process's memory map, call the given
+ * function with a pointer to the map entry and some arbitrary
+ * caller-supplied data.
+ */
+static void
+each_writable_segment(td, func, closure)
+ struct thread *td;
+ segment_callback func;
+ void *closure;
+{
+ struct proc *p = td->td_proc;
+ vm_map_t map = &p->p_vmspace->vm_map;
+ vm_map_entry_t entry;
+ vm_object_t backing_object, object;
+ boolean_t ignore_entry;
+
+ vm_map_lock_read(map);
+ for (entry = map->header.next; entry != &map->header;
+ entry = entry->next) {
+ /*
+		 * Don't dump inaccessible mappings; handle legacy
+		 * coredump mode.
+		 *
+		 * Note that read-only segments related to the ELF binary
+		 * are now marked MAP_ENTRY_NOCOREDUMP, so we no longer
+		 * need to arbitrarily ignore such segments.
+ */
+ if (elf_legacy_coredump) {
+ if ((entry->protection & VM_PROT_RW) != VM_PROT_RW)
+ continue;
+ } else {
+ if ((entry->protection & VM_PROT_ALL) == 0)
+ continue;
+ }
+
+ /*
+		 * Don't include a memory segment in the core dump if
+		 * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in
+		 * madvise(2). Do not dump submaps (i.e., parts of the
+		 * kernel map).
+ */
+ if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP))
+ continue;
+
+ if ((object = entry->object.vm_object) == NULL)
+ continue;
+
+ /* Ignore memory-mapped devices and such things. */
+ VM_OBJECT_RLOCK(object);
+ while ((backing_object = object->backing_object) != NULL) {
+ VM_OBJECT_RLOCK(backing_object);
+ VM_OBJECT_RUNLOCK(object);
+ object = backing_object;
+ }
+ ignore_entry = object->type != OBJT_DEFAULT &&
+ object->type != OBJT_SWAP && object->type != OBJT_VNODE;
+ VM_OBJECT_RUNLOCK(object);
+ if (ignore_entry)
+ continue;
+
+ (*func)(entry, closure);
+ }
+ vm_map_unlock_read(map);
+}
+
+/*
+ * Write the core file header to the file, including padding up to
+ * the page boundary.
+ */
+static int
+__elfN(corehdr)(struct thread *td, struct vnode *vp, struct ucred *cred,
+ int numsegs, void *hdr, size_t hdrsize, struct note_info_list *notelst,
+ size_t notesz, gzFile gzfile)
+{
+ struct sbuf_drain_core_params params;
+ struct note_info *ninfo;
+ struct sbuf *sb;
+ int error;
+
+ /* Fill in the header. */
+ bzero(hdr, hdrsize);
+ __elfN(puthdr)(td, hdr, hdrsize, numsegs, notesz);
+
+ params.offset = 0;
+ params.active_cred = cred;
+ params.file_cred = NOCRED;
+ params.td = td;
+ params.vp = vp;
+#ifdef COMPRESS_USER_CORES
+ params.gzfile = gzfile;
+#endif
+ sb = sbuf_new(NULL, NULL, CORE_BUF_SIZE, SBUF_FIXEDLEN);
+ sbuf_set_drain(sb, sbuf_drain_core_output, &params);
+ sbuf_start_section(sb, NULL);
+ sbuf_bcat(sb, hdr, hdrsize);
+ TAILQ_FOREACH(ninfo, notelst, link)
+ __elfN(putnote)(ninfo, sb);
+ /* Align up to a page boundary for the program segments. */
+ sbuf_end_section(sb, -1, PAGE_SIZE, 0);
+ error = sbuf_finish(sb);
+ sbuf_delete(sb);
+
+ return (error);
+}
+
+static void
+__elfN(prepare_notes)(struct thread *td, struct note_info_list *list,
+ size_t *sizep)
+{
+ struct proc *p;
+ struct thread *thr;
+ size_t size;
+
+ p = td->td_proc;
+ size = 0;
+
+ size += register_note(list, NT_PRPSINFO, __elfN(note_prpsinfo), p);
+
+ /*
+ * To have the debugger select the right thread (LWP) as the initial
+ * thread, we dump the state of the thread passed to us in td first.
+	 * This is the thread that caused the core dump, and it is thus
+	 * likely to be the thread one wants selected in the debugger.
+ */
+ thr = td;
+ while (thr != NULL) {
+ size += register_note(list, NT_PRSTATUS,
+ __elfN(note_prstatus), thr);
+ size += register_note(list, NT_FPREGSET,
+ __elfN(note_fpregset), thr);
+ size += register_note(list, NT_THRMISC,
+ __elfN(note_thrmisc), thr);
+ size += register_note(list, -1,
+ __elfN(note_threadmd), thr);
+
+ thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) :
+ TAILQ_NEXT(thr, td_plist);
+ if (thr == td)
+ thr = TAILQ_NEXT(thr, td_plist);
+ }
+
+ size += register_note(list, NT_PROCSTAT_PROC,
+ __elfN(note_procstat_proc), p);
+ size += register_note(list, NT_PROCSTAT_FILES,
+ note_procstat_files, p);
+ size += register_note(list, NT_PROCSTAT_VMMAP,
+ note_procstat_vmmap, p);
+ size += register_note(list, NT_PROCSTAT_GROUPS,
+ note_procstat_groups, p);
+ size += register_note(list, NT_PROCSTAT_UMASK,
+ note_procstat_umask, p);
+ size += register_note(list, NT_PROCSTAT_RLIMIT,
+ note_procstat_rlimit, p);
+ size += register_note(list, NT_PROCSTAT_OSREL,
+ note_procstat_osrel, p);
+ size += register_note(list, NT_PROCSTAT_PSSTRINGS,
+ __elfN(note_procstat_psstrings), p);
+ size += register_note(list, NT_PROCSTAT_AUXV,
+ __elfN(note_procstat_auxv), p);
+
+ *sizep = size;
+}
+
+static void
+__elfN(puthdr)(struct thread *td, void *hdr, size_t hdrsize, int numsegs,
+ size_t notesz)
+{
+ Elf_Ehdr *ehdr;
+ Elf_Phdr *phdr;
+ struct phdr_closure phc;
+
+ ehdr = (Elf_Ehdr *)hdr;
+ phdr = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr));
+
+ ehdr->e_ident[EI_MAG0] = ELFMAG0;
+ ehdr->e_ident[EI_MAG1] = ELFMAG1;
+ ehdr->e_ident[EI_MAG2] = ELFMAG2;
+ ehdr->e_ident[EI_MAG3] = ELFMAG3;
+ ehdr->e_ident[EI_CLASS] = ELF_CLASS;
+ ehdr->e_ident[EI_DATA] = ELF_DATA;
+ ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+ ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD;
+ ehdr->e_ident[EI_ABIVERSION] = 0;
+ ehdr->e_ident[EI_PAD] = 0;
+ ehdr->e_type = ET_CORE;
+#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
+ ehdr->e_machine = ELF_ARCH32;
+#else
+ ehdr->e_machine = ELF_ARCH;
+#endif
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_entry = 0;
+ ehdr->e_phoff = sizeof(Elf_Ehdr);
+ ehdr->e_flags = 0;
+ ehdr->e_ehsize = sizeof(Elf_Ehdr);
+ ehdr->e_phentsize = sizeof(Elf_Phdr);
+ ehdr->e_phnum = numsegs + 1;
+ ehdr->e_shentsize = sizeof(Elf_Shdr);
+ ehdr->e_shnum = 0;
+ ehdr->e_shstrndx = SHN_UNDEF;
+
+ /*
+ * Fill in the program header entries.
+ */
+
+	/* The note segment. */
+ phdr->p_type = PT_NOTE;
+ phdr->p_offset = hdrsize;
+ phdr->p_vaddr = 0;
+ phdr->p_paddr = 0;
+ phdr->p_filesz = notesz;
+ phdr->p_memsz = 0;
+ phdr->p_flags = PF_R;
+ phdr->p_align = ELF_NOTE_ROUNDSIZE;
+ phdr++;
+
+ /* All the writable segments from the program. */
+ phc.phdr = phdr;
+ phc.offset = round_page(hdrsize + notesz);
+ each_writable_segment(td, cb_put_phdr, &phc);
+}
+
+static size_t
+register_note(struct note_info_list *list, int type, outfunc_t out, void *arg)
+{
+ struct note_info *ninfo;
+ size_t size, notesize;
+
+ size = 0;
+ out(arg, NULL, &size);
+ ninfo = malloc(sizeof(*ninfo), M_TEMP, M_ZERO | M_WAITOK);
+ ninfo->type = type;
+ ninfo->outfunc = out;
+ ninfo->outarg = arg;
+ ninfo->outsize = size;
+ TAILQ_INSERT_TAIL(list, ninfo, link);
+
+ if (type == -1)
+ return (size);
+
+ notesize = sizeof(Elf_Note) + /* note header */
+ roundup2(8, ELF_NOTE_ROUNDSIZE) + /* note name ("FreeBSD") */
+ roundup2(size, ELF_NOTE_ROUNDSIZE); /* note description */
+
+ return (notesize);
+}
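+/*
+ * As an illustration of the sizing above (sizeof(Elf_Note) is 12 bytes;
+ * here ELF_NOTE_ROUNDSIZE is assumed to be 4): a note whose out function
+ * reports a 500-byte descriptor occupies
+ * 12 + roundup2(8, 4) + roundup2(500, 4) = 12 + 8 + 500 = 520 bytes.
+ */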
+
+static void
+__elfN(putnote)(struct note_info *ninfo, struct sbuf *sb)
+{
+ Elf_Note note;
+ ssize_t old_len;
+
+ if (ninfo->type == -1) {
+ ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize);
+ return;
+ }
+
+ note.n_namesz = 8; /* strlen("FreeBSD") + 1 */
+ note.n_descsz = ninfo->outsize;
+ note.n_type = ninfo->type;
+
+ sbuf_bcat(sb, &note, sizeof(note));
+ sbuf_start_section(sb, &old_len);
+ sbuf_bcat(sb, "FreeBSD", note.n_namesz);
+ sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0);
+ if (note.n_descsz == 0)
+ return;
+ sbuf_start_section(sb, &old_len);
+ ninfo->outfunc(ninfo->outarg, sb, &ninfo->outsize);
+ sbuf_end_section(sb, old_len, ELF_NOTE_ROUNDSIZE, 0);
+}
+
+/*
+ * Miscellaneous note out functions.
+ */
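+/*
+ * Each out function below follows the two-pass contract used by
+ * register_note() and __elfN(putnote)(): when called with sb == NULL it
+ * only reports its descriptor size through *sizep; when called with a
+ * valid sbuf it emits the descriptor data. The variable-sized procstat
+ * notes size themselves by draining a throw-away sbuf into
+ * sbuf_drain_count().
+ */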
+
+#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
+#include <compat/freebsd32/freebsd32.h>
+
+typedef struct prstatus32 elf_prstatus_t;
+typedef struct prpsinfo32 elf_prpsinfo_t;
+typedef struct fpreg32 elf_prfpregset_t;
+typedef struct fpreg32 elf_fpregset_t;
+typedef struct reg32 elf_gregset_t;
+typedef struct thrmisc32 elf_thrmisc_t;
+#define ELF_KERN_PROC_MASK KERN_PROC_MASK32
+typedef struct kinfo_proc32 elf_kinfo_proc_t;
+typedef uint32_t elf_ps_strings_t;
+#else
+typedef prstatus_t elf_prstatus_t;
+typedef prpsinfo_t elf_prpsinfo_t;
+typedef prfpregset_t elf_prfpregset_t;
+typedef prfpregset_t elf_fpregset_t;
+typedef gregset_t elf_gregset_t;
+typedef thrmisc_t elf_thrmisc_t;
+#define ELF_KERN_PROC_MASK 0
+typedef struct kinfo_proc elf_kinfo_proc_t;
+typedef vm_offset_t elf_ps_strings_t;
+#endif
+
+static void
+__elfN(note_prpsinfo)(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ elf_prpsinfo_t *psinfo;
+
+ p = (struct proc *)arg;
+ if (sb != NULL) {
+ KASSERT(*sizep == sizeof(*psinfo), ("invalid size"));
+ psinfo = malloc(sizeof(*psinfo), M_TEMP, M_ZERO | M_WAITOK);
+ psinfo->pr_version = PRPSINFO_VERSION;
+ psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t);
+ strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname));
+ /*
+ * XXX - We don't fill in the command line arguments properly
+ * yet.
+ */
+ strlcpy(psinfo->pr_psargs, p->p_comm,
+ sizeof(psinfo->pr_psargs));
+
+ sbuf_bcat(sb, psinfo, sizeof(*psinfo));
+ free(psinfo, M_TEMP);
+ }
+ *sizep = sizeof(*psinfo);
+}
+
+static void
+__elfN(note_prstatus)(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct thread *td;
+ elf_prstatus_t *status;
+
+ td = (struct thread *)arg;
+ if (sb != NULL) {
+ KASSERT(*sizep == sizeof(*status), ("invalid size"));
+ status = malloc(sizeof(*status), M_TEMP, M_ZERO | M_WAITOK);
+ status->pr_version = PRSTATUS_VERSION;
+ status->pr_statussz = sizeof(elf_prstatus_t);
+ status->pr_gregsetsz = sizeof(elf_gregset_t);
+ status->pr_fpregsetsz = sizeof(elf_fpregset_t);
+ status->pr_osreldate = osreldate;
+ status->pr_cursig = td->td_proc->p_sig;
+ status->pr_pid = td->td_tid;
+#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
+ fill_regs32(td, &status->pr_reg);
+#else
+ fill_regs(td, &status->pr_reg);
+#endif
+ sbuf_bcat(sb, status, sizeof(*status));
+ free(status, M_TEMP);
+ }
+ *sizep = sizeof(*status);
+}
+
+static void
+__elfN(note_fpregset)(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct thread *td;
+ elf_prfpregset_t *fpregset;
+
+ td = (struct thread *)arg;
+ if (sb != NULL) {
+ KASSERT(*sizep == sizeof(*fpregset), ("invalid size"));
+ fpregset = malloc(sizeof(*fpregset), M_TEMP, M_ZERO | M_WAITOK);
+#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
+ fill_fpregs32(td, fpregset);
+#else
+ fill_fpregs(td, fpregset);
+#endif
+ sbuf_bcat(sb, fpregset, sizeof(*fpregset));
+ free(fpregset, M_TEMP);
+ }
+ *sizep = sizeof(*fpregset);
+}
+
+static void
+__elfN(note_thrmisc)(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct thread *td;
+ elf_thrmisc_t thrmisc;
+
+ td = (struct thread *)arg;
+ if (sb != NULL) {
+ KASSERT(*sizep == sizeof(thrmisc), ("invalid size"));
+ bzero(&thrmisc._pad, sizeof(thrmisc._pad));
+ strcpy(thrmisc.pr_tname, td->td_name);
+ sbuf_bcat(sb, &thrmisc, sizeof(thrmisc));
+ }
+ *sizep = sizeof(thrmisc);
+}
+
+/*
+ * Allow for MD-specific notes, as well as any MD-specific
+ * preparations for writing MI notes.
+ */
+static void
+__elfN(note_threadmd)(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct thread *td;
+ void *buf;
+ size_t size;
+
+ td = (struct thread *)arg;
+ size = *sizep;
+ buf = NULL;
+ if (size != 0 && sb != NULL)
+ buf = malloc(size, M_TEMP, M_ZERO | M_WAITOK);
+ size = 0;
+ __elfN(dump_thread)(td, buf, &size);
+ KASSERT(*sizep == size, ("invalid size"));
+ if (size != 0 && sb != NULL)
+ sbuf_bcat(sb, buf, size);
+ *sizep = size;
+}
+
+#ifdef KINFO_PROC_SIZE
+CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
+#endif
+
+static void
+__elfN(note_procstat_proc)(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ size_t size;
+ int structsize;
+
+ p = (struct proc *)arg;
+ size = sizeof(structsize) + p->p_numthreads *
+ sizeof(elf_kinfo_proc_t);
+
+ if (sb != NULL) {
+ KASSERT(*sizep == size, ("invalid size"));
+ structsize = sizeof(elf_kinfo_proc_t);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ PROC_LOCK(p);
+ kern_proc_out(p, sb, ELF_KERN_PROC_MASK);
+ }
+ *sizep = size;
+}
+
+#ifdef KINFO_FILE_SIZE
+CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
+#endif
+
+static void
+note_procstat_files(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ size_t size;
+ int structsize;
+
+ p = (struct proc *)arg;
+ if (sb == NULL) {
+ size = 0;
+ sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN);
+ sbuf_set_drain(sb, sbuf_drain_count, &size);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ PROC_LOCK(p);
+ kern_proc_filedesc_out(p, sb, -1);
+ sbuf_finish(sb);
+ sbuf_delete(sb);
+ *sizep = size;
+ } else {
+ structsize = sizeof(struct kinfo_file);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ PROC_LOCK(p);
+ kern_proc_filedesc_out(p, sb, -1);
+ }
+}
+
+#ifdef KINFO_VMENTRY_SIZE
+CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE);
+#endif
+
+static void
+note_procstat_vmmap(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ size_t size;
+ int structsize;
+
+ p = (struct proc *)arg;
+ if (sb == NULL) {
+ size = 0;
+ sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN);
+ sbuf_set_drain(sb, sbuf_drain_count, &size);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ PROC_LOCK(p);
+ kern_proc_vmmap_out(p, sb);
+ sbuf_finish(sb);
+ sbuf_delete(sb);
+ *sizep = size;
+ } else {
+ structsize = sizeof(struct kinfo_vmentry);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ PROC_LOCK(p);
+ kern_proc_vmmap_out(p, sb);
+ }
+}
+
+static void
+note_procstat_groups(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ size_t size;
+ int structsize;
+
+ p = (struct proc *)arg;
+ size = sizeof(structsize) + p->p_ucred->cr_ngroups * sizeof(gid_t);
+ if (sb != NULL) {
+ KASSERT(*sizep == size, ("invalid size"));
+ structsize = sizeof(gid_t);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ sbuf_bcat(sb, p->p_ucred->cr_groups, p->p_ucred->cr_ngroups *
+ sizeof(gid_t));
+ }
+ *sizep = size;
+}
+
+static void
+note_procstat_umask(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ size_t size;
+ int structsize;
+
+ p = (struct proc *)arg;
+ size = sizeof(structsize) + sizeof(p->p_fd->fd_cmask);
+ if (sb != NULL) {
+ KASSERT(*sizep == size, ("invalid size"));
+ structsize = sizeof(p->p_fd->fd_cmask);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ sbuf_bcat(sb, &p->p_fd->fd_cmask, sizeof(p->p_fd->fd_cmask));
+ }
+ *sizep = size;
+}
+
+static void
+note_procstat_rlimit(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ struct rlimit rlim[RLIM_NLIMITS];
+ size_t size;
+ int structsize, i;
+
+ p = (struct proc *)arg;
+ size = sizeof(structsize) + sizeof(rlim);
+ if (sb != NULL) {
+ KASSERT(*sizep == size, ("invalid size"));
+ structsize = sizeof(rlim);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ PROC_LOCK(p);
+ for (i = 0; i < RLIM_NLIMITS; i++)
+ lim_rlimit(p, i, &rlim[i]);
+ PROC_UNLOCK(p);
+ sbuf_bcat(sb, rlim, sizeof(rlim));
+ }
+ *sizep = size;
+}
+
+static void
+note_procstat_osrel(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ size_t size;
+ int structsize;
+
+ p = (struct proc *)arg;
+ size = sizeof(structsize) + sizeof(p->p_osrel);
+ if (sb != NULL) {
+ KASSERT(*sizep == size, ("invalid size"));
+ structsize = sizeof(p->p_osrel);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ sbuf_bcat(sb, &p->p_osrel, sizeof(p->p_osrel));
+ }
+ *sizep = size;
+}
+
+static void
+__elfN(note_procstat_psstrings)(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ elf_ps_strings_t ps_strings;
+ size_t size;
+ int structsize;
+
+ p = (struct proc *)arg;
+ size = sizeof(structsize) + sizeof(ps_strings);
+ if (sb != NULL) {
+ KASSERT(*sizep == size, ("invalid size"));
+ structsize = sizeof(ps_strings);
+#if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32
+ ps_strings = PTROUT(p->p_sysent->sv_psstrings);
+#else
+ ps_strings = p->p_sysent->sv_psstrings;
+#endif
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ sbuf_bcat(sb, &ps_strings, sizeof(ps_strings));
+ }
+ *sizep = size;
+}
+
+static void
+__elfN(note_procstat_auxv)(void *arg, struct sbuf *sb, size_t *sizep)
+{
+ struct proc *p;
+ size_t size;
+ int structsize;
+
+ p = (struct proc *)arg;
+ if (sb == NULL) {
+ size = 0;
+ sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN);
+ sbuf_set_drain(sb, sbuf_drain_count, &size);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ PHOLD(p);
+ proc_getauxv(curthread, p, sb);
+ PRELE(p);
+ sbuf_finish(sb);
+ sbuf_delete(sb);
+ *sizep = size;
+ } else {
+ structsize = sizeof(Elf_Auxinfo);
+ sbuf_bcat(sb, &structsize, sizeof(structsize));
+ PHOLD(p);
+ proc_getauxv(curthread, p, sb);
+ PRELE(p);
+ }
+}
+
+static boolean_t
+__elfN(parse_notes)(struct image_params *imgp, Elf_Brandnote *checknote,
+ int32_t *osrel, const Elf_Phdr *pnote)
+{
+ const Elf_Note *note, *note0, *note_end;
+ const char *note_name;
+ int i;
+
+ if (pnote == NULL || pnote->p_offset > PAGE_SIZE ||
+ pnote->p_filesz > PAGE_SIZE - pnote->p_offset)
+ return (FALSE);
+
+ note = note0 = (const Elf_Note *)(imgp->image_header + pnote->p_offset);
+ note_end = (const Elf_Note *)(imgp->image_header +
+ pnote->p_offset + pnote->p_filesz);
+ for (i = 0; i < 100 && note >= note0 && note < note_end; i++) {
+ if (!aligned(note, Elf32_Addr) || (const char *)note_end -
+ (const char *)note < sizeof(Elf_Note))
+ return (FALSE);
+ if (note->n_namesz != checknote->hdr.n_namesz ||
+ note->n_descsz != checknote->hdr.n_descsz ||
+ note->n_type != checknote->hdr.n_type)
+ goto nextnote;
+ note_name = (const char *)(note + 1);
+ if (note_name + checknote->hdr.n_namesz >=
+ (const char *)note_end || strncmp(checknote->vendor,
+ note_name, checknote->hdr.n_namesz) != 0)
+ goto nextnote;
+
+ /*
+		 * Fetch the osreldate for the binary from the
+		 * ELF OSABI-note if necessary.
+ */
+ if ((checknote->flags & BN_TRANSLATE_OSREL) != 0 &&
+ checknote->trans_osrel != NULL)
+ return (checknote->trans_osrel(note, osrel));
+ return (TRUE);
+
+nextnote:
+ note = (const Elf_Note *)((const char *)(note + 1) +
+ roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE) +
+ roundup2(note->n_descsz, ELF_NOTE_ROUNDSIZE));
+ }
+
+ return (FALSE);
+}
+
+/*
+ * Try to find the appropriate ABI-note section for checknote, and fetch
+ * the osreldate for the binary from the ELF OSABI-note. Only the first
+ * page of the image is searched, the same as for the headers.
+ */
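+/*
+ * For reference (the brand notes themselves are defined elsewhere), a
+ * typical FreeBSD ABI-tag note has n_namesz = 8, n_descsz = 4, n_type = 1,
+ * the name "FreeBSD\0", and a 4-byte __FreeBSD_version value as its
+ * descriptor, from which trans_osrel derives *osrel.
+ */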
+static boolean_t
+__elfN(check_note)(struct image_params *imgp, Elf_Brandnote *checknote,
+ int32_t *osrel)
+{
+ const Elf_Phdr *phdr;
+ const Elf_Ehdr *hdr;
+ int i;
+
+ hdr = (const Elf_Ehdr *)imgp->image_header;
+ phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
+
+ for (i = 0; i < hdr->e_phnum; i++) {
+ if (phdr[i].p_type == PT_NOTE &&
+ __elfN(parse_notes)(imgp, checknote, osrel, &phdr[i]))
+ return (TRUE);
+ }
+ return (FALSE);
+
+}
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ */
+static struct execsw __elfN(execsw) = {
+ __CONCAT(exec_, __elfN(imgact)),
+ __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
+};
+EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw));
+
+#ifdef COMPRESS_USER_CORES
+/*
+ * Compress and write out a core segment for a user process.
+ *
+ * 'inbuf' is the starting address of a VM segment in the process' address
+ * space that is to be compressed and written out to the core file. 'dest_buf'
+ * is a buffer in the kernel's address space. The segment is copied from
+ * 'inbuf' to 'dest_buf' first before being processed by the compression
+ * routine gzwrite(). This copying is necessary because the content of the VM
+ * segment may change between the compression pass and the crc-computation pass
+ * in gzwrite(). This is because realtime threads may preempt the UNIX kernel.
+ *
+ * If inbuf is NULL it is assumed that data is already copied to 'dest_buf'.
+ */
+static int
+compress_core (gzFile file, char *inbuf, char *dest_buf, unsigned int len,
+ struct thread *td)
+{
+ int len_compressed;
+ int error = 0;
+ unsigned int chunk_len;
+
+ while (len) {
+ if (inbuf != NULL) {
+ chunk_len = (len > CORE_BUF_SIZE) ? CORE_BUF_SIZE : len;
+ copyin(inbuf, dest_buf, chunk_len);
+ inbuf += chunk_len;
+ } else {
+ chunk_len = len;
+ }
+ len_compressed = gzwrite(file, dest_buf, chunk_len);
+
+ EVENTHANDLER_INVOKE(app_coredump_progress, td, len_compressed);
+
+ if ((unsigned int)len_compressed != chunk_len) {
+ log(LOG_WARNING,
+ "compress_core: length mismatch (0x%x returned, "
+ "0x%x expected)\n", len_compressed, chunk_len);
+ EVENTHANDLER_INVOKE(app_coredump_error, td,
+ "compress_core: length mismatch %x -> %x",
+ chunk_len, len_compressed);
+ error = EFAULT;
+ break;
+ }
+ len -= chunk_len;
+ maybe_yield();
+ }
+
+ return (error);
+}
+#endif /* COMPRESS_USER_CORES */
+
+static vm_prot_t
+__elfN(trans_prot)(Elf_Word flags)
+{
+ vm_prot_t prot;
+
+ prot = 0;
+ if (flags & PF_X)
+ prot |= VM_PROT_EXECUTE;
+ if (flags & PF_W)
+ prot |= VM_PROT_WRITE;
+ if (flags & PF_R)
+ prot |= VM_PROT_READ;
+#if __ELF_WORD_SIZE == 32
+#if defined(__amd64__) || defined(__ia64__)
+ if (i386_read_exec && (flags & PF_R))
+ prot |= VM_PROT_EXECUTE;
+#endif
+#endif
+ return (prot);
+}
+
+static Elf_Word
+__elfN(untrans_prot)(vm_prot_t prot)
+{
+ Elf_Word flags;
+
+ flags = 0;
+ if (prot & VM_PROT_EXECUTE)
+ flags |= PF_X;
+ if (prot & VM_PROT_READ)
+ flags |= PF_R;
+ if (prot & VM_PROT_WRITE)
+ flags |= PF_W;
+ return (flags);
+}
diff --git a/sys/kern/imgact_elf32.c b/sys/kern/imgact_elf32.c
new file mode 100644
index 0000000..b286f31
--- /dev/null
+++ b/sys/kern/imgact_elf32.c
@@ -0,0 +1,31 @@
+/*-
+ * Copyright (c) 2002 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define __ELF_WORD_SIZE 32
+#include <kern/imgact_elf.c>
diff --git a/sys/kern/imgact_elf64.c b/sys/kern/imgact_elf64.c
new file mode 100644
index 0000000..db2470d
--- /dev/null
+++ b/sys/kern/imgact_elf64.c
@@ -0,0 +1,31 @@
+/*-
+ * Copyright (c) 2002 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define __ELF_WORD_SIZE 64
+#include <kern/imgact_elf.c>
diff --git a/sys/kern/imgact_gzip.c b/sys/kern/imgact_gzip.c
new file mode 100644
index 0000000..230854b
--- /dev/null
+++ b/sys/kern/imgact_gzip.c
@@ -0,0 +1,393 @@
+/*-
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * This module handles execution of a.out files which have been run through
+ * "gzip". This saves disk space but wastes CPU cycles and VM.
+ *
+ * TODO:
+ * text-segments should be made R/O after being filled
+ * is the vm-stuff safe ?
+ * should handle the entire header of gzip'ed stuff.
+ * inflate isn't quite reentrant yet...
+ * error-handling is a mess...
+ * so is the rest...
+ * tidy up unnecessary includes
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/imgact_aout.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mman.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/inflate.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+
+struct imgact_gzip {
+ struct image_params *ip;
+ struct exec a_out;
+ int error;
+ int gotheader;
+ int where;
+ u_char *inbuf;
+ u_long offset;
+ u_long output;
+ u_long len;
+ int idx;
+ u_long virtual_offset, file_offset, file_end, bss_size;
+};
+
+static int exec_gzip_imgact(struct image_params *imgp);
+static int NextByte(void *vp);
+static int do_aout_hdr(struct imgact_gzip *);
+static int Flush(void *vp, u_char *, u_long siz);
+
+static int
+exec_gzip_imgact(imgp)
+ struct image_params *imgp;
+{
+ int error;
+ const u_char *p = (const u_char *) imgp->image_header;
+ struct imgact_gzip igz;
+ struct inflate infl;
+ struct vmspace *vmspace;
+
+ /* If these four are not OK, it isn't a gzip file */
+ if (p[0] != 0x1f)
+ return -1; /* 0 Simply magic */
+ if (p[1] != 0x8b)
+ return -1; /* 1 Simply magic */
+ if (p[2] != 0x08)
+ return -1; /* 2 Compression method */
+ if (p[9] != 0x03)
+ return -1; /* 9 OS compressed on */
+
+ /*
+ * If this one contains anything but a comment or a filename marker,
+ * we don't want to chew on it
+ */
+ if (p[3] & ~(0x18))
+ return ENOEXEC; /* 3 Flags */
+
+ /* These are of no use to us */
+ /* 4-7 Timestamp */
+ /* 8 Extra flags */
+
+ bzero(&igz, sizeof igz);
+ bzero(&infl, sizeof infl);
+ infl.gz_private = (void *) &igz;
+ infl.gz_input = NextByte;
+ infl.gz_output = Flush;
+
+ igz.ip = imgp;
+ igz.idx = 10;
+
+ if (p[3] & 0x08) { /* skip a filename */
+ while (p[igz.idx++])
+ if (igz.idx >= PAGE_SIZE)
+ return ENOEXEC;
+ }
+ if (p[3] & 0x10) { /* skip a comment */
+ while (p[igz.idx++])
+ if (igz.idx >= PAGE_SIZE)
+ return ENOEXEC;
+ }
+ igz.len = imgp->attr->va_size;
+
+ error = inflate(&infl);
+
+ /*
+	 * The unzipped file may not even have been long enough to contain
+	 * a header, in which case Flush() never got a chance to return an
+	 * error. Check for this.
+ */
+ if ( !igz.gotheader )
+ return ENOEXEC;
+
+ if ( !error ) {
+ vmspace = imgp->proc->p_vmspace;
+ error = vm_map_protect(&vmspace->vm_map,
+ (vm_offset_t) vmspace->vm_taddr,
+ (vm_offset_t) (vmspace->vm_taddr +
+ (vmspace->vm_tsize << PAGE_SHIFT)) ,
+ VM_PROT_READ|VM_PROT_EXECUTE,0);
+ }
+
+ if (igz.inbuf)
+ kmap_free_wakeup(exec_map, (vm_offset_t)igz.inbuf, PAGE_SIZE);
+ if (igz.error || error) {
+ printf("Output=%lu ", igz.output);
+ printf("Inflate_error=%d igz.error=%d where=%d\n",
+ error, igz.error, igz.where);
+ }
+ if (igz.error)
+ return igz.error;
+ if (error)
+ return ENOEXEC;
+ return 0;
+}
+
+static int
+do_aout_hdr(struct imgact_gzip * gz)
+{
+ int error;
+ struct vmspace *vmspace;
+ vm_offset_t vmaddr;
+
+ /*
+ * Set file/virtual offset based on a.out variant. We do two cases:
+ * host byte order and network byte order (for NetBSD compatibility)
+ */
+ switch ((int) (gz->a_out.a_midmag & 0xffff)) {
+ case ZMAGIC:
+ gz->virtual_offset = 0;
+ if (gz->a_out.a_text) {
+ gz->file_offset = PAGE_SIZE;
+ } else {
+ /* Bill's "screwball mode" */
+ gz->file_offset = 0;
+ }
+ break;
+ case QMAGIC:
+ gz->virtual_offset = PAGE_SIZE;
+ gz->file_offset = 0;
+ break;
+ default:
+ /* NetBSD compatibility */
+ switch ((int) (ntohl(gz->a_out.a_midmag) & 0xffff)) {
+ case ZMAGIC:
+ case QMAGIC:
+ gz->virtual_offset = PAGE_SIZE;
+ gz->file_offset = 0;
+ break;
+ default:
+ gz->where = __LINE__;
+ return (-1);
+ }
+ }
+
+ gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE);
+
+ /*
+ * Check various fields in header for validity/bounds.
+ */
+	if ( /* entry point must lie within the text region */
+ gz->a_out.a_entry < gz->virtual_offset ||
+ gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text ||
+
+ /* text and data size must each be page rounded */
+ gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) {
+ gz->where = __LINE__;
+ return (-1);
+ }
+ /*
+ * text/data/bss must not exceed limits
+ */
+ PROC_LOCK(gz->ip->proc);
+ if ( /* text can't exceed maximum text size */
+ gz->a_out.a_text > maxtsiz ||
+
+ /* data + bss can't exceed rlimit */
+ gz->a_out.a_data + gz->bss_size >
+ lim_cur(gz->ip->proc, RLIMIT_DATA) ||
+ racct_set(gz->ip->proc, RACCT_DATA,
+ gz->a_out.a_data + gz->bss_size) != 0) {
+ PROC_UNLOCK(gz->ip->proc);
+ gz->where = __LINE__;
+ return (ENOMEM);
+ }
+ PROC_UNLOCK(gz->ip->proc);
+ /* Find out how far we should go */
+ gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data;
+
+ /*
+ * Avoid a possible deadlock if the current address space is destroyed
+ * and that address space maps the locked vnode. In the common case,
+ * the locked vnode's v_usecount is decremented but remains greater
+ * than zero. Consequently, the vnode lock is not needed by vrele().
+ * However, in cases where the vnode lock is external, such as nullfs,
+ * v_usecount may become zero.
+ */
+ VOP_UNLOCK(gz->ip->vp, 0);
+
+ /*
+ * Destroy old process VM and create a new one (with a new stack)
+ */
+ error = exec_new_vmspace(gz->ip, &aout_sysvec);
+
+ vn_lock(gz->ip->vp, LK_EXCLUSIVE | LK_RETRY);
+ if (error) {
+ gz->where = __LINE__;
+ return (error);
+ }
+
+ vmspace = gz->ip->proc->p_vmspace;
+
+ vmaddr = gz->virtual_offset;
+
+ error = vm_mmap(&vmspace->vm_map,
+ &vmaddr,
+ gz->a_out.a_text + gz->a_out.a_data,
+ VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED,
+ OBJT_DEFAULT,
+ NULL,
+ 0);
+
+ if (error) {
+ gz->where = __LINE__;
+ return (error);
+ }
+
+ if (gz->bss_size != 0) {
+ /*
+ * Allocate demand-zeroed area for uninitialized data.
+ * "bss" = 'block started by symbol' - named after the
+ * IBM 7090 instruction of the same name.
+ */
+ vmaddr = gz->virtual_offset + gz->a_out.a_text +
+ gz->a_out.a_data;
+ error = vm_map_find(&vmspace->vm_map,
+ NULL,
+ 0,
+ &vmaddr,
+ gz->bss_size,
+ FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error) {
+ gz->where = __LINE__;
+ return (error);
+ }
+ }
+ /* Fill in process VM information */
+ vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT;
+ vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT;
+ vmspace->vm_taddr = (caddr_t) (uintptr_t) gz->virtual_offset;
+ vmspace->vm_daddr = (caddr_t) (uintptr_t)
+ (gz->virtual_offset + gz->a_out.a_text);
+
+ /* Fill in image_params */
+ gz->ip->interpreted = 0;
+ gz->ip->entry_addr = gz->a_out.a_entry;
+
+ gz->ip->proc->p_sysent = &aout_sysvec;
+
+ return 0;
+}
+
+static int
+NextByte(void *vp)
+{
+ int error;
+ struct imgact_gzip *igz = (struct imgact_gzip *) vp;
+
+ if (igz->idx >= igz->len) {
+ igz->where = __LINE__;
+ return GZ_EOF;
+ }
+ if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) {
+ return igz->inbuf[(igz->idx++) - igz->offset];
+ }
+ if (igz->inbuf)
+ kmap_free_wakeup(exec_map, (vm_offset_t)igz->inbuf, PAGE_SIZE);
+ igz->offset = igz->idx & ~PAGE_MASK;
+
+ error = vm_mmap(exec_map, /* map */
+ (vm_offset_t *) & igz->inbuf, /* address */
+ PAGE_SIZE, /* size */
+ VM_PROT_READ, /* protection */
+ VM_PROT_READ, /* max protection */
+ 0, /* flags */
+ OBJT_VNODE, /* handle type */
+ igz->ip->vp, /* vnode */
+ igz->offset); /* offset */
+ if (error) {
+ igz->where = __LINE__;
+ igz->error = error;
+ return GZ_EOF;
+ }
+ return igz->inbuf[(igz->idx++) - igz->offset];
+}
+
+static int
+Flush(void *vp, u_char * ptr, u_long siz)
+{
+ struct imgact_gzip *gz = (struct imgact_gzip *) vp;
+ u_char *p = ptr, *q;
+ int i;
+
+ /* First, find an a.out-header. */
+ if (gz->output < sizeof gz->a_out) {
+ q = (u_char *) & gz->a_out;
+ i = min(siz, sizeof gz->a_out - gz->output);
+ bcopy(p, q + gz->output, i);
+ gz->output += i;
+ p += i;
+ siz -= i;
+ if (gz->output == sizeof gz->a_out) {
+ gz->gotheader = 1;
+ i = do_aout_hdr(gz);
+ if (i == -1) {
+ if (!gz->where)
+ gz->where = __LINE__;
+ gz->error = ENOEXEC;
+ return ENOEXEC;
+ } else if (i) {
+ gz->where = __LINE__;
+ gz->error = i;
+ return ENOEXEC;
+ }
+ if (gz->file_offset == 0) {
+ q = (u_char *) (uintptr_t) gz->virtual_offset;
+ copyout(&gz->a_out, q, sizeof gz->a_out);
+ }
+ }
+ }
+ /* Skip over zero-padded first PAGE if needed */
+ if (gz->output < gz->file_offset &&
+ gz->output + siz > gz->file_offset) {
+ i = min(siz, gz->file_offset - gz->output);
+ gz->output += i;
+ p += i;
+ siz -= i;
+ }
+ if (gz->output >= gz->file_offset && gz->output < gz->file_end) {
+ i = min(siz, gz->file_end - gz->output);
+ q = (u_char *) (uintptr_t)
+ (gz->virtual_offset + gz->output - gz->file_offset);
+ copyout(p, q, i);
+ gz->output += i;
+ p += i;
+ siz -= i;
+ }
+ gz->output += siz;
+ return 0;
+}
+
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ */
+static struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"};
+EXEC_SET(execgzip, gzip_execsw);
diff --git a/sys/kern/imgact_shell.c b/sys/kern/imgact_shell.c
new file mode 100644
index 0000000..d9884f5
--- /dev/null
+++ b/sys/kern/imgact_shell.c
@@ -0,0 +1,258 @@
+/*-
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/kernel.h>
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define SHELLMAGIC 0x2123 /* #! */
+#else
+#define SHELLMAGIC 0x2321
+#endif
+
+/*
+ * At the time of this writing, MAXSHELLCMDLEN == PAGE_SIZE. This is
+ * significant because the caller has only mapped in one page of the
+ * file we're reading.
+ */
+#if MAXSHELLCMDLEN > PAGE_SIZE
+#error "MAXSHELLCMDLEN is larger than a single page!"
+#endif
+
+/*
+ * MAXSHELLCMDLEN must be at least MAXINTERP plus the size of the `#!'
+ * prefix and terminating newline.
+ */
+CTASSERT(MAXSHELLCMDLEN >= MAXINTERP + 3);
+
+/**
+ * Shell interpreter image activator. On successful exit, at a minimum an
+ * interpreter name has been placed at imgp->args->begin_argv.
+ *
+ * If the given file is a shell-script, then the first line will start
+ * with the two characters `#!' (aka SHELLMAGIC), followed by the name
+ * of the shell-interpreter to run, followed by zero or more tokens.
+ *
+ * The interpreter is then started up such that it will see:
+ * arg[0] -> The name of interpreter as specified after `#!' in the
+ * first line of the script. The interpreter name must
+ * not be longer than MAXSHELLCMDLEN bytes.
+ * arg[1] -> *If* there are any additional tokens on the first line,
+ * then we add a new arg[1], which is a copy of the rest of
+ * that line. The copy starts at the first token after the
+ * interpreter name. We leave it to the interpreter to
+ * parse the tokens in that value.
+ * arg[x] -> the full pathname of the script. This will either be
+ * arg[2] or arg[1], depending on whether or not tokens
+ * were found after the interpreter name.
+ * arg[x+1] -> all the arguments that were specified on the original
+ * command line.
+ *
+ * This processing is described in the execve(2) man page.
+ */
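+/*
+ * As a purely illustrative example: executing a script /tmp/hello.sh whose
+ * first line is "#!/bin/sh -x" as "hello.sh one two" starts the
+ * interpreter with:
+ * arg[0] = "/bin/sh", arg[1] = "-x", arg[2] = "/tmp/hello.sh",
+ * arg[3] = "one", arg[4] = "two".
+ */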
+
+/*
+ * HISTORICAL NOTE: From 1993 to mid-2005, FreeBSD parsed out the tokens as
+ * found on the first line of the script, and set up each token as a separate
+ * value in arg[]. This extra processing did not match the behavior of other
+ * OS's, and caused a few subtle problems. For one, it meant the kernel was
+ * deciding how those values should be parsed (wrt characters for quoting or
+ * comments, etc), while the interpreter might have other rules for parsing.
+ * It also meant the interpreter had no way of knowing which arguments came
+ * from the first line of the shell script, and which arguments were specified
+ * by the user on the command line. That extra processing was dropped in the
+ * 6.x branch on May 28, 2005 (matching __FreeBSD_version 600029).
+ */
+int
+exec_shell_imgact(imgp)
+ struct image_params *imgp;
+{
+ const char *image_header = imgp->image_header;
+ const char *ihp, *interpb, *interpe, *maxp, *optb, *opte, *fname;
+ int error, offset;
+ size_t length;
+ struct vattr vattr;
+ struct sbuf *sname;
+
+ /* a shell script? */
+ if (((const short *)image_header)[0] != SHELLMAGIC)
+ return (-1);
+
+ /*
+ * Don't allow a shell script to be the shell for a shell
+ * script. :-)
+ */
+ if (imgp->interpreted)
+ return (ENOEXEC);
+
+ imgp->interpreted = 1;
+
+ /*
+ * At this point we have the first page of the file mapped.
+ * However, we don't know how far into the page the contents are
+ * valid -- the actual file might be much shorter than the page.
+ * So find out the file size.
+ */
+ error = VOP_GETATTR(imgp->vp, &vattr, imgp->proc->p_ucred);
+ if (error)
+ return (error);
+
+ /*
+ * Copy shell name and arguments from image_header into a string
+ * buffer.
+ */
+ maxp = &image_header[MIN(vattr.va_size, MAXSHELLCMDLEN)];
+ ihp = &image_header[2];
+
+ /*
+ * Find the beginning and end of the interpreter_name. If the
+ * line does not include any interpreter, or if the name which
+ * was found is too long, we bail out.
+ */
+ while (ihp < maxp && ((*ihp == ' ') || (*ihp == '\t')))
+ ihp++;
+ interpb = ihp;
+ while (ihp < maxp && ((*ihp != ' ') && (*ihp != '\t') && (*ihp != '\n')
+ && (*ihp != '\0')))
+ ihp++;
+ interpe = ihp;
+ if (interpb == interpe)
+ return (ENOEXEC);
+ if (interpe - interpb >= MAXINTERP)
+ return (ENAMETOOLONG);
+
+ /*
+ * Find the beginning of the options (if any), and the end-of-line.
+ * Then trim the trailing blanks off the value. Note that some
+ * other operating systems do *not* trim the trailing whitespace...
+ */
+ while (ihp < maxp && ((*ihp == ' ') || (*ihp == '\t')))
+ ihp++;
+ optb = ihp;
+ while (ihp < maxp && ((*ihp != '\n') && (*ihp != '\0')))
+ ihp++;
+ opte = ihp;
+ if (opte == maxp)
+ return (ENOEXEC);
+ while (--ihp > optb && ((*ihp == ' ') || (*ihp == '\t')))
+ opte = ihp;
+
+ if (imgp->args->fname != NULL) {
+ fname = imgp->args->fname;
+ sname = NULL;
+ } else {
+ sname = sbuf_new_auto();
+ sbuf_printf(sname, "/dev/fd/%d", imgp->args->fd);
+ sbuf_finish(sname);
+ fname = sbuf_data(sname);
+ }
+
+ /*
+ * We need to "pop" (remove) the present value of arg[0], and "push"
+ * either two or three new values in the arg[] list. To do this,
+ * we first shift all the other values in the `begin_argv' area to
+ * provide the exact amount of room for the values added. Set up
+ * `offset' as the number of bytes to be added to the `begin_argv'
+ * area, and 'length' as the number of bytes being removed.
+ */
+ offset = interpe - interpb + 1; /* interpreter */
+ if (opte > optb) /* options (if any) */
+ offset += opte - optb + 1;
+ offset += strlen(fname) + 1; /* fname of script */
+ length = (imgp->args->argc == 0) ? 0 :
+ strlen(imgp->args->begin_argv) + 1; /* bytes to delete */
+
+ if (offset > imgp->args->stringspace + length) {
+ if (sname != NULL)
+ sbuf_delete(sname);
+ return (E2BIG);
+ }
+
+ bcopy(imgp->args->begin_argv + length, imgp->args->begin_argv + offset,
+ imgp->args->endp - (imgp->args->begin_argv + length));
+
+ offset -= length; /* calculate actual adjustment */
+ imgp->args->begin_envv += offset;
+ imgp->args->endp += offset;
+ imgp->args->stringspace -= offset;
+
+ /*
+ * If there was no arg[0] when we started, then the interpreter_name
+ * is adding an argument (instead of replacing the arg[0] we started
+ * with). And we're always adding an argument when we include the
+ * full pathname of the original script.
+ */
+ if (imgp->args->argc == 0)
+ imgp->args->argc = 1;
+ imgp->args->argc++;
+
+ /*
+ * The original arg[] list has been shifted appropriately. Copy in
+ * the interpreter name and options-string.
+ */
+ length = interpe - interpb;
+ bcopy(interpb, imgp->args->begin_argv, length);
+ *(imgp->args->begin_argv + length) = '\0';
+ offset = length + 1;
+ if (opte > optb) {
+ length = opte - optb;
+ bcopy(optb, imgp->args->begin_argv + offset, length);
+ *(imgp->args->begin_argv + offset + length) = '\0';
+ offset += length + 1;
+ imgp->args->argc++;
+ }
+
+ /*
+ * Finally, add the filename onto the end for the interpreter to
+ * use and copy the interpreter's name to imgp->interpreter_name
+ * for exec to use.
+ */
+ error = copystr(fname, imgp->args->begin_argv + offset,
+ imgp->args->stringspace, NULL);
+
+ if (error == 0)
+ imgp->interpreter_name = imgp->args->begin_argv;
+
+ if (sname != NULL)
+ sbuf_delete(sname);
+ return (error);
+}
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ */
+static struct execsw shell_execsw = { exec_shell_imgact, "#!" };
+EXEC_SET(shell, shell_execsw);
diff --git a/sys/kern/inflate.c b/sys/kern/inflate.c
new file mode 100644
index 0000000..383ebc4
--- /dev/null
+++ b/sys/kern/inflate.c
@@ -0,0 +1,1077 @@
+/*
+ * Most parts of this file are not covered by:
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/inflate.h>
+#ifdef _KERNEL
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#endif
+#include <sys/malloc.h>
+
+#ifdef _KERNEL
+static MALLOC_DEFINE(M_GZIP, "gzip_trees", "Gzip trees");
+#endif
+
+/* needed to make inflate() work */
+#define uch u_char
+#define ush u_short
+#define ulg u_long
+
+/* Stuff to make inflate() work */
+#ifdef _KERNEL
+#define memzero(dest,len) bzero(dest,len)
+#endif
+#define NOMEMCPY
+#ifdef _KERNEL
+#define FPRINTF printf
+#else
+extern void putstr (char *);
+#define FPRINTF putstr
+#endif
+
+#define FLUSH(x,y) { \
+ int foo = (*x->gz_output)(x->gz_private,x->gz_slide,y); \
+ if (foo) \
+ return foo; \
+ }
+
+static const int qflag = 0;
+
+#ifndef _KERNEL /* want to use this file in kzip also */
+extern unsigned char *kzipmalloc (int);
+extern void kzipfree (void*);
+#define malloc(x, y, z) kzipmalloc((x))
+#define free(x, y) kzipfree((x))
+#endif
+
+/*
+ * This came from unzip-5.12. I have changed it the flow to pass
+ * a structure pointer around, thus hopefully making it re-entrant.
+ * Poul-Henning
+ */
+
+/* inflate.c -- put in the public domain by Mark Adler
+ version c14o, 23 August 1994 */
+
+/* You can do whatever you like with this source file, though I would
+ prefer that if you modify it and redistribute it that you include
+ comments to that effect with your name and the date. Thank you.
+
+ History:
+ vers date who what
+ ---- --------- -------------- ------------------------------------
+ a ~~ Feb 92 M. Adler used full (large, one-step) lookup table
+ b1 21 Mar 92 M. Adler first version with partial lookup tables
+ b2 21 Mar 92 M. Adler fixed bug in fixed-code blocks
+ b3 22 Mar 92 M. Adler sped up match copies, cleaned up some
+ b4 25 Mar 92 M. Adler added prototypes; removed window[] (now
+ is the responsibility of unzip.h--also
+ changed name to slide[]), so needs diffs
+ for unzip.c and unzip.h (this allows
+ compiling in the small model on MSDOS);
+ fixed cast of q in huft_build();
+ b5 26 Mar 92 M. Adler got rid of unintended macro recursion.
+ b6 27 Mar 92 M. Adler got rid of nextbyte() routine. fixed
+ bug in inflate_fixed().
+ c1 30 Mar 92 M. Adler removed lbits, dbits environment variables.
+ changed BMAX to 16 for explode. Removed
+ OUTB usage, and replaced it with flush()--
+ this was a 20% speed improvement! Added
+ an explode.c (to replace unimplod.c) that
+ uses the huft routines here. Removed
+ register union.
+ c2 4 Apr 92 M. Adler fixed bug for file sizes a multiple of 32k.
+ c3 10 Apr 92 M. Adler reduced memory of code tables made by
+ huft_build significantly (factor of two to
+ three).
+ c4 15 Apr 92 M. Adler added NOMEMCPY do kill use of memcpy().
+ worked around a Turbo C optimization bug.
+ c5 21 Apr 92 M. Adler added the GZ_WSIZE #define to allow reducing
+ the 32K window size for specialized
+ applications.
+ c6 31 May 92 M. Adler added some typecasts to eliminate warnings
+ c7 27 Jun 92 G. Roelofs added some more typecasts (444: MSC bug).
+ c8 5 Oct 92 J-l. Gailly added ifdef'd code to deal with PKZIP bug.
+ c9 9 Oct 92 M. Adler removed a memory error message (~line 416).
+ c10 17 Oct 92 G. Roelofs changed ULONG/UWORD/byte to ulg/ush/uch,
+ removed old inflate, renamed inflate_entry
+ to inflate, added Mark's fix to a comment.
+ c10.5 14 Dec 92 M. Adler fix up error messages for incomplete trees.
+ c11 2 Jan 93 M. Adler fixed bug in detection of incomplete
+ tables, and removed assumption that EOB is
+ the longest code (bad assumption).
+ c12 3 Jan 93 M. Adler make tables for fixed blocks only once.
+ c13 5 Jan 93 M. Adler allow all zero length codes (pkzip 2.04c
+ outputs one zero length code for an empty
+ distance tree).
+ c14 12 Mar 93 M. Adler made inflate.c standalone with the
+ introduction of inflate.h.
+ c14b 16 Jul 93 G. Roelofs added (unsigned) typecast to w at 470.
+ c14c 19 Jul 93 J. Bush changed v[N_MAX], l[288], ll[28x+3x] arrays
+ to static for Amiga.
+ c14d 13 Aug 93 J-l. Gailly de-complicatified Mark's c[*p++]++ thing.
+ c14e 8 Oct 93 G. Roelofs changed memset() to memzero().
+ c14f 22 Oct 93 G. Roelofs renamed quietflg to qflag; made Trace()
+ conditional; added inflate_free().
+ c14g 28 Oct 93 G. Roelofs changed l/(lx+1) macro to pointer (Cray bug)
+ c14h 7 Dec 93 C. Ghisler huft_build() optimizations.
+ c14i 9 Jan 94 A. Verheijen set fixed_t{d,l} to NULL after freeing;
+ G. Roelofs check NEXTBYTE macro for GZ_EOF.
+ c14j 23 Jan 94 G. Roelofs removed Ghisler "optimizations"; ifdef'd
+ GZ_EOF check.
+ c14k 27 Feb 94 G. Roelofs added some typecasts to avoid warnings.
+ c14l 9 Apr 94 G. Roelofs fixed split comments on preprocessor lines
+ to avoid bug in Encore compiler.
+ c14m 7 Jul 94 P. Kienitz modified to allow assembler version of
+ inflate_codes() (define ASM_INFLATECODES)
+ c14n 22 Jul 94 G. Roelofs changed fprintf to FPRINTF for DLL versions
+ c14o 23 Aug 94 C. Spieler added a newline to a debug statement;
+ G. Roelofs added another typecast to avoid MSC warning
+ */
+
+
+/*
+ Inflate deflated (PKZIP's method 8 compressed) data. The compression
+ method searches for as much of the current string of bytes (up to a
+ length of 258) in the previous 32K bytes. If it doesn't find any
+ matches (of at least length 3), it codes the next byte. Otherwise, it
+ codes the length of the matched string and its distance backwards from
+ the current position. There is a single Huffman code that codes both
+ single bytes (called "literals") and match lengths. A second Huffman
+ code codes the distance information, which follows a length code. Each
+ length or distance code actually represents a base value and a number
+ of "extra" (sometimes zero) bits to get to add to the base value. At
+ the end of each deflated block is a special end-of-block (EOB) literal/
+ length code. The decoding process is basically: get a literal/length
+ code; if EOB then done; if a literal, emit the decoded byte; if a
+ length then get the distance and emit the referred-to bytes from the
+ sliding window of previously emitted data.
+
+ There are (currently) three kinds of inflate blocks: stored, fixed, and
+ dynamic. The compressor outputs a chunk of data at a time and decides
+ which method to use on a chunk-by-chunk basis. A chunk might typically
+ be 32K to 64K, uncompressed. If the chunk is uncompressible, then the
+ "stored" method is used. In this case, the bytes are simply stored as
+ is, eight bits per byte, with none of the above coding. The bytes are
+ preceded by a count, since there is no longer an EOB code.
+
+ If the data is compressible, then either the fixed or dynamic methods
+ are used. In the dynamic method, the compressed data is preceded by
+ an encoding of the literal/length and distance Huffman codes that are
+ to be used to decode this block. The representation is itself Huffman
+ coded, and so is preceded by a description of that code. These code
+ descriptions take up a little space, and so for small blocks, there is
+ a predefined set of codes, called the fixed codes. The fixed method is
+ used if the block ends up smaller that way (usually for quite small
+ chunks); otherwise the dynamic method is used. In the latter case, the
+ codes are customized to the probabilities in the current block and so
+ can code it much better than the pre-determined fixed codes can.
+
+   The Huffman codes themselves are decoded using a multi-level table
+ lookup, in order to maximize the speed of decoding plus the speed of
+ building the decoding tables. See the comments below that precede the
+ lbits and dbits tuning parameters.
+ */
+
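+/*
+ * For example, the string "abcabcabc" can be coded as the three literals
+ * 'a', 'b', 'c' followed by a single length/distance pair (length 6,
+ * distance 3); the decoder copies six bytes starting three bytes back in
+ * the sliding window, reproducing the remaining "abcabc".
+ */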
+
+/*
+ Notes beyond the 1.93a appnote.txt:
+
+ 1. Distance pointers never point before the beginning of the output
+ stream.
+ 2. Distance pointers can point back across blocks, up to 32k away.
+ 3. There is an implied maximum of 7 bits for the bit length table and
+ 15 bits for the actual data.
+ 4. If only one code exists, then it is encoded using one bit. (Zero
+ would be more efficient, but perhaps a little confusing.) If two
+ codes exist, they are coded using one bit each (0 and 1).
+ 5. There is no way of sending zero distance codes--a dummy must be
+ sent if there are none. (History: a pre 2.0 version of PKZIP would
+ store blocks with no distance codes, but this was discovered to be
+ too harsh a criterion.) Valid only for 1.93a. 2.04c does allow
+ zero distance codes, which is sent as one code of zero bits in
+ length.
+ 6. There are up to 286 literal/length codes. Code 256 represents the
+ end-of-block. Note however that the static length tree defines
+ 288 codes just to fill out the Huffman codes. Codes 286 and 287
+ cannot be used though, since there is no length base or extra bits
+      defined for them.  Similarly, there are up to 30 distance codes.
+ However, static trees define 32 codes (all 5 bits) to fill out the
+ Huffman codes, but the last two had better not show up in the data.
+ 7. Unzip can check dynamic Huffman blocks for complete code sets.
+ The exception is that a single code would not be complete (see #4).
+ 8. The five bits following the block type is really the number of
+ literal codes sent minus 257.
+ 9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits
+ (1+6+6). Therefore, to output three times the length, you output
+ three codes (1+1+1), whereas to output four times the same length,
+ you only need two codes (1+3). Hmm.
+ 10. In the tree reconstruction algorithm, Code = Code + Increment
+ only if BitLength(i) is not zero. (Pretty obvious.)
+ 11. Correction: 4 Bits: # of Bit Length codes - 4 (4 - 19)
+ 12. Note: length code 284 can represent 227-258, but length code 285
+ really is 258. The last length deserves its own, short code
+ since it gets used a lot in very redundant files. The length
+ 258 is special since 258 - 3 (the min match length) is 255.
+ 13. The literal/length and distance code bit lengths are read as a
+ single stream of lengths. It is possible (and advantageous) for
+ a repeat code (16, 17, or 18) to go across the boundary between
+ the two sets of lengths.
+ */
+
+
+#define PKZIP_BUG_WORKAROUND /* PKZIP 1.93a problem--live with it */
+
+/*
+ inflate.h must supply the uch slide[GZ_WSIZE] array and the NEXTBYTE,
+ FLUSH() and memzero macros. If the window size is not 32K, it
+ should also define GZ_WSIZE. If INFMOD is defined, it can include
+ compiled functions to support the NEXTBYTE and/or FLUSH() macros.
+ There are defaults for NEXTBYTE and FLUSH() below for use as
+ examples of what those functions need to do. Normally, you would
+ also want FLUSH() to compute a crc on the data. inflate.h also
+ needs to provide these typedefs:
+
+ typedef unsigned char uch;
+ typedef unsigned short ush;
+ typedef unsigned long ulg;
+
+ This module uses the external functions malloc() and free() (and
+ probably memset() or bzero() in the memzero() macro). Their
+ prototypes are normally found in <string.h> and <stdlib.h>.
+ */
+#define INFMOD /* tell inflate.h to include code to be
+ * compiled */
+
+/* Huffman code lookup table entry--this entry is four bytes for machines
+ that have 16-bit pointers (e.g. PC's in the small or medium model).
+ Valid extra bits are 0..13. e == 15 is EOB (end of block), e == 16
+ means that v is a literal, 16 < e < 32 means that v is a pointer to
+ the next table, which codes e - 16 bits, and lastly e == 99 indicates
+ an unused code. If a code with e == 99 is looked up, this implies an
+ error in the data. */
+struct huft {
+ uch e; /* number of extra bits or operation */
+ uch b; /* number of bits in this code or subcode */
+ union {
+ ush n; /* literal, length base, or distance
+ * base */
+ struct huft *t; /* pointer to next level of table */
+ } v;
+};
+
+
+/* Function prototypes */
+static int huft_build(struct inflate *, unsigned *, unsigned, unsigned, const ush *, const ush *, struct huft **, int *);
+static int huft_free(struct inflate *, struct huft *);
+static int inflate_codes(struct inflate *, struct huft *, struct huft *, int, int);
+static int inflate_stored(struct inflate *);
+static int xinflate(struct inflate *);
+static int inflate_fixed(struct inflate *);
+static int inflate_dynamic(struct inflate *);
+static int inflate_block(struct inflate *, int *);
+
+/* The inflate algorithm uses a sliding 32K byte window on the uncompressed
+ stream to find repeated byte strings. This is implemented here as a
+ circular buffer. The index is updated simply by incrementing and then
+ and'ing with 0x7fff (32K-1). */
+/* It is left to other modules to supply the 32K area. It is assumed
+ to be usable as if it were declared "uch slide[32768];" or as just
+   "uch *slide;" and then malloc'ed in the latter case.  The definition
+   must be in inflate.h, included above. */
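+
+/* As a concrete sketch of the scheme described above, emitting one
+   decoded byte into the window looks like this (next_byte is a
+   stand-in name; the real loops below inline the same pattern):
+
+	glbl->gz_slide[w++] = next_byte;
+	if (w == GZ_WSIZE) {		-- window full: write it out and wrap
+		FLUSH(glbl, w);		-- the wrap is the "and with 0x7fff"
+		w = 0;
+	}
+ */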
+
+
+/* Tables for deflate from PKZIP's appnote.txt. */
+
+/* Order of the bit length code lengths */
+static const unsigned border[] = {
+ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+
+static const ush cplens[] = { /* Copy lengths for literal codes 257..285 */
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
+ 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
+ /* note: see note #13 above about the 258 in this list. */
+
+static const ush cplext[] = { /* Extra bits for literal codes 257..285 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
+ 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 99, 99}; /* 99==invalid */
+
+static const ush cpdist[] = { /* Copy offsets for distance codes 0..29 */
+ 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
+ 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
+ 8193, 12289, 16385, 24577};
+
+static const ush cpdext[] = { /* Extra bits for distance codes */
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
+ 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
+ 12, 12, 13, 13};
+
+/* And'ing with mask[n] masks the lower n bits */
+static const ush mask[] = {
+ 0x0000,
+ 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff,
+ 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff
+};
+
+
+/* Macros for inflate() bit peeking and grabbing.
+ The usage is:
+
+ NEEDBITS(glbl,j)
+ x = b & mask[j];
+ DUMPBITS(j)
+
+ where NEEDBITS makes sure that b has at least j bits in it, and
+ DUMPBITS removes the bits from b. The macros use the variable k
+ for the number of bits in b. Normally, b and k are register
+   variables for speed, and are initialized at the beginning of a
+ routine that uses these macros from a global bit buffer and count.
+
+ In order to not ask for more bits than there are in the compressed
+ stream, the Huffman tables are constructed to only ask for just
+ enough bits to make up the end-of-block code (value 256). Then no
+ bytes need to be "returned" to the buffer at the end of the last
+ block. See the huft_build() routine.
+ */
+
+/*
+ * The following 2 were global variables.
+ * They are now fields of the inflate structure.
+ */
+
+#define NEEDBITS(glbl,n) { \
+ while(k<(n)) { \
+ int c=(*glbl->gz_input)(glbl->gz_private); \
+ if(c==GZ_EOF) \
+ return 1; \
+ b|=((ulg)c)<<k; \
+ k+=8; \
+ } \
+ }
+
+#define DUMPBITS(n) {b>>=(n);k-=(n);}
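+
+/* For example, the 5-bit "number of literal/length codes" field in a
+   dynamic block header is read with (this is the pattern used by
+   inflate_dynamic() below; b and k are the local bit buffer and count):
+
+	NEEDBITS(glbl, 5)
+	nl = 257 + ((unsigned) b & mask[5]);
+	DUMPBITS(5)
+
+   NEEDBITS() makes the enclosing routine return 1 if the compressed
+   input runs out before the requested bits arrive. */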
+
+/*
+ Huffman code decoding is performed using a multi-level table lookup.
+ The fastest way to decode is to simply build a lookup table whose
+ size is determined by the longest code. However, the time it takes
+ to build this table can also be a factor if the data being decoded
+ is not very long. The most common codes are necessarily the
+ shortest codes, so those codes dominate the decoding time, and hence
+ the speed. The idea is you can have a shorter table that decodes the
+ shorter, more probable codes, and then point to subsidiary tables for
+ the longer codes. The time it costs to decode the longer codes is
+ then traded against the time it takes to make longer tables.
+
+   The results of this trade-off are in the variables lbits and dbits
+ below. lbits is the number of bits the first level table for literal/
+ length codes can decode in one step, and dbits is the same thing for
+ the distance codes. Subsequent tables are also less than or equal to
+ those sizes. These values may be adjusted either when all of the
+ codes are shorter than that, in which case the longest code length in
+ bits is used, or when the shortest code is *longer* than the requested
+ table size, in which case the length of the shortest code in bits is
+ used.
+
+ There are two different values for the two tables, since they code a
+ different number of possibilities each. The literal/length table
+ codes 286 possible values, or in a flat code, a little over eight
+ bits. The distance table codes 30 possible values, or a little less
+ than five bits, flat. The optimum values for speed end up being
+ about one bit more than those, so lbits is 8+1 and dbits is 5+1.
+ The optimum values may differ though from machine to machine, and
+ possibly even between compilers. Your mileage may vary.
+ */
+
+static const int lbits = 9; /* bits in base literal/length lookup table */
+static const int dbits = 6; /* bits in base distance lookup table */
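+
+/* Condensed, the resulting two-level lookup is the loop inflate_codes()
+   uses below (tl and bl are the first-level literal/length table and its
+   bit width; the e == 99 "invalid code" check is omitted for brevity):
+
+	NEEDBITS(glbl, bl)
+	t = tl + ((unsigned) b & mask[bl]);
+	while ((e = t->e) > 16) {	-- entry points to a subtable
+		DUMPBITS(t->b)		-- drop the bits this level consumed
+		e -= 16;		-- the subtable codes e - 16 bits
+		NEEDBITS(glbl, e)
+		t = t->v.t + ((unsigned) b & mask[e]);
+	}
+	DUMPBITS(t->b)			-- t is now a literal, length, or EOB
+ */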
+
+
+/* If BMAX needs to be larger than 16, then h and x[] should be ulg. */
+#define BMAX 16 /* maximum bit length of any code (16 for
+ * explode) */
+#define N_MAX 288 /* maximum number of codes in any set */
+
+/* Given a list of code lengths and a maximum table size, make a set of
+ tables to decode that set of codes. Return zero on success, one if
+ the given code set is incomplete (the tables are still built in this
+ case), two if the input is invalid (all zero length codes or an
+ oversubscribed set of lengths), and three if not enough memory.
+ The code with value 256 is special, and the tables are constructed
+ so that no bits beyond that code are fetched when that code is
+ decoded. */
+static int
+huft_build(glbl, b, n, s, d, e, t, m)
+ struct inflate *glbl;
+ unsigned *b; /* code lengths in bits (all assumed <= BMAX) */
+ unsigned n; /* number of codes (assumed <= N_MAX) */
+ unsigned s; /* number of simple-valued codes (0..s-1) */
+ const ush *d; /* list of base values for non-simple codes */
+ const ush *e; /* list of extra bits for non-simple codes */
+ struct huft **t; /* result: starting table */
+ int *m; /* maximum lookup bits, returns actual */
+{
+ unsigned a; /* counter for codes of length k */
+ unsigned c[BMAX + 1]; /* bit length count table */
+ unsigned el; /* length of EOB code (value 256) */
+ unsigned f; /* i repeats in table every f entries */
+ int g; /* maximum code length */
+ int h; /* table level */
+ register unsigned i; /* counter, current code */
+ register unsigned j; /* counter */
+ register int k; /* number of bits in current code */
+ int lx[BMAX + 1]; /* memory for l[-1..BMAX-1] */
+ int *l = lx + 1; /* stack of bits per table */
+ register unsigned *p; /* pointer into c[], b[], or v[] */
+ register struct huft *q;/* points to current table */
+ struct huft r; /* table entry for structure assignment */
+ struct huft *u[BMAX];/* table stack */
+ unsigned v[N_MAX]; /* values in order of bit length */
+ register int w; /* bits before this table == (l * h) */
+ unsigned x[BMAX + 1]; /* bit offsets, then code stack */
+ unsigned *xp; /* pointer into x */
+ int y; /* number of dummy codes added */
+ unsigned z; /* number of entries in current table */
+
+ /* Generate counts for each bit length */
+ el = n > 256 ? b[256] : BMAX; /* set length of EOB code, if any */
+#ifdef _KERNEL
+ memzero((char *) c, sizeof(c));
+#else
+ for (i = 0; i < BMAX+1; i++)
+ c [i] = 0;
+#endif
+ p = b;
+ i = n;
+ do {
+ c[*p]++;
+ p++; /* assume all entries <= BMAX */
+ } while (--i);
+ if (c[0] == n) { /* null input--all zero length codes */
+ *t = (struct huft *) NULL;
+ *m = 0;
+ return 0;
+ }
+ /* Find minimum and maximum length, bound *m by those */
+ for (j = 1; j <= BMAX; j++)
+ if (c[j])
+ break;
+ k = j; /* minimum code length */
+ if ((unsigned) *m < j)
+ *m = j;
+ for (i = BMAX; i; i--)
+ if (c[i])
+ break;
+ g = i; /* maximum code length */
+ if ((unsigned) *m > i)
+ *m = i;
+
+ /* Adjust last length count to fill out codes, if needed */
+ for (y = 1 << j; j < i; j++, y <<= 1)
+ if ((y -= c[j]) < 0)
+ return 2; /* bad input: more codes than bits */
+ if ((y -= c[i]) < 0)
+ return 2;
+ c[i] += y;
+
+ /* Generate starting offsets into the value table for each length */
+ x[1] = j = 0;
+ p = c + 1;
+ xp = x + 2;
+ while (--i) { /* note that i == g from above */
+ *xp++ = (j += *p++);
+ }
+
+ /* Make a table of values in order of bit lengths */
+ p = b;
+ i = 0;
+ do {
+ if ((j = *p++) != 0)
+ v[x[j]++] = i;
+ } while (++i < n);
+
+ /* Generate the Huffman codes and for each, make the table entries */
+ x[0] = i = 0; /* first Huffman code is zero */
+ p = v; /* grab values in bit order */
+ h = -1; /* no tables yet--level -1 */
+ w = l[-1] = 0; /* no bits decoded yet */
+ u[0] = (struct huft *) NULL; /* just to keep compilers happy */
+ q = (struct huft *) NULL; /* ditto */
+ z = 0; /* ditto */
+
+ /* go through the bit lengths (k already is bits in shortest code) */
+ for (; k <= g; k++) {
+ a = c[k];
+ while (a--) {
+ /*
+ * here i is the Huffman code of length k bits for
+ * value *p
+ */
+ /* make tables up to required level */
+ while (k > w + l[h]) {
+ w += l[h++]; /* add bits already decoded */
+
+ /*
+ * compute minimum size table less than or
+ * equal to *m bits
+ */
+ z = (z = g - w) > (unsigned) *m ? *m : z; /* upper limit */
+				if ((f = 1 << (j = k - w)) > a + 1) {	/* try a k-w bit table */
+								/* too few codes for k-w
+								 * bit table */
+ f -= a + 1; /* deduct codes from
+ * patterns left */
+ xp = c + k;
+ while (++j < z) { /* try smaller tables up
+ * to z bits */
+ if ((f <<= 1) <= *++xp)
+ break; /* enough codes to use
+ * up j bits */
+ f -= *xp; /* else deduct codes
+ * from patterns */
+ }
+ }
+ if ((unsigned) w + j > el && (unsigned) w < el)
+ j = el - w; /* make EOB code end at
+ * table */
+ z = 1 << j; /* table entries for j-bit
+ * table */
+ l[h] = j; /* set table size in stack */
+
+ /* allocate and link in new table */
+ if ((q = (struct huft *) malloc((z + 1) * sizeof(struct huft), M_GZIP, M_WAITOK)) ==
+ (struct huft *) NULL) {
+ if (h)
+ huft_free(glbl, u[0]);
+ return 3; /* not enough memory */
+ }
+ glbl->gz_hufts += z + 1; /* track memory usage */
+ *t = q + 1; /* link to list for
+ * huft_free() */
+ *(t = &(q->v.t)) = (struct huft *) NULL;
+ u[h] = ++q; /* table starts after link */
+
+ /* connect to last table, if there is one */
+ if (h) {
+ x[h] = i; /* save pattern for
+ * backing up */
+ r.b = (uch) l[h - 1]; /* bits to dump before
+ * this table */
+ r.e = (uch) (16 + j); /* bits in this table */
+ r.v.t = q; /* pointer to this table */
+ j = (i & ((1 << w) - 1)) >> (w - l[h - 1]);
+ u[h - 1][j] = r; /* connect to last table */
+ }
+ }
+
+ /* set up table entry in r */
+ r.b = (uch) (k - w);
+ if (p >= v + n)
+ r.e = 99; /* out of values--invalid
+ * code */
+ else if (*p < s) {
+ r.e = (uch) (*p < 256 ? 16 : 15); /* 256 is end-of-block
+ * code */
+ r.v.n = *p++; /* simple code is just the
+ * value */
+ } else {
+ r.e = (uch) e[*p - s]; /* non-simple--look up
+ * in lists */
+ r.v.n = d[*p++ - s];
+ }
+
+ /* fill code-like entries with r */
+ f = 1 << (k - w);
+ for (j = i >> w; j < z; j += f)
+ q[j] = r;
+
+ /* backwards increment the k-bit code i */
+ for (j = 1 << (k - 1); i & j; j >>= 1)
+ i ^= j;
+ i ^= j;
+
+ /* backup over finished tables */
+ while ((i & ((1 << w) - 1)) != x[h])
+ w -= l[--h]; /* don't need to update q */
+ }
+ }
+
+ /* return actual size of base table */
+ *m = l[0];
+
+ /* Return true (1) if we were given an incomplete table */
+ return y != 0 && g != 1;
+}
+
+static int
+huft_free(glbl, t)
+ struct inflate *glbl;
+ struct huft *t; /* table to free */
+/* Free the malloc'ed tables built by huft_build(), which makes a linked
+ list of the tables it made, with the links in a dummy first entry of
+ each table. */
+{
+ register struct huft *p, *q;
+
+ /* Go through linked list, freeing from the malloced (t[-1]) address. */
+ p = t;
+ while (p != (struct huft *) NULL) {
+ q = (--p)->v.t;
+ free(p, M_GZIP);
+ p = q;
+ }
+ return 0;
+}
+
+/* inflate (decompress) the codes in a deflated (compressed) block.
+ Return an error code or zero if it all goes ok. */
+static int
+inflate_codes(glbl, tl, td, bl, bd)
+ struct inflate *glbl;
+ struct huft *tl, *td;/* literal/length and distance decoder tables */
+ int bl, bd; /* number of bits decoded by tl[] and td[] */
+{
+ register unsigned e; /* table entry flag/number of extra bits */
+ unsigned n, d; /* length and index for copy */
+ unsigned w; /* current window position */
+ struct huft *t; /* pointer to table entry */
+ unsigned ml, md; /* masks for bl and bd bits */
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local copies of globals */
+ b = glbl->gz_bb; /* initialize bit buffer */
+ k = glbl->gz_bk;
+ w = glbl->gz_wp; /* initialize window position */
+
+ /* inflate the coded data */
+ ml = mask[bl]; /* precompute masks for speed */
+ md = mask[bd];
+ while (1) { /* do until end of block */
+ NEEDBITS(glbl, (unsigned) bl)
+ if ((e = (t = tl + ((unsigned) b & ml))->e) > 16)
+ do {
+ if (e == 99)
+ return 1;
+ DUMPBITS(t->b)
+ e -= 16;
+ NEEDBITS(glbl, e)
+ } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16);
+ DUMPBITS(t->b)
+ if (e == 16) { /* then it's a literal */
+ glbl->gz_slide[w++] = (uch) t->v.n;
+ if (w == GZ_WSIZE) {
+ FLUSH(glbl, w);
+ w = 0;
+ }
+ } else { /* it's an EOB or a length */
+ /* exit if end of block */
+ if (e == 15)
+ break;
+
+ /* get length of block to copy */
+ NEEDBITS(glbl, e)
+ n = t->v.n + ((unsigned) b & mask[e]);
+ DUMPBITS(e);
+
+ /* decode distance of block to copy */
+ NEEDBITS(glbl, (unsigned) bd)
+ if ((e = (t = td + ((unsigned) b & md))->e) > 16)
+ do {
+ if (e == 99)
+ return 1;
+ DUMPBITS(t->b)
+ e -= 16;
+ NEEDBITS(glbl, e)
+ } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16);
+ DUMPBITS(t->b)
+ NEEDBITS(glbl, e)
+ d = w - t->v.n - ((unsigned) b & mask[e]);
+ DUMPBITS(e)
+ /* do the copy */
+ do {
+ n -= (e = (e = GZ_WSIZE - ((d &= GZ_WSIZE - 1) > w ? d : w)) > n ? n : e);
+#ifndef NOMEMCPY
+ if (w - d >= e) { /* (this test assumes
+ * unsigned comparison) */
+ memcpy(glbl->gz_slide + w, glbl->gz_slide + d, e);
+ w += e;
+ d += e;
+ } else /* do it slow to avoid memcpy()
+ * overlap */
+#endif /* !NOMEMCPY */
+ do {
+ glbl->gz_slide[w++] = glbl->gz_slide[d++];
+ } while (--e);
+ if (w == GZ_WSIZE) {
+ FLUSH(glbl, w);
+ w = 0;
+ }
+ } while (n);
+ }
+ }
+
+ /* restore the globals from the locals */
+ glbl->gz_wp = w; /* restore global window pointer */
+ glbl->gz_bb = b; /* restore global bit buffer */
+ glbl->gz_bk = k;
+
+ /* done */
+ return 0;
+}
+
+/* "decompress" an inflated type 0 (stored) block. */
+static int
+inflate_stored(glbl)
+ struct inflate *glbl;
+{
+ unsigned n; /* number of bytes in block */
+ unsigned w; /* current window position */
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local copies of globals */
+ b = glbl->gz_bb; /* initialize bit buffer */
+ k = glbl->gz_bk;
+ w = glbl->gz_wp; /* initialize window position */
+
+ /* go to byte boundary */
+ n = k & 7;
+ DUMPBITS(n);
+
+ /* get the length and its complement */
+ NEEDBITS(glbl, 16)
+ n = ((unsigned) b & 0xffff);
+ DUMPBITS(16)
+ NEEDBITS(glbl, 16)
+ if (n != (unsigned) ((~b) & 0xffff))
+ return 1; /* error in compressed data */
+ DUMPBITS(16)
+ /* read and output the compressed data */
+ while (n--) {
+ NEEDBITS(glbl, 8)
+ glbl->gz_slide[w++] = (uch) b;
+ if (w == GZ_WSIZE) {
+ FLUSH(glbl, w);
+ w = 0;
+ }
+ DUMPBITS(8)
+ }
+
+ /* restore the globals from the locals */
+ glbl->gz_wp = w; /* restore global window pointer */
+ glbl->gz_bb = b; /* restore global bit buffer */
+ glbl->gz_bk = k;
+ return 0;
+}
+
+/* decompress an inflated type 1 (fixed Huffman codes) block. We should
+ either replace this with a custom decoder, or at least precompute the
+ Huffman tables. */
+static int
+inflate_fixed(glbl)
+ struct inflate *glbl;
+{
+ /* if first time, set up tables for fixed blocks */
+ if (glbl->gz_fixed_tl == (struct huft *) NULL) {
+ int i; /* temporary variable */
+ static unsigned l[288]; /* length list for huft_build */
+
+ /* literal table */
+ for (i = 0; i < 144; i++)
+ l[i] = 8;
+ for (; i < 256; i++)
+ l[i] = 9;
+ for (; i < 280; i++)
+ l[i] = 7;
+ for (; i < 288; i++) /* make a complete, but wrong code
+ * set */
+ l[i] = 8;
+ glbl->gz_fixed_bl = 7;
+ if ((i = huft_build(glbl, l, 288, 257, cplens, cplext,
+ &glbl->gz_fixed_tl, &glbl->gz_fixed_bl)) != 0) {
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+ return i;
+ }
+ /* distance table */
+ for (i = 0; i < 30; i++) /* make an incomplete code
+ * set */
+ l[i] = 5;
+ glbl->gz_fixed_bd = 5;
+ if ((i = huft_build(glbl, l, 30, 0, cpdist, cpdext,
+ &glbl->gz_fixed_td, &glbl->gz_fixed_bd)) > 1) {
+ huft_free(glbl, glbl->gz_fixed_tl);
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+ return i;
+ }
+ }
+ /* decompress until an end-of-block code */
+ return inflate_codes(glbl, glbl->gz_fixed_tl, glbl->gz_fixed_td, glbl->gz_fixed_bl, glbl->gz_fixed_bd) != 0;
+}
+
+/* decompress an inflated type 2 (dynamic Huffman codes) block. */
+static int
+inflate_dynamic(glbl)
+ struct inflate *glbl;
+{
+ int i; /* temporary variables */
+ unsigned j;
+ unsigned l; /* last length */
+ unsigned m; /* mask for bit lengths table */
+ unsigned n; /* number of lengths to get */
+ struct huft *tl; /* literal/length code table */
+ struct huft *td; /* distance code table */
+ int bl; /* lookup bits for tl */
+ int bd; /* lookup bits for td */
+ unsigned nb; /* number of bit length codes */
+ unsigned nl; /* number of literal/length codes */
+ unsigned nd; /* number of distance codes */
+#ifdef PKZIP_BUG_WORKAROUND
+ unsigned ll[288 + 32]; /* literal/length and distance code
+ * lengths */
+#else
+ unsigned ll[286 + 30]; /* literal/length and distance code
+ * lengths */
+#endif
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local bit buffer */
+ b = glbl->gz_bb;
+ k = glbl->gz_bk;
+
+ /* read in table lengths */
+ NEEDBITS(glbl, 5)
+ nl = 257 + ((unsigned) b & 0x1f); /* number of
+ * literal/length codes */
+ DUMPBITS(5)
+ NEEDBITS(glbl, 5)
+ nd = 1 + ((unsigned) b & 0x1f); /* number of distance codes */
+ DUMPBITS(5)
+ NEEDBITS(glbl, 4)
+ nb = 4 + ((unsigned) b & 0xf); /* number of bit length codes */
+ DUMPBITS(4)
+#ifdef PKZIP_BUG_WORKAROUND
+ if (nl > 288 || nd > 32)
+#else
+ if (nl > 286 || nd > 30)
+#endif
+ return 1; /* bad lengths */
+ /* read in bit-length-code lengths */
+ for (j = 0; j < nb; j++) {
+ NEEDBITS(glbl, 3)
+ ll[border[j]] = (unsigned) b & 7;
+ DUMPBITS(3)
+ }
+ for (; j < 19; j++)
+ ll[border[j]] = 0;
+
+ /* build decoding table for trees--single level, 7 bit lookup */
+ bl = 7;
+ if ((i = huft_build(glbl, ll, 19, 19, NULL, NULL, &tl, &bl)) != 0) {
+ if (i == 1)
+ huft_free(glbl, tl);
+ return i; /* incomplete code set */
+ }
+ /* read in literal and distance code lengths */
+ n = nl + nd;
+ m = mask[bl];
+ i = l = 0;
+ while ((unsigned) i < n) {
+ NEEDBITS(glbl, (unsigned) bl)
+ j = (td = tl + ((unsigned) b & m))->b;
+ DUMPBITS(j)
+ j = td->v.n;
+ if (j < 16) /* length of code in bits (0..15) */
+ ll[i++] = l = j; /* save last length in l */
+ else if (j == 16) { /* repeat last length 3 to 6 times */
+ NEEDBITS(glbl, 2)
+ j = 3 + ((unsigned) b & 3);
+ DUMPBITS(2)
+ if ((unsigned) i + j > n)
+ return 1;
+ while (j--)
+ ll[i++] = l;
+ } else if (j == 17) { /* 3 to 10 zero length codes */
+ NEEDBITS(glbl, 3)
+ j = 3 + ((unsigned) b & 7);
+ DUMPBITS(3)
+ if ((unsigned) i + j > n)
+ return 1;
+ while (j--)
+ ll[i++] = 0;
+ l = 0;
+ } else { /* j == 18: 11 to 138 zero length codes */
+ NEEDBITS(glbl, 7)
+ j = 11 + ((unsigned) b & 0x7f);
+ DUMPBITS(7)
+ if ((unsigned) i + j > n)
+ return 1;
+ while (j--)
+ ll[i++] = 0;
+ l = 0;
+ }
+ }
+
+ /* free decoding table for trees */
+ huft_free(glbl, tl);
+
+ /* restore the global bit buffer */
+ glbl->gz_bb = b;
+ glbl->gz_bk = k;
+
+ /* build the decoding tables for literal/length and distance codes */
+ bl = lbits;
+ i = huft_build(glbl, ll, nl, 257, cplens, cplext, &tl, &bl);
+ if (i != 0) {
+ if (i == 1 && !qflag) {
+ FPRINTF("(incomplete l-tree) ");
+ huft_free(glbl, tl);
+ }
+ return i; /* incomplete code set */
+ }
+ bd = dbits;
+ i = huft_build(glbl, ll + nl, nd, 0, cpdist, cpdext, &td, &bd);
+ if (i != 0) {
+ if (i == 1 && !qflag) {
+ FPRINTF("(incomplete d-tree) ");
+#ifdef PKZIP_BUG_WORKAROUND
+ i = 0;
+ }
+#else
+ huft_free(glbl, td);
+ }
+ huft_free(glbl, tl);
+ return i; /* incomplete code set */
+#endif
+ }
+ /* decompress until an end-of-block code */
+ if (inflate_codes(glbl, tl, td, bl, bd))
+ return 1;
+
+ /* free the decoding tables, return */
+ huft_free(glbl, tl);
+ huft_free(glbl, td);
+ return 0;
+}
+
+/* decompress an inflated block */
+static int
+inflate_block(glbl, e)
+ struct inflate *glbl;
+ int *e; /* last block flag */
+{
+ unsigned t; /* block type */
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local bit buffer */
+ b = glbl->gz_bb;
+ k = glbl->gz_bk;
+
+ /* read in last block bit */
+ NEEDBITS(glbl, 1)
+ * e = (int) b & 1;
+ DUMPBITS(1)
+ /* read in block type */
+ NEEDBITS(glbl, 2)
+ t = (unsigned) b & 3;
+ DUMPBITS(2)
+ /* restore the global bit buffer */
+ glbl->gz_bb = b;
+ glbl->gz_bk = k;
+
+ /* inflate that block type */
+ if (t == 2)
+ return inflate_dynamic(glbl);
+ if (t == 0)
+ return inflate_stored(glbl);
+ if (t == 1)
+ return inflate_fixed(glbl);
+ /* bad block type */
+ return 2;
+}
+
+
+
+/* decompress an inflated entry */
+static int
+xinflate(glbl)
+ struct inflate *glbl;
+{
+ int e; /* last block flag */
+ int r; /* result code */
+ unsigned h; /* maximum struct huft's malloc'ed */
+
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+
+ /* initialize window, bit buffer */
+ glbl->gz_wp = 0;
+ glbl->gz_bk = 0;
+ glbl->gz_bb = 0;
+
+ /* decompress until the last block */
+ h = 0;
+ do {
+ glbl->gz_hufts = 0;
+ if ((r = inflate_block(glbl, &e)) != 0)
+ return r;
+ if (glbl->gz_hufts > h)
+ h = glbl->gz_hufts;
+ } while (!e);
+
+ /* flush out slide */
+ FLUSH(glbl, glbl->gz_wp);
+
+ /* return success */
+ return 0;
+}
+
+/* Nobody uses this - why not? */
+int
+inflate(glbl)
+ struct inflate *glbl;
+{
+ int i;
+#ifdef _KERNEL
+ u_char *p = NULL;
+
+ if (!glbl->gz_slide)
+ p = glbl->gz_slide = malloc(GZ_WSIZE, M_GZIP, M_WAITOK);
+#endif
+ if (!glbl->gz_slide)
+#ifdef _KERNEL
+ return(ENOMEM);
+#else
+ return 3; /* kzip expects 3 */
+#endif
+ i = xinflate(glbl);
+
+ if (glbl->gz_fixed_td != (struct huft *) NULL) {
+ huft_free(glbl, glbl->gz_fixed_td);
+ glbl->gz_fixed_td = (struct huft *) NULL;
+ }
+ if (glbl->gz_fixed_tl != (struct huft *) NULL) {
+ huft_free(glbl, glbl->gz_fixed_tl);
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+ }
+#ifdef _KERNEL
+ if (p == glbl->gz_slide) {
+ free(glbl->gz_slide, M_GZIP);
+ glbl->gz_slide = NULL;
+ }
+#endif
+ return i;
+}
+/* ----------------------- END INFLATE.C */
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
new file mode 100644
index 0000000..40eff02
--- /dev/null
+++ b/sys/kern/init_main.c
@@ -0,0 +1,855 @@
+/*-
+ * Copyright (c) 1995 Terrence R. Lambert
+ * All rights reserved.
+ *
+ * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)init_main.c 8.9 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_init_path.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/exec.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/jail.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/loginclass.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/systm.h>
+#include <sys/signalvar.h>
+#include <sys/vnode.h>
+#include <sys/sysent.h>
+#include <sys/reboot.h>
+#include <sys/sched.h>
+#include <sys/sx.h>
+#include <sys/sysproto.h>
+#include <sys/vmmeter.h>
+#include <sys/unistd.h>
+#include <sys/malloc.h>
+#include <sys/conf.h>
+#include <sys/cpuset.h>
+
+#include <machine/cpu.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/copyright.h>
+
+#include <ddb/ddb.h>
+#include <ddb/db_sym.h>
+
+void mi_startup(void); /* Should be elsewhere */
+
+/* Components of the first process -- never freed. */
+static struct session session0;
+static struct pgrp pgrp0;
+struct proc proc0;
+struct thread thread0 __aligned(16);
+struct vmspace vmspace0;
+struct proc *initproc;
+
+#ifndef BOOTHOWTO
+#define BOOTHOWTO 0
+#endif
+int boothowto = BOOTHOWTO; /* initialized so that it can be patched */
+SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0,
+ "Boot control flags, passed from loader");
+
+#ifndef BOOTVERBOSE
+#define BOOTVERBOSE 0
+#endif
+int bootverbose = BOOTVERBOSE;
+SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0,
+ "Control the output of verbose kernel messages");
+
+/*
+ * This ensures that there is at least one entry so that the sysinit_set
+ * symbol is not undefined.  Entries with a subsystem ID of SI_SUB_DUMMY
+ * are never executed.
+ */
+SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL);
+
+/*
+ * The sysinit table itself.  Items are checked off as they are run.
+ * If we want to register new sysinit types, add them to newsysinit.
+ */
+SET_DECLARE(sysinit_set, struct sysinit);
+struct sysinit **sysinit, **sysinit_end;
+struct sysinit **newsysinit, **newsysinit_end;
+
+/*
+ * Merge a new sysinit set into the current set, reallocating it if
+ * necessary. This can only be called after malloc is running.
+ */
+void
+sysinit_add(struct sysinit **set, struct sysinit **set_end)
+{
+ struct sysinit **newset;
+ struct sysinit **sipp;
+ struct sysinit **xipp;
+ int count;
+
+ count = set_end - set;
+ if (newsysinit)
+ count += newsysinit_end - newsysinit;
+ else
+ count += sysinit_end - sysinit;
+ newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT);
+ if (newset == NULL)
+ panic("cannot malloc for sysinit");
+ xipp = newset;
+ if (newsysinit)
+ for (sipp = newsysinit; sipp < newsysinit_end; sipp++)
+ *xipp++ = *sipp;
+ else
+ for (sipp = sysinit; sipp < sysinit_end; sipp++)
+ *xipp++ = *sipp;
+ for (sipp = set; sipp < set_end; sipp++)
+ *xipp++ = *sipp;
+ if (newsysinit)
+ free(newsysinit, M_TEMP);
+ newsysinit = newset;
+ newsysinit_end = newset + count;
+}
+
+#if defined (DDB) && defined(VERBOSE_SYSINIT)
+static const char *
+symbol_name(vm_offset_t va, db_strategy_t strategy)
+{
+ const char *name;
+ c_db_sym_t sym;
+ db_expr_t offset;
+
+ if (va == 0)
+ return (NULL);
+ sym = db_search_symbol(va, strategy, &offset);
+ if (offset != 0)
+ return (NULL);
+ db_symbol_values(sym, &name, NULL);
+ return (name);
+}
+#endif
+
+/*
+ * System startup; initialize the world, create process 0, mount root
+ * filesystem, and fork to create init and pagedaemon. Most of the
+ * hard work is done in the lower-level initialization routines including
+ * startup(), which does memory initialization and autoconfiguration.
+ *
+ * This allows simple addition of new kernel subsystems that require
+ * boot time initialization.  It also allows substitution of a subsystem
+ * (for instance, a scheduler, kernel profiler, or VM system) by an object
+ * module.  Finally, it allows for optional "kernel threads".
+ */
+void
+mi_startup(void)
+{
+
+ register struct sysinit **sipp; /* system initialization*/
+ register struct sysinit **xipp; /* interior loop of sort*/
+ register struct sysinit *save; /* bubble*/
+
+#if defined(VERBOSE_SYSINIT)
+ int last;
+ int verbose;
+#endif
+
+ if (boothowto & RB_VERBOSE)
+ bootverbose++;
+
+ if (sysinit == NULL) {
+ sysinit = SET_BEGIN(sysinit_set);
+ sysinit_end = SET_LIMIT(sysinit_set);
+ }
+
+restart:
+ /*
+ * Perform a bubble sort of the system initialization objects by
+ * their subsystem (primary key) and order (secondary key).
+ */
+ for (sipp = sysinit; sipp < sysinit_end; sipp++) {
+ for (xipp = sipp + 1; xipp < sysinit_end; xipp++) {
+ if ((*sipp)->subsystem < (*xipp)->subsystem ||
+ ((*sipp)->subsystem == (*xipp)->subsystem &&
+ (*sipp)->order <= (*xipp)->order))
+ continue; /* skip*/
+ save = *sipp;
+ *sipp = *xipp;
+ *xipp = save;
+ }
+ }
+
+#if defined(VERBOSE_SYSINIT)
+ last = SI_SUB_COPYRIGHT;
+ verbose = 0;
+#if !defined(DDB)
+ printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n");
+#endif
+#endif
+
+ /*
+ * Traverse the (now) ordered list of system initialization tasks.
+ * Perform each task, and continue on to the next task.
+ */
+ for (sipp = sysinit; sipp < sysinit_end; sipp++) {
+
+ if ((*sipp)->subsystem == SI_SUB_DUMMY)
+ continue; /* skip dummy task(s)*/
+
+ if ((*sipp)->subsystem == SI_SUB_DONE)
+ continue;
+
+#if defined(VERBOSE_SYSINIT)
+ if ((*sipp)->subsystem > last) {
+ verbose = 1;
+ last = (*sipp)->subsystem;
+ printf("subsystem %x\n", last);
+ }
+ if (verbose) {
+#if defined(DDB)
+ const char *func, *data;
+
+ func = symbol_name((vm_offset_t)(*sipp)->func,
+ DB_STGY_PROC);
+ data = symbol_name((vm_offset_t)(*sipp)->udata,
+ DB_STGY_ANY);
+ if (func != NULL && data != NULL)
+ printf(" %s(&%s)... ", func, data);
+ else if (func != NULL)
+ printf(" %s(%p)... ", func, (*sipp)->udata);
+ else
+#endif
+ printf(" %p(%p)... ", (*sipp)->func,
+ (*sipp)->udata);
+ }
+#endif
+
+ /* Call function */
+ (*((*sipp)->func))((*sipp)->udata);
+
+#if defined(VERBOSE_SYSINIT)
+ if (verbose)
+ printf("done.\n");
+#endif
+
+		/* Check off the one we've just done */
+ (*sipp)->subsystem = SI_SUB_DONE;
+
+ /* Check if we've installed more sysinit items via KLD */
+ if (newsysinit != NULL) {
+ if (sysinit != SET_BEGIN(sysinit_set))
+ free(sysinit, M_TEMP);
+ sysinit = newsysinit;
+ sysinit_end = newsysinit_end;
+ newsysinit = NULL;
+ newsysinit_end = NULL;
+ goto restart;
+ }
+ }
+
+ mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
+ mtx_unlock(&Giant);
+
+ /*
+ * Now hand over this thread to swapper.
+ */
+ swapper();
+ /* NOTREACHED*/
+}
+
+
+/*
+ ***************************************************************************
+ ****
+ **** The following SYSINIT's belong elsewhere, but have not yet
+ **** been moved.
+ ****
+ ***************************************************************************
+ */
+static void
+print_caddr_t(void *data)
+{
+ printf("%s", (char *)data);
+}
+
+static void
+print_version(void *data __unused)
+{
+ int len;
+
+ /* Strip a trailing newline from version. */
+ len = strlen(version);
+ while (len > 0 && version[len - 1] == '\n')
+ len--;
+ printf("%.*s %s\n", len, version, machine);
+ printf("%s\n", compiler_version);
+}
+
+SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t,
+ copyright);
+SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t,
+ trademark);
+SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL);
+
+#ifdef WITNESS
+static char wit_warn[] =
+ "WARNING: WITNESS option enabled, expect reduced performance.\n";
+SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1,
+ print_caddr_t, wit_warn);
+SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1,
+ print_caddr_t, wit_warn);
+#endif
+
+#ifdef DIAGNOSTIC
+static char diag_warn[] =
+ "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n";
+SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 2,
+ print_caddr_t, diag_warn);
+SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 2,
+ print_caddr_t, diag_warn);
+#endif
+
+static int
+null_fetch_syscall_args(struct thread *td __unused,
+ struct syscall_args *sa __unused)
+{
+
+ panic("null_fetch_syscall_args");
+}
+
+static void
+null_set_syscall_retval(struct thread *td __unused, int error __unused)
+{
+
+ panic("null_set_syscall_retval");
+}
+
+struct sysentvec null_sysvec = {
+ .sv_size = 0,
+ .sv_table = NULL,
+ .sv_mask = 0,
+ .sv_sigsize = 0,
+ .sv_sigtbl = NULL,
+ .sv_errsize = 0,
+ .sv_errtbl = NULL,
+ .sv_transtrap = NULL,
+ .sv_fixup = NULL,
+ .sv_sendsig = NULL,
+ .sv_sigcode = NULL,
+ .sv_szsigcode = NULL,
+ .sv_prepsyscall = NULL,
+ .sv_name = "null",
+ .sv_coredump = NULL,
+ .sv_imgact_try = NULL,
+ .sv_minsigstksz = 0,
+ .sv_pagesize = PAGE_SIZE,
+ .sv_minuser = VM_MIN_ADDRESS,
+ .sv_maxuser = VM_MAXUSER_ADDRESS,
+ .sv_usrstack = USRSTACK,
+ .sv_psstrings = PS_STRINGS,
+ .sv_stackprot = VM_PROT_ALL,
+ .sv_copyout_strings = NULL,
+ .sv_setregs = NULL,
+ .sv_fixlimit = NULL,
+ .sv_maxssiz = NULL,
+ .sv_flags = 0,
+ .sv_set_syscall_retval = null_set_syscall_retval,
+ .sv_fetch_syscall_args = null_fetch_syscall_args,
+ .sv_syscallnames = NULL,
+ .sv_schedtail = NULL,
+};
+
+/*
+ ***************************************************************************
+ ****
+ **** The two following SYSINIT's are proc0 specific glue code. I am not
+ **** convinced that they can not be safely combined, but their order of
+ **** operation has been kept the same as in the original init_main.c
+ **** for now.
+ ****
+ **** These probably belong in init_proc.c or kern_proc.c, since they
+ **** deal with proc0 (the fork template process).
+ ****
+ ***************************************************************************
+ */
+/* ARGSUSED*/
+static void
+proc0_init(void *dummy __unused)
+{
+ struct proc *p;
+ struct thread *td;
+ vm_paddr_t pageablemem;
+ int i;
+
+ GIANT_REQUIRED;
+ p = &proc0;
+ td = &thread0;
+
+ /*
+ * Initialize magic number and osrel.
+ */
+ p->p_magic = P_MAGIC;
+ p->p_osrel = osreldate;
+
+ /*
+ * Initialize thread and process structures.
+ */
+ procinit(); /* set up proc zone */
+ threadinit(); /* set up UMA zones */
+
+ /*
+ * Initialise scheduler resources.
+ * Add scheduler specific parts to proc, thread as needed.
+ */
+ schedinit(); /* scheduler gets its house in order */
+
+ /*
+ * Create process 0 (the swapper).
+ */
+ LIST_INSERT_HEAD(&allproc, p, p_list);
+ LIST_INSERT_HEAD(PIDHASH(0), p, p_hash);
+ mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
+ p->p_pgrp = &pgrp0;
+ LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash);
+ LIST_INIT(&pgrp0.pg_members);
+ LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist);
+
+ pgrp0.pg_session = &session0;
+ mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF);
+ refcount_init(&session0.s_count, 1);
+ session0.s_leader = p;
+
+ p->p_sysent = &null_sysvec;
+ p->p_flag = P_SYSTEM | P_INMEM;
+ p->p_state = PRS_NORMAL;
+ knlist_init_mtx(&p->p_klist, &p->p_mtx);
+ STAILQ_INIT(&p->p_ktr);
+ p->p_nice = NZERO;
+ /* pid_max cannot be greater than PID_MAX */
+ td->td_tid = PID_MAX + 1;
+ LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
+ td->td_state = TDS_RUNNING;
+ td->td_pri_class = PRI_TIMESHARE;
+ td->td_user_pri = PUSER;
+ td->td_base_user_pri = PUSER;
+ td->td_lend_user_pri = PRI_MAX;
+ td->td_priority = PVM;
+ td->td_base_pri = PVM;
+ td->td_oncpu = 0;
+ td->td_flags = TDF_INMEM;
+ td->td_pflags = TDP_KTHREAD;
+ td->td_cpuset = cpuset_thread0();
+ prison0.pr_cpuset = cpuset_ref(td->td_cpuset);
+ p->p_peers = 0;
+ p->p_leader = p;
+
+
+ strncpy(p->p_comm, "kernel", sizeof (p->p_comm));
+ strncpy(td->td_name, "swapper", sizeof (td->td_name));
+
+ callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0);
+ callout_init_mtx(&p->p_limco, &p->p_mtx, 0);
+ callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
+
+ /* Create credentials. */
+ p->p_ucred = crget();
+ p->p_ucred->cr_ngroups = 1; /* group 0 */
+ p->p_ucred->cr_uidinfo = uifind(0);
+ p->p_ucred->cr_ruidinfo = uifind(0);
+ p->p_ucred->cr_prison = &prison0;
+ p->p_ucred->cr_loginclass = loginclass_find("default");
+#ifdef AUDIT
+ audit_cred_kproc0(p->p_ucred);
+#endif
+#ifdef MAC
+ mac_cred_create_swapper(p->p_ucred);
+#endif
+ td->td_ucred = crhold(p->p_ucred);
+
+ /* Create sigacts. */
+ p->p_sigacts = sigacts_alloc();
+
+ /* Initialize signal state for process 0. */
+ siginit(&proc0);
+
+ /* Create the file descriptor table. */
+ p->p_fd = fdinit(NULL);
+ p->p_fdtol = NULL;
+
+ /* Create the limits structures. */
+ p->p_limit = lim_alloc();
+ for (i = 0; i < RLIM_NLIMITS; i++)
+ p->p_limit->pl_rlimit[i].rlim_cur =
+ p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY;
+ p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur =
+ p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles;
+ p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur =
+ p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc;
+ p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
+ p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
+ p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
+ p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
+ /* Cast to avoid overflow on i386/PAE. */
+ pageablemem = ptoa((vm_paddr_t)cnt.v_free_count);
+ p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur =
+ p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem;
+ p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3;
+ p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem;
+ p->p_cpulimit = RLIM_INFINITY;
+
+ /* Initialize resource accounting structures. */
+ racct_create(&p->p_racct);
+
+ p->p_stats = pstats_alloc();
+
+ /* Allocate a prototype map so we have something to fork. */
+ pmap_pinit0(vmspace_pmap(&vmspace0));
+ p->p_vmspace = &vmspace0;
+ vmspace0.vm_refcnt = 1;
+
+ /*
+ * proc0 is not expected to enter usermode, so there is no special
+ * handling for sv_minuser here, like is done for exec_new_vmspace().
+ */
+ vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0),
+ p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser);
+
+ /*
+ * Call the init and ctor for the new thread and proc. We wait
+ * to do this until all other structures are fairly sane.
+ */
+ EVENTHANDLER_INVOKE(process_init, p);
+ EVENTHANDLER_INVOKE(thread_init, td);
+ EVENTHANDLER_INVOKE(process_ctor, p);
+ EVENTHANDLER_INVOKE(thread_ctor, td);
+
+ /*
+ * Charge root for one process.
+ */
+ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0);
+ PROC_LOCK(p);
+ racct_add_force(p, RACCT_NPROC, 1);
+ PROC_UNLOCK(p);
+}
+SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL);
+
+/* ARGSUSED*/
+static void
+proc0_post(void *dummy __unused)
+{
+ struct timespec ts;
+ struct proc *p;
+ struct rusage ru;
+ struct thread *td;
+
+ /*
+ * Now we can look at the time, having had a chance to verify the
+ * time from the filesystem. Pretend that proc0 started now.
+ */
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ microuptime(&p->p_stats->p_start);
+ PROC_SLOCK(p);
+ rufetch(p, &ru); /* Clears thread stats */
+ PROC_SUNLOCK(p);
+ p->p_rux.rux_runtime = 0;
+ p->p_rux.rux_uticks = 0;
+ p->p_rux.rux_sticks = 0;
+ p->p_rux.rux_iticks = 0;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ td->td_runtime = 0;
+ }
+ }
+ sx_sunlock(&allproc_lock);
+ PCPU_SET(switchtime, cpu_ticks());
+ PCPU_SET(switchticks, ticks);
+
+ /*
+ * Give the ``random'' number generator a thump.
+ */
+ nanotime(&ts);
+ srandom(ts.tv_sec ^ ts.tv_nsec);
+}
+SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL);
+
+static void
+random_init(void *dummy __unused)
+{
+
+ /*
+ * After CPU has been started we have some randomness on most
+ * platforms via get_cyclecount(). For platforms that don't
+ * we will reseed random(9) in proc0_post() as well.
+ */
+ srandom(get_cyclecount());
+}
+SYSINIT(random, SI_SUB_RANDOM, SI_ORDER_FIRST, random_init, NULL);
+
+/*
+ ***************************************************************************
+ ****
+ **** The following SYSINIT's and glue code should be moved to the
+ **** respective files on a per subsystem basis.
+ ****
+ ***************************************************************************
+ */
+
+
+/*
+ ***************************************************************************
+ ****
+ **** The following code probably belongs in another file, like
+ **** kern/init_init.c.
+ ****
+ ***************************************************************************
+ */
+
+/*
+ * List of paths to try when searching for "init".
+ */
+static char init_path[MAXPATHLEN] =
+#ifdef INIT_PATH
+ __XSTRING(INIT_PATH);
+#else
+ "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init";
+#endif
+SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0,
+ "Path used to search the init process");
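+
+/*
+ * The compiled-in list above is only a default: start_init() below also
+ * honours an "init_path" variable from the kernel environment.  For
+ * example (the particular path is only an illustration), a
+ * /boot/loader.conf line such as
+ *
+ *	init_path="/rescue/init:/sbin/init"
+ *
+ * makes the kernel try /rescue/init first.
+ */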
+
+/*
+ * Shutdown timeout of init(8).
+ * Unused within kernel, but used to control init(8), hence do not remove.
+ */
+#ifndef INIT_SHUTDOWN_TIMEOUT
+#define INIT_SHUTDOWN_TIMEOUT 120
+#endif
+static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT;
+SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout,
+ CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). "
+ "Unused within kernel, but used to control init(8)");
+
+/*
+ * Start the initial user process; try exec'ing each pathname in init_path.
+ * The program is invoked with one argument containing the boot flags.
+ */
+static void
+start_init(void *dummy)
+{
+ vm_offset_t addr;
+ struct execve_args args;
+ int options, error;
+ char *var, *path, *next, *s;
+ char *ucp, **uap, *arg0, *arg1;
+ struct thread *td;
+ struct proc *p;
+
+ mtx_lock(&Giant);
+
+ GIANT_REQUIRED;
+
+ td = curthread;
+ p = td->td_proc;
+
+ vfs_mountroot();
+
+ /*
+ * Need just enough stack to hold the faked-up "execve()" arguments.
+ */
+ addr = p->p_sysent->sv_usrstack - PAGE_SIZE;
+ if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE,
+ FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0)
+ panic("init: couldn't allocate argument space");
+ p->p_vmspace->vm_maxsaddr = (caddr_t)addr;
+ p->p_vmspace->vm_ssize = 1;
+
+ if ((var = getenv("init_path")) != NULL) {
+ strlcpy(init_path, var, sizeof(init_path));
+ freeenv(var);
+ }
+
+ for (path = init_path; *path != '\0'; path = next) {
+ while (*path == ':')
+ path++;
+ if (*path == '\0')
+ break;
+ for (next = path; *next != '\0' && *next != ':'; next++)
+ /* nothing */ ;
+ if (bootverbose)
+ printf("start_init: trying %.*s\n", (int)(next - path),
+ path);
+
+ /*
+ * Move out the boot flag argument.
+ */
+ options = 0;
+ ucp = (char *)p->p_sysent->sv_usrstack;
+ (void)subyte(--ucp, 0); /* trailing zero */
+ if (boothowto & RB_SINGLE) {
+ (void)subyte(--ucp, 's');
+ options = 1;
+ }
+#ifdef notyet
+ if (boothowto & RB_FASTBOOT) {
+ (void)subyte(--ucp, 'f');
+ options = 1;
+ }
+#endif
+
+#ifdef BOOTCDROM
+ (void)subyte(--ucp, 'C');
+ options = 1;
+#endif
+
+ if (options == 0)
+ (void)subyte(--ucp, '-');
+ (void)subyte(--ucp, '-'); /* leading hyphen */
+ arg1 = ucp;
+
+ /*
+ * Move out the file name (also arg 0).
+ */
+ (void)subyte(--ucp, 0);
+ for (s = next - 1; s >= path; s--)
+ (void)subyte(--ucp, *s);
+ arg0 = ucp;
+
+ /*
+ * Move out the arg pointers.
+ */
+ uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1));
+ (void)suword((caddr_t)--uap, (long)0); /* terminator */
+ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1);
+ (void)suword((caddr_t)--uap, (long)(intptr_t)arg0);
+
+ /*
+ * Point at the arguments.
+ */
+ args.fname = arg0;
+ args.argv = uap;
+ args.envv = NULL;
+
+ /*
+		 * Now try to exec the program.  If we can't, for any reason
+		 * other than that it doesn't exist, complain.
+ *
+ * Otherwise, return via fork_trampoline() all the way
+ * to user mode as init!
+ */
+ if ((error = sys_execve(td, &args)) == 0) {
+ mtx_unlock(&Giant);
+ return;
+ }
+ if (error != ENOENT)
+ printf("exec %.*s: error %d\n", (int)(next - path),
+ path, error);
+ }
+ printf("init: not found in path %s\n", init_path);
+ panic("no init");
+}
+
+/*
+ * Like kproc_create(), but runs in its own address space.
+ * We do this early to reserve pid 1.
+ *
+ * Note special case - do not make it runnable yet. Other work
+ * in progress will change this more.
+ */
+static void
+create_init(const void *udata __unused)
+{
+ struct ucred *newcred, *oldcred;
+ int error;
+
+ error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, 0, &initproc,
+ NULL, 0);
+ if (error)
+ panic("cannot fork init: %d\n", error);
+ KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1"));
+ /* divorce init's credentials from the kernel's */
+ newcred = crget();
+ PROC_LOCK(initproc);
+ initproc->p_flag |= P_SYSTEM | P_INMEM;
+ oldcred = initproc->p_ucred;
+ crcopy(newcred, oldcred);
+#ifdef MAC
+ mac_cred_create_init(newcred);
+#endif
+#ifdef AUDIT
+ audit_cred_proc1(newcred);
+#endif
+ initproc->p_ucred = newcred;
+ PROC_UNLOCK(initproc);
+ crfree(oldcred);
+ cred_update_thread(FIRST_THREAD_IN_PROC(initproc));
+ cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL);
+}
+SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL);
+
+/*
+ * Make it runnable now.
+ */
+static void
+kick_init(const void *udata __unused)
+{
+ struct thread *td;
+
+ td = FIRST_THREAD_IN_PROC(initproc);
+ thread_lock(td);
+ TD_SET_CAN_RUN(td);
+ sched_add(td, SRQ_BORING);
+ thread_unlock(td);
+}
+SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL);
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
new file mode 100644
index 0000000..64b0201
--- /dev/null
+++ b/sys/kern/init_sysent.c
@@ -0,0 +1,581 @@
+/*
+ * System call switch table.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * $FreeBSD$
+ * created from FreeBSD: head/sys/kern/syscalls.master 255219 2013-09-05 00:09:56Z pjd
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+
+#define AS(name) (sizeof(struct name) / sizeof(register_t))
+
+#ifdef COMPAT_43
+#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)
+#else
+#define compat(n, name) 0, (sy_call_t *)nosys
+#endif
+
+#ifdef COMPAT_FREEBSD4
+#define compat4(n, name) n, (sy_call_t *)__CONCAT(freebsd4_,name)
+#else
+#define compat4(n, name) 0, (sy_call_t *)nosys
+#endif
+
+#ifdef COMPAT_FREEBSD7
+#define compat7(n, name) n, (sy_call_t *)__CONCAT(freebsd7_,name)
+#else
+#define compat7(n, name) 0, (sy_call_t *)nosys
+#endif
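+
+/*
+ * For example, with COMPAT_43 defined the "old creat" slot below expands
+ * to { AS(ocreat_args), (sy_call_t *)ocreat, ... }; without it the slot
+ * collapses to { 0, (sy_call_t *)nosys, ... }, so the syscall number
+ * stays reserved but the call simply fails.
+ */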
+
+/* The casts are bogus but will do for now. */
+struct sysent sysent[] = {
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 0 = syscall */
+ { AS(sys_exit_args), (sy_call_t *)sys_sys_exit, AUE_EXIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 1 = exit */
+ { 0, (sy_call_t *)sys_fork, AUE_FORK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 2 = fork */
+ { AS(read_args), (sy_call_t *)sys_read, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 3 = read */
+ { AS(write_args), (sy_call_t *)sys_write, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 4 = write */
+ { AS(open_args), (sy_call_t *)sys_open, AUE_OPEN_RWTC, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 5 = open */
+ { AS(close_args), (sy_call_t *)sys_close, AUE_CLOSE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 6 = close */
+ { AS(wait4_args), (sy_call_t *)sys_wait4, AUE_WAIT4, NULL, 0, 0, 0, SY_THR_STATIC }, /* 7 = wait4 */
+ { compat(AS(ocreat_args),creat), AUE_CREAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 8 = old creat */
+ { AS(link_args), (sy_call_t *)sys_link, AUE_LINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 9 = link */
+ { AS(unlink_args), (sy_call_t *)sys_unlink, AUE_UNLINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 10 = unlink */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 11 = obsolete execv */
+ { AS(chdir_args), (sy_call_t *)sys_chdir, AUE_CHDIR, NULL, 0, 0, 0, SY_THR_STATIC }, /* 12 = chdir */
+ { AS(fchdir_args), (sy_call_t *)sys_fchdir, AUE_FCHDIR, NULL, 0, 0, 0, SY_THR_STATIC }, /* 13 = fchdir */
+ { AS(mknod_args), (sy_call_t *)sys_mknod, AUE_MKNOD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 14 = mknod */
+ { AS(chmod_args), (sy_call_t *)sys_chmod, AUE_CHMOD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 15 = chmod */
+ { AS(chown_args), (sy_call_t *)sys_chown, AUE_CHOWN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 16 = chown */
+ { AS(obreak_args), (sy_call_t *)sys_obreak, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 17 = break */
+ { compat4(AS(freebsd4_getfsstat_args),getfsstat), AUE_GETFSSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 18 = freebsd4 getfsstat */
+ { compat(AS(olseek_args),lseek), AUE_LSEEK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 19 = old lseek */
+ { 0, (sy_call_t *)sys_getpid, AUE_GETPID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 20 = getpid */
+ { AS(mount_args), (sy_call_t *)sys_mount, AUE_MOUNT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 21 = mount */
+ { AS(unmount_args), (sy_call_t *)sys_unmount, AUE_UMOUNT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 22 = unmount */
+ { AS(setuid_args), (sy_call_t *)sys_setuid, AUE_SETUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 23 = setuid */
+ { 0, (sy_call_t *)sys_getuid, AUE_GETUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 24 = getuid */
+ { 0, (sy_call_t *)sys_geteuid, AUE_GETEUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 25 = geteuid */
+ { AS(ptrace_args), (sy_call_t *)sys_ptrace, AUE_PTRACE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 26 = ptrace */
+ { AS(recvmsg_args), (sy_call_t *)sys_recvmsg, AUE_RECVMSG, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 27 = recvmsg */
+ { AS(sendmsg_args), (sy_call_t *)sys_sendmsg, AUE_SENDMSG, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 28 = sendmsg */
+ { AS(recvfrom_args), (sy_call_t *)sys_recvfrom, AUE_RECVFROM, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 29 = recvfrom */
+ { AS(accept_args), (sy_call_t *)sys_accept, AUE_ACCEPT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 30 = accept */
+ { AS(getpeername_args), (sy_call_t *)sys_getpeername, AUE_GETPEERNAME, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 31 = getpeername */
+ { AS(getsockname_args), (sy_call_t *)sys_getsockname, AUE_GETSOCKNAME, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 32 = getsockname */
+ { AS(access_args), (sy_call_t *)sys_access, AUE_ACCESS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 33 = access */
+ { AS(chflags_args), (sy_call_t *)sys_chflags, AUE_CHFLAGS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 34 = chflags */
+ { AS(fchflags_args), (sy_call_t *)sys_fchflags, AUE_FCHFLAGS, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 35 = fchflags */
+ { 0, (sy_call_t *)sys_sync, AUE_SYNC, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 36 = sync */
+ { AS(kill_args), (sy_call_t *)sys_kill, AUE_KILL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 37 = kill */
+ { compat(AS(ostat_args),stat), AUE_STAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 38 = old stat */
+ { 0, (sy_call_t *)sys_getppid, AUE_GETPPID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 39 = getppid */
+ { compat(AS(olstat_args),lstat), AUE_LSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 40 = old lstat */
+ { AS(dup_args), (sy_call_t *)sys_dup, AUE_DUP, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 41 = dup */
+ { 0, (sy_call_t *)sys_pipe, AUE_PIPE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 42 = pipe */
+ { 0, (sy_call_t *)sys_getegid, AUE_GETEGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 43 = getegid */
+ { AS(profil_args), (sy_call_t *)sys_profil, AUE_PROFILE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 44 = profil */
+ { AS(ktrace_args), (sy_call_t *)sys_ktrace, AUE_KTRACE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 45 = ktrace */
+ { compat(AS(osigaction_args),sigaction), AUE_SIGACTION, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 46 = old sigaction */
+ { 0, (sy_call_t *)sys_getgid, AUE_GETGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 47 = getgid */
+ { compat(AS(osigprocmask_args),sigprocmask), AUE_SIGPROCMASK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 48 = old sigprocmask */
+ { AS(getlogin_args), (sy_call_t *)sys_getlogin, AUE_GETLOGIN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 49 = getlogin */
+ { AS(setlogin_args), (sy_call_t *)sys_setlogin, AUE_SETLOGIN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 50 = setlogin */
+ { AS(acct_args), (sy_call_t *)sys_acct, AUE_ACCT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 51 = acct */
+ { compat(0,sigpending), AUE_SIGPENDING, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 52 = old sigpending */
+ { AS(sigaltstack_args), (sy_call_t *)sys_sigaltstack, AUE_SIGALTSTACK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 53 = sigaltstack */
+ { AS(ioctl_args), (sy_call_t *)sys_ioctl, AUE_IOCTL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 54 = ioctl */
+ { AS(reboot_args), (sy_call_t *)sys_reboot, AUE_REBOOT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 55 = reboot */
+ { AS(revoke_args), (sy_call_t *)sys_revoke, AUE_REVOKE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 56 = revoke */
+ { AS(symlink_args), (sy_call_t *)sys_symlink, AUE_SYMLINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 57 = symlink */
+ { AS(readlink_args), (sy_call_t *)sys_readlink, AUE_READLINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 58 = readlink */
+ { AS(execve_args), (sy_call_t *)sys_execve, AUE_EXECVE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 59 = execve */
+ { AS(umask_args), (sy_call_t *)sys_umask, AUE_UMASK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 60 = umask */
+ { AS(chroot_args), (sy_call_t *)sys_chroot, AUE_CHROOT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 61 = chroot */
+ { compat(AS(ofstat_args),fstat), AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 62 = old fstat */
+ { compat(AS(getkerninfo_args),getkerninfo), AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 63 = old getkerninfo */
+ { compat(0,getpagesize), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 64 = old getpagesize */
+ { AS(msync_args), (sy_call_t *)sys_msync, AUE_MSYNC, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 65 = msync */
+ { 0, (sy_call_t *)sys_vfork, AUE_VFORK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 66 = vfork */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 67 = obsolete vread */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 68 = obsolete vwrite */
+ { AS(sbrk_args), (sy_call_t *)sys_sbrk, AUE_SBRK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 69 = sbrk */
+ { AS(sstk_args), (sy_call_t *)sys_sstk, AUE_SSTK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 70 = sstk */
+ { compat(AS(ommap_args),mmap), AUE_MMAP, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 71 = old mmap */
+ { AS(ovadvise_args), (sy_call_t *)sys_ovadvise, AUE_O_VADVISE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 72 = vadvise */
+ { AS(munmap_args), (sy_call_t *)sys_munmap, AUE_MUNMAP, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 73 = munmap */
+ { AS(mprotect_args), (sy_call_t *)sys_mprotect, AUE_MPROTECT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 74 = mprotect */
+ { AS(madvise_args), (sy_call_t *)sys_madvise, AUE_MADVISE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 75 = madvise */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 76 = obsolete vhangup */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 77 = obsolete vlimit */
+ { AS(mincore_args), (sy_call_t *)sys_mincore, AUE_MINCORE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 78 = mincore */
+ { AS(getgroups_args), (sy_call_t *)sys_getgroups, AUE_GETGROUPS, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 79 = getgroups */
+ { AS(setgroups_args), (sy_call_t *)sys_setgroups, AUE_SETGROUPS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 80 = setgroups */
+ { 0, (sy_call_t *)sys_getpgrp, AUE_GETPGRP, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 81 = getpgrp */
+ { AS(setpgid_args), (sy_call_t *)sys_setpgid, AUE_SETPGRP, NULL, 0, 0, 0, SY_THR_STATIC }, /* 82 = setpgid */
+ { AS(setitimer_args), (sy_call_t *)sys_setitimer, AUE_SETITIMER, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 83 = setitimer */
+ { compat(0,wait), AUE_WAIT4, NULL, 0, 0, 0, SY_THR_STATIC }, /* 84 = old wait */
+ { AS(swapon_args), (sy_call_t *)sys_swapon, AUE_SWAPON, NULL, 0, 0, 0, SY_THR_STATIC }, /* 85 = swapon */
+ { AS(getitimer_args), (sy_call_t *)sys_getitimer, AUE_GETITIMER, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 86 = getitimer */
+ { compat(AS(gethostname_args),gethostname), AUE_SYSCTL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 87 = old gethostname */
+ { compat(AS(sethostname_args),sethostname), AUE_SYSCTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 88 = old sethostname */
+ { 0, (sy_call_t *)sys_getdtablesize, AUE_GETDTABLESIZE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 89 = getdtablesize */
+ { AS(dup2_args), (sy_call_t *)sys_dup2, AUE_DUP2, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 90 = dup2 */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 91 = getdopt */
+ { AS(fcntl_args), (sy_call_t *)sys_fcntl, AUE_FCNTL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 92 = fcntl */
+ { AS(select_args), (sy_call_t *)sys_select, AUE_SELECT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 93 = select */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 94 = setdopt */
+ { AS(fsync_args), (sy_call_t *)sys_fsync, AUE_FSYNC, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 95 = fsync */
+ { AS(setpriority_args), (sy_call_t *)sys_setpriority, AUE_SETPRIORITY, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 96 = setpriority */
+ { AS(socket_args), (sy_call_t *)sys_socket, AUE_SOCKET, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 97 = socket */
+ { AS(connect_args), (sy_call_t *)sys_connect, AUE_CONNECT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 98 = connect */
+ { compat(AS(accept_args),accept), AUE_ACCEPT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 99 = old accept */
+ { AS(getpriority_args), (sy_call_t *)sys_getpriority, AUE_GETPRIORITY, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 100 = getpriority */
+ { compat(AS(osend_args),send), AUE_SEND, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 101 = old send */
+ { compat(AS(orecv_args),recv), AUE_RECV, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 102 = old recv */
+ { compat(AS(osigreturn_args),sigreturn), AUE_SIGRETURN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 103 = old sigreturn */
+ { AS(bind_args), (sy_call_t *)sys_bind, AUE_BIND, NULL, 0, 0, 0, SY_THR_STATIC }, /* 104 = bind */
+ { AS(setsockopt_args), (sy_call_t *)sys_setsockopt, AUE_SETSOCKOPT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 105 = setsockopt */
+ { AS(listen_args), (sy_call_t *)sys_listen, AUE_LISTEN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 106 = listen */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 107 = obsolete vtimes */
+ { compat(AS(osigvec_args),sigvec), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 108 = old sigvec */
+ { compat(AS(osigblock_args),sigblock), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 109 = old sigblock */
+ { compat(AS(osigsetmask_args),sigsetmask), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 110 = old sigsetmask */
+ { compat(AS(osigsuspend_args),sigsuspend), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 111 = old sigsuspend */
+ { compat(AS(osigstack_args),sigstack), AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 112 = old sigstack */
+ { compat(AS(orecvmsg_args),recvmsg), AUE_RECVMSG, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 113 = old recvmsg */
+ { compat(AS(osendmsg_args),sendmsg), AUE_SENDMSG, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 114 = old sendmsg */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 115 = obsolete vtrace */
+ { AS(gettimeofday_args), (sy_call_t *)sys_gettimeofday, AUE_GETTIMEOFDAY, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 116 = gettimeofday */
+ { AS(getrusage_args), (sy_call_t *)sys_getrusage, AUE_GETRUSAGE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 117 = getrusage */
+ { AS(getsockopt_args), (sy_call_t *)sys_getsockopt, AUE_GETSOCKOPT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 118 = getsockopt */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 119 = resuba */
+ { AS(readv_args), (sy_call_t *)sys_readv, AUE_READV, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 120 = readv */
+ { AS(writev_args), (sy_call_t *)sys_writev, AUE_WRITEV, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 121 = writev */
+ { AS(settimeofday_args), (sy_call_t *)sys_settimeofday, AUE_SETTIMEOFDAY, NULL, 0, 0, 0, SY_THR_STATIC }, /* 122 = settimeofday */
+ { AS(fchown_args), (sy_call_t *)sys_fchown, AUE_FCHOWN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 123 = fchown */
+ { AS(fchmod_args), (sy_call_t *)sys_fchmod, AUE_FCHMOD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 124 = fchmod */
+ { compat(AS(recvfrom_args),recvfrom), AUE_RECVFROM, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 125 = old recvfrom */
+ { AS(setreuid_args), (sy_call_t *)sys_setreuid, AUE_SETREUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 126 = setreuid */
+ { AS(setregid_args), (sy_call_t *)sys_setregid, AUE_SETREGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 127 = setregid */
+ { AS(rename_args), (sy_call_t *)sys_rename, AUE_RENAME, NULL, 0, 0, 0, SY_THR_STATIC }, /* 128 = rename */
+ { compat(AS(otruncate_args),truncate), AUE_TRUNCATE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 129 = old truncate */
+ { compat(AS(oftruncate_args),ftruncate), AUE_FTRUNCATE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 130 = old ftruncate */
+ { AS(flock_args), (sy_call_t *)sys_flock, AUE_FLOCK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 131 = flock */
+ { AS(mkfifo_args), (sy_call_t *)sys_mkfifo, AUE_MKFIFO, NULL, 0, 0, 0, SY_THR_STATIC }, /* 132 = mkfifo */
+ { AS(sendto_args), (sy_call_t *)sys_sendto, AUE_SENDTO, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 133 = sendto */
+ { AS(shutdown_args), (sy_call_t *)sys_shutdown, AUE_SHUTDOWN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 134 = shutdown */
+ { AS(socketpair_args), (sy_call_t *)sys_socketpair, AUE_SOCKETPAIR, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 135 = socketpair */
+ { AS(mkdir_args), (sy_call_t *)sys_mkdir, AUE_MKDIR, NULL, 0, 0, 0, SY_THR_STATIC }, /* 136 = mkdir */
+ { AS(rmdir_args), (sy_call_t *)sys_rmdir, AUE_RMDIR, NULL, 0, 0, 0, SY_THR_STATIC }, /* 137 = rmdir */
+ { AS(utimes_args), (sy_call_t *)sys_utimes, AUE_UTIMES, NULL, 0, 0, 0, SY_THR_STATIC }, /* 138 = utimes */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 139 = obsolete 4.2 sigreturn */
+ { AS(adjtime_args), (sy_call_t *)sys_adjtime, AUE_ADJTIME, NULL, 0, 0, 0, SY_THR_STATIC }, /* 140 = adjtime */
+ { compat(AS(ogetpeername_args),getpeername), AUE_GETPEERNAME, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 141 = old getpeername */
+ { compat(0,gethostid), AUE_SYSCTL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 142 = old gethostid */
+ { compat(AS(osethostid_args),sethostid), AUE_SYSCTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 143 = old sethostid */
+ { compat(AS(ogetrlimit_args),getrlimit), AUE_GETRLIMIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 144 = old getrlimit */
+ { compat(AS(osetrlimit_args),setrlimit), AUE_SETRLIMIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 145 = old setrlimit */
+ { compat(AS(okillpg_args),killpg), AUE_KILLPG, NULL, 0, 0, 0, SY_THR_STATIC }, /* 146 = old killpg */
+ { 0, (sy_call_t *)sys_setsid, AUE_SETSID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 147 = setsid */
+ { AS(quotactl_args), (sy_call_t *)sys_quotactl, AUE_QUOTACTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 148 = quotactl */
+ { compat(0,quota), AUE_O_QUOTA, NULL, 0, 0, 0, SY_THR_STATIC }, /* 149 = old quota */
+ { compat(AS(getsockname_args),getsockname), AUE_GETSOCKNAME, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 150 = old getsockname */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 151 = sem_lock */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 152 = sem_wakeup */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 153 = asyncdaemon */
+ { AS(nlm_syscall_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 154 = nlm_syscall */
+ { AS(nfssvc_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 155 = nfssvc */
+ { compat(AS(ogetdirentries_args),getdirentries), AUE_GETDIRENTRIES, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 156 = old getdirentries */
+ { compat4(AS(freebsd4_statfs_args),statfs), AUE_STATFS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 157 = freebsd4 statfs */
+ { compat4(AS(freebsd4_fstatfs_args),fstatfs), AUE_FSTATFS, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 158 = freebsd4 fstatfs */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 159 = nosys */
+ { AS(lgetfh_args), (sy_call_t *)sys_lgetfh, AUE_LGETFH, NULL, 0, 0, 0, SY_THR_STATIC }, /* 160 = lgetfh */
+ { AS(getfh_args), (sy_call_t *)sys_getfh, AUE_NFS_GETFH, NULL, 0, 0, 0, SY_THR_STATIC }, /* 161 = getfh */
+ { compat4(AS(freebsd4_getdomainname_args),getdomainname), AUE_SYSCTL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 162 = freebsd4 getdomainname */
+ { compat4(AS(freebsd4_setdomainname_args),setdomainname), AUE_SYSCTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 163 = freebsd4 setdomainname */
+ { compat4(AS(freebsd4_uname_args),uname), AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 164 = freebsd4 uname */
+ { AS(sysarch_args), (sy_call_t *)sysarch, AUE_SYSARCH, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 165 = sysarch */
+ { AS(rtprio_args), (sy_call_t *)sys_rtprio, AUE_RTPRIO, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 166 = rtprio */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 167 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 168 = nosys */
+ { AS(semsys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 169 = semsys */
+ { AS(msgsys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 170 = msgsys */
+ { AS(shmsys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 171 = shmsys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 172 = nosys */
+ { AS(freebsd6_pread_args), (sy_call_t *)freebsd6_pread, AUE_PREAD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 173 = freebsd6_pread */
+ { AS(freebsd6_pwrite_args), (sy_call_t *)freebsd6_pwrite, AUE_PWRITE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 174 = freebsd6_pwrite */
+ { AS(setfib_args), (sy_call_t *)sys_setfib, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 175 = setfib */
+ { AS(ntp_adjtime_args), (sy_call_t *)sys_ntp_adjtime, AUE_NTP_ADJTIME, NULL, 0, 0, 0, SY_THR_STATIC }, /* 176 = ntp_adjtime */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 177 = sfork */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 178 = getdescriptor */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 179 = setdescriptor */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 180 = nosys */
+ { AS(setgid_args), (sy_call_t *)sys_setgid, AUE_SETGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 181 = setgid */
+ { AS(setegid_args), (sy_call_t *)sys_setegid, AUE_SETEGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 182 = setegid */
+ { AS(seteuid_args), (sy_call_t *)sys_seteuid, AUE_SETEUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 183 = seteuid */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 184 = lfs_bmapv */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 185 = lfs_markv */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 186 = lfs_segclean */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 187 = lfs_segwait */
+ { AS(stat_args), (sy_call_t *)sys_stat, AUE_STAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 188 = stat */
+ { AS(fstat_args), (sy_call_t *)sys_fstat, AUE_FSTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 189 = fstat */
+ { AS(lstat_args), (sy_call_t *)sys_lstat, AUE_LSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 190 = lstat */
+ { AS(pathconf_args), (sy_call_t *)sys_pathconf, AUE_PATHCONF, NULL, 0, 0, 0, SY_THR_STATIC }, /* 191 = pathconf */
+ { AS(fpathconf_args), (sy_call_t *)sys_fpathconf, AUE_FPATHCONF, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 192 = fpathconf */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 193 = nosys */
+ { AS(__getrlimit_args), (sy_call_t *)sys_getrlimit, AUE_GETRLIMIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 194 = getrlimit */
+ { AS(__setrlimit_args), (sy_call_t *)sys_setrlimit, AUE_SETRLIMIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 195 = setrlimit */
+ { AS(getdirentries_args), (sy_call_t *)sys_getdirentries, AUE_GETDIRENTRIES, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 196 = getdirentries */
+ { AS(freebsd6_mmap_args), (sy_call_t *)freebsd6_mmap, AUE_MMAP, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 197 = freebsd6_mmap */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 198 = __syscall */
+ { AS(freebsd6_lseek_args), (sy_call_t *)freebsd6_lseek, AUE_LSEEK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 199 = freebsd6_lseek */
+ { AS(freebsd6_truncate_args), (sy_call_t *)freebsd6_truncate, AUE_TRUNCATE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 200 = freebsd6_truncate */
+ { AS(freebsd6_ftruncate_args), (sy_call_t *)freebsd6_ftruncate, AUE_FTRUNCATE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 201 = freebsd6_ftruncate */
+ { AS(sysctl_args), (sy_call_t *)sys___sysctl, AUE_SYSCTL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 202 = __sysctl */
+ { AS(mlock_args), (sy_call_t *)sys_mlock, AUE_MLOCK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 203 = mlock */
+ { AS(munlock_args), (sy_call_t *)sys_munlock, AUE_MUNLOCK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 204 = munlock */
+ { AS(undelete_args), (sy_call_t *)sys_undelete, AUE_UNDELETE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 205 = undelete */
+ { AS(futimes_args), (sy_call_t *)sys_futimes, AUE_FUTIMES, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 206 = futimes */
+ { AS(getpgid_args), (sy_call_t *)sys_getpgid, AUE_GETPGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 207 = getpgid */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 208 = newreboot */
+ { AS(poll_args), (sy_call_t *)sys_poll, AUE_POLL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 209 = poll */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 210 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 211 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 212 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 213 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 214 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 215 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 216 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 217 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 218 = lkmnosys */
+ { AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 219 = lkmnosys */
+ { 0, (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 220 = freebsd7 __semctl */
+ { AS(semget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 221 = semget */
+ { AS(semop_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 222 = semop */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 223 = semconfig */
+ { 0, (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 224 = freebsd7 msgctl */
+ { AS(msgget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 225 = msgget */
+ { AS(msgsnd_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 226 = msgsnd */
+ { AS(msgrcv_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 227 = msgrcv */
+ { AS(shmat_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 228 = shmat */
+ { 0, (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 229 = freebsd7 shmctl */
+ { AS(shmdt_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 230 = shmdt */
+ { AS(shmget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 231 = shmget */
+ { AS(clock_gettime_args), (sy_call_t *)sys_clock_gettime, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 232 = clock_gettime */
+ { AS(clock_settime_args), (sy_call_t *)sys_clock_settime, AUE_CLOCK_SETTIME, NULL, 0, 0, 0, SY_THR_STATIC }, /* 233 = clock_settime */
+ { AS(clock_getres_args), (sy_call_t *)sys_clock_getres, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 234 = clock_getres */
+ { AS(ktimer_create_args), (sy_call_t *)sys_ktimer_create, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 235 = ktimer_create */
+ { AS(ktimer_delete_args), (sy_call_t *)sys_ktimer_delete, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 236 = ktimer_delete */
+ { AS(ktimer_settime_args), (sy_call_t *)sys_ktimer_settime, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 237 = ktimer_settime */
+ { AS(ktimer_gettime_args), (sy_call_t *)sys_ktimer_gettime, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 238 = ktimer_gettime */
+ { AS(ktimer_getoverrun_args), (sy_call_t *)sys_ktimer_getoverrun, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 239 = ktimer_getoverrun */
+ { AS(nanosleep_args), (sy_call_t *)sys_nanosleep, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 240 = nanosleep */
+ { AS(ffclock_getcounter_args), (sy_call_t *)sys_ffclock_getcounter, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 241 = ffclock_getcounter */
+ { AS(ffclock_setestimate_args), (sy_call_t *)sys_ffclock_setestimate, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 242 = ffclock_setestimate */
+ { AS(ffclock_getestimate_args), (sy_call_t *)sys_ffclock_getestimate, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 243 = ffclock_getestimate */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 244 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 245 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 246 = nosys */
+ { AS(clock_getcpuclockid2_args), (sy_call_t *)sys_clock_getcpuclockid2, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 247 = clock_getcpuclockid2 */
+ { AS(ntp_gettime_args), (sy_call_t *)sys_ntp_gettime, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 248 = ntp_gettime */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 249 = nosys */
+ { AS(minherit_args), (sy_call_t *)sys_minherit, AUE_MINHERIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 250 = minherit */
+ { AS(rfork_args), (sy_call_t *)sys_rfork, AUE_RFORK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 251 = rfork */
+ { AS(openbsd_poll_args), (sy_call_t *)sys_openbsd_poll, AUE_POLL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 252 = openbsd_poll */
+ { 0, (sy_call_t *)sys_issetugid, AUE_ISSETUGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 253 = issetugid */
+ { AS(lchown_args), (sy_call_t *)sys_lchown, AUE_LCHOWN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 254 = lchown */
+ { AS(aio_read_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 255 = aio_read */
+ { AS(aio_write_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 256 = aio_write */
+ { AS(lio_listio_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 257 = lio_listio */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 258 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 259 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 260 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 261 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 262 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 263 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 264 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 265 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 266 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 267 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 268 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 269 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 270 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 271 = nosys */
+ { AS(getdents_args), (sy_call_t *)sys_getdents, AUE_O_GETDENTS, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 272 = getdents */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 273 = nosys */
+ { AS(lchmod_args), (sy_call_t *)sys_lchmod, AUE_LCHMOD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 274 = lchmod */
+ { AS(lchown_args), (sy_call_t *)sys_lchown, AUE_LCHOWN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 275 = netbsd_lchown */
+ { AS(lutimes_args), (sy_call_t *)sys_lutimes, AUE_LUTIMES, NULL, 0, 0, 0, SY_THR_STATIC }, /* 276 = lutimes */
+ { AS(msync_args), (sy_call_t *)sys_msync, AUE_MSYNC, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 277 = netbsd_msync */
+ { AS(nstat_args), (sy_call_t *)sys_nstat, AUE_STAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 278 = nstat */
+ { AS(nfstat_args), (sy_call_t *)sys_nfstat, AUE_FSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 279 = nfstat */
+ { AS(nlstat_args), (sy_call_t *)sys_nlstat, AUE_LSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 280 = nlstat */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 281 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 282 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 283 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 284 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 285 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 286 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 287 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 288 = nosys */
+ { AS(preadv_args), (sy_call_t *)sys_preadv, AUE_PREADV, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 289 = preadv */
+ { AS(pwritev_args), (sy_call_t *)sys_pwritev, AUE_PWRITEV, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 290 = pwritev */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 291 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 292 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 293 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 294 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 295 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 296 = nosys */
+ { compat4(AS(freebsd4_fhstatfs_args),fhstatfs), AUE_FHSTATFS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 297 = freebsd4 fhstatfs */
+ { AS(fhopen_args), (sy_call_t *)sys_fhopen, AUE_FHOPEN, NULL, 0, 0, 0, SY_THR_STATIC }, /* 298 = fhopen */
+ { AS(fhstat_args), (sy_call_t *)sys_fhstat, AUE_FHSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 299 = fhstat */
+ { AS(modnext_args), (sy_call_t *)sys_modnext, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 300 = modnext */
+ { AS(modstat_args), (sy_call_t *)sys_modstat, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 301 = modstat */
+ { AS(modfnext_args), (sy_call_t *)sys_modfnext, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 302 = modfnext */
+ { AS(modfind_args), (sy_call_t *)sys_modfind, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 303 = modfind */
+ { AS(kldload_args), (sy_call_t *)sys_kldload, AUE_MODLOAD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 304 = kldload */
+ { AS(kldunload_args), (sy_call_t *)sys_kldunload, AUE_MODUNLOAD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 305 = kldunload */
+ { AS(kldfind_args), (sy_call_t *)sys_kldfind, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 306 = kldfind */
+ { AS(kldnext_args), (sy_call_t *)sys_kldnext, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 307 = kldnext */
+ { AS(kldstat_args), (sy_call_t *)sys_kldstat, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 308 = kldstat */
+ { AS(kldfirstmod_args), (sy_call_t *)sys_kldfirstmod, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 309 = kldfirstmod */
+ { AS(getsid_args), (sy_call_t *)sys_getsid, AUE_GETSID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 310 = getsid */
+ { AS(setresuid_args), (sy_call_t *)sys_setresuid, AUE_SETRESUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 311 = setresuid */
+ { AS(setresgid_args), (sy_call_t *)sys_setresgid, AUE_SETRESGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 312 = setresgid */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 313 = obsolete signanosleep */
+ { AS(aio_return_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 314 = aio_return */
+ { AS(aio_suspend_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 315 = aio_suspend */
+ { AS(aio_cancel_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 316 = aio_cancel */
+ { AS(aio_error_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 317 = aio_error */
+ { AS(oaio_read_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 318 = oaio_read */
+ { AS(oaio_write_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 319 = oaio_write */
+ { AS(olio_listio_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 320 = olio_listio */
+ { 0, (sy_call_t *)sys_yield, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 321 = yield */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 322 = obsolete thr_sleep */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 323 = obsolete thr_wakeup */
+ { AS(mlockall_args), (sy_call_t *)sys_mlockall, AUE_MLOCKALL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 324 = mlockall */
+ { 0, (sy_call_t *)sys_munlockall, AUE_MUNLOCKALL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 325 = munlockall */
+ { AS(__getcwd_args), (sy_call_t *)sys___getcwd, AUE_GETCWD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 326 = __getcwd */
+ { AS(sched_setparam_args), (sy_call_t *)sys_sched_setparam, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 327 = sched_setparam */
+ { AS(sched_getparam_args), (sy_call_t *)sys_sched_getparam, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 328 = sched_getparam */
+ { AS(sched_setscheduler_args), (sy_call_t *)sys_sched_setscheduler, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 329 = sched_setscheduler */
+ { AS(sched_getscheduler_args), (sy_call_t *)sys_sched_getscheduler, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 330 = sched_getscheduler */
+ { 0, (sy_call_t *)sys_sched_yield, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 331 = sched_yield */
+ { AS(sched_get_priority_max_args), (sy_call_t *)sys_sched_get_priority_max, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 332 = sched_get_priority_max */
+ { AS(sched_get_priority_min_args), (sy_call_t *)sys_sched_get_priority_min, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 333 = sched_get_priority_min */
+ { AS(sched_rr_get_interval_args), (sy_call_t *)sys_sched_rr_get_interval, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 334 = sched_rr_get_interval */
+ { AS(utrace_args), (sy_call_t *)sys_utrace, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 335 = utrace */
+ { compat4(AS(freebsd4_sendfile_args),sendfile), AUE_SENDFILE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 336 = freebsd4 sendfile */
+ { AS(kldsym_args), (sy_call_t *)sys_kldsym, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 337 = kldsym */
+ { AS(jail_args), (sy_call_t *)sys_jail, AUE_JAIL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 338 = jail */
+ { AS(nnpfs_syscall_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 339 = nnpfs_syscall */
+ { AS(sigprocmask_args), (sy_call_t *)sys_sigprocmask, AUE_SIGPROCMASK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 340 = sigprocmask */
+ { AS(sigsuspend_args), (sy_call_t *)sys_sigsuspend, AUE_SIGSUSPEND, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 341 = sigsuspend */
+ { compat4(AS(freebsd4_sigaction_args),sigaction), AUE_SIGACTION, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 342 = freebsd4 sigaction */
+ { AS(sigpending_args), (sy_call_t *)sys_sigpending, AUE_SIGPENDING, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 343 = sigpending */
+ { compat4(AS(freebsd4_sigreturn_args),sigreturn), AUE_SIGRETURN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 344 = freebsd4 sigreturn */
+ { AS(sigtimedwait_args), (sy_call_t *)sys_sigtimedwait, AUE_SIGWAIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 345 = sigtimedwait */
+ { AS(sigwaitinfo_args), (sy_call_t *)sys_sigwaitinfo, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 346 = sigwaitinfo */
+ { AS(__acl_get_file_args), (sy_call_t *)sys___acl_get_file, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 347 = __acl_get_file */
+ { AS(__acl_set_file_args), (sy_call_t *)sys___acl_set_file, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 348 = __acl_set_file */
+ { AS(__acl_get_fd_args), (sy_call_t *)sys___acl_get_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 349 = __acl_get_fd */
+ { AS(__acl_set_fd_args), (sy_call_t *)sys___acl_set_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 350 = __acl_set_fd */
+ { AS(__acl_delete_file_args), (sy_call_t *)sys___acl_delete_file, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 351 = __acl_delete_file */
+ { AS(__acl_delete_fd_args), (sy_call_t *)sys___acl_delete_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 352 = __acl_delete_fd */
+ { AS(__acl_aclcheck_file_args), (sy_call_t *)sys___acl_aclcheck_file, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 353 = __acl_aclcheck_file */
+ { AS(__acl_aclcheck_fd_args), (sy_call_t *)sys___acl_aclcheck_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 354 = __acl_aclcheck_fd */
+ { AS(extattrctl_args), (sy_call_t *)sys_extattrctl, AUE_EXTATTRCTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 355 = extattrctl */
+ { AS(extattr_set_file_args), (sy_call_t *)sys_extattr_set_file, AUE_EXTATTR_SET_FILE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 356 = extattr_set_file */
+ { AS(extattr_get_file_args), (sy_call_t *)sys_extattr_get_file, AUE_EXTATTR_GET_FILE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 357 = extattr_get_file */
+ { AS(extattr_delete_file_args), (sy_call_t *)sys_extattr_delete_file, AUE_EXTATTR_DELETE_FILE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 358 = extattr_delete_file */
+ { AS(aio_waitcomplete_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 359 = aio_waitcomplete */
+ { AS(getresuid_args), (sy_call_t *)sys_getresuid, AUE_GETRESUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 360 = getresuid */
+ { AS(getresgid_args), (sy_call_t *)sys_getresgid, AUE_GETRESGID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 361 = getresgid */
+ { 0, (sy_call_t *)sys_kqueue, AUE_KQUEUE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 362 = kqueue */
+ { AS(kevent_args), (sy_call_t *)sys_kevent, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 363 = kevent */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 364 = __cap_get_proc */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 365 = __cap_set_proc */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 366 = __cap_get_fd */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 367 = __cap_get_file */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 368 = __cap_set_fd */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 369 = __cap_set_file */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 370 = nosys */
+ { AS(extattr_set_fd_args), (sy_call_t *)sys_extattr_set_fd, AUE_EXTATTR_SET_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 371 = extattr_set_fd */
+ { AS(extattr_get_fd_args), (sy_call_t *)sys_extattr_get_fd, AUE_EXTATTR_GET_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 372 = extattr_get_fd */
+ { AS(extattr_delete_fd_args), (sy_call_t *)sys_extattr_delete_fd, AUE_EXTATTR_DELETE_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 373 = extattr_delete_fd */
+ { AS(__setugid_args), (sy_call_t *)sys___setugid, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 374 = __setugid */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 375 = nfsclnt */
+ { AS(eaccess_args), (sy_call_t *)sys_eaccess, AUE_EACCESS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 376 = eaccess */
+ { AS(afs3_syscall_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 377 = afs3_syscall */
+ { AS(nmount_args), (sy_call_t *)sys_nmount, AUE_NMOUNT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 378 = nmount */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 379 = kse_exit */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 380 = kse_wakeup */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 381 = kse_create */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 382 = kse_thr_interrupt */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 383 = kse_release */
+ { AS(__mac_get_proc_args), (sy_call_t *)sys___mac_get_proc, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 384 = __mac_get_proc */
+ { AS(__mac_set_proc_args), (sy_call_t *)sys___mac_set_proc, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 385 = __mac_set_proc */
+ { AS(__mac_get_fd_args), (sy_call_t *)sys___mac_get_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 386 = __mac_get_fd */
+ { AS(__mac_get_file_args), (sy_call_t *)sys___mac_get_file, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 387 = __mac_get_file */
+ { AS(__mac_set_fd_args), (sy_call_t *)sys___mac_set_fd, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 388 = __mac_set_fd */
+ { AS(__mac_set_file_args), (sy_call_t *)sys___mac_set_file, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 389 = __mac_set_file */
+ { AS(kenv_args), (sy_call_t *)sys_kenv, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 390 = kenv */
+ { AS(lchflags_args), (sy_call_t *)sys_lchflags, AUE_LCHFLAGS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 391 = lchflags */
+ { AS(uuidgen_args), (sy_call_t *)sys_uuidgen, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 392 = uuidgen */
+ { AS(sendfile_args), (sy_call_t *)sys_sendfile, AUE_SENDFILE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 393 = sendfile */
+ { AS(mac_syscall_args), (sy_call_t *)sys_mac_syscall, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 394 = mac_syscall */
+ { AS(getfsstat_args), (sy_call_t *)sys_getfsstat, AUE_GETFSSTAT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 395 = getfsstat */
+ { AS(statfs_args), (sy_call_t *)sys_statfs, AUE_STATFS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 396 = statfs */
+ { AS(fstatfs_args), (sy_call_t *)sys_fstatfs, AUE_FSTATFS, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 397 = fstatfs */
+ { AS(fhstatfs_args), (sy_call_t *)sys_fhstatfs, AUE_FHSTATFS, NULL, 0, 0, 0, SY_THR_STATIC }, /* 398 = fhstatfs */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 399 = nosys */
+ { AS(ksem_close_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 400 = ksem_close */
+ { AS(ksem_post_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 401 = ksem_post */
+ { AS(ksem_wait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 402 = ksem_wait */
+ { AS(ksem_trywait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 403 = ksem_trywait */
+ { AS(ksem_init_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 404 = ksem_init */
+ { AS(ksem_open_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 405 = ksem_open */
+ { AS(ksem_unlink_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 406 = ksem_unlink */
+ { AS(ksem_getvalue_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 407 = ksem_getvalue */
+ { AS(ksem_destroy_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 408 = ksem_destroy */
+ { AS(__mac_get_pid_args), (sy_call_t *)sys___mac_get_pid, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 409 = __mac_get_pid */
+ { AS(__mac_get_link_args), (sy_call_t *)sys___mac_get_link, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 410 = __mac_get_link */
+ { AS(__mac_set_link_args), (sy_call_t *)sys___mac_set_link, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 411 = __mac_set_link */
+ { AS(extattr_set_link_args), (sy_call_t *)sys_extattr_set_link, AUE_EXTATTR_SET_LINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 412 = extattr_set_link */
+ { AS(extattr_get_link_args), (sy_call_t *)sys_extattr_get_link, AUE_EXTATTR_GET_LINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 413 = extattr_get_link */
+ { AS(extattr_delete_link_args), (sy_call_t *)sys_extattr_delete_link, AUE_EXTATTR_DELETE_LINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 414 = extattr_delete_link */
+ { AS(__mac_execve_args), (sy_call_t *)sys___mac_execve, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 415 = __mac_execve */
+ { AS(sigaction_args), (sy_call_t *)sys_sigaction, AUE_SIGACTION, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 416 = sigaction */
+ { AS(sigreturn_args), (sy_call_t *)sys_sigreturn, AUE_SIGRETURN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 417 = sigreturn */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 418 = __xstat */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 419 = __xfstat */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 420 = __xlstat */
+ { AS(getcontext_args), (sy_call_t *)sys_getcontext, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 421 = getcontext */
+ { AS(setcontext_args), (sy_call_t *)sys_setcontext, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 422 = setcontext */
+ { AS(swapcontext_args), (sy_call_t *)sys_swapcontext, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 423 = swapcontext */
+ { AS(swapoff_args), (sy_call_t *)sys_swapoff, AUE_SWAPOFF, NULL, 0, 0, 0, SY_THR_STATIC }, /* 424 = swapoff */
+ { AS(__acl_get_link_args), (sy_call_t *)sys___acl_get_link, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 425 = __acl_get_link */
+ { AS(__acl_set_link_args), (sy_call_t *)sys___acl_set_link, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 426 = __acl_set_link */
+ { AS(__acl_delete_link_args), (sy_call_t *)sys___acl_delete_link, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 427 = __acl_delete_link */
+ { AS(__acl_aclcheck_link_args), (sy_call_t *)sys___acl_aclcheck_link, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 428 = __acl_aclcheck_link */
+ { AS(sigwait_args), (sy_call_t *)sys_sigwait, AUE_SIGWAIT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 429 = sigwait */
+ { AS(thr_create_args), (sy_call_t *)sys_thr_create, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 430 = thr_create */
+ { AS(thr_exit_args), (sy_call_t *)sys_thr_exit, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 431 = thr_exit */
+ { AS(thr_self_args), (sy_call_t *)sys_thr_self, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 432 = thr_self */
+ { AS(thr_kill_args), (sy_call_t *)sys_thr_kill, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 433 = thr_kill */
+ { AS(_umtx_lock_args), (sy_call_t *)sys__umtx_lock, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 434 = _umtx_lock */
+ { AS(_umtx_unlock_args), (sy_call_t *)sys__umtx_unlock, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 435 = _umtx_unlock */
+ { AS(jail_attach_args), (sy_call_t *)sys_jail_attach, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 436 = jail_attach */
+ { AS(extattr_list_fd_args), (sy_call_t *)sys_extattr_list_fd, AUE_EXTATTR_LIST_FD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 437 = extattr_list_fd */
+ { AS(extattr_list_file_args), (sy_call_t *)sys_extattr_list_file, AUE_EXTATTR_LIST_FILE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 438 = extattr_list_file */
+ { AS(extattr_list_link_args), (sy_call_t *)sys_extattr_list_link, AUE_EXTATTR_LIST_LINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 439 = extattr_list_link */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 440 = kse_switchin */
+ { AS(ksem_timedwait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 441 = ksem_timedwait */
+ { AS(thr_suspend_args), (sy_call_t *)sys_thr_suspend, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 442 = thr_suspend */
+ { AS(thr_wake_args), (sy_call_t *)sys_thr_wake, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 443 = thr_wake */
+ { AS(kldunloadf_args), (sy_call_t *)sys_kldunloadf, AUE_MODUNLOAD, NULL, 0, 0, 0, SY_THR_STATIC }, /* 444 = kldunloadf */
+ { AS(audit_args), (sy_call_t *)sys_audit, AUE_AUDIT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 445 = audit */
+ { AS(auditon_args), (sy_call_t *)sys_auditon, AUE_AUDITON, NULL, 0, 0, 0, SY_THR_STATIC }, /* 446 = auditon */
+ { AS(getauid_args), (sy_call_t *)sys_getauid, AUE_GETAUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 447 = getauid */
+ { AS(setauid_args), (sy_call_t *)sys_setauid, AUE_SETAUID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 448 = setauid */
+ { AS(getaudit_args), (sy_call_t *)sys_getaudit, AUE_GETAUDIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 449 = getaudit */
+ { AS(setaudit_args), (sy_call_t *)sys_setaudit, AUE_SETAUDIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 450 = setaudit */
+ { AS(getaudit_addr_args), (sy_call_t *)sys_getaudit_addr, AUE_GETAUDIT_ADDR, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 451 = getaudit_addr */
+ { AS(setaudit_addr_args), (sy_call_t *)sys_setaudit_addr, AUE_SETAUDIT_ADDR, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 452 = setaudit_addr */
+ { AS(auditctl_args), (sy_call_t *)sys_auditctl, AUE_AUDITCTL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 453 = auditctl */
+ { AS(_umtx_op_args), (sy_call_t *)sys__umtx_op, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 454 = _umtx_op */
+ { AS(thr_new_args), (sy_call_t *)sys_thr_new, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 455 = thr_new */
+ { AS(sigqueue_args), (sy_call_t *)sys_sigqueue, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 456 = sigqueue */
+ { AS(kmq_open_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 457 = kmq_open */
+ { AS(kmq_setattr_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 458 = kmq_setattr */
+ { AS(kmq_timedreceive_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 459 = kmq_timedreceive */
+ { AS(kmq_timedsend_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 460 = kmq_timedsend */
+ { AS(kmq_notify_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 461 = kmq_notify */
+ { AS(kmq_unlink_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 462 = kmq_unlink */
+ { AS(abort2_args), (sy_call_t *)sys_abort2, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 463 = abort2 */
+ { AS(thr_set_name_args), (sy_call_t *)sys_thr_set_name, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 464 = thr_set_name */
+ { AS(aio_fsync_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 465 = aio_fsync */
+ { AS(rtprio_thread_args), (sy_call_t *)sys_rtprio_thread, AUE_RTPRIO, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 466 = rtprio_thread */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 467 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 468 = nosys */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 469 = __getpath_fromfd */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 470 = __getpath_fromaddr */
+ { AS(sctp_peeloff_args), (sy_call_t *)sys_sctp_peeloff, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 471 = sctp_peeloff */
+ { AS(sctp_generic_sendmsg_args), (sy_call_t *)sys_sctp_generic_sendmsg, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 472 = sctp_generic_sendmsg */
+ { AS(sctp_generic_sendmsg_iov_args), (sy_call_t *)sys_sctp_generic_sendmsg_iov, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 473 = sctp_generic_sendmsg_iov */
+ { AS(sctp_generic_recvmsg_args), (sy_call_t *)sys_sctp_generic_recvmsg, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 474 = sctp_generic_recvmsg */
+ { AS(pread_args), (sy_call_t *)sys_pread, AUE_PREAD, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 475 = pread */
+ { AS(pwrite_args), (sy_call_t *)sys_pwrite, AUE_PWRITE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 476 = pwrite */
+ { AS(mmap_args), (sy_call_t *)sys_mmap, AUE_MMAP, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 477 = mmap */
+ { AS(lseek_args), (sy_call_t *)sys_lseek, AUE_LSEEK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 478 = lseek */
+ { AS(truncate_args), (sy_call_t *)sys_truncate, AUE_TRUNCATE, NULL, 0, 0, 0, SY_THR_STATIC }, /* 479 = truncate */
+ { AS(ftruncate_args), (sy_call_t *)sys_ftruncate, AUE_FTRUNCATE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 480 = ftruncate */
+ { AS(thr_kill2_args), (sy_call_t *)sys_thr_kill2, AUE_KILL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 481 = thr_kill2 */
+ { AS(shm_open_args), (sy_call_t *)sys_shm_open, AUE_SHMOPEN, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 482 = shm_open */
+ { AS(shm_unlink_args), (sy_call_t *)sys_shm_unlink, AUE_SHMUNLINK, NULL, 0, 0, 0, SY_THR_STATIC }, /* 483 = shm_unlink */
+ { AS(cpuset_args), (sy_call_t *)sys_cpuset, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 484 = cpuset */
+ { AS(cpuset_setid_args), (sy_call_t *)sys_cpuset_setid, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 485 = cpuset_setid */
+ { AS(cpuset_getid_args), (sy_call_t *)sys_cpuset_getid, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 486 = cpuset_getid */
+ { AS(cpuset_getaffinity_args), (sy_call_t *)sys_cpuset_getaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 487 = cpuset_getaffinity */
+ { AS(cpuset_setaffinity_args), (sy_call_t *)sys_cpuset_setaffinity, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 488 = cpuset_setaffinity */
+ { AS(faccessat_args), (sy_call_t *)sys_faccessat, AUE_FACCESSAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 489 = faccessat */
+ { AS(fchmodat_args), (sy_call_t *)sys_fchmodat, AUE_FCHMODAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 490 = fchmodat */
+ { AS(fchownat_args), (sy_call_t *)sys_fchownat, AUE_FCHOWNAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 491 = fchownat */
+ { AS(fexecve_args), (sy_call_t *)sys_fexecve, AUE_FEXECVE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 492 = fexecve */
+ { AS(fstatat_args), (sy_call_t *)sys_fstatat, AUE_FSTATAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 493 = fstatat */
+ { AS(futimesat_args), (sy_call_t *)sys_futimesat, AUE_FUTIMESAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 494 = futimesat */
+ { AS(linkat_args), (sy_call_t *)sys_linkat, AUE_LINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 495 = linkat */
+ { AS(mkdirat_args), (sy_call_t *)sys_mkdirat, AUE_MKDIRAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 496 = mkdirat */
+ { AS(mkfifoat_args), (sy_call_t *)sys_mkfifoat, AUE_MKFIFOAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 497 = mkfifoat */
+ { AS(mknodat_args), (sy_call_t *)sys_mknodat, AUE_MKNODAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 498 = mknodat */
+ { AS(openat_args), (sy_call_t *)sys_openat, AUE_OPENAT_RWTC, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 499 = openat */
+ { AS(readlinkat_args), (sy_call_t *)sys_readlinkat, AUE_READLINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 500 = readlinkat */
+ { AS(renameat_args), (sy_call_t *)sys_renameat, AUE_RENAMEAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 501 = renameat */
+ { AS(symlinkat_args), (sy_call_t *)sys_symlinkat, AUE_SYMLINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 502 = symlinkat */
+ { AS(unlinkat_args), (sy_call_t *)sys_unlinkat, AUE_UNLINKAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 503 = unlinkat */
+ { AS(posix_openpt_args), (sy_call_t *)sys_posix_openpt, AUE_POSIX_OPENPT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 504 = posix_openpt */
+ { AS(gssd_syscall_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 505 = gssd_syscall */
+ { AS(jail_get_args), (sy_call_t *)sys_jail_get, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 506 = jail_get */
+ { AS(jail_set_args), (sy_call_t *)sys_jail_set, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 507 = jail_set */
+ { AS(jail_remove_args), (sy_call_t *)sys_jail_remove, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 508 = jail_remove */
+ { AS(closefrom_args), (sy_call_t *)sys_closefrom, AUE_CLOSEFROM, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 509 = closefrom */
+ { AS(__semctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 510 = __semctl */
+ { AS(msgctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 511 = msgctl */
+ { AS(shmctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 512 = shmctl */
+ { AS(lpathconf_args), (sy_call_t *)sys_lpathconf, AUE_LPATHCONF, NULL, 0, 0, 0, SY_THR_STATIC }, /* 513 = lpathconf */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 514 = obsolete cap_new */
+ { AS(__cap_rights_get_args), (sy_call_t *)sys___cap_rights_get, AUE_CAP_RIGHTS_GET, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 515 = __cap_rights_get */
+ { 0, (sy_call_t *)sys_cap_enter, AUE_CAP_ENTER, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 516 = cap_enter */
+ { AS(cap_getmode_args), (sy_call_t *)sys_cap_getmode, AUE_CAP_GETMODE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 517 = cap_getmode */
+ { AS(pdfork_args), (sy_call_t *)sys_pdfork, AUE_PDFORK, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 518 = pdfork */
+ { AS(pdkill_args), (sy_call_t *)sys_pdkill, AUE_PDKILL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 519 = pdkill */
+ { AS(pdgetpid_args), (sy_call_t *)sys_pdgetpid, AUE_PDGETPID, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 520 = pdgetpid */
+ { 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 521 = pdwait4 */
+ { AS(pselect_args), (sy_call_t *)sys_pselect, AUE_SELECT, NULL, 0, 0, 0, SY_THR_STATIC }, /* 522 = pselect */
+ { AS(getloginclass_args), (sy_call_t *)sys_getloginclass, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 523 = getloginclass */
+ { AS(setloginclass_args), (sy_call_t *)sys_setloginclass, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 524 = setloginclass */
+ { AS(rctl_get_racct_args), (sy_call_t *)sys_rctl_get_racct, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 525 = rctl_get_racct */
+ { AS(rctl_get_rules_args), (sy_call_t *)sys_rctl_get_rules, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 526 = rctl_get_rules */
+ { AS(rctl_get_limits_args), (sy_call_t *)sys_rctl_get_limits, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 527 = rctl_get_limits */
+ { AS(rctl_add_rule_args), (sy_call_t *)sys_rctl_add_rule, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 528 = rctl_add_rule */
+ { AS(rctl_remove_rule_args), (sy_call_t *)sys_rctl_remove_rule, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 529 = rctl_remove_rule */
+ { AS(posix_fallocate_args), (sy_call_t *)sys_posix_fallocate, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 530 = posix_fallocate */
+ { AS(posix_fadvise_args), (sy_call_t *)sys_posix_fadvise, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC }, /* 531 = posix_fadvise */
+ { AS(wait6_args), (sy_call_t *)sys_wait6, AUE_WAIT6, NULL, 0, 0, 0, SY_THR_STATIC }, /* 532 = wait6 */
+ { AS(cap_rights_limit_args), (sy_call_t *)sys_cap_rights_limit, AUE_CAP_RIGHTS_LIMIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 533 = cap_rights_limit */
+ { AS(cap_ioctls_limit_args), (sy_call_t *)sys_cap_ioctls_limit, AUE_CAP_IOCTLS_LIMIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 534 = cap_ioctls_limit */
+ { AS(cap_ioctls_get_args), (sy_call_t *)sys_cap_ioctls_get, AUE_CAP_IOCTLS_GET, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 535 = cap_ioctls_get */
+ { AS(cap_fcntls_limit_args), (sy_call_t *)sys_cap_fcntls_limit, AUE_CAP_FCNTLS_LIMIT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 536 = cap_fcntls_limit */
+ { AS(cap_fcntls_get_args), (sy_call_t *)sys_cap_fcntls_get, AUE_CAP_FCNTLS_GET, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 537 = cap_fcntls_get */
+ { AS(bindat_args), (sy_call_t *)sys_bindat, AUE_BINDAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 538 = bindat */
+ { AS(connectat_args), (sy_call_t *)sys_connectat, AUE_CONNECTAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 539 = connectat */
+ { AS(chflagsat_args), (sy_call_t *)sys_chflagsat, AUE_CHFLAGSAT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 540 = chflagsat */
+ { AS(accept4_args), (sy_call_t *)sys_accept4, AUE_ACCEPT, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 541 = accept4 */
+ { AS(pipe2_args), (sy_call_t *)sys_pipe2, AUE_PIPE, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC }, /* 542 = pipe2 */
+ { AS(aio_mlock_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT }, /* 543 = aio_mlock */
+};
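
For readers skimming the generated table that ends above: each row follows the kernel's sysent column order as it appears in the entries themselves — argument count, handler pointer, audit event, an optional systrace argument-conversion hook, two DTrace probe IDs, per-syscall flags (SYF_CAPENABLED marks calls permitted in capability mode), and a threading tag (SY_THR_STATIC vs. SY_THR_ABSENT). The stand-alone sketch below only mirrors that shape with simplified, made-up types and constants so the columns can be read off; it is an illustrative assumption, not the kernel's actual <sys/sysent.h> declarations or dispatch code.

/*
 * Minimal user-space sketch of the row layout seen in the table above.
 * All names here (sketch_*, SKETCH_*) are stand-ins, not kernel symbols;
 * only the column order is taken from the generated table: narg, handler,
 * audit event, systrace hook, DTrace entry/return IDs, flags, thread tag.
 */
#include <stdio.h>

typedef int (sketch_call_t)(void *td, void *uap);

struct sketch_sysent {
	int		sy_narg;		/* number of syscall arguments */
	sketch_call_t	*sy_call;		/* implementing function */
	unsigned short	sy_auevent;		/* audit event identifier */
	void		*sy_systrace_args_func;	/* optional arg-conversion hook */
	unsigned int	sy_entry;		/* DTrace entry probe ID */
	unsigned int	sy_return;		/* DTrace return probe ID */
	unsigned int	sy_flags;		/* e.g. capability-mode permitted */
	unsigned int	sy_thrcnt;		/* static / absent threading tag */
};

/* Illustrative stand-ins for the kernel's flag values (assumption). */
#define	SKETCH_CAPENABLED	0x1
#define	SKETCH_THR_STATIC	0x1
#define	SKETCH_THR_ABSENT	0x2

static int
sketch_getpid(void *td, void *uap)
{
	(void)td; (void)uap;
	return (0);		/* pretend success */
}

static int
sketch_nosys(void *td, void *uap)
{
	(void)td; (void)uap;
	return (78);		/* ENOSYS-like "no such syscall" result */
}

/* Two mock rows in the same column order as the table above. */
static struct sketch_sysent sketch_table[] = {
	{ 0, sketch_getpid, 0, NULL, 0, 0, SKETCH_CAPENABLED, SKETCH_THR_STATIC },
	{ 0, sketch_nosys,  0, NULL, 0, 0, 0,                 SKETCH_THR_ABSENT },
};

int
main(void)
{
	unsigned int i;

	/* Dispatch resembles a sysent lookup: index by slot, call the handler. */
	for (i = 0; i < sizeof(sketch_table) / sizeof(sketch_table[0]); i++)
		printf("slot %u -> narg %d, flags %#x, rv %d\n", i,
		    sketch_table[i].sy_narg, sketch_table[i].sy_flags,
		    sketch_table[i].sy_call(NULL, NULL));
	return (0);
}
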
diff --git a/sys/kern/kern_acct.c b/sys/kern/kern_acct.c
new file mode 100644
index 0000000..ef3fd2e
--- /dev/null
+++ b/sys/kern/kern_acct.c
@@ -0,0 +1,647 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * Copyright (c) 2005 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright (c) 1994 Christopher G. Demetriou
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_acct.c 8.1 (Berkeley) 6/14/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/acct.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+#include <sys/tty.h>
+#include <sys/vnode.h>
+
+#include <security/mac/mac_framework.h>
+
+/*
+ * The routines implemented in this file are described in:
+ * Leffler, et al.: The Design and Implementation of the 4.3BSD
+ *		UNIX Operating System (Addison-Wesley, 1989)
+ * on pages 62-63.
+ * In May 2007 the historic 3-bit base-8 exponent, 13-bit fraction
+ * comp_t representation described in the above reference was replaced
+ * with that of IEEE-754 floats.
+ *
+ * Arguably, to simplify accounting operations, this mechanism should
+ * be replaced by one in which an accounting log file (similar to /dev/klog)
+ * is read by a user process, etc. However, that has its own problems.
+ */
+
+/* Floating point definitions from <float.h>. */
+#define FLT_MANT_DIG 24 /* p */
+#define FLT_MAX_EXP 128 /* emax */
+
+/*
+ * Internal accounting functions.
+ * The former's operation is described in Leffler, et al., and the latter
+ * was provided by UCB with the 4.4BSD-Lite release
+ */
+static uint32_t encode_timeval(struct timeval);
+static uint32_t encode_long(long);
+static void acctwatch(void);
+static void acct_thread(void *);
+static int acct_disable(struct thread *, int);
+
+/*
+ * Accounting vnode pointer, saved vnode pointer, and flags for each.
+ * acct_sx protects against changes to the active vnode and credentials
+ * while accounting records are being committed to disk.
+ */
+static int acct_configured;
+static int acct_suspended;
+static struct vnode *acct_vp;
+static struct ucred *acct_cred;
+static struct plimit *acct_limit;
+static int acct_flags;
+static struct sx acct_sx;
+
+SX_SYSINIT(acct, &acct_sx, "acct_sx");
+
+/*
+ * State of the accounting kthread.
+ */
+static int acct_state;
+
+#define ACCT_RUNNING 1 /* Accounting kthread is running. */
+#define ACCT_EXITREQ 2 /* Accounting kthread should exit. */
+
+/*
+ * Values associated with enabling and disabling accounting
+ */
+static int acctsuspend = 2; /* stop accounting when < 2% free space left */
+SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW,
+ &acctsuspend, 0, "percentage of free disk space below which accounting stops");
+
+static int acctresume = 4; /* resume when free space risen to > 4% */
+SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW,
+ &acctresume, 0, "percentage of free disk space above which accounting resumes");
+
+static int acctchkfreq = 15; /* frequency (in seconds) to check space */
+
+static int
+sysctl_acct_chkfreq(SYSCTL_HANDLER_ARGS)
+{
+ int error, value;
+
+ /* Write out the old value. */
+ error = SYSCTL_OUT(req, &acctchkfreq, sizeof(int));
+ if (error || req->newptr == NULL)
+ return (error);
+
+ /* Read in and verify the new value. */
+ error = SYSCTL_IN(req, &value, sizeof(int));
+ if (error)
+ return (error);
+ if (value <= 0)
+ return (EINVAL);
+ acctchkfreq = value;
+ return (0);
+}
+SYSCTL_PROC(_kern, OID_AUTO, acct_chkfreq, CTLTYPE_INT|CTLFLAG_RW,
+ &acctchkfreq, 0, sysctl_acct_chkfreq, "I",
+ "frequency for checking the free space");
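+
+/*
+ * For example, "sysctl kern.acct_chkfreq=30" switches to checking free
+ * space every 30 seconds, while "sysctl kern.acct_chkfreq=0" is rejected
+ * with EINVAL by the handler above.
+ */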
+
+SYSCTL_INT(_kern, OID_AUTO, acct_configured, CTLFLAG_RD, &acct_configured, 0,
+ "Accounting configured or not");
+
+SYSCTL_INT(_kern, OID_AUTO, acct_suspended, CTLFLAG_RD, &acct_suspended, 0,
+ "Accounting suspended or not");
+
+/*
+ * Accounting system call. Written based on the specification and previous
+ * implementation done by Mark Tinguely.
+ */
+int
+sys_acct(struct thread *td, struct acct_args *uap)
+{
+ struct nameidata nd;
+ int error, flags, i, replacing;
+
+ error = priv_check(td, PRIV_ACCT);
+ if (error)
+ return (error);
+
+ /*
+ * If accounting is to be started to a file, open that file for
+	 * appending and make sure it's a 'normal' file.
+ */
+ if (uap->path != NULL) {
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1,
+ UIO_USERSPACE, uap->path, td);
+ flags = FWRITE | O_APPEND;
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+#ifdef MAC
+ error = mac_system_check_acct(td->td_ucred, nd.ni_vp);
+ if (error) {
+ VOP_UNLOCK(nd.ni_vp, 0);
+ vn_close(nd.ni_vp, flags, td->td_ucred, td);
+ return (error);
+ }
+#endif
+ VOP_UNLOCK(nd.ni_vp, 0);
+ if (nd.ni_vp->v_type != VREG) {
+ vn_close(nd.ni_vp, flags, td->td_ucred, td);
+ return (EACCES);
+ }
+#ifdef MAC
+ } else {
+ error = mac_system_check_acct(td->td_ucred, NULL);
+ if (error)
+ return (error);
+#endif
+ }
+
+ /*
+ * Disallow concurrent access to the accounting vnode while we swap
+ * it out, in order to prevent access after close.
+ */
+ sx_xlock(&acct_sx);
+
+ /*
+ * Don't log spurious disable/enable messages if we are
+ * switching from one accounting file to another due to log
+ * rotation.
+ */
+ replacing = (acct_vp != NULL && uap->path != NULL);
+
+ /*
+ * If accounting was previously enabled, kill the old space-watcher,
+	 * close the file, and (if no new file was specified) leave. Reset
+ * the suspended state regardless of whether accounting remains
+ * enabled.
+ */
+ acct_suspended = 0;
+ if (acct_vp != NULL)
+ error = acct_disable(td, !replacing);
+ if (uap->path == NULL) {
+ if (acct_state & ACCT_RUNNING) {
+ acct_state |= ACCT_EXITREQ;
+ wakeup(&acct_state);
+ }
+ sx_xunlock(&acct_sx);
+ return (error);
+ }
+
+ /*
+ * Create our own plimit object without limits. It will be assigned
+ * to exiting processes.
+ */
+ acct_limit = lim_alloc();
+ for (i = 0; i < RLIM_NLIMITS; i++)
+ acct_limit->pl_rlimit[i].rlim_cur =
+ acct_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY;
+
+ /*
+ * Save the new accounting file vnode, and schedule the new
+ * free space watcher.
+ */
+ acct_vp = nd.ni_vp;
+ acct_cred = crhold(td->td_ucred);
+ acct_flags = flags;
+ if (acct_state & ACCT_RUNNING)
+ acct_state &= ~ACCT_EXITREQ;
+ else {
+ /*
+ * Try to start up an accounting kthread. We may start more
+ * than one, but if so the extras will commit suicide as
+ * soon as they start up.
+ */
+ error = kproc_create(acct_thread, NULL, NULL, 0, 0,
+ "accounting");
+ if (error) {
+ (void) acct_disable(td, 0);
+ sx_xunlock(&acct_sx);
+ log(LOG_NOTICE, "Unable to start accounting thread\n");
+ return (error);
+ }
+ }
+ acct_configured = 1;
+ sx_xunlock(&acct_sx);
+ if (!replacing)
+ log(LOG_NOTICE, "Accounting enabled\n");
+ return (error);
+}
+
+/*
+ * Disable currently in-progress accounting by closing the vnode, dropping
+ * our reference to the credential, and clearing the vnode's flags.
+ */
+static int
+acct_disable(struct thread *td, int logging)
+{
+ int error;
+
+ sx_assert(&acct_sx, SX_XLOCKED);
+ error = vn_close(acct_vp, acct_flags, acct_cred, td);
+ crfree(acct_cred);
+ lim_free(acct_limit);
+ acct_configured = 0;
+ acct_vp = NULL;
+ acct_cred = NULL;
+ acct_flags = 0;
+ if (logging)
+ log(LOG_NOTICE, "Accounting disabled\n");
+ return (error);
+}
+
+/*
+ * Write out process accounting information on process exit.
+ * The data to be written out are specified in Leffler, et al.
+ * and are enumerated below. (They're also noted in the system
+ * "acct.h" header file.)
+ */
+int
+acct_process(struct thread *td)
+{
+ struct acctv2 acct;
+ struct timeval ut, st, tmp;
+ struct plimit *oldlim;
+ struct proc *p;
+ struct rusage ru;
+ int t, ret;
+
+ /*
+ * Lockless check of accounting condition before doing the hard
+ * work.
+ */
+ if (acct_vp == NULL || acct_suspended)
+ return (0);
+
+ sx_slock(&acct_sx);
+
+ /*
+ * If accounting isn't enabled, don't bother. Have to check again
+ * once we own the lock in case we raced with disabling of accounting
+ * by another thread.
+ */
+ if (acct_vp == NULL || acct_suspended) {
+ sx_sunlock(&acct_sx);
+ return (0);
+ }
+
+ p = td->td_proc;
+
+ /*
+ * Get process accounting information.
+ */
+
+ sx_slock(&proctree_lock);
+ PROC_LOCK(p);
+
+ /* (1) The terminal from which the process was started */
+ if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp)
+ acct.ac_tty = tty_udev(p->p_pgrp->pg_session->s_ttyp);
+ else
+ acct.ac_tty = NODEV;
+ sx_sunlock(&proctree_lock);
+
+ /* (2) The name of the command that ran */
+ bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm);
+
+ /* (3) The amount of user and system time that was used */
+ rufetchcalc(p, &ru, &ut, &st);
+ acct.ac_utime = encode_timeval(ut);
+ acct.ac_stime = encode_timeval(st);
+
+ /* (4) The elapsed time the command ran (and its starting time) */
+ tmp = boottime;
+ timevaladd(&tmp, &p->p_stats->p_start);
+ acct.ac_btime = tmp.tv_sec;
+ microuptime(&tmp);
+ timevalsub(&tmp, &p->p_stats->p_start);
+ acct.ac_etime = encode_timeval(tmp);
+
+ /* (5) The average amount of memory used */
+ tmp = ut;
+ timevaladd(&tmp, &st);
+ /* Convert tmp (i.e. u + s) into hz units to match ru_i*. */
+ t = tmp.tv_sec * hz + tmp.tv_usec / tick;
+ if (t)
+ acct.ac_mem = encode_long((ru.ru_ixrss + ru.ru_idrss +
+		    ru.ru_isrss) / t);
+ else
+ acct.ac_mem = 0;
+
+ /* (6) The number of disk I/O operations done */
+ acct.ac_io = encode_long(ru.ru_inblock + ru.ru_oublock);
+
+ /* (7) The UID and GID of the process */
+ acct.ac_uid = p->p_ucred->cr_ruid;
+ acct.ac_gid = p->p_ucred->cr_rgid;
+
+ /* (8) The boolean flags that tell how the process terminated, etc. */
+ acct.ac_flagx = p->p_acflag;
+
+ /* Setup ancillary structure fields. */
+ acct.ac_flagx |= ANVER;
+ acct.ac_zero = 0;
+ acct.ac_version = 2;
+ acct.ac_len = acct.ac_len2 = sizeof(acct);
+
+ /*
+ * Eliminate rlimits (file size limit in particular).
+ */
+ oldlim = p->p_limit;
+ p->p_limit = lim_hold(acct_limit);
+ PROC_UNLOCK(p);
+ lim_free(oldlim);
+
+ /*
+ * Write the accounting information to the file.
+ */
+ ret = vn_rdwr(UIO_WRITE, acct_vp, (caddr_t)&acct, sizeof (acct),
+ (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, acct_cred, NOCRED,
+ NULL, td);
+ sx_sunlock(&acct_sx);
+ return (ret);
+}
+
+/* FLOAT_CONVERSION_START (Regression testing; don't remove this line.) */
+
+/* Convert timevals and longs into IEEE-754 bit patterns. */
+
+/* Mantissa mask (MSB is implied, so subtract 1). */
+#define MANT_MASK ((1 << (FLT_MANT_DIG - 1)) - 1)
+
+/*
+ * We calculate integer values to a precision of approximately
+ * 28 bits.
+ * This is high-enough precision to fill the 24 float bits
+ * and low-enough to avoid overflowing the 32 int bits.
+ */
+#define CALC_BITS 28
+
+/* log_2(1000000). */
+#define LOG2_1M 20
+
+/*
+ * Convert the elements of a timeval into a 32-bit word holding
+ * the bits of a IEEE-754 float.
+ * The float value represents the timeval's value in microsecond units.
+ */
+static uint32_t
+encode_timeval(struct timeval tv)
+{
+ int log2_s;
+ int val, exp; /* Unnormalized value and exponent */
+ int norm_exp; /* Normalized exponent */
+ int shift;
+
+ /*
+ * First calculate value and exponent to about CALC_BITS precision.
+ * Note that the following conditionals have been ordered so that
+ * the most common cases appear first.
+ */
+ if (tv.tv_sec == 0) {
+ if (tv.tv_usec == 0)
+ return (0);
+ exp = 0;
+ val = tv.tv_usec;
+ } else {
+ /*
+ * Calculate the value to a precision of approximately
+ * CALC_BITS.
+ */
+ log2_s = fls(tv.tv_sec) - 1;
+ if (log2_s + LOG2_1M < CALC_BITS) {
+ exp = 0;
+ val = 1000000 * tv.tv_sec + tv.tv_usec;
+ } else {
+ exp = log2_s + LOG2_1M - CALC_BITS;
+ val = (unsigned int)(((uint64_t)1000000 * tv.tv_sec +
+ tv.tv_usec) >> exp);
+ }
+ }
+ /* Now normalize and pack the value into an IEEE-754 float. */
+ norm_exp = fls(val) - 1;
+ shift = FLT_MANT_DIG - norm_exp - 1;
+#ifdef ACCT_DEBUG
+ printf("val=%d exp=%d shift=%d log2(val)=%d\n",
+ val, exp, shift, norm_exp);
+ printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp,
+ ((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK));
+#endif
+ return (((FLT_MAX_EXP - 1 + exp + norm_exp) << (FLT_MANT_DIG - 1)) |
+ ((shift > 0 ? val << shift : val >> -shift) & MANT_MASK));
+}
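+
+/*
+ * Worked example: for tv = { 0, 3 } (3 microseconds), val = 3 and exp = 0,
+ * so norm_exp = 1 and shift = 22, giving ((127 + 0 + 1) << 23) | 0x400000 =
+ * 0x40400000, which is the IEEE-754 bit pattern of 3.0f.
+ */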
+
+/*
+ * Convert a non-negative long value into the bit pattern of
+ * an IEEE-754 float value.
+ */
+static uint32_t
+encode_long(long val)
+{
+ int norm_exp; /* Normalized exponent */
+ int shift;
+
+ if (val == 0)
+ return (0);
+ if (val < 0) {
+ log(LOG_NOTICE,
+ "encode_long: negative value %ld in accounting record\n",
+ val);
+ val = LONG_MAX;
+ }
+ norm_exp = fls(val) - 1;
+ shift = FLT_MANT_DIG - norm_exp - 1;
+#ifdef ACCT_DEBUG
+ printf("val=%d shift=%d log2(val)=%d\n",
+ val, shift, norm_exp);
+ printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp,
+ ((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK));
+#endif
+ return (((FLT_MAX_EXP - 1 + norm_exp) << (FLT_MANT_DIG - 1)) |
+ ((shift > 0 ? val << shift : val >> -shift) & MANT_MASK));
+}
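+
+/*
+ * Worked example: encode_long(1) gives norm_exp = 0 and shift = 23, so the
+ * result is (127 << 23) | 0 = 0x3f800000, the IEEE-754 bit pattern of 1.0f.
+ */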
+
+/* FLOAT_CONVERSION_END (Regression testing; don't remove this line.) */
+
+/*
+ * Periodically check the filesystem to see if accounting
+ * should be turned on or off. Beware the case where the vnode
+ * has been vgone()'d out from underneath us, e.g. when the file
+ * system containing the accounting file has been forcibly unmounted.
+ */
+/* ARGSUSED */
+static void
+acctwatch(void)
+{
+ struct statfs sb;
+
+ sx_assert(&acct_sx, SX_XLOCKED);
+
+ /*
+ * If accounting was disabled before our kthread was scheduled,
+ * then acct_vp might be NULL. If so, just ask our kthread to
+ * exit and return.
+ */
+ if (acct_vp == NULL) {
+ acct_state |= ACCT_EXITREQ;
+ return;
+ }
+
+ /*
+ * If our vnode is no longer valid, tear it down and signal the
+ * accounting thread to die.
+ */
+ if (acct_vp->v_type == VBAD) {
+ (void) acct_disable(NULL, 1);
+ acct_state |= ACCT_EXITREQ;
+ return;
+ }
+
+ /*
+ * Stopping here is better than continuing, maybe it will be VBAD
+ * next time around.
+ */
+ if (VFS_STATFS(acct_vp->v_mount, &sb) < 0)
+ return;
+ if (acct_suspended) {
+ if (sb.f_bavail > (int64_t)(acctresume * sb.f_blocks /
+ 100)) {
+ acct_suspended = 0;
+ log(LOG_NOTICE, "Accounting resumed\n");
+ }
+ } else {
+ if (sb.f_bavail <= (int64_t)(acctsuspend * sb.f_blocks /
+ 100)) {
+ acct_suspended = 1;
+ log(LOG_NOTICE, "Accounting suspended\n");
+ }
+ }
+}
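+
+/*
+ * For example, with the default thresholds above (acctsuspend = 2,
+ * acctresume = 4), accounting is suspended once f_bavail drops to 2% of
+ * f_blocks or less and resumed once it rises above 4%.
+ */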
+
+/*
+ * The main loop for the dedicated kernel thread that periodically calls
+ * acctwatch().
+ */
+static void
+acct_thread(void *dummy)
+{
+ u_char pri;
+
+ /* This is a low-priority kernel thread. */
+ pri = PRI_MAX_KERN;
+ thread_lock(curthread);
+ sched_prio(curthread, pri);
+ thread_unlock(curthread);
+
+ /* If another accounting kthread is already running, just die. */
+ sx_xlock(&acct_sx);
+ if (acct_state & ACCT_RUNNING) {
+ sx_xunlock(&acct_sx);
+ kproc_exit(0);
+ }
+ acct_state |= ACCT_RUNNING;
+
+ /* Loop until we are asked to exit. */
+ while (!(acct_state & ACCT_EXITREQ)) {
+
+ /* Perform our periodic checks. */
+ acctwatch();
+
+ /*
+ * We check this flag again before sleeping since the
+ * acctwatch() might have shut down accounting and asked us
+ * to exit.
+ */
+ if (!(acct_state & ACCT_EXITREQ)) {
+ sx_sleep(&acct_state, &acct_sx, 0, "-",
+ acctchkfreq * hz);
+ }
+ }
+
+ /*
+ * Acknowledge the exit request and shutdown. We clear both the
+ * exit request and running flags.
+ */
+ acct_state = 0;
+ sx_xunlock(&acct_sx);
+ kproc_exit(0);
+}
diff --git a/sys/kern/kern_alq.c b/sys/kern/kern_alq.c
new file mode 100644
index 0000000..1e6fcf7
--- /dev/null
+++ b/sys/kern/kern_alq.c
@@ -0,0 +1,971 @@
+/*-
+ * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
+ * Copyright (c) 2008-2009, Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2009-2010, The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_mac.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/alq.h>
+#include <sys/malloc.h>
+#include <sys/unistd.h>
+#include <sys/fcntl.h>
+#include <sys/eventhandler.h>
+
+#include <security/mac/mac_framework.h>
+
+/* Async. Logging Queue */
+struct alq {
+ char *aq_entbuf; /* Buffer for stored entries */
+ int aq_entmax; /* Max entries */
+ int aq_entlen; /* Entry length */
+ int aq_freebytes; /* Bytes available in buffer */
+ int aq_buflen; /* Total length of our buffer */
+ int aq_writehead; /* Location for next write */
+ int aq_writetail; /* Flush starts at this location */
+ int aq_wrapearly; /* # bytes left blank at end of buf */
+ int aq_flags; /* Queue flags */
+ int aq_waiters; /* Num threads waiting for resources
+ * NB: Used as a wait channel so must
+ * not be first field in the alq struct
+ */
+ struct ale aq_getpost; /* ALE for use by get/post */
+ struct mtx aq_mtx; /* Queue lock */
+ struct vnode *aq_vp; /* Open vnode handle */
+ struct ucred *aq_cred; /* Credentials of the opening thread */
+ LIST_ENTRY(alq) aq_act; /* List of active queues */
+ LIST_ENTRY(alq) aq_link; /* List of all queues */
+};
+
+#define AQ_WANTED 0x0001 /* Wakeup sleeper when io is done */
+#define AQ_ACTIVE 0x0002 /* on the active list */
+#define AQ_FLUSHING 0x0004 /* doing IO */
+#define AQ_SHUTDOWN 0x0008 /* Queue no longer valid */
+#define AQ_ORDERED 0x0010 /* Queue enforces ordered writes */
+#define AQ_LEGACY 0x0020 /* Legacy queue (fixed length writes) */
+
+#define ALQ_LOCK(alq) mtx_lock_spin(&(alq)->aq_mtx)
+#define ALQ_UNLOCK(alq) mtx_unlock_spin(&(alq)->aq_mtx)
+
+#define HAS_PENDING_DATA(alq) ((alq)->aq_freebytes != (alq)->aq_buflen)
+
+static MALLOC_DEFINE(M_ALD, "ALD", "ALD");
+
+/*
+ * The ald_mtx protects the ald_queues list and the ald_active list.
+ */
+static struct mtx ald_mtx;
+static LIST_HEAD(, alq) ald_queues;
+static LIST_HEAD(, alq) ald_active;
+static int ald_shutingdown = 0;
+struct thread *ald_thread;
+static struct proc *ald_proc;
+static eventhandler_tag alq_eventhandler_tag = NULL;
+
+#define ALD_LOCK() mtx_lock(&ald_mtx)
+#define ALD_UNLOCK() mtx_unlock(&ald_mtx)
+
+/* Daemon functions */
+static int ald_add(struct alq *);
+static int ald_rem(struct alq *);
+static void ald_startup(void *);
+static void ald_daemon(void);
+static void ald_shutdown(void *, int);
+static void ald_activate(struct alq *);
+static void ald_deactivate(struct alq *);
+
+/* Internal queue functions */
+static void alq_shutdown(struct alq *);
+static void alq_destroy(struct alq *);
+static int alq_doio(struct alq *);
+
+
+/*
+ * Add a new queue to the global list. Fail if we're shutting down.
+ */
+static int
+ald_add(struct alq *alq)
+{
+ int error;
+
+ error = 0;
+
+ ALD_LOCK();
+ if (ald_shutingdown) {
+ error = EBUSY;
+ goto done;
+ }
+ LIST_INSERT_HEAD(&ald_queues, alq, aq_link);
+done:
+ ALD_UNLOCK();
+ return (error);
+}
+
+/*
+ * Remove a queue from the global list unless we're shutting down. If so,
+ * the ald will take care of cleaning up its resources.
+ */
+static int
+ald_rem(struct alq *alq)
+{
+ int error;
+
+ error = 0;
+
+ ALD_LOCK();
+ if (ald_shutingdown) {
+ error = EBUSY;
+ goto done;
+ }
+ LIST_REMOVE(alq, aq_link);
+done:
+ ALD_UNLOCK();
+ return (error);
+}
+
+/*
+ * Put a queue on the active list. This will schedule it for writing.
+ */
+static void
+ald_activate(struct alq *alq)
+{
+ LIST_INSERT_HEAD(&ald_active, alq, aq_act);
+ wakeup(&ald_active);
+}
+
+static void
+ald_deactivate(struct alq *alq)
+{
+ LIST_REMOVE(alq, aq_act);
+ alq->aq_flags &= ~AQ_ACTIVE;
+}
+
+static void
+ald_startup(void *unused)
+{
+ mtx_init(&ald_mtx, "ALDmtx", NULL, MTX_DEF|MTX_QUIET);
+ LIST_INIT(&ald_queues);
+ LIST_INIT(&ald_active);
+}
+
+static void
+ald_daemon(void)
+{
+ int needwakeup;
+ struct alq *alq;
+
+ ald_thread = FIRST_THREAD_IN_PROC(ald_proc);
+
+ alq_eventhandler_tag = EVENTHANDLER_REGISTER(shutdown_pre_sync,
+ ald_shutdown, NULL, SHUTDOWN_PRI_FIRST);
+
+ ALD_LOCK();
+
+ for (;;) {
+ while ((alq = LIST_FIRST(&ald_active)) == NULL &&
+ !ald_shutingdown)
+ mtx_sleep(&ald_active, &ald_mtx, PWAIT, "aldslp", 0);
+
+ /* Don't shutdown until all active ALQs are flushed. */
+ if (ald_shutingdown && alq == NULL) {
+ ALD_UNLOCK();
+ break;
+ }
+
+ ALQ_LOCK(alq);
+ ald_deactivate(alq);
+ ALD_UNLOCK();
+ needwakeup = alq_doio(alq);
+ ALQ_UNLOCK(alq);
+ if (needwakeup)
+ wakeup_one(alq);
+ ALD_LOCK();
+ }
+
+ kproc_exit(0);
+}
+
+static void
+ald_shutdown(void *arg, int howto)
+{
+ struct alq *alq;
+
+ ALD_LOCK();
+
+ /* Ensure no new queues can be created. */
+ ald_shutingdown = 1;
+
+ /* Shutdown all ALQs prior to terminating the ald_daemon. */
+ while ((alq = LIST_FIRST(&ald_queues)) != NULL) {
+ LIST_REMOVE(alq, aq_link);
+ ALD_UNLOCK();
+ alq_shutdown(alq);
+ ALD_LOCK();
+ }
+
+ /* At this point, all ALQs are flushed and shutdown. */
+
+ /*
+ * Wake ald_daemon so that it exits. It won't be able to do
+ * anything until we mtx_sleep because we hold the ald_mtx.
+ */
+ wakeup(&ald_active);
+
+ /* Wait for ald_daemon to exit. */
+ mtx_sleep(ald_proc, &ald_mtx, PWAIT, "aldslp", 0);
+
+ ALD_UNLOCK();
+}
+
+static void
+alq_shutdown(struct alq *alq)
+{
+ ALQ_LOCK(alq);
+
+ /* Stop any new writers. */
+ alq->aq_flags |= AQ_SHUTDOWN;
+
+ /*
+ * If the ALQ isn't active but has unwritten data (possible if
+ * the ALQ_NOACTIVATE flag has been used), explicitly activate the
+ * ALQ here so that the pending data gets flushed by the ald_daemon.
+ */
+ if (!(alq->aq_flags & AQ_ACTIVE) && HAS_PENDING_DATA(alq)) {
+ alq->aq_flags |= AQ_ACTIVE;
+ ALQ_UNLOCK(alq);
+ ALD_LOCK();
+ ald_activate(alq);
+ ALD_UNLOCK();
+ ALQ_LOCK(alq);
+ }
+
+ /* Drain IO */
+ while (alq->aq_flags & AQ_ACTIVE) {
+ alq->aq_flags |= AQ_WANTED;
+ msleep_spin(alq, &alq->aq_mtx, "aldclose", 0);
+ }
+ ALQ_UNLOCK(alq);
+
+ vn_close(alq->aq_vp, FWRITE, alq->aq_cred,
+ curthread);
+ crfree(alq->aq_cred);
+}
+
+void
+alq_destroy(struct alq *alq)
+{
+ /* Drain all pending IO. */
+ alq_shutdown(alq);
+
+ mtx_destroy(&alq->aq_mtx);
+ free(alq->aq_entbuf, M_ALD);
+ free(alq, M_ALD);
+}
+
+/*
+ * Flush all pending data to disk. This operation will block.
+ */
+static int
+alq_doio(struct alq *alq)
+{
+ struct thread *td;
+ struct mount *mp;
+ struct vnode *vp;
+ struct uio auio;
+ struct iovec aiov[2];
+ int totlen;
+ int iov;
+ int wrapearly;
+
+ KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__));
+
+ vp = alq->aq_vp;
+ td = curthread;
+ totlen = 0;
+ iov = 1;
+ wrapearly = alq->aq_wrapearly;
+
+ bzero(&aiov, sizeof(aiov));
+ bzero(&auio, sizeof(auio));
+
+ /* Start the write from the location of our buffer tail pointer. */
+ aiov[0].iov_base = alq->aq_entbuf + alq->aq_writetail;
+
+ if (alq->aq_writetail < alq->aq_writehead) {
+ /* Buffer not wrapped. */
+ totlen = aiov[0].iov_len = alq->aq_writehead - alq->aq_writetail;
+ } else if (alq->aq_writehead == 0) {
+ /* Buffer not wrapped (special case to avoid an empty iov). */
+ totlen = aiov[0].iov_len = alq->aq_buflen - alq->aq_writetail -
+ wrapearly;
+ } else {
+ /*
+ * Buffer wrapped, requires 2 aiov entries:
+ * - first is from writetail to end of buffer
+ * - second is from start of buffer to writehead
+ */
+ aiov[0].iov_len = alq->aq_buflen - alq->aq_writetail -
+ wrapearly;
+ iov++;
+ aiov[1].iov_base = alq->aq_entbuf;
+ aiov[1].iov_len = alq->aq_writehead;
+ totlen = aiov[0].iov_len + aiov[1].iov_len;
+ }
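+
+	/*
+	 * For example (hypothetical values), with aq_buflen = 1024,
+	 * aq_writetail = 900, aq_writehead = 100 and no early wrap, the
+	 * first iovec covers the 124 bytes from offset 900 to the end of
+	 * the buffer and the second covers the 100 bytes from offset 0 to
+	 * the head, for a total write of 224 bytes.
+	 */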
+
+ alq->aq_flags |= AQ_FLUSHING;
+ ALQ_UNLOCK(alq);
+
+ auio.uio_iov = &aiov[0];
+ auio.uio_offset = 0;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_iovcnt = iov;
+ auio.uio_resid = totlen;
+ auio.uio_td = td;
+
+ /*
+ * Do all of the junk required to write now.
+ */
+ vn_start_write(vp, &mp, V_WAIT);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ /*
+ * XXX: VOP_WRITE error checks are ignored.
+ */
+#ifdef MAC
+ if (mac_vnode_check_write(alq->aq_cred, NOCRED, vp) == 0)
+#endif
+ VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, alq->aq_cred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+
+ ALQ_LOCK(alq);
+ alq->aq_flags &= ~AQ_FLUSHING;
+
+ /* Adjust writetail as required, taking into account wrapping. */
+ alq->aq_writetail = (alq->aq_writetail + totlen + wrapearly) %
+ alq->aq_buflen;
+ alq->aq_freebytes += totlen + wrapearly;
+
+ /*
+ * If we just flushed part of the buffer which wrapped, reset the
+ * wrapearly indicator.
+ */
+ if (wrapearly)
+ alq->aq_wrapearly = 0;
+
+ /*
+ * If we just flushed the buffer completely, reset indexes to 0 to
+ * minimise buffer wraps.
+ * This is also required to ensure alq_getn() can't wedge itself.
+ */
+ if (!HAS_PENDING_DATA(alq))
+ alq->aq_writehead = alq->aq_writetail = 0;
+
+ KASSERT((alq->aq_writetail >= 0 && alq->aq_writetail < alq->aq_buflen),
+ ("%s: aq_writetail < 0 || aq_writetail >= aq_buflen", __func__));
+
+ if (alq->aq_flags & AQ_WANTED) {
+ alq->aq_flags &= ~AQ_WANTED;
+ return (1);
+ }
+
+ return(0);
+}
+
+static struct kproc_desc ald_kp = {
+ "ALQ Daemon",
+ ald_daemon,
+ &ald_proc
+};
+
+SYSINIT(aldthread, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &ald_kp);
+SYSINIT(ald, SI_SUB_LOCK, SI_ORDER_ANY, ald_startup, NULL);
+
+
+/* User visible queue functions */
+
+/*
+ * Create the queue data structure, allocate the buffer, and open the file.
+ */
+
+int
+alq_open_flags(struct alq **alqp, const char *file, struct ucred *cred, int cmode,
+ int size, int flags)
+{
+ struct thread *td;
+ struct nameidata nd;
+ struct alq *alq;
+ int oflags;
+ int error;
+
+ KASSERT((size > 0), ("%s: size <= 0", __func__));
+
+ *alqp = NULL;
+ td = curthread;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, file, td);
+ oflags = FWRITE | O_NOFOLLOW | O_CREAT;
+
+ error = vn_open_cred(&nd, &oflags, cmode, 0, cred, NULL);
+ if (error)
+ return (error);
+
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ /* We just unlock so we hold a reference */
+ VOP_UNLOCK(nd.ni_vp, 0);
+
+ alq = malloc(sizeof(*alq), M_ALD, M_WAITOK|M_ZERO);
+ alq->aq_vp = nd.ni_vp;
+ alq->aq_cred = crhold(cred);
+
+ mtx_init(&alq->aq_mtx, "ALD Queue", NULL, MTX_SPIN|MTX_QUIET);
+
+ alq->aq_buflen = size;
+ alq->aq_entmax = 0;
+ alq->aq_entlen = 0;
+
+ alq->aq_freebytes = alq->aq_buflen;
+ alq->aq_entbuf = malloc(alq->aq_buflen, M_ALD, M_WAITOK|M_ZERO);
+ alq->aq_writehead = alq->aq_writetail = 0;
+ if (flags & ALQ_ORDERED)
+ alq->aq_flags |= AQ_ORDERED;
+
+ if ((error = ald_add(alq)) != 0) {
+ alq_destroy(alq);
+ return (error);
+ }
+
+ *alqp = alq;
+
+ return (0);
+}
+
+int
+alq_open(struct alq **alqp, const char *file, struct ucred *cred, int cmode,
+ int size, int count)
+{
+ int ret;
+
+ KASSERT((count >= 0), ("%s: count < 0", __func__));
+
+ if (count > 0) {
+ ret = alq_open_flags(alqp, file, cred, cmode, size*count, 0);
+ (*alqp)->aq_flags |= AQ_LEGACY;
+ (*alqp)->aq_entmax = count;
+ (*alqp)->aq_entlen = size;
+ } else
+ ret = alq_open_flags(alqp, file, cred, cmode, size, 0);
+
+ return (ret);
+}
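+
+/*
+ * Illustrative consumer sketch (names and path are hypothetical): a caller
+ * holding a suitable ucred could log fixed-length records with
+ *
+ *	struct alq *alq;
+ *	struct my_record rec;
+ *
+ *	if (alq_open(&alq, "/var/log/my.alq", cred, 0600,
+ *	    sizeof(rec), 128) == 0) {
+ *		alq_write(alq, &rec, ALQ_WAITOK);
+ *		alq_flush(alq);
+ *		alq_close(alq);
+ *	}
+ */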
+
+
+/*
+ * Copy a new entry into the queue. If the operation would block, either
+ * wait or return an error, depending on whether ALQ_WAITOK or ALQ_NOWAIT
+ * was passed in the flags.
+ */
+int
+alq_writen(struct alq *alq, void *data, int len, int flags)
+{
+ int activate, copy, ret;
+ void *waitchan;
+
+ KASSERT((len > 0 && len <= alq->aq_buflen),
+ ("%s: len <= 0 || len > aq_buflen", __func__));
+
+ activate = ret = 0;
+ copy = len;
+ waitchan = NULL;
+
+ ALQ_LOCK(alq);
+
+ /*
+ * Fail to perform the write and return EWOULDBLOCK if:
+ * - The message is larger than our underlying buffer.
+ * - The ALQ is being shutdown.
+ * - There is insufficient free space in our underlying buffer
+ * to accept the message and the user can't wait for space.
+ * - There is insufficient free space in our underlying buffer
+ * to accept the message and the alq is inactive due to prior
+ * use of the ALQ_NOACTIVATE flag (which would lead to deadlock).
+ */
+ if (len > alq->aq_buflen ||
+ alq->aq_flags & AQ_SHUTDOWN ||
+ (((flags & ALQ_NOWAIT) || (!(alq->aq_flags & AQ_ACTIVE) &&
+ HAS_PENDING_DATA(alq))) && alq->aq_freebytes < len)) {
+ ALQ_UNLOCK(alq);
+ return (EWOULDBLOCK);
+ }
+
+ /*
+ * If we want ordered writes and there is already at least one thread
+ * waiting for resources to become available, sleep until we're woken.
+ */
+ if (alq->aq_flags & AQ_ORDERED && alq->aq_waiters > 0) {
+ KASSERT(!(flags & ALQ_NOWAIT),
+ ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__));
+ alq->aq_waiters++;
+ msleep_spin(&alq->aq_waiters, &alq->aq_mtx, "alqwnord", 0);
+ alq->aq_waiters--;
+ }
+
+ /*
+	 * Either ALQ_WAITOK was passed and aq_freebytes < len, in which case
+	 * we enter the while loop and sleep until enough free bytes are
+	 * available, or aq_freebytes >= len and the loop is skipped. If
+	 * AQ_ORDERED is set, only 1 thread at a time will be in this loop.
+	 * Otherwise, multiple threads may be sleeping here competing for
+	 * ALQ resources.
+ */
+ while (alq->aq_freebytes < len && !(alq->aq_flags & AQ_SHUTDOWN)) {
+ KASSERT(!(flags & ALQ_NOWAIT),
+ ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__));
+ alq->aq_flags |= AQ_WANTED;
+ alq->aq_waiters++;
+ if (waitchan)
+ wakeup(waitchan);
+ msleep_spin(alq, &alq->aq_mtx, "alqwnres", 0);
+ alq->aq_waiters--;
+
+ /*
+ * If we're the first thread to wake after an AQ_WANTED wakeup
+ * but there isn't enough free space for us, we're going to loop
+ * and sleep again. If there are other threads waiting in this
+ * loop, schedule a wakeup so that they can see if the space
+ * they require is available.
+ */
+ if (alq->aq_waiters > 0 && !(alq->aq_flags & AQ_ORDERED) &&
+ alq->aq_freebytes < len && !(alq->aq_flags & AQ_WANTED))
+ waitchan = alq;
+ else
+ waitchan = NULL;
+ }
+
+ /*
+ * If there are waiters, we need to signal the waiting threads after we
+ * complete our work. The alq ptr is used as a wait channel for threads
+ * requiring resources to be freed up. In the AQ_ORDERED case, threads
+ * are not allowed to concurrently compete for resources in the above
+ * while loop, so we use a different wait channel in this case.
+ */
+ if (alq->aq_waiters > 0) {
+ if (alq->aq_flags & AQ_ORDERED)
+ waitchan = &alq->aq_waiters;
+ else
+ waitchan = alq;
+ } else
+ waitchan = NULL;
+
+ /* Bail if we're shutting down. */
+ if (alq->aq_flags & AQ_SHUTDOWN) {
+ ret = EWOULDBLOCK;
+ goto unlock;
+ }
+
+ /*
+ * If we need to wrap the buffer to accommodate the write,
+ * we'll need 2 calls to bcopy.
+ */
+ if ((alq->aq_buflen - alq->aq_writehead) < len)
+ copy = alq->aq_buflen - alq->aq_writehead;
+
+ /* Copy message (or part thereof if wrap required) to the buffer. */
+ bcopy(data, alq->aq_entbuf + alq->aq_writehead, copy);
+ alq->aq_writehead += copy;
+
+ if (alq->aq_writehead >= alq->aq_buflen) {
+ KASSERT((alq->aq_writehead == alq->aq_buflen),
+ ("%s: alq->aq_writehead (%d) > alq->aq_buflen (%d)",
+ __func__,
+ alq->aq_writehead,
+ alq->aq_buflen));
+ alq->aq_writehead = 0;
+ }
+
+ if (copy != len) {
+ /*
+ * Wrap the buffer by copying the remainder of our message
+ * to the start of the buffer and resetting aq_writehead.
+ */
+ bcopy(((uint8_t *)data)+copy, alq->aq_entbuf, len - copy);
+ alq->aq_writehead = len - copy;
+ }
+
+ KASSERT((alq->aq_writehead >= 0 && alq->aq_writehead < alq->aq_buflen),
+ ("%s: aq_writehead < 0 || aq_writehead >= aq_buflen", __func__));
+
+ alq->aq_freebytes -= len;
+
+ if (!(alq->aq_flags & AQ_ACTIVE) && !(flags & ALQ_NOACTIVATE)) {
+ alq->aq_flags |= AQ_ACTIVE;
+ activate = 1;
+ }
+
+ KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__));
+
+unlock:
+ ALQ_UNLOCK(alq);
+
+ if (activate) {
+ ALD_LOCK();
+ ald_activate(alq);
+ ALD_UNLOCK();
+ }
+
+ /* NB: We rely on wakeup_one waking threads in a FIFO manner. */
+ if (waitchan != NULL)
+ wakeup_one(waitchan);
+
+ return (ret);
+}
+
+int
+alq_write(struct alq *alq, void *data, int flags)
+{
+ /* Should only be called in fixed length message (legacy) mode. */
+ KASSERT((alq->aq_flags & AQ_LEGACY),
+ ("%s: fixed length write on variable length queue", __func__));
+ return (alq_writen(alq, data, alq->aq_entlen, flags));
+}
+
+/*
+ * Retrieve a pointer into the ALQ's buffer that the caller can write into
+ * directly, avoiding a bcopy.
+ */
+struct ale *
+alq_getn(struct alq *alq, int len, int flags)
+{
+ int contigbytes;
+ void *waitchan;
+
+ KASSERT((len > 0 && len <= alq->aq_buflen),
+ ("%s: len <= 0 || len > alq->aq_buflen", __func__));
+
+ waitchan = NULL;
+
+ ALQ_LOCK(alq);
+
+ /*
+ * Determine the number of free contiguous bytes.
+ * We ensure elsewhere that if aq_writehead == aq_writetail because
+ * the buffer is empty, they will both be set to 0 and therefore
+ * aq_freebytes == aq_buflen and is fully contiguous.
+ * If they are equal and the buffer is not empty, aq_freebytes will
+ * be 0 indicating the buffer is full.
+ */
+ if (alq->aq_writehead <= alq->aq_writetail)
+ contigbytes = alq->aq_freebytes;
+ else {
+ contigbytes = alq->aq_buflen - alq->aq_writehead;
+
+ if (contigbytes < len) {
+ /*
+ * Insufficient space at end of buffer to handle a
+ * contiguous write. Wrap early if there's space at
+ * the beginning. This will leave a hole at the end
+ * of the buffer which we will have to skip over when
+ * flushing the buffer to disk.
+ */
+ if (alq->aq_writetail >= len || flags & ALQ_WAITOK) {
+ /* Keep track of # bytes left blank. */
+ alq->aq_wrapearly = contigbytes;
+ /* Do the wrap and adjust counters. */
+ contigbytes = alq->aq_freebytes =
+ alq->aq_writetail;
+ alq->aq_writehead = 0;
+ }
+ }
+ }
+
+ /*
+ * Return a NULL ALE if:
+ * - The message is larger than our underlying buffer.
+ * - The ALQ is being shutdown.
+ * - There is insufficient free space in our underlying buffer
+ * to accept the message and the user can't wait for space.
+ * - There is insufficient free space in our underlying buffer
+ * to accept the message and the alq is inactive due to prior
+ * use of the ALQ_NOACTIVATE flag (which would lead to deadlock).
+ */
+ if (len > alq->aq_buflen ||
+ alq->aq_flags & AQ_SHUTDOWN ||
+ (((flags & ALQ_NOWAIT) || (!(alq->aq_flags & AQ_ACTIVE) &&
+ HAS_PENDING_DATA(alq))) && contigbytes < len)) {
+ ALQ_UNLOCK(alq);
+ return (NULL);
+ }
+
+ /*
+ * If we want ordered writes and there is already at least one thread
+ * waiting for resources to become available, sleep until we're woken.
+ */
+ if (alq->aq_flags & AQ_ORDERED && alq->aq_waiters > 0) {
+ KASSERT(!(flags & ALQ_NOWAIT),
+ ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__));
+ alq->aq_waiters++;
+ msleep_spin(&alq->aq_waiters, &alq->aq_mtx, "alqgnord", 0);
+ alq->aq_waiters--;
+ }
+
+ /*
+	 * Either ALQ_WAITOK was passed and contigbytes < len, in which case
+	 * we enter the while loop and sleep until enough contiguous free
+	 * bytes are available, or contigbytes >= len and the loop is
+	 * skipped. If AQ_ORDERED is set, only 1 thread at a time will be in
+	 * this loop. Otherwise, multiple threads may be sleeping here
+	 * competing for ALQ resources.
+ */
+ while (contigbytes < len && !(alq->aq_flags & AQ_SHUTDOWN)) {
+ KASSERT(!(flags & ALQ_NOWAIT),
+ ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__));
+ alq->aq_flags |= AQ_WANTED;
+ alq->aq_waiters++;
+ if (waitchan)
+ wakeup(waitchan);
+ msleep_spin(alq, &alq->aq_mtx, "alqgnres", 0);
+ alq->aq_waiters--;
+
+ if (alq->aq_writehead <= alq->aq_writetail)
+ contigbytes = alq->aq_freebytes;
+ else
+ contigbytes = alq->aq_buflen - alq->aq_writehead;
+
+ /*
+ * If we're the first thread to wake after an AQ_WANTED wakeup
+ * but there isn't enough free space for us, we're going to loop
+ * and sleep again. If there are other threads waiting in this
+ * loop, schedule a wakeup so that they can see if the space
+ * they require is available.
+ */
+ if (alq->aq_waiters > 0 && !(alq->aq_flags & AQ_ORDERED) &&
+ contigbytes < len && !(alq->aq_flags & AQ_WANTED))
+ waitchan = alq;
+ else
+ waitchan = NULL;
+ }
+
+ /*
+ * If there are waiters, we need to signal the waiting threads after we
+ * complete our work. The alq ptr is used as a wait channel for threads
+ * requiring resources to be freed up. In the AQ_ORDERED case, threads
+ * are not allowed to concurrently compete for resources in the above
+ * while loop, so we use a different wait channel in this case.
+ */
+ if (alq->aq_waiters > 0) {
+ if (alq->aq_flags & AQ_ORDERED)
+ waitchan = &alq->aq_waiters;
+ else
+ waitchan = alq;
+ } else
+ waitchan = NULL;
+
+ /* Bail if we're shutting down. */
+ if (alq->aq_flags & AQ_SHUTDOWN) {
+ ALQ_UNLOCK(alq);
+ if (waitchan != NULL)
+ wakeup_one(waitchan);
+ return (NULL);
+ }
+
+ /*
+ * If we are here, we have a contiguous number of bytes >= len
+ * available in our buffer starting at aq_writehead.
+ */
+ alq->aq_getpost.ae_data = alq->aq_entbuf + alq->aq_writehead;
+ alq->aq_getpost.ae_bytesused = len;
+
+ return (&alq->aq_getpost);
+}
+
+struct ale *
+alq_get(struct alq *alq, int flags)
+{
+ /* Should only be called in fixed length message (legacy) mode. */
+ KASSERT((alq->aq_flags & AQ_LEGACY),
+ ("%s: fixed length get on variable length queue", __func__));
+ return (alq_getn(alq, alq->aq_entlen, flags));
+}
+
+void
+alq_post_flags(struct alq *alq, struct ale *ale, int flags)
+{
+ int activate;
+ void *waitchan;
+
+ activate = 0;
+
+ if (ale->ae_bytesused > 0) {
+ if (!(alq->aq_flags & AQ_ACTIVE) &&
+ !(flags & ALQ_NOACTIVATE)) {
+ alq->aq_flags |= AQ_ACTIVE;
+ activate = 1;
+ }
+
+ alq->aq_writehead += ale->ae_bytesused;
+ alq->aq_freebytes -= ale->ae_bytesused;
+
+ /* Wrap aq_writehead if we filled to the end of the buffer. */
+ if (alq->aq_writehead == alq->aq_buflen)
+ alq->aq_writehead = 0;
+
+ KASSERT((alq->aq_writehead >= 0 &&
+ alq->aq_writehead < alq->aq_buflen),
+ ("%s: aq_writehead < 0 || aq_writehead >= aq_buflen",
+ __func__));
+
+ KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__));
+ }
+
+ /*
+ * If there are waiters, we need to signal the waiting threads after we
+ * complete our work. The alq ptr is used as a wait channel for threads
+ * requiring resources to be freed up. In the AQ_ORDERED case, threads
+ * are not allowed to concurrently compete for resources in the
+ * alq_getn() while loop, so we use a different wait channel in this case.
+ */
+ if (alq->aq_waiters > 0) {
+ if (alq->aq_flags & AQ_ORDERED)
+ waitchan = &alq->aq_waiters;
+ else
+ waitchan = alq;
+ } else
+ waitchan = NULL;
+
+ ALQ_UNLOCK(alq);
+
+ if (activate) {
+ ALD_LOCK();
+ ald_activate(alq);
+ ALD_UNLOCK();
+ }
+
+ /* NB: We rely on wakeup_one waking threads in a FIFO manner. */
+ if (waitchan != NULL)
+ wakeup_one(waitchan);
+}
+
+void
+alq_flush(struct alq *alq)
+{
+ int needwakeup = 0;
+
+ ALD_LOCK();
+ ALQ_LOCK(alq);
+
+ /*
+ * Pull the lever iff there is data to flush and we're
+ * not already in the middle of a flush operation.
+ */
+ if (HAS_PENDING_DATA(alq) && !(alq->aq_flags & AQ_FLUSHING)) {
+ if (alq->aq_flags & AQ_ACTIVE)
+ ald_deactivate(alq);
+
+ ALD_UNLOCK();
+ needwakeup = alq_doio(alq);
+ } else
+ ALD_UNLOCK();
+
+ ALQ_UNLOCK(alq);
+
+ if (needwakeup)
+ wakeup_one(alq);
+}
+
+/*
+ * Flush remaining data, close the file and free all resources.
+ */
+void
+alq_close(struct alq *alq)
+{
+ /* Only flush and destroy alq if not already shutting down. */
+ if (ald_rem(alq) == 0)
+ alq_destroy(alq);
+}
+
+static int
+alq_load_handler(module_t mod, int what, void *arg)
+{
+ int ret;
+
+ ret = 0;
+
+ switch (what) {
+ case MOD_LOAD:
+ case MOD_SHUTDOWN:
+ break;
+
+ case MOD_QUIESCE:
+ ALD_LOCK();
+ /* Only allow unload if there are no open queues. */
+ if (LIST_FIRST(&ald_queues) == NULL) {
+ ald_shutingdown = 1;
+ ALD_UNLOCK();
+ EVENTHANDLER_DEREGISTER(shutdown_pre_sync,
+ alq_eventhandler_tag);
+ ald_shutdown(NULL, 0);
+ mtx_destroy(&ald_mtx);
+ } else {
+ ALD_UNLOCK();
+ ret = EBUSY;
+ }
+ break;
+
+ case MOD_UNLOAD:
+ /* If MOD_QUIESCE failed we must fail here too. */
+ if (ald_shutingdown == 0)
+ ret = EBUSY;
+ break;
+
+ default:
+ ret = EINVAL;
+ break;
+ }
+
+ return (ret);
+}
+
+static moduledata_t alq_mod =
+{
+ "alq",
+ alq_load_handler,
+ NULL
+};
+
+DECLARE_MODULE(alq, alq_mod, SI_SUB_SMP, SI_ORDER_ANY);
+MODULE_VERSION(alq, 1);
diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c
new file mode 100644
index 0000000..4cfd219
--- /dev/null
+++ b/sys/kern/kern_clock.c
@@ -0,0 +1,895 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kdb.h"
+#include "opt_device_polling.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
+#include "opt_ntp.h"
+#include "opt_watchdog.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/callout.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resource.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/signalvar.h>
+#include <sys/sleepqueue.h>
+#include <sys/smp.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/sysctl.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+#include <sys/limits.h>
+#include <sys/timetc.h>
+
+#ifdef GPROF
+#include <sys/gmon.h>
+#endif
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+PMC_SOFT_DEFINE( , , clock, hard);
+PMC_SOFT_DEFINE( , , clock, stat);
+PMC_SOFT_DEFINE_EX( , , clock, prof, \
+ cpu_startprofclock, cpu_stopprofclock);
+#endif
+
+#ifdef DEVICE_POLLING
+extern void hardclock_device_poll(void);
+#endif /* DEVICE_POLLING */
+
+static void initclocks(void *dummy);
+SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL);
+
+/* Spin-lock protecting profiling statistics. */
+static struct mtx time_lock;
+
+SDT_PROVIDER_DECLARE(sched);
+SDT_PROBE_DEFINE2(sched, , , tick, tick, "struct thread *", "struct proc *");
+
+static int
+sysctl_kern_cp_time(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long cp_time[CPUSTATES];
+#ifdef SCTL_MASK32
+ int i;
+ unsigned int cp_time32[CPUSTATES];
+#endif
+
+ read_cpu_time(cp_time);
+#ifdef SCTL_MASK32
+ if (req->flags & SCTL_MASK32) {
+ if (!req->oldptr)
+ return SYSCTL_OUT(req, 0, sizeof(cp_time32));
+ for (i = 0; i < CPUSTATES; i++)
+ cp_time32[i] = (unsigned int)cp_time[i];
+ error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32));
+ } else
+#endif
+ {
+ if (!req->oldptr)
+ return SYSCTL_OUT(req, 0, sizeof(cp_time));
+ error = SYSCTL_OUT(req, cp_time, sizeof(cp_time));
+ }
+ return error;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, cp_time, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE,
+ 0,0, sysctl_kern_cp_time, "LU", "CPU time statistics");
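+
+/*
+ * For example, "sysctl kern.cp_time" reports the CPUSTATES counters summed
+ * across all CPUs by read_cpu_time() below.
+ */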
+
+static long empty[CPUSTATES];
+
+static int
+sysctl_kern_cp_times(SYSCTL_HANDLER_ARGS)
+{
+ struct pcpu *pcpu;
+ int error;
+ int c;
+ long *cp_time;
+#ifdef SCTL_MASK32
+ unsigned int cp_time32[CPUSTATES];
+ int i;
+#endif
+
+ if (!req->oldptr) {
+#ifdef SCTL_MASK32
+ if (req->flags & SCTL_MASK32)
+ return SYSCTL_OUT(req, 0, sizeof(cp_time32) * (mp_maxid + 1));
+ else
+#endif
+ return SYSCTL_OUT(req, 0, sizeof(long) * CPUSTATES * (mp_maxid + 1));
+ }
+ for (error = 0, c = 0; error == 0 && c <= mp_maxid; c++) {
+ if (!CPU_ABSENT(c)) {
+ pcpu = pcpu_find(c);
+ cp_time = pcpu->pc_cp_time;
+ } else {
+ cp_time = empty;
+ }
+#ifdef SCTL_MASK32
+ if (req->flags & SCTL_MASK32) {
+ for (i = 0; i < CPUSTATES; i++)
+ cp_time32[i] = (unsigned int)cp_time[i];
+ error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32));
+ } else
+#endif
+ error = SYSCTL_OUT(req, cp_time, sizeof(long) * CPUSTATES);
+ }
+ return error;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, cp_times, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE,
+ 0,0, sysctl_kern_cp_times, "LU", "per-CPU time statistics");
+
+#ifdef DEADLKRES
+static const char *blessed[] = {
+ "getblk",
+ "so_snd_sx",
+ "so_rcv_sx",
+ NULL
+};
+static int slptime_threshold = 1800;
+static int blktime_threshold = 900;
+static int sleepfreq = 3;
+
+static void
+deadlkres(void)
+{
+ struct proc *p;
+ struct thread *td;
+ void *wchan;
+ int blkticks, i, slpticks, slptype, tryl, tticks;
+
+ tryl = 0;
+ for (;;) {
+ blkticks = blktime_threshold * hz;
+ slpticks = slptime_threshold * hz;
+
+ /*
+		 * Avoid sleeping on the sx_lock in order to avoid a possible
+		 * priority inversion problem leading to starvation.
+		 * If the lock can't be acquired after 100 tries, panic.
+ */
+ if (!sx_try_slock(&allproc_lock)) {
+ if (tryl > 100)
+ panic("%s: possible deadlock detected on allproc_lock\n",
+ __func__);
+ tryl++;
+ pause("allproc", sleepfreq * hz);
+ continue;
+ }
+ tryl = 0;
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ FOREACH_THREAD_IN_PROC(p, td) {
+
+ thread_lock(td);
+ if (TD_ON_LOCK(td)) {
+
+ /*
+ * The thread should be blocked on a
+ * turnstile, simply check if the
+ * turnstile channel is in good state.
+ */
+ MPASS(td->td_blocked != NULL);
+
+ tticks = ticks - td->td_blktick;
+ thread_unlock(td);
+ if (tticks > blkticks) {
+
+ /*
+					 * According to the provided
+					 * thresholds, this thread has
+					 * been stuck on a turnstile
+					 * for too long.
+ */
+ PROC_UNLOCK(p);
+ sx_sunlock(&allproc_lock);
+ panic("%s: possible deadlock detected for %p, blocked for %d ticks\n",
+ __func__, td, tticks);
+ }
+ } else if (TD_IS_SLEEPING(td) &&
+ TD_ON_SLEEPQ(td)) {
+
+ /*
+ * Check if the thread is sleeping on a
+ * lock, otherwise skip the check.
+ * Drop the thread lock in order to
+ * avoid a LOR with the sleepqueue
+ * spinlock.
+ */
+ wchan = td->td_wchan;
+ tticks = ticks - td->td_slptick;
+ thread_unlock(td);
+ slptype = sleepq_type(wchan);
+ if ((slptype == SLEEPQ_SX ||
+ slptype == SLEEPQ_LK) &&
+ tticks > slpticks) {
+
+ /*
+					 * According to the provided
+					 * thresholds, this thread has
+					 * been stuck on a sleepqueue
+					 * for too long. However, being
+					 * on a sleepqueue, we should
+					 * still check the blessed
+					 * list before panicking.
+ */
+ tryl = 0;
+ for (i = 0; blessed[i] != NULL;
+ i++) {
+ if (!strcmp(blessed[i],
+ td->td_wmesg)) {
+ tryl = 1;
+ break;
+ }
+ }
+ if (tryl != 0) {
+ tryl = 0;
+ continue;
+ }
+ PROC_UNLOCK(p);
+ sx_sunlock(&allproc_lock);
+ panic("%s: possible deadlock detected for %p, blocked for %d ticks\n",
+ __func__, td, tticks);
+ }
+ } else
+ thread_unlock(td);
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+
+ /* Sleep for sleepfreq seconds. */
+ pause("-", sleepfreq * hz);
+ }
+}
+
+static struct kthread_desc deadlkres_kd = {
+ "deadlkres",
+ deadlkres,
+ (struct thread **)NULL
+};
+
+SYSINIT(deadlkres, SI_SUB_CLOCKS, SI_ORDER_ANY, kthread_start, &deadlkres_kd);
+
+static SYSCTL_NODE(_debug, OID_AUTO, deadlkres, CTLFLAG_RW, 0,
+ "Deadlock resolver");
+SYSCTL_INT(_debug_deadlkres, OID_AUTO, slptime_threshold, CTLFLAG_RW,
+ &slptime_threshold, 0,
+    "Number of seconds a thread may sleep on a sleepqueue before being considered stuck");
+SYSCTL_INT(_debug_deadlkres, OID_AUTO, blktime_threshold, CTLFLAG_RW,
+ &blktime_threshold, 0,
+    "Number of seconds a thread may block on a turnstile before being considered stuck");
+SYSCTL_INT(_debug_deadlkres, OID_AUTO, sleepfreq, CTLFLAG_RW, &sleepfreq, 0,
+    "Number of seconds between deadlock resolver thread runs");
+#endif /* DEADLKRES */
+
+void
+read_cpu_time(long *cp_time)
+{
+ struct pcpu *pc;
+ int i, j;
+
+ /* Sum up global cp_time[]. */
+ bzero(cp_time, sizeof(long) * CPUSTATES);
+ CPU_FOREACH(i) {
+ pc = pcpu_find(i);
+ for (j = 0; j < CPUSTATES; j++)
+ cp_time[j] += pc->pc_cp_time[j];
+ }
+}
+
+#ifdef SW_WATCHDOG
+#include <sys/watchdog.h>
+
+static int watchdog_ticks;
+static int watchdog_enabled;
+static void watchdog_fire(void);
+static void watchdog_config(void *, u_int, int *);
+#endif /* SW_WATCHDOG */
+
+/*
+ * Clock handling routines.
+ *
+ * This code is written to operate with two timers that run independently of
+ * each other.
+ *
+ * The main timer, running hz times per second, is used to trigger interval
+ * timers, timeouts and rescheduling as needed.
+ *
+ * The second timer handles kernel and user profiling,
+ * and does resource use estimation. If the second timer is programmable,
+ * it is randomized to avoid aliasing between the two clocks. For example,
+ * the randomization prevents an adversary from always giving up the cpu
+ * just before its quantum expires. Otherwise, it would never accumulate
+ * cpu ticks. The mean frequency of the second timer is stathz.
+ *
+ * If no second timer exists, stathz will be zero; in this case we drive
+ * profiling and statistics off the main clock. This WILL NOT be accurate;
+ * do not do it unless absolutely necessary.
+ *
+ * The statistics clock may (or may not) be run at a higher rate while
+ * profiling. This profile clock runs at profhz. We require that profhz
+ * be an integral multiple of stathz.
+ *
+ * If the statistics clock is running fast, it must be divided by the ratio
+ * profhz/stathz for statistics. (For profiling, every tick counts.)
+ *
+ * Time-of-day is maintained using a "timecounter", which may or may
+ * not be related to the hardware generating the above mentioned
+ * interrupts.
+ */
+
+int stathz;
+int profhz;
+int profprocs;
+volatile int ticks;
+int psratio;
+
+static DPCPU_DEFINE(int, pcputicks); /* Per-CPU version of ticks. */
+static int global_hardclock_run = 0;
+
+/*
+ * Initialize clock frequencies and start both clocks running.
+ */
+/* ARGSUSED*/
+static void
+initclocks(dummy)
+ void *dummy;
+{
+ register int i;
+
+ /*
+ * Set divisors to 1 (normal case) and let the machine-specific
+ * code do its bit.
+ */
+ mtx_init(&time_lock, "time lock", NULL, MTX_DEF);
+ cpu_initclocks();
+
+ /*
+ * Compute profhz/stathz, and fix profhz if needed.
+ */
+ i = stathz ? stathz : hz;
+ if (profhz == 0)
+ profhz = i;
+ psratio = profhz / i;
+#ifdef SW_WATCHDOG
+ EVENTHANDLER_REGISTER(watchdog_list, watchdog_config, NULL, 0);
+#endif
+}
+
+/*
+ * Each time the real-time timer fires, this function is called on all CPUs.
+ * Note that hardclock() calls hardclock_cpu() for the boot CPU, so only
+ * the other CPUs in the system need to call this function.
+ */
+void
+hardclock_cpu(int usermode)
+{
+ struct pstats *pstats;
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+ int flags;
+
+ /*
+ * Run current process's virtual and profile time, as needed.
+ */
+ pstats = p->p_stats;
+ flags = 0;
+ if (usermode &&
+ timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) {
+ PROC_SLOCK(p);
+ if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
+ flags |= TDF_ALRMPEND | TDF_ASTPENDING;
+ PROC_SUNLOCK(p);
+ }
+ if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) {
+ PROC_SLOCK(p);
+ if (itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
+ flags |= TDF_PROFPEND | TDF_ASTPENDING;
+ PROC_SUNLOCK(p);
+ }
+ thread_lock(td);
+ sched_tick(1);
+ td->td_flags |= flags;
+ thread_unlock(td);
+
+#ifdef HWPMC_HOOKS
+ if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid)))
+ PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL);
+ if (td->td_intr_frame != NULL)
+ PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame);
+#endif
+ callout_process(sbinuptime());
+}
+
+/*
+ * The real-time timer, interrupting hz times per second.
+ */
+void
+hardclock(int usermode, uintfptr_t pc)
+{
+
+ atomic_add_int(&ticks, 1);
+ hardclock_cpu(usermode);
+ tc_ticktock(1);
+ cpu_tick_calibration();
+ /*
+ * If no separate statistics clock is available, run it from here.
+ *
+ * XXX: this only works for UP
+ */
+ if (stathz == 0) {
+ profclock(usermode, pc);
+ statclock(usermode);
+ }
+#ifdef DEVICE_POLLING
+ hardclock_device_poll(); /* this is very short and quick */
+#endif /* DEVICE_POLLING */
+#ifdef SW_WATCHDOG
+ if (watchdog_enabled > 0 && --watchdog_ticks <= 0)
+ watchdog_fire();
+#endif /* SW_WATCHDOG */
+}
+
+void
+hardclock_cnt(int cnt, int usermode)
+{
+ struct pstats *pstats;
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+ int *t = DPCPU_PTR(pcputicks);
+ int flags, global, newticks;
+#ifdef SW_WATCHDOG
+ int i;
+#endif /* SW_WATCHDOG */
+
+ /*
+ * Update per-CPU and possibly global ticks values.
+ */
+ *t += cnt;
+ do {
+ global = ticks;
+ newticks = *t - global;
+ if (newticks <= 0) {
+ if (newticks < -1)
+ *t = global - 1;
+ newticks = 0;
+ break;
+ }
+ } while (!atomic_cmpset_int(&ticks, global, *t));
+
+ /*
+ * Run current process's virtual and profile time, as needed.
+ */
+ pstats = p->p_stats;
+ flags = 0;
+ if (usermode &&
+ timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) {
+ PROC_SLOCK(p);
+ if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL],
+ tick * cnt) == 0)
+ flags |= TDF_ALRMPEND | TDF_ASTPENDING;
+ PROC_SUNLOCK(p);
+ }
+ if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) {
+ PROC_SLOCK(p);
+ if (itimerdecr(&pstats->p_timer[ITIMER_PROF],
+ tick * cnt) == 0)
+ flags |= TDF_PROFPEND | TDF_ASTPENDING;
+ PROC_SUNLOCK(p);
+ }
+ thread_lock(td);
+ sched_tick(cnt);
+ td->td_flags |= flags;
+ thread_unlock(td);
+
+#ifdef HWPMC_HOOKS
+ if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid)))
+ PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL);
+ if (td->td_intr_frame != NULL)
+ PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame);
+#endif
+ /* We are in charge of handling this tick's duties. */
+ if (newticks > 0) {
+ /* It is dangerous and unnecessary to call these concurrently. */
+ if (atomic_cmpset_acq_int(&global_hardclock_run, 0, 1)) {
+ tc_ticktock(newticks);
+#ifdef DEVICE_POLLING
+ /* This is very short and quick. */
+ hardclock_device_poll();
+#endif /* DEVICE_POLLING */
+ atomic_store_rel_int(&global_hardclock_run, 0);
+ }
+#ifdef SW_WATCHDOG
+ if (watchdog_enabled > 0) {
+ i = atomic_fetchadd_int(&watchdog_ticks, -newticks);
+ if (i > 0 && i <= newticks)
+ watchdog_fire();
+ }
+#endif /* SW_WATCHDOG */
+ }
+ if (curcpu == CPU_FIRST())
+ cpu_tick_calibration();
+}
+
+void
+hardclock_sync(int cpu)
+{
+ int *t = DPCPU_ID_PTR(cpu, pcputicks);
+
+ *t = ticks;
+}
+
+/*
+ * Compute number of ticks in the specified amount of time.
+ */
+int
+tvtohz(tv)
+ struct timeval *tv;
+{
+ register unsigned long ticks;
+ register long sec, usec;
+
+ /*
+ * If the number of usecs in the whole seconds part of the time
+ * difference fits in a long, then the total number of usecs will
+ * fit in an unsigned long. Compute the total and convert it to
+ * ticks, rounding up and adding 1 to allow for the current tick
+ * to expire. Rounding also depends on unsigned long arithmetic
+ * to avoid overflow.
+ *
+ * Otherwise, if the number of ticks in the whole seconds part of
+ * the time difference fits in a long, then convert the parts to
+ * ticks separately and add, using similar rounding methods and
+ * overflow avoidance. This method would work in the previous
+ * case but it is slightly slower and assumes that hz is integral.
+ *
+ * Otherwise, round the time difference down to the maximum
+ * representable value.
+ *
+ * If ints have 32 bits, then the maximum value for any timeout in
+ * 10ms ticks is 248 days.
+ */
+ sec = tv->tv_sec;
+ usec = tv->tv_usec;
+ if (usec < 0) {
+ sec--;
+ usec += 1000000;
+ }
+ if (sec < 0) {
+#ifdef DIAGNOSTIC
+ if (usec > 0) {
+ sec++;
+ usec -= 1000000;
+ }
+ printf("tvotohz: negative time difference %ld sec %ld usec\n",
+ sec, usec);
+#endif
+ ticks = 1;
+ } else if (sec <= LONG_MAX / 1000000)
+ ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
+ / tick + 1;
+ else if (sec <= LONG_MAX / hz)
+ ticks = sec * hz
+ + ((unsigned long)usec + (tick - 1)) / tick + 1;
+ else
+ ticks = LONG_MAX;
+ if (ticks > INT_MAX)
+ ticks = INT_MAX;
+ return ((int)ticks);
+}
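+
+/*
+ * Worked example (illustrative only, assuming hz = 1000 so tick = 1000 us):
+ * for tv = { 0, 25000 } the LONG_MAX / 1000000 branch above computes
+ * (0 * 1000000 + 25000 + 999) / 1000 + 1 = 25 + 1 = 26 ticks, i.e. the
+ * requested 25 ms rounded up, plus one tick to allow for the partially
+ * elapsed current tick.
+ */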
+
+/*
+ * Start profiling on a process.
+ *
+ * Kernel profiling passes proc0 which never exits and hence
+ * keeps the profile clock running constantly.
+ */
+void
+startprofclock(p)
+ register struct proc *p;
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (p->p_flag & P_STOPPROF)
+ return;
+ if ((p->p_flag & P_PROFIL) == 0) {
+ p->p_flag |= P_PROFIL;
+ mtx_lock(&time_lock);
+ if (++profprocs == 1)
+ cpu_startprofclock();
+ mtx_unlock(&time_lock);
+ }
+}
+
+/*
+ * Stop profiling on a process.
+ */
+void
+stopprofclock(p)
+ register struct proc *p;
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (p->p_flag & P_PROFIL) {
+ if (p->p_profthreads != 0) {
+ p->p_flag |= P_STOPPROF;
+ while (p->p_profthreads != 0)
+ msleep(&p->p_profthreads, &p->p_mtx, PPAUSE,
+ "stopprof", 0);
+ p->p_flag &= ~P_STOPPROF;
+ }
+ if ((p->p_flag & P_PROFIL) == 0)
+ return;
+ p->p_flag &= ~P_PROFIL;
+ mtx_lock(&time_lock);
+ if (--profprocs == 0)
+ cpu_stopprofclock();
+ mtx_unlock(&time_lock);
+ }
+}
+
+/*
+ * Statistics clock. Updates rusage information and calls the scheduler
+ * to adjust priorities of the active thread.
+ *
+ * This should be called by all active processors.
+ */
+void
+statclock(int usermode)
+{
+
+ statclock_cnt(1, usermode);
+}
+
+void
+statclock_cnt(int cnt, int usermode)
+{
+ struct rusage *ru;
+ struct vmspace *vm;
+ struct thread *td;
+ struct proc *p;
+ long rss;
+ long *cp_time;
+
+ td = curthread;
+ p = td->td_proc;
+
+ cp_time = (long *)PCPU_PTR(cp_time);
+ if (usermode) {
+ /*
+ * Charge the time as appropriate.
+ */
+ td->td_uticks += cnt;
+ if (p->p_nice > NZERO)
+ cp_time[CP_NICE] += cnt;
+ else
+ cp_time[CP_USER] += cnt;
+ } else {
+ /*
+ * Came from kernel mode, so we were:
+ * - handling an interrupt,
+ * - doing syscall or trap work on behalf of the current
+ * user process, or
+ * - spinning in the idle loop.
+ * Whichever it is, charge the time as appropriate.
+ * Note that we charge interrupts to the current process,
+ * regardless of whether they are ``for'' that process,
+ * so that we know how much of its real time was spent
+ * in ``non-process'' (i.e., interrupt) work.
+ */
+ if ((td->td_pflags & TDP_ITHREAD) ||
+ td->td_intr_nesting_level >= 2) {
+ td->td_iticks += cnt;
+ cp_time[CP_INTR] += cnt;
+ } else {
+ td->td_pticks += cnt;
+ td->td_sticks += cnt;
+ if (!TD_IS_IDLETHREAD(td))
+ cp_time[CP_SYS] += cnt;
+ else
+ cp_time[CP_IDLE] += cnt;
+ }
+ }
+
+ /* Update resource usage integrals and maximums. */
+ MPASS(p->p_vmspace != NULL);
+ vm = p->p_vmspace;
+ ru = &td->td_ru;
+ ru->ru_ixrss += pgtok(vm->vm_tsize) * cnt;
+ ru->ru_idrss += pgtok(vm->vm_dsize) * cnt;
+ ru->ru_isrss += pgtok(vm->vm_ssize) * cnt;
+ rss = pgtok(vmspace_resident_count(vm));
+ if (ru->ru_maxrss < rss)
+ ru->ru_maxrss = rss;
+ KTR_POINT2(KTR_SCHED, "thread", sched_tdname(td), "statclock",
+ "prio:%d", td->td_priority, "stathz:%d", (stathz)?stathz:hz);
+ SDT_PROBE2(sched, , , tick, td, td->td_proc);
+ thread_lock_flags(td, MTX_QUIET);
+ for ( ; cnt > 0; cnt--)
+ sched_clock(td);
+ thread_unlock(td);
+#ifdef HWPMC_HOOKS
+ if (td->td_intr_frame != NULL)
+ PMC_SOFT_CALL_TF( , , clock, stat, td->td_intr_frame);
+#endif
+}
+
+void
+profclock(int usermode, uintfptr_t pc)
+{
+
+ profclock_cnt(1, usermode, pc);
+}
+
+void
+profclock_cnt(int cnt, int usermode, uintfptr_t pc)
+{
+ struct thread *td;
+#ifdef GPROF
+ struct gmonparam *g;
+ uintfptr_t i;
+#endif
+
+ td = curthread;
+ if (usermode) {
+ /*
+ * Came from user mode; CPU was in user state.
+ * If this process is being profiled, record the tick.
+ * If there is no related user location yet, don't
+ * bother trying to count it.
+ */
+ if (td->td_proc->p_flag & P_PROFIL)
+ addupc_intr(td, pc, cnt);
+ }
+#ifdef GPROF
+ else {
+ /*
+ * Kernel statistics are just like addupc_intr, only easier.
+ */
+ g = &_gmonparam;
+ if (g->state == GMON_PROF_ON && pc >= g->lowpc) {
+ i = PC_TO_I(g, pc);
+ if (i < g->textsize) {
+ KCOUNT(g, i) += cnt;
+ }
+ }
+ }
+#endif
+#ifdef HWPMC_HOOKS
+ if (td->td_intr_frame != NULL)
+ PMC_SOFT_CALL_TF( , , clock, prof, td->td_intr_frame);
+#endif
+}
+
+/*
+ * Return information about system clocks.
+ */
+static int
+sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
+{
+ struct clockinfo clkinfo;
+ /*
+ * Construct clockinfo structure.
+ */
+ bzero(&clkinfo, sizeof(clkinfo));
+ clkinfo.hz = hz;
+ clkinfo.tick = tick;
+ clkinfo.profhz = profhz;
+ clkinfo.stathz = stathz ? stathz : hz;
+ return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
+}
+
+SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate,
+ CTLTYPE_STRUCT|CTLFLAG_RD|CTLFLAG_MPSAFE,
+ 0, 0, sysctl_kern_clockrate, "S,clockinfo",
+ "Rate and period of various kernel clocks");
+
+#ifdef SW_WATCHDOG
+
+static void
+watchdog_config(void *unused __unused, u_int cmd, int *error)
+{
+ u_int u;
+
+ u = cmd & WD_INTERVAL;
+ if (u >= WD_TO_1SEC) {
+ watchdog_ticks = (1 << (u - WD_TO_1SEC)) * hz;
+ watchdog_enabled = 1;
+ *error = 0;
+ } else {
+ watchdog_enabled = 0;
+ }
+}
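+
+/*
+ * For example (illustrative only): a command with (cmd & WD_INTERVAL) equal
+ * to WD_TO_1SEC + 4 requests a 2^4 = 16 second timeout, so watchdog_ticks is
+ * set to 16 * hz and watchdog_fire() runs if hardclock() counts it down to
+ * zero before the interval is reprogrammed.
+ */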
+
+/*
+ * Handle a watchdog timeout by dumping interrupt information and
+ * then either dropping to DDB or panicking.
+ */
+static void
+watchdog_fire(void)
+{
+ int nintr;
+ uint64_t inttotal;
+ u_long *curintr;
+ char *curname;
+
+ curintr = intrcnt;
+ curname = intrnames;
+ inttotal = 0;
+ nintr = sintrcnt / sizeof(u_long);
+
+ printf("interrupt total\n");
+ while (--nintr >= 0) {
+ if (*curintr)
+ printf("%-12s %20lu\n", curname, *curintr);
+ curname += strlen(curname) + 1;
+ inttotal += *curintr++;
+ }
+ printf("Total %20ju\n", (uintmax_t)inttotal);
+
+#if defined(KDB) && !defined(KDB_UNATTENDED)
+ kdb_backtrace();
+ kdb_enter(KDB_WHY_WATCHDOG, "watchdog timeout");
+#else
+ panic("watchdog timeout");
+#endif
+}
+
+#endif /* SW_WATCHDOG */
diff --git a/sys/kern/kern_clocksource.c b/sys/kern/kern_clocksource.c
new file mode 100644
index 0000000..c2bebbe
--- /dev/null
+++ b/sys/kern/kern_clocksource.c
@@ -0,0 +1,949 @@
+/*-
+ * Copyright (c) 2010-2013 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification, immediately at the beginning of the file.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Common routines to manage event timers hardware.
+ */
+
+#include "opt_device_polling.h"
+#include "opt_kdtrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/kdb.h>
+#include <sys/ktr.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/timeet.h>
+#include <sys/timetc.h>
+
+#include <machine/atomic.h>
+#include <machine/clock.h>
+#include <machine/cpu.h>
+#include <machine/smp.h>
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+cyclic_clock_func_t cyclic_clock_func = NULL;
+#endif
+
+int cpu_can_deep_sleep = 0; /* C3 state is available. */
+int cpu_disable_deep_sleep = 0; /* Timer dies in C3. */
+
+static void setuptimer(void);
+static void loadtimer(sbintime_t now, int first);
+static int doconfigtimer(void);
+static void configtimer(int start);
+static int round_freq(struct eventtimer *et, int freq);
+
+static sbintime_t getnextcpuevent(int idle);
+static sbintime_t getnextevent(void);
+static int handleevents(sbintime_t now, int fake);
+
+static struct mtx et_hw_mtx;
+
+#define ET_HW_LOCK(state) \
+ { \
+ if (timer->et_flags & ET_FLAGS_PERCPU) \
+ mtx_lock_spin(&(state)->et_hw_mtx); \
+ else \
+ mtx_lock_spin(&et_hw_mtx); \
+ }
+
+#define ET_HW_UNLOCK(state) \
+ { \
+ if (timer->et_flags & ET_FLAGS_PERCPU) \
+ mtx_unlock_spin(&(state)->et_hw_mtx); \
+ else \
+ mtx_unlock_spin(&et_hw_mtx); \
+ }
+
+static struct eventtimer *timer = NULL;
+static sbintime_t timerperiod; /* Timer period for periodic mode. */
+static sbintime_t statperiod; /* statclock() events period. */
+static sbintime_t profperiod; /* profclock() events period. */
+static sbintime_t nexttick; /* Next global timer tick time. */
+static u_int busy = 1; /* Reconfiguration is in progress. */
+static int profiling = 0; /* Profiling events enabled. */
+
+static char timername[32]; /* Wanted timer. */
+TUNABLE_STR("kern.eventtimer.timer", timername, sizeof(timername));
+
+static int singlemul = 0; /* Multiplier for periodic mode. */
+TUNABLE_INT("kern.eventtimer.singlemul", &singlemul);
+SYSCTL_INT(_kern_eventtimer, OID_AUTO, singlemul, CTLFLAG_RW, &singlemul,
+ 0, "Multiplier for periodic mode");
+
+static u_int idletick = 0; /* Run periodic events when idle. */
+TUNABLE_INT("kern.eventtimer.idletick", &idletick);
+SYSCTL_UINT(_kern_eventtimer, OID_AUTO, idletick, CTLFLAG_RW, &idletick,
+ 0, "Run periodic events when idle");
+
+static int periodic = 0; /* Periodic or one-shot mode. */
+static int want_periodic = 0; /* What mode to prefer. */
+TUNABLE_INT("kern.eventtimer.periodic", &want_periodic);
+
+struct pcpu_state {
+ struct mtx et_hw_mtx; /* Per-CPU timer mutex. */
+ u_int action; /* Reconfiguration requests. */
+ u_int handle; /* Immediate handle requests. */
+ sbintime_t now; /* Last tick time. */
+ sbintime_t nextevent; /* Next scheduled event on this CPU. */
+ sbintime_t nexttick; /* Next timer tick time. */
+ sbintime_t nexthard; /* Next hardclock() event. */
+ sbintime_t nextstat; /* Next statclock() event. */
+ sbintime_t nextprof; /* Next profclock() event. */
+ sbintime_t nextcall; /* Next callout event. */
+ sbintime_t nextcallopt; /* Next optional callout event. */
+#ifdef KDTRACE_HOOKS
+ sbintime_t nextcyc; /* Next OpenSolaris cyclics event. */
+#endif
+ int ipi; /* This CPU needs IPI. */
+ int idle; /* This CPU is in idle mode. */
+};
+
+static DPCPU_DEFINE(struct pcpu_state, timerstate);
+DPCPU_DEFINE(sbintime_t, hardclocktime);
+
+/*
+ * Timer broadcast IPI handler.
+ */
+int
+hardclockintr(void)
+{
+ sbintime_t now;
+ struct pcpu_state *state;
+ int done;
+
+ if (doconfigtimer() || busy)
+ return (FILTER_HANDLED);
+ state = DPCPU_PTR(timerstate);
+ now = state->now;
+ CTR3(KTR_SPARE2, "ipi at %d: now %d.%08x",
+ curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff));
+ done = handleevents(now, 0);
+ return (done ? FILTER_HANDLED : FILTER_STRAY);
+}
+
+/*
+ * Handle all events for the specified time on this CPU.
+ */
+static int
+handleevents(sbintime_t now, int fake)
+{
+ sbintime_t t, *hct;
+ struct trapframe *frame;
+ struct pcpu_state *state;
+ int usermode;
+ int done, runs;
+
+ CTR3(KTR_SPARE2, "handle at %d: now %d.%08x",
+ curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff));
+ done = 0;
+ if (fake) {
+ frame = NULL;
+ usermode = 0;
+ } else {
+ frame = curthread->td_intr_frame;
+ usermode = TRAPF_USERMODE(frame);
+ }
+
+ state = DPCPU_PTR(timerstate);
+
+ runs = 0;
+ while (now >= state->nexthard) {
+ state->nexthard += tick_sbt;
+ runs++;
+ }
+ if (runs) {
+ hct = DPCPU_PTR(hardclocktime);
+ *hct = state->nexthard - tick_sbt;
+ if (fake < 2) {
+ hardclock_cnt(runs, usermode);
+ done = 1;
+ }
+ }
+ runs = 0;
+ while (now >= state->nextstat) {
+ state->nextstat += statperiod;
+ runs++;
+ }
+ if (runs && fake < 2) {
+ statclock_cnt(runs, usermode);
+ done = 1;
+ }
+ if (profiling) {
+ runs = 0;
+ while (now >= state->nextprof) {
+ state->nextprof += profperiod;
+ runs++;
+ }
+ if (runs && !fake) {
+ profclock_cnt(runs, usermode, TRAPF_PC(frame));
+ done = 1;
+ }
+ } else
+ state->nextprof = state->nextstat;
+ if (now >= state->nextcallopt) {
+ state->nextcall = state->nextcallopt = INT64_MAX;
+ callout_process(now);
+ }
+
+#ifdef KDTRACE_HOOKS
+ if (fake == 0 && now >= state->nextcyc && cyclic_clock_func != NULL) {
+ state->nextcyc = INT64_MAX;
+ (*cyclic_clock_func)(frame);
+ }
+#endif
+
+ t = getnextcpuevent(0);
+ ET_HW_LOCK(state);
+ if (!busy) {
+ state->idle = 0;
+ state->nextevent = t;
+ loadtimer(now, 0);
+ }
+ ET_HW_UNLOCK(state);
+ return (done);
+}
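+
+/*
+ * Note on batching (illustrative): if a CPU slept through, say, three
+ * hardclock periods, the loop above advances nexthard three times and a
+ * single hardclock_cnt(3, usermode) call accounts for all of them; the same
+ * batching is applied to statclock_cnt() and profclock_cnt().
+ */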
+
+/*
+ * Schedule binuptime of the next event on current CPU.
+ */
+static sbintime_t
+getnextcpuevent(int idle)
+{
+ sbintime_t event;
+ struct pcpu_state *state;
+ u_int hardfreq;
+
+ state = DPCPU_PTR(timerstate);
+ /* Handle hardclock() events, skipping some if CPU is idle. */
+ event = state->nexthard;
+ if (idle) {
+ hardfreq = (u_int)hz / 2;
+ if (tc_min_ticktock_freq > 2
+#ifdef SMP
+ && curcpu == CPU_FIRST()
+#endif
+ )
+ hardfreq = hz / tc_min_ticktock_freq;
+ if (hardfreq > 1)
+ event += tick_sbt * (hardfreq - 1);
+ }
+ /* Handle callout events. */
+ if (event > state->nextcall)
+ event = state->nextcall;
+ if (!idle) { /* If the CPU is active, handle other types of events. */
+ if (event > state->nextstat)
+ event = state->nextstat;
+ if (profiling && event > state->nextprof)
+ event = state->nextprof;
+ }
+#ifdef KDTRACE_HOOKS
+ if (event > state->nextcyc)
+ event = state->nextcyc;
+#endif
+ return (event);
+}
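+
+/*
+ * Illustrative example (values assumed, and assuming the
+ * tc_min_ticktock_freq path does not apply): with hz = 1000 an idle CPU
+ * computes hardfreq = 500, so the next hardclock() event is pushed out by
+ * 499 * tick_sbt, roughly half a second, instead of one tick; statclock()
+ * and profclock() events are not considered at all while idle.
+ */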
+
+/*
+ * Schedule binuptime of the next event on all CPUs.
+ */
+static sbintime_t
+getnextevent(void)
+{
+ struct pcpu_state *state;
+ sbintime_t event;
+#ifdef SMP
+ int cpu;
+#endif
+ int c;
+
+ state = DPCPU_PTR(timerstate);
+ event = state->nextevent;
+ c = -1;
+#ifdef SMP
+ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) {
+ CPU_FOREACH(cpu) {
+ state = DPCPU_ID_PTR(cpu, timerstate);
+ if (event > state->nextevent) {
+ event = state->nextevent;
+ c = cpu;
+ }
+ }
+ }
+#endif
+ CTR4(KTR_SPARE2, "next at %d: next %d.%08x by %d",
+ curcpu, (int)(event >> 32), (u_int)(event & 0xffffffff), c);
+ return (event);
+}
+
+/* Hardware timer callback function. */
+static void
+timercb(struct eventtimer *et, void *arg)
+{
+ sbintime_t now;
+ sbintime_t *next;
+ struct pcpu_state *state;
+#ifdef SMP
+ int cpu, bcast;
+#endif
+
+ /* Do not touch anything if somebody is reconfiguring timers. */
+ if (busy)
+ return;
+ /* Update present and next tick times. */
+ state = DPCPU_PTR(timerstate);
+ if (et->et_flags & ET_FLAGS_PERCPU) {
+ next = &state->nexttick;
+ } else
+ next = &nexttick;
+ now = sbinuptime();
+ if (periodic)
+ *next = now + timerperiod;
+ else
+ *next = -1; /* Next tick is not scheduled yet. */
+ state->now = now;
+ CTR3(KTR_SPARE2, "intr at %d: now %d.%08x",
+ curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff));
+
+#ifdef SMP
+ /* Prepare broadcasting to other CPUs for non-per-CPU timers. */
+ bcast = 0;
+ if ((et->et_flags & ET_FLAGS_PERCPU) == 0 && smp_started) {
+ CPU_FOREACH(cpu) {
+ state = DPCPU_ID_PTR(cpu, timerstate);
+ ET_HW_LOCK(state);
+ state->now = now;
+ if (now >= state->nextevent) {
+ state->nextevent += SBT_1S;
+ if (curcpu != cpu) {
+ state->ipi = 1;
+ bcast = 1;
+ }
+ }
+ ET_HW_UNLOCK(state);
+ }
+ }
+#endif
+
+ /* Handle events for this time on this CPU. */
+ handleevents(now, 0);
+
+#ifdef SMP
+ /* Broadcast interrupt to other CPUs for non-per-CPU timers. */
+ if (bcast) {
+ CPU_FOREACH(cpu) {
+ if (curcpu == cpu)
+ continue;
+ state = DPCPU_ID_PTR(cpu, timerstate);
+ if (state->ipi) {
+ state->ipi = 0;
+ ipi_cpu(cpu, IPI_HARDCLOCK);
+ }
+ }
+ }
+#endif
+}
+
+/*
+ * Load new value into hardware timer.
+ */
+static void
+loadtimer(sbintime_t now, int start)
+{
+ struct pcpu_state *state;
+ sbintime_t new;
+ sbintime_t *next;
+ uint64_t tmp;
+ int eq;
+
+ if (timer->et_flags & ET_FLAGS_PERCPU) {
+ state = DPCPU_PTR(timerstate);
+ next = &state->nexttick;
+ } else
+ next = &nexttick;
+ if (periodic) {
+ if (start) {
+ /*
+ * Try to start all periodic timers aligned
+ * to period to make events synchronous.
+ */
+ tmp = now % timerperiod;
+ new = timerperiod - tmp;
+ if (new < tmp) /* Left less than passed. */
+ new += timerperiod;
+ CTR5(KTR_SPARE2, "load p at %d: now %d.%08x first in %d.%08x",
+ curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff),
+ (int)(new >> 32), (u_int)(new & 0xffffffff));
+ *next = new + now;
+ et_start(timer, new, timerperiod);
+ }
+ } else {
+ new = getnextevent();
+ eq = (new == *next);
+ CTR4(KTR_SPARE2, "load at %d: next %d.%08x eq %d",
+ curcpu, (int)(new >> 32), (u_int)(new & 0xffffffff), eq);
+ if (!eq) {
+ *next = new;
+ et_start(timer, new - now, 0);
+ }
+ }
+}
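+
+/*
+ * Alignment example (illustrative only): in periodic mode with
+ * timerperiod = SBT_1S / 1000 and now = 2.0003 s, tmp is 0.0003 s and the
+ * first interval programmed is 0.0007 s, so every periodic timer fires on
+ * the same 1 ms grid and events stay synchronous across CPUs.
+ */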
+
+/*
+ * Prepare event timer parameters after configuration changes.
+ */
+static void
+setuptimer(void)
+{
+ int freq;
+
+ if (periodic && (timer->et_flags & ET_FLAGS_PERIODIC) == 0)
+ periodic = 0;
+ else if (!periodic && (timer->et_flags & ET_FLAGS_ONESHOT) == 0)
+ periodic = 1;
+ singlemul = MIN(MAX(singlemul, 1), 20);
+ freq = hz * singlemul;
+ while (freq < (profiling ? profhz : stathz))
+ freq += hz;
+ freq = round_freq(timer, freq);
+ timerperiod = SBT_1S / freq;
+}
+
+/*
+ * Reconfigure the specified per-CPU timer on another CPU. Called from the IPI handler.
+ */
+static int
+doconfigtimer(void)
+{
+ sbintime_t now;
+ struct pcpu_state *state;
+
+ state = DPCPU_PTR(timerstate);
+ switch (atomic_load_acq_int(&state->action)) {
+ case 1:
+ now = sbinuptime();
+ ET_HW_LOCK(state);
+ loadtimer(now, 1);
+ ET_HW_UNLOCK(state);
+ state->handle = 0;
+ atomic_store_rel_int(&state->action, 0);
+ return (1);
+ case 2:
+ ET_HW_LOCK(state);
+ et_stop(timer);
+ ET_HW_UNLOCK(state);
+ state->handle = 0;
+ atomic_store_rel_int(&state->action, 0);
+ return (1);
+ }
+ if (atomic_readandclear_int(&state->handle) && !busy) {
+ now = sbinuptime();
+ handleevents(now, 0);
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Reconfigure the specified timer.
+ * For per-CPU timers, use an IPI to make the other CPUs reconfigure.
+ */
+static void
+configtimer(int start)
+{
+ sbintime_t now, next;
+ struct pcpu_state *state;
+ int cpu;
+
+ if (start) {
+ setuptimer();
+ now = sbinuptime();
+ } else
+ now = 0;
+ critical_enter();
+ ET_HW_LOCK(DPCPU_PTR(timerstate));
+ if (start) {
+ /* Initialize time machine parameters. */
+ next = now + timerperiod;
+ if (periodic)
+ nexttick = next;
+ else
+ nexttick = -1;
+ CPU_FOREACH(cpu) {
+ state = DPCPU_ID_PTR(cpu, timerstate);
+ state->now = now;
+ if (!smp_started && cpu != CPU_FIRST())
+ state->nextevent = INT64_MAX;
+ else
+ state->nextevent = next;
+ if (periodic)
+ state->nexttick = next;
+ else
+ state->nexttick = -1;
+ state->nexthard = next;
+ state->nextstat = next;
+ state->nextprof = next;
+ state->nextcall = next;
+ state->nextcallopt = next;
+ hardclock_sync(cpu);
+ }
+ busy = 0;
+ /* Start global timer or per-CPU timer of this CPU. */
+ loadtimer(now, 1);
+ } else {
+ busy = 1;
+ /* Stop global timer or per-CPU timer of this CPU. */
+ et_stop(timer);
+ }
+ ET_HW_UNLOCK(DPCPU_PTR(timerstate));
+#ifdef SMP
+ /* If the timer is global or there are no other CPUs yet, we are done. */
+ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || !smp_started) {
+ critical_exit();
+ return;
+ }
+ /* Set reconfigure flags for other CPUs. */
+ CPU_FOREACH(cpu) {
+ state = DPCPU_ID_PTR(cpu, timerstate);
+ atomic_store_rel_int(&state->action,
+ (cpu == curcpu) ? 0 : ( start ? 1 : 2));
+ }
+ /* Broadcast reconfigure IPI. */
+ ipi_all_but_self(IPI_HARDCLOCK);
+ /* Wait for the reconfiguration to complete. */
+restart:
+ cpu_spinwait();
+ CPU_FOREACH(cpu) {
+ if (cpu == curcpu)
+ continue;
+ state = DPCPU_ID_PTR(cpu, timerstate);
+ if (atomic_load_acq_int(&state->action))
+ goto restart;
+ }
+#endif
+ critical_exit();
+}
+
+/*
+ * Calculate nearest frequency supported by hardware timer.
+ */
+static int
+round_freq(struct eventtimer *et, int freq)
+{
+ uint64_t div;
+
+ if (et->et_frequency != 0) {
+ div = lmax((et->et_frequency + freq / 2) / freq, 1);
+ if (et->et_flags & ET_FLAGS_POW2DIV)
+ div = 1 << (flsl(div + div / 2) - 1);
+ freq = (et->et_frequency + div / 2) / div;
+ }
+ if (et->et_min_period > SBT_1S)
+ panic("Event timer \"%s\" doesn't support sub-second periods!",
+ et->et_name);
+ else if (et->et_min_period != 0)
+ freq = min(freq, SBT2FREQ(et->et_min_period));
+ if (et->et_max_period < SBT_1S && et->et_max_period != 0)
+ freq = max(freq, SBT2FREQ(et->et_max_period));
+ return (freq);
+}
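+
+/*
+ * Example (hypothetical hardware): for an event timer with
+ * et_frequency = 32768 Hz and a requested freq of 1000 Hz, div becomes
+ * (32768 + 500) / 1000 = 33 and the returned frequency is
+ * (32768 + 16) / 33 = 993 Hz, the closest rate the divider can produce.
+ * With ET_FLAGS_POW2DIV set, div would instead be rounded to a power of two.
+ */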
+
+/*
+ * Configure and start event timers (BSP part).
+ */
+void
+cpu_initclocks_bsp(void)
+{
+ struct pcpu_state *state;
+ int base, div, cpu;
+
+ mtx_init(&et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN);
+ CPU_FOREACH(cpu) {
+ state = DPCPU_ID_PTR(cpu, timerstate);
+ mtx_init(&state->et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN);
+#ifdef KDTRACE_HOOKS
+ state->nextcyc = INT64_MAX;
+#endif
+ state->nextcall = INT64_MAX;
+ state->nextcallopt = INT64_MAX;
+ }
+ periodic = want_periodic;
+ /* Grab the requested timer or the best one present. */
+ if (timername[0])
+ timer = et_find(timername, 0, 0);
+ if (timer == NULL && periodic) {
+ timer = et_find(NULL,
+ ET_FLAGS_PERIODIC, ET_FLAGS_PERIODIC);
+ }
+ if (timer == NULL) {
+ timer = et_find(NULL,
+ ET_FLAGS_ONESHOT, ET_FLAGS_ONESHOT);
+ }
+ if (timer == NULL && !periodic) {
+ timer = et_find(NULL,
+ ET_FLAGS_PERIODIC, ET_FLAGS_PERIODIC);
+ }
+ if (timer == NULL)
+ panic("No usable event timer found!");
+ et_init(timer, timercb, NULL, NULL);
+
+ /* Adapt to timer capabilities. */
+ if (periodic && (timer->et_flags & ET_FLAGS_PERIODIC) == 0)
+ periodic = 0;
+ else if (!periodic && (timer->et_flags & ET_FLAGS_ONESHOT) == 0)
+ periodic = 1;
+ if (timer->et_flags & ET_FLAGS_C3STOP)
+ cpu_disable_deep_sleep++;
+
+ /*
+ * We honor the requested 'hz' value.
+ * We want to run stathz in the neighborhood of 128hz.
+ * We would like profhz to run as often as possible.
+ */
+ if (singlemul <= 0 || singlemul > 20) {
+ if (hz >= 1500 || (hz % 128) == 0)
+ singlemul = 1;
+ else if (hz >= 750)
+ singlemul = 2;
+ else
+ singlemul = 4;
+ }
+ if (periodic) {
+ base = round_freq(timer, hz * singlemul);
+ singlemul = max((base + hz / 2) / hz, 1);
+ hz = (base + singlemul / 2) / singlemul;
+ if (base <= 128)
+ stathz = base;
+ else {
+ div = base / 128;
+ if (div >= singlemul && (div % singlemul) == 0)
+ div++;
+ stathz = base / div;
+ }
+ profhz = stathz;
+ while ((profhz + stathz) <= 128 * 64)
+ profhz += stathz;
+ profhz = round_freq(timer, profhz);
+ } else {
+ hz = round_freq(timer, hz);
+ stathz = round_freq(timer, 127);
+ profhz = round_freq(timer, stathz * 64);
+ }
+ tick = 1000000 / hz;
+ tick_sbt = SBT_1S / hz;
+ tick_bt = sbttobt(tick_sbt);
+ statperiod = SBT_1S / stathz;
+ profperiod = SBT_1S / profhz;
+ ET_LOCK();
+ configtimer(1);
+ ET_UNLOCK();
+}
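+
+/*
+ * Example of the one-shot rate selection above (illustrative, assuming a
+ * timer with no frequency restrictions and hz = 1000): hz stays 1000,
+ * stathz becomes 127 and profhz becomes 127 * 64 = 8128, giving
+ * tick = 1000 us, tick_sbt = SBT_1S / 1000, statperiod = SBT_1S / 127 and
+ * profperiod = SBT_1S / 8128.
+ */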
+
+/*
+ * Start per-CPU event timers on APs.
+ */
+void
+cpu_initclocks_ap(void)
+{
+ sbintime_t now;
+ struct pcpu_state *state;
+ struct thread *td;
+
+ state = DPCPU_PTR(timerstate);
+ now = sbinuptime();
+ ET_HW_LOCK(state);
+ state->now = now;
+ hardclock_sync(curcpu);
+ spinlock_enter();
+ ET_HW_UNLOCK(state);
+ td = curthread;
+ td->td_intr_nesting_level++;
+ handleevents(state->now, 2);
+ td->td_intr_nesting_level--;
+ spinlock_exit();
+}
+
+/*
+ * Switch to profiling clock rates.
+ */
+void
+cpu_startprofclock(void)
+{
+
+ ET_LOCK();
+ if (profiling == 0) {
+ if (periodic) {
+ configtimer(0);
+ profiling = 1;
+ configtimer(1);
+ } else
+ profiling = 1;
+ } else
+ profiling++;
+ ET_UNLOCK();
+}
+
+/*
+ * Switch to regular clock rates.
+ */
+void
+cpu_stopprofclock(void)
+{
+
+ ET_LOCK();
+ if (profiling == 1) {
+ if (periodic) {
+ configtimer(0);
+ profiling = 0;
+ configtimer(1);
+ } else
+ profiling = 0;
+ } else
+ profiling--;
+ ET_UNLOCK();
+}
+
+/*
+ * Switch to idle mode (all ticks handled).
+ */
+sbintime_t
+cpu_idleclock(void)
+{
+ sbintime_t now, t;
+ struct pcpu_state *state;
+
+ if (idletick || busy ||
+ (periodic && (timer->et_flags & ET_FLAGS_PERCPU))
+#ifdef DEVICE_POLLING
+ || curcpu == CPU_FIRST()
+#endif
+ )
+ return (-1);
+ state = DPCPU_PTR(timerstate);
+ if (periodic)
+ now = state->now;
+ else
+ now = sbinuptime();
+ CTR3(KTR_SPARE2, "idle at %d: now %d.%08x",
+ curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff));
+ t = getnextcpuevent(1);
+ ET_HW_LOCK(state);
+ state->idle = 1;
+ state->nextevent = t;
+ if (!periodic)
+ loadtimer(now, 0);
+ ET_HW_UNLOCK(state);
+ return (MAX(t - now, 0));
+}
+
+/*
+ * Switch to active mode (skip empty ticks).
+ */
+void
+cpu_activeclock(void)
+{
+ sbintime_t now;
+ struct pcpu_state *state;
+ struct thread *td;
+
+ state = DPCPU_PTR(timerstate);
+ if (state->idle == 0 || busy)
+ return;
+ if (periodic)
+ now = state->now;
+ else
+ now = sbinuptime();
+ CTR3(KTR_SPARE2, "active at %d: now %d.%08x",
+ curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff));
+ spinlock_enter();
+ td = curthread;
+ td->td_intr_nesting_level++;
+ handleevents(now, 1);
+ td->td_intr_nesting_level--;
+ spinlock_exit();
+}
+
+#ifdef KDTRACE_HOOKS
+void
+clocksource_cyc_set(const struct bintime *bt)
+{
+ sbintime_t now, t;
+ struct pcpu_state *state;
+
+ /* Do not touch anything if somebody is reconfiguring timers. */
+ if (busy)
+ return;
+ t = bttosbt(*bt);
+ state = DPCPU_PTR(timerstate);
+ if (periodic)
+ now = state->now;
+ else
+ now = sbinuptime();
+
+ CTR5(KTR_SPARE2, "set_cyc at %d: now %d.%08x t %d.%08x",
+ curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff),
+ (int)(t >> 32), (u_int)(t & 0xffffffff));
+
+ ET_HW_LOCK(state);
+ if (t == state->nextcyc)
+ goto done;
+ state->nextcyc = t;
+ if (t >= state->nextevent)
+ goto done;
+ state->nextevent = t;
+ if (!periodic)
+ loadtimer(now, 0);
+done:
+ ET_HW_UNLOCK(state);
+}
+#endif
+
+void
+cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt)
+{
+ struct pcpu_state *state;
+
+ /* Do not touch anything if somebody is reconfiguring timers. */
+ if (busy)
+ return;
+ CTR6(KTR_SPARE2, "new co at %d: on %d at %d.%08x - %d.%08x",
+ curcpu, cpu, (int)(bt_opt >> 32), (u_int)(bt_opt & 0xffffffff),
+ (int)(bt >> 32), (u_int)(bt & 0xffffffff));
+ state = DPCPU_ID_PTR(cpu, timerstate);
+ ET_HW_LOCK(state);
+
+ /*
+ * If there is a callout time already set earlier -- do nothing.
+ * This check may appear redundant because we already check in
+ * callout_process(), but this double check guarantees we're safe
+ * with respect to race conditions between interrupt execution
+ * and scheduling.
+ */
+ state->nextcallopt = bt_opt;
+ if (bt >= state->nextcall)
+ goto done;
+ state->nextcall = bt;
+ /* If there is some other event set earlier -- do nothing. */
+ if (bt >= state->nextevent)
+ goto done;
+ state->nextevent = bt;
+ /* If timer is periodic -- there is nothing to reprogram. */
+ if (periodic)
+ goto done;
+ /* If timer is global or of the current CPU -- reprogram it. */
+ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || cpu == curcpu) {
+ loadtimer(sbinuptime(), 0);
+done:
+ ET_HW_UNLOCK(state);
+ return;
+ }
+ /* Otherwise, make the other CPU reprogram it. */
+ state->handle = 1;
+ ET_HW_UNLOCK(state);
+#ifdef SMP
+ ipi_cpu(cpu, IPI_HARDCLOCK);
+#endif
+}
+
+/*
+ * Report or change the active event timer hardware.
+ */
+static int
+sysctl_kern_eventtimer_timer(SYSCTL_HANDLER_ARGS)
+{
+ char buf[32];
+ struct eventtimer *et;
+ int error;
+
+ ET_LOCK();
+ et = timer;
+ snprintf(buf, sizeof(buf), "%s", et->et_name);
+ ET_UNLOCK();
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ ET_LOCK();
+ et = timer;
+ if (error != 0 || req->newptr == NULL ||
+ strcasecmp(buf, et->et_name) == 0) {
+ ET_UNLOCK();
+ return (error);
+ }
+ et = et_find(buf, 0, 0);
+ if (et == NULL) {
+ ET_UNLOCK();
+ return (ENOENT);
+ }
+ configtimer(0);
+ et_free(timer);
+ if (et->et_flags & ET_FLAGS_C3STOP)
+ cpu_disable_deep_sleep++;
+ if (timer->et_flags & ET_FLAGS_C3STOP)
+ cpu_disable_deep_sleep--;
+ periodic = want_periodic;
+ timer = et;
+ et_init(timer, timercb, NULL, NULL);
+ configtimer(1);
+ ET_UNLOCK();
+ return (error);
+}
+SYSCTL_PROC(_kern_eventtimer, OID_AUTO, timer,
+ CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ 0, 0, sysctl_kern_eventtimer_timer, "A", "Chosen event timer");
+
+/*
+ * Report or change the active event timer periodicity.
+ */
+static int
+sysctl_kern_eventtimer_periodic(SYSCTL_HANDLER_ARGS)
+{
+ int error, val;
+
+ val = periodic;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ ET_LOCK();
+ configtimer(0);
+ periodic = want_periodic = val;
+ configtimer(1);
+ ET_UNLOCK();
+ return (error);
+}
+SYSCTL_PROC(_kern_eventtimer, OID_AUTO, periodic,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ 0, 0, sysctl_kern_eventtimer_periodic, "I", "Enable event timer periodic mode");
diff --git a/sys/kern/kern_condvar.c b/sys/kern/kern_condvar.c
new file mode 100644
index 0000000..483ea2e
--- /dev/null
+++ b/sys/kern/kern_condvar.c
@@ -0,0 +1,456 @@
+/*-
+ * Copyright (c) 2000 Jake Burkholder <jake@freebsd.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/condvar.h>
+#include <sys/sched.h>
+#include <sys/signalvar.h>
+#include <sys/sleepqueue.h>
+#include <sys/resourcevar.h>
+#ifdef KTRACE
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+#endif
+
+/*
+ * Common sanity checks for cv_wait* functions.
+ */
+#define CV_ASSERT(cvp, lock, td) do { \
+ KASSERT((td) != NULL, ("%s: td NULL", __func__)); \
+ KASSERT(TD_IS_RUNNING(td), ("%s: not TDS_RUNNING", __func__)); \
+ KASSERT((cvp) != NULL, ("%s: cvp NULL", __func__)); \
+ KASSERT((lock) != NULL, ("%s: lock NULL", __func__)); \
+} while (0)
+
+/*
+ * Initialize a condition variable. Must be called before use.
+ */
+void
+cv_init(struct cv *cvp, const char *desc)
+{
+
+ cvp->cv_description = desc;
+ cvp->cv_waiters = 0;
+}
+
+/*
+ * Destroy a condition variable. The condition variable must be re-initialized
+ * in order to be re-used.
+ */
+void
+cv_destroy(struct cv *cvp)
+{
+#ifdef INVARIANTS
+ struct sleepqueue *sq;
+
+ sleepq_lock(cvp);
+ sq = sleepq_lookup(cvp);
+ sleepq_release(cvp);
+ KASSERT(sq == NULL, ("%s: associated sleep queue non-empty", __func__));
+#endif
+}
+
+/*
+ * Wait on a condition variable. The current thread is placed on the condition
+ * variable's wait queue and suspended. A cv_signal or cv_broadcast on the same
+ * condition variable will resume the thread. The mutex is released before
+ * sleeping and will be held on return. It is recommended that the mutex be
+ * held when cv_signal or cv_broadcast are called.
+ */
+void
+_cv_wait(struct cv *cvp, struct lock_object *lock)
+{
+ WITNESS_SAVE_DECL(lock_witness);
+ struct lock_class *class;
+ struct thread *td;
+ int lock_state;
+
+ td = curthread;
+ lock_state = 0;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0, cv_wmesg(cvp));
+#endif
+ CV_ASSERT(cvp, lock, td);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
+ "Waiting on \"%s\"", cvp->cv_description);
+ class = LOCK_CLASS(lock);
+
+ if (cold || panicstr) {
+ /*
+ * During autoconfiguration, just give interrupts
+ * a chance, then just return. Don't run any other
+ * thread or panic below, in case this is the idle
+ * process and already asleep.
+ */
+ return;
+ }
+
+ sleepq_lock(cvp);
+
+ cvp->cv_waiters++;
+ if (lock == &Giant.lock_object)
+ mtx_assert(&Giant, MA_OWNED);
+ DROP_GIANT();
+
+ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
+ if (lock != &Giant.lock_object) {
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
+ WITNESS_SAVE(lock, lock_witness);
+ lock_state = class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
+ }
+ sleepq_wait(cvp, 0);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0, cv_wmesg(cvp));
+#endif
+ PICKUP_GIANT();
+ if (lock != &Giant.lock_object) {
+ class->lc_lock(lock, lock_state);
+ WITNESS_RESTORE(lock, lock_witness);
+ }
+}
+
+/*
+ * Wait on a condition variable. This function differs from cv_wait by
+ * not acquiring the mutex after the condition variable was signaled.
+ */
+void
+_cv_wait_unlock(struct cv *cvp, struct lock_object *lock)
+{
+ struct lock_class *class;
+ struct thread *td;
+
+ td = curthread;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0, cv_wmesg(cvp));
+#endif
+ CV_ASSERT(cvp, lock, td);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
+ "Waiting on \"%s\"", cvp->cv_description);
+ KASSERT(lock != &Giant.lock_object,
+ ("cv_wait_unlock cannot be used with Giant"));
+ class = LOCK_CLASS(lock);
+
+ if (cold || panicstr) {
+ /*
+ * During autoconfiguration, just give interrupts
+ * a chance, then just return. Don't run any other
+ * thread or panic below, in case this is the idle
+ * process and already asleep.
+ */
+ class->lc_unlock(lock);
+ return;
+ }
+
+ sleepq_lock(cvp);
+
+ cvp->cv_waiters++;
+ DROP_GIANT();
+
+ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
+ class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
+ sleepq_wait(cvp, 0);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0, cv_wmesg(cvp));
+#endif
+ PICKUP_GIANT();
+}
+
+/*
+ * Wait on a condition variable, allowing interruption by signals. Return 0 if
+ * the thread was resumed with cv_signal or cv_broadcast, EINTR or ERESTART if
+ * a signal was caught. If ERESTART is returned the system call should be
+ * restarted if possible.
+ */
+int
+_cv_wait_sig(struct cv *cvp, struct lock_object *lock)
+{
+ WITNESS_SAVE_DECL(lock_witness);
+ struct lock_class *class;
+ struct thread *td;
+ int lock_state, rval;
+
+ td = curthread;
+ lock_state = 0;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0, cv_wmesg(cvp));
+#endif
+ CV_ASSERT(cvp, lock, td);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
+ "Waiting on \"%s\"", cvp->cv_description);
+ class = LOCK_CLASS(lock);
+
+ if (cold || panicstr) {
+ /*
+ * After a panic, or during autoconfiguration, just give
+ * interrupts a chance, then just return; don't run any other
+ * procs or panic below, in case this is the idle process and
+ * already asleep.
+ */
+ return (0);
+ }
+
+ sleepq_lock(cvp);
+
+ cvp->cv_waiters++;
+ if (lock == &Giant.lock_object)
+ mtx_assert(&Giant, MA_OWNED);
+ DROP_GIANT();
+
+ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR |
+ SLEEPQ_INTERRUPTIBLE, 0);
+ if (lock != &Giant.lock_object) {
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
+ WITNESS_SAVE(lock, lock_witness);
+ lock_state = class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
+ }
+ rval = sleepq_wait_sig(cvp, 0);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0, cv_wmesg(cvp));
+#endif
+ PICKUP_GIANT();
+ if (lock != &Giant.lock_object) {
+ class->lc_lock(lock, lock_state);
+ WITNESS_RESTORE(lock, lock_witness);
+ }
+
+ return (rval);
+}
+
+/*
+ * Wait on a condition variable for (at most) the value specified in the sbt
+ * argument. Returns 0 if the thread was resumed by cv_signal or cv_broadcast,
+ * EWOULDBLOCK if the timeout expires.
+ */
+int
+_cv_timedwait_sbt(struct cv *cvp, struct lock_object *lock, sbintime_t sbt,
+ sbintime_t pr, int flags)
+{
+ WITNESS_SAVE_DECL(lock_witness);
+ struct lock_class *class;
+ struct thread *td;
+ int lock_state, rval;
+
+ td = curthread;
+ lock_state = 0;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0, cv_wmesg(cvp));
+#endif
+ CV_ASSERT(cvp, lock, td);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
+ "Waiting on \"%s\"", cvp->cv_description);
+ class = LOCK_CLASS(lock);
+
+ if (cold || panicstr) {
+ /*
+ * After a panic, or during autoconfiguration, just give
+ * interrupts a chance, then just return; don't run any other
+ * thread or panic below, in case this is the idle process and
+ * already asleep.
+ */
+ return 0;
+ }
+
+ sleepq_lock(cvp);
+
+ cvp->cv_waiters++;
+ if (lock == &Giant.lock_object)
+ mtx_assert(&Giant, MA_OWNED);
+ DROP_GIANT();
+
+ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
+ sleepq_set_timeout_sbt(cvp, sbt, pr, flags);
+ if (lock != &Giant.lock_object) {
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
+ WITNESS_SAVE(lock, lock_witness);
+ lock_state = class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
+ }
+ rval = sleepq_timedwait(cvp, 0);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0, cv_wmesg(cvp));
+#endif
+ PICKUP_GIANT();
+ if (lock != &Giant.lock_object) {
+ class->lc_lock(lock, lock_state);
+ WITNESS_RESTORE(lock, lock_witness);
+ }
+
+ return (rval);
+}
+
+/*
+ * Wait on a condition variable for (at most) the value specified in the sbt
+ * argument, allowing interruption by signals.
+ * Returns 0 if the thread was resumed by cv_signal or cv_broadcast,
+ * EWOULDBLOCK if the timeout expires, and EINTR or ERESTART if a signal
+ * was caught.
+ */
+int
+_cv_timedwait_sig_sbt(struct cv *cvp, struct lock_object *lock,
+ sbintime_t sbt, sbintime_t pr, int flags)
+{
+ WITNESS_SAVE_DECL(lock_witness);
+ struct lock_class *class;
+ struct thread *td;
+ int lock_state, rval;
+
+ td = curthread;
+ lock_state = 0;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0, cv_wmesg(cvp));
+#endif
+ CV_ASSERT(cvp, lock, td);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
+ "Waiting on \"%s\"", cvp->cv_description);
+ class = LOCK_CLASS(lock);
+
+ if (cold || panicstr) {
+ /*
+ * After a panic, or during autoconfiguration, just give
+ * interrupts a chance, then just return; don't run any other
+ * thread or panic below, in case this is the idle process and
+ * already asleep.
+ */
+ return 0;
+ }
+
+ sleepq_lock(cvp);
+
+ cvp->cv_waiters++;
+ if (lock == &Giant.lock_object)
+ mtx_assert(&Giant, MA_OWNED);
+ DROP_GIANT();
+
+ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR |
+ SLEEPQ_INTERRUPTIBLE, 0);
+ sleepq_set_timeout_sbt(cvp, sbt, pr, flags);
+ if (lock != &Giant.lock_object) {
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_release(cvp);
+ WITNESS_SAVE(lock, lock_witness);
+ lock_state = class->lc_unlock(lock);
+ if (class->lc_flags & LC_SLEEPABLE)
+ sleepq_lock(cvp);
+ }
+ rval = sleepq_timedwait_sig(cvp, 0);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0, cv_wmesg(cvp));
+#endif
+ PICKUP_GIANT();
+ if (lock != &Giant.lock_object) {
+ class->lc_lock(lock, lock_state);
+ WITNESS_RESTORE(lock, lock_witness);
+ }
+
+ return (rval);
+}
+
+/*
+ * Signal a condition variable, waking up one waiting thread. This will also
+ * wake up the swapper if the process is not in memory, so that it can bring
+ * the sleeping process in. Note that this may also result in additional threads
+ * being made runnable. Should be called with the same mutex as was passed to
+ * cv_wait held.
+ */
+void
+cv_signal(struct cv *cvp)
+{
+ int wakeup_swapper;
+
+ wakeup_swapper = 0;
+ sleepq_lock(cvp);
+ if (cvp->cv_waiters > 0) {
+ cvp->cv_waiters--;
+ wakeup_swapper = sleepq_signal(cvp, SLEEPQ_CONDVAR, 0, 0);
+ }
+ sleepq_release(cvp);
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+/*
+ * Broadcast a signal to a condition variable. Wakes up all waiting threads.
+ * Should be called with the same mutex as was passed to cv_wait held.
+ */
+void
+cv_broadcastpri(struct cv *cvp, int pri)
+{
+ int wakeup_swapper;
+
+ /*
+ * XXX sleepq_broadcast pri argument changed from -1 meaning
+ * no pri to 0 meaning no pri.
+ */
+ wakeup_swapper = 0;
+ if (pri == -1)
+ pri = 0;
+ sleepq_lock(cvp);
+ if (cvp->cv_waiters > 0) {
+ cvp->cv_waiters = 0;
+ wakeup_swapper = sleepq_broadcast(cvp, SLEEPQ_CONDVAR, pri, 0);
+ }
+ sleepq_release(cvp);
+ if (wakeup_swapper)
+ kick_proc0();
+}
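+
+/*
+ * Typical usage of the interfaces above (a minimal sketch; cv_wait() and
+ * cv_signal() are the wrapper macros from sys/condvar.h, while the mutex,
+ * condvar and flag names below are made up for illustration):
+ *
+ *	struct mtx	m;
+ *	struct cv	c;
+ *	int		ready;
+ *
+ *	mtx_init(&m, "example", NULL, MTX_DEF);
+ *	cv_init(&c, "example");
+ *
+ *	// consumer: sleep until the condition becomes true
+ *	mtx_lock(&m);
+ *	while (ready == 0)
+ *		cv_wait(&c, &m);	// drops and reacquires m
+ *	mtx_unlock(&m);
+ *
+ *	// producer: change the condition, then signal with the mutex held
+ *	mtx_lock(&m);
+ *	ready = 1;
+ *	cv_signal(&c);
+ *	mtx_unlock(&m);
+ */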
diff --git a/sys/kern/kern_conf.c b/sys/kern/kern_conf.c
new file mode 100644
index 0000000..c04d1da
--- /dev/null
+++ b/sys/kern/kern_conf.c
@@ -0,0 +1,1459 @@
+/*-
+ * Copyright (c) 1999-2002 Poul-Henning Kamp
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/bio.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/module.h>
+#include <sys/malloc.h>
+#include <sys/conf.h>
+#include <sys/vnode.h>
+#include <sys/queue.h>
+#include <sys/poll.h>
+#include <sys/sx.h>
+#include <sys/ctype.h>
+#include <sys/ucred.h>
+#include <sys/taskqueue.h>
+#include <machine/stdarg.h>
+
+#include <fs/devfs/devfs_int.h>
+#include <vm/vm.h>
+
+static MALLOC_DEFINE(M_DEVT, "cdev", "cdev storage");
+
+struct mtx devmtx;
+static void destroy_devl(struct cdev *dev);
+static int destroy_dev_sched_cbl(struct cdev *dev,
+ void (*cb)(void *), void *arg);
+static void destroy_dev_tq(void *ctx, int pending);
+static int make_dev_credv(int flags, struct cdev **dres, struct cdevsw *devsw,
+ int unit, struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt,
+ va_list ap);
+
+static struct cdev_priv_list cdevp_free_list =
+ TAILQ_HEAD_INITIALIZER(cdevp_free_list);
+static SLIST_HEAD(free_cdevsw, cdevsw) cdevsw_gt_post_list =
+ SLIST_HEAD_INITIALIZER(cdevsw_gt_post_list);
+
+void
+dev_lock(void)
+{
+
+ mtx_lock(&devmtx);
+}
+
+/*
+ * Free all the memory collected while the cdev mutex was
+ * locked. Since devmtx is after the system map mutex, free() cannot
+ * be called immediately and is postponed until the cdev mutex can be
+ * dropped.
+ */
+static void
+dev_unlock_and_free(void)
+{
+ struct cdev_priv_list cdp_free;
+ struct free_cdevsw csw_free;
+ struct cdev_priv *cdp;
+ struct cdevsw *csw;
+
+ mtx_assert(&devmtx, MA_OWNED);
+
+ /*
+ * Make the local copy of the list heads while the dev_mtx is
+ * held. Free it later.
+ */
+ TAILQ_INIT(&cdp_free);
+ TAILQ_CONCAT(&cdp_free, &cdevp_free_list, cdp_list);
+ csw_free = cdevsw_gt_post_list;
+ SLIST_INIT(&cdevsw_gt_post_list);
+
+ mtx_unlock(&devmtx);
+
+ while ((cdp = TAILQ_FIRST(&cdp_free)) != NULL) {
+ TAILQ_REMOVE(&cdp_free, cdp, cdp_list);
+ devfs_free(&cdp->cdp_c);
+ }
+ while ((csw = SLIST_FIRST(&csw_free)) != NULL) {
+ SLIST_REMOVE_HEAD(&csw_free, d_postfree_list);
+ free(csw, M_DEVT);
+ }
+}
+
+static void
+dev_free_devlocked(struct cdev *cdev)
+{
+ struct cdev_priv *cdp;
+
+ mtx_assert(&devmtx, MA_OWNED);
+ cdp = cdev2priv(cdev);
+ TAILQ_INSERT_HEAD(&cdevp_free_list, cdp, cdp_list);
+}
+
+static void
+cdevsw_free_devlocked(struct cdevsw *csw)
+{
+
+ mtx_assert(&devmtx, MA_OWNED);
+ SLIST_INSERT_HEAD(&cdevsw_gt_post_list, csw, d_postfree_list);
+}
+
+void
+dev_unlock(void)
+{
+
+ mtx_unlock(&devmtx);
+}
+
+void
+dev_ref(struct cdev *dev)
+{
+
+ mtx_assert(&devmtx, MA_NOTOWNED);
+ mtx_lock(&devmtx);
+ dev->si_refcount++;
+ mtx_unlock(&devmtx);
+}
+
+void
+dev_refl(struct cdev *dev)
+{
+
+ mtx_assert(&devmtx, MA_OWNED);
+ dev->si_refcount++;
+}
+
+void
+dev_rel(struct cdev *dev)
+{
+ int flag = 0;
+
+ mtx_assert(&devmtx, MA_NOTOWNED);
+ dev_lock();
+ dev->si_refcount--;
+ KASSERT(dev->si_refcount >= 0,
+ ("dev_rel(%s) gave negative count", devtoname(dev)));
+#if 0
+ if (dev->si_usecount == 0 &&
+ (dev->si_flags & SI_CHEAPCLONE) && (dev->si_flags & SI_NAMED))
+ ;
+ else
+#endif
+ if (dev->si_devsw == NULL && dev->si_refcount == 0) {
+ LIST_REMOVE(dev, si_list);
+ flag = 1;
+ }
+ dev_unlock();
+ if (flag)
+ devfs_free(dev);
+}
+
+struct cdevsw *
+dev_refthread(struct cdev *dev, int *ref)
+{
+ struct cdevsw *csw;
+ struct cdev_priv *cdp;
+
+ mtx_assert(&devmtx, MA_NOTOWNED);
+ if ((dev->si_flags & SI_ETERNAL) != 0) {
+ *ref = 0;
+ return (dev->si_devsw);
+ }
+ dev_lock();
+ csw = dev->si_devsw;
+ if (csw != NULL) {
+ cdp = cdev2priv(dev);
+ if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0)
+ dev->si_threadcount++;
+ else
+ csw = NULL;
+ }
+ dev_unlock();
+ *ref = 1;
+ return (csw);
+}
+
+struct cdevsw *
+devvn_refthread(struct vnode *vp, struct cdev **devp, int *ref)
+{
+ struct cdevsw *csw;
+ struct cdev_priv *cdp;
+ struct cdev *dev;
+
+ mtx_assert(&devmtx, MA_NOTOWNED);
+ if ((vp->v_vflag & VV_ETERNALDEV) != 0) {
+ dev = vp->v_rdev;
+ if (dev == NULL)
+ return (NULL);
+ KASSERT((dev->si_flags & SI_ETERNAL) != 0,
+ ("Not eternal cdev"));
+ *ref = 0;
+ csw = dev->si_devsw;
+ KASSERT(csw != NULL, ("Eternal cdev is destroyed"));
+ *devp = dev;
+ return (csw);
+ }
+
+ csw = NULL;
+ dev_lock();
+ dev = vp->v_rdev;
+ if (dev == NULL) {
+ dev_unlock();
+ return (NULL);
+ }
+ cdp = cdev2priv(dev);
+ if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0) {
+ csw = dev->si_devsw;
+ if (csw != NULL)
+ dev->si_threadcount++;
+ }
+ dev_unlock();
+ if (csw != NULL) {
+ *devp = dev;
+ *ref = 1;
+ }
+ return (csw);
+}
+
+void
+dev_relthread(struct cdev *dev, int ref)
+{
+
+ mtx_assert(&devmtx, MA_NOTOWNED);
+ if (!ref)
+ return;
+ dev_lock();
+ KASSERT(dev->si_threadcount > 0,
+ ("%s threadcount is wrong", dev->si_name));
+ dev->si_threadcount--;
+ dev_unlock();
+}
+
+int
+nullop(void)
+{
+
+ return (0);
+}
+
+int
+eopnotsupp(void)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+enxio(void)
+{
+ return (ENXIO);
+}
+
+static int
+enodev(void)
+{
+ return (ENODEV);
+}
+
+/* Define a dead_cdevsw for use when devices leave unexpectedly. */
+
+#define dead_open (d_open_t *)enxio
+#define dead_close (d_close_t *)enxio
+#define dead_read (d_read_t *)enxio
+#define dead_write (d_write_t *)enxio
+#define dead_ioctl (d_ioctl_t *)enxio
+#define dead_poll (d_poll_t *)enodev
+#define dead_mmap (d_mmap_t *)enodev
+
+static void
+dead_strategy(struct bio *bp)
+{
+
+ biofinish(bp, NULL, ENXIO);
+}
+
+#define dead_dump (dumper_t *)enxio
+#define dead_kqfilter (d_kqfilter_t *)enxio
+#define dead_mmap_single (d_mmap_single_t *)enodev
+
+static struct cdevsw dead_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = dead_open,
+ .d_close = dead_close,
+ .d_read = dead_read,
+ .d_write = dead_write,
+ .d_ioctl = dead_ioctl,
+ .d_poll = dead_poll,
+ .d_mmap = dead_mmap,
+ .d_strategy = dead_strategy,
+ .d_name = "dead",
+ .d_dump = dead_dump,
+ .d_kqfilter = dead_kqfilter,
+ .d_mmap_single = dead_mmap_single
+};
+
+/* Default methods if driver does not specify method */
+
+#define null_open (d_open_t *)nullop
+#define null_close (d_close_t *)nullop
+#define no_read (d_read_t *)enodev
+#define no_write (d_write_t *)enodev
+#define no_ioctl (d_ioctl_t *)enodev
+#define no_mmap (d_mmap_t *)enodev
+#define no_kqfilter (d_kqfilter_t *)enodev
+#define no_mmap_single (d_mmap_single_t *)enodev
+
+static void
+no_strategy(struct bio *bp)
+{
+
+ biofinish(bp, NULL, ENODEV);
+}
+
+static int
+no_poll(struct cdev *dev __unused, int events, struct thread *td __unused)
+{
+
+ return (poll_no_poll(events));
+}
+
+#define no_dump (dumper_t *)enodev
+
+static int
+giant_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_open(dev, oflags, devtype, td);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static int
+giant_fdopen(struct cdev *dev, int oflags, struct thread *td, struct file *fp)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_fdopen(dev, oflags, td, fp);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static int
+giant_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_close(dev, fflag, devtype, td);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static void
+giant_strategy(struct bio *bp)
+{
+ struct cdevsw *dsw;
+ struct cdev *dev;
+ int ref;
+
+ dev = bp->bio_dev;
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL) {
+ biofinish(bp, NULL, ENXIO);
+ return;
+ }
+ mtx_lock(&Giant);
+ dsw->d_gianttrick->d_strategy(bp);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+}
+
+static int
+giant_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_ioctl(dev, cmd, data, fflag, td);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static int
+giant_read(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_read(dev, uio, ioflag);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static int
+giant_write(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_write(dev, uio, ioflag);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static int
+giant_poll(struct cdev *dev, int events, struct thread *td)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_poll(dev, events, td);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static int
+giant_kqfilter(struct cdev *dev, struct knote *kn)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_kqfilter(dev, kn);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static int
+giant_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot,
+ vm_memattr_t *memattr)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_mmap(dev, offset, paddr, nprot,
+ memattr);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static int
+giant_mmap_single(struct cdev *dev, vm_ooffset_t *offset, vm_size_t size,
+ vm_object_t *object, int nprot)
+{
+ struct cdevsw *dsw;
+ int ref, retval;
+
+ dsw = dev_refthread(dev, &ref);
+ if (dsw == NULL)
+ return (ENXIO);
+ mtx_lock(&Giant);
+ retval = dsw->d_gianttrick->d_mmap_single(dev, offset, size, object,
+ nprot);
+ mtx_unlock(&Giant);
+ dev_relthread(dev, ref);
+ return (retval);
+}
+
+static void
+notify(struct cdev *dev, const char *ev, int flags)
+{
+ static const char prefix[] = "cdev=";
+ char *data;
+ int namelen, mflags;
+
+ if (cold)
+ return;
+ mflags = (flags & MAKEDEV_NOWAIT) ? M_NOWAIT : M_WAITOK;
+ namelen = strlen(dev->si_name);
+ data = malloc(namelen + sizeof(prefix), M_TEMP, mflags);
+ if (data == NULL)
+ return;
+ memcpy(data, prefix, sizeof(prefix) - 1);
+ memcpy(data + sizeof(prefix) - 1, dev->si_name, namelen + 1);
+ devctl_notify_f("DEVFS", "CDEV", ev, data, mflags);
+ free(data, M_TEMP);
+}
+
+static void
+notify_create(struct cdev *dev, int flags)
+{
+
+ notify(dev, "CREATE", flags);
+}
+
+static void
+notify_destroy(struct cdev *dev)
+{
+
+ notify(dev, "DESTROY", MAKEDEV_WAITOK);
+}
+
+static struct cdev *
+newdev(struct cdevsw *csw, int unit, struct cdev *si)
+{
+ struct cdev *si2;
+
+ mtx_assert(&devmtx, MA_OWNED);
+ if (csw->d_flags & D_NEEDMINOR) {
+ /* We may want to return an existing device */
+ LIST_FOREACH(si2, &csw->d_devs, si_list) {
+ if (dev2unit(si2) == unit) {
+ dev_free_devlocked(si);
+ return (si2);
+ }
+ }
+ }
+ si->si_drv0 = unit;
+ si->si_devsw = csw;
+ LIST_INSERT_HEAD(&csw->d_devs, si, si_list);
+ return (si);
+}
+
+static void
+fini_cdevsw(struct cdevsw *devsw)
+{
+ struct cdevsw *gt;
+
+ if (devsw->d_gianttrick != NULL) {
+ gt = devsw->d_gianttrick;
+ memcpy(devsw, gt, sizeof *devsw);
+ cdevsw_free_devlocked(gt);
+ devsw->d_gianttrick = NULL;
+ }
+ devsw->d_flags &= ~D_INIT;
+}
+
+static int
+prep_cdevsw(struct cdevsw *devsw, int flags)
+{
+ struct cdevsw *dsw2;
+
+ mtx_assert(&devmtx, MA_OWNED);
+ if (devsw->d_flags & D_INIT)
+ return (0);
+ if (devsw->d_flags & D_NEEDGIANT) {
+ dev_unlock();
+ dsw2 = malloc(sizeof *dsw2, M_DEVT,
+ (flags & MAKEDEV_NOWAIT) ? M_NOWAIT : M_WAITOK);
+ dev_lock();
+ if (dsw2 == NULL && !(devsw->d_flags & D_INIT))
+ return (ENOMEM);
+ } else
+ dsw2 = NULL;
+ if (devsw->d_flags & D_INIT) {
+ if (dsw2 != NULL)
+ cdevsw_free_devlocked(dsw2);
+ return (0);
+ }
+
+ if (devsw->d_version != D_VERSION_03) {
+ printf(
+ "WARNING: Device driver \"%s\" has wrong version %s\n",
+ devsw->d_name == NULL ? "???" : devsw->d_name,
+ "and is disabled. Recompile KLD module.");
+ devsw->d_open = dead_open;
+ devsw->d_close = dead_close;
+ devsw->d_read = dead_read;
+ devsw->d_write = dead_write;
+ devsw->d_ioctl = dead_ioctl;
+ devsw->d_poll = dead_poll;
+ devsw->d_mmap = dead_mmap;
+ devsw->d_mmap_single = dead_mmap_single;
+ devsw->d_strategy = dead_strategy;
+ devsw->d_dump = dead_dump;
+ devsw->d_kqfilter = dead_kqfilter;
+ }
+
+ if (devsw->d_flags & D_NEEDGIANT) {
+ if (devsw->d_gianttrick == NULL) {
+ memcpy(dsw2, devsw, sizeof *dsw2);
+ devsw->d_gianttrick = dsw2;
+ dsw2 = NULL;
+ }
+ }
+
+#define FIXUP(member, noop, giant) \
+ do { \
+ if (devsw->member == NULL) { \
+ devsw->member = noop; \
+ } else if (devsw->d_flags & D_NEEDGIANT) \
+ devsw->member = giant; \
+ } \
+ while (0)
+
+ FIXUP(d_open, null_open, giant_open);
+ FIXUP(d_fdopen, NULL, giant_fdopen);
+ FIXUP(d_close, null_close, giant_close);
+ FIXUP(d_read, no_read, giant_read);
+ FIXUP(d_write, no_write, giant_write);
+ FIXUP(d_ioctl, no_ioctl, giant_ioctl);
+ FIXUP(d_poll, no_poll, giant_poll);
+ FIXUP(d_mmap, no_mmap, giant_mmap);
+ FIXUP(d_strategy, no_strategy, giant_strategy);
+ FIXUP(d_kqfilter, no_kqfilter, giant_kqfilter);
+ FIXUP(d_mmap_single, no_mmap_single, giant_mmap_single);
+
+ if (devsw->d_dump == NULL)
+ devsw->d_dump = no_dump;
+
+ LIST_INIT(&devsw->d_devs);
+
+ devsw->d_flags |= D_INIT;
+
+ if (dsw2 != NULL)
+ cdevsw_free_devlocked(dsw2);
+ return (0);
+}
+
+static int
+prep_devname(struct cdev *dev, const char *fmt, va_list ap)
+{
+ int len;
+ char *from, *q, *s, *to;
+
+ mtx_assert(&devmtx, MA_OWNED);
+
+ len = vsnrprintf(dev->si_name, sizeof(dev->si_name), 32, fmt, ap);
+ if (len > sizeof(dev->si_name) - 1)
+ return (ENAMETOOLONG);
+
+ /* Strip leading slashes. */
+ for (from = dev->si_name; *from == '/'; from++)
+ ;
+
+ for (to = dev->si_name; *from != '\0'; from++, to++) {
+ /*
+ * Spaces and double quotation marks cause
+ * problems for the devctl(4) protocol.
+ * Reject names containing those characters.
+ */
+ if (isspace(*from) || *from == '"')
+ return (EINVAL);
+ /* Treat multiple sequential slashes as single. */
+ while (from[0] == '/' && from[1] == '/')
+ from++;
+ /* Trailing slash is considered invalid. */
+ if (from[0] == '/' && from[1] == '\0')
+ return (EINVAL);
+ *to = *from;
+ }
+ *to = '\0';
+
+ if (dev->si_name[0] == '\0')
+ return (EINVAL);
+
+ /* Disallow "." and ".." components. */
+ for (s = dev->si_name;;) {
+ for (q = s; *q != '/' && *q != '\0'; q++)
+ ;
+ if (q - s == 1 && s[0] == '.')
+ return (EINVAL);
+ if (q - s == 2 && s[0] == '.' && s[1] == '.')
+ return (EINVAL);
+ if (*q != '/')
+ break;
+ s = q + 1;
+ }
+
+ if (devfs_dev_exists(dev->si_name) != 0)
+ return (EEXIST);
+
+ return (0);
+}
+
+static int
+make_dev_credv(int flags, struct cdev **dres, struct cdevsw *devsw, int unit,
+ struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt,
+ va_list ap)
+{
+ struct cdev *dev, *dev_new;
+ int res;
+
+ KASSERT((flags & MAKEDEV_WAITOK) == 0 || (flags & MAKEDEV_NOWAIT) == 0,
+ ("make_dev_credv: both WAITOK and NOWAIT specified"));
+ dev_new = devfs_alloc(flags);
+ if (dev_new == NULL)
+ return (ENOMEM);
+ dev_lock();
+ res = prep_cdevsw(devsw, flags);
+ if (res != 0) {
+ dev_unlock();
+ devfs_free(dev_new);
+ return (res);
+ }
+ dev = newdev(devsw, unit, dev_new);
+ if ((dev->si_flags & SI_NAMED) == 0) {
+ res = prep_devname(dev, fmt, ap);
+ if (res != 0) {
+ if ((flags & MAKEDEV_CHECKNAME) == 0) {
+ panic(
+ "make_dev_credv: bad si_name (error=%d, si_name=%s)",
+ res, dev->si_name);
+ }
+ if (dev == dev_new) {
+ LIST_REMOVE(dev, si_list);
+ dev_unlock();
+ devfs_free(dev);
+ } else
+ dev_unlock();
+ return (res);
+ }
+ }
+ if (flags & MAKEDEV_REF)
+ dev_refl(dev);
+ if (flags & MAKEDEV_ETERNAL)
+ dev->si_flags |= SI_ETERNAL;
+ if (dev->si_flags & SI_CHEAPCLONE &&
+ dev->si_flags & SI_NAMED) {
+ /*
+ * This is allowed as it removes races and generally
+ * simplifies cloning devices.
+ * XXX: still ??
+ */
+ dev_unlock_and_free();
+ *dres = dev;
+ return (0);
+ }
+ KASSERT(!(dev->si_flags & SI_NAMED),
+ ("make_dev() by driver %s on pre-existing device (min=%x, name=%s)",
+ devsw->d_name, dev2unit(dev), devtoname(dev)));
+ dev->si_flags |= SI_NAMED;
+ if (cr != NULL)
+ dev->si_cred = crhold(cr);
+ dev->si_uid = uid;
+ dev->si_gid = gid;
+ dev->si_mode = mode;
+
+ devfs_create(dev);
+ clean_unrhdrl(devfs_inos);
+ dev_unlock_and_free();
+
+ notify_create(dev, flags);
+
+ *dres = dev;
+ return (0);
+}
+
+struct cdev *
+make_dev(struct cdevsw *devsw, int unit, uid_t uid, gid_t gid, int mode,
+ const char *fmt, ...)
+{
+ struct cdev *dev;
+ va_list ap;
+ int res;
+
+ va_start(ap, fmt);
+ res = make_dev_credv(0, &dev, devsw, unit, NULL, uid, gid, mode, fmt,
+ ap);
+ va_end(ap);
+ KASSERT(res == 0 && dev != NULL,
+ ("make_dev: failed make_dev_credv (error=%d)", res));
+ return (dev);
+}
+
+struct cdev *
+make_dev_cred(struct cdevsw *devsw, int unit, struct ucred *cr, uid_t uid,
+ gid_t gid, int mode, const char *fmt, ...)
+{
+ struct cdev *dev;
+ va_list ap;
+ int res;
+
+ va_start(ap, fmt);
+ res = make_dev_credv(0, &dev, devsw, unit, cr, uid, gid, mode, fmt, ap);
+ va_end(ap);
+
+ KASSERT(res == 0 && dev != NULL,
+ ("make_dev_cred: failed make_dev_credv (error=%d)", res));
+ return (dev);
+}
+
+struct cdev *
+make_dev_credf(int flags, struct cdevsw *devsw, int unit, struct ucred *cr,
+ uid_t uid, gid_t gid, int mode, const char *fmt, ...)
+{
+ struct cdev *dev;
+ va_list ap;
+ int res;
+
+ va_start(ap, fmt);
+ res = make_dev_credv(flags, &dev, devsw, unit, cr, uid, gid, mode,
+ fmt, ap);
+ va_end(ap);
+
+ KASSERT(((flags & MAKEDEV_NOWAIT) != 0 && res == ENOMEM) ||
+ ((flags & MAKEDEV_CHECKNAME) != 0 && res != ENOMEM) || res == 0,
+ ("make_dev_credf: failed make_dev_credv (error=%d)", res));
+ return (res == 0 ? dev : NULL);
+}
+
+int
+make_dev_p(int flags, struct cdev **cdev, struct cdevsw *devsw,
+ struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt, ...)
+{
+ va_list ap;
+ int res;
+
+ va_start(ap, fmt);
+ res = make_dev_credv(flags, cdev, devsw, 0, cr, uid, gid, mode,
+ fmt, ap);
+ va_end(ap);
+
+ KASSERT(((flags & MAKEDEV_NOWAIT) != 0 && res == ENOMEM) ||
+ ((flags & MAKEDEV_CHECKNAME) != 0 && res != ENOMEM) || res == 0,
+ ("make_dev_p: failed make_dev_credv (error=%d)", res));
+ return (res);
+}
+
+static void
+dev_dependsl(struct cdev *pdev, struct cdev *cdev)
+{
+
+ cdev->si_parent = pdev;
+ cdev->si_flags |= SI_CHILD;
+ LIST_INSERT_HEAD(&pdev->si_children, cdev, si_siblings);
+}
+
+
+void
+dev_depends(struct cdev *pdev, struct cdev *cdev)
+{
+
+ dev_lock();
+ dev_dependsl(pdev, cdev);
+ dev_unlock();
+}
+
+static int
+make_dev_alias_v(int flags, struct cdev **cdev, struct cdev *pdev,
+ const char *fmt, va_list ap)
+{
+ struct cdev *dev;
+ int error;
+
+ KASSERT(pdev != NULL, ("make_dev_alias_v: pdev is NULL"));
+ KASSERT((flags & MAKEDEV_WAITOK) == 0 || (flags & MAKEDEV_NOWAIT) == 0,
+ ("make_dev_alias_v: both WAITOK and NOWAIT specified"));
+ KASSERT((flags & ~(MAKEDEV_WAITOK | MAKEDEV_NOWAIT |
+ MAKEDEV_CHECKNAME)) == 0,
+ ("make_dev_alias_v: invalid flags specified (flags=%02x)", flags));
+
+ dev = devfs_alloc(flags);
+ if (dev == NULL)
+ return (ENOMEM);
+ dev_lock();
+ dev->si_flags |= SI_ALIAS;
+ error = prep_devname(dev, fmt, ap);
+ if (error != 0) {
+ if ((flags & MAKEDEV_CHECKNAME) == 0) {
+ panic("make_dev_alias_v: bad si_name "
+ "(error=%d, si_name=%s)", error, dev->si_name);
+ }
+ dev_unlock();
+ devfs_free(dev);
+ return (error);
+ }
+ dev->si_flags |= SI_NAMED;
+ devfs_create(dev);
+ dev_dependsl(pdev, dev);
+ clean_unrhdrl(devfs_inos);
+ dev_unlock();
+
+ notify_create(dev, flags);
+ *cdev = dev;
+
+ return (0);
+}
+
+struct cdev *
+make_dev_alias(struct cdev *pdev, const char *fmt, ...)
+{
+ struct cdev *dev;
+ va_list ap;
+ int res;
+
+ va_start(ap, fmt);
+ res = make_dev_alias_v(MAKEDEV_WAITOK, &dev, pdev, fmt, ap);
+ va_end(ap);
+
+ KASSERT(res == 0 && dev != NULL,
+ ("make_dev_alias: failed make_dev_alias_v (error=%d)", res));
+ return (dev);
+}
+
+int
+make_dev_alias_p(int flags, struct cdev **cdev, struct cdev *pdev,
+ const char *fmt, ...)
+{
+ va_list ap;
+ int res;
+
+ va_start(ap, fmt);
+ res = make_dev_alias_v(flags, cdev, pdev, fmt, ap);
+ va_end(ap);
+ return (res);
+}
+
+int
+make_dev_physpath_alias(int flags, struct cdev **cdev, struct cdev *pdev,
+ struct cdev *old_alias, const char *physpath)
+{
+ char *devfspath;
+ int physpath_len;
+ int max_parentpath_len;
+ int parentpath_len;
+ int devfspathbuf_len;
+ int mflags;
+ int ret;
+
+ *cdev = NULL;
+ devfspath = NULL;
+ physpath_len = strlen(physpath);
+ ret = EINVAL;
+ if (physpath_len == 0)
+ goto out;
+
+ if (strncmp("id1,", physpath, 4) == 0) {
+ physpath += 4;
+ physpath_len -= 4;
+ if (physpath_len == 0)
+ goto out;
+ }
+
+ max_parentpath_len = SPECNAMELEN - physpath_len - /*/*/1;
+ parentpath_len = strlen(pdev->si_name);
+ if (max_parentpath_len < parentpath_len) {
+ if (bootverbose)
+ printf("WARNING: Unable to alias %s "
+ "to %s/%s - path too long\n",
+ pdev->si_name, physpath, pdev->si_name);
+ ret = ENAMETOOLONG;
+ goto out;
+ }
+
+ mflags = (flags & MAKEDEV_NOWAIT) ? M_NOWAIT : M_WAITOK;
+ devfspathbuf_len = physpath_len + /*/*/1 + parentpath_len + /*NUL*/1;
+ devfspath = malloc(devfspathbuf_len, M_DEVBUF, mflags);
+ if (devfspath == NULL) {
+ ret = ENOMEM;
+ goto out;
+ }
+
+ sprintf(devfspath, "%s/%s", physpath, pdev->si_name);
+ if (old_alias != NULL && strcmp(old_alias->si_name, devfspath) == 0) {
+ /* Retain the existing alias. */
+ *cdev = old_alias;
+ old_alias = NULL;
+ ret = 0;
+ } else {
+ ret = make_dev_alias_p(flags, cdev, pdev, "%s", devfspath);
+ }
+out:
+ if (old_alias != NULL)
+ destroy_dev(old_alias);
+ if (devfspath != NULL)
+ free(devfspath, M_DEVBUF);
+ return (ret);
+}
+
+static void
+destroy_devl(struct cdev *dev)
+{
+ struct cdevsw *csw;
+ struct cdev_privdata *p;
+
+ mtx_assert(&devmtx, MA_OWNED);
+ KASSERT(dev->si_flags & SI_NAMED,
+ ("WARNING: Driver mistake: destroy_dev on %d\n", dev2unit(dev)));
+ KASSERT((dev->si_flags & SI_ETERNAL) == 0,
+ ("WARNING: Driver mistake: destroy_dev on eternal %d\n",
+ dev2unit(dev)));
+
+ devfs_destroy(dev);
+
+ /* Remove name marking */
+ dev->si_flags &= ~SI_NAMED;
+
+ dev->si_refcount++; /* Avoid race with dev_rel() */
+
+ /* If we are a child, remove us from the parents list */
+ if (dev->si_flags & SI_CHILD) {
+ LIST_REMOVE(dev, si_siblings);
+ dev->si_flags &= ~SI_CHILD;
+ }
+
+ /* Kill our children */
+ while (!LIST_EMPTY(&dev->si_children))
+ destroy_devl(LIST_FIRST(&dev->si_children));
+
+ /* Remove from clone list */
+ if (dev->si_flags & SI_CLONELIST) {
+ LIST_REMOVE(dev, si_clone);
+ dev->si_flags &= ~SI_CLONELIST;
+ }
+
+ csw = dev->si_devsw;
+ dev->si_devsw = NULL; /* already NULL for SI_ALIAS */
+ while (csw != NULL && csw->d_purge != NULL && dev->si_threadcount) {
+ csw->d_purge(dev);
+ msleep(csw, &devmtx, PRIBIO, "devprg", hz/10);
+ if (dev->si_threadcount)
+ printf("Still %lu threads in %s\n",
+ dev->si_threadcount, devtoname(dev));
+ }
+ while (dev->si_threadcount != 0) {
+ /* Use unique dummy wait ident */
+ msleep(&csw, &devmtx, PRIBIO, "devdrn", hz / 10);
+ }
+
+ dev_unlock();
+ notify_destroy(dev);
+ mtx_lock(&cdevpriv_mtx);
+ while ((p = LIST_FIRST(&cdev2priv(dev)->cdp_fdpriv)) != NULL) {
+ devfs_destroy_cdevpriv(p);
+ mtx_lock(&cdevpriv_mtx);
+ }
+ mtx_unlock(&cdevpriv_mtx);
+ dev_lock();
+
+ dev->si_drv1 = 0;
+ dev->si_drv2 = 0;
+ bzero(&dev->__si_u, sizeof(dev->__si_u));
+
+ if (!(dev->si_flags & SI_ALIAS)) {
+ /* Remove from cdevsw list */
+ LIST_REMOVE(dev, si_list);
+
+ /* If cdevsw has no more struct cdev *'s, clean it */
+ if (LIST_EMPTY(&csw->d_devs)) {
+ fini_cdevsw(csw);
+ wakeup(&csw->d_devs);
+ }
+ }
+ dev->si_flags &= ~SI_ALIAS;
+ dev->si_refcount--; /* Avoid race with dev_rel() */
+
+ if (dev->si_refcount > 0) {
+ LIST_INSERT_HEAD(&dead_cdevsw.d_devs, dev, si_list);
+ } else {
+ dev_free_devlocked(dev);
+ }
+}
+
+void
+destroy_dev(struct cdev *dev)
+{
+
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "destroy_dev");
+ dev_lock();
+ destroy_devl(dev);
+ dev_unlock_and_free();
+}
+
+const char *
+devtoname(struct cdev *dev)
+{
+
+ return (dev->si_name);
+}
+
+int
+dev_stdclone(char *name, char **namep, const char *stem, int *unit)
+{
+ int u, i;
+
+ i = strlen(stem);
+ if (bcmp(stem, name, i) != 0)
+ return (0);
+ if (!isdigit(name[i]))
+ return (0);
+ u = 0;
+ if (name[i] == '0' && isdigit(name[i+1]))
+ return (0);
+ while (isdigit(name[i])) {
+ u *= 10;
+ u += name[i++] - '0';
+ }
+ if (u > 0xffffff)
+ return (0);
+ *unit = u;
+ if (namep)
+ *namep = &name[i];
+ if (name[i])
+ return (2);
+ return (1);
+}
+
+/*
+ * Helper functions for cloning device drivers.
+ *
+ * The objective here is to make it unnecessary for the device drivers to
+ * use rman or similar to manage their unit number space. Due to the way
+ * we do "on-demand" devices, using rman or other "private" methods
+ * will be very tricky to lock down properly once we lock down this file.
+ *
+ * Instead we give the drivers these routines, which put the struct cdev *'s
+ * that are to be managed on their own list, and give the driver the ability
+ * to ask for the first free unit number or a given specified unit number.
+ *
+ * In addition these routines support paired devices (pty, nmdm and similar)
+ * by respecting a number of "flag" bits in the minor number.
+ *
+ */
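+
+/*
+ * Illustrative sketch (not part of the original file): how a cloning driver
+ * typically uses these helpers from its dev_clone event handler, registered
+ * with EVENTHANDLER_REGISTER(dev_clone, ...).  The identifiers
+ * mydev_clonedevs, mydev_cdevsw and mydev_clone are hypothetical
+ * placeholders, not real kernel symbols.
+ */
+#if 0
+static struct clonedevs *mydev_clonedevs;   /* set up once with clone_setup() */
+static struct cdevsw mydev_cdevsw;          /* hypothetical cdevsw; must set D_NEEDMINOR */
+
+static void
+mydev_clone(void *arg, struct ucred *cred, char *name, int namelen,
+    struct cdev **dev)
+{
+    int u;
+
+    if (*dev != NULL)
+        return;
+    if (strcmp(name, "mydev") == 0)
+        u = -1;                             /* ask for the lowest free unit */
+    else if (dev_stdclone(name, NULL, "mydev", &u) != 1)
+        return;                             /* not one of ours */
+    /* Reuse an existing unit's cdev, or reserve the requested/lowest unit. */
+    if (clone_create(&mydev_clonedevs, &mydev_cdevsw, &u, dev, 0) != 0)
+        *dev = make_dev_credf(MAKEDEV_REF, &mydev_cdevsw, u, cred,
+            UID_ROOT, GID_WHEEL, 0600, "mydev%d", u);
+}
+#endif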
+
+struct clonedevs {
+ LIST_HEAD(,cdev) head;
+};
+
+void
+clone_setup(struct clonedevs **cdp)
+{
+
+ *cdp = malloc(sizeof **cdp, M_DEVBUF, M_WAITOK | M_ZERO);
+ LIST_INIT(&(*cdp)->head);
+}
+
+int
+clone_create(struct clonedevs **cdp, struct cdevsw *csw, int *up,
+ struct cdev **dp, int extra)
+{
+ struct clonedevs *cd;
+ struct cdev *dev, *ndev, *dl, *de;
+ int unit, low, u;
+
+ KASSERT(*cdp != NULL,
+ ("clone_setup() not called in driver \"%s\"", csw->d_name));
+ KASSERT(!(extra & CLONE_UNITMASK),
+ ("Illegal extra bits (0x%x) in clone_create", extra));
+ KASSERT(*up <= CLONE_UNITMASK,
+ ("Too high unit (0x%x) in clone_create", *up));
+ KASSERT(csw->d_flags & D_NEEDMINOR,
+ ("clone_create() on cdevsw without minor numbers"));
+
+
+ /*
+ * Search the list for a lot of things in one go:
+ * A preexisting match is returned immediately.
+ * The lowest free unit number if we are passed -1, and the place
+ * in the list where we should insert that new element.
+ * The place to insert a specified unit number, which may be
+ * at the end of the list.
+ */
+ unit = *up;
+ ndev = devfs_alloc(MAKEDEV_WAITOK);
+ dev_lock();
+ prep_cdevsw(csw, MAKEDEV_WAITOK);
+ low = extra;
+ de = dl = NULL;
+ cd = *cdp;
+ LIST_FOREACH(dev, &cd->head, si_clone) {
+ KASSERT(dev->si_flags & SI_CLONELIST,
+ ("Dev %p(%s) should be on clonelist", dev, dev->si_name));
+ u = dev2unit(dev);
+ if (u == (unit | extra)) {
+ *dp = dev;
+ dev_unlock();
+ devfs_free(ndev);
+ return (0);
+ }
+ if (unit == -1 && u == low) {
+ low++;
+ de = dev;
+ continue;
+ } else if (u < (unit | extra)) {
+ de = dev;
+ continue;
+ } else if (u > (unit | extra)) {
+ dl = dev;
+ break;
+ }
+ }
+ if (unit == -1)
+ unit = low & CLONE_UNITMASK;
+ dev = newdev(csw, unit | extra, ndev);
+ if (dev->si_flags & SI_CLONELIST) {
+ printf("dev %p (%s) is on clonelist\n", dev, dev->si_name);
+ printf("unit=%d, low=%d, extra=0x%x\n", unit, low, extra);
+ LIST_FOREACH(dev, &cd->head, si_clone) {
+ printf("\t%p %s\n", dev, dev->si_name);
+ }
+ panic("foo");
+ }
+ KASSERT(!(dev->si_flags & SI_CLONELIST),
+ ("Dev %p(%s) should not be on clonelist", dev, dev->si_name));
+ if (dl != NULL)
+ LIST_INSERT_BEFORE(dl, dev, si_clone);
+ else if (de != NULL)
+ LIST_INSERT_AFTER(de, dev, si_clone);
+ else
+ LIST_INSERT_HEAD(&cd->head, dev, si_clone);
+ dev->si_flags |= SI_CLONELIST;
+ *up = unit;
+ dev_unlock_and_free();
+ return (1);
+}
+
+/*
+ * Kill everything still on the list. The driver should already have
+ * disposed of any softc hung off the struct cdev *'s at this time.
+ */
+void
+clone_cleanup(struct clonedevs **cdp)
+{
+ struct cdev *dev;
+ struct cdev_priv *cp;
+ struct clonedevs *cd;
+
+ cd = *cdp;
+ if (cd == NULL)
+ return;
+ dev_lock();
+ while (!LIST_EMPTY(&cd->head)) {
+ dev = LIST_FIRST(&cd->head);
+ LIST_REMOVE(dev, si_clone);
+ KASSERT(dev->si_flags & SI_CLONELIST,
+ ("Dev %p(%s) should be on clonelist", dev, dev->si_name));
+ dev->si_flags &= ~SI_CLONELIST;
+ cp = cdev2priv(dev);
+ if (!(cp->cdp_flags & CDP_SCHED_DTR)) {
+ cp->cdp_flags |= CDP_SCHED_DTR;
+ KASSERT(dev->si_flags & SI_NAMED,
+ ("Driver has goofed while cloning, udev %x unit %x", dev2udev(dev), dev2unit(dev)));
+ destroy_devl(dev);
+ }
+ }
+ dev_unlock_and_free();
+ free(cd, M_DEVBUF);
+ *cdp = NULL;
+}
+
+static TAILQ_HEAD(, cdev_priv) dev_ddtr =
+ TAILQ_HEAD_INITIALIZER(dev_ddtr);
+static struct task dev_dtr_task = TASK_INITIALIZER(0, destroy_dev_tq, NULL);
+
+static void
+destroy_dev_tq(void *ctx, int pending)
+{
+ struct cdev_priv *cp;
+ struct cdev *dev;
+ void (*cb)(void *);
+ void *cb_arg;
+
+ dev_lock();
+ while (!TAILQ_EMPTY(&dev_ddtr)) {
+ cp = TAILQ_FIRST(&dev_ddtr);
+ dev = &cp->cdp_c;
+ KASSERT(cp->cdp_flags & CDP_SCHED_DTR,
+ ("cdev %p in dev_destroy_tq without CDP_SCHED_DTR", cp));
+ TAILQ_REMOVE(&dev_ddtr, cp, cdp_dtr_list);
+ cb = cp->cdp_dtr_cb;
+ cb_arg = cp->cdp_dtr_cb_arg;
+ destroy_devl(dev);
+ dev_unlock_and_free();
+ dev_rel(dev);
+ if (cb != NULL)
+ cb(cb_arg);
+ dev_lock();
+ }
+ dev_unlock();
+}
+
+/*
+ * devmtx shall be locked on entry. devmtx will be unlocked after
+ * function return.
+ */
+static int
+destroy_dev_sched_cbl(struct cdev *dev, void (*cb)(void *), void *arg)
+{
+ struct cdev_priv *cp;
+
+ mtx_assert(&devmtx, MA_OWNED);
+ cp = cdev2priv(dev);
+ if (cp->cdp_flags & CDP_SCHED_DTR) {
+ dev_unlock();
+ return (0);
+ }
+ dev_refl(dev);
+ cp->cdp_flags |= CDP_SCHED_DTR;
+ cp->cdp_dtr_cb = cb;
+ cp->cdp_dtr_cb_arg = arg;
+ TAILQ_INSERT_TAIL(&dev_ddtr, cp, cdp_dtr_list);
+ dev_unlock();
+ taskqueue_enqueue(taskqueue_swi_giant, &dev_dtr_task);
+ return (1);
+}
+
+int
+destroy_dev_sched_cb(struct cdev *dev, void (*cb)(void *), void *arg)
+{
+
+ dev_lock();
+ return (destroy_dev_sched_cbl(dev, cb, arg));
+}
+
+int
+destroy_dev_sched(struct cdev *dev)
+{
+
+ return (destroy_dev_sched_cb(dev, NULL, NULL));
+}
+
+void
+destroy_dev_drain(struct cdevsw *csw)
+{
+
+ dev_lock();
+ while (!LIST_EMPTY(&csw->d_devs)) {
+ msleep(&csw->d_devs, &devmtx, PRIBIO, "devscd", hz/10);
+ }
+ dev_unlock();
+}
+
+void
+drain_dev_clone_events(void)
+{
+
+ sx_xlock(&clone_drain_lock);
+ sx_xunlock(&clone_drain_lock);
+}
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <sys/kernel.h>
+
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(cdev, db_show_cdev)
+{
+ struct cdev_priv *cdp;
+ struct cdev *dev;
+ u_int flags;
+ char buf[512];
+
+ if (!have_addr) {
+ TAILQ_FOREACH(cdp, &cdevp_list, cdp_list) {
+ dev = &cdp->cdp_c;
+ db_printf("%s %p\n", dev->si_name, dev);
+ if (db_pager_quit)
+ break;
+ }
+ return;
+ }
+
+ dev = (struct cdev *)addr;
+ cdp = cdev2priv(dev);
+ db_printf("dev %s ref %d use %ld thr %ld inuse %u fdpriv %p\n",
+ dev->si_name, dev->si_refcount, dev->si_usecount,
+ dev->si_threadcount, cdp->cdp_inuse, cdp->cdp_fdpriv.lh_first);
+ db_printf("devsw %p si_drv0 %d si_drv1 %p si_drv2 %p\n",
+ dev->si_devsw, dev->si_drv0, dev->si_drv1, dev->si_drv2);
+ flags = dev->si_flags;
+#define SI_FLAG(flag) do { \
+ if (flags & (flag)) { \
+ if (buf[0] != '\0') \
+ strlcat(buf, ", ", sizeof(buf)); \
+ strlcat(buf, (#flag) + 3, sizeof(buf)); \
+ flags &= ~(flag); \
+ } \
+} while (0)
+ buf[0] = '\0';
+ SI_FLAG(SI_ETERNAL);
+ SI_FLAG(SI_ALIAS);
+ SI_FLAG(SI_NAMED);
+ SI_FLAG(SI_CHEAPCLONE);
+ SI_FLAG(SI_CHILD);
+ SI_FLAG(SI_DUMPDEV);
+ SI_FLAG(SI_CLONELIST);
+ db_printf("si_flags %s\n", buf);
+
+ flags = cdp->cdp_flags;
+#define CDP_FLAG(flag) do { \
+ if (flags & (flag)) { \
+ if (buf[0] != '\0') \
+ strlcat(buf, ", ", sizeof(buf)); \
+ strlcat(buf, (#flag) + 4, sizeof(buf)); \
+ flags &= ~(flag); \
+ } \
+} while (0)
+ buf[0] = '\0';
+ CDP_FLAG(CDP_ACTIVE);
+ CDP_FLAG(CDP_SCHED_DTR);
+ db_printf("cdp_flags %s\n", buf);
+}
+#endif
diff --git a/sys/kern/kern_cons.c b/sys/kern/kern_cons.c
new file mode 100644
index 0000000..d17846a
--- /dev/null
+++ b/sys/kern/kern_cons.c
@@ -0,0 +1,643 @@
+/*-
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1991 The Regents of the University of California.
+ * Copyright (c) 1999 Michael Smith
+ * Copyright (c) 2005 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ *
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)cons.c 7.2 (Berkeley) 5/9/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/conf.h>
+#include <sys/cons.h>
+#include <sys/fcntl.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/msgbuf.h>
+#include <sys/namei.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/reboot.h>
+#include <sys/sysctl.h>
+#include <sys/sbuf.h>
+#include <sys/tty.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+
+#include <ddb/ddb.h>
+
+#include <machine/cpu.h>
+#include <machine/clock.h>
+
+static MALLOC_DEFINE(M_TTYCONS, "tty console", "tty console handling");
+
+struct cn_device {
+ STAILQ_ENTRY(cn_device) cnd_next;
+ struct consdev *cnd_cn;
+};
+
+#define CNDEVPATHMAX 32
+#define CNDEVTAB_SIZE 4
+static struct cn_device cn_devtab[CNDEVTAB_SIZE];
+static STAILQ_HEAD(, cn_device) cn_devlist =
+ STAILQ_HEAD_INITIALIZER(cn_devlist);
+
+int cons_avail_mask = 0; /* Bit mask. Each registered low-level console
+ * that is currently unavailable for input
+ * (i.e., if it is in graphics mode) will have
+ * this bit cleared.
+ */
+static int cn_mute;
+static char *consbuf; /* buffer used by `consmsgbuf' */
+static struct callout conscallout; /* callout for outputting to constty */
+struct msgbuf consmsgbuf; /* message buffer for console tty */
+static u_char console_pausing; /* pause after each line during probe */
+static char *console_pausestr=
+"<pause; press any key to proceed to next line or '.' to end pause mode>";
+struct tty *constty; /* pointer to console "window" tty */
+static struct mtx cnputs_mtx; /* Mutex for cnputs(). */
+static int use_cnputs_mtx = 0; /* != 0 if cnputs_mtx locking reqd. */
+
+static void constty_timeout(void *arg);
+
+static struct consdev cons_consdev;
+DATA_SET(cons_set, cons_consdev);
+SET_DECLARE(cons_set, struct consdev);
+
+void
+cninit(void)
+{
+ struct consdev *best_cn, *cn, **list;
+
+ /*
+ * Check if we should mute the console (for security reasons, perhaps).
+ * It can be changed dynamically using the sysctl kern.consmute
+ * once we are up and going.
+ */
+ cn_mute = ((boothowto & (RB_MUTE
+ |RB_SINGLE
+ |RB_VERBOSE
+ |RB_ASKNAME)) == RB_MUTE);
+
+ /*
+ * Find the first console with the highest priority.
+ */
+ best_cn = NULL;
+ SET_FOREACH(list, cons_set) {
+ cn = *list;
+ cnremove(cn);
+ /* Skip cons_consdev. */
+ if (cn->cn_ops == NULL)
+ continue;
+ cn->cn_ops->cn_probe(cn);
+ if (cn->cn_pri == CN_DEAD)
+ continue;
+ if (best_cn == NULL || cn->cn_pri > best_cn->cn_pri)
+ best_cn = cn;
+ if (boothowto & RB_MULTIPLE) {
+ /*
+ * Initialize console, and attach to it.
+ */
+ cn->cn_ops->cn_init(cn);
+ cnadd(cn);
+ }
+ }
+ if (best_cn == NULL)
+ return;
+ if ((boothowto & RB_MULTIPLE) == 0) {
+ best_cn->cn_ops->cn_init(best_cn);
+ cnadd(best_cn);
+ }
+ if (boothowto & RB_PAUSE)
+ console_pausing = 1;
+ /*
+ * Make the best console the preferred console.
+ */
+ cnselect(best_cn);
+}
+
+void
+cninit_finish()
+{
+ console_pausing = 0;
+}
+
+/* add a new physical console to back the virtual console */
+int
+cnadd(struct consdev *cn)
+{
+ struct cn_device *cnd;
+ int i;
+
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next)
+ if (cnd->cnd_cn == cn)
+ return (0);
+ for (i = 0; i < CNDEVTAB_SIZE; i++) {
+ cnd = &cn_devtab[i];
+ if (cnd->cnd_cn == NULL)
+ break;
+ }
+ if (cnd->cnd_cn != NULL)
+ return (ENOMEM);
+ cnd->cnd_cn = cn;
+ if (cn->cn_name[0] == '\0') {
+ /* XXX: it is unclear if/where this print might output */
+ printf("WARNING: console at %p has no name\n", cn);
+ }
+ STAILQ_INSERT_TAIL(&cn_devlist, cnd, cnd_next);
+ if (STAILQ_FIRST(&cn_devlist) == cnd)
+ ttyconsdev_select(cnd->cnd_cn->cn_name);
+
+ /* Add device to the active mask. */
+ cnavailable(cn, (cn->cn_flags & CN_FLAG_NOAVAIL) == 0);
+
+ return (0);
+}
+
+void
+cnremove(struct consdev *cn)
+{
+ struct cn_device *cnd;
+ int i;
+
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ if (cnd->cnd_cn != cn)
+ continue;
+ if (STAILQ_FIRST(&cn_devlist) == cnd)
+ ttyconsdev_select(NULL);
+ STAILQ_REMOVE(&cn_devlist, cnd, cn_device, cnd_next);
+ cnd->cnd_cn = NULL;
+
+ /* Remove this device from available mask. */
+ for (i = 0; i < CNDEVTAB_SIZE; i++)
+ if (cnd == &cn_devtab[i]) {
+ cons_avail_mask &= ~(1 << i);
+ break;
+ }
+#if 0
+ /*
+ * XXX
+ * syscons gets really confused if console resources are
+ * freed after the system has initialized.
+ */
+ if (cn->cn_term != NULL)
+ cn->cn_ops->cn_term(cn);
+#endif
+ return;
+ }
+}
+
+void
+cnselect(struct consdev *cn)
+{
+ struct cn_device *cnd;
+
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ if (cnd->cnd_cn != cn)
+ continue;
+ if (cnd == STAILQ_FIRST(&cn_devlist))
+ return;
+ STAILQ_REMOVE(&cn_devlist, cnd, cn_device, cnd_next);
+ STAILQ_INSERT_HEAD(&cn_devlist, cnd, cnd_next);
+ ttyconsdev_select(cnd->cnd_cn->cn_name);
+ return;
+ }
+}
+
+void
+cnavailable(struct consdev *cn, int available)
+{
+ int i;
+
+ for (i = 0; i < CNDEVTAB_SIZE; i++) {
+ if (cn_devtab[i].cnd_cn == cn)
+ break;
+ }
+ if (available) {
+ if (i < CNDEVTAB_SIZE)
+ cons_avail_mask |= (1 << i);
+ cn->cn_flags &= ~CN_FLAG_NOAVAIL;
+ } else {
+ if (i < CNDEVTAB_SIZE)
+ cons_avail_mask &= ~(1 << i);
+ cn->cn_flags |= CN_FLAG_NOAVAIL;
+ }
+}
+
+int
+cnunavailable(void)
+{
+
+ return (cons_avail_mask == 0);
+}
+
+/*
+ * sysctl_kern_console() provides output parseable in conscontrol(1).
+ */
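+/*
+ * A hypothetical sample of the resulting string (console names are
+ * illustrative only): "ttyv0,/ttyv0,ttyu0," -- the active consoles come
+ * first, then a '/', then every registered console driver.  Writing a
+ * name adds and selects that console; a leading '-' removes it.
+ */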
+static int
+sysctl_kern_console(SYSCTL_HANDLER_ARGS)
+{
+ struct cn_device *cnd;
+ struct consdev *cp, **list;
+ char *p;
+ int delete, error;
+ struct sbuf *sb;
+
+ sb = sbuf_new(NULL, NULL, CNDEVPATHMAX * 2, SBUF_AUTOEXTEND);
+ if (sb == NULL)
+ return (ENOMEM);
+ sbuf_clear(sb);
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next)
+ sbuf_printf(sb, "%s,", cnd->cnd_cn->cn_name);
+ sbuf_printf(sb, "/");
+ SET_FOREACH(list, cons_set) {
+ cp = *list;
+ if (cp->cn_name[0] != '\0')
+ sbuf_printf(sb, "%s,", cp->cn_name);
+ }
+ sbuf_finish(sb);
+ error = sysctl_handle_string(oidp, sbuf_data(sb), sbuf_len(sb), req);
+ if (error == 0 && req->newptr != NULL) {
+ p = sbuf_data(sb);
+ error = ENXIO;
+ delete = 0;
+ if (*p == '-') {
+ delete = 1;
+ p++;
+ }
+ SET_FOREACH(list, cons_set) {
+ cp = *list;
+ if (strcmp(p, cp->cn_name) != 0)
+ continue;
+ if (delete) {
+ cnremove(cp);
+ error = 0;
+ } else {
+ error = cnadd(cp);
+ if (error == 0)
+ cnselect(cp);
+ }
+ break;
+ }
+ }
+ sbuf_delete(sb);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, console, CTLTYPE_STRING|CTLFLAG_RW,
+ 0, 0, sysctl_kern_console, "A", "Console device control");
+
+/*
+ * User has changed the state of the console muting.
+ * This may require us to open or close the device in question.
+ */
+static int
+sysctl_kern_consmute(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+
+ error = sysctl_handle_int(oidp, &cn_mute, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, consmute, CTLTYPE_INT|CTLFLAG_RW,
+ 0, sizeof(cn_mute), sysctl_kern_consmute, "I",
+ "State of the console muting");
+
+void
+cngrab()
+{
+ struct cn_device *cnd;
+ struct consdev *cn;
+
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ cn = cnd->cnd_cn;
+ if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG))
+ cn->cn_ops->cn_grab(cn);
+ }
+}
+
+void
+cnungrab()
+{
+ struct cn_device *cnd;
+ struct consdev *cn;
+
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ cn = cnd->cnd_cn;
+ if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG))
+ cn->cn_ops->cn_ungrab(cn);
+ }
+}
+
+/*
+ * Low level console routines.
+ */
+int
+cngetc(void)
+{
+ int c;
+
+ if (cn_mute)
+ return (-1);
+ while ((c = cncheckc()) == -1)
+ cpu_spinwait();
+ if (c == '\r')
+ c = '\n'; /* console input is always ICRNL */
+ return (c);
+}
+
+int
+cncheckc(void)
+{
+ struct cn_device *cnd;
+ struct consdev *cn;
+ int c;
+
+ if (cn_mute)
+ return (-1);
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ cn = cnd->cnd_cn;
+ if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG)) {
+ c = cn->cn_ops->cn_getc(cn);
+ if (c != -1)
+ return (c);
+ }
+ }
+ return (-1);
+}
+
+void
+cngets(char *cp, size_t size, int visible)
+{
+ char *lp, *end;
+ int c;
+
+ cngrab();
+
+ lp = cp;
+ end = cp + size - 1;
+ for (;;) {
+ c = cngetc() & 0177;
+ switch (c) {
+ case '\n':
+ case '\r':
+ cnputc(c);
+ *lp = '\0';
+ cnungrab();
+ return;
+ case '\b':
+ case '\177':
+ if (lp > cp) {
+ if (visible) {
+ cnputc(c);
+ cnputs(" \b");
+ }
+ lp--;
+ }
+ continue;
+ case '\0':
+ continue;
+ default:
+ if (lp < end) {
+ switch (visible) {
+ case GETS_NOECHO:
+ break;
+ case GETS_ECHOPASS:
+ cnputc('*');
+ break;
+ default:
+ cnputc(c);
+ break;
+ }
+ *lp++ = c;
+ }
+ }
+ }
+}
+
+void
+cnputc(int c)
+{
+ struct cn_device *cnd;
+ struct consdev *cn;
+ char *cp;
+
+ if (cn_mute || c == '\0')
+ return;
+ STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
+ cn = cnd->cnd_cn;
+ if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG)) {
+ if (c == '\n')
+ cn->cn_ops->cn_putc(cn, '\r');
+ cn->cn_ops->cn_putc(cn, c);
+ }
+ }
+ if (console_pausing && c == '\n' && !kdb_active) {
+ for (cp = console_pausestr; *cp != '\0'; cp++)
+ cnputc(*cp);
+ cngrab();
+ if (cngetc() == '.')
+ console_pausing = 0;
+ cnungrab();
+ cnputc('\r');
+ for (cp = console_pausestr; *cp != '\0'; cp++)
+ cnputc(' ');
+ cnputc('\r');
+ }
+}
+
+void
+cnputs(char *p)
+{
+ int c;
+ int unlock_reqd = 0;
+
+ if (use_cnputs_mtx) {
+ mtx_lock_spin(&cnputs_mtx);
+ unlock_reqd = 1;
+ }
+
+ while ((c = *p++) != '\0')
+ cnputc(c);
+
+ if (unlock_reqd)
+ mtx_unlock_spin(&cnputs_mtx);
+}
+
+static int consmsgbuf_size = 8192;
+SYSCTL_INT(_kern, OID_AUTO, consmsgbuf_size, CTLFLAG_RW, &consmsgbuf_size, 0,
+ "Console tty buffer size");
+
+/*
+ * Redirect console output to a tty.
+ */
+void
+constty_set(struct tty *tp)
+{
+ int size;
+
+ KASSERT(tp != NULL, ("constty_set: NULL tp"));
+ if (consbuf == NULL) {
+ size = consmsgbuf_size;
+ consbuf = malloc(size, M_TTYCONS, M_WAITOK);
+ msgbuf_init(&consmsgbuf, consbuf, size);
+ callout_init(&conscallout, 0);
+ }
+ constty = tp;
+ constty_timeout(NULL);
+}
+
+/*
+ * Disable console redirection to a tty.
+ */
+void
+constty_clear(void)
+{
+ int c;
+
+ constty = NULL;
+ if (consbuf == NULL)
+ return;
+ callout_stop(&conscallout);
+ while ((c = msgbuf_getchar(&consmsgbuf)) != -1)
+ cnputc(c);
+ free(consbuf, M_TTYCONS);
+ consbuf = NULL;
+}
+
+/* Times per second to check for pending console tty messages. */
+static int constty_wakeups_per_second = 5;
+SYSCTL_INT(_kern, OID_AUTO, constty_wakeups_per_second, CTLFLAG_RW,
+ &constty_wakeups_per_second, 0,
+ "Times per second to check for pending console tty messages");
+
+static void
+constty_timeout(void *arg)
+{
+ int c;
+
+ if (constty != NULL) {
+ tty_lock(constty);
+ while ((c = msgbuf_getchar(&consmsgbuf)) != -1) {
+ if (tty_putchar(constty, c) < 0) {
+ tty_unlock(constty);
+ constty = NULL;
+ break;
+ }
+ }
+
+ if (constty != NULL)
+ tty_unlock(constty);
+ }
+ if (constty != NULL) {
+ callout_reset(&conscallout, hz / constty_wakeups_per_second,
+ constty_timeout, NULL);
+ } else {
+ /* Deallocate the constty buffer memory. */
+ constty_clear();
+ }
+}
+
+static void
+cn_drvinit(void *unused)
+{
+
+ mtx_init(&cnputs_mtx, "cnputs_mtx", NULL, MTX_SPIN | MTX_NOWITNESS);
+ use_cnputs_mtx = 1;
+}
+
+SYSINIT(cndev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, cn_drvinit, NULL);
+
+/*
+ * Sysbeep(), if we have hardware for it
+ */
+
+#ifdef HAS_TIMER_SPKR
+
+static int beeping;
+
+static void
+sysbeepstop(void *chan)
+{
+
+ timer_spkr_release();
+ beeping = 0;
+}
+
+int
+sysbeep(int pitch, int period)
+{
+
+ if (timer_spkr_acquire()) {
+ if (!beeping) {
+ /* Something else owns it. */
+ return (EBUSY);
+ }
+ }
+ timer_spkr_setfreq(pitch);
+ if (!beeping) {
+ beeping = period;
+ timeout(sysbeepstop, (void *)NULL, period);
+ }
+ return (0);
+}
+
+#else
+
+/*
+ * No hardware, no sound
+ */
+
+int
+sysbeep(int pitch __unused, int period __unused)
+{
+
+ return (ENODEV);
+}
+
+#endif
+
diff --git a/sys/kern/kern_context.c b/sys/kern/kern_context.c
new file mode 100644
index 0000000..70751d0
--- /dev/null
+++ b/sys/kern/kern_context.c
@@ -0,0 +1,129 @@
+/*-
+ * Copyright (c) 2002 Daniel M. Eischen <deischen@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/signalvar.h>
+#include <sys/ucontext.h>
+
+/*
+ * The first two fields of a ucontext_t are the signal mask and the machine
+ * context. The next field is uc_link; we want to avoid destroying the link
+ * when copying out contexts.
+ */
+#define UC_COPY_SIZE offsetof(ucontext_t, uc_link)
+
+#ifndef _SYS_SYSPROTO_H_
+struct getcontext_args {
+ struct __ucontext *ucp;
+};
+struct setcontext_args {
+ const struct __ucontext *ucp;
+};
+struct swapcontext_args {
+ struct __ucontext *oucp;
+ const struct __ucontext *ucp;
+};
+#endif
+
+int
+sys_getcontext(struct thread *td, struct getcontext_args *uap)
+{
+ ucontext_t uc;
+ int ret;
+
+ if (uap->ucp == NULL)
+ ret = EINVAL;
+ else {
+ get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
+ PROC_LOCK(td->td_proc);
+ uc.uc_sigmask = td->td_sigmask;
+ PROC_UNLOCK(td->td_proc);
+ bzero(uc.__spare__, sizeof(uc.__spare__));
+ ret = copyout(&uc, uap->ucp, UC_COPY_SIZE);
+ }
+ return (ret);
+}
+
+int
+sys_setcontext(struct thread *td, struct setcontext_args *uap)
+{
+ ucontext_t uc;
+ int ret;
+
+ if (uap->ucp == NULL)
+ ret = EINVAL;
+ else {
+ ret = copyin(uap->ucp, &uc, UC_COPY_SIZE);
+ if (ret == 0) {
+ ret = set_mcontext(td, &uc.uc_mcontext);
+ if (ret == 0) {
+ kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask,
+ NULL, 0);
+ }
+ }
+ }
+ return (ret == 0 ? EJUSTRETURN : ret);
+}
+
+int
+sys_swapcontext(struct thread *td, struct swapcontext_args *uap)
+{
+ ucontext_t uc;
+ int ret;
+
+ if (uap->oucp == NULL || uap->ucp == NULL)
+ ret = EINVAL;
+ else {
+ get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
+ bzero(uc.__spare__, sizeof(uc.__spare__));
+ PROC_LOCK(td->td_proc);
+ uc.uc_sigmask = td->td_sigmask;
+ PROC_UNLOCK(td->td_proc);
+ ret = copyout(&uc, uap->oucp, UC_COPY_SIZE);
+ if (ret == 0) {
+ ret = copyin(uap->ucp, &uc, UC_COPY_SIZE);
+ if (ret == 0) {
+ ret = set_mcontext(td, &uc.uc_mcontext);
+ if (ret == 0) {
+ kern_sigprocmask(td, SIG_SETMASK,
+ &uc.uc_sigmask, NULL, 0);
+ }
+ }
+ }
+ }
+ return (ret == 0 ? EJUSTRETURN : ret);
+}
diff --git a/sys/kern/kern_cpu.c b/sys/kern/kern_cpu.c
new file mode 100644
index 0000000..6df4d3f
--- /dev/null
+++ b/sys/kern/kern_cpu.c
@@ -0,0 +1,1063 @@
+/*-
+ * Copyright (c) 2004-2007 Nate Lawson (SDG)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/cpu.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sbuf.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/timetc.h>
+#include <sys/taskqueue.h>
+
+#include "cpufreq_if.h"
+
+/*
+ * Common CPU frequency glue code. Drivers for specific hardware can
+ * attach this interface to allow users to get/set the CPU frequency.
+ */
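+/*
+ * Example usage (hypothetical values): the glue attaches its sysctls to the
+ * cpu device, so levels can be listed and set with e.g.
+ *	sysctl dev.cpu.0.freq_levels	(reports "freq/power" pairs)
+ *	sysctl dev.cpu.0.freq=1800	(request the 1800 MHz level)
+ * The handlers behind these OIDs are defined later in this file.
+ */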
+
+/*
+ * Number of levels we can handle. Levels are synthesized from settings
+ * so for M settings and N drivers, there may be M*N levels.
+ */
+#define CF_MAX_LEVELS 64
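+/*
+ * For example (illustrative numbers only): 4 drivers each exporting 16
+ * settings could synthesize up to 4 * 16 = 64 levels, exactly this cap.
+ */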
+
+struct cf_saved_freq {
+ struct cf_level level;
+ int priority;
+ SLIST_ENTRY(cf_saved_freq) link;
+};
+
+struct cpufreq_softc {
+ struct sx lock;
+ struct cf_level curr_level;
+ int curr_priority;
+ SLIST_HEAD(, cf_saved_freq) saved_freq;
+ struct cf_level_lst all_levels;
+ int all_count;
+ int max_mhz;
+ device_t dev;
+ struct sysctl_ctx_list sysctl_ctx;
+ struct task startup_task;
+ struct cf_level *levels_buf;
+};
+
+struct cf_setting_array {
+ struct cf_setting sets[MAX_SETTINGS];
+ int count;
+ TAILQ_ENTRY(cf_setting_array) link;
+};
+
+TAILQ_HEAD(cf_setting_lst, cf_setting_array);
+
+#define CF_MTX_INIT(x) sx_init((x), "cpufreq lock")
+#define CF_MTX_LOCK(x) sx_xlock((x))
+#define CF_MTX_UNLOCK(x) sx_xunlock((x))
+#define CF_MTX_ASSERT(x) sx_assert((x), SX_XLOCKED)
+
+#define CF_DEBUG(msg...) do { \
+ if (cf_verbose) \
+ printf("cpufreq: " msg); \
+ } while (0)
+
+static int cpufreq_attach(device_t dev);
+static void cpufreq_startup_task(void *ctx, int pending);
+static int cpufreq_detach(device_t dev);
+static int cf_set_method(device_t dev, const struct cf_level *level,
+ int priority);
+static int cf_get_method(device_t dev, struct cf_level *level);
+static int cf_levels_method(device_t dev, struct cf_level *levels,
+ int *count);
+static int cpufreq_insert_abs(struct cpufreq_softc *sc,
+ struct cf_setting *sets, int count);
+static int cpufreq_expand_set(struct cpufreq_softc *sc,
+ struct cf_setting_array *set_arr);
+static struct cf_level *cpufreq_dup_set(struct cpufreq_softc *sc,
+ struct cf_level *dup, struct cf_setting *set);
+static int cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS);
+static int cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS);
+static int cpufreq_settings_sysctl(SYSCTL_HANDLER_ARGS);
+
+static device_method_t cpufreq_methods[] = {
+ DEVMETHOD(device_probe, bus_generic_probe),
+ DEVMETHOD(device_attach, cpufreq_attach),
+ DEVMETHOD(device_detach, cpufreq_detach),
+
+ DEVMETHOD(cpufreq_set, cf_set_method),
+ DEVMETHOD(cpufreq_get, cf_get_method),
+ DEVMETHOD(cpufreq_levels, cf_levels_method),
+ {0, 0}
+};
+static driver_t cpufreq_driver = {
+ "cpufreq", cpufreq_methods, sizeof(struct cpufreq_softc)
+};
+static devclass_t cpufreq_dc;
+DRIVER_MODULE(cpufreq, cpu, cpufreq_driver, cpufreq_dc, 0, 0);
+
+static int cf_lowest_freq;
+static int cf_verbose;
+TUNABLE_INT("debug.cpufreq.lowest", &cf_lowest_freq);
+TUNABLE_INT("debug.cpufreq.verbose", &cf_verbose);
+static SYSCTL_NODE(_debug, OID_AUTO, cpufreq, CTLFLAG_RD, NULL,
+ "cpufreq debugging");
+SYSCTL_INT(_debug_cpufreq, OID_AUTO, lowest, CTLFLAG_RW, &cf_lowest_freq, 1,
+ "Don't provide levels below this frequency.");
+SYSCTL_INT(_debug_cpufreq, OID_AUTO, verbose, CTLFLAG_RW, &cf_verbose, 1,
+ "Print verbose debugging messages");
+
+static int
+cpufreq_attach(device_t dev)
+{
+ struct cpufreq_softc *sc;
+ struct pcpu *pc;
+ device_t parent;
+ uint64_t rate;
+ int numdevs;
+
+ CF_DEBUG("initializing %s\n", device_get_nameunit(dev));
+ sc = device_get_softc(dev);
+ parent = device_get_parent(dev);
+ sc->dev = dev;
+ sysctl_ctx_init(&sc->sysctl_ctx);
+ TAILQ_INIT(&sc->all_levels);
+ CF_MTX_INIT(&sc->lock);
+ sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
+ SLIST_INIT(&sc->saved_freq);
+ /* Try to get nominal CPU freq to use it as maximum later if needed */
+ sc->max_mhz = cpu_get_nominal_mhz(dev);
+ /* If that fails, try to measure the current rate */
+ if (sc->max_mhz <= 0) {
+ pc = cpu_get_pcpu(dev);
+ if (cpu_est_clockrate(pc->pc_cpuid, &rate) == 0)
+ sc->max_mhz = rate / 1000000;
+ else
+ sc->max_mhz = CPUFREQ_VAL_UNKNOWN;
+ }
+
+ /*
+ * Only initialize one set of sysctls for all CPUs. In the future,
+ * if multiple CPUs can have different settings, we can move these
+ * sysctls to be under every CPU instead of just the first one.
+ */
+ numdevs = devclass_get_count(cpufreq_dc);
+ if (numdevs > 1)
+ return (0);
+
+ CF_DEBUG("initializing one-time data for %s\n",
+ device_get_nameunit(dev));
+ sc->levels_buf = malloc(CF_MAX_LEVELS * sizeof(*sc->levels_buf),
+ M_DEVBUF, M_WAITOK);
+ SYSCTL_ADD_PROC(&sc->sysctl_ctx,
+ SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
+ OID_AUTO, "freq", CTLTYPE_INT | CTLFLAG_RW, sc, 0,
+ cpufreq_curr_sysctl, "I", "Current CPU frequency");
+ SYSCTL_ADD_PROC(&sc->sysctl_ctx,
+ SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
+ OID_AUTO, "freq_levels", CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
+ cpufreq_levels_sysctl, "A", "CPU frequency levels");
+
+ /*
+ * Queue a one-shot broadcast that levels have changed.
+ * It will run once the system has completed booting.
+ */
+ TASK_INIT(&sc->startup_task, 0, cpufreq_startup_task, dev);
+ taskqueue_enqueue(taskqueue_thread, &sc->startup_task);
+
+ return (0);
+}
+
+/* Handle any work to be done for all drivers that attached during boot. */
+static void
+cpufreq_startup_task(void *ctx, int pending)
+{
+
+ cpufreq_settings_changed((device_t)ctx);
+}
+
+static int
+cpufreq_detach(device_t dev)
+{
+ struct cpufreq_softc *sc;
+ struct cf_saved_freq *saved_freq;
+ int numdevs;
+
+ CF_DEBUG("shutdown %s\n", device_get_nameunit(dev));
+ sc = device_get_softc(dev);
+ sysctl_ctx_free(&sc->sysctl_ctx);
+
+ while ((saved_freq = SLIST_FIRST(&sc->saved_freq)) != NULL) {
+ SLIST_REMOVE_HEAD(&sc->saved_freq, link);
+ free(saved_freq, M_TEMP);
+ }
+
+ /* Only clean up these resources when the last device is detaching. */
+ numdevs = devclass_get_count(cpufreq_dc);
+ if (numdevs == 1) {
+ CF_DEBUG("final shutdown for %s\n", device_get_nameunit(dev));
+ free(sc->levels_buf, M_DEVBUF);
+ }
+
+ return (0);
+}
+
+static int
+cf_set_method(device_t dev, const struct cf_level *level, int priority)
+{
+ struct cpufreq_softc *sc;
+ const struct cf_setting *set;
+ struct cf_saved_freq *saved_freq, *curr_freq;
+ struct pcpu *pc;
+ int error, i;
+
+ sc = device_get_softc(dev);
+ error = 0;
+ set = NULL;
+ saved_freq = NULL;
+
+ /* We are going to change levels so notify the pre-change handler. */
+ EVENTHANDLER_INVOKE(cpufreq_pre_change, level, &error);
+ if (error != 0) {
+ EVENTHANDLER_INVOKE(cpufreq_post_change, level, error);
+ return (error);
+ }
+
+ CF_MTX_LOCK(&sc->lock);
+
+#ifdef SMP
+ /*
+ * If still booting and secondary CPUs not started yet, don't allow
+ * changing the frequency until they're online. This is because we
+ * can't switch to them using sched_bind() and thus we'd only be
+ * switching the main CPU. XXXTODO: Need to think more about how to
+ * handle having different CPUs at different frequencies.
+ */
+ if (mp_ncpus > 1 && !smp_active) {
+ device_printf(dev, "rejecting change, SMP not started yet\n");
+ error = ENXIO;
+ goto out;
+ }
+#endif /* SMP */
+
+ /*
+ * If the requested level has a lower priority, don't allow
+ * the new level right now.
+ */
+ if (priority < sc->curr_priority) {
+ CF_DEBUG("ignoring, requested prio %d less than curr %d\n", priority,
+ sc->curr_priority);
+ error = EPERM;
+ goto out;
+ }
+
+ /*
+ * If the caller didn't specify a level and one is saved, prepare to
+ * restore the saved level. If none has been saved, return an error.
+ */
+ if (level == NULL) {
+ saved_freq = SLIST_FIRST(&sc->saved_freq);
+ if (saved_freq == NULL) {
+ CF_DEBUG("NULL level, no saved level\n");
+ error = ENXIO;
+ goto out;
+ }
+ level = &saved_freq->level;
+ priority = saved_freq->priority;
+ CF_DEBUG("restoring saved level, freq %d prio %d\n",
+ level->total_set.freq, priority);
+ }
+
+ /* Reject levels that are below our specified threshold. */
+ if (level->total_set.freq < cf_lowest_freq) {
+ CF_DEBUG("rejecting freq %d, less than %d limit\n",
+ level->total_set.freq, cf_lowest_freq);
+ error = EINVAL;
+ goto out;
+ }
+
+ /* If already at this level, just return. */
+ if (sc->curr_level.total_set.freq == level->total_set.freq) {
+ CF_DEBUG("skipping freq %d, same as current level %d\n",
+ level->total_set.freq, sc->curr_level.total_set.freq);
+ goto skip;
+ }
+
+ /* First, set the absolute frequency via its driver. */
+ set = &level->abs_set;
+ if (set->dev) {
+ if (!device_is_attached(set->dev)) {
+ error = ENXIO;
+ goto out;
+ }
+
+ /* Bind to the target CPU before switching. */
+ pc = cpu_get_pcpu(set->dev);
+ thread_lock(curthread);
+ sched_bind(curthread, pc->pc_cpuid);
+ thread_unlock(curthread);
+ CF_DEBUG("setting abs freq %d on %s (cpu %d)\n", set->freq,
+ device_get_nameunit(set->dev), PCPU_GET(cpuid));
+ error = CPUFREQ_DRV_SET(set->dev, set);
+ thread_lock(curthread);
+ sched_unbind(curthread);
+ thread_unlock(curthread);
+ if (error) {
+ goto out;
+ }
+ }
+
+ /* Next, set any/all relative frequencies via their drivers. */
+ for (i = 0; i < level->rel_count; i++) {
+ set = &level->rel_set[i];
+ if (!device_is_attached(set->dev)) {
+ error = ENXIO;
+ goto out;
+ }
+
+ /* Bind to the target CPU before switching. */
+ pc = cpu_get_pcpu(set->dev);
+ thread_lock(curthread);
+ sched_bind(curthread, pc->pc_cpuid);
+ thread_unlock(curthread);
+ CF_DEBUG("setting rel freq %d on %s (cpu %d)\n", set->freq,
+ device_get_nameunit(set->dev), PCPU_GET(cpuid));
+ error = CPUFREQ_DRV_SET(set->dev, set);
+ thread_lock(curthread);
+ sched_unbind(curthread);
+ thread_unlock(curthread);
+ if (error) {
+ /* XXX Back out any successful setting? */
+ goto out;
+ }
+ }
+
+skip:
+ /*
+ * Before recording the current level, check if we're going to a
+ * higher priority. If so, save the previous level and priority.
+ */
+ if (sc->curr_level.total_set.freq != CPUFREQ_VAL_UNKNOWN &&
+ priority > sc->curr_priority) {
+ CF_DEBUG("saving level, freq %d prio %d\n",
+ sc->curr_level.total_set.freq, sc->curr_priority);
+ curr_freq = malloc(sizeof(*curr_freq), M_TEMP, M_NOWAIT);
+ if (curr_freq == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ curr_freq->level = sc->curr_level;
+ curr_freq->priority = sc->curr_priority;
+ SLIST_INSERT_HEAD(&sc->saved_freq, curr_freq, link);
+ }
+ sc->curr_level = *level;
+ sc->curr_priority = priority;
+
+ /* If we were restoring a saved state, reset it to "unused". */
+ if (saved_freq != NULL) {
+ CF_DEBUG("resetting saved level\n");
+ sc->curr_level.total_set.freq = CPUFREQ_VAL_UNKNOWN;
+ SLIST_REMOVE_HEAD(&sc->saved_freq, link);
+ free(saved_freq, M_TEMP);
+ }
+
+out:
+ CF_MTX_UNLOCK(&sc->lock);
+
+ /*
+ * We changed levels (or attempted to) so notify the post-change
+ * handler of new frequency or error.
+ */
+ EVENTHANDLER_INVOKE(cpufreq_post_change, level, error);
+ if (error && set)
+ device_printf(set->dev, "set freq failed, err %d\n", error);
+
+ return (error);
+}
+
+static int
+cf_get_method(device_t dev, struct cf_level *level)
+{
+ struct cpufreq_softc *sc;
+ struct cf_level *levels;
+ struct cf_setting *curr_set, set;
+ struct pcpu *pc;
+ device_t *devs;
+ int count, error, i, n, numdevs;
+ uint64_t rate;
+
+ sc = device_get_softc(dev);
+ error = 0;
+ levels = NULL;
+
+ /* If we already know the current frequency, we're done. */
+ CF_MTX_LOCK(&sc->lock);
+ curr_set = &sc->curr_level.total_set;
+ if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) {
+ CF_DEBUG("get returning known freq %d\n", curr_set->freq);
+ goto out;
+ }
+ CF_MTX_UNLOCK(&sc->lock);
+
+ /*
+ * We need to figure out the current level. Loop through every
+ * driver, getting the current setting. Then, attempt to get a best
+ * match of settings against each level.
+ */
+ count = CF_MAX_LEVELS;
+ levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT);
+ if (levels == NULL)
+ return (ENOMEM);
+ error = CPUFREQ_LEVELS(sc->dev, levels, &count);
+ if (error) {
+ if (error == E2BIG)
+ printf("cpufreq: need to increase CF_MAX_LEVELS\n");
+ free(levels, M_TEMP);
+ return (error);
+ }
+ error = device_get_children(device_get_parent(dev), &devs, &numdevs);
+ if (error) {
+ free(levels, M_TEMP);
+ return (error);
+ }
+
+ /*
+ * Reacquire the lock and search for the given level.
+ *
+ * XXX Note: this is not quite right since we really need to go
+ * through each level and compare both absolute and relative
+ * settings for each driver in the system before making a match.
+ * The estimation code below catches this case though.
+ */
+ CF_MTX_LOCK(&sc->lock);
+ for (n = 0; n < numdevs && curr_set->freq == CPUFREQ_VAL_UNKNOWN; n++) {
+ if (!device_is_attached(devs[n]))
+ continue;
+ if (CPUFREQ_DRV_GET(devs[n], &set) != 0)
+ continue;
+ for (i = 0; i < count; i++) {
+ if (set.freq == levels[i].total_set.freq) {
+ sc->curr_level = levels[i];
+ break;
+ }
+ }
+ }
+ free(devs, M_TEMP);
+ if (curr_set->freq != CPUFREQ_VAL_UNKNOWN) {
+ CF_DEBUG("get matched freq %d from drivers\n", curr_set->freq);
+ goto out;
+ }
+
+ /*
+ * We couldn't find an exact match, so attempt to estimate and then
+ * match against a level.
+ */
+ pc = cpu_get_pcpu(dev);
+ if (pc == NULL) {
+ error = ENXIO;
+ goto out;
+ }
+ cpu_est_clockrate(pc->pc_cpuid, &rate);
+ rate /= 1000000;
+ for (i = 0; i < count; i++) {
+ if (CPUFREQ_CMP(rate, levels[i].total_set.freq)) {
+ sc->curr_level = levels[i];
+ CF_DEBUG("get estimated freq %d\n", curr_set->freq);
+ goto out;
+ }
+ }
+ error = ENXIO;
+
+out:
+ if (error == 0)
+ *level = sc->curr_level;
+
+ CF_MTX_UNLOCK(&sc->lock);
+ if (levels)
+ free(levels, M_TEMP);
+ return (error);
+}
+
+static int
+cf_levels_method(device_t dev, struct cf_level *levels, int *count)
+{
+ struct cf_setting_array *set_arr;
+ struct cf_setting_lst rel_sets;
+ struct cpufreq_softc *sc;
+ struct cf_level *lev;
+ struct cf_setting *sets;
+ struct pcpu *pc;
+ device_t *devs;
+ int error, i, numdevs, set_count, type;
+ uint64_t rate;
+
+ if (levels == NULL || count == NULL)
+ return (EINVAL);
+
+ TAILQ_INIT(&rel_sets);
+ sc = device_get_softc(dev);
+ error = device_get_children(device_get_parent(dev), &devs, &numdevs);
+ if (error)
+ return (error);
+ sets = malloc(MAX_SETTINGS * sizeof(*sets), M_TEMP, M_NOWAIT);
+ if (sets == NULL) {
+ free(devs, M_TEMP);
+ return (ENOMEM);
+ }
+
+ /* Get settings from all cpufreq drivers. */
+ CF_MTX_LOCK(&sc->lock);
+ for (i = 0; i < numdevs; i++) {
+ /* Skip devices that aren't ready. */
+ if (!device_is_attached(devs[i]))
+ continue;
+
+ /*
+ * Get settings, skipping drivers that offer no settings or
+ * provide settings for informational purposes only.
+ */
+ error = CPUFREQ_DRV_TYPE(devs[i], &type);
+ if (error || (type & CPUFREQ_FLAG_INFO_ONLY)) {
+ if (error == 0) {
+ CF_DEBUG("skipping info-only driver %s\n",
+ device_get_nameunit(devs[i]));
+ }
+ continue;
+ }
+ set_count = MAX_SETTINGS;
+ error = CPUFREQ_DRV_SETTINGS(devs[i], sets, &set_count);
+ if (error || set_count == 0)
+ continue;
+
+ /* Add the settings to our absolute/relative lists. */
+ switch (type & CPUFREQ_TYPE_MASK) {
+ case CPUFREQ_TYPE_ABSOLUTE:
+ error = cpufreq_insert_abs(sc, sets, set_count);
+ break;
+ case CPUFREQ_TYPE_RELATIVE:
+ CF_DEBUG("adding %d relative settings\n", set_count);
+ set_arr = malloc(sizeof(*set_arr), M_TEMP, M_NOWAIT);
+ if (set_arr == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ bcopy(sets, set_arr->sets, set_count * sizeof(*sets));
+ set_arr->count = set_count;
+ TAILQ_INSERT_TAIL(&rel_sets, set_arr, link);
+ break;
+ default:
+ error = EINVAL;
+ }
+ if (error)
+ goto out;
+ }
+
+ /*
+ * If there are no absolute levels, create a fake one at 100%. We
+ * then cache the clockrate for later use as our base frequency.
+ */
+ if (TAILQ_EMPTY(&sc->all_levels)) {
+ if (sc->max_mhz == CPUFREQ_VAL_UNKNOWN) {
+ sc->max_mhz = cpu_get_nominal_mhz(dev);
+ /*
+ * If the CPU can't report a rate for 100%, hope
+ * the CPU is running at its nominal rate right now,
+ * and use that instead.
+ */
+ if (sc->max_mhz <= 0) {
+ pc = cpu_get_pcpu(dev);
+ cpu_est_clockrate(pc->pc_cpuid, &rate);
+ sc->max_mhz = rate / 1000000;
+ }
+ }
+ memset(&sets[0], CPUFREQ_VAL_UNKNOWN, sizeof(*sets));
+ sets[0].freq = sc->max_mhz;
+ sets[0].dev = NULL;
+ error = cpufreq_insert_abs(sc, sets, 1);
+ if (error)
+ goto out;
+ }
+
+ /* Create a combined list of absolute + relative levels. */
+ TAILQ_FOREACH(set_arr, &rel_sets, link)
+ cpufreq_expand_set(sc, set_arr);
+
+ /* If the caller doesn't have enough space, return the actual count. */
+ if (sc->all_count > *count) {
+ *count = sc->all_count;
+ error = E2BIG;
+ goto out;
+ }
+
+ /* Finally, output the list of levels. */
+ i = 0;
+ TAILQ_FOREACH(lev, &sc->all_levels, link) {
+
+ /* Skip levels whose frequency falls below the allowed minimum. */
+ if (lev->total_set.freq < cf_lowest_freq) {
+ sc->all_count--;
+ continue;
+ }
+
+ levels[i] = *lev;
+ i++;
+ }
+ *count = sc->all_count;
+ error = 0;
+
+out:
+ /* Clear all levels since we regenerate them each time. */
+ while ((lev = TAILQ_FIRST(&sc->all_levels)) != NULL) {
+ TAILQ_REMOVE(&sc->all_levels, lev, link);
+ free(lev, M_TEMP);
+ }
+ sc->all_count = 0;
+
+ CF_MTX_UNLOCK(&sc->lock);
+ while ((set_arr = TAILQ_FIRST(&rel_sets)) != NULL) {
+ TAILQ_REMOVE(&rel_sets, set_arr, link);
+ free(set_arr, M_TEMP);
+ }
+ free(devs, M_TEMP);
+ free(sets, M_TEMP);
+ return (error);
+}
+
+/*
+ * Create levels for an array of absolute settings and insert them in
+ * sorted order in the specified list.
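+ * The list is kept ordered from highest to lowest total frequency.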
+ */
+static int
+cpufreq_insert_abs(struct cpufreq_softc *sc, struct cf_setting *sets,
+ int count)
+{
+ struct cf_level_lst *list;
+ struct cf_level *level, *search;
+ int i;
+
+ CF_MTX_ASSERT(&sc->lock);
+
+ list = &sc->all_levels;
+ for (i = 0; i < count; i++) {
+ level = malloc(sizeof(*level), M_TEMP, M_NOWAIT | M_ZERO);
+ if (level == NULL)
+ return (ENOMEM);
+ level->abs_set = sets[i];
+ level->total_set = sets[i];
+ level->total_set.dev = NULL;
+ sc->all_count++;
+
+ if (TAILQ_EMPTY(list)) {
+ CF_DEBUG("adding abs setting %d at head\n",
+ sets[i].freq);
+ TAILQ_INSERT_HEAD(list, level, link);
+ continue;
+ }
+
+ TAILQ_FOREACH_REVERSE(search, list, cf_level_lst, link) {
+ if (sets[i].freq <= search->total_set.freq) {
+ CF_DEBUG("adding abs setting %d after %d\n",
+ sets[i].freq, search->total_set.freq);
+ TAILQ_INSERT_AFTER(list, search, level, link);
+ break;
+ }
+ }
+ }
+ return (0);
+}
+
+/*
+ * Expand a group of relative settings, creating derived levels from them.
+ */
+static int
+cpufreq_expand_set(struct cpufreq_softc *sc, struct cf_setting_array *set_arr)
+{
+ struct cf_level *fill, *search;
+ struct cf_setting *set;
+ int i;
+
+ CF_MTX_ASSERT(&sc->lock);
+
+ /*
+ * Walk the set of all existing levels in reverse. This is so we
+ * create derived states from the lowest absolute settings first
+ * and discard duplicates created from higher absolute settings.
+ * For instance, a level of 50 MHz derived from 100 MHz + 50% is
+ * preferable to 200 MHz + 25% because absolute settings are more
+ * efficient since they often change the voltage as well.
+ */
+ TAILQ_FOREACH_REVERSE(search, &sc->all_levels, cf_level_lst, link) {
+ /* Add each setting to the level, duplicating if necessary. */
+ for (i = 0; i < set_arr->count; i++) {
+ set = &set_arr->sets[i];
+
+ /*
+ * If this setting is less than 100%, split the level
+ * into two and add this setting to the new level.
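+ * (Relative settings are expressed in hundredths of a percent,
+ * so 10000 corresponds to 100%.)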
+ */
+ fill = search;
+ if (set->freq < 10000) {
+ fill = cpufreq_dup_set(sc, search, set);
+
+ /*
+ * The new level was a duplicate of an existing
+ * level or its absolute setting is too high
+ * so we freed it. For example, we discard a
+ * derived level of 1000 MHz/25% if a level
+ * of 500 MHz/100% already exists.
+ */
+ if (fill == NULL)
+ break;
+ }
+
+ /* Add this setting to the existing or new level. */
+ KASSERT(fill->rel_count < MAX_SETTINGS,
+ ("cpufreq: too many relative drivers (%d)",
+ MAX_SETTINGS));
+ fill->rel_set[fill->rel_count] = *set;
+ fill->rel_count++;
+ CF_DEBUG(
+ "expand set added rel setting %d%% to %d level\n",
+ set->freq / 100, fill->total_set.freq);
+ }
+ }
+
+ return (0);
+}
+
+static struct cf_level *
+cpufreq_dup_set(struct cpufreq_softc *sc, struct cf_level *dup,
+ struct cf_setting *set)
+{
+ struct cf_level_lst *list;
+ struct cf_level *fill, *itr;
+ struct cf_setting *fill_set, *itr_set;
+ int i;
+
+ CF_MTX_ASSERT(&sc->lock);
+
+ /*
+ * Create a new level, copy it from the old one, and update the
+ * total frequency and power by the percentage specified in the
+ * relative setting.
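+ * For example, duplicating a 2000 MHz level against a 50% relative
+ * setting (set->freq == 5000) yields a derived level of
+ * 2000 * 5000 / 10000 == 1000 MHz.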
+ */
+ fill = malloc(sizeof(*fill), M_TEMP, M_NOWAIT);
+ if (fill == NULL)
+ return (NULL);
+ *fill = *dup;
+ fill_set = &fill->total_set;
+ fill_set->freq =
+ ((uint64_t)fill_set->freq * set->freq) / 10000;
+ if (fill_set->power != CPUFREQ_VAL_UNKNOWN) {
+ fill_set->power = ((uint64_t)fill_set->power * set->freq)
+ / 10000;
+ }
+ if (set->lat != CPUFREQ_VAL_UNKNOWN) {
+ if (fill_set->lat != CPUFREQ_VAL_UNKNOWN)
+ fill_set->lat += set->lat;
+ else
+ fill_set->lat = set->lat;
+ }
+ CF_DEBUG("dup set considering derived setting %d\n", fill_set->freq);
+
+ /*
+ * If we copied an old level that we already modified (say, at 100%),
+ * we need to remove that setting before adding this one. Since we
+ * process each setting array in order, we know any settings for this
+ * driver will be found at the end.
+ */
+ for (i = fill->rel_count; i != 0; i--) {
+ if (fill->rel_set[i - 1].dev != set->dev)
+ break;
+ CF_DEBUG("removed last relative driver: %s\n",
+ device_get_nameunit(set->dev));
+ fill->rel_count--;
+ }
+
+ /*
+ * Insert the new level in sorted order. If it is a duplicate of an
+ * existing level (1) or has an absolute setting higher than the
+ * existing level (2), do not add it. We can do this since any such
+ * level is guaranteed to use less power. For example (1), a level with
+ * one absolute setting of 800 MHz uses less power than one composed
+ * of an absolute setting of 1600 MHz and a relative setting at 50%.
+ * Also for example (2), a level of 800 MHz/75% is preferable to
+ * 1600 MHz/25% even though the latter has a lower total frequency.
+ */
+ list = &sc->all_levels;
+ KASSERT(!TAILQ_EMPTY(list), ("all levels list empty in dup set"));
+ TAILQ_FOREACH_REVERSE(itr, list, cf_level_lst, link) {
+ itr_set = &itr->total_set;
+ if (CPUFREQ_CMP(fill_set->freq, itr_set->freq)) {
+ CF_DEBUG("dup set rejecting %d (dupe)\n",
+ fill_set->freq);
+ itr = NULL;
+ break;
+ } else if (fill_set->freq < itr_set->freq) {
+ if (fill->abs_set.freq <= itr->abs_set.freq) {
+ CF_DEBUG(
+ "dup done, inserting new level %d after %d\n",
+ fill_set->freq, itr_set->freq);
+ TAILQ_INSERT_AFTER(list, itr, fill, link);
+ sc->all_count++;
+ } else {
+ CF_DEBUG("dup set rejecting %d (abs too big)\n",
+ fill_set->freq);
+ itr = NULL;
+ }
+ break;
+ }
+ }
+
+ /* We didn't find a good place for this new level so free it. */
+ if (itr == NULL) {
+ CF_DEBUG("dup set freeing new level %d (not optimal)\n",
+ fill_set->freq);
+ free(fill, M_TEMP);
+ fill = NULL;
+ }
+
+ return (fill);
+}
+
+static int
+cpufreq_curr_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct cpufreq_softc *sc;
+ struct cf_level *levels;
+ int best, count, diff, bdiff, devcount, error, freq, i, n;
+ device_t *devs;
+
+ devs = NULL;
+ sc = oidp->oid_arg1;
+ levels = sc->levels_buf;
+
+ error = CPUFREQ_GET(sc->dev, &levels[0]);
+ if (error)
+ goto out;
+ freq = levels[0].total_set.freq;
+ error = sysctl_handle_int(oidp, &freq, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ goto out;
+
+ /*
+ * While we only call cpufreq_get() on one device (assuming all
+ * CPUs have equal levels), we call cpufreq_set() on all CPUs.
+ * This is needed for some MP systems.
+ */
+ error = devclass_get_devices(cpufreq_dc, &devs, &devcount);
+ if (error)
+ goto out;
+ for (n = 0; n < devcount; n++) {
+ count = CF_MAX_LEVELS;
+ error = CPUFREQ_LEVELS(devs[n], levels, &count);
+ if (error) {
+ if (error == E2BIG)
+ printf(
+ "cpufreq: need to increase CF_MAX_LEVELS\n");
+ break;
+ }
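+ /* Pick the level whose total frequency is closest to the request. */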
+ best = 0;
+ bdiff = 1 << 30;
+ for (i = 0; i < count; i++) {
+ diff = abs(levels[i].total_set.freq - freq);
+ if (diff < bdiff) {
+ bdiff = diff;
+ best = i;
+ }
+ }
+ error = CPUFREQ_SET(devs[n], &levels[best], CPUFREQ_PRIO_USER);
+ }
+
+out:
+ if (devs)
+ free(devs, M_TEMP);
+ return (error);
+}
+
+static int
+cpufreq_levels_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct cpufreq_softc *sc;
+ struct cf_level *levels;
+ struct cf_setting *set;
+ struct sbuf sb;
+ int count, error, i;
+
+ sc = oidp->oid_arg1;
+ sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
+
+ /* Get settings from the device and generate the output string. */
+ count = CF_MAX_LEVELS;
+ levels = sc->levels_buf;
+ if (levels == NULL) {
+ sbuf_delete(&sb);
+ return (ENOMEM);
+ }
+ error = CPUFREQ_LEVELS(sc->dev, levels, &count);
+ if (error) {
+ if (error == E2BIG)
+ printf("cpufreq: need to increase CF_MAX_LEVELS\n");
+ goto out;
+ }
+ if (count) {
+ for (i = 0; i < count; i++) {
+ set = &levels[i].total_set;
+ sbuf_printf(&sb, "%d/%d ", set->freq, set->power);
+ }
+ } else
+ sbuf_cpy(&sb, "0");
+ sbuf_trim(&sb);
+ sbuf_finish(&sb);
+ error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
+
+out:
+ sbuf_delete(&sb);
+ return (error);
+}
+
+static int
+cpufreq_settings_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ device_t dev;
+ struct cf_setting *sets;
+ struct sbuf sb;
+ int error, i, set_count;
+
+ dev = oidp->oid_arg1;
+ sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
+
+ /* Get settings from the device and generate the output string. */
+ set_count = MAX_SETTINGS;
+ sets = malloc(set_count * sizeof(*sets), M_TEMP, M_NOWAIT);
+ if (sets == NULL) {
+ sbuf_delete(&sb);
+ return (ENOMEM);
+ }
+ error = CPUFREQ_DRV_SETTINGS(dev, sets, &set_count);
+ if (error)
+ goto out;
+ if (set_count) {
+ for (i = 0; i < set_count; i++)
+ sbuf_printf(&sb, "%d/%d ", sets[i].freq, sets[i].power);
+ } else
+ sbuf_cpy(&sb, "0");
+ sbuf_trim(&sb);
+ sbuf_finish(&sb);
+ error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
+
+out:
+ free(sets, M_TEMP);
+ sbuf_delete(&sb);
+ return (error);
+}
+
+int
+cpufreq_register(device_t dev)
+{
+ struct cpufreq_softc *sc;
+ device_t cf_dev, cpu_dev;
+
+ /* Add a sysctl to get each driver's settings separately. */
+ SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
+ SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
+ OID_AUTO, "freq_settings", CTLTYPE_STRING | CTLFLAG_RD, dev, 0,
+ cpufreq_settings_sysctl, "A", "CPU frequency driver settings");
+
+ /*
+ * Add only one cpufreq device to each CPU. Currently, all CPUs
+ * must offer the same levels and be switched at the same time.
+ */
+ cpu_dev = device_get_parent(dev);
+ if ((cf_dev = device_find_child(cpu_dev, "cpufreq", -1))) {
+ sc = device_get_softc(cf_dev);
+ sc->max_mhz = CPUFREQ_VAL_UNKNOWN;
+ return (0);
+ }
+
+ /* Add the child device and possibly sysctls. */
+ cf_dev = BUS_ADD_CHILD(cpu_dev, 0, "cpufreq", -1);
+ if (cf_dev == NULL)
+ return (ENOMEM);
+ device_quiet(cf_dev);
+
+ return (device_probe_and_attach(cf_dev));
+}
+
+int
+cpufreq_unregister(device_t dev)
+{
+ device_t cf_dev, *devs;
+ int cfcount, devcount, error, i, type;
+
+ /*
+ * If this is the last cpufreq child device, remove the control
+ * device as well. We identify cpufreq children by calling a method
+ * they support.
+ */
+ error = device_get_children(device_get_parent(dev), &devs, &devcount);
+ if (error)
+ return (error);
+ cf_dev = device_find_child(device_get_parent(dev), "cpufreq", -1);
+ if (cf_dev == NULL) {
+ device_printf(dev,
+ "warning: cpufreq_unregister called with no cpufreq device active\n");
+ return (0);
+ }
+ cfcount = 0;
+ for (i = 0; i < devcount; i++) {
+ if (!device_is_attached(devs[i]))
+ continue;
+ if (CPUFREQ_DRV_TYPE(devs[i], &type) == 0)
+ cfcount++;
+ }
+ if (cfcount <= 1)
+ device_delete_child(device_get_parent(cf_dev), cf_dev);
+ free(devs, M_TEMP);
+
+ return (0);
+}
+
+int
+cpufreq_settings_changed(device_t dev)
+{
+
+ EVENTHANDLER_INVOKE(cpufreq_levels_changed,
+ device_get_unit(device_get_parent(dev)));
+ return (0);
+}
diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c
new file mode 100644
index 0000000..42c95c9
--- /dev/null
+++ b/sys/kern/kern_cpuset.c
@@ -0,0 +1,1166 @@
+/*-
+ * Copyright (c) 2008, Jeffrey Roberson <jeff@freebsd.org>
+ * All rights reserved.
+ *
+ * Copyright (c) 2008 Nokia Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/refcount.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/syscallsubr.h>
+#include <sys/cpuset.h>
+#include <sys/sx.h>
+#include <sys/queue.h>
+#include <sys/libkern.h>
+#include <sys/limits.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+
+#include <vm/uma.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif /* DDB */
+
+/*
+ * cpusets provide a mechanism for creating and manipulating sets of
+ * processors for the purpose of constraining the scheduling of threads to
+ * specific processors.
+ *
+ * Each process belongs to an identified set; by default this is set 1. Each
+ * thread may further restrict the cpus it may run on to a subset of this
+ * named set. This creates an anonymous set which other threads and processes
+ * may not join by number.
+ *
+ * The named set is referred to herein as the 'base' set to avoid ambiguity.
+ * This set is usually a child of a 'root' set while the anonymous set may
+ * simply be referred to as a mask. In the syscall api these are referred to
+ * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
+ *
+ * Threads inherit their set from their creator whether it be anonymous or
+ * not. This means that anonymous sets are immutable because they may be
+ * shared. To modify an anonymous set a new set is created with the desired
+ * mask and the same parent as the existing anonymous set. This gives the
+ * illusion of each thread having a private mask.
+ *
+ * Via the syscall apis a user may ask to retrieve or modify the root, base,
+ * or mask that is discovered via a pid, tid, or setid. Modifying a set
+ * modifies all numbered and anonymous child sets to comply with the new mask.
+ * Modifying a pid or tid's mask applies only to that tid but must still
+ * exist within the assigned parent set.
+ *
+ * A thread may not be assigned to a group separate from other threads in
+ * the process. This is to remove ambiguity when the setid is queried with
+ * a pid argument. There is no other technical limitation.
+ *
+ * This somewhat complex arrangement is intended to make it easy for
+ * applications to query available processors and bind their threads to
+ * specific processors while also allowing administrators to dynamically
+ * reprovision by changing sets which apply to groups of processes.
+ *
+ * A simple application should not concern itself with sets at all; it
+ * should instead apply masks to its own threads via CPU_WHICH_TID and a
+ * -1 id meaning 'curthread'. It may query available cpus for that tid
+ * with a getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID,
+ * -1, ...).
+ */
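+
+/*
+ * An illustrative userland sketch (not part of this file; it assumes the
+ * cpuset_getaffinity()/cpuset_setaffinity() system calls that expose the
+ * interface described above). A thread wishing to pin itself to CPU 0
+ * might do something like:
+ *
+ *	#include <sys/param.h>
+ *	#include <sys/cpuset.h>
+ *
+ *	cpuset_t mask;
+ *
+ *	// cpus available to this process
+ *	cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
+ *	    sizeof(mask), &mask);
+ *	CPU_ZERO(&mask);
+ *	CPU_SET(0, &mask);
+ *	// install an anonymous mask for curthread
+ *	cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
+ *	    sizeof(mask), &mask);
+ */
+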
+static uma_zone_t cpuset_zone;
+static struct mtx cpuset_lock;
+static struct setlist cpuset_ids;
+static struct unrhdr *cpuset_unr;
+static struct cpuset *cpuset_zero;
+
+/* Return the size of cpuset_t at the kernel level */
+SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD,
+ 0, sizeof(cpuset_t), "sizeof(cpuset_t)");
+
+cpuset_t *cpuset_root;
+
+/*
+ * Acquire a reference to a cpuset, all pointers must be tracked with refs.
+ */
+struct cpuset *
+cpuset_ref(struct cpuset *set)
+{
+
+ refcount_acquire(&set->cs_ref);
+ return (set);
+}
+
+/*
+ * Walks up the tree from 'set' to find the root. Returns the root
+ * referenced.
+ */
+static struct cpuset *
+cpuset_refroot(struct cpuset *set)
+{
+
+ for (; set->cs_parent != NULL; set = set->cs_parent)
+ if (set->cs_flags & CPU_SET_ROOT)
+ break;
+ cpuset_ref(set);
+
+ return (set);
+}
+
+/*
+ * Find the first non-anonymous set starting from 'set'. Returns this set
+ * referenced. May return the passed in set with an extra ref if it is
+ * not anonymous.
+ */
+static struct cpuset *
+cpuset_refbase(struct cpuset *set)
+{
+
+ if (set->cs_id == CPUSET_INVALID)
+ set = set->cs_parent;
+ cpuset_ref(set);
+
+ return (set);
+}
+
+/*
+ * Release a reference in a context where it is safe to allocate.
+ */
+void
+cpuset_rel(struct cpuset *set)
+{
+ cpusetid_t id;
+
+ if (refcount_release(&set->cs_ref) == 0)
+ return;
+ mtx_lock_spin(&cpuset_lock);
+ LIST_REMOVE(set, cs_siblings);
+ id = set->cs_id;
+ if (id != CPUSET_INVALID)
+ LIST_REMOVE(set, cs_link);
+ mtx_unlock_spin(&cpuset_lock);
+ cpuset_rel(set->cs_parent);
+ uma_zfree(cpuset_zone, set);
+ if (id != CPUSET_INVALID)
+ free_unr(cpuset_unr, id);
+}
+
+/*
+ * Deferred release must be used when in a context that is not safe to
+ * allocate/free. This places any unreferenced sets on the list 'head'.
+ */
+static void
+cpuset_rel_defer(struct setlist *head, struct cpuset *set)
+{
+
+ if (refcount_release(&set->cs_ref) == 0)
+ return;
+ mtx_lock_spin(&cpuset_lock);
+ LIST_REMOVE(set, cs_siblings);
+ if (set->cs_id != CPUSET_INVALID)
+ LIST_REMOVE(set, cs_link);
+ LIST_INSERT_HEAD(head, set, cs_link);
+ mtx_unlock_spin(&cpuset_lock);
+}
+
+/*
+ * Complete a deferred release. Removes the set from the list provided to
+ * cpuset_rel_defer.
+ */
+static void
+cpuset_rel_complete(struct cpuset *set)
+{
+ LIST_REMOVE(set, cs_link);
+ cpuset_rel(set->cs_parent);
+ uma_zfree(cpuset_zone, set);
+}
+
+/*
+ * Find a set based on an id. Returns it with a ref.
+ */
+static struct cpuset *
+cpuset_lookup(cpusetid_t setid, struct thread *td)
+{
+ struct cpuset *set;
+
+ if (setid == CPUSET_INVALID)
+ return (NULL);
+ mtx_lock_spin(&cpuset_lock);
+ LIST_FOREACH(set, &cpuset_ids, cs_link)
+ if (set->cs_id == setid)
+ break;
+ if (set)
+ cpuset_ref(set);
+ mtx_unlock_spin(&cpuset_lock);
+
+ KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
+ if (set != NULL && jailed(td->td_ucred)) {
+ struct cpuset *jset, *tset;
+
+ jset = td->td_ucred->cr_prison->pr_cpuset;
+ for (tset = set; tset != NULL; tset = tset->cs_parent)
+ if (tset == jset)
+ break;
+ if (tset == NULL) {
+ cpuset_rel(set);
+ set = NULL;
+ }
+ }
+
+ return (set);
+}
+
+/*
+ * Create a set in the space provided in 'set' with the provided parameters.
+ * The set is returned with a single ref. May return EDEADLK if the set
+ * will have no valid cpu based on restrictions from the parent.
+ */
+static int
+_cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask,
+ cpusetid_t id)
+{
+
+ if (!CPU_OVERLAP(&parent->cs_mask, mask))
+ return (EDEADLK);
+ CPU_COPY(mask, &set->cs_mask);
+ LIST_INIT(&set->cs_children);
+ refcount_init(&set->cs_ref, 1);
+ set->cs_flags = 0;
+ mtx_lock_spin(&cpuset_lock);
+ CPU_AND(&set->cs_mask, &parent->cs_mask);
+ set->cs_id = id;
+ set->cs_parent = cpuset_ref(parent);
+ LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
+ if (set->cs_id != CPUSET_INVALID)
+ LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
+ mtx_unlock_spin(&cpuset_lock);
+
+ return (0);
+}
+
+/*
+ * Create a new non-anonymous set with the requested parent and mask. May
+ * return failures if the mask is invalid or a new number can not be
+ * allocated.
+ */
+static int
+cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
+{
+ struct cpuset *set;
+ cpusetid_t id;
+ int error;
+
+ id = alloc_unr(cpuset_unr);
+ if (id == -1)
+ return (ENFILE);
+ *setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
+ error = _cpuset_create(set, parent, mask, id);
+ if (error == 0)
+ return (0);
+ free_unr(cpuset_unr, id);
+ uma_zfree(cpuset_zone, set);
+
+ return (error);
+}
+
+/*
+ * Recursively check for errors that would occur from applying mask to
+ * the tree of sets starting at 'set'. Checks for sets that would become
+ * empty as well as RDONLY flags.
+ */
+static int
+cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask)
+{
+ struct cpuset *nset;
+ cpuset_t newmask;
+ int error;
+
+ mtx_assert(&cpuset_lock, MA_OWNED);
+ if (set->cs_flags & CPU_SET_RDONLY)
+ return (EPERM);
+ if (check_mask) {
+ if (!CPU_OVERLAP(&set->cs_mask, mask))
+ return (EDEADLK);
+ CPU_COPY(&set->cs_mask, &newmask);
+ CPU_AND(&newmask, mask);
+ } else
+ CPU_COPY(mask, &newmask);
+ error = 0;
+ LIST_FOREACH(nset, &set->cs_children, cs_siblings)
+ if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0)
+ break;
+ return (error);
+}
+
+/*
+ * Applies the mask 'mask' without checking for empty sets or permissions.
+ */
+static void
+cpuset_update(struct cpuset *set, cpuset_t *mask)
+{
+ struct cpuset *nset;
+
+ mtx_assert(&cpuset_lock, MA_OWNED);
+ CPU_AND(&set->cs_mask, mask);
+ LIST_FOREACH(nset, &set->cs_children, cs_siblings)
+ cpuset_update(nset, &set->cs_mask);
+
+ return;
+}
+
+/*
+ * Modify the set 'set' to use a copy of the mask provided. Apply this new
+ * mask to restrict all children in the tree. Checks for validity before
+ * applying the changes.
+ */
+static int
+cpuset_modify(struct cpuset *set, cpuset_t *mask)
+{
+ struct cpuset *root;
+ int error;
+
+ error = priv_check(curthread, PRIV_SCHED_CPUSET);
+ if (error)
+ return (error);
+ /*
+ * If we are called from within a jail, we do not allow modifying
+ * the jail's dedicated root cpuset, but we may still allow child
+ * sets to be changed.
+ */
+ if (jailed(curthread->td_ucred) &&
+ set->cs_flags & CPU_SET_ROOT)
+ return (EPERM);
+ /*
+ * Verify that we have access to this set of
+ * cpus.
+ */
+ root = set->cs_parent;
+ if (root && !CPU_SUBSET(&root->cs_mask, mask))
+ return (EINVAL);
+ mtx_lock_spin(&cpuset_lock);
+ error = cpuset_testupdate(set, mask, 0);
+ if (error)
+ goto out;
+ CPU_COPY(mask, &set->cs_mask);
+ cpuset_update(set, mask);
+out:
+ mtx_unlock_spin(&cpuset_lock);
+
+ return (error);
+}
+
+/*
+ * Resolve the 'which' parameter of several cpuset apis.
+ *
+ * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also
+ * checks for permission via p_cansched().
+ *
+ * For WHICH_SET returns a valid set with a new reference.
+ *
+ * -1 may be supplied for any argument to mean the current proc/thread or
+ * the base set of the current thread. May fail with ESRCH/EPERM.
+ */
+static int
+cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
+ struct cpuset **setp)
+{
+ struct cpuset *set;
+ struct thread *td;
+ struct proc *p;
+ int error;
+
+ *pp = p = NULL;
+ *tdp = td = NULL;
+ *setp = set = NULL;
+ switch (which) {
+ case CPU_WHICH_PID:
+ if (id == -1) {
+ PROC_LOCK(curproc);
+ p = curproc;
+ break;
+ }
+ if ((p = pfind(id)) == NULL)
+ return (ESRCH);
+ break;
+ case CPU_WHICH_TID:
+ if (id == -1) {
+ PROC_LOCK(curproc);
+ p = curproc;
+ td = curthread;
+ break;
+ }
+ td = tdfind(id, -1);
+ if (td == NULL)
+ return (ESRCH);
+ p = td->td_proc;
+ break;
+ case CPU_WHICH_CPUSET:
+ if (id == -1) {
+ thread_lock(curthread);
+ set = cpuset_refbase(curthread->td_cpuset);
+ thread_unlock(curthread);
+ } else
+ set = cpuset_lookup(id, curthread);
+ if (set) {
+ *setp = set;
+ return (0);
+ }
+ return (ESRCH);
+ case CPU_WHICH_JAIL:
+ {
+ /* Find `set' for prison with given id. */
+ struct prison *pr;
+
+ sx_slock(&allprison_lock);
+ pr = prison_find_child(curthread->td_ucred->cr_prison, id);
+ sx_sunlock(&allprison_lock);
+ if (pr == NULL)
+ return (ESRCH);
+ cpuset_ref(pr->pr_cpuset);
+ *setp = pr->pr_cpuset;
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ case CPU_WHICH_IRQ:
+ return (0);
+ default:
+ return (EINVAL);
+ }
+ error = p_cansched(curthread, p);
+ if (error) {
+ PROC_UNLOCK(p);
+ return (error);
+ }
+ if (td == NULL)
+ td = FIRST_THREAD_IN_PROC(p);
+ *pp = p;
+ *tdp = td;
+ return (0);
+}
+
+/*
+ * Create an anonymous set with the provided mask in the space provided by
+ * 'fset'. If the passed in set is anonymous we use its parent otherwise
+ * the new set is a child of 'set'.
+ */
+static int
+cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask)
+{
+ struct cpuset *parent;
+
+ if (set->cs_id == CPUSET_INVALID)
+ parent = set->cs_parent;
+ else
+ parent = set;
+ if (!CPU_SUBSET(&parent->cs_mask, mask))
+ return (EDEADLK);
+ return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
+}
+
+/*
+ * Handle two cases for replacing the base set or mask of an entire process.
+ *
+ * 1) Set is non-null and mask is null. This reparents all anonymous sets
+ * to the provided set and replaces all non-anonymous td_cpusets with the
+ * provided set.
+ * 2) Mask is non-null and set is null. This replaces or creates anonymous
+ * sets for every thread with the existing base as a parent.
+ *
+ * This is overly complicated because we can't allocate while holding a
+ * spinlock and spinlocks must be held while changing and examining thread
+ * state.
+ */
+static int
+cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
+{
+ struct setlist freelist;
+ struct setlist droplist;
+ struct cpuset *tdset;
+ struct cpuset *nset;
+ struct thread *td;
+ struct proc *p;
+ int threads;
+ int nfree;
+ int error;
+ /*
+ * The algorithm requires two passes due to locking considerations.
+ *
+ * 1) Lookup the process and acquire the locks in the required order.
+ * 2) If enough cpusets have not been allocated release the locks and
+ * allocate them. Loop.
+ */
+ LIST_INIT(&freelist);
+ LIST_INIT(&droplist);
+ nfree = 0;
+ for (;;) {
+ error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
+ if (error)
+ goto out;
+ if (nfree >= p->p_numthreads)
+ break;
+ threads = p->p_numthreads;
+ PROC_UNLOCK(p);
+ for (; nfree < threads; nfree++) {
+ nset = uma_zalloc(cpuset_zone, M_WAITOK);
+ LIST_INSERT_HEAD(&freelist, nset, cs_link);
+ }
+ }
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ /*
+ * Now that the appropriate locks are held and we have enough cpusets,
+ * make sure the operation will succeed before applying changes. The
+ * proc lock prevents td_cpuset from changing between calls.
+ */
+ error = 0;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ tdset = td->td_cpuset;
+ /*
+ * Verify that a new mask doesn't specify cpus outside of
+ * the set the thread is a member of.
+ */
+ if (mask) {
+ if (tdset->cs_id == CPUSET_INVALID)
+ tdset = tdset->cs_parent;
+ if (!CPU_SUBSET(&tdset->cs_mask, mask))
+ error = EDEADLK;
+ /*
+ * Verify that a new set won't leave an existing thread
+ * mask without a cpu to run on. It can, however, restrict
+ * the set.
+ */
+ } else if (tdset->cs_id == CPUSET_INVALID) {
+ if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
+ error = EDEADLK;
+ }
+ thread_unlock(td);
+ if (error)
+ goto unlock_out;
+ }
+ /*
+ * Replace each thread's cpuset while using deferred release. We
+ * must do this because the thread lock must be held while operating
+ * on the thread and this limits the type of operations allowed.
+ */
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ /*
+ * If we presently have an anonymous set or are applying a
+ * mask we must create an anonymous shadow set. That is
+ * either parented to our existing base or the supplied set.
+ *
+ * If we have a base set with no anonymous shadow we simply
+ * replace it outright.
+ */
+ tdset = td->td_cpuset;
+ if (tdset->cs_id == CPUSET_INVALID || mask) {
+ nset = LIST_FIRST(&freelist);
+ LIST_REMOVE(nset, cs_link);
+ if (mask)
+ error = cpuset_shadow(tdset, nset, mask);
+ else
+ error = _cpuset_create(nset, set,
+ &tdset->cs_mask, CPUSET_INVALID);
+ if (error) {
+ LIST_INSERT_HEAD(&freelist, nset, cs_link);
+ thread_unlock(td);
+ break;
+ }
+ } else
+ nset = cpuset_ref(set);
+ cpuset_rel_defer(&droplist, tdset);
+ td->td_cpuset = nset;
+ sched_affinity(td);
+ thread_unlock(td);
+ }
+unlock_out:
+ PROC_UNLOCK(p);
+out:
+ while ((nset = LIST_FIRST(&droplist)) != NULL)
+ cpuset_rel_complete(nset);
+ while ((nset = LIST_FIRST(&freelist)) != NULL) {
+ LIST_REMOVE(nset, cs_link);
+ uma_zfree(cpuset_zone, nset);
+ }
+ return (error);
+}
+
+/*
+ * Return a string representing a valid layout for a cpuset_t object.
+ * The caller must supply a buffer of at least CPUSETBUFSIZ bytes.
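+ * For example (an illustrative case assuming a cpuset_t made up of two
+ * 64-bit words), a set containing only CPUs 0-3 is rendered as "f,0".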
+ */
+char *
+cpusetobj_strprint(char *buf, const cpuset_t *set)
+{
+ char *tbuf;
+ size_t i, bytesp, bufsiz;
+
+ tbuf = buf;
+ bytesp = 0;
+ bufsiz = CPUSETBUFSIZ;
+
+ for (i = 0; i < (_NCPUWORDS - 1); i++) {
+ bytesp = snprintf(tbuf, bufsiz, "%lx,", set->__bits[i]);
+ bufsiz -= bytesp;
+ tbuf += bytesp;
+ }
+ snprintf(tbuf, bufsiz, "%lx", set->__bits[_NCPUWORDS - 1]);
+ return (buf);
+}
+
+/*
+ * Build a valid cpuset_t object from a string representation.
+ * The caller must supply a buffer of at least CPUSETBUFSIZ bytes.
+ */
+int
+cpusetobj_strscan(cpuset_t *set, const char *buf)
+{
+ u_int nwords;
+ int i, ret;
+
+ if (strlen(buf) > CPUSETBUFSIZ - 1)
+ return (-1);
+
+ /* Allow a shorter version of the mask to be passed when necessary. */
+ nwords = 1;
+ for (i = 0; buf[i] != '\0'; i++)
+ if (buf[i] == ',')
+ nwords++;
+ if (nwords > _NCPUWORDS)
+ return (-1);
+
+ CPU_ZERO(set);
+ for (i = 0; i < (nwords - 1); i++) {
+ ret = sscanf(buf, "%lx,", &set->__bits[i]);
+ if (ret == 0 || ret == -1)
+ return (-1);
+ buf = strstr(buf, ",");
+ if (buf == NULL)
+ return (-1);
+ buf++;
+ }
+ ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]);
+ if (ret == 0 || ret == -1)
+ return (-1);
+ return (0);
+}
+
+/*
+ * Apply an anonymous mask to a single thread.
+ */
+int
+cpuset_setthread(lwpid_t id, cpuset_t *mask)
+{
+ struct cpuset *nset;
+ struct cpuset *set;
+ struct thread *td;
+ struct proc *p;
+ int error;
+
+ nset = uma_zalloc(cpuset_zone, M_WAITOK);
+ error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
+ if (error)
+ goto out;
+ set = NULL;
+ thread_lock(td);
+ error = cpuset_shadow(td->td_cpuset, nset, mask);
+ if (error == 0) {
+ set = td->td_cpuset;
+ td->td_cpuset = nset;
+ sched_affinity(td);
+ nset = NULL;
+ }
+ thread_unlock(td);
+ PROC_UNLOCK(p);
+ if (set)
+ cpuset_rel(set);
+out:
+ if (nset)
+ uma_zfree(cpuset_zone, nset);
+ return (error);
+}
+
+/*
+ * Creates the cpuset for thread0. We make two sets:
+ *
+ * 0 - The root set which should represent all valid processors in the
+ * system. It is initially created with a mask of all processors
+ * because we don't know what processors are valid until cpuset_init()
+ * runs. This set is immutable.
+ * 1 - The default set which all processes are a member of until changed.
+ * This allows an administrator to move all threads off of given cpus to
+ * dedicate them to high priority tasks or save power etc.
+ */
+struct cpuset *
+cpuset_thread0(void)
+{
+ struct cpuset *set;
+ int error;
+
+ cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, 0);
+ mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
+ /*
+ * Create the root system set for the whole machine. Doesn't use
+ * cpuset_create() due to NULL parent.
+ */
+ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
+ CPU_FILL(&set->cs_mask);
+ LIST_INIT(&set->cs_children);
+ LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
+ set->cs_ref = 1;
+ set->cs_flags = CPU_SET_ROOT;
+ cpuset_zero = set;
+ cpuset_root = &set->cs_mask;
+ /*
+ * Now derive a default, modifiable set from that to give out.
+ */
+ set = uma_zalloc(cpuset_zone, M_WAITOK);
+ error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
+ KASSERT(error == 0, ("Error creating default set: %d\n", error));
+ /*
+ * Initialize the unit allocator. 0 and 1 are allocated above.
+ */
+ cpuset_unr = new_unrhdr(2, INT_MAX, NULL);
+
+ return (set);
+}
+
+/*
+ * Create a cpuset as cpuset_create() would, but additionally mark the
+ * new 'set' as a root set.
+ *
+ * We are not going to reparent the td to it. Use cpuset_setproc_update_set()
+ * for that.
+ *
+ * In case of no error, returns the set in *setp locked with a reference.
+ */
+int
+cpuset_create_root(struct prison *pr, struct cpuset **setp)
+{
+ struct cpuset *set;
+ int error;
+
+ KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
+ KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));
+
+ error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
+ if (error)
+ return (error);
+
+ KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data",
+ __func__, __LINE__));
+
+ /* Mark the set as root. */
+ set = *setp;
+ set->cs_flags |= CPU_SET_ROOT;
+
+ return (0);
+}
+
+int
+cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
+{
+ int error;
+
+ KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
+ KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
+
+ cpuset_ref(set);
+ error = cpuset_setproc(p->p_pid, set, NULL);
+ if (error)
+ return (error);
+ cpuset_rel(set);
+ return (0);
+}
+
+/*
+ * This is called once the final set of system cpus is known. Modifies
+ * the root set and all children and marks the root read-only.
+ */
+static void
+cpuset_init(void *arg)
+{
+ cpuset_t mask;
+
+ mask = all_cpus;
+ if (cpuset_modify(cpuset_zero, &mask))
+ panic("Can't set initial cpuset mask.\n");
+ cpuset_zero->cs_flags |= CPU_SET_RDONLY;
+}
+SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);
+
+#ifndef _SYS_SYSPROTO_H_
+struct cpuset_args {
+ cpusetid_t *setid;
+};
+#endif
+int
+sys_cpuset(struct thread *td, struct cpuset_args *uap)
+{
+ struct cpuset *root;
+ struct cpuset *set;
+ int error;
+
+ thread_lock(td);
+ root = cpuset_refroot(td->td_cpuset);
+ thread_unlock(td);
+ error = cpuset_create(&set, root, &root->cs_mask);
+ cpuset_rel(root);
+ if (error)
+ return (error);
+ error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
+ if (error == 0)
+ error = cpuset_setproc(-1, set, NULL);
+ cpuset_rel(set);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct cpuset_setid_args {
+ cpuwhich_t which;
+ id_t id;
+ cpusetid_t setid;
+};
+#endif
+int
+sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
+{
+ struct cpuset *set;
+ int error;
+
+ /*
+ * Presently we only support per-process sets.
+ */
+ if (uap->which != CPU_WHICH_PID)
+ return (EINVAL);
+ set = cpuset_lookup(uap->setid, td);
+ if (set == NULL)
+ return (ESRCH);
+ error = cpuset_setproc(uap->id, set, NULL);
+ cpuset_rel(set);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct cpuset_getid_args {
+ cpulevel_t level;
+ cpuwhich_t which;
+ id_t id;
+ cpusetid_t *setid;
+};
+#endif
+int
+sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
+{
+ struct cpuset *nset;
+ struct cpuset *set;
+ struct thread *ttd;
+ struct proc *p;
+ cpusetid_t id;
+ int error;
+
+ if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET)
+ return (EINVAL);
+ error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
+ if (error)
+ return (error);
+ switch (uap->which) {
+ case CPU_WHICH_TID:
+ case CPU_WHICH_PID:
+ thread_lock(ttd);
+ set = cpuset_refbase(ttd->td_cpuset);
+ thread_unlock(ttd);
+ PROC_UNLOCK(p);
+ break;
+ case CPU_WHICH_CPUSET:
+ case CPU_WHICH_JAIL:
+ break;
+ case CPU_WHICH_IRQ:
+ return (EINVAL);
+ }
+ switch (uap->level) {
+ case CPU_LEVEL_ROOT:
+ nset = cpuset_refroot(set);
+ cpuset_rel(set);
+ set = nset;
+ break;
+ case CPU_LEVEL_CPUSET:
+ break;
+ case CPU_LEVEL_WHICH:
+ break;
+ }
+ id = set->cs_id;
+ cpuset_rel(set);
+ if (error == 0)
+ error = copyout(&id, uap->setid, sizeof(id));
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct cpuset_getaffinity_args {
+ cpulevel_t level;
+ cpuwhich_t which;
+ id_t id;
+ size_t cpusetsize;
+ cpuset_t *mask;
+};
+#endif
+int
+sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
+{
+ struct thread *ttd;
+ struct cpuset *nset;
+ struct cpuset *set;
+ struct proc *p;
+ cpuset_t *mask;
+ int error;
+ size_t size;
+
+ if (uap->cpusetsize < sizeof(cpuset_t) ||
+ uap->cpusetsize > CPU_MAXSIZE / NBBY)
+ return (ERANGE);
+ size = uap->cpusetsize;
+ mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
+ error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
+ if (error)
+ goto out;
+ switch (uap->level) {
+ case CPU_LEVEL_ROOT:
+ case CPU_LEVEL_CPUSET:
+ switch (uap->which) {
+ case CPU_WHICH_TID:
+ case CPU_WHICH_PID:
+ thread_lock(ttd);
+ set = cpuset_ref(ttd->td_cpuset);
+ thread_unlock(ttd);
+ break;
+ case CPU_WHICH_CPUSET:
+ case CPU_WHICH_JAIL:
+ break;
+ case CPU_WHICH_IRQ:
+ error = EINVAL;
+ goto out;
+ }
+ if (uap->level == CPU_LEVEL_ROOT)
+ nset = cpuset_refroot(set);
+ else
+ nset = cpuset_refbase(set);
+ CPU_COPY(&nset->cs_mask, mask);
+ cpuset_rel(nset);
+ break;
+ case CPU_LEVEL_WHICH:
+ switch (uap->which) {
+ case CPU_WHICH_TID:
+ thread_lock(ttd);
+ CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
+ thread_unlock(ttd);
+ break;
+ case CPU_WHICH_PID:
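+ /* The process mask is the union of all its threads' masks. */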
+ FOREACH_THREAD_IN_PROC(p, ttd) {
+ thread_lock(ttd);
+ CPU_OR(mask, &ttd->td_cpuset->cs_mask);
+ thread_unlock(ttd);
+ }
+ break;
+ case CPU_WHICH_CPUSET:
+ case CPU_WHICH_JAIL:
+ CPU_COPY(&set->cs_mask, mask);
+ break;
+ case CPU_WHICH_IRQ:
+ error = intr_getaffinity(uap->id, mask);
+ break;
+ }
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ if (set)
+ cpuset_rel(set);
+ if (p)
+ PROC_UNLOCK(p);
+ if (error == 0)
+ error = copyout(mask, uap->mask, size);
+out:
+ free(mask, M_TEMP);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct cpuset_setaffinity_args {
+ cpulevel_t level;
+ cpuwhich_t which;
+ id_t id;
+ size_t cpusetsize;
+ const cpuset_t *mask;
+};
+#endif
+int
+sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
+{
+ struct cpuset *nset;
+ struct cpuset *set;
+ struct thread *ttd;
+ struct proc *p;
+ cpuset_t *mask;
+ int error;
+
+ if (uap->cpusetsize < sizeof(cpuset_t) ||
+ uap->cpusetsize > CPU_MAXSIZE / NBBY)
+ return (ERANGE);
+ mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
+ error = copyin(uap->mask, mask, uap->cpusetsize);
+ if (error)
+ goto out;
+ /*
+ * Verify that no high bits are set.
+ */
+ if (uap->cpusetsize > sizeof(cpuset_t)) {
+ char *end;
+ char *cp;
+
+ end = cp = (char *)&mask->__bits;
+ end += uap->cpusetsize;
+ cp += sizeof(cpuset_t);
+ while (cp != end)
+ if (*cp++ != 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ }
+ switch (uap->level) {
+ case CPU_LEVEL_ROOT:
+ case CPU_LEVEL_CPUSET:
+ error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
+ if (error)
+ break;
+ switch (uap->which) {
+ case CPU_WHICH_TID:
+ case CPU_WHICH_PID:
+ thread_lock(ttd);
+ set = cpuset_ref(ttd->td_cpuset);
+ thread_unlock(ttd);
+ PROC_UNLOCK(p);
+ break;
+ case CPU_WHICH_CPUSET:
+ case CPU_WHICH_JAIL:
+ break;
+ case CPU_WHICH_IRQ:
+ error = EINVAL;
+ goto out;
+ }
+ if (uap->level == CPU_LEVEL_ROOT)
+ nset = cpuset_refroot(set);
+ else
+ nset = cpuset_refbase(set);
+ error = cpuset_modify(nset, mask);
+ cpuset_rel(nset);
+ cpuset_rel(set);
+ break;
+ case CPU_LEVEL_WHICH:
+ switch (uap->which) {
+ case CPU_WHICH_TID:
+ error = cpuset_setthread(uap->id, mask);
+ break;
+ case CPU_WHICH_PID:
+ error = cpuset_setproc(uap->id, NULL, mask);
+ break;
+ case CPU_WHICH_CPUSET:
+ case CPU_WHICH_JAIL:
+ error = cpuset_which(uap->which, uap->id, &p,
+ &ttd, &set);
+ if (error == 0) {
+ error = cpuset_modify(set, mask);
+ cpuset_rel(set);
+ }
+ break;
+ case CPU_WHICH_IRQ:
+ error = intr_setaffinity(uap->id, mask);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+out:
+ free(mask, M_TEMP);
+ return (error);
+}
+
+#ifdef DDB
+void
+ddb_display_cpuset(const cpuset_t *set)
+{
+ int cpu, once;
+
+ for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+ if (CPU_ISSET(cpu, set)) {
+ if (once == 0) {
+ db_printf("%d", cpu);
+ once = 1;
+ } else
+ db_printf(",%d", cpu);
+ }
+ }
+ if (once == 0)
+ db_printf("<none>");
+}
+
+DB_SHOW_COMMAND(cpusets, db_show_cpusets)
+{
+ struct cpuset *set;
+
+ LIST_FOREACH(set, &cpuset_ids, cs_link) {
+ db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
+ set, set->cs_id, set->cs_ref, set->cs_flags,
+ (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
+ db_printf(" mask=");
+ ddb_display_cpuset(&set->cs_mask);
+ db_printf("\n");
+ if (db_pager_quit)
+ break;
+ }
+}
+#endif /* DDB */
diff --git a/sys/kern/kern_ctf.c b/sys/kern/kern_ctf.c
new file mode 100644
index 0000000..319414c
--- /dev/null
+++ b/sys/kern/kern_ctf.c
@@ -0,0 +1,340 @@
+/*-
+ * Copyright (c) 2008 John Birrell <jb@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Note this file is included by both link_elf.c and link_elf_obj.c.
+ *
+ * The CTF header structure definition can't be used here because it's
+ * (annoyingly) covered by the CDDL. We will just use a few bytes from
+ * it as an integer array where we 'know' what they mean.
+ */
+#define CTF_HDR_SIZE 36
+#define CTF_HDR_STRTAB_U32 7
+#define CTF_HDR_STRLEN_U32 8
+
+#ifdef DDB_CTF
+static void *
+z_alloc(void *nil, u_int items, u_int size)
+{
+ void *ptr;
+
+ ptr = malloc(items * size, M_TEMP, M_NOWAIT);
+ return ptr;
+}
+
+static void
+z_free(void *nil, void *ptr)
+{
+ free(ptr, M_TEMP);
+}
+
+#endif
+
+static int
+link_elf_ctf_get(linker_file_t lf, linker_ctf_t *lc)
+{
+#ifdef DDB_CTF
+ Elf_Ehdr *hdr = NULL;
+ Elf_Shdr *shdr = NULL;
+ caddr_t ctftab = NULL;
+ caddr_t raw = NULL;
+ caddr_t shstrtab = NULL;
+ elf_file_t ef = (elf_file_t) lf;
+ int flags;
+ int i;
+ int nbytes;
+ ssize_t resid;
+ size_t sz;
+ struct nameidata nd;
+ struct thread *td = curthread;
+ uint8_t ctf_hdr[CTF_HDR_SIZE];
+#endif
+ int error = 0;
+
+ if (lf == NULL || lc == NULL)
+ return (EINVAL);
+
+ /* Set the defaults for no CTF present. That's not a crime! */
+ bzero(lc, sizeof(*lc));
+
+#ifdef DDB_CTF
+ /*
+ * First check if we've tried to load CTF data previously and the
+ * CTF ELF section wasn't found. We flag that condition by setting
+ * ctfcnt to -1. See below.
+ */
+ if (ef->ctfcnt < 0)
+ return (EFTYPE);
+
+ /* Now check if we've already loaded the CTF data. */
+ if (ef->ctfcnt > 0) {
+ /* We only need to load once. */
+ lc->ctftab = ef->ctftab;
+ lc->ctfcnt = ef->ctfcnt;
+ lc->symtab = ef->ddbsymtab;
+ lc->strtab = ef->ddbstrtab;
+ lc->strcnt = ef->ddbstrcnt;
+ lc->nsym = ef->ddbsymcnt;
+ lc->ctfoffp = (uint32_t **) &ef->ctfoff;
+ lc->typoffp = (uint32_t **) &ef->typoff;
+ lc->typlenp = &ef->typlen;
+ return (0);
+ }
+
+ /*
+ * We need to try reading the CTF data. Flag no CTF data present
+ * by default and if we actually succeed in reading it, we'll
+ * update ctfcnt to the number of bytes read.
+ */
+ ef->ctfcnt = -1;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, lf->pathname, td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ /* Allocate memory for the ELF header. */
+ if ((hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK)) == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ /* Read the ELF header. */
+ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, hdr, sizeof(*hdr),
+ 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid,
+ td)) != 0)
+ goto out;
+
+ /* Sanity check. */
+ if (!IS_ELF(*hdr)) {
+ error = ENOEXEC;
+ goto out;
+ }
+
+ nbytes = hdr->e_shnum * hdr->e_shentsize;
+ if (nbytes == 0 || hdr->e_shoff == 0 ||
+ hdr->e_shentsize != sizeof(Elf_Shdr)) {
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /* Allocate memory for all the section headers */
+ if ((shdr = malloc(nbytes, M_LINKER, M_WAITOK)) == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ /* Read all the section headers */
+ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes,
+ hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td)) != 0)
+ goto out;
+
+ /*
+ * We need to search for the CTF section by name, so if the
+ * section names aren't present, then we can't locate the
+ * .SUNW_ctf section containing the CTF data.
+ */
+ if (hdr->e_shstrndx == 0 || shdr[hdr->e_shstrndx].sh_type != SHT_STRTAB) {
+ printf("%s(%d): module %s e_shstrndx is %d, sh_type is %d\n",
+ __func__, __LINE__, lf->pathname, hdr->e_shstrndx,
+ shdr[hdr->e_shstrndx].sh_type);
+ error = EFTYPE;
+ goto out;
+ }
+
+ /* Allocate memory to buffer the section header strings. */
+ if ((shstrtab = malloc(shdr[hdr->e_shstrndx].sh_size, M_LINKER,
+ M_WAITOK)) == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ /* Read the section header strings. */
+ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, shstrtab,
+ shdr[hdr->e_shstrndx].sh_size, shdr[hdr->e_shstrndx].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid,
+ td)) != 0)
+ goto out;
+
+ /* Search for the section containing the CTF data. */
+ for (i = 0; i < hdr->e_shnum; i++)
+ if (strcmp(".SUNW_ctf", shstrtab + shdr[i].sh_name) == 0)
+ break;
+
+ /* Check if the CTF section wasn't found. */
+ if (i >= hdr->e_shnum) {
+ printf("%s(%d): module %s has no .SUNW_ctf section\n",
+ __func__, __LINE__, lf->pathname);
+ error = EFTYPE;
+ goto out;
+ }
+
+ /* Read the CTF header. */
+ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, ctf_hdr, sizeof(ctf_hdr),
+ shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
+ NOCRED, &resid, td)) != 0)
+ goto out;
+
+ /* Check the CTF magic number. (XXX check for big endian!) */
+ if (ctf_hdr[0] != 0xf1 || ctf_hdr[1] != 0xcf) {
+ printf("%s(%d): module %s has invalid format\n",
+ __func__, __LINE__, lf->pathname);
+ error = EFTYPE;
+ goto out;
+ }
+
+ /* Check if version 2. */
+ if (ctf_hdr[2] != 2) {
+ printf("%s(%d): module %s CTF format version is %d "
+ "(2 expected)\n",
+ __func__, __LINE__, lf->pathname, ctf_hdr[2]);
+ error = EFTYPE;
+ goto out;
+ }
+
+ /* Check if the data is compressed. */
+ if ((ctf_hdr[3] & 0x1) != 0) {
+ uint32_t *u32 = (uint32_t *) ctf_hdr;
+
+ /*
+ * The last two fields in the CTF header are the offset
+ * from the end of the header to the start of the string
+ * data and the length of that string data. Use this
+ * information to determine the size of the decompressed
+ * CTF data buffer required.
+ */
+ sz = u32[CTF_HDR_STRTAB_U32] + u32[CTF_HDR_STRLEN_U32] +
+ sizeof(ctf_hdr);
+
+ /*
+ * Allocate memory for the compressed CTF data, including
+ * the header (which isn't compressed).
+ */
+ if ((raw = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK)) == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ } else {
+ /*
+ * The CTF data is not compressed, so the ELF section
+ * size is the same as the buffer size required.
+ */
+ sz = shdr[i].sh_size;
+ }
+
+ /*
+ * Allocate memory to buffer the CTF data in its decompressed
+ * form.
+ */
+ if ((ctftab = malloc(sz, M_LINKER, M_WAITOK)) == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Read the CTF data into the raw buffer if compressed, or
+ * directly into the CTF buffer otherwise.
+ */
+ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, raw == NULL ? ctftab : raw,
+ shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED,
+ td->td_ucred, NOCRED, &resid, td)) != 0)
+ goto out;
+
+ /* Check if decompression is required. */
+ if (raw != NULL) {
+ z_stream zs;
+ int ret;
+
+ /*
+ * The header isn't compressed, so copy that into the
+ * CTF buffer first.
+ */
+ bcopy(ctf_hdr, ctftab, sizeof(ctf_hdr));
+
+ /* Initialise the zlib structure. */
+ bzero(&zs, sizeof(zs));
+ zs.zalloc = z_alloc;
+ zs.zfree = z_free;
+
+ if (inflateInit(&zs) != Z_OK) {
+ error = EIO;
+ goto out;
+ }
+
+ zs.avail_in = shdr[i].sh_size - sizeof(ctf_hdr);
+ zs.next_in = ((uint8_t *) raw) + sizeof(ctf_hdr);
+ zs.avail_out = sz - sizeof(ctf_hdr);
+ zs.next_out = ((uint8_t *) ctftab) + sizeof(ctf_hdr);
+ if ((ret = inflate(&zs, Z_FINISH)) != Z_STREAM_END) {
+ printf("%s(%d): zlib inflate returned %d\n", __func__, __LINE__, ret);
+ error = EIO;
+ goto out;
+ }
+ }
+
+ /* Got the CTF data! */
+ ef->ctftab = ctftab;
+ ef->ctfcnt = shdr[i].sh_size;
+
+ /* We'll retain the memory allocated for the CTF data. */
+ ctftab = NULL;
+
+ /* Let the caller use the CTF data read. */
+ lc->ctftab = ef->ctftab;
+ lc->ctfcnt = ef->ctfcnt;
+ lc->symtab = ef->ddbsymtab;
+ lc->strtab = ef->ddbstrtab;
+ lc->strcnt = ef->ddbstrcnt;
+ lc->nsym = ef->ddbsymcnt;
+ lc->ctfoffp = (uint32_t **) &ef->ctfoff;
+ lc->typoffp = (uint32_t **) &ef->typoff;
+ lc->typlenp = &ef->typlen;
+
+out:
+ VOP_UNLOCK(nd.ni_vp, 0);
+ vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+
+ if (hdr != NULL)
+ free(hdr, M_LINKER);
+ if (shdr != NULL)
+ free(shdr, M_LINKER);
+ if (shstrtab != NULL)
+ free(shstrtab, M_LINKER);
+ if (ctftab != NULL)
+ free(ctftab, M_LINKER);
+ if (raw != NULL)
+ free(raw, M_LINKER);
+#else
+ error = EOPNOTSUPP;
+#endif
+
+ return (error);
+}
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
new file mode 100644
index 0000000..9e9010f
--- /dev/null
+++ b/sys/kern/kern_descrip.c
@@ -0,0 +1,4016 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_compat.h"
+#include "opt_ddb.h"
+#include "opt_ktrace.h"
+#include "opt_procdesc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <sys/capability.h>
+#include <sys/conf.h>
+#include <sys/domain.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/ksem.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/mqueue.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/selinfo.h>
+#include <sys/pipe.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/procdesc.h>
+#include <sys/protosw.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sbuf.h>
+#include <sys/signalvar.h>
+#include <sys/socketvar.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/tty.h>
+#include <sys/unistd.h>
+#include <sys/un.h>
+#include <sys/unpcb.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+
+#include <security/audit/audit.h>
+
+#include <vm/uma.h>
+#include <vm/vm.h>
+
+#include <ddb/ddb.h>
+
+static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
+static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
+ "file desc to leader structures");
+static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
+MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
+
+MALLOC_DECLARE(M_FADVISE);
+
+static uma_zone_t file_zone;
+
+void (*ksem_info)(struct ksem *ks, char *path, size_t size, uint32_t *value);
+
+static int closefp(struct filedesc *fdp, int fd, struct file *fp,
+ struct thread *td, int holdleaders);
+static int fd_first_free(struct filedesc *fdp, int low, int size);
+static int fd_last_used(struct filedesc *fdp, int size);
+static void fdgrowtable(struct filedesc *fdp, int nfd);
+static void fdunused(struct filedesc *fdp, int fd);
+static void fdused(struct filedesc *fdp, int fd);
+static int fill_pipe_info(struct pipe *pi, struct kinfo_file *kif);
+static int fill_procdesc_info(struct procdesc *pdp,
+ struct kinfo_file *kif);
+static int fill_pts_info(struct tty *tp, struct kinfo_file *kif);
+static int fill_sem_info(struct file *fp, struct kinfo_file *kif);
+static int fill_shm_info(struct file *fp, struct kinfo_file *kif);
+static int fill_socket_info(struct socket *so, struct kinfo_file *kif);
+static int fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
+
+/*
+ * Each process has:
+ *
+ * - An array of open file descriptors (fd_ofiles)
+ * - An array of file flags (fd_ofileflags)
+ * - A bitmap recording which descriptors are in use (fd_map)
+ *
+ * A process starts out with NDFILE descriptors. The value of NDFILE has
+ * been selected based on the historical limit of 20 open files, and an
+ * assumption that the majority of processes, especially short-lived
+ * processes like shells, will never need more.
+ *
+ * If this initial allocation is exhausted, a larger descriptor table and
+ * map are allocated dynamically, and the pointers in the process's struct
+ * filedesc are updated to point to those. This is repeated every time
+ * the process runs out of file descriptors (provided it hasn't hit its
+ * resource limit).
+ *
+ * Since threads may hold references to individual descriptor table
+ * entries, the tables are never freed. Instead, they are placed on a
+ * linked list and freed only when the struct filedesc is released.
+ */
+#define NDFILE 20
+#define NDSLOTSIZE sizeof(NDSLOTTYPE)
+#define NDENTRIES (NDSLOTSIZE * __CHAR_BIT)
+#define NDSLOT(x) ((x) / NDENTRIES)
+#define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES))
+#define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES)
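+
+/*
+ * A minimal userland sketch of the arithmetic behind NDSLOT()/NDBIT(),
+ * assuming NDSLOTTYPE is unsigned long; the X-prefixed names are
+ * illustrative stand-ins rather than kernel symbols, and the block is
+ * kept under #if 0 so it is never compiled here.
+ */
+#if 0
+#include <limits.h>
+#include <stdio.h>
+
+typedef unsigned long xslot_t;			/* stand-in for NDSLOTTYPE */
+#define	XNDENTRIES	(sizeof(xslot_t) * CHAR_BIT)
+#define	XNDSLOT(x)	((x) / XNDENTRIES)
+#define	XNDBIT(x)	((xslot_t)1 << ((x) % XNDENTRIES))
+
+int
+main(void)
+{
+	int fd = 70;
+
+	/* On LP64 this prints word 1, mask 0x40 (bit 6 of that word). */
+	printf("fd %d -> word %zu, mask %#lx\n",
+	    fd, XNDSLOT(fd), XNDBIT(fd));
+	return (0);
+}
+#endif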
+
+/*
+ * SLIST entry used to keep track of ofiles which must be reclaimed when
+ * the process exits.
+ */
+struct freetable {
+ struct filedescent *ft_table;
+ SLIST_ENTRY(freetable) ft_next;
+};
+
+/*
+ * Initial allocation: a filedesc structure + the head of SLIST used to
+ * keep track of old ofiles + enough space for NDFILE descriptors.
+ */
+struct filedesc0 {
+ struct filedesc fd_fd;
+ SLIST_HEAD(, freetable) fd_free;
+ struct filedescent fd_dfiles[NDFILE];
+ NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
+};
+
+/*
+ * Descriptor management.
+ */
+volatile int openfiles; /* actual number of open files */
+struct mtx sigio_lock; /* mtx to protect pointers to sigio */
+void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
+
+/* A mutex to protect the association between a proc and filedesc. */
+static struct mtx fdesc_mtx;
+
+/*
+ * If low >= size, just return low. Otherwise find the first zero bit in the
+ * given bitmap, starting at low and not exceeding size - 1. Return size if
+ * not found.
+ */
+static int
+fd_first_free(struct filedesc *fdp, int low, int size)
+{
+ NDSLOTTYPE *map = fdp->fd_map;
+ NDSLOTTYPE mask;
+ int off, maxoff;
+
+ if (low >= size)
+ return (low);
+
+ off = NDSLOT(low);
+ if (low % NDENTRIES) {
+ mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
+ if ((mask &= ~map[off]) != 0UL)
+ return (off * NDENTRIES + ffsl(mask) - 1);
+ ++off;
+ }
+ for (maxoff = NDSLOTS(size); off < maxoff; ++off)
+ if (map[off] != ~0UL)
+ return (off * NDENTRIES + ffsl(~map[off]) - 1);
+ return (size);
+}
+
+/*
+ * Find the highest non-zero bit in the given bitmap, starting at 0 and
+ * not exceeding size - 1. Return -1 if not found.
+ */
+static int
+fd_last_used(struct filedesc *fdp, int size)
+{
+ NDSLOTTYPE *map = fdp->fd_map;
+ NDSLOTTYPE mask;
+ int off, minoff;
+
+ off = NDSLOT(size);
+ if (size % NDENTRIES) {
+ mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
+ if ((mask &= map[off]) != 0)
+ return (off * NDENTRIES + flsl(mask) - 1);
+ --off;
+ }
+ for (minoff = NDSLOT(0); off >= minoff; --off)
+ if (map[off] != 0)
+ return (off * NDENTRIES + flsl(map[off]) - 1);
+ return (-1);
+}
+
+static int
+fdisused(struct filedesc *fdp, int fd)
+{
+
+ FILEDESC_LOCK_ASSERT(fdp);
+
+ KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
+ ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
+
+ return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
+}
+
+/*
+ * Mark a file descriptor as used.
+ */
+static void
+fdused(struct filedesc *fdp, int fd)
+{
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
+
+ fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
+ if (fd > fdp->fd_lastfile)
+ fdp->fd_lastfile = fd;
+ if (fd == fdp->fd_freefile)
+ fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
+}
+
+/*
+ * Mark a file descriptor as unused.
+ */
+static void
+fdunused(struct filedesc *fdp, int fd)
+{
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
+ KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
+ ("fd=%d is still in use", fd));
+
+ fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
+ if (fd < fdp->fd_freefile)
+ fdp->fd_freefile = fd;
+ if (fd == fdp->fd_lastfile)
+ fdp->fd_lastfile = fd_last_used(fdp, fd);
+}
+
+/*
+ * Free a file descriptor.
+ */
+static inline void
+fdfree(struct filedesc *fdp, int fd)
+{
+ struct filedescent *fde;
+
+ fde = &fdp->fd_ofiles[fd];
+ filecaps_free(&fde->fde_caps);
+ bzero(fde, sizeof(*fde));
+ fdunused(fdp, fd);
+}
+
+/*
+ * System calls on descriptors.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getdtablesize_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
+{
+ struct proc *p = td->td_proc;
+ uint64_t lim;
+
+ PROC_LOCK(p);
+ td->td_retval[0] =
+ min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
+ lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
+ PROC_UNLOCK(p);
+ if (lim < td->td_retval[0])
+ td->td_retval[0] = lim;
+ return (0);
+}
+
+/*
+ * Duplicate a file descriptor to a particular value.
+ *
+ * Note: keep in mind that a potential race condition exists when closing
+ * descriptors from a shared descriptor table (via rfork).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct dup2_args {
+ u_int from;
+ u_int to;
+};
+#endif
+/* ARGSUSED */
+int
+sys_dup2(struct thread *td, struct dup2_args *uap)
+{
+
+ return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
+ td->td_retval));
+}
+
+/*
+ * Duplicate a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct dup_args {
+ u_int fd;
+};
+#endif
+/* ARGSUSED */
+int
+sys_dup(struct thread *td, struct dup_args *uap)
+{
+
+ return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval));
+}
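+
+/*
+ * A userland sketch of the dup()/dup2() semantics implemented above:
+ * redirecting stdout by duplicating onto descriptor 1.  The file name
+ * out.log is illustrative; the block is never compiled here.
+ */
+#if 0
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	int fd;
+
+	fd = open("out.log", O_WRONLY | O_CREAT | O_TRUNC, 0644);
+	if (fd == -1)
+		return (1);
+	if (dup2(fd, STDOUT_FILENO) == -1)	/* stdout now names out.log */
+		return (1);
+	close(fd);				/* the duplicate keeps it open */
+	printf("written through the duplicated descriptor\n");
+	return (0);
+}
+#endif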
+
+/*
+ * The file control system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fcntl_args {
+ int fd;
+ int cmd;
+ long arg;
+};
+#endif
+/* ARGSUSED */
+int
+sys_fcntl(struct thread *td, struct fcntl_args *uap)
+{
+ struct flock fl;
+ struct __oflock ofl;
+ intptr_t arg;
+ int error;
+ int cmd;
+
+ error = 0;
+ cmd = uap->cmd;
+ switch (uap->cmd) {
+ case F_OGETLK:
+ case F_OSETLK:
+ case F_OSETLKW:
+ /*
+ * Convert old flock structure to new.
+ */
+ error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl));
+ fl.l_start = ofl.l_start;
+ fl.l_len = ofl.l_len;
+ fl.l_pid = ofl.l_pid;
+ fl.l_type = ofl.l_type;
+ fl.l_whence = ofl.l_whence;
+ fl.l_sysid = 0;
+
+ switch (uap->cmd) {
+ case F_OGETLK:
+ cmd = F_GETLK;
+ break;
+ case F_OSETLK:
+ cmd = F_SETLK;
+ break;
+ case F_OSETLKW:
+ cmd = F_SETLKW;
+ break;
+ }
+ arg = (intptr_t)&fl;
+ break;
+ case F_GETLK:
+ case F_SETLK:
+ case F_SETLKW:
+ case F_SETLK_REMOTE:
+ error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
+ arg = (intptr_t)&fl;
+ break;
+ default:
+ arg = uap->arg;
+ break;
+ }
+ if (error)
+ return (error);
+ error = kern_fcntl(td, uap->fd, cmd, arg);
+ if (error)
+ return (error);
+ if (uap->cmd == F_OGETLK) {
+ ofl.l_start = fl.l_start;
+ ofl.l_len = fl.l_len;
+ ofl.l_pid = fl.l_pid;
+ ofl.l_type = fl.l_type;
+ ofl.l_whence = fl.l_whence;
+ error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl));
+ } else if (uap->cmd == F_GETLK) {
+ error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
+ }
+ return (error);
+}
+
+int
+kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
+{
+ struct filedesc *fdp;
+ struct flock *flp;
+ struct file *fp, *fp2;
+ struct filedescent *fde;
+ struct proc *p;
+ struct vnode *vp;
+ cap_rights_t rights;
+ int error, flg, tmp;
+ u_int old, new;
+ uint64_t bsize;
+ off_t foffset;
+
+ error = 0;
+ flg = F_POSIX;
+ p = td->td_proc;
+ fdp = p->p_fd;
+
+ switch (cmd) {
+ case F_DUPFD:
+ tmp = arg;
+ error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval);
+ break;
+
+ case F_DUPFD_CLOEXEC:
+ tmp = arg;
+ error = do_dup(td, DUP_FCNTL | DUP_CLOEXEC, fd, tmp,
+ td->td_retval);
+ break;
+
+ case F_DUP2FD:
+ tmp = arg;
+ error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval);
+ break;
+
+ case F_DUP2FD_CLOEXEC:
+ tmp = arg;
+ error = do_dup(td, DUP_FIXED | DUP_CLOEXEC, fd, tmp,
+ td->td_retval);
+ break;
+
+ case F_GETFD:
+ FILEDESC_SLOCK(fdp);
+ if ((fp = fget_locked(fdp, fd)) == NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ error = EBADF;
+ break;
+ }
+ fde = &fdp->fd_ofiles[fd];
+ td->td_retval[0] =
+ (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
+ FILEDESC_SUNLOCK(fdp);
+ break;
+
+ case F_SETFD:
+ FILEDESC_XLOCK(fdp);
+ if ((fp = fget_locked(fdp, fd)) == NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ error = EBADF;
+ break;
+ }
+ fde = &fdp->fd_ofiles[fd];
+ fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
+ (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
+ FILEDESC_XUNLOCK(fdp);
+ break;
+
+ case F_GETFL:
+ error = fget_unlocked(fdp, fd,
+ cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp, NULL);
+ if (error != 0)
+ break;
+ td->td_retval[0] = OFLAGS(fp->f_flag);
+ fdrop(fp, td);
+ break;
+
+ case F_SETFL:
+ error = fget_unlocked(fdp, fd,
+ cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp, NULL);
+ if (error != 0)
+ break;
+ do {
+ tmp = flg = fp->f_flag;
+ tmp &= ~FCNTLFLAGS;
+ tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
+ } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
+ tmp = fp->f_flag & FNONBLOCK;
+ error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
+ if (error != 0) {
+ fdrop(fp, td);
+ break;
+ }
+ tmp = fp->f_flag & FASYNC;
+ error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
+ if (error == 0) {
+ fdrop(fp, td);
+ break;
+ }
+ atomic_clear_int(&fp->f_flag, FNONBLOCK);
+ tmp = 0;
+ (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
+ fdrop(fp, td);
+ break;
+
+ case F_GETOWN:
+ error = fget_unlocked(fdp, fd,
+ cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp, NULL);
+ if (error != 0)
+ break;
+ error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
+ if (error == 0)
+ td->td_retval[0] = tmp;
+ fdrop(fp, td);
+ break;
+
+ case F_SETOWN:
+ error = fget_unlocked(fdp, fd,
+ cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp, NULL);
+ if (error != 0)
+ break;
+ tmp = arg;
+ error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
+ fdrop(fp, td);
+ break;
+
+ case F_SETLK_REMOTE:
+ error = priv_check(td, PRIV_NFS_LOCKD);
+ if (error)
+ return (error);
+ flg = F_REMOTE;
+ goto do_setlk;
+
+ case F_SETLKW:
+ flg |= F_WAIT;
+ /* FALLTHROUGH F_SETLK */
+
+ case F_SETLK:
+ do_setlk:
+ cap_rights_init(&rights, CAP_FLOCK);
+ error = fget_unlocked(fdp, fd, &rights, 0, &fp, NULL);
+ if (error != 0)
+ break;
+ if (fp->f_type != DTYPE_VNODE) {
+ error = EBADF;
+ fdrop(fp, td);
+ break;
+ }
+
+ flp = (struct flock *)arg;
+ if (flp->l_whence == SEEK_CUR) {
+ foffset = foffset_get(fp);
+ if (foffset < 0 ||
+ (flp->l_start > 0 &&
+ foffset > OFF_MAX - flp->l_start)) {
+ error = EOVERFLOW;
+ fdrop(fp, td);
+ break;
+ }
+ flp->l_start += foffset;
+ }
+
+ vp = fp->f_vnode;
+ switch (flp->l_type) {
+ case F_RDLCK:
+ if ((fp->f_flag & FREAD) == 0) {
+ error = EBADF;
+ break;
+ }
+ PROC_LOCK(p->p_leader);
+ p->p_leader->p_flag |= P_ADVLOCK;
+ PROC_UNLOCK(p->p_leader);
+ error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
+ flp, flg);
+ break;
+ case F_WRLCK:
+ if ((fp->f_flag & FWRITE) == 0) {
+ error = EBADF;
+ break;
+ }
+ PROC_LOCK(p->p_leader);
+ p->p_leader->p_flag |= P_ADVLOCK;
+ PROC_UNLOCK(p->p_leader);
+ error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
+ flp, flg);
+ break;
+ case F_UNLCK:
+ error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
+ flp, flg);
+ break;
+ case F_UNLCKSYS:
+ /*
+ * Temporary api for testing remote lock
+ * infrastructure.
+ */
+ if (flg != F_REMOTE) {
+ error = EINVAL;
+ break;
+ }
+ error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
+ F_UNLCKSYS, flp, flg);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ if (error != 0 || flp->l_type == F_UNLCK ||
+ flp->l_type == F_UNLCKSYS) {
+ fdrop(fp, td);
+ break;
+ }
+
+ /*
+ * Check for a race with close.
+ *
+ * The vnode is now advisory locked (or unlocked, but this case
+ * is not really important) as the caller requested.
+ * We had to drop the filedesc lock, so we need to recheck if
+ * the descriptor is still valid, because if it was closed
+ * in the meantime we need to remove advisory lock from the
+ * vnode - close on any descriptor leading to an advisory
+ * locked vnode, removes that lock.
+ * We will return 0 on purpose in that case, as the result of
+ * successful advisory lock might have been externally visible
+ * already. This is fine - effectively we pretend to the caller
+ * that the closing thread was a bit slower and that the
+ * advisory lock succeeded before the close.
+ */
+ error = fget_unlocked(fdp, fd, &rights, 0, &fp2, NULL);
+ if (error != 0) {
+ fdrop(fp, td);
+ break;
+ }
+ if (fp != fp2) {
+ flp->l_whence = SEEK_SET;
+ flp->l_start = 0;
+ flp->l_len = 0;
+ flp->l_type = F_UNLCK;
+ (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
+ F_UNLCK, flp, F_POSIX);
+ }
+ fdrop(fp, td);
+ fdrop(fp2, td);
+ break;
+
+ case F_GETLK:
+ error = fget_unlocked(fdp, fd,
+ cap_rights_init(&rights, CAP_FLOCK), 0, &fp, NULL);
+ if (error != 0)
+ break;
+ if (fp->f_type != DTYPE_VNODE) {
+ error = EBADF;
+ fdrop(fp, td);
+ break;
+ }
+ flp = (struct flock *)arg;
+ if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
+ flp->l_type != F_UNLCK) {
+ error = EINVAL;
+ fdrop(fp, td);
+ break;
+ }
+ if (flp->l_whence == SEEK_CUR) {
+ foffset = foffset_get(fp);
+ if ((flp->l_start > 0 &&
+ foffset > OFF_MAX - flp->l_start) ||
+ (flp->l_start < 0 &&
+ foffset < OFF_MIN - flp->l_start)) {
+ error = EOVERFLOW;
+ fdrop(fp, td);
+ break;
+ }
+ flp->l_start += foffset;
+ }
+ vp = fp->f_vnode;
+ error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
+ F_POSIX);
+ fdrop(fp, td);
+ break;
+
+ case F_RDAHEAD:
+		arg = arg ? 128 * 1024 : 0;
+ /* FALLTHROUGH */
+ case F_READAHEAD:
+ error = fget_unlocked(fdp, fd, NULL, 0, &fp, NULL);
+ if (error != 0)
+ break;
+ if (fp->f_type != DTYPE_VNODE) {
+ fdrop(fp, td);
+ error = EBADF;
+ break;
+ }
+ if (arg >= 0) {
+ vp = fp->f_vnode;
+ error = vn_lock(vp, LK_SHARED);
+ if (error != 0) {
+ fdrop(fp, td);
+ break;
+ }
+ bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
+ VOP_UNLOCK(vp, 0);
+ fp->f_seqcount = (arg + bsize - 1) / bsize;
+ do {
+ new = old = fp->f_flag;
+ new |= FRDAHEAD;
+ } while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
+ } else {
+ do {
+ new = old = fp->f_flag;
+ new &= ~FRDAHEAD;
+ } while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
+ }
+ fdrop(fp, td);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
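+
+/*
+ * A userland sketch of the F_GETFD/F_SETFD and F_GETFL/F_SETFL commands
+ * handled above: FD_CLOEXEC ends up in UF_EXCLOSE, the status flags in
+ * f_flag.  Error handling is omitted; the block is never compiled here.
+ */
+#if 0
+#include <fcntl.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	int fd, fdflags, stflags;
+
+	fd = open("/etc/passwd", O_RDONLY);
+	if (fd == -1)
+		return (1);
+	fdflags = fcntl(fd, F_GETFD);			/* per-descriptor flags */
+	fcntl(fd, F_SETFD, fdflags | FD_CLOEXEC);	/* close across execve() */
+	stflags = fcntl(fd, F_GETFL);			/* per-open-file status */
+	fcntl(fd, F_SETFL, stflags | O_NONBLOCK);	/* merged into f_flag */
+	close(fd);
+	return (0);
+}
+#endif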
+
+/*
+ * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
+ */
+int
+do_dup(struct thread *td, int flags, int old, int new,
+ register_t *retval)
+{
+ struct filedesc *fdp;
+ struct filedescent *oldfde, *newfde;
+ struct proc *p;
+ struct file *fp;
+ struct file *delfp;
+ int error, maxfd;
+
+ p = td->td_proc;
+ fdp = p->p_fd;
+
+ /*
+ * Verify we have a valid descriptor to dup from and possibly to
+ * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
+ * return EINVAL when the new descriptor is out of bounds.
+ */
+ if (old < 0)
+ return (EBADF);
+ if (new < 0)
+ return (flags & DUP_FCNTL ? EINVAL : EBADF);
+ PROC_LOCK(p);
+ maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
+ PROC_UNLOCK(p);
+ if (new >= maxfd)
+ return (flags & DUP_FCNTL ? EINVAL : EBADF);
+
+ FILEDESC_XLOCK(fdp);
+ if (fget_locked(fdp, old) == NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ return (EBADF);
+ }
+ oldfde = &fdp->fd_ofiles[old];
+ if (flags & DUP_FIXED && old == new) {
+ *retval = new;
+ if (flags & DUP_CLOEXEC)
+ fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
+ FILEDESC_XUNLOCK(fdp);
+ return (0);
+ }
+ fp = oldfde->fde_file;
+ fhold(fp);
+
+ /*
+ * If the caller specified a file descriptor, make sure the file
+ * table is large enough to hold it, and grab it. Otherwise, just
+ * allocate a new descriptor the usual way.
+ */
+ if (flags & DUP_FIXED) {
+ if (new >= fdp->fd_nfiles) {
+ /*
+ * The resource limits are here instead of e.g.
+ * fdalloc(), because the file descriptor table may be
+ * shared between processes, so we can't really use
+ * racct_add()/racct_sub(). Instead of counting the
+ * number of actually allocated descriptors, just put
+ * the limit on the size of the file descriptor table.
+ */
+#ifdef RACCT
+ PROC_LOCK(p);
+ error = racct_set(p, RACCT_NOFILE, new + 1);
+ PROC_UNLOCK(p);
+ if (error != 0) {
+ FILEDESC_XUNLOCK(fdp);
+ fdrop(fp, td);
+ return (EMFILE);
+ }
+#endif
+ fdgrowtable(fdp, new + 1);
+ oldfde = &fdp->fd_ofiles[old];
+ }
+ newfde = &fdp->fd_ofiles[new];
+ if (newfde->fde_file == NULL)
+ fdused(fdp, new);
+ } else {
+ if ((error = fdalloc(td, new, &new)) != 0) {
+ FILEDESC_XUNLOCK(fdp);
+ fdrop(fp, td);
+ return (error);
+ }
+ newfde = &fdp->fd_ofiles[new];
+ }
+
+ KASSERT(fp == oldfde->fde_file, ("old fd has been modified"));
+ KASSERT(old != new, ("new fd is same as old"));
+
+ delfp = newfde->fde_file;
+
+ /*
+ * Duplicate the source descriptor.
+ */
+ *newfde = *oldfde;
+ filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
+ if ((flags & DUP_CLOEXEC) != 0)
+ newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
+ else
+ newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
+ if (new > fdp->fd_lastfile)
+ fdp->fd_lastfile = new;
+ *retval = new;
+
+ if (delfp != NULL) {
+ (void) closefp(fdp, new, delfp, td, 1);
+ /* closefp() drops the FILEDESC lock for us. */
+ } else {
+ FILEDESC_XUNLOCK(fdp);
+ }
+
+ return (0);
+}
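+
+/*
+ * The DUP_FCNTL/DUP_FIXED distinction drawn above is visible from
+ * userland: F_DUPFD takes a minimum descriptor number and rejects an
+ * out-of-range value with EINVAL, while dup2() takes an exact target
+ * and reports EBADF.  A sketch; the block is never compiled here.
+ */
+#if 0
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	int fd, a, b;
+
+	fd = open("/dev/null", O_RDWR);
+	if (fd == -1)
+		return (1);
+	a = fcntl(fd, F_DUPFD, 10);	/* lowest free descriptor >= 10 */
+	b = dup2(fd, 20);		/* exactly 20, closed first if open */
+	printf("F_DUPFD gave %d, dup2 gave %d\n", a, b);
+	if (fcntl(fd, F_DUPFD, -1) == -1 && errno == EINVAL)
+		printf("negative minimum rejected with EINVAL\n");
+	if (dup2(fd, -1) == -1 && errno == EBADF)
+		printf("negative target rejected with EBADF\n");
+	return (0);
+}
+#endif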
+
+/*
+ * If sigio is on the list associated with a process or process group,
+ * disable signalling from the device, remove sigio from the list and
+ * free sigio.
+ */
+void
+funsetown(struct sigio **sigiop)
+{
+ struct sigio *sigio;
+
+ SIGIO_LOCK();
+ sigio = *sigiop;
+ if (sigio == NULL) {
+ SIGIO_UNLOCK();
+ return;
+ }
+ *(sigio->sio_myref) = NULL;
+ if ((sigio)->sio_pgid < 0) {
+ struct pgrp *pg = (sigio)->sio_pgrp;
+ PGRP_LOCK(pg);
+ SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
+ sigio, sio_pgsigio);
+ PGRP_UNLOCK(pg);
+ } else {
+ struct proc *p = (sigio)->sio_proc;
+ PROC_LOCK(p);
+ SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
+ sigio, sio_pgsigio);
+ PROC_UNLOCK(p);
+ }
+ SIGIO_UNLOCK();
+ crfree(sigio->sio_ucred);
+ free(sigio, M_SIGIO);
+}
+
+/*
+ * Free a list of sigio structures.
+ * We only need to lock the SIGIO_LOCK because we have made ourselves
+ * inaccessible to callers of fsetown and therefore do not need to lock
+ * the proc or pgrp struct for the list manipulation.
+ */
+void
+funsetownlst(struct sigiolst *sigiolst)
+{
+ struct proc *p;
+ struct pgrp *pg;
+ struct sigio *sigio;
+
+ sigio = SLIST_FIRST(sigiolst);
+ if (sigio == NULL)
+ return;
+ p = NULL;
+ pg = NULL;
+
+ /*
+ * Every entry of the list should belong
+ * to a single proc or pgrp.
+ */
+ if (sigio->sio_pgid < 0) {
+ pg = sigio->sio_pgrp;
+ PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
+ } else /* if (sigio->sio_pgid > 0) */ {
+ p = sigio->sio_proc;
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+ }
+
+ SIGIO_LOCK();
+ while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
+ *(sigio->sio_myref) = NULL;
+ if (pg != NULL) {
+ KASSERT(sigio->sio_pgid < 0,
+ ("Proc sigio in pgrp sigio list"));
+ KASSERT(sigio->sio_pgrp == pg,
+ ("Bogus pgrp in sigio list"));
+ PGRP_LOCK(pg);
+ SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
+ sio_pgsigio);
+ PGRP_UNLOCK(pg);
+ } else /* if (p != NULL) */ {
+ KASSERT(sigio->sio_pgid > 0,
+ ("Pgrp sigio in proc sigio list"));
+ KASSERT(sigio->sio_proc == p,
+ ("Bogus proc in sigio list"));
+ PROC_LOCK(p);
+ SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
+ sio_pgsigio);
+ PROC_UNLOCK(p);
+ }
+ SIGIO_UNLOCK();
+ crfree(sigio->sio_ucred);
+ free(sigio, M_SIGIO);
+ SIGIO_LOCK();
+ }
+ SIGIO_UNLOCK();
+}
+
+/*
+ * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
+ *
+ * After permission checking, add a sigio structure to the sigio list for
+ * the process or process group.
+ */
+int
+fsetown(pid_t pgid, struct sigio **sigiop)
+{
+ struct proc *proc;
+ struct pgrp *pgrp;
+ struct sigio *sigio;
+ int ret;
+
+ if (pgid == 0) {
+ funsetown(sigiop);
+ return (0);
+ }
+
+ ret = 0;
+
+ /* Allocate and fill in the new sigio out of locks. */
+ sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
+ sigio->sio_pgid = pgid;
+ sigio->sio_ucred = crhold(curthread->td_ucred);
+ sigio->sio_myref = sigiop;
+
+ sx_slock(&proctree_lock);
+ if (pgid > 0) {
+ proc = pfind(pgid);
+ if (proc == NULL) {
+ ret = ESRCH;
+ goto fail;
+ }
+
+ /*
+ * Policy - Don't allow a process to FSETOWN a process
+ * in another session.
+ *
+ * Remove this test to allow maximum flexibility or
+ * restrict FSETOWN to the current process or process
+ * group for maximum safety.
+ */
+ PROC_UNLOCK(proc);
+ if (proc->p_session != curthread->td_proc->p_session) {
+ ret = EPERM;
+ goto fail;
+ }
+
+ pgrp = NULL;
+ } else /* if (pgid < 0) */ {
+ pgrp = pgfind(-pgid);
+ if (pgrp == NULL) {
+ ret = ESRCH;
+ goto fail;
+ }
+ PGRP_UNLOCK(pgrp);
+
+ /*
+ * Policy - Don't allow a process to FSETOWN a process
+ * in another session.
+ *
+ * Remove this test to allow maximum flexibility or
+ * restrict FSETOWN to the current process or process
+ * group for maximum safety.
+ */
+ if (pgrp->pg_session != curthread->td_proc->p_session) {
+ ret = EPERM;
+ goto fail;
+ }
+
+ proc = NULL;
+ }
+ funsetown(sigiop);
+ if (pgid > 0) {
+ PROC_LOCK(proc);
+ /*
+ * Since funsetownlst() is called without the proctree
+ * locked, we need to check for P_WEXIT.
+ * XXX: is ESRCH correct?
+ */
+ if ((proc->p_flag & P_WEXIT) != 0) {
+ PROC_UNLOCK(proc);
+ ret = ESRCH;
+ goto fail;
+ }
+ SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
+ sigio->sio_proc = proc;
+ PROC_UNLOCK(proc);
+ } else {
+ PGRP_LOCK(pgrp);
+ SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
+ sigio->sio_pgrp = pgrp;
+ PGRP_UNLOCK(pgrp);
+ }
+ sx_sunlock(&proctree_lock);
+ SIGIO_LOCK();
+ *sigiop = sigio;
+ SIGIO_UNLOCK();
+ return (0);
+
+fail:
+ sx_sunlock(&proctree_lock);
+ crfree(sigio->sio_ucred);
+ free(sigio, M_SIGIO);
+ return (ret);
+}
+
+/*
+ * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
+ */
+pid_t
+fgetown(struct sigio **sigiop)
+{
+ pid_t pgid;
+
+ SIGIO_LOCK();
+ pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
+ SIGIO_UNLOCK();
+ return (pgid);
+}
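+
+/*
+ * fsetown()/fgetown() back FIOSETOWN/FIOGETOWN and F_SETOWN/F_GETOWN.
+ * A userland sketch of signal-driven I/O on a descriptor whose backend
+ * supports FIOASYNC (a tty or socket); on_sigio is a hypothetical
+ * handler name.  The block is never compiled here.
+ */
+#if 0
+#include <fcntl.h>
+#include <signal.h>
+#include <unistd.h>
+
+static volatile sig_atomic_t got_sigio;
+
+static void
+on_sigio(int sig)
+{
+
+	(void)sig;
+	got_sigio = 1;				/* just note that data is ready */
+}
+
+int
+main(void)
+{
+	int fd = STDIN_FILENO;
+
+	signal(SIGIO, on_sigio);
+	fcntl(fd, F_SETOWN, getpid());		/* deliver SIGIO to this pid */
+	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
+	while (!got_sigio)
+		pause();			/* wait for input to arrive */
+	return (0);
+}
+#endif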
+
+/*
+ * Function drops the filedesc lock on return.
+ */
+static int
+closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
+ int holdleaders)
+{
+ int error;
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ if (holdleaders) {
+ if (td->td_proc->p_fdtol != NULL) {
+ /*
+ * Ask fdfree() to sleep to ensure that all relevant
+ * process leaders can be traversed in closef().
+ */
+ fdp->fd_holdleaderscount++;
+ } else {
+ holdleaders = 0;
+ }
+ }
+
+ /*
+ * We now hold the fp reference that used to be owned by the
+ * descriptor array. We have to unlock the FILEDESC *AFTER*
+ * knote_fdclose to prevent a race of the fd getting opened, a knote
+	 * added, and deleting a knote for the new fd.
+ */
+ knote_fdclose(td, fd);
+
+ /*
+ * We need to notify mqueue if the object is of type mqueue.
+ */
+ if (fp->f_type == DTYPE_MQUEUE)
+ mq_fdclose(td, fd, fp);
+ FILEDESC_XUNLOCK(fdp);
+
+ error = closef(fp, td);
+ if (holdleaders) {
+ FILEDESC_XLOCK(fdp);
+ fdp->fd_holdleaderscount--;
+ if (fdp->fd_holdleaderscount == 0 &&
+ fdp->fd_holdleaderswakeup != 0) {
+ fdp->fd_holdleaderswakeup = 0;
+ wakeup(&fdp->fd_holdleaderscount);
+ }
+ FILEDESC_XUNLOCK(fdp);
+ }
+ return (error);
+}
+
+/*
+ * Close a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct close_args {
+ int fd;
+};
+#endif
+/* ARGSUSED */
+int
+sys_close(struct thread *td, struct close_args *uap)
+{
+
+ return (kern_close(td, uap->fd));
+}
+
+int
+kern_close(struct thread *td, int fd)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+
+ fdp = td->td_proc->p_fd;
+
+ AUDIT_SYSCLOSE(td, fd);
+
+ FILEDESC_XLOCK(fdp);
+ if ((fp = fget_locked(fdp, fd)) == NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ return (EBADF);
+ }
+ fdfree(fdp, fd);
+
+ /* closefp() drops the FILEDESC lock for us. */
+ return (closefp(fdp, fd, fp, td, 1));
+}
+
+/*
+ * Close open file descriptors.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct closefrom_args {
+ int lowfd;
+};
+#endif
+/* ARGSUSED */
+int
+sys_closefrom(struct thread *td, struct closefrom_args *uap)
+{
+ struct filedesc *fdp;
+ int fd;
+
+ fdp = td->td_proc->p_fd;
+ AUDIT_ARG_FD(uap->lowfd);
+
+ /*
+	 * Treat negative starting file descriptor values the same as
+	 * closefrom(0), which closes all files.
+ */
+ if (uap->lowfd < 0)
+ uap->lowfd = 0;
+ FILEDESC_SLOCK(fdp);
+ for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) {
+ if (fdp->fd_ofiles[fd].fde_file != NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ (void)kern_close(td, fd);
+ FILEDESC_SLOCK(fdp);
+ }
+ }
+ FILEDESC_SUNLOCK(fdp);
+ return (0);
+}
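+
+/*
+ * closefrom() is the usual way for daemons, or for code about to exec
+ * an untrusted child, to shed inherited descriptors.  A minimal sketch;
+ * the block is never compiled here.
+ */
+#if 0
+#include <unistd.h>
+
+int
+main(void)
+{
+
+	/* Keep only stdin, stdout and stderr; drop anything leaked to us. */
+	closefrom(3);
+	return (0);
+}
+#endif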
+
+#if defined(COMPAT_43)
+/*
+ * Return status information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ofstat_args {
+ int fd;
+ struct ostat *sb;
+};
+#endif
+/* ARGSUSED */
+int
+ofstat(struct thread *td, struct ofstat_args *uap)
+{
+ struct ostat oub;
+ struct stat ub;
+ int error;
+
+ error = kern_fstat(td, uap->fd, &ub);
+ if (error == 0) {
+ cvtstat(&ub, &oub);
+ error = copyout(&oub, uap->sb, sizeof(oub));
+ }
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Return status information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fstat_args {
+ int fd;
+ struct stat *sb;
+};
+#endif
+/* ARGSUSED */
+int
+sys_fstat(struct thread *td, struct fstat_args *uap)
+{
+ struct stat ub;
+ int error;
+
+ error = kern_fstat(td, uap->fd, &ub);
+ if (error == 0)
+ error = copyout(&ub, uap->sb, sizeof(ub));
+ return (error);
+}
+
+int
+kern_fstat(struct thread *td, int fd, struct stat *sbp)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+
+ error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp);
+ if (error != 0)
+ return (error);
+
+ AUDIT_ARG_FILE(td->td_proc, fp);
+
+ error = fo_stat(fp, sbp, td->td_ucred, td);
+ fdrop(fp, td);
+#ifdef KTRACE
+ if (error == 0 && KTRPOINT(td, KTR_STRUCT))
+ ktrstat(sbp);
+#endif
+ return (error);
+}
+
+/*
+ * Return status information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nfstat_args {
+ int fd;
+ struct nstat *sb;
+};
+#endif
+/* ARGSUSED */
+int
+sys_nfstat(struct thread *td, struct nfstat_args *uap)
+{
+ struct nstat nub;
+ struct stat ub;
+ int error;
+
+ error = kern_fstat(td, uap->fd, &ub);
+ if (error == 0) {
+ cvtnstat(&ub, &nub);
+ error = copyout(&nub, uap->sb, sizeof(nub));
+ }
+ return (error);
+}
+
+/*
+ * Return pathconf information about a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fpathconf_args {
+ int fd;
+ int name;
+};
+#endif
+/* ARGSUSED */
+int
+sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
+{
+ struct file *fp;
+ struct vnode *vp;
+ cap_rights_t rights;
+ int error;
+
+ error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp);
+ if (error != 0)
+ return (error);
+
+ /* If asynchronous I/O is available, it works for all descriptors. */
+ if (uap->name == _PC_ASYNC_IO) {
+ td->td_retval[0] = async_io_version;
+ goto out;
+ }
+ vp = fp->f_vnode;
+ if (vp != NULL) {
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_PATHCONF(vp, uap->name, td->td_retval);
+ VOP_UNLOCK(vp, 0);
+ } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
+ if (uap->name != _PC_PIPE_BUF) {
+ error = EINVAL;
+ } else {
+ td->td_retval[0] = PIPE_BUF;
+ error = 0;
+ }
+ } else {
+ error = EOPNOTSUPP;
+ }
+out:
+ fdrop(fp, td);
+ return (error);
+}
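+
+/*
+ * A userland sketch of fpathconf(): _PC_PIPE_BUF on a pipe reports
+ * PIPE_BUF, the largest write guaranteed to be atomic.  The block is
+ * never compiled here.
+ */
+#if 0
+#include <stdio.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	int pfd[2];
+	long pbuf;
+
+	if (pipe(pfd) == -1)
+		return (1);
+	pbuf = fpathconf(pfd[0], _PC_PIPE_BUF);	/* atomic write limit */
+	printf("atomic pipe write limit: %ld bytes\n", pbuf);
+	close(pfd[0]);
+	close(pfd[1]);
+	return (0);
+}
+#endif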
+
+/*
+ * Initialize filecaps structure.
+ */
+void
+filecaps_init(struct filecaps *fcaps)
+{
+
+ bzero(fcaps, sizeof(*fcaps));
+ fcaps->fc_nioctls = -1;
+}
+
+/*
+ * Copy filecaps structure allocating memory for ioctls array if needed.
+ */
+void
+filecaps_copy(const struct filecaps *src, struct filecaps *dst)
+{
+ size_t size;
+
+ *dst = *src;
+ if (src->fc_ioctls != NULL) {
+ KASSERT(src->fc_nioctls > 0,
+ ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
+
+ size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
+ dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
+ bcopy(src->fc_ioctls, dst->fc_ioctls, size);
+ }
+}
+
+/*
+ * Move filecaps structure to the new place and clear the old place.
+ */
+void
+filecaps_move(struct filecaps *src, struct filecaps *dst)
+{
+
+ *dst = *src;
+ bzero(src, sizeof(*src));
+}
+
+/*
+ * Fill the given filecaps structure with full rights.
+ */
+static void
+filecaps_fill(struct filecaps *fcaps)
+{
+
+ CAP_ALL(&fcaps->fc_rights);
+ fcaps->fc_ioctls = NULL;
+ fcaps->fc_nioctls = -1;
+ fcaps->fc_fcntls = CAP_FCNTL_ALL;
+}
+
+/*
+ * Free memory allocated within filecaps structure.
+ */
+void
+filecaps_free(struct filecaps *fcaps)
+{
+
+ free(fcaps->fc_ioctls, M_FILECAPS);
+ bzero(fcaps, sizeof(*fcaps));
+}
+
+/*
+ * Validate the given filecaps structure.
+ */
+static void
+filecaps_validate(const struct filecaps *fcaps, const char *func)
+{
+
+ KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
+ ("%s: invalid rights", func));
+ KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
+ ("%s: invalid fcntls", func));
+ KASSERT(fcaps->fc_fcntls == 0 ||
+ cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
+ ("%s: fcntls without CAP_FCNTL", func));
+ KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
+ (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
+ ("%s: invalid ioctls", func));
+ KASSERT(fcaps->fc_nioctls == 0 ||
+ cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
+ ("%s: ioctls without CAP_IOCTL", func));
+}
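+
+/*
+ * The filecaps kept next to each descriptor are what userland adjusts
+ * through the Capsicum API.  A sketch of restricting a descriptor,
+ * assuming the cap_rights_limit(2) interface of the same vintage as
+ * this file; the block is never compiled here.
+ */
+#if 0
+#include <sys/capability.h>
+
+#include <fcntl.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	cap_rights_t rights;
+	int fd;
+
+	fd = open("/etc/passwd", O_RDONLY);
+	if (fd == -1)
+		return (1);
+	/* From now on, only read() and fstat() work on this descriptor. */
+	cap_rights_init(&rights, CAP_READ, CAP_FSTAT);
+	if (cap_rights_limit(fd, &rights) == -1)
+		return (1);
+	close(fd);
+	return (0);
+}
+#endif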
+
+/*
+ * Grow the file table to accommodate (at least) nfd descriptors.
+ */
+static void
+fdgrowtable(struct filedesc *fdp, int nfd)
+{
+ struct filedesc0 *fdp0;
+ struct freetable *ft;
+ struct filedescent *ntable;
+ struct filedescent *otable;
+ int nnfiles, onfiles;
+ NDSLOTTYPE *nmap, *omap;
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
+
+ /* save old values */
+ onfiles = fdp->fd_nfiles;
+ otable = fdp->fd_ofiles;
+ omap = fdp->fd_map;
+
+ /* compute the size of the new table */
+ nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
+ if (nnfiles <= onfiles)
+ /* the table is already large enough */
+ return;
+
+ /*
+ * Allocate a new table and map. We need enough space for the
+ * file entries themselves and the struct freetable we will use
+ * when we decommission the table and place it on the freelist.
+ * We place the struct freetable in the middle so we don't have
+ * to worry about padding.
+ */
+ ntable = malloc(nnfiles * sizeof(ntable[0]) + sizeof(struct freetable),
+ M_FILEDESC, M_ZERO | M_WAITOK);
+ nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
+ M_ZERO | M_WAITOK);
+
+ /* copy the old data over and point at the new tables */
+ memcpy(ntable, otable, onfiles * sizeof(*otable));
+ memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
+
+ /* update the pointers and counters */
+ fdp->fd_nfiles = nnfiles;
+ fdp->fd_ofiles = ntable;
+ fdp->fd_map = nmap;
+
+ /*
+ * Do not free the old file table, as some threads may still
+ * reference entries within it. Instead, place it on a freelist
+ * which will be processed when the struct filedesc is released.
+ *
+ * Do, however, free the old map.
+ *
+ * Note that if onfiles == NDFILE, we're dealing with the original
+ * static allocation contained within (struct filedesc0 *)fdp,
+ * which must not be freed.
+ */
+ if (onfiles > NDFILE) {
+ ft = (struct freetable *)&otable[onfiles];
+ fdp0 = (struct filedesc0 *)fdp;
+ ft->ft_table = otable;
+ SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
+ free(omap, M_FILEDESC);
+ }
+}
+
+/*
+ * Allocate a file descriptor for the process.
+ */
+int
+fdalloc(struct thread *td, int minfd, int *result)
+{
+ struct proc *p = td->td_proc;
+ struct filedesc *fdp = p->p_fd;
+ int fd = -1, maxfd, allocfd;
+#ifdef RACCT
+ int error;
+#endif
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ if (fdp->fd_freefile > minfd)
+ minfd = fdp->fd_freefile;
+
+ PROC_LOCK(p);
+ maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
+ PROC_UNLOCK(p);
+
+ /*
+ * Search the bitmap for a free descriptor starting at minfd.
+ * If none is found, grow the file table.
+ */
+ fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
+ if (fd >= maxfd)
+ return (EMFILE);
+ if (fd >= fdp->fd_nfiles) {
+ allocfd = min(fd * 2, maxfd);
+#ifdef RACCT
+ PROC_LOCK(p);
+ error = racct_set(p, RACCT_NOFILE, allocfd);
+ PROC_UNLOCK(p);
+ if (error != 0)
+ return (EMFILE);
+#endif
+ /*
+ * fd is already equal to first free descriptor >= minfd, so
+ * we only need to grow the table and we are done.
+ */
+ fdgrowtable(fdp, allocfd);
+ }
+
+ /*
+ * Perform some sanity checks, then mark the file descriptor as
+ * used and return it to the caller.
+ */
+ KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
+ ("invalid descriptor %d", fd));
+ KASSERT(!fdisused(fdp, fd),
+ ("fd_first_free() returned non-free descriptor"));
+ KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
+ ("file descriptor isn't free"));
+ KASSERT(fdp->fd_ofiles[fd].fde_flags == 0, ("file flags are set"));
+ fdused(fdp, fd);
+ *result = fd;
+ return (0);
+}
+
+/*
+ * Allocate n file descriptors for the process.
+ */
+int
+fdallocn(struct thread *td, int minfd, int *fds, int n)
+{
+ struct proc *p = td->td_proc;
+ struct filedesc *fdp = p->p_fd;
+ int i;
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ if (!fdavail(td, n))
+ return (EMFILE);
+
+ for (i = 0; i < n; i++)
+ if (fdalloc(td, 0, &fds[i]) != 0)
+ break;
+
+ if (i < n) {
+ for (i--; i >= 0; i--)
+ fdunused(fdp, fds[i]);
+ return (EMFILE);
+ }
+
+ return (0);
+}
+
+/*
+ * Check to see whether n user file descriptors are available to the process
+ * p.
+ */
+int
+fdavail(struct thread *td, int n)
+{
+ struct proc *p = td->td_proc;
+ struct filedesc *fdp = td->td_proc->p_fd;
+ int i, lim, last;
+
+ FILEDESC_LOCK_ASSERT(fdp);
+
+ /*
+ * XXX: This is only called from uipc_usrreq.c:unp_externalize();
+ * call racct_add() from there instead of dealing with containers
+ * here.
+ */
+ PROC_LOCK(p);
+ lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
+ PROC_UNLOCK(p);
+ if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
+ return (1);
+ last = min(fdp->fd_nfiles, lim);
+ for (i = fdp->fd_freefile; i < last; i++) {
+ if (fdp->fd_ofiles[i].fde_file == NULL && --n <= 0)
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Create a new open file structure and allocate a file descriptor for the
+ * process that refers to it.  We add one reference to the file for the
+ * descriptor table and one reference for resultfp.  This is to prevent us
+ * from being preempted and having the entry in the descriptor table closed
+ * after we release the FILEDESC lock.
+ */
+int
+falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags)
+{
+ struct file *fp;
+ int error, fd;
+
+ error = falloc_noinstall(td, &fp);
+ if (error)
+ return (error); /* no reference held on error */
+
+ error = finstall(td, fp, &fd, flags, NULL);
+ if (error) {
+ fdrop(fp, td); /* one reference (fp only) */
+ return (error);
+ }
+
+ if (resultfp != NULL)
+ *resultfp = fp; /* copy out result */
+ else
+ fdrop(fp, td); /* release local reference */
+
+ if (resultfd != NULL)
+ *resultfd = fd;
+
+ return (0);
+}
+
+/*
+ * Create a new open file structure without allocating a file descriptor.
+ */
+int
+falloc_noinstall(struct thread *td, struct file **resultfp)
+{
+ struct file *fp;
+ int maxuserfiles = maxfiles - (maxfiles / 20);
+ static struct timeval lastfail;
+ static int curfail;
+
+ KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
+
+ if ((openfiles >= maxuserfiles &&
+ priv_check(td, PRIV_MAXFILES) != 0) ||
+ openfiles >= maxfiles) {
+ if (ppsratecheck(&lastfail, &curfail, 1)) {
+ printf("kern.maxfiles limit exceeded by uid %i, "
+ "please see tuning(7).\n", td->td_ucred->cr_ruid);
+ }
+ return (ENFILE);
+ }
+ atomic_add_int(&openfiles, 1);
+ fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
+ refcount_init(&fp->f_count, 1);
+ fp->f_cred = crhold(td->td_ucred);
+ fp->f_ops = &badfileops;
+ fp->f_data = NULL;
+ fp->f_vnode = NULL;
+ *resultfp = fp;
+ return (0);
+}
+
+/*
+ * Install a file in a file descriptor table.
+ */
+int
+finstall(struct thread *td, struct file *fp, int *fd, int flags,
+ struct filecaps *fcaps)
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+ struct filedescent *fde;
+ int error;
+
+ KASSERT(fd != NULL, ("%s: fd == NULL", __func__));
+ KASSERT(fp != NULL, ("%s: fp == NULL", __func__));
+ if (fcaps != NULL)
+ filecaps_validate(fcaps, __func__);
+
+ FILEDESC_XLOCK(fdp);
+ if ((error = fdalloc(td, 0, fd))) {
+ FILEDESC_XUNLOCK(fdp);
+ return (error);
+ }
+ fhold(fp);
+ fde = &fdp->fd_ofiles[*fd];
+ fde->fde_file = fp;
+ if ((flags & O_CLOEXEC) != 0)
+ fde->fde_flags |= UF_EXCLOSE;
+ if (fcaps != NULL)
+ filecaps_move(fcaps, &fde->fde_caps);
+ else
+ filecaps_fill(&fde->fde_caps);
+ FILEDESC_XUNLOCK(fdp);
+ return (0);
+}
+
+/*
+ * Build a new filedesc structure from another.
+ * Copy the current, root, and jail root vnode references.
+ */
+struct filedesc *
+fdinit(struct filedesc *fdp)
+{
+ struct filedesc0 *newfdp;
+
+ newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
+ FILEDESC_LOCK_INIT(&newfdp->fd_fd);
+ if (fdp != NULL) {
+ FILEDESC_XLOCK(fdp);
+ newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
+ if (newfdp->fd_fd.fd_cdir)
+ VREF(newfdp->fd_fd.fd_cdir);
+ newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
+ if (newfdp->fd_fd.fd_rdir)
+ VREF(newfdp->fd_fd.fd_rdir);
+ newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
+ if (newfdp->fd_fd.fd_jdir)
+ VREF(newfdp->fd_fd.fd_jdir);
+ FILEDESC_XUNLOCK(fdp);
+ }
+
+ /* Create the file descriptor table. */
+ newfdp->fd_fd.fd_refcnt = 1;
+ newfdp->fd_fd.fd_holdcnt = 1;
+ newfdp->fd_fd.fd_cmask = CMASK;
+ newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
+ newfdp->fd_fd.fd_nfiles = NDFILE;
+ newfdp->fd_fd.fd_map = newfdp->fd_dmap;
+ newfdp->fd_fd.fd_lastfile = -1;
+ return (&newfdp->fd_fd);
+}
+
+static struct filedesc *
+fdhold(struct proc *p)
+{
+ struct filedesc *fdp;
+
+ mtx_lock(&fdesc_mtx);
+ fdp = p->p_fd;
+ if (fdp != NULL)
+ fdp->fd_holdcnt++;
+ mtx_unlock(&fdesc_mtx);
+ return (fdp);
+}
+
+static void
+fddrop(struct filedesc *fdp)
+{
+ struct filedesc0 *fdp0;
+ struct freetable *ft;
+ int i;
+
+ mtx_lock(&fdesc_mtx);
+ i = --fdp->fd_holdcnt;
+ mtx_unlock(&fdesc_mtx);
+ if (i > 0)
+ return;
+
+ FILEDESC_LOCK_DESTROY(fdp);
+ fdp0 = (struct filedesc0 *)fdp;
+ while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
+ SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
+ free(ft->ft_table, M_FILEDESC);
+ }
+ free(fdp, M_FILEDESC);
+}
+
+/*
+ * Share a filedesc structure.
+ */
+struct filedesc *
+fdshare(struct filedesc *fdp)
+{
+
+ FILEDESC_XLOCK(fdp);
+ fdp->fd_refcnt++;
+ FILEDESC_XUNLOCK(fdp);
+ return (fdp);
+}
+
+/*
+ * Unshare a filedesc structure, if necessary by making a copy
+ */
+void
+fdunshare(struct proc *p, struct thread *td)
+{
+
+ FILEDESC_XLOCK(p->p_fd);
+ if (p->p_fd->fd_refcnt > 1) {
+ struct filedesc *tmp;
+
+ FILEDESC_XUNLOCK(p->p_fd);
+ tmp = fdcopy(p->p_fd);
+ fdescfree(td);
+ p->p_fd = tmp;
+ } else
+ FILEDESC_XUNLOCK(p->p_fd);
+}
+
+/*
+ * Copy a filedesc structure.  A NULL pointer in returns a NULL reference;
+ * this is to ease callers, not to catch errors.
+ */
+struct filedesc *
+fdcopy(struct filedesc *fdp)
+{
+ struct filedesc *newfdp;
+ struct filedescent *nfde, *ofde;
+ int i;
+
+ /* Certain daemons might not have file descriptors. */
+ if (fdp == NULL)
+ return (NULL);
+
+ newfdp = fdinit(fdp);
+ FILEDESC_SLOCK(fdp);
+ while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
+ FILEDESC_SUNLOCK(fdp);
+ FILEDESC_XLOCK(newfdp);
+ fdgrowtable(newfdp, fdp->fd_lastfile + 1);
+ FILEDESC_XUNLOCK(newfdp);
+ FILEDESC_SLOCK(fdp);
+ }
+ /* copy all passable descriptors (i.e. not kqueue) */
+ newfdp->fd_freefile = -1;
+ for (i = 0; i <= fdp->fd_lastfile; ++i) {
+ ofde = &fdp->fd_ofiles[i];
+ if (fdisused(fdp, i) &&
+ (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) &&
+ ofde->fde_file->f_ops != &badfileops) {
+ nfde = &newfdp->fd_ofiles[i];
+ *nfde = *ofde;
+ filecaps_copy(&ofde->fde_caps, &nfde->fde_caps);
+ fhold(nfde->fde_file);
+ newfdp->fd_lastfile = i;
+ } else {
+ if (newfdp->fd_freefile == -1)
+ newfdp->fd_freefile = i;
+ }
+ }
+ newfdp->fd_cmask = fdp->fd_cmask;
+ FILEDESC_SUNLOCK(fdp);
+ FILEDESC_XLOCK(newfdp);
+ for (i = 0; i <= newfdp->fd_lastfile; ++i) {
+ if (newfdp->fd_ofiles[i].fde_file != NULL)
+ fdused(newfdp, i);
+ }
+ if (newfdp->fd_freefile == -1)
+ newfdp->fd_freefile = i;
+ FILEDESC_XUNLOCK(newfdp);
+ return (newfdp);
+}
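+
+/*
+ * fdcopy() duplicates the table, not the open files, so the copied
+ * descriptors still share f_offset with the originals; plain fork()
+ * takes this path.  A userland sketch of the shared offset; the block
+ * is never compiled here.
+ */
+#if 0
+#include <sys/wait.h>
+
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	int fd;
+
+	fd = open("/etc/passwd", O_RDONLY);
+	if (fd == -1)
+		return (1);
+	if (fork() == 0) {
+		lseek(fd, 100, SEEK_SET);	/* child moves the offset */
+		_exit(0);
+	}
+	wait(NULL);
+	/* The parent sees 100: both descriptors name one struct file. */
+	printf("offset after child's lseek: %jd\n",
+	    (intmax_t)lseek(fd, 0, SEEK_CUR));
+	return (0);
+}
+#endif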
+
+/*
+ * Release a filedesc structure.
+ */
+void
+fdescfree(struct thread *td)
+{
+ struct filedesc *fdp;
+ int i;
+ struct filedesc_to_leader *fdtol;
+ struct file *fp;
+ struct vnode *cdir, *jdir, *rdir, *vp;
+ struct flock lf;
+
+ /* Certain daemons might not have file descriptors. */
+ fdp = td->td_proc->p_fd;
+ if (fdp == NULL)
+ return;
+
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ racct_set(td->td_proc, RACCT_NOFILE, 0);
+ PROC_UNLOCK(td->td_proc);
+#endif
+
+ /* Check for special need to clear POSIX style locks */
+ fdtol = td->td_proc->p_fdtol;
+ if (fdtol != NULL) {
+ FILEDESC_XLOCK(fdp);
+ KASSERT(fdtol->fdl_refcount > 0,
+ ("filedesc_to_refcount botch: fdl_refcount=%d",
+ fdtol->fdl_refcount));
+ if (fdtol->fdl_refcount == 1 &&
+ (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
+ for (i = 0; i <= fdp->fd_lastfile; i++) {
+ fp = fdp->fd_ofiles[i].fde_file;
+ if (fp == NULL || fp->f_type != DTYPE_VNODE)
+ continue;
+ fhold(fp);
+ FILEDESC_XUNLOCK(fdp);
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ vp = fp->f_vnode;
+ (void) VOP_ADVLOCK(vp,
+ (caddr_t)td->td_proc->p_leader, F_UNLCK,
+ &lf, F_POSIX);
+ FILEDESC_XLOCK(fdp);
+ fdrop(fp, td);
+ }
+ }
+ retry:
+ if (fdtol->fdl_refcount == 1) {
+ if (fdp->fd_holdleaderscount > 0 &&
+ (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
+ /*
+ * close() or do_dup() has cleared a reference
+ * in a shared file descriptor table.
+ */
+ fdp->fd_holdleaderswakeup = 1;
+ sx_sleep(&fdp->fd_holdleaderscount,
+ FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
+ goto retry;
+ }
+ if (fdtol->fdl_holdcount > 0) {
+ /*
+ * Ensure that fdtol->fdl_leader remains
+ * valid in closef().
+ */
+ fdtol->fdl_wakeup = 1;
+ sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
+ "fdlhold", 0);
+ goto retry;
+ }
+ }
+ fdtol->fdl_refcount--;
+ if (fdtol->fdl_refcount == 0 &&
+ fdtol->fdl_holdcount == 0) {
+ fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
+ fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
+ } else
+ fdtol = NULL;
+ td->td_proc->p_fdtol = NULL;
+ FILEDESC_XUNLOCK(fdp);
+ if (fdtol != NULL)
+ free(fdtol, M_FILEDESC_TO_LEADER);
+ }
+ FILEDESC_XLOCK(fdp);
+ i = --fdp->fd_refcnt;
+ FILEDESC_XUNLOCK(fdp);
+ if (i > 0)
+ return;
+
+ for (i = 0; i <= fdp->fd_lastfile; i++) {
+ fp = fdp->fd_ofiles[i].fde_file;
+ if (fp != NULL) {
+ FILEDESC_XLOCK(fdp);
+ fdfree(fdp, i);
+ FILEDESC_XUNLOCK(fdp);
+ (void) closef(fp, td);
+ }
+ }
+ FILEDESC_XLOCK(fdp);
+
+ /* XXX This should happen earlier. */
+ mtx_lock(&fdesc_mtx);
+ td->td_proc->p_fd = NULL;
+ mtx_unlock(&fdesc_mtx);
+
+ if (fdp->fd_nfiles > NDFILE)
+ free(fdp->fd_ofiles, M_FILEDESC);
+ if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
+ free(fdp->fd_map, M_FILEDESC);
+
+ fdp->fd_nfiles = 0;
+
+ cdir = fdp->fd_cdir;
+ fdp->fd_cdir = NULL;
+ rdir = fdp->fd_rdir;
+ fdp->fd_rdir = NULL;
+ jdir = fdp->fd_jdir;
+ fdp->fd_jdir = NULL;
+ FILEDESC_XUNLOCK(fdp);
+
+ if (cdir != NULL)
+ vrele(cdir);
+ if (rdir != NULL)
+ vrele(rdir);
+ if (jdir != NULL)
+ vrele(jdir);
+
+ fddrop(fdp);
+}
+
+/*
+ * For setugid programs, we don't want people to use that setugidness
+ * to generate error messages which write to a file that would otherwise
+ * be off-limits to the process.  We check for filesystems where
+ * the vnode can change out from under us after execve (like [lin]procfs).
+ *
+ * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
+ * sufficient. We also don't check for setugidness since we know we are.
+ */
+static int
+is_unsafe(struct file *fp)
+{
+ if (fp->f_type == DTYPE_VNODE) {
+ struct vnode *vp = fp->f_vnode;
+
+ if ((vp->v_vflag & VV_PROCDEP) != 0)
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Make this setugid thing safe, if at all possible.
+ */
+void
+setugidsafety(struct thread *td)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ int i;
+
+ /* Certain daemons might not have file descriptors. */
+ fdp = td->td_proc->p_fd;
+ if (fdp == NULL)
+ return;
+
+ /*
+ * Note: fdp->fd_ofiles may be reallocated out from under us while
+ * we are blocked in a close. Be careful!
+ */
+ FILEDESC_XLOCK(fdp);
+ for (i = 0; i <= fdp->fd_lastfile; i++) {
+ if (i > 2)
+ break;
+ fp = fdp->fd_ofiles[i].fde_file;
+ if (fp != NULL && is_unsafe(fp)) {
+ knote_fdclose(td, i);
+ /*
+ * NULL-out descriptor prior to close to avoid
+ * a race while close blocks.
+ */
+ fdfree(fdp, i);
+ FILEDESC_XUNLOCK(fdp);
+ (void) closef(fp, td);
+ FILEDESC_XLOCK(fdp);
+ }
+ }
+ FILEDESC_XUNLOCK(fdp);
+}
+
+/*
+ * If a specific file object occupies a specific file descriptor, close the
+ * file descriptor entry and drop a reference on the file object. This is a
+ * convenience function for handling a subsequent error in a function that
+ * calls falloc(); it covers the race in which another thread might have
+ * closed the file descriptor out from under the thread creating the file
+ * object.
+ */
+void
+fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
+{
+
+ FILEDESC_XLOCK(fdp);
+ if (fdp->fd_ofiles[idx].fde_file == fp) {
+ fdfree(fdp, idx);
+ FILEDESC_XUNLOCK(fdp);
+ fdrop(fp, td);
+ } else
+ FILEDESC_XUNLOCK(fdp);
+}
+
+/*
+ * Close any files on exec?
+ */
+void
+fdcloseexec(struct thread *td)
+{
+ struct filedesc *fdp;
+ struct filedescent *fde;
+ struct file *fp;
+ int i;
+
+ /* Certain daemons might not have file descriptors. */
+ fdp = td->td_proc->p_fd;
+ if (fdp == NULL)
+ return;
+
+ /*
+ * We cannot cache fd_ofiles since operations
+ * may block and rip them out from under us.
+ */
+ FILEDESC_XLOCK(fdp);
+ for (i = 0; i <= fdp->fd_lastfile; i++) {
+ fde = &fdp->fd_ofiles[i];
+ fp = fde->fde_file;
+ if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
+ (fde->fde_flags & UF_EXCLOSE))) {
+ fdfree(fdp, i);
+ (void) closefp(fdp, i, fp, td, 0);
+ /* closefp() drops the FILEDESC lock. */
+ FILEDESC_XLOCK(fdp);
+ }
+ }
+ FILEDESC_XUNLOCK(fdp);
+}
+
+/*
+ * It is unsafe for set[ug]id processes to be started with file
+ * descriptors 0..2 closed, as these descriptors are given implicit
+ * significance in the Standard C library. fdcheckstd() will create a
+ * descriptor referencing /dev/null for each of stdin, stdout, and
+ * stderr that is not already open.
+ */
+int
+fdcheckstd(struct thread *td)
+{
+ struct filedesc *fdp;
+ register_t retval, save;
+ int i, error, devnull;
+
+ fdp = td->td_proc->p_fd;
+ if (fdp == NULL)
+ return (0);
+ KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
+ devnull = -1;
+ error = 0;
+ for (i = 0; i < 3; i++) {
+ if (fdp->fd_ofiles[i].fde_file != NULL)
+ continue;
+ if (devnull < 0) {
+ save = td->td_retval[0];
+ error = kern_open(td, "/dev/null", UIO_SYSSPACE,
+ O_RDWR, 0);
+ devnull = td->td_retval[0];
+ td->td_retval[0] = save;
+ if (error)
+ break;
+ KASSERT(devnull == i, ("oof, we didn't get our fd"));
+ } else {
+ error = do_dup(td, DUP_FIXED, devnull, i, &retval);
+ if (error != 0)
+ break;
+ }
+ }
+ return (error);
+}
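+
+/*
+ * Ordinary userland daemons take the same precaution before opening
+ * files of their own.  A sketch using a hypothetical helper,
+ * sanitize_stdio(); the block is never compiled here.
+ */
+#if 0
+#include <fcntl.h>
+#include <unistd.h>
+
+static void
+sanitize_stdio(void)
+{
+	int fd;
+
+	/* Each open() that lands at or below 2 has filled a hole; keep it. */
+	while ((fd = open("/dev/null", O_RDWR)) != -1 && fd <= STDERR_FILENO)
+		continue;
+	if (fd > STDERR_FILENO)
+		close(fd);		/* 0..2 were already open; undo */
+}
+
+int
+main(void)
+{
+
+	sanitize_stdio();
+	return (0);
+}
+#endif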
+
+/*
+ * Internal form of close. Decrement reference count on file structure.
+ * Note: td may be NULL when closing a file that was being passed in a
+ * message.
+ *
+ * XXXRW: Giant is not required for the caller, but often will be held; this
+ * makes it moderately likely the Giant will be recursed in the VFS case.
+ */
+int
+closef(struct file *fp, struct thread *td)
+{
+ struct vnode *vp;
+ struct flock lf;
+ struct filedesc_to_leader *fdtol;
+ struct filedesc *fdp;
+
+ /*
+ * POSIX record locking dictates that any close releases ALL
+ * locks owned by this process. This is handled by setting
+ * a flag in the unlock to free ONLY locks obeying POSIX
+ * semantics, and not to free BSD-style file locks.
+ * If the descriptor was in a message, POSIX-style locks
+ * aren't passed with the descriptor, and the thread pointer
+ * will be NULL. Callers should be careful only to pass a
+ * NULL thread pointer when there really is no owning
+ * context that might have locks, or the locks will be
+ * leaked.
+ */
+ if (fp->f_type == DTYPE_VNODE && td != NULL) {
+ vp = fp->f_vnode;
+ if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
+ F_UNLCK, &lf, F_POSIX);
+ }
+ fdtol = td->td_proc->p_fdtol;
+ if (fdtol != NULL) {
+ /*
+ * Handle special case where file descriptor table is
+ * shared between multiple process leaders.
+ */
+ fdp = td->td_proc->p_fd;
+ FILEDESC_XLOCK(fdp);
+ for (fdtol = fdtol->fdl_next;
+ fdtol != td->td_proc->p_fdtol;
+ fdtol = fdtol->fdl_next) {
+ if ((fdtol->fdl_leader->p_flag &
+ P_ADVLOCK) == 0)
+ continue;
+ fdtol->fdl_holdcount++;
+ FILEDESC_XUNLOCK(fdp);
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ vp = fp->f_vnode;
+ (void) VOP_ADVLOCK(vp,
+ (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
+ F_POSIX);
+ FILEDESC_XLOCK(fdp);
+ fdtol->fdl_holdcount--;
+ if (fdtol->fdl_holdcount == 0 &&
+ fdtol->fdl_wakeup != 0) {
+ fdtol->fdl_wakeup = 0;
+ wakeup(fdtol);
+ }
+ }
+ FILEDESC_XUNLOCK(fdp);
+ }
+ }
+ return (fdrop(fp, td));
+}
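+
+/*
+ * The "any close releases ALL locks" rule described above is a classic
+ * POSIX pitfall: a lock taken through one descriptor is dropped as soon
+ * as any other descriptor for the same file is closed.  A userland
+ * sketch; the block is never compiled here.
+ */
+#if 0
+#include <fcntl.h>
+#include <unistd.h>
+
+int
+main(void)
+{
+	struct flock fl = {
+		.l_type = F_WRLCK,		/* whole-file write lock */
+		.l_whence = SEEK_SET,
+		.l_start = 0,
+		.l_len = 0,
+	};
+	int fd1, fd2;
+
+	fd1 = open("lockfile", O_RDWR | O_CREAT, 0644);
+	fd2 = open("lockfile", O_RDWR);		/* second descriptor, same file */
+	if (fd1 == -1 || fd2 == -1)
+		return (1);
+	fcntl(fd1, F_SETLK, &fl);		/* lock owned by the process */
+	close(fd2);				/* ...and silently dropped here */
+	return (0);
+}
+#endif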
+
+/*
+ * Initialize the file pointer with the specified properties.
+ *
+ * The ops are set with release semantics to be certain that the flags, type,
+ * and data are visible when ops is. This is to prevent ops methods from being
+ * called with bad data.
+ */
+void
+finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
+{
+ fp->f_data = data;
+ fp->f_flag = flag;
+ fp->f_type = type;
+ atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
+}
+
+int
+fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
+ int needfcntl, struct file **fpp, cap_rights_t *haverightsp)
+{
+ struct file *fp;
+ u_int count;
+#ifdef CAPABILITIES
+ cap_rights_t haverights;
+ int error;
+#endif
+
+ if (fd < 0 || fd >= fdp->fd_nfiles)
+ return (EBADF);
+ /*
+	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
+	 * never raising a refcount that has already dropped to 0.  To
+	 * accomplish this we have
+ * to use a cmpset loop rather than an atomic_add. The descriptor
+ * must be re-verified once we acquire a reference to be certain
+ * that the identity is still correct and we did not lose a race
+ * due to preemption.
+ */
+ for (;;) {
+ fp = fdp->fd_ofiles[fd].fde_file;
+ if (fp == NULL)
+ return (EBADF);
+#ifdef CAPABILITIES
+ haverights = *cap_rights(fdp, fd);
+ if (needrightsp != NULL) {
+ error = cap_check(&haverights, needrightsp);
+ if (error != 0)
+ return (error);
+ if (cap_rights_is_set(needrightsp, CAP_FCNTL)) {
+ error = cap_fcntl_check(fdp, fd, needfcntl);
+ if (error != 0)
+ return (error);
+ }
+ }
+#endif
+ count = fp->f_count;
+ if (count == 0)
+ continue;
+ /*
+ * Use an acquire barrier to prevent caching of fd_ofiles
+ * so it is refreshed for verification.
+ */
+ if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1)
+ continue;
+ if (fp == fdp->fd_ofiles[fd].fde_file)
+ break;
+ fdrop(fp, curthread);
+ }
+ *fpp = fp;
+ if (haverightsp != NULL) {
+#ifdef CAPABILITIES
+ *haverightsp = haverights;
+#else
+ CAP_ALL(haverightsp);
+#endif
+ }
+ return (0);
+}
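+
+/*
+ * The cmpset loop above is an instance of a general lockless pattern:
+ * take a reference only if the count is still non-zero, then re-check
+ * identity.  A self-contained sketch of the acquire step using C11
+ * atomics; ref_acquire and refcnt are illustrative names, not kernel
+ * symbols.  The block is never compiled here.
+ */
+#if 0
+#include <stdatomic.h>
+#include <stdbool.h>
+
+static bool
+ref_acquire(_Atomic unsigned *refcnt)
+{
+	unsigned old = atomic_load(refcnt);
+
+	for (;;) {
+		if (old == 0)
+			return (false);	/* object is being torn down */
+		if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
+			return (true);	/* reference taken; now re-check id */
+		/* The failed CAS reloaded old; retry. */
+	}
+}
+#endif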
+
+/*
+ * Extract the file pointer associated with the specified descriptor for the
+ * current user process.
+ *
+ * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
+ * returned.
+ *
+ * File's rights will be checked against the capability rights mask.
+ *
+ * If an error occurred, the non-zero error is returned and *fpp is set to
+ * NULL. Otherwise *fpp is held and set and zero is returned. Caller is
+ * responsible for fdrop().
+ */
+static __inline int
+_fget(struct thread *td, int fd, struct file **fpp, int flags,
+ cap_rights_t *needrightsp, u_char *maxprotp)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ cap_rights_t haverights, needrights;
+ int error;
+
+ *fpp = NULL;
+ if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
+ return (EBADF);
+ if (needrightsp != NULL)
+ needrights = *needrightsp;
+ else
+ cap_rights_init(&needrights);
+ if (maxprotp != NULL)
+ cap_rights_set(&needrights, CAP_MMAP);
+ error = fget_unlocked(fdp, fd, &needrights, 0, &fp, &haverights);
+ if (error != 0)
+ return (error);
+ if (fp->f_ops == &badfileops) {
+ fdrop(fp, td);
+ return (EBADF);
+ }
+
+#ifdef CAPABILITIES
+ /*
+ * If requested, convert capability rights to access flags.
+ */
+ if (maxprotp != NULL)
+ *maxprotp = cap_rights_to_vmprot(&haverights);
+#else /* !CAPABILITIES */
+ if (maxprotp != NULL)
+ *maxprotp = VM_PROT_ALL;
+#endif /* CAPABILITIES */
+
+ /*
+ * FREAD and FWRITE failure return EBADF as per POSIX.
+ */
+ error = 0;
+ switch (flags) {
+ case FREAD:
+ case FWRITE:
+ if ((fp->f_flag & flags) == 0)
+ error = EBADF;
+ break;
+ case FEXEC:
+ if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
+ ((fp->f_flag & FWRITE) != 0))
+ error = EBADF;
+ break;
+ case 0:
+ break;
+ default:
+ KASSERT(0, ("wrong flags"));
+ }
+
+ if (error != 0) {
+ fdrop(fp, td);
+ return (error);
+ }
+
+ *fpp = fp;
+ return (0);
+}
+
+int
+fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
+{
+
+ return(_fget(td, fd, fpp, 0, rightsp, NULL));
+}
+
+int
+fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp,
+ struct file **fpp)
+{
+
+ return (_fget(td, fd, fpp, 0, rightsp, maxprotp));
+}
+
+int
+fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
+{
+
+ return(_fget(td, fd, fpp, FREAD, rightsp, NULL));
+}
+
+int
+fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
+{
+
+ return (_fget(td, fd, fpp, FWRITE, rightsp, NULL));
+}
+
+/*
+ * Like fget() but loads the underlying vnode, or returns an error if the
+ * descriptor does not represent a vnode. Note that pipes use vnodes but
+ * never have VM objects. The returned vnode will be vref()'d.
+ *
+ * XXX: what about the unused flags ?
+ */
+static __inline int
+_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
+ struct vnode **vpp)
+{
+ struct file *fp;
+ int error;
+
+ *vpp = NULL;
+ error = _fget(td, fd, &fp, flags, needrightsp, NULL);
+ if (error != 0)
+ return (error);
+ if (fp->f_vnode == NULL) {
+ error = EINVAL;
+ } else {
+ *vpp = fp->f_vnode;
+ vref(*vpp);
+ }
+ fdrop(fp, td);
+
+ return (error);
+}
+
+int
+fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
+{
+
+ return (_fgetvp(td, fd, 0, rightsp, vpp));
+}
+
+int
+fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
+ struct filecaps *havecaps, struct vnode **vpp)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+#ifdef CAPABILITIES
+ int error;
+#endif
+
+ if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
+ return (EBADF);
+
+ fp = fget_locked(fdp, fd);
+ if (fp == NULL || fp->f_ops == &badfileops)
+ return (EBADF);
+
+#ifdef CAPABILITIES
+ if (needrightsp != NULL) {
+ error = cap_check(cap_rights(fdp, fd), needrightsp);
+ if (error != 0)
+ return (error);
+ }
+#endif
+
+ if (fp->f_vnode == NULL)
+ return (EINVAL);
+
+ *vpp = fp->f_vnode;
+ vref(*vpp);
+ filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps);
+
+ return (0);
+}
+
+int
+fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
+{
+
+ return (_fgetvp(td, fd, FREAD, rightsp, vpp));
+}
+
+int
+fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
+{
+
+ return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
+}
+
+#ifdef notyet
+int
+fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
+ struct vnode **vpp)
+{
+
+ return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
+}
+#endif
+
+/*
+ * Like fget() but loads the underlying socket, or returns an error if the
+ * descriptor does not represent a socket.
+ *
+ * We bump the ref count on the returned socket. XXX Also obtain the SX lock
+ * in the future.
+ *
+ * Note: fgetsock() and fputsock() are deprecated, as consumers should rely
+ * on their file descriptor reference to prevent the socket from being freed
+ * during use.
+ */
+int
+fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp,
+ u_int *fflagp)
+{
+ struct file *fp;
+ int error;
+
+ *spp = NULL;
+ if (fflagp != NULL)
+ *fflagp = 0;
+ if ((error = _fget(td, fd, &fp, 0, rightsp, NULL)) != 0)
+ return (error);
+ if (fp->f_type != DTYPE_SOCKET) {
+ error = ENOTSOCK;
+ } else {
+ *spp = fp->f_data;
+ if (fflagp)
+ *fflagp = fp->f_flag;
+ SOCK_LOCK(*spp);
+ soref(*spp);
+ SOCK_UNLOCK(*spp);
+ }
+ fdrop(fp, td);
+
+ return (error);
+}
+
+/*
+ * Drop the reference count on the socket and XXX release the SX lock in the
+ * future. The last reference closes the socket.
+ *
+ * Note: fputsock() is deprecated, see comment for fgetsock().
+ */
+void
+fputsock(struct socket *so)
+{
+
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ CURVNET_SET(so->so_vnet);
+ sorele(so);
+ CURVNET_RESTORE();
+}
+
+/*
+ * Handle the last reference to a file being closed.
+ */
+int
+_fdrop(struct file *fp, struct thread *td)
+{
+ int error;
+
+ error = 0;
+ if (fp->f_count != 0)
+ panic("fdrop: count %d", fp->f_count);
+ if (fp->f_ops != &badfileops)
+ error = fo_close(fp, td);
+ atomic_subtract_int(&openfiles, 1);
+ crfree(fp->f_cred);
+ free(fp->f_advice, M_FADVISE);
+ uma_zfree(file_zone, fp);
+
+ return (error);
+}
+
+/*
+ * Apply an advisory lock on a file descriptor.
+ *
+ * Just attempt to get a record lock of the requested type on the entire file
+ * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct flock_args {
+ int fd;
+ int how;
+};
+#endif
+/* ARGSUSED */
+int
+sys_flock(struct thread *td, struct flock_args *uap)
+{
+ struct file *fp;
+ struct vnode *vp;
+ struct flock lf;
+ cap_rights_t rights;
+ int error;
+
+ error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp);
+ if (error != 0)
+ return (error);
+ if (fp->f_type != DTYPE_VNODE) {
+ fdrop(fp, td);
+ return (EOPNOTSUPP);
+ }
+
+ vp = fp->f_vnode;
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ if (uap->how & LOCK_UN) {
+ lf.l_type = F_UNLCK;
+ atomic_clear_int(&fp->f_flag, FHASLOCK);
+ error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
+ goto done2;
+ }
+ if (uap->how & LOCK_EX)
+ lf.l_type = F_WRLCK;
+ else if (uap->how & LOCK_SH)
+ lf.l_type = F_RDLCK;
+ else {
+ error = EBADF;
+ goto done2;
+ }
+ atomic_set_int(&fp->f_flag, FHASLOCK);
+ error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
+ (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
+done2:
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Duplicate the specified descriptor to a free descriptor.
+ */
+int
+dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
+ int openerror, int *indxp)
+{
+ struct file *fp;
+ int error, indx;
+
+ KASSERT(openerror == ENODEV || openerror == ENXIO,
+ ("unexpected error %d in %s", openerror, __func__));
+
+ /*
+ * If the to-be-dup'd fd number is greater than the allowed number
+ * of file descriptors, or the fd to be dup'd has already been
+ * closed, then reject.
+ */
+ FILEDESC_XLOCK(fdp);
+ if ((fp = fget_locked(fdp, dfd)) == NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ return (EBADF);
+ }
+
+ error = fdalloc(td, 0, &indx);
+ if (error != 0) {
+ FILEDESC_XUNLOCK(fdp);
+ return (error);
+ }
+
+ /*
+ * There are two cases of interest here.
+ *
+ * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
+ *
+ * For ENXIO steal away the file structure from (dfd) and store it in
+ * (indx). (dfd) is effectively closed by this operation.
+ */
+ switch (openerror) {
+ case ENODEV:
+ /*
+ * Check that the mode the file is being opened for is a
+ * subset of the mode of the existing descriptor.
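+		 * For example, opening /dev/fd/N for writing fails with
+		 * EACCES when descriptor N was opened read-only.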
+ */
+ if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
+ fdunused(fdp, indx);
+ FILEDESC_XUNLOCK(fdp);
+ return (EACCES);
+ }
+ fhold(fp);
+ fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
+ filecaps_copy(&fdp->fd_ofiles[dfd].fde_caps,
+ &fdp->fd_ofiles[indx].fde_caps);
+ break;
+ case ENXIO:
+ /*
+ * Steal away the file pointer from dfd and stuff it into indx.
+ */
+ fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
+ bzero(&fdp->fd_ofiles[dfd], sizeof(fdp->fd_ofiles[dfd]));
+ fdunused(fdp, dfd);
+ break;
+ }
+ FILEDESC_XUNLOCK(fdp);
+ *indxp = indx;
+ return (0);
+}
+
+/*
+ * Scan all active processes and prisons to see if any of them have a
+ * current, root, or jail directory of `olddp'. If so, replace them with
+ * the new mount point.
+ */
+void
+mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
+{
+ struct filedesc *fdp;
+ struct prison *pr;
+ struct proc *p;
+ int nrele;
+
+ if (vrefcnt(olddp) == 1)
+ return;
+ nrele = 0;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ fdp = fdhold(p);
+ if (fdp == NULL)
+ continue;
+ FILEDESC_XLOCK(fdp);
+ if (fdp->fd_cdir == olddp) {
+ vref(newdp);
+ fdp->fd_cdir = newdp;
+ nrele++;
+ }
+ if (fdp->fd_rdir == olddp) {
+ vref(newdp);
+ fdp->fd_rdir = newdp;
+ nrele++;
+ }
+ if (fdp->fd_jdir == olddp) {
+ vref(newdp);
+ fdp->fd_jdir = newdp;
+ nrele++;
+ }
+ FILEDESC_XUNLOCK(fdp);
+ fddrop(fdp);
+ }
+ sx_sunlock(&allproc_lock);
+ if (rootvnode == olddp) {
+ vref(newdp);
+ rootvnode = newdp;
+ nrele++;
+ }
+ mtx_lock(&prison0.pr_mtx);
+ if (prison0.pr_root == olddp) {
+ vref(newdp);
+ prison0.pr_root = newdp;
+ nrele++;
+ }
+ mtx_unlock(&prison0.pr_mtx);
+ sx_slock(&allprison_lock);
+ TAILQ_FOREACH(pr, &allprison, pr_list) {
+ mtx_lock(&pr->pr_mtx);
+ if (pr->pr_root == olddp) {
+ vref(newdp);
+ pr->pr_root = newdp;
+ nrele++;
+ }
+ mtx_unlock(&pr->pr_mtx);
+ }
+ sx_sunlock(&allprison_lock);
+ while (nrele--)
+ vrele(olddp);
+}
+
+struct filedesc_to_leader *
+filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp,
+    struct proc *leader)
+{
+ struct filedesc_to_leader *fdtol;
+
+	fdtol = malloc(sizeof(struct filedesc_to_leader),
+	    M_FILEDESC_TO_LEADER, M_WAITOK);
+ fdtol->fdl_refcount = 1;
+ fdtol->fdl_holdcount = 0;
+ fdtol->fdl_wakeup = 0;
+ fdtol->fdl_leader = leader;
+ if (old != NULL) {
+ FILEDESC_XLOCK(fdp);
+ fdtol->fdl_next = old->fdl_next;
+ fdtol->fdl_prev = old;
+ old->fdl_next = fdtol;
+ fdtol->fdl_next->fdl_prev = fdtol;
+ FILEDESC_XUNLOCK(fdp);
+ } else {
+ fdtol->fdl_next = fdtol;
+ fdtol->fdl_prev = fdtol;
+ }
+ return (fdtol);
+}
+
+/*
+ * Get file structures globally.
+ */
+static int
+sysctl_kern_file(SYSCTL_HANDLER_ARGS)
+{
+ struct xfile xf;
+ struct filedesc *fdp;
+ struct file *fp;
+ struct proc *p;
+ int error, n;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ if (req->oldptr == NULL) {
+ n = 0;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_state == PRS_NEW)
+ continue;
+ fdp = fdhold(p);
+ if (fdp == NULL)
+ continue;
+			/* This overestimates the size for sparse tables. */
+ if (fdp->fd_lastfile > 0)
+ n += fdp->fd_lastfile;
+ fddrop(fdp);
+ }
+ sx_sunlock(&allproc_lock);
+ return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
+ }
+ error = 0;
+ bzero(&xf, sizeof(xf));
+ xf.xf_size = sizeof(xf);
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ if (p_cansee(req->td, p) != 0) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ xf.xf_pid = p->p_pid;
+ xf.xf_uid = p->p_ucred->cr_uid;
+ PROC_UNLOCK(p);
+ fdp = fdhold(p);
+ if (fdp == NULL)
+ continue;
+ FILEDESC_SLOCK(fdp);
+ for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
+ if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
+ continue;
+ xf.xf_fd = n;
+ xf.xf_file = fp;
+ xf.xf_data = fp->f_data;
+ xf.xf_vnode = fp->f_vnode;
+ xf.xf_type = fp->f_type;
+ xf.xf_count = fp->f_count;
+ xf.xf_msgcount = 0;
+ xf.xf_offset = foffset_get(fp);
+ xf.xf_flag = fp->f_flag;
+ error = SYSCTL_OUT(req, &xf, sizeof(xf));
+ if (error)
+ break;
+ }
+ FILEDESC_SUNLOCK(fdp);
+ fddrop(fdp);
+ if (error)
+ break;
+ }
+ sx_sunlock(&allproc_lock);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
+
+#ifdef KINFO_OFILE_SIZE
+CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
+#endif
+
+#ifdef COMPAT_FREEBSD7
+static int
+export_vnode_for_osysctl(struct vnode *vp, int type,
+ struct kinfo_ofile *kif, struct filedesc *fdp, struct sysctl_req *req)
+{
+ int error;
+ char *fullpath, *freepath;
+
+ bzero(kif, sizeof(*kif));
+ kif->kf_structsize = sizeof(*kif);
+
+ vref(vp);
+ kif->kf_fd = type;
+ kif->kf_type = KF_TYPE_VNODE;
+ /* This function only handles directories. */
+ if (vp->v_type != VDIR) {
+ vrele(vp);
+ return (ENOTDIR);
+ }
+ kif->kf_vnode_type = KF_VTYPE_VDIR;
+
+ /*
+ * This is not a true file descriptor, so we set a bogus refcount
+ * and offset to indicate these fields should be ignored.
+ */
+ kif->kf_ref_count = -1;
+ kif->kf_offset = -1;
+
+ freepath = NULL;
+ fullpath = "-";
+ FILEDESC_SUNLOCK(fdp);
+ vn_fullpath(curthread, vp, &fullpath, &freepath);
+ vrele(vp);
+ strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
+ if (freepath != NULL)
+ free(freepath, M_TEMP);
+ error = SYSCTL_OUT(req, kif, sizeof(*kif));
+ FILEDESC_SLOCK(fdp);
+ return (error);
+}
+
+/*
+ * Get per-process file descriptors for use by procstat(1), et al.
+ */
+static int
+sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
+{
+ char *fullpath, *freepath;
+ struct kinfo_ofile *kif;
+ struct filedesc *fdp;
+ int error, i, *name;
+ struct shmfd *shmfd;
+ struct socket *so;
+ struct vnode *vp;
+ struct ksem *ks;
+ struct file *fp;
+ struct proc *p;
+ struct tty *tp;
+
+ name = (int *)arg1;
+ error = pget((pid_t)name[0], PGET_CANDEBUG, &p);
+ if (error != 0)
+ return (error);
+ fdp = fdhold(p);
+ PROC_UNLOCK(p);
+ if (fdp == NULL)
+ return (ENOENT);
+ kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
+ FILEDESC_SLOCK(fdp);
+ if (fdp->fd_cdir != NULL)
+ export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
+ fdp, req);
+ if (fdp->fd_rdir != NULL)
+ export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
+ fdp, req);
+ if (fdp->fd_jdir != NULL)
+ export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
+ fdp, req);
+ for (i = 0; i < fdp->fd_nfiles; i++) {
+ if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
+ continue;
+ bzero(kif, sizeof(*kif));
+ kif->kf_structsize = sizeof(*kif);
+ ks = NULL;
+ vp = NULL;
+ so = NULL;
+ tp = NULL;
+ shmfd = NULL;
+ kif->kf_fd = i;
+
+ switch (fp->f_type) {
+ case DTYPE_VNODE:
+ kif->kf_type = KF_TYPE_VNODE;
+ vp = fp->f_vnode;
+ break;
+
+ case DTYPE_SOCKET:
+ kif->kf_type = KF_TYPE_SOCKET;
+ so = fp->f_data;
+ break;
+
+ case DTYPE_PIPE:
+ kif->kf_type = KF_TYPE_PIPE;
+ break;
+
+ case DTYPE_FIFO:
+ kif->kf_type = KF_TYPE_FIFO;
+ vp = fp->f_vnode;
+ break;
+
+ case DTYPE_KQUEUE:
+ kif->kf_type = KF_TYPE_KQUEUE;
+ break;
+
+ case DTYPE_CRYPTO:
+ kif->kf_type = KF_TYPE_CRYPTO;
+ break;
+
+ case DTYPE_MQUEUE:
+ kif->kf_type = KF_TYPE_MQUEUE;
+ break;
+
+ case DTYPE_SHM:
+ kif->kf_type = KF_TYPE_SHM;
+ shmfd = fp->f_data;
+ break;
+
+ case DTYPE_SEM:
+ kif->kf_type = KF_TYPE_SEM;
+ ks = fp->f_data;
+ break;
+
+ case DTYPE_PTS:
+ kif->kf_type = KF_TYPE_PTS;
+ tp = fp->f_data;
+ break;
+
+#ifdef PROCDESC
+ case DTYPE_PROCDESC:
+ kif->kf_type = KF_TYPE_PROCDESC;
+ break;
+#endif
+
+ default:
+ kif->kf_type = KF_TYPE_UNKNOWN;
+ break;
+ }
+ kif->kf_ref_count = fp->f_count;
+ if (fp->f_flag & FREAD)
+ kif->kf_flags |= KF_FLAG_READ;
+ if (fp->f_flag & FWRITE)
+ kif->kf_flags |= KF_FLAG_WRITE;
+ if (fp->f_flag & FAPPEND)
+ kif->kf_flags |= KF_FLAG_APPEND;
+ if (fp->f_flag & FASYNC)
+ kif->kf_flags |= KF_FLAG_ASYNC;
+ if (fp->f_flag & FFSYNC)
+ kif->kf_flags |= KF_FLAG_FSYNC;
+ if (fp->f_flag & FNONBLOCK)
+ kif->kf_flags |= KF_FLAG_NONBLOCK;
+ if (fp->f_flag & O_DIRECT)
+ kif->kf_flags |= KF_FLAG_DIRECT;
+ if (fp->f_flag & FHASLOCK)
+ kif->kf_flags |= KF_FLAG_HASLOCK;
+ kif->kf_offset = foffset_get(fp);
+ if (vp != NULL) {
+ vref(vp);
+ switch (vp->v_type) {
+ case VNON:
+ kif->kf_vnode_type = KF_VTYPE_VNON;
+ break;
+ case VREG:
+ kif->kf_vnode_type = KF_VTYPE_VREG;
+ break;
+ case VDIR:
+ kif->kf_vnode_type = KF_VTYPE_VDIR;
+ break;
+ case VBLK:
+ kif->kf_vnode_type = KF_VTYPE_VBLK;
+ break;
+ case VCHR:
+ kif->kf_vnode_type = KF_VTYPE_VCHR;
+ break;
+ case VLNK:
+ kif->kf_vnode_type = KF_VTYPE_VLNK;
+ break;
+ case VSOCK:
+ kif->kf_vnode_type = KF_VTYPE_VSOCK;
+ break;
+ case VFIFO:
+ kif->kf_vnode_type = KF_VTYPE_VFIFO;
+ break;
+ case VBAD:
+ kif->kf_vnode_type = KF_VTYPE_VBAD;
+ break;
+ default:
+ kif->kf_vnode_type = KF_VTYPE_UNKNOWN;
+ break;
+ }
+ /*
+ * It is OK to drop the filedesc lock here as we will
+ * re-validate and re-evaluate its properties when
+ * the loop continues.
+ */
+ freepath = NULL;
+ fullpath = "-";
+ FILEDESC_SUNLOCK(fdp);
+ vn_fullpath(curthread, vp, &fullpath, &freepath);
+ vrele(vp);
+ strlcpy(kif->kf_path, fullpath,
+ sizeof(kif->kf_path));
+ if (freepath != NULL)
+ free(freepath, M_TEMP);
+ FILEDESC_SLOCK(fdp);
+ }
+ if (so != NULL) {
+ struct sockaddr *sa;
+
+ if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa)
+ == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
+ bcopy(sa, &kif->kf_sa_local, sa->sa_len);
+ free(sa, M_SONAME);
+ }
+ if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa)
+ == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
+ bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
+ free(sa, M_SONAME);
+ }
+ kif->kf_sock_domain =
+ so->so_proto->pr_domain->dom_family;
+ kif->kf_sock_type = so->so_type;
+ kif->kf_sock_protocol = so->so_proto->pr_protocol;
+ }
+ if (tp != NULL) {
+ strlcpy(kif->kf_path, tty_devname(tp),
+ sizeof(kif->kf_path));
+ }
+ if (shmfd != NULL)
+ shm_path(shmfd, kif->kf_path, sizeof(kif->kf_path));
+ if (ks != NULL && ksem_info != NULL)
+ ksem_info(ks, kif->kf_path, sizeof(kif->kf_path), NULL);
+ error = SYSCTL_OUT(req, kif, sizeof(*kif));
+ if (error)
+ break;
+ }
+ FILEDESC_SUNLOCK(fdp);
+ fddrop(fdp);
+ free(kif, M_TEMP);
+ return (0);
+}
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD,
+ sysctl_kern_proc_ofiledesc, "Process ofiledesc entries");
+#endif /* COMPAT_FREEBSD7 */
+
+#ifdef KINFO_FILE_SIZE
+CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
+#endif
+
+struct export_fd_buf {
+ struct filedesc *fdp;
+ struct sbuf *sb;
+ ssize_t remainder;
+ struct kinfo_file kif;
+};
+
+static int
+export_fd_to_sb(void *data, int type, int fd, int fflags, int refcnt,
+ int64_t offset, cap_rights_t *rightsp, struct export_fd_buf *efbuf)
+{
+ struct {
+ int fflag;
+ int kf_fflag;
+ } fflags_table[] = {
+ { FAPPEND, KF_FLAG_APPEND },
+ { FASYNC, KF_FLAG_ASYNC },
+ { FFSYNC, KF_FLAG_FSYNC },
+ { FHASLOCK, KF_FLAG_HASLOCK },
+ { FNONBLOCK, KF_FLAG_NONBLOCK },
+ { FREAD, KF_FLAG_READ },
+ { FWRITE, KF_FLAG_WRITE },
+ { O_CREAT, KF_FLAG_CREAT },
+ { O_DIRECT, KF_FLAG_DIRECT },
+ { O_EXCL, KF_FLAG_EXCL },
+ { O_EXEC, KF_FLAG_EXEC },
+ { O_EXLOCK, KF_FLAG_EXLOCK },
+ { O_NOFOLLOW, KF_FLAG_NOFOLLOW },
+ { O_SHLOCK, KF_FLAG_SHLOCK },
+ { O_TRUNC, KF_FLAG_TRUNC }
+ };
+#define NFFLAGS (sizeof(fflags_table) / sizeof(*fflags_table))
+ struct kinfo_file *kif;
+ struct vnode *vp;
+ int error, locked;
+ unsigned int i;
+
+ if (efbuf->remainder == 0)
+ return (0);
+ kif = &efbuf->kif;
+ bzero(kif, sizeof(*kif));
+ locked = efbuf->fdp != NULL;
+ switch (type) {
+ case KF_TYPE_FIFO:
+ case KF_TYPE_VNODE:
+ if (locked) {
+ FILEDESC_SUNLOCK(efbuf->fdp);
+ locked = 0;
+ }
+ vp = (struct vnode *)data;
+ error = fill_vnode_info(vp, kif);
+ vrele(vp);
+ break;
+ case KF_TYPE_SOCKET:
+ error = fill_socket_info((struct socket *)data, kif);
+ break;
+ case KF_TYPE_PIPE:
+ error = fill_pipe_info((struct pipe *)data, kif);
+ break;
+ case KF_TYPE_PTS:
+ error = fill_pts_info((struct tty *)data, kif);
+ break;
+ case KF_TYPE_PROCDESC:
+ error = fill_procdesc_info((struct procdesc *)data, kif);
+ break;
+ case KF_TYPE_SEM:
+ error = fill_sem_info((struct file *)data, kif);
+ break;
+ case KF_TYPE_SHM:
+ error = fill_shm_info((struct file *)data, kif);
+ break;
+ default:
+ error = 0;
+ }
+ if (error == 0)
+ kif->kf_status |= KF_ATTR_VALID;
+
+ /*
+ * Translate file access flags.
+ */
+ for (i = 0; i < NFFLAGS; i++)
+ if (fflags & fflags_table[i].fflag)
+ kif->kf_flags |= fflags_table[i].kf_fflag;
+ if (rightsp != NULL)
+ kif->kf_cap_rights = *rightsp;
+ else
+ cap_rights_init(&kif->kf_cap_rights);
+ kif->kf_fd = fd;
+ kif->kf_type = type;
+ kif->kf_ref_count = refcnt;
+ kif->kf_offset = offset;
+ /* Pack record size down */
+ kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
+ strlen(kif->kf_path) + 1;
+ kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
+ if (efbuf->remainder != -1) {
+ if (efbuf->remainder < kif->kf_structsize) {
+ /* Terminate export. */
+ efbuf->remainder = 0;
+ if (efbuf->fdp != NULL && !locked)
+ FILEDESC_SLOCK(efbuf->fdp);
+ return (0);
+ }
+ efbuf->remainder -= kif->kf_structsize;
+ }
+ if (locked)
+ FILEDESC_SUNLOCK(efbuf->fdp);
+ error = sbuf_bcat(efbuf->sb, kif, kif->kf_structsize);
+ if (efbuf->fdp != NULL)
+ FILEDESC_SLOCK(efbuf->fdp);
+ return (error);
+}
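+
+/*
+ * Example of the packing above: for a kf_path of "/tmp" (four characters
+ * plus the terminating NUL), kf_structsize becomes
+ * offsetof(struct kinfo_file, kf_path) + 5, rounded up to the next multiple
+ * of sizeof(uint64_t), and consumers walk the exported buffer record by
+ * record using kf_structsize as the stride.
+ */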
+
+/*
+ * Store a process's file descriptor information in an sbuf.
+ *
+ * Takes a locked proc as argument, and returns with the proc unlocked.
+ */
+int
+kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen)
+{
+ struct file *fp;
+ struct filedesc *fdp;
+ struct export_fd_buf *efbuf;
+ struct vnode *cttyvp, *textvp, *tracevp;
+ int64_t offset;
+ void *data;
+ int error, i;
+ int type, refcnt, fflags;
+ cap_rights_t rights;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ /* ktrace vnode */
+ tracevp = p->p_tracevp;
+ if (tracevp != NULL)
+ vref(tracevp);
+ /* text vnode */
+ textvp = p->p_textvp;
+ if (textvp != NULL)
+ vref(textvp);
+ /* Controlling tty. */
+ cttyvp = NULL;
+ if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
+ cttyvp = p->p_pgrp->pg_session->s_ttyvp;
+ if (cttyvp != NULL)
+ vref(cttyvp);
+ }
+ fdp = fdhold(p);
+ PROC_UNLOCK(p);
+ efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
+ efbuf->fdp = NULL;
+ efbuf->sb = sb;
+ efbuf->remainder = maxlen;
+ if (tracevp != NULL)
+ export_fd_to_sb(tracevp, KF_TYPE_VNODE, KF_FD_TYPE_TRACE,
+ FREAD | FWRITE, -1, -1, NULL, efbuf);
+ if (textvp != NULL)
+ export_fd_to_sb(textvp, KF_TYPE_VNODE, KF_FD_TYPE_TEXT,
+ FREAD, -1, -1, NULL, efbuf);
+ if (cttyvp != NULL)
+ export_fd_to_sb(cttyvp, KF_TYPE_VNODE, KF_FD_TYPE_CTTY,
+ FREAD | FWRITE, -1, -1, NULL, efbuf);
+ error = 0;
+ if (fdp == NULL)
+ goto fail;
+ efbuf->fdp = fdp;
+ FILEDESC_SLOCK(fdp);
+ /* working directory */
+ if (fdp->fd_cdir != NULL) {
+ vref(fdp->fd_cdir);
+ data = fdp->fd_cdir;
+ export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_CWD,
+ FREAD, -1, -1, NULL, efbuf);
+ }
+ /* root directory */
+ if (fdp->fd_rdir != NULL) {
+ vref(fdp->fd_rdir);
+ data = fdp->fd_rdir;
+ export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_ROOT,
+ FREAD, -1, -1, NULL, efbuf);
+ }
+ /* jail directory */
+ if (fdp->fd_jdir != NULL) {
+ vref(fdp->fd_jdir);
+ data = fdp->fd_jdir;
+ export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_JAIL,
+ FREAD, -1, -1, NULL, efbuf);
+ }
+ for (i = 0; i < fdp->fd_nfiles; i++) {
+ if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
+ continue;
+ data = NULL;
+#ifdef CAPABILITIES
+ rights = *cap_rights(fdp, i);
+#else /* !CAPABILITIES */
+ cap_rights_init(&rights);
+#endif
+ switch (fp->f_type) {
+ case DTYPE_VNODE:
+ type = KF_TYPE_VNODE;
+ vref(fp->f_vnode);
+ data = fp->f_vnode;
+ break;
+
+ case DTYPE_SOCKET:
+ type = KF_TYPE_SOCKET;
+ data = fp->f_data;
+ break;
+
+ case DTYPE_PIPE:
+ type = KF_TYPE_PIPE;
+ data = fp->f_data;
+ break;
+
+ case DTYPE_FIFO:
+ type = KF_TYPE_FIFO;
+ vref(fp->f_vnode);
+ data = fp->f_vnode;
+ break;
+
+ case DTYPE_KQUEUE:
+ type = KF_TYPE_KQUEUE;
+ break;
+
+ case DTYPE_CRYPTO:
+ type = KF_TYPE_CRYPTO;
+ break;
+
+ case DTYPE_MQUEUE:
+ type = KF_TYPE_MQUEUE;
+ break;
+
+ case DTYPE_SHM:
+ type = KF_TYPE_SHM;
+ data = fp;
+ break;
+
+ case DTYPE_SEM:
+ type = KF_TYPE_SEM;
+ data = fp;
+ break;
+
+ case DTYPE_PTS:
+ type = KF_TYPE_PTS;
+ data = fp->f_data;
+ break;
+
+#ifdef PROCDESC
+ case DTYPE_PROCDESC:
+ type = KF_TYPE_PROCDESC;
+ data = fp->f_data;
+ break;
+#endif
+
+ default:
+ type = KF_TYPE_UNKNOWN;
+ break;
+ }
+ refcnt = fp->f_count;
+ fflags = fp->f_flag;
+ offset = foffset_get(fp);
+
+ /*
+ * Create sysctl entry.
+ * It is OK to drop the filedesc lock here as we will
+ * re-validate and re-evaluate its properties when
+ * the loop continues.
+ */
+ error = export_fd_to_sb(data, type, i, fflags, refcnt,
+ offset, &rights, efbuf);
+ if (error != 0)
+ break;
+ }
+ FILEDESC_SUNLOCK(fdp);
+ fddrop(fdp);
+fail:
+ free(efbuf, M_TEMP);
+ return (error);
+}
+
+#define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5)
+
+/*
+ * Get per-process file descriptors for use by procstat(1), et al.
+ */
+static int
+sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf sb;
+ struct proc *p;
+ ssize_t maxlen;
+ int error, error2, *name;
+
+ name = (int *)arg1;
+
+ sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
+ error = pget((pid_t)name[0], PGET_CANDEBUG, &p);
+ if (error != 0) {
+ sbuf_delete(&sb);
+ return (error);
+ }
+ maxlen = req->oldptr != NULL ? req->oldlen : -1;
+ error = kern_proc_filedesc_out(p, &sb, maxlen);
+ error2 = sbuf_finish(&sb);
+ sbuf_delete(&sb);
+ return (error != 0 ? error : error2);
+}
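+
+/*
+ * Example consumer: procstat(1) reads these records through the
+ * kern.proc.filedesc sysctl, e.g. "procstat -f <pid>" lists the open
+ * descriptors of a single process.
+ */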
+
+int
+vntype_to_kinfo(int vtype)
+{
+ struct {
+ int vtype;
+ int kf_vtype;
+ } vtypes_table[] = {
+ { VBAD, KF_VTYPE_VBAD },
+ { VBLK, KF_VTYPE_VBLK },
+ { VCHR, KF_VTYPE_VCHR },
+ { VDIR, KF_VTYPE_VDIR },
+ { VFIFO, KF_VTYPE_VFIFO },
+ { VLNK, KF_VTYPE_VLNK },
+ { VNON, KF_VTYPE_VNON },
+ { VREG, KF_VTYPE_VREG },
+ { VSOCK, KF_VTYPE_VSOCK }
+ };
+#define NVTYPES (sizeof(vtypes_table) / sizeof(*vtypes_table))
+ unsigned int i;
+
+ /*
+ * Perform vtype translation.
+ */
+ for (i = 0; i < NVTYPES; i++)
+ if (vtypes_table[i].vtype == vtype)
+ break;
+ if (i < NVTYPES)
+ return (vtypes_table[i].kf_vtype);
+
+ return (KF_VTYPE_UNKNOWN);
+}
+
+static int
+fill_vnode_info(struct vnode *vp, struct kinfo_file *kif)
+{
+ struct vattr va;
+ char *fullpath, *freepath;
+ int error;
+
+ if (vp == NULL)
+ return (1);
+ kif->kf_vnode_type = vntype_to_kinfo(vp->v_type);
+ freepath = NULL;
+ fullpath = "-";
+ error = vn_fullpath(curthread, vp, &fullpath, &freepath);
+ if (error == 0) {
+ strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
+ }
+ if (freepath != NULL)
+ free(freepath, M_TEMP);
+
+ /*
+ * Retrieve vnode attributes.
+ */
+ va.va_fsid = VNOVAL;
+ va.va_rdev = NODEV;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &va, curthread->td_ucred);
+ VOP_UNLOCK(vp, 0);
+ if (error != 0)
+ return (error);
+ if (va.va_fsid != VNOVAL)
+ kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
+ else
+ kif->kf_un.kf_file.kf_file_fsid =
+ vp->v_mount->mnt_stat.f_fsid.val[0];
+ kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
+ kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
+ kif->kf_un.kf_file.kf_file_size = va.va_size;
+ kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
+ return (0);
+}
+
+static int
+fill_socket_info(struct socket *so, struct kinfo_file *kif)
+{
+ struct sockaddr *sa;
+ struct inpcb *inpcb;
+ struct unpcb *unpcb;
+ int error;
+
+ if (so == NULL)
+ return (1);
+ kif->kf_sock_domain = so->so_proto->pr_domain->dom_family;
+ kif->kf_sock_type = so->so_type;
+ kif->kf_sock_protocol = so->so_proto->pr_protocol;
+ kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb;
+	switch (kif->kf_sock_domain) {
+ case AF_INET:
+ case AF_INET6:
+ if (kif->kf_sock_protocol == IPPROTO_TCP) {
+ if (so->so_pcb != NULL) {
+ inpcb = (struct inpcb *)(so->so_pcb);
+ kif->kf_un.kf_sock.kf_sock_inpcb =
+ (uintptr_t)inpcb->inp_ppcb;
+ }
+ }
+ break;
+ case AF_UNIX:
+ if (so->so_pcb != NULL) {
+ unpcb = (struct unpcb *)(so->so_pcb);
+ if (unpcb->unp_conn) {
+ kif->kf_un.kf_sock.kf_sock_unpconn =
+ (uintptr_t)unpcb->unp_conn;
+ kif->kf_un.kf_sock.kf_sock_rcv_sb_state =
+ so->so_rcv.sb_state;
+ kif->kf_un.kf_sock.kf_sock_snd_sb_state =
+ so->so_snd.sb_state;
+ }
+ }
+ break;
+ }
+ error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
+ if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
+ bcopy(sa, &kif->kf_sa_local, sa->sa_len);
+ free(sa, M_SONAME);
+ }
+ error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa);
+ if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
+ bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
+ free(sa, M_SONAME);
+ }
+ strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name,
+ sizeof(kif->kf_path));
+ return (0);
+}
+
+static int
+fill_pts_info(struct tty *tp, struct kinfo_file *kif)
+{
+
+ if (tp == NULL)
+ return (1);
+ kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp);
+ strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path));
+ return (0);
+}
+
+static int
+fill_pipe_info(struct pipe *pi, struct kinfo_file *kif)
+{
+
+ if (pi == NULL)
+ return (1);
+ kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi;
+ kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer;
+ kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt;
+ return (0);
+}
+
+static int
+fill_procdesc_info(struct procdesc *pdp, struct kinfo_file *kif)
+{
+
+ if (pdp == NULL)
+ return (1);
+ kif->kf_un.kf_proc.kf_pid = pdp->pd_pid;
+ return (0);
+}
+
+static int
+fill_sem_info(struct file *fp, struct kinfo_file *kif)
+{
+ struct thread *td;
+ struct stat sb;
+
+ td = curthread;
+ if (fp->f_data == NULL)
+ return (1);
+ if (fo_stat(fp, &sb, td->td_ucred, td) != 0)
+ return (1);
+ if (ksem_info == NULL)
+ return (1);
+ ksem_info(fp->f_data, kif->kf_path, sizeof(kif->kf_path),
+ &kif->kf_un.kf_sem.kf_sem_value);
+ kif->kf_un.kf_sem.kf_sem_mode = sb.st_mode;
+ return (0);
+}
+
+static int
+fill_shm_info(struct file *fp, struct kinfo_file *kif)
+{
+ struct thread *td;
+ struct stat sb;
+
+ td = curthread;
+ if (fp->f_data == NULL)
+ return (1);
+ if (fo_stat(fp, &sb, td->td_ucred, td) != 0)
+ return (1);
+ shm_path(fp->f_data, kif->kf_path, sizeof(kif->kf_path));
+ kif->kf_un.kf_file.kf_file_mode = sb.st_mode;
+ kif->kf_un.kf_file.kf_file_size = sb.st_size;
+ return (0);
+}
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD,
+ sysctl_kern_proc_filedesc, "Process filedesc entries");
+
+#ifdef DDB
+/*
+ * For the purposes of debugging, generate a human-readable string for the
+ * file type.
+ */
+static const char *
+file_type_to_name(short type)
+{
+
+ switch (type) {
+ case 0:
+ return ("zero");
+ case DTYPE_VNODE:
+ return ("vnod");
+ case DTYPE_SOCKET:
+ return ("sock");
+ case DTYPE_PIPE:
+ return ("pipe");
+ case DTYPE_FIFO:
+ return ("fifo");
+ case DTYPE_KQUEUE:
+ return ("kque");
+ case DTYPE_CRYPTO:
+ return ("crpt");
+ case DTYPE_MQUEUE:
+ return ("mque");
+ case DTYPE_SHM:
+ return ("shm");
+ case DTYPE_SEM:
+ return ("ksem");
+ default:
+ return ("unkn");
+ }
+}
+
+/*
+ * For the purposes of debugging, identify a process (if any, perhaps one of
+ * many) that references the passed file in its file descriptor array. Return
+ * NULL if none.
+ */
+static struct proc *
+file_to_first_proc(struct file *fp)
+{
+ struct filedesc *fdp;
+ struct proc *p;
+ int n;
+
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_state == PRS_NEW)
+ continue;
+ fdp = p->p_fd;
+ if (fdp == NULL)
+ continue;
+ for (n = 0; n < fdp->fd_nfiles; n++) {
+ if (fp == fdp->fd_ofiles[n].fde_file)
+ return (p);
+ }
+ }
+ return (NULL);
+}
+
+static void
+db_print_file(struct file *fp, int header)
+{
+ struct proc *p;
+
+ if (header)
+ db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
+ "File", "Type", "Data", "Flag", "GCFl", "Count",
+ "MCount", "Vnode", "FPID", "FCmd");
+ p = file_to_first_proc(fp);
+ db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
+ file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
+ 0, fp->f_count, 0, fp->f_vnode,
+ p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
+}
+
+DB_SHOW_COMMAND(file, db_show_file)
+{
+ struct file *fp;
+
+ if (!have_addr) {
+ db_printf("usage: show file <addr>\n");
+ return;
+ }
+ fp = (struct file *)addr;
+ db_print_file(fp, 1);
+}
+
+DB_SHOW_COMMAND(files, db_show_files)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ struct proc *p;
+ int header;
+ int n;
+
+ header = 1;
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (p->p_state == PRS_NEW)
+ continue;
+ if ((fdp = p->p_fd) == NULL)
+ continue;
+ for (n = 0; n < fdp->fd_nfiles; ++n) {
+ if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
+ continue;
+ db_print_file(fp, header);
+ header = 0;
+ }
+ }
+}
+#endif
+
+SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
+ &maxfilesperproc, 0, "Maximum files allowed open per process");
+
+SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
+ &maxfiles, 0, "Maximum number of files");
+
+SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
+ __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
+
+/* ARGSUSED*/
+static void
+filelistinit(void *dummy)
+{
+
+ file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
+ mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
+}
+SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
+
+/*-------------------------------------------------------------------*/
+
+static int
+badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+static int
+badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (0);
+}
+
+static int
+badfo_kqfilter(struct file *fp, struct knote *kn)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_close(struct file *fp, struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EBADF);
+}
+
+static int
+badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
+ struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
+ int kflags, struct thread *td)
+{
+
+ return (EBADF);
+}
+
+struct fileops badfileops = {
+ .fo_read = badfo_readwrite,
+ .fo_write = badfo_readwrite,
+ .fo_truncate = badfo_truncate,
+ .fo_ioctl = badfo_ioctl,
+ .fo_poll = badfo_poll,
+ .fo_kqfilter = badfo_kqfilter,
+ .fo_stat = badfo_stat,
+ .fo_close = badfo_close,
+ .fo_chmod = badfo_chmod,
+ .fo_chown = badfo_chown,
+ .fo_sendfile = badfo_sendfile,
+};
+
+int
+invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+int
+invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+int
+invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
+ struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
+ int kflags, struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+/*-------------------------------------------------------------------*/
+
+/*
+ * File Descriptor pseudo-device driver (/dev/fd/).
+ *
+ * Opening minor device N dup()s the file (if any) connected to file
+ * descriptor N belonging to the calling process. Note that this driver
+ * consists of only the ``open()'' routine, because all subsequent
+ * references to this file will go directly to the other driver.
+ *
+ * XXX: we could give this one a cloning event handler if necessary.
+ */
+
+/* ARGSUSED */
+static int
+fdopen(struct cdev *dev, int mode, int type, struct thread *td)
+{
+
+ /*
+	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
+	 * file descriptor being sought for duplication. The error
+	 * return ensures that the vnode for this device will be released
+	 * by vn_open. Open will detect this special error and take the
+	 * actions in dupfdopen(). Other callers of vn_open or VOP_OPEN
+ * will simply report the error.
+ */
+ td->td_dupfd = dev2unit(dev);
+ return (ENODEV);
+}
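+
+/*
+ * Illustrative flow: open("/dev/fd/3") reaches fdopen(), which records 3 in
+ * td_dupfd and fails with ENODEV; the open path detects this and calls
+ * dupfdopen() (defined earlier in this file) to dup descriptor 3 into the
+ * newly allocated slot. From a shell this is the familiar
+ * "cat /dev/fd/0 < file" idiom.
+ */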
+
+static struct cdevsw fildesc_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = fdopen,
+ .d_name = "FD",
+};
+
+static void
+fildesc_drvinit(void *unused)
+{
+ struct cdev *dev;
+
+ dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
+ UID_ROOT, GID_WHEEL, 0666, "fd/0");
+ make_dev_alias(dev, "stdin");
+ dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
+ UID_ROOT, GID_WHEEL, 0666, "fd/1");
+ make_dev_alias(dev, "stdout");
+ dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
+ UID_ROOT, GID_WHEEL, 0666, "fd/2");
+ make_dev_alias(dev, "stderr");
+}
+
+SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
diff --git a/sys/kern/kern_dtrace.c b/sys/kern/kern_dtrace.c
new file mode 100644
index 0000000..5582fb9
--- /dev/null
+++ b/sys/kern/kern_dtrace.c
@@ -0,0 +1,117 @@
+/*-
+ * Copyright (c) 2007-2008 John Birrell <jb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kdb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/dtrace_bsd.h>
+#include <sys/sysctl.h>
+
+#define KDTRACE_PROC_SIZE 64
+#define KDTRACE_THREAD_SIZE 256
+
+FEATURE(kdtrace_hooks,
+ "Kernel DTrace hooks which are required to load DTrace kernel modules");
+
+static MALLOC_DEFINE(M_KDTRACE, "kdtrace", "DTrace hooks");
+
+/* Return the DTrace process data size compiled in the kernel hooks. */
+size_t
+kdtrace_proc_size()
+{
+
+ return (KDTRACE_PROC_SIZE);
+}
+
+static void
+kdtrace_proc_ctor(void *arg __unused, struct proc *p)
+{
+
+ p->p_dtrace = malloc(KDTRACE_PROC_SIZE, M_KDTRACE, M_WAITOK|M_ZERO);
+}
+
+static void
+kdtrace_proc_dtor(void *arg __unused, struct proc *p)
+{
+
+ if (p->p_dtrace != NULL) {
+ free(p->p_dtrace, M_KDTRACE);
+ p->p_dtrace = NULL;
+ }
+}
+
+/* Return the DTrace thread data size compiled in the kernel hooks. */
+size_t
+kdtrace_thread_size()
+{
+
+ return (KDTRACE_THREAD_SIZE);
+}
+
+static void
+kdtrace_thread_ctor(void *arg __unused, struct thread *td)
+{
+
+ td->td_dtrace = malloc(KDTRACE_THREAD_SIZE, M_KDTRACE, M_WAITOK|M_ZERO);
+}
+
+static void
+kdtrace_thread_dtor(void *arg __unused, struct thread *td)
+{
+
+ if (td->td_dtrace != NULL) {
+ free(td->td_dtrace, M_KDTRACE);
+ td->td_dtrace = NULL;
+ }
+}
+
+/*
+ * Initialise the kernel DTrace hooks.
+ */
+static void
+init_dtrace(void *dummy __unused)
+{
+
+ EVENTHANDLER_REGISTER(process_ctor, kdtrace_proc_ctor, NULL,
+ EVENTHANDLER_PRI_ANY);
+ EVENTHANDLER_REGISTER(process_dtor, kdtrace_proc_dtor, NULL,
+ EVENTHANDLER_PRI_ANY);
+ EVENTHANDLER_REGISTER(thread_ctor, kdtrace_thread_ctor, NULL,
+ EVENTHANDLER_PRI_ANY);
+ EVENTHANDLER_REGISTER(thread_dtor, kdtrace_thread_dtor, NULL,
+ EVENTHANDLER_PRI_ANY);
+}
+
+SYSINIT(kdtrace, SI_SUB_KDTRACE, SI_ORDER_FIRST, init_dtrace, NULL);
diff --git a/sys/kern/kern_environment.c b/sys/kern/kern_environment.c
new file mode 100644
index 0000000..e89b3f7
--- /dev/null
+++ b/sys/kern/kern_environment.c
@@ -0,0 +1,626 @@
+/*-
+ * Copyright (c) 1998 Michael Smith
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The unified bootloader passes us a pointer to a preserved copy of
+ * bootstrap/kernel environment variables. We convert them to a
+ * dynamic array of strings later when the VM subsystem is up.
+ *
+ * We make these available through the kenv(2) syscall for userland
+ * and through getenv(), freeenv(), setenv(), unsetenv(), and testenv()
+ * for the kernel.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/libkern.h>
+#include <sys/kenv.h>
+
+#include <security/mac/mac_framework.h>
+
+static MALLOC_DEFINE(M_KENV, "kenv", "kernel environment");
+
+#define KENV_SIZE 512 /* Maximum number of environment strings */
+
+/* pointer to the static environment */
+char *kern_envp;
+static int env_len;
+static int env_pos;
+static char *kernenv_next(char *);
+
+/* dynamic environment variables */
+char **kenvp;
+struct mtx kenv_lock;
+
+/*
+ * No need to protect this with a mutex since SYSINITs are single-threaded.
+ */
+int dynamic_kenv = 0;
+
+#define KENV_CHECK if (!dynamic_kenv) \
+ panic("%s: called before SI_SUB_KMEM", __func__)
+
+int
+sys_kenv(struct thread *td, struct kenv_args *uap)
+{
+	/* uap: { int what; const char *name; char *value; int len; } */
+ char *name, *value, *buffer = NULL;
+ size_t len, done, needed, buflen;
+ int error, i;
+
+ KASSERT(dynamic_kenv, ("kenv: dynamic_kenv = 0"));
+
+ error = 0;
+ if (uap->what == KENV_DUMP) {
+#ifdef MAC
+ error = mac_kenv_check_dump(td->td_ucred);
+ if (error)
+ return (error);
+#endif
+ done = needed = 0;
+ buflen = uap->len;
+ if (buflen > KENV_SIZE * (KENV_MNAMELEN + KENV_MVALLEN + 2))
+ buflen = KENV_SIZE * (KENV_MNAMELEN +
+ KENV_MVALLEN + 2);
+ if (uap->len > 0 && uap->value != NULL)
+ buffer = malloc(buflen, M_TEMP, M_WAITOK|M_ZERO);
+ mtx_lock(&kenv_lock);
+ for (i = 0; kenvp[i] != NULL; i++) {
+ len = strlen(kenvp[i]) + 1;
+ needed += len;
+ len = min(len, buflen - done);
+ /*
+ * If called with a NULL or insufficiently large
+ * buffer, just keep computing the required size.
+ */
+ if (uap->value != NULL && buffer != NULL && len > 0) {
+ bcopy(kenvp[i], buffer + done, len);
+ done += len;
+ }
+ }
+ mtx_unlock(&kenv_lock);
+ if (buffer != NULL) {
+ error = copyout(buffer, uap->value, done);
+ free(buffer, M_TEMP);
+ }
+ td->td_retval[0] = ((done == needed) ? 0 : needed);
+ return (error);
+ }
+
+ switch (uap->what) {
+ case KENV_SET:
+ error = priv_check(td, PRIV_KENV_SET);
+ if (error)
+ return (error);
+ break;
+
+ case KENV_UNSET:
+ error = priv_check(td, PRIV_KENV_UNSET);
+ if (error)
+ return (error);
+ break;
+ }
+
+ name = malloc(KENV_MNAMELEN + 1, M_TEMP, M_WAITOK);
+
+ error = copyinstr(uap->name, name, KENV_MNAMELEN + 1, NULL);
+ if (error)
+ goto done;
+
+ switch (uap->what) {
+ case KENV_GET:
+#ifdef MAC
+ error = mac_kenv_check_get(td->td_ucred, name);
+ if (error)
+ goto done;
+#endif
+ value = getenv(name);
+ if (value == NULL) {
+ error = ENOENT;
+ goto done;
+ }
+ len = strlen(value) + 1;
+ if (len > uap->len)
+ len = uap->len;
+ error = copyout(value, uap->value, len);
+ freeenv(value);
+ if (error)
+ goto done;
+ td->td_retval[0] = len;
+ break;
+ case KENV_SET:
+ len = uap->len;
+ if (len < 1) {
+ error = EINVAL;
+ goto done;
+ }
+ if (len > KENV_MVALLEN + 1)
+ len = KENV_MVALLEN + 1;
+ value = malloc(len, M_TEMP, M_WAITOK);
+ error = copyinstr(uap->value, value, len, NULL);
+ if (error) {
+ free(value, M_TEMP);
+ goto done;
+ }
+#ifdef MAC
+ error = mac_kenv_check_set(td->td_ucred, name, value);
+ if (error == 0)
+#endif
+ setenv(name, value);
+ free(value, M_TEMP);
+ break;
+ case KENV_UNSET:
+#ifdef MAC
+ error = mac_kenv_check_unset(td->td_ucred, name);
+ if (error)
+ goto done;
+#endif
+ error = unsetenv(name);
+ if (error)
+ error = ENOENT;
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+done:
+ free(name, M_TEMP);
+ return (error);
+}
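+
+/*
+ * Illustrative userland usage of kenv(2); the variable name is only an
+ * example:
+ *
+ *	char buf[128];
+ *	int len;
+ *
+ *	len = kenv(KENV_GET, "smbios.system.product", buf, sizeof(buf));
+ *	if (len == -1)
+ *		warn("kenv");
+ */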
+
+void
+init_static_kenv(char *buf, size_t len)
+{
+ kern_envp = buf;
+ env_len = len;
+ env_pos = 0;
+}
+
+/*
+ * Set up the dynamic kernel environment.
+ */
+static void
+init_dynamic_kenv(void *data __unused)
+{
+ char *cp;
+ size_t len;
+ int i;
+
+ kenvp = malloc((KENV_SIZE + 1) * sizeof(char *), M_KENV,
+ M_WAITOK | M_ZERO);
+ i = 0;
+ if (kern_envp && *kern_envp != '\0') {
+ for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
+ len = strlen(cp) + 1;
+ if (len > KENV_MNAMELEN + 1 + KENV_MVALLEN + 1) {
+ printf(
+ "WARNING: too long kenv string, ignoring %s\n",
+ cp);
+ continue;
+ }
+ if (i < KENV_SIZE) {
+ kenvp[i] = malloc(len, M_KENV, M_WAITOK);
+ strcpy(kenvp[i++], cp);
+ } else
+ printf(
+ "WARNING: too many kenv strings, ignoring %s\n",
+ cp);
+ }
+ }
+ kenvp[i] = NULL;
+
+ mtx_init(&kenv_lock, "kernel environment", NULL, MTX_DEF);
+ dynamic_kenv = 1;
+}
+SYSINIT(kenv, SI_SUB_KMEM, SI_ORDER_ANY, init_dynamic_kenv, NULL);
+
+void
+freeenv(char *env)
+{
+
+ if (dynamic_kenv)
+ free(env, M_KENV);
+}
+
+/*
+ * Internal functions for string lookup.
+ */
+static char *
+_getenv_dynamic(const char *name, int *idx)
+{
+ char *cp;
+ int len, i;
+
+ mtx_assert(&kenv_lock, MA_OWNED);
+ len = strlen(name);
+ for (cp = kenvp[0], i = 0; cp != NULL; cp = kenvp[++i]) {
+ if ((strncmp(cp, name, len) == 0) &&
+ (cp[len] == '=')) {
+ if (idx != NULL)
+ *idx = i;
+ return (cp + len + 1);
+ }
+ }
+ return (NULL);
+}
+
+static char *
+_getenv_static(const char *name)
+{
+ char *cp, *ep;
+ int len;
+
+ for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
+ for (ep = cp; (*ep != '=') && (*ep != 0); ep++)
+ ;
+ if (*ep != '=')
+ continue;
+ len = ep - cp;
+ ep++;
+ if (!strncmp(name, cp, len) && name[len] == 0)
+ return (ep);
+ }
+ return (NULL);
+}
+
+/*
+ * Look up an environment variable by name. Return a pointer to the value
+ * string if found; the returned pointer must be freed with freeenv() after
+ * use.
+ */
+char *
+getenv(const char *name)
+{
+ char buf[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1];
+ char *ret, *cp;
+ int len;
+
+ if (dynamic_kenv) {
+ mtx_lock(&kenv_lock);
+ cp = _getenv_dynamic(name, NULL);
+ if (cp != NULL) {
+ strcpy(buf, cp);
+ mtx_unlock(&kenv_lock);
+ len = strlen(buf) + 1;
+ ret = malloc(len, M_KENV, M_WAITOK);
+ strcpy(ret, buf);
+ } else {
+ mtx_unlock(&kenv_lock);
+ ret = NULL;
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "getenv");
+ }
+ } else
+ ret = _getenv_static(name);
+ return (ret);
+}
+
+/*
+ * Test if an environment variable is defined.
+ */
+int
+testenv(const char *name)
+{
+ char *cp;
+
+ if (dynamic_kenv) {
+ mtx_lock(&kenv_lock);
+ cp = _getenv_dynamic(name, NULL);
+ mtx_unlock(&kenv_lock);
+ } else
+ cp = _getenv_static(name);
+ if (cp != NULL)
+ return (1);
+ return (0);
+}
+
+static int
+setenv_static(const char *name, const char *value)
+{
+ int len;
+
+ if (env_pos >= env_len)
+ return (-1);
+
+ /* Check space for x=y and two nuls */
+ len = strlen(name) + strlen(value);
+ if (len + 3 < env_len - env_pos) {
+ len = sprintf(&kern_envp[env_pos], "%s=%s", name, value);
+ env_pos += len+1;
+ kern_envp[env_pos] = '\0';
+ return (0);
+ } else
+ return (-1);
+}
+
+/*
+ * Set an environment variable by name.
+ */
+int
+setenv(const char *name, const char *value)
+{
+ char *buf, *cp, *oldenv;
+ int namelen, vallen, i;
+
+ if (dynamic_kenv == 0 && env_len > 0)
+ return (setenv_static(name, value));
+
+ KENV_CHECK;
+
+ namelen = strlen(name) + 1;
+ if (namelen > KENV_MNAMELEN + 1)
+ return (-1);
+ vallen = strlen(value) + 1;
+ if (vallen > KENV_MVALLEN + 1)
+ return (-1);
+ buf = malloc(namelen + vallen, M_KENV, M_WAITOK);
+ sprintf(buf, "%s=%s", name, value);
+
+ mtx_lock(&kenv_lock);
+ cp = _getenv_dynamic(name, &i);
+ if (cp != NULL) {
+ oldenv = kenvp[i];
+ kenvp[i] = buf;
+ mtx_unlock(&kenv_lock);
+ free(oldenv, M_KENV);
+ } else {
+		/* Add the variable if it wasn't found. */
+ for (i = 0; (cp = kenvp[i]) != NULL; i++)
+ ;
+
+ /* Bounds checking */
+ if (i < 0 || i >= KENV_SIZE) {
+ free(buf, M_KENV);
+ mtx_unlock(&kenv_lock);
+ return (-1);
+ }
+
+ kenvp[i] = buf;
+ kenvp[i + 1] = NULL;
+ mtx_unlock(&kenv_lock);
+ }
+ return (0);
+}
+
+/*
+ * Unset an environment variable string.
+ */
+int
+unsetenv(const char *name)
+{
+ char *cp, *oldenv;
+ int i, j;
+
+ KENV_CHECK;
+
+ mtx_lock(&kenv_lock);
+ cp = _getenv_dynamic(name, &i);
+ if (cp != NULL) {
+ oldenv = kenvp[i];
+ for (j = i + 1; kenvp[j] != NULL; j++)
+ kenvp[i++] = kenvp[j];
+ kenvp[i] = NULL;
+ mtx_unlock(&kenv_lock);
+ free(oldenv, M_KENV);
+ return (0);
+ }
+ mtx_unlock(&kenv_lock);
+ return (-1);
+}
+
+/*
+ * Return a string value from an environment variable.
+ */
+int
+getenv_string(const char *name, char *data, int size)
+{
+ char *tmp;
+
+ tmp = getenv(name);
+ if (tmp != NULL) {
+ strlcpy(data, tmp, size);
+ freeenv(tmp);
+ return (1);
+ } else
+ return (0);
+}
+
+/*
+ * Return an integer value from an environment variable.
+ */
+int
+getenv_int(const char *name, int *data)
+{
+ quad_t tmp;
+ int rval;
+
+ rval = getenv_quad(name, &tmp);
+ if (rval)
+ *data = (int) tmp;
+ return (rval);
+}
+
+/*
+ * Return an unsigned integer value from an environment variable.
+ */
+int
+getenv_uint(const char *name, unsigned int *data)
+{
+ quad_t tmp;
+ int rval;
+
+ rval = getenv_quad(name, &tmp);
+ if (rval)
+ *data = (unsigned int) tmp;
+ return (rval);
+}
+
+/*
+ * Return a long value from an environment variable.
+ */
+int
+getenv_long(const char *name, long *data)
+{
+ quad_t tmp;
+ int rval;
+
+ rval = getenv_quad(name, &tmp);
+ if (rval)
+ *data = (long) tmp;
+ return (rval);
+}
+
+/*
+ * Return an unsigned long value from an environment variable.
+ */
+int
+getenv_ulong(const char *name, unsigned long *data)
+{
+ quad_t tmp;
+ int rval;
+
+ rval = getenv_quad(name, &tmp);
+ if (rval)
+ *data = (unsigned long) tmp;
+ return (rval);
+}
+
+/*
+ * Return a quad_t value from an environment variable.
+ */
+int
+getenv_quad(const char *name, quad_t *data)
+{
+ char *value;
+ char *vtp;
+ quad_t iv;
+
+ value = getenv(name);
+ if (value == NULL)
+ return (0);
+ iv = strtoq(value, &vtp, 0);
+ if (vtp == value || (vtp[0] != '\0' && vtp[1] != '\0')) {
+ freeenv(value);
+ return (0);
+ }
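+	/*
+	 * Each recognized suffix scales the value by successive factors of
+	 * 1024 via intentional fall-through below: e.g. "16k" yields 16384
+	 * and "8M" yields 8388608.
+	 */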
+ switch (vtp[0]) {
+ case 't': case 'T':
+ iv *= 1024;
+ case 'g': case 'G':
+ iv *= 1024;
+ case 'm': case 'M':
+ iv *= 1024;
+ case 'k': case 'K':
+ iv *= 1024;
+ case '\0':
+ break;
+ default:
+ freeenv(value);
+ return (0);
+ }
+ *data = iv;
+ freeenv(value);
+ return (1);
+}
+
+/*
+ * Find the next entry after the one within which (cp) falls; return a
+ * pointer to its start, or NULL if there are no more.
+ */
+static char *
+kernenv_next(char *cp)
+{
+
+ if (cp != NULL) {
+ while (*cp != 0)
+ cp++;
+ cp++;
+ if (*cp == 0)
+ cp = NULL;
+ }
+ return (cp);
+}
+
+void
+tunable_int_init(void *data)
+{
+ struct tunable_int *d = (struct tunable_int *)data;
+
+ TUNABLE_INT_FETCH(d->path, d->var);
+}
+
+void
+tunable_long_init(void *data)
+{
+ struct tunable_long *d = (struct tunable_long *)data;
+
+ TUNABLE_LONG_FETCH(d->path, d->var);
+}
+
+void
+tunable_ulong_init(void *data)
+{
+ struct tunable_ulong *d = (struct tunable_ulong *)data;
+
+ TUNABLE_ULONG_FETCH(d->path, d->var);
+}
+
+void
+tunable_quad_init(void *data)
+{
+ struct tunable_quad *d = (struct tunable_quad *)data;
+
+ TUNABLE_QUAD_FETCH(d->path, d->var);
+}
+
+void
+tunable_str_init(void *data)
+{
+ struct tunable_str *d = (struct tunable_str *)data;
+
+ TUNABLE_STR_FETCH(d->path, d->var, d->size);
+}
diff --git a/sys/kern/kern_et.c b/sys/kern/kern_et.c
new file mode 100644
index 0000000..d07316c
--- /dev/null
+++ b/sys/kern/kern_et.c
@@ -0,0 +1,246 @@
+/*-
+ * Copyright (c) 2010-2013 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification, immediately at the beginning of the file.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/queue.h>
+#include <sys/timeet.h>
+
+SLIST_HEAD(et_eventtimers_list, eventtimer);
+static struct et_eventtimers_list eventtimers = SLIST_HEAD_INITIALIZER(et_eventtimers);
+
+struct mtx et_eventtimers_mtx;
+MTX_SYSINIT(et_eventtimers_init, &et_eventtimers_mtx, "et_mtx", MTX_DEF);
+
+SYSCTL_NODE(_kern, OID_AUTO, eventtimer, CTLFLAG_RW, 0, "Event timers");
+static SYSCTL_NODE(_kern_eventtimer, OID_AUTO, et, CTLFLAG_RW, 0, "");
+
+/*
+ * Register new event timer hardware.
+ */
+int
+et_register(struct eventtimer *et)
+{
+ struct eventtimer *tmp, *next;
+
+ if (et->et_quality >= 0 || bootverbose) {
+ if (et->et_frequency == 0) {
+ printf("Event timer \"%s\" quality %d\n",
+ et->et_name, et->et_quality);
+ } else {
+ printf("Event timer \"%s\" "
+ "frequency %ju Hz quality %d\n",
+ et->et_name, (uintmax_t)et->et_frequency,
+ et->et_quality);
+ }
+ }
+ KASSERT(et->et_start, ("et_register: timer has no start function"));
+ et->et_sysctl = SYSCTL_ADD_NODE(NULL,
+ SYSCTL_STATIC_CHILDREN(_kern_eventtimer_et), OID_AUTO, et->et_name,
+ CTLFLAG_RW, 0, "event timer description");
+ SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(et->et_sysctl), OID_AUTO,
+ "flags", CTLFLAG_RD, &(et->et_flags), 0,
+ "Event timer capabilities");
+ SYSCTL_ADD_UQUAD(NULL, SYSCTL_CHILDREN(et->et_sysctl), OID_AUTO,
+ "frequency", CTLFLAG_RD, &(et->et_frequency),
+ "Event timer base frequency");
+ SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(et->et_sysctl), OID_AUTO,
+ "quality", CTLFLAG_RD, &(et->et_quality), 0,
+ "Goodness of event timer");
+ ET_LOCK();
+ if (SLIST_EMPTY(&eventtimers) ||
+ SLIST_FIRST(&eventtimers)->et_quality < et->et_quality) {
+ SLIST_INSERT_HEAD(&eventtimers, et, et_all);
+ } else {
+ SLIST_FOREACH(tmp, &eventtimers, et_all) {
+ next = SLIST_NEXT(tmp, et_all);
+ if (next == NULL || next->et_quality < et->et_quality) {
+ SLIST_INSERT_AFTER(tmp, et, et_all);
+ break;
+ }
+ }
+ }
+ ET_UNLOCK();
+ return (0);
+}
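+
+/*
+ * Illustrative registration from a timer driver ("sc" and mytimer_start()
+ * are hypothetical; the fields shown are the ones consumed above):
+ *
+ *	sc->et.et_name = "MYTIMER";
+ *	sc->et.et_flags = ET_FLAGS_ONESHOT | ET_FLAGS_PERIODIC;
+ *	sc->et.et_quality = 400;
+ *	sc->et.et_frequency = 1000000;
+ *	sc->et.et_start = mytimer_start;
+ *	et_register(&sc->et);
+ */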
+
+/*
+ * Deregister event timer hardware.
+ */
+int
+et_deregister(struct eventtimer *et)
+{
+ int err = 0;
+
+ if (et->et_deregister_cb != NULL) {
+ if ((err = et->et_deregister_cb(et, et->et_arg)) != 0)
+ return (err);
+ }
+
+ ET_LOCK();
+ SLIST_REMOVE(&eventtimers, et, eventtimer, et_all);
+ ET_UNLOCK();
+ sysctl_remove_oid(et->et_sysctl, 1, 1);
+ return (0);
+}
+
+/*
+ * Find free event timer hardware with specified parameters.
+ */
+struct eventtimer *
+et_find(const char *name, int check, int want)
+{
+ struct eventtimer *et = NULL;
+
+ SLIST_FOREACH(et, &eventtimers, et_all) {
+ if (et->et_active)
+ continue;
+ if (name != NULL && strcasecmp(et->et_name, name) != 0)
+ continue;
+ if (name == NULL && et->et_quality < 0)
+ continue;
+ if ((et->et_flags & check) != want)
+ continue;
+ break;
+ }
+ return (et);
+}
+
+/*
+ * Initialize event timer hardware. Set callbacks.
+ */
+int
+et_init(struct eventtimer *et, et_event_cb_t *event,
+ et_deregister_cb_t *deregister, void *arg)
+{
+
+ if (event == NULL)
+ return (EINVAL);
+ if (et->et_active)
+ return (EBUSY);
+
+ et->et_active = 1;
+ et->et_event_cb = event;
+ et->et_deregister_cb = deregister;
+ et->et_arg = arg;
+ return (0);
+}
+
+/*
+ * Start event timer hardware.
+ * first - delay before first tick.
+ * period - period of subsequent periodic ticks.
+ */
+int
+et_start(struct eventtimer *et, sbintime_t first, sbintime_t period)
+{
+
+ if (!et->et_active)
+ return (ENXIO);
+ KASSERT(period >= 0, ("et_start: negative period"));
+ KASSERT((et->et_flags & ET_FLAGS_PERIODIC) || period == 0,
+ ("et_start: period specified for oneshot-only timer"));
+ KASSERT((et->et_flags & ET_FLAGS_ONESHOT) || period != 0,
+ ("et_start: period not specified for periodic-only timer"));
+ if (period != 0) {
+ if (period < et->et_min_period)
+ period = et->et_min_period;
+ else if (period > et->et_max_period)
+ period = et->et_max_period;
+ }
+ if (period == 0 || first != 0) {
+ if (first < et->et_min_period)
+ first = et->et_min_period;
+ else if (first > et->et_max_period)
+ first = et->et_max_period;
+ }
+ return (et->et_start(et, first, period));
+}
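+
+/*
+ * Illustrative calls (SBT_1MS/SBT_1S are the sbintime_t constants from
+ * sys/time.h and hz is the system tick rate):
+ *
+ *	et_start(et, SBT_1MS, 0);		one-shot event in ~1 ms
+ *	et_start(et, SBT_1S / hz, SBT_1S / hz);	periodic, one event per tick
+ */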
+
+/* Stop event timer hardware. */
+int
+et_stop(struct eventtimer *et)
+{
+
+ if (!et->et_active)
+ return (ENXIO);
+ if (et->et_stop)
+ return (et->et_stop(et));
+ return (0);
+}
+
+/* Mark event timer hardware as broken. */
+int
+et_ban(struct eventtimer *et)
+{
+
+ et->et_flags &= ~(ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT);
+ return (0);
+}
+
+/* Free event timer hardware. */
+int
+et_free(struct eventtimer *et)
+{
+
+ if (!et->et_active)
+ return (ENXIO);
+
+ et->et_active = 0;
+ return (0);
+}
+
+/* Report the list of supported event timer hardware via sysctl. */
+static int
+sysctl_kern_eventtimer_choice(SYSCTL_HANDLER_ARGS)
+{
+ char buf[512], *spc;
+ struct eventtimer *et;
+ int error, off;
+
+ spc = "";
+ error = 0;
+ buf[0] = 0;
+ off = 0;
+ ET_LOCK();
+ SLIST_FOREACH(et, &eventtimers, et_all) {
+ off += snprintf(buf + off, sizeof(buf) - off, "%s%s(%d)",
+ spc, et->et_name, et->et_quality);
+ spc = " ";
+ }
+ ET_UNLOCK();
+ error = SYSCTL_OUT(req, buf, strlen(buf));
+ return (error);
+}
+SYSCTL_PROC(_kern_eventtimer, OID_AUTO, choice,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ 0, 0, sysctl_kern_eventtimer_choice, "A", "Present event timers");
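+
+/*
+ * The string produced above is a space-separated list of "name(quality)"
+ * entries, e.g. "LAPIC(600) HPET(450) i8254(100)" (names and qualities
+ * shown here are only illustrative), and is readable from userland via
+ * sysctl kern.eventtimer.choice.
+ */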
+
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
new file mode 100644
index 0000000..8bde25a
--- /dev/null
+++ b/sys/kern/kern_event.c
@@ -0,0 +1,2261 @@
+/*-
+ * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
+ * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
+ * Copyright (c) 2009 Apple, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/capability.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/unistd.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/fcntl.h>
+#include <sys/kthread.h>
+#include <sys/selinfo.h>
+#include <sys/stdatomic.h>
+#include <sys/queue.h>
+#include <sys/event.h>
+#include <sys/eventvar.h>
+#include <sys/poll.h>
+#include <sys/protosw.h>
+#include <sys/sigio.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/syscallsubr.h>
+#include <sys/taskqueue.h>
+#include <sys/uio.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <vm/uma.h>
+
+static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
+
+/*
+ * This lock is used if multiple kq locks are required. This possibly
+ * should be made into a per proc lock.
+ */
+static struct mtx kq_global;
+MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
+#define KQ_GLOBAL_LOCK(lck, haslck) do { \
+ if (!haslck) \
+ mtx_lock(lck); \
+ haslck = 1; \
+} while (0)
+#define KQ_GLOBAL_UNLOCK(lck, haslck) do { \
+ if (haslck) \
+ mtx_unlock(lck); \
+ haslck = 0; \
+} while (0)
+
+TASKQUEUE_DEFINE_THREAD(kqueue);
+
+static int kevent_copyout(void *arg, struct kevent *kevp, int count);
+static int kevent_copyin(void *arg, struct kevent *kevp, int count);
+static int kqueue_register(struct kqueue *kq, struct kevent *kev,
+ struct thread *td, int waitok);
+static int kqueue_acquire(struct file *fp, struct kqueue **kqp);
+static void kqueue_release(struct kqueue *kq, int locked);
+static int kqueue_expand(struct kqueue *kq, struct filterops *fops,
+ uintptr_t ident, int waitok);
+static void kqueue_task(void *arg, int pending);
+static int kqueue_scan(struct kqueue *kq, int maxevents,
+ struct kevent_copyops *k_ops,
+ const struct timespec *timeout,
+ struct kevent *keva, struct thread *td);
+static void kqueue_wakeup(struct kqueue *kq);
+static struct filterops *kqueue_fo_find(int filt);
+static void kqueue_fo_release(int filt);
+
+static fo_rdwr_t kqueue_read;
+static fo_rdwr_t kqueue_write;
+static fo_truncate_t kqueue_truncate;
+static fo_ioctl_t kqueue_ioctl;
+static fo_poll_t kqueue_poll;
+static fo_kqfilter_t kqueue_kqfilter;
+static fo_stat_t kqueue_stat;
+static fo_close_t kqueue_close;
+
+static struct fileops kqueueops = {
+ .fo_read = kqueue_read,
+ .fo_write = kqueue_write,
+ .fo_truncate = kqueue_truncate,
+ .fo_ioctl = kqueue_ioctl,
+ .fo_poll = kqueue_poll,
+ .fo_kqfilter = kqueue_kqfilter,
+ .fo_stat = kqueue_stat,
+ .fo_close = kqueue_close,
+ .fo_chmod = invfo_chmod,
+ .fo_chown = invfo_chown,
+ .fo_sendfile = invfo_sendfile,
+};
+
+static int knote_attach(struct knote *kn, struct kqueue *kq);
+static void knote_drop(struct knote *kn, struct thread *td);
+static void knote_enqueue(struct knote *kn);
+static void knote_dequeue(struct knote *kn);
+static void knote_init(void);
+static struct knote *knote_alloc(int waitok);
+static void knote_free(struct knote *kn);
+
+static void filt_kqdetach(struct knote *kn);
+static int filt_kqueue(struct knote *kn, long hint);
+static int filt_procattach(struct knote *kn);
+static void filt_procdetach(struct knote *kn);
+static int filt_proc(struct knote *kn, long hint);
+static int filt_fileattach(struct knote *kn);
+static void filt_timerexpire(void *knx);
+static int filt_timerattach(struct knote *kn);
+static void filt_timerdetach(struct knote *kn);
+static int filt_timer(struct knote *kn, long hint);
+static int filt_userattach(struct knote *kn);
+static void filt_userdetach(struct knote *kn);
+static int filt_user(struct knote *kn, long hint);
+static void filt_usertouch(struct knote *kn, struct kevent *kev,
+ u_long type);
+
+static struct filterops file_filtops = {
+ .f_isfd = 1,
+ .f_attach = filt_fileattach,
+};
+static struct filterops kqread_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_kqdetach,
+ .f_event = filt_kqueue,
+};
+/* XXX - move to kern_proc.c? */
+static struct filterops proc_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_procattach,
+ .f_detach = filt_procdetach,
+ .f_event = filt_proc,
+};
+static struct filterops timer_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_timerattach,
+ .f_detach = filt_timerdetach,
+ .f_event = filt_timer,
+};
+static struct filterops user_filtops = {
+ .f_attach = filt_userattach,
+ .f_detach = filt_userdetach,
+ .f_event = filt_user,
+ .f_touch = filt_usertouch,
+};
+
+static uma_zone_t knote_zone;
+static atomic_uint kq_ncallouts = ATOMIC_VAR_INIT(0);
+static unsigned int kq_calloutmax = 4 * 1024;
+SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
+ &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
+
+/* XXX - ensure not KN_INFLUX?? */
+#define KNOTE_ACTIVATE(kn, islock) do { \
+ if ((islock)) \
+ mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \
+ else \
+ KQ_LOCK((kn)->kn_kq); \
+ (kn)->kn_status |= KN_ACTIVE; \
+ if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \
+ knote_enqueue((kn)); \
+ if (!(islock)) \
+ KQ_UNLOCK((kn)->kn_kq); \
+} while(0)
+#define KQ_LOCK(kq) do { \
+ mtx_lock(&(kq)->kq_lock); \
+} while (0)
+#define KQ_FLUX_WAKEUP(kq) do { \
+ if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \
+ (kq)->kq_state &= ~KQ_FLUXWAIT; \
+ wakeup((kq)); \
+ } \
+} while (0)
+#define KQ_UNLOCK_FLUX(kq) do { \
+ KQ_FLUX_WAKEUP(kq); \
+ mtx_unlock(&(kq)->kq_lock); \
+} while (0)
+#define KQ_UNLOCK(kq) do { \
+ mtx_unlock(&(kq)->kq_lock); \
+} while (0)
+#define KQ_OWNED(kq) do { \
+ mtx_assert(&(kq)->kq_lock, MA_OWNED); \
+} while (0)
+#define KQ_NOTOWNED(kq) do { \
+ mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \
+} while (0)
+#define KN_LIST_LOCK(kn) do { \
+ if (kn->kn_knlist != NULL) \
+ kn->kn_knlist->kl_lock(kn->kn_knlist->kl_lockarg); \
+} while (0)
+#define KN_LIST_UNLOCK(kn) do { \
+ if (kn->kn_knlist != NULL) \
+ kn->kn_knlist->kl_unlock(kn->kn_knlist->kl_lockarg); \
+} while (0)
+#define KNL_ASSERT_LOCK(knl, islocked) do { \
+ if (islocked) \
+ KNL_ASSERT_LOCKED(knl); \
+ else \
+ KNL_ASSERT_UNLOCKED(knl); \
+} while (0)
+#ifdef INVARIANTS
+#define KNL_ASSERT_LOCKED(knl) do { \
+ knl->kl_assert_locked((knl)->kl_lockarg); \
+} while (0)
+#define KNL_ASSERT_UNLOCKED(knl) do { \
+ knl->kl_assert_unlocked((knl)->kl_lockarg); \
+} while (0)
+#else /* !INVARIANTS */
+#define KNL_ASSERT_LOCKED(knl) do {} while(0)
+#define KNL_ASSERT_UNLOCKED(knl) do {} while (0)
+#endif /* INVARIANTS */
+
+#define KN_HASHSIZE 64 /* XXX should be tunable */
+#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
+
+static int
+filt_nullattach(struct knote *kn)
+{
+
+ return (ENXIO);
+};
+
+struct filterops null_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_nullattach,
+};
+
+/* XXX - make SYSINIT to add these, and move into respective modules. */
+extern struct filterops sig_filtops;
+extern struct filterops fs_filtops;
+
+/*
+ * Table for all system-defined filters.
+ */
+static struct mtx filterops_lock;
+MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
+ MTX_DEF);
+static struct {
+ struct filterops *for_fop;
+ int for_refcnt;
+} sysfilt_ops[EVFILT_SYSCOUNT] = {
+ { &file_filtops }, /* EVFILT_READ */
+ { &file_filtops }, /* EVFILT_WRITE */
+ { &null_filtops }, /* EVFILT_AIO */
+ { &file_filtops }, /* EVFILT_VNODE */
+ { &proc_filtops }, /* EVFILT_PROC */
+ { &sig_filtops }, /* EVFILT_SIGNAL */
+ { &timer_filtops }, /* EVFILT_TIMER */
+ { &null_filtops }, /* former EVFILT_NETDEV */
+ { &fs_filtops }, /* EVFILT_FS */
+ { &null_filtops }, /* EVFILT_LIO */
+ { &user_filtops }, /* EVFILT_USER */
+};
+
+/*
+ * Simple redirection for all cdevsw style objects to call their fo_kqfilter
+ * method.
+ */
+static int
+filt_fileattach(struct knote *kn)
+{
+
+ return (fo_kqfilter(kn->kn_fp, kn));
+}
+
+/*ARGSUSED*/
+static int
+kqueue_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct kqueue *kq = kn->kn_fp->f_data;
+
+ if (kn->kn_filter != EVFILT_READ)
+ return (EINVAL);
+
+ kn->kn_status |= KN_KQUEUE;
+ kn->kn_fop = &kqread_filtops;
+ knlist_add(&kq->kq_sel.si_note, kn, 0);
+
+ return (0);
+}
+
+static void
+filt_kqdetach(struct knote *kn)
+{
+ struct kqueue *kq = kn->kn_fp->f_data;
+
+ knlist_remove(&kq->kq_sel.si_note, kn, 0);
+}
+
+/*ARGSUSED*/
+static int
+filt_kqueue(struct knote *kn, long hint)
+{
+ struct kqueue *kq = kn->kn_fp->f_data;
+
+ kn->kn_data = kq->kq_count;
+ return (kn->kn_data > 0);
+}
+
+/* XXX - move to kern_proc.c? */
+static int
+filt_procattach(struct knote *kn)
+{
+ struct proc *p;
+ int immediate;
+ int error;
+
+ immediate = 0;
+ p = pfind(kn->kn_id);
+ if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
+ p = zpfind(kn->kn_id);
+ immediate = 1;
+ } else if (p != NULL && (p->p_flag & P_WEXIT)) {
+ immediate = 1;
+ }
+
+ if (p == NULL)
+ return (ESRCH);
+ if ((error = p_cansee(curthread, p))) {
+ PROC_UNLOCK(p);
+ return (error);
+ }
+
+ kn->kn_ptr.p_proc = p;
+ kn->kn_flags |= EV_CLEAR; /* automatically set */
+
+ /*
+ * internal flag indicating registration done by kernel
+ */
+ if (kn->kn_flags & EV_FLAG1) {
+ kn->kn_data = kn->kn_sdata; /* ppid */
+ kn->kn_fflags = NOTE_CHILD;
+ kn->kn_flags &= ~EV_FLAG1;
+ }
+
+ if (immediate == 0)
+ knlist_add(&p->p_klist, kn, 1);
+
+ /*
+ * Immediately activate any exit notes if the target process is a
+ * zombie. This is necessary to handle the case where the target
+ * process, e.g. a child, dies before the kevent is registered.
+ */
+ if (immediate && filt_proc(kn, NOTE_EXIT))
+ KNOTE_ACTIVATE(kn, 0);
+
+ PROC_UNLOCK(p);
+
+ return (0);
+}
+
+/*
+ * The knote may be attached to a different process, which may exit,
+ * leaving nothing for the knote to be attached to. So when the process
+ * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
+ * it will be deleted when read out. However, as part of the knote deletion,
+ * this routine is called, so a check is needed to avoid actually performing
+ * a detach, because the original process does not exist any more.
+ */
+/* XXX - move to kern_proc.c? */
+static void
+filt_procdetach(struct knote *kn)
+{
+ struct proc *p;
+
+ p = kn->kn_ptr.p_proc;
+ knlist_remove(&p->p_klist, kn, 0);
+ kn->kn_ptr.p_proc = NULL;
+}
+
+/* XXX - move to kern_proc.c? */
+static int
+filt_proc(struct knote *kn, long hint)
+{
+ struct proc *p = kn->kn_ptr.p_proc;
+ u_int event;
+
+ /*
+ * mask off extra data
+ */
+ event = (u_int)hint & NOTE_PCTRLMASK;
+
+ /*
+ * if the user is interested in this event, record it.
+ */
+ if (kn->kn_sfflags & event)
+ kn->kn_fflags |= event;
+
+ /*
+ * process is gone, so flag the event as finished.
+ */
+ if (event == NOTE_EXIT) {
+ if (!(kn->kn_status & KN_DETACHED))
+ knlist_remove_inevent(&p->p_klist, kn);
+ kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+ kn->kn_ptr.p_proc = NULL;
+ if (kn->kn_fflags & NOTE_EXIT)
+ kn->kn_data = p->p_xstat;
+ if (kn->kn_fflags == 0)
+ kn->kn_flags |= EV_DROP;
+ return (1);
+ }
+
+ return (kn->kn_fflags != 0);
+}
+
+/*
+ * Called when the process forks.  It mostly does the same as knote(),
+ * activating all knotes registered to be activated when the process
+ * forks.  Additionally, for each knote attached to the parent, check
+ * whether the user wants to track the new process.  If so, attach a
+ * new knote to it and immediately report an event with the child's
+ * pid.
+ */
+void
+knote_fork(struct knlist *list, int pid)
+{
+ struct kqueue *kq;
+ struct knote *kn;
+ struct kevent kev;
+ int error;
+
+ if (list == NULL)
+ return;
+ list->kl_lock(list->kl_lockarg);
+
+ SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
+ if ((kn->kn_status & KN_INFLUX) == KN_INFLUX)
+ continue;
+ kq = kn->kn_kq;
+ KQ_LOCK(kq);
+ if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+ KQ_UNLOCK(kq);
+ continue;
+ }
+
+ /*
+ * The same as knote(), activate the event.
+ */
+ if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
+ kn->kn_status |= KN_HASKQLOCK;
+ if (kn->kn_fop->f_event(kn, NOTE_FORK))
+ KNOTE_ACTIVATE(kn, 1);
+ kn->kn_status &= ~KN_HASKQLOCK;
+ KQ_UNLOCK(kq);
+ continue;
+ }
+
+ /*
+ * The NOTE_TRACK case. In addition to the activation
+ * of the event, we need to register new event to
+ * track the child. Drop the locks in preparation for
+ * the call to kqueue_register().
+ */
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ list->kl_unlock(list->kl_lockarg);
+
+ /*
+ * Activate existing knote and register a knote with
+ * new process.
+ */
+ kev.ident = pid;
+ kev.filter = kn->kn_filter;
+ kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
+ kev.fflags = kn->kn_sfflags;
+ kev.data = kn->kn_id; /* parent */
+ kev.udata = kn->kn_kevent.udata;/* preserve udata */
+ error = kqueue_register(kq, &kev, NULL, 0);
+ if (error)
+ kn->kn_fflags |= NOTE_TRACKERR;
+ if (kn->kn_fop->f_event(kn, NOTE_FORK))
+ KNOTE_ACTIVATE(kn, 0);
+ KQ_LOCK(kq);
+ kn->kn_status &= ~KN_INFLUX;
+ KQ_UNLOCK_FLUX(kq);
+ list->kl_lock(list->kl_lockarg);
+ }
+ list->kl_unlock(list->kl_lockarg);
+}
+
+/*
+ * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
+ * interval timer support code.
+ */
+static __inline sbintime_t
+timer2sbintime(intptr_t data)
+{
+
+ return (SBT_1MS * data);
+}
+
+static void
+filt_timerexpire(void *knx)
+{
+ struct callout *calloutp;
+ struct knote *kn;
+
+ kn = knx;
+ kn->kn_data++;
+ KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */
+
+ if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
+ calloutp = (struct callout *)kn->kn_hook;
+ callout_reset_sbt_on(calloutp,
+ timer2sbintime(kn->kn_sdata), 0 /* 1ms? */,
+ filt_timerexpire, kn, PCPU_GET(cpuid), 0);
+ }
+}
+
+/*
+ * data contains amount of time to sleep, in milliseconds
+ */
+static int
+filt_timerattach(struct knote *kn)
+{
+ struct callout *calloutp;
+ unsigned int ncallouts;
+
+ ncallouts = atomic_load_explicit(&kq_ncallouts, memory_order_relaxed);
+ do {
+ if (ncallouts >= kq_calloutmax)
+ return (ENOMEM);
+ } while (!atomic_compare_exchange_weak_explicit(&kq_ncallouts,
+ &ncallouts, ncallouts + 1, memory_order_relaxed,
+ memory_order_relaxed));
+
+ kn->kn_flags |= EV_CLEAR; /* automatically set */
+ kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */
+ calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
+ callout_init(calloutp, CALLOUT_MPSAFE);
+ kn->kn_hook = calloutp;
+ callout_reset_sbt_on(calloutp,
+ timer2sbintime(kn->kn_sdata), 0 /* 1ms? */,
+ filt_timerexpire, kn, PCPU_GET(cpuid), 0);
+
+ return (0);
+}
+
+static void
+filt_timerdetach(struct knote *kn)
+{
+ struct callout *calloutp;
+ unsigned int old;
+
+ calloutp = (struct callout *)kn->kn_hook;
+ callout_drain(calloutp);
+ free(calloutp, M_KQUEUE);
+ old = atomic_fetch_sub_explicit(&kq_ncallouts, 1, memory_order_relaxed);
+ KASSERT(old > 0, ("Number of callouts cannot become negative"));
+ kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */
+}
+
+static int
+filt_timer(struct knote *kn, long hint)
+{
+
+ return (kn->kn_data != 0);
+}
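+
+/*
+ * Userland usage sketch for the timer filter above (illustrative only;
+ * the identifier 1 and the 500 ms interval are arbitrary).  The data
+ * field is interpreted in milliseconds, as described above:
+ *
+ *	struct kevent kev;
+ *
+ *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 500, NULL);
+ *	kevent(kq, &kev, 1, NULL, 0, NULL);
+ */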
+
+static int
+filt_userattach(struct knote *kn)
+{
+
+ /*
+ * EVFILT_USER knotes are not attached to anything in the kernel.
+ */
+ kn->kn_hook = NULL;
+ if (kn->kn_fflags & NOTE_TRIGGER)
+ kn->kn_hookid = 1;
+ else
+ kn->kn_hookid = 0;
+ return (0);
+}
+
+static void
+filt_userdetach(__unused struct knote *kn)
+{
+
+ /*
+ * EVFILT_USER knotes are not attached to anything in the kernel.
+ */
+}
+
+static int
+filt_user(struct knote *kn, __unused long hint)
+{
+
+ return (kn->kn_hookid);
+}
+
+static void
+filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
+{
+ u_int ffctrl;
+
+ switch (type) {
+ case EVENT_REGISTER:
+ if (kev->fflags & NOTE_TRIGGER)
+ kn->kn_hookid = 1;
+
+ ffctrl = kev->fflags & NOTE_FFCTRLMASK;
+ kev->fflags &= NOTE_FFLAGSMASK;
+ switch (ffctrl) {
+ case NOTE_FFNOP:
+ break;
+
+ case NOTE_FFAND:
+ kn->kn_sfflags &= kev->fflags;
+ break;
+
+ case NOTE_FFOR:
+ kn->kn_sfflags |= kev->fflags;
+ break;
+
+ case NOTE_FFCOPY:
+ kn->kn_sfflags = kev->fflags;
+ break;
+
+ default:
+ /* XXX Return error? */
+ break;
+ }
+ kn->kn_sdata = kev->data;
+ if (kev->flags & EV_CLEAR) {
+ kn->kn_hookid = 0;
+ kn->kn_data = 0;
+ kn->kn_fflags = 0;
+ }
+ break;
+
+ case EVENT_PROCESS:
+ *kev = kn->kn_kevent;
+ kev->fflags = kn->kn_sfflags;
+ kev->data = kn->kn_sdata;
+ if (kn->kn_flags & EV_CLEAR) {
+ kn->kn_hookid = 0;
+ kn->kn_data = 0;
+ kn->kn_fflags = 0;
+ }
+ break;
+
+ default:
+ panic("filt_usertouch() - invalid type (%ld)", type);
+ break;
+ }
+}
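+
+/*
+ * Userland usage sketch for EVFILT_USER (illustrative; the identifier 1
+ * is arbitrary).  The event is registered once and later fired by a
+ * second kevent() call carrying NOTE_TRIGGER in fflags:
+ *
+ *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
+ *	kevent(kq, &kev, 1, NULL, 0, NULL);
+ *	...
+ *	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
+ *	kevent(kq, &kev, 1, NULL, 0, NULL);
+ */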
+
+int
+sys_kqueue(struct thread *td, struct kqueue_args *uap)
+{
+ struct filedesc *fdp;
+ struct kqueue *kq;
+ struct file *fp;
+ int fd, error;
+
+ fdp = td->td_proc->p_fd;
+ error = falloc(td, &fp, &fd, 0);
+ if (error)
+ goto done2;
+
+ /* An extra reference on `fp' has been held for us by falloc(). */
+ kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
+ mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
+ TAILQ_INIT(&kq->kq_head);
+ kq->kq_fdp = fdp;
+ knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
+ TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
+
+ FILEDESC_XLOCK(fdp);
+ SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
+ FILEDESC_XUNLOCK(fdp);
+
+ finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
+ fdrop(fp, td);
+
+ td->td_retval[0] = fd;
+done2:
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct kevent_args {
+ int fd;
+ const struct kevent *changelist;
+ int nchanges;
+ struct kevent *eventlist;
+ int nevents;
+ const struct timespec *timeout;
+};
+#endif
+int
+sys_kevent(struct thread *td, struct kevent_args *uap)
+{
+ struct timespec ts, *tsp;
+ struct kevent_copyops k_ops = { uap,
+ kevent_copyout,
+ kevent_copyin};
+ int error;
+#ifdef KTRACE
+ struct uio ktruio;
+ struct iovec ktriov;
+ struct uio *ktruioin = NULL;
+ struct uio *ktruioout = NULL;
+#endif
+
+ if (uap->timeout != NULL) {
+ error = copyin(uap->timeout, &ts, sizeof(ts));
+ if (error)
+ return (error);
+ tsp = &ts;
+ } else
+ tsp = NULL;
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_GENIO)) {
+ ktriov.iov_base = uap->changelist;
+ ktriov.iov_len = uap->nchanges * sizeof(struct kevent);
+ ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
+ .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
+ .uio_td = td };
+ ktruioin = cloneuio(&ktruio);
+ ktriov.iov_base = uap->eventlist;
+ ktriov.iov_len = uap->nevents * sizeof(struct kevent);
+ ktruioout = cloneuio(&ktruio);
+ }
+#endif
+
+ error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
+ &k_ops, tsp);
+
+#ifdef KTRACE
+ if (ktruioin != NULL) {
+ ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent);
+ ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
+ ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent);
+ ktrgenio(uap->fd, UIO_READ, ktruioout, error);
+ }
+#endif
+
+ return (error);
+}
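+
+/*
+ * Minimal userland sketch of the two syscalls above (illustrative; "fd"
+ * stands for any descriptor whose type supports kqueue filters):
+ *
+ *	struct kevent change, event;
+ *	int kq, n;
+ *
+ *	kq = kqueue();
+ *	EV_SET(&change, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
+ *	kevent(kq, &change, 1, NULL, 0, NULL);
+ *	n = kevent(kq, NULL, 0, &event, 1, NULL);
+ */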
+
+/*
+ * Copy 'count' items into the destination list pointed to by uap->eventlist.
+ */
+static int
+kevent_copyout(void *arg, struct kevent *kevp, int count)
+{
+ struct kevent_args *uap;
+ int error;
+
+ KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
+ uap = (struct kevent_args *)arg;
+
+ error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
+ if (error == 0)
+ uap->eventlist += count;
+ return (error);
+}
+
+/*
+ * Copy 'count' items from the list pointed to by uap->changelist.
+ */
+static int
+kevent_copyin(void *arg, struct kevent *kevp, int count)
+{
+ struct kevent_args *uap;
+ int error;
+
+ KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
+ uap = (struct kevent_args *)arg;
+
+ error = copyin(uap->changelist, kevp, count * sizeof *kevp);
+ if (error == 0)
+ uap->changelist += count;
+ return (error);
+}
+
+int
+kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
+ struct kevent_copyops *k_ops, const struct timespec *timeout)
+{
+ struct kevent keva[KQ_NEVENTS];
+ struct kevent *kevp, *changes;
+ struct kqueue *kq;
+ struct file *fp;
+ cap_rights_t rights;
+ int i, n, nerrors, error;
+
+ error = fget(td, fd, cap_rights_init(&rights, CAP_POST_EVENT), &fp);
+ if (error != 0)
+ return (error);
+ if ((error = kqueue_acquire(fp, &kq)) != 0)
+ goto done_norel;
+
+ nerrors = 0;
+
+ while (nchanges > 0) {
+ n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
+ error = k_ops->k_copyin(k_ops->arg, keva, n);
+ if (error)
+ goto done;
+ changes = keva;
+ for (i = 0; i < n; i++) {
+ kevp = &changes[i];
+ if (!kevp->filter)
+ continue;
+ kevp->flags &= ~EV_SYSFLAGS;
+ error = kqueue_register(kq, kevp, td, 1);
+ if (error || (kevp->flags & EV_RECEIPT)) {
+ if (nevents != 0) {
+ kevp->flags = EV_ERROR;
+ kevp->data = error;
+ (void) k_ops->k_copyout(k_ops->arg,
+ kevp, 1);
+ nevents--;
+ nerrors++;
+ } else {
+ goto done;
+ }
+ }
+ }
+ nchanges -= n;
+ }
+ if (nerrors) {
+ td->td_retval[0] = nerrors;
+ error = 0;
+ goto done;
+ }
+
+ error = kqueue_scan(kq, nevents, k_ops, timeout, keva, td);
+done:
+ kqueue_release(kq, 0);
+done_norel:
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+kqueue_add_filteropts(int filt, struct filterops *filtops)
+{
+ int error;
+
+ error = 0;
+ if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
+ printf(
+"trying to add a filterop that is out of range: %d is beyond %d\n",
+ ~filt, EVFILT_SYSCOUNT);
+ return EINVAL;
+ }
+ mtx_lock(&filterops_lock);
+ if (sysfilt_ops[~filt].for_fop != &null_filtops &&
+ sysfilt_ops[~filt].for_fop != NULL)
+ error = EEXIST;
+ else {
+ sysfilt_ops[~filt].for_fop = filtops;
+ sysfilt_ops[~filt].for_refcnt = 0;
+ }
+ mtx_unlock(&filterops_lock);
+
+ return (error);
+}
+
+int
+kqueue_del_filteropts(int filt)
+{
+ int error;
+
+ error = 0;
+ if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
+ return EINVAL;
+
+ mtx_lock(&filterops_lock);
+ if (sysfilt_ops[~filt].for_fop == &null_filtops ||
+ sysfilt_ops[~filt].for_fop == NULL)
+ error = EINVAL;
+ else if (sysfilt_ops[~filt].for_refcnt != 0)
+ error = EBUSY;
+ else {
+ sysfilt_ops[~filt].for_fop = &null_filtops;
+ sysfilt_ops[~filt].for_refcnt = 0;
+ }
+ mtx_unlock(&filterops_lock);
+
+ return error;
+}
+
+static struct filterops *
+kqueue_fo_find(int filt)
+{
+
+ if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
+ return NULL;
+
+ mtx_lock(&filterops_lock);
+ sysfilt_ops[~filt].for_refcnt++;
+ if (sysfilt_ops[~filt].for_fop == NULL)
+ sysfilt_ops[~filt].for_fop = &null_filtops;
+ mtx_unlock(&filterops_lock);
+
+ return sysfilt_ops[~filt].for_fop;
+}
+
+static void
+kqueue_fo_release(int filt)
+{
+
+ if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
+ return;
+
+ mtx_lock(&filterops_lock);
+ KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
+ ("filter object refcount not valid on release"));
+ sysfilt_ops[~filt].for_refcnt--;
+ mtx_unlock(&filterops_lock);
+}
+
+/*
+ * A ref to kq (obtained via kqueue_acquire) must be held.  waitok
+ * determines whether memory allocations may sleep; make sure it is 0
+ * if you hold any mutexes.
+ */
+static int
+kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
+{
+ struct filterops *fops;
+ struct file *fp;
+ struct knote *kn, *tkn;
+ cap_rights_t rights;
+ int error, filt, event;
+ int haskqglobal;
+
+ fp = NULL;
+ kn = NULL;
+ error = 0;
+ haskqglobal = 0;
+
+ filt = kev->filter;
+ fops = kqueue_fo_find(filt);
+ if (fops == NULL)
+ return EINVAL;
+
+ tkn = knote_alloc(waitok); /* prevent waiting with locks */
+
+findkn:
+ if (fops->f_isfd) {
+ KASSERT(td != NULL, ("td is NULL"));
+ error = fget(td, kev->ident,
+ cap_rights_init(&rights, CAP_POLL_EVENT), &fp);
+ if (error)
+ goto done;
+
+ if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
+ kev->ident, 0) != 0) {
+ /* try again */
+ fdrop(fp, td);
+ fp = NULL;
+ error = kqueue_expand(kq, fops, kev->ident, waitok);
+ if (error)
+ goto done;
+ goto findkn;
+ }
+
+ if (fp->f_type == DTYPE_KQUEUE) {
+ /*
+			 * if we add some intelligence about what we are doing,
+ * we should be able to support events on ourselves.
+ * We need to know when we are doing this to prevent
+ * getting both the knlist lock and the kq lock since
+ * they are the same thing.
+ */
+ if (fp->f_data == kq) {
+ error = EINVAL;
+ goto done;
+ }
+
+ KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
+ }
+
+ KQ_LOCK(kq);
+ if (kev->ident < kq->kq_knlistsize) {
+ SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
+ if (kev->filter == kn->kn_filter)
+ break;
+ }
+ } else {
+ if ((kev->flags & EV_ADD) == EV_ADD)
+ kqueue_expand(kq, fops, kev->ident, waitok);
+
+ KQ_LOCK(kq);
+ if (kq->kq_knhashmask != 0) {
+ struct klist *list;
+
+ list = &kq->kq_knhash[
+ KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
+ SLIST_FOREACH(kn, list, kn_link)
+ if (kev->ident == kn->kn_id &&
+ kev->filter == kn->kn_filter)
+ break;
+ }
+ }
+
+	/* knote is in the process of changing, wait for it to stabilize. */
+ if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+ KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+ kq->kq_state |= KQ_FLUXWAIT;
+ msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
+ if (fp != NULL) {
+ fdrop(fp, td);
+ fp = NULL;
+ }
+ goto findkn;
+ }
+
+ /*
+ * kn now contains the matching knote, or NULL if no match
+ */
+ if (kn == NULL) {
+ if (kev->flags & EV_ADD) {
+ kn = tkn;
+ tkn = NULL;
+ if (kn == NULL) {
+ KQ_UNLOCK(kq);
+ error = ENOMEM;
+ goto done;
+ }
+ kn->kn_fp = fp;
+ kn->kn_kq = kq;
+ kn->kn_fop = fops;
+ /*
+ * apply reference counts to knote structure, and
+ * do not release it at the end of this routine.
+ */
+ fops = NULL;
+ fp = NULL;
+
+ kn->kn_sfflags = kev->fflags;
+ kn->kn_sdata = kev->data;
+ kev->fflags = 0;
+ kev->data = 0;
+ kn->kn_kevent = *kev;
+ kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
+ EV_ENABLE | EV_DISABLE);
+ kn->kn_status = KN_INFLUX|KN_DETACHED;
+
+ error = knote_attach(kn, kq);
+ KQ_UNLOCK(kq);
+ if (error != 0) {
+ tkn = kn;
+ goto done;
+ }
+
+ if ((error = kn->kn_fop->f_attach(kn)) != 0) {
+ knote_drop(kn, td);
+ goto done;
+ }
+ KN_LIST_LOCK(kn);
+ goto done_ev_add;
+ } else {
+ /* No matching knote and the EV_ADD flag is not set. */
+ KQ_UNLOCK(kq);
+ error = ENOENT;
+ goto done;
+ }
+ }
+
+ if (kev->flags & EV_DELETE) {
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ if (!(kn->kn_status & KN_DETACHED))
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ goto done;
+ }
+
+ /*
+ * The user may change some filter values after the initial EV_ADD,
+ * but doing so will not reset any filter which has already been
+ * triggered.
+ */
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ KN_LIST_LOCK(kn);
+ kn->kn_kevent.udata = kev->udata;
+ if (!fops->f_isfd && fops->f_touch != NULL) {
+ fops->f_touch(kn, kev, EVENT_REGISTER);
+ } else {
+ kn->kn_sfflags = kev->fflags;
+ kn->kn_sdata = kev->data;
+ }
+
+ /*
+ * We can get here with kn->kn_knlist == NULL. This can happen when
+ * the initial attach event decides that the event is "completed"
+	 * already, e.g. when filt_procattach is called on a zombie process.
+	 * It will call filt_proc, which will remove the knote from the list
+	 * and NULL out kn_knlist.
+ */
+done_ev_add:
+ event = kn->kn_fop->f_event(kn, 0);
+ KQ_LOCK(kq);
+ if (event)
+ KNOTE_ACTIVATE(kn, 1);
+ kn->kn_status &= ~KN_INFLUX;
+ KN_LIST_UNLOCK(kn);
+
+ if ((kev->flags & EV_DISABLE) &&
+ ((kn->kn_status & KN_DISABLED) == 0)) {
+ kn->kn_status |= KN_DISABLED;
+ }
+
+ if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
+ kn->kn_status &= ~KN_DISABLED;
+ if ((kn->kn_status & KN_ACTIVE) &&
+ ((kn->kn_status & KN_QUEUED) == 0))
+ knote_enqueue(kn);
+ }
+ KQ_UNLOCK_FLUX(kq);
+
+done:
+ KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+ if (fp != NULL)
+ fdrop(fp, td);
+ if (tkn != NULL)
+ knote_free(tkn);
+ if (fops != NULL)
+ kqueue_fo_release(filt);
+ return (error);
+}
+
+static int
+kqueue_acquire(struct file *fp, struct kqueue **kqp)
+{
+ int error;
+ struct kqueue *kq;
+
+ error = 0;
+
+ kq = fp->f_data;
+ if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
+ return (EBADF);
+ *kqp = kq;
+ KQ_LOCK(kq);
+ if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
+ KQ_UNLOCK(kq);
+ return (EBADF);
+ }
+ kq->kq_refcnt++;
+ KQ_UNLOCK(kq);
+
+ return error;
+}
+
+static void
+kqueue_release(struct kqueue *kq, int locked)
+{
+ if (locked)
+ KQ_OWNED(kq);
+ else
+ KQ_LOCK(kq);
+ kq->kq_refcnt--;
+ if (kq->kq_refcnt == 1)
+ wakeup(&kq->kq_refcnt);
+ if (!locked)
+ KQ_UNLOCK(kq);
+}
+
+static void
+kqueue_schedtask(struct kqueue *kq)
+{
+
+ KQ_OWNED(kq);
+ KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
+ ("scheduling kqueue task while draining"));
+
+ if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
+ taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task);
+ kq->kq_state |= KQ_TASKSCHED;
+ }
+}
+
+/*
+ * Expand the kq to make sure we have storage for fops/ident pair.
+ *
+ * Return 0 on success (or no work necessary), return errno on failure.
+ *
+ * Not calling hashinit w/ waitok (proper malloc flag) should be safe.
+ * If kqueue_register is called from a non-fd context, there usually/should
+ * be no locks held.
+ */
+static int
+kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
+ int waitok)
+{
+ struct klist *list, *tmp_knhash, *to_free;
+ u_long tmp_knhashmask;
+ int size;
+ int fd;
+ int mflag = waitok ? M_WAITOK : M_NOWAIT;
+
+ KQ_NOTOWNED(kq);
+
+ to_free = NULL;
+ if (fops->f_isfd) {
+ fd = ident;
+ if (kq->kq_knlistsize <= fd) {
+ size = kq->kq_knlistsize;
+ while (size <= fd)
+ size += KQEXTENT;
+ list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
+ if (list == NULL)
+ return ENOMEM;
+ KQ_LOCK(kq);
+ if (kq->kq_knlistsize > fd) {
+ to_free = list;
+ list = NULL;
+ } else {
+ if (kq->kq_knlist != NULL) {
+ bcopy(kq->kq_knlist, list,
+ kq->kq_knlistsize * sizeof(*list));
+ to_free = kq->kq_knlist;
+ kq->kq_knlist = NULL;
+ }
+ bzero((caddr_t)list +
+ kq->kq_knlistsize * sizeof(*list),
+ (size - kq->kq_knlistsize) * sizeof(*list));
+ kq->kq_knlistsize = size;
+ kq->kq_knlist = list;
+ }
+ KQ_UNLOCK(kq);
+ }
+ } else {
+ if (kq->kq_knhashmask == 0) {
+ tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
+ &tmp_knhashmask);
+ if (tmp_knhash == NULL)
+ return ENOMEM;
+ KQ_LOCK(kq);
+ if (kq->kq_knhashmask == 0) {
+ kq->kq_knhash = tmp_knhash;
+ kq->kq_knhashmask = tmp_knhashmask;
+ } else {
+ to_free = tmp_knhash;
+ }
+ KQ_UNLOCK(kq);
+ }
+ }
+ free(to_free, M_KQUEUE);
+
+ KQ_NOTOWNED(kq);
+ return 0;
+}
+
+static void
+kqueue_task(void *arg, int pending)
+{
+ struct kqueue *kq;
+ int haskqglobal;
+
+ haskqglobal = 0;
+ kq = arg;
+
+ KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
+ KQ_LOCK(kq);
+
+ KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
+
+ kq->kq_state &= ~KQ_TASKSCHED;
+ if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
+ wakeup(&kq->kq_state);
+ }
+ KQ_UNLOCK(kq);
+ KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+}
+
+/*
+ * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
+ * We treat KN_MARKER knotes as if they are INFLUX.
+ */
+static int
+kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
+ const struct timespec *tsp, struct kevent *keva, struct thread *td)
+{
+ struct kevent *kevp;
+ struct knote *kn, *marker;
+ sbintime_t asbt, rsbt;
+ int count, error, haskqglobal, influx, nkev, touch;
+
+ count = maxevents;
+ nkev = 0;
+ error = 0;
+ haskqglobal = 0;
+
+ if (maxevents == 0)
+ goto done_nl;
+
+ rsbt = 0;
+ if (tsp != NULL) {
+ if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
+ tsp->tv_nsec >= 1000000000) {
+ error = EINVAL;
+ goto done_nl;
+ }
+ if (timespecisset(tsp)) {
+ if (tsp->tv_sec <= INT32_MAX) {
+ rsbt = tstosbt(*tsp);
+ if (TIMESEL(&asbt, rsbt))
+ asbt += tc_tick_sbt;
+ if (asbt <= INT64_MAX - rsbt)
+ asbt += rsbt;
+ else
+ asbt = 0;
+ rsbt >>= tc_precexp;
+ } else
+ asbt = 0;
+ } else
+ asbt = -1;
+ } else
+ asbt = 0;
+ marker = knote_alloc(1);
+ if (marker == NULL) {
+ error = ENOMEM;
+ goto done_nl;
+ }
+ marker->kn_status = KN_MARKER;
+ KQ_LOCK(kq);
+
+retry:
+ kevp = keva;
+ if (kq->kq_count == 0) {
+ if (asbt == -1) {
+ error = EWOULDBLOCK;
+ } else {
+ kq->kq_state |= KQ_SLEEP;
+ error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
+ "kqread", asbt, rsbt, C_ABSOLUTE);
+ }
+ if (error == 0)
+ goto retry;
+ /* don't restart after signals... */
+ if (error == ERESTART)
+ error = EINTR;
+ else if (error == EWOULDBLOCK)
+ error = 0;
+ goto done;
+ }
+
+ TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
+ influx = 0;
+ while (count) {
+ KQ_OWNED(kq);
+ kn = TAILQ_FIRST(&kq->kq_head);
+
+ if ((kn->kn_status == KN_MARKER && kn != marker) ||
+ (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+ if (influx) {
+ influx = 0;
+ KQ_FLUX_WAKEUP(kq);
+ }
+ kq->kq_state |= KQ_FLUXWAIT;
+ error = msleep(kq, &kq->kq_lock, PSOCK,
+ "kqflxwt", 0);
+ continue;
+ }
+
+ TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
+ if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
+ kn->kn_status &= ~KN_QUEUED;
+ kq->kq_count--;
+ continue;
+ }
+ if (kn == marker) {
+ KQ_FLUX_WAKEUP(kq);
+ if (count == maxevents)
+ goto retry;
+ goto done;
+ }
+ KASSERT((kn->kn_status & KN_INFLUX) == 0,
+		    ("KN_INFLUX set when not supposed to be"));
+
+ if ((kn->kn_flags & EV_DROP) == EV_DROP) {
+ kn->kn_status &= ~KN_QUEUED;
+ kn->kn_status |= KN_INFLUX;
+ kq->kq_count--;
+ KQ_UNLOCK(kq);
+ /*
+ * We don't need to lock the list since we've marked
+ * it _INFLUX.
+ */
+ if (!(kn->kn_status & KN_DETACHED))
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ KQ_LOCK(kq);
+ continue;
+ } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
+ kn->kn_status &= ~KN_QUEUED;
+ kn->kn_status |= KN_INFLUX;
+ kq->kq_count--;
+ KQ_UNLOCK(kq);
+ /*
+ * We don't need to lock the list since we've marked
+ * it _INFLUX.
+ */
+ *kevp = kn->kn_kevent;
+ if (!(kn->kn_status & KN_DETACHED))
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ KQ_LOCK(kq);
+ kn = NULL;
+ } else {
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
+ KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
+ KN_LIST_LOCK(kn);
+ if (kn->kn_fop->f_event(kn, 0) == 0) {
+ KQ_LOCK(kq);
+ KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+ kn->kn_status &=
+ ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX);
+ kq->kq_count--;
+ KN_LIST_UNLOCK(kn);
+ influx = 1;
+ continue;
+ }
+ touch = (!kn->kn_fop->f_isfd &&
+ kn->kn_fop->f_touch != NULL);
+ if (touch)
+ kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
+ else
+ *kevp = kn->kn_kevent;
+ KQ_LOCK(kq);
+ KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+ if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
+ /*
+				 * Manually clear knotes that weren't
+ * 'touch'ed.
+ */
+ if (touch == 0 && kn->kn_flags & EV_CLEAR) {
+ kn->kn_data = 0;
+ kn->kn_fflags = 0;
+ }
+ if (kn->kn_flags & EV_DISPATCH)
+ kn->kn_status |= KN_DISABLED;
+ kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
+ kq->kq_count--;
+ } else
+ TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
+
+ kn->kn_status &= ~(KN_INFLUX);
+ KN_LIST_UNLOCK(kn);
+ influx = 1;
+ }
+
+ /* we are returning a copy to the user */
+ kevp++;
+ nkev++;
+ count--;
+
+ if (nkev == KQ_NEVENTS) {
+ influx = 0;
+ KQ_UNLOCK_FLUX(kq);
+ error = k_ops->k_copyout(k_ops->arg, keva, nkev);
+ nkev = 0;
+ kevp = keva;
+ KQ_LOCK(kq);
+ if (error)
+ break;
+ }
+ }
+ TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
+done:
+ KQ_OWNED(kq);
+ KQ_UNLOCK_FLUX(kq);
+ knote_free(marker);
+done_nl:
+ KQ_NOTOWNED(kq);
+ if (nkev != 0)
+ error = k_ops->k_copyout(k_ops->arg, keva, nkev);
+ td->td_retval[0] = maxevents - count;
+ return (error);
+}
+
+/*
+ * XXX
+ * This could be expanded to call kqueue_scan, if desired.
+ */
+/*ARGSUSED*/
+static int
+kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ return (ENXIO);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ return (ENXIO);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_ioctl(struct file *fp, u_long cmd, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+ /*
+ * Enabling sigio causes two major problems:
+ * 1) infinite recursion:
+	 * Synopsis: kevent is being used to track signals and have FIOASYNC
+ * set. On receipt of a signal this will cause a kqueue to recurse
+ * into itself over and over. Sending the sigio causes the kqueue
+ * to become ready, which in turn posts sigio again, forever.
+ * Solution: this can be solved by setting a flag in the kqueue that
+ * we have a SIGIO in progress.
+ * 2) locking problems:
+	 * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
+ * us above the proc and pgrp locks.
+ * Solution: Post a signal using an async mechanism, being sure to
+ * record a generation count in the delivery so that we do not deliver
+ * a signal to the wrong process.
+ *
+ * Note, these two mechanisms are somewhat mutually exclusive!
+ */
+#if 0
+ struct kqueue *kq;
+
+ kq = fp->f_data;
+ switch (cmd) {
+ case FIOASYNC:
+ if (*(int *)data) {
+ kq->kq_state |= KQ_ASYNC;
+ } else {
+ kq->kq_state &= ~KQ_ASYNC;
+ }
+ return (0);
+
+ case FIOSETOWN:
+ return (fsetown(*(int *)data, &kq->kq_sigio));
+
+ case FIOGETOWN:
+ *(int *)data = fgetown(&kq->kq_sigio);
+ return (0);
+ }
+#endif
+
+ return (ENOTTY);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct kqueue *kq;
+ int revents = 0;
+ int error;
+
+ if ((error = kqueue_acquire(fp, &kq)))
+ return POLLERR;
+
+ KQ_LOCK(kq);
+ if (events & (POLLIN | POLLRDNORM)) {
+ if (kq->kq_count) {
+ revents |= events & (POLLIN | POLLRDNORM);
+ } else {
+ selrecord(td, &kq->kq_sel);
+ if (SEL_WAITING(&kq->kq_sel))
+ kq->kq_state |= KQ_SEL;
+ }
+ }
+ kqueue_release(kq, 1);
+ KQ_UNLOCK(kq);
+ return (revents);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ bzero((void *)st, sizeof *st);
+ /*
+ * We no longer return kq_count because the unlocked value is useless.
+ * If you spent all this time getting the count, why not spend your
+ * syscall better by calling kevent?
+ *
+ * XXX - This is needed for libc_r.
+ */
+ st->st_mode = S_IFIFO;
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+kqueue_close(struct file *fp, struct thread *td)
+{
+ struct kqueue *kq = fp->f_data;
+ struct filedesc *fdp;
+ struct knote *kn;
+ int i;
+ int error;
+
+ if ((error = kqueue_acquire(fp, &kq)))
+ return error;
+
+ KQ_LOCK(kq);
+
+ KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
+ ("kqueue already closing"));
+ kq->kq_state |= KQ_CLOSING;
+ if (kq->kq_refcnt > 1)
+ msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
+
+ KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
+ fdp = kq->kq_fdp;
+
+ KASSERT(knlist_empty(&kq->kq_sel.si_note),
+ ("kqueue's knlist not empty"));
+
+ for (i = 0; i < kq->kq_knlistsize; i++) {
+ while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
+ if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+ kq->kq_state |= KQ_FLUXWAIT;
+ msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
+ continue;
+ }
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ if (!(kn->kn_status & KN_DETACHED))
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ KQ_LOCK(kq);
+ }
+ }
+ if (kq->kq_knhashmask != 0) {
+ for (i = 0; i <= kq->kq_knhashmask; i++) {
+ while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
+ if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+ kq->kq_state |= KQ_FLUXWAIT;
+ msleep(kq, &kq->kq_lock, PSOCK,
+ "kqclo2", 0);
+ continue;
+ }
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ if (!(kn->kn_status & KN_DETACHED))
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ KQ_LOCK(kq);
+ }
+ }
+ }
+
+ if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
+ kq->kq_state |= KQ_TASKDRAIN;
+ msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
+ }
+
+ if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
+ selwakeuppri(&kq->kq_sel, PSOCK);
+ if (!SEL_WAITING(&kq->kq_sel))
+ kq->kq_state &= ~KQ_SEL;
+ }
+
+ KQ_UNLOCK(kq);
+
+ FILEDESC_XLOCK(fdp);
+ SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list);
+ FILEDESC_XUNLOCK(fdp);
+
+ seldrain(&kq->kq_sel);
+ knlist_destroy(&kq->kq_sel.si_note);
+ mtx_destroy(&kq->kq_lock);
+ kq->kq_fdp = NULL;
+
+ if (kq->kq_knhash != NULL)
+ free(kq->kq_knhash, M_KQUEUE);
+ if (kq->kq_knlist != NULL)
+ free(kq->kq_knlist, M_KQUEUE);
+
+ funsetown(&kq->kq_sigio);
+ free(kq, M_KQUEUE);
+ fp->f_data = NULL;
+
+ return (0);
+}
+
+static void
+kqueue_wakeup(struct kqueue *kq)
+{
+ KQ_OWNED(kq);
+
+ if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
+ kq->kq_state &= ~KQ_SLEEP;
+ wakeup(kq);
+ }
+ if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
+ selwakeuppri(&kq->kq_sel, PSOCK);
+ if (!SEL_WAITING(&kq->kq_sel))
+ kq->kq_state &= ~KQ_SEL;
+ }
+ if (!knlist_empty(&kq->kq_sel.si_note))
+ kqueue_schedtask(kq);
+ if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
+ pgsigio(&kq->kq_sigio, SIGIO, 0);
+ }
+}
+
+/*
+ * Walk down a list of knotes, activating them if their event has triggered.
+ *
+ * There is an opportunity to optimize the case of one kq watching another:
+ * instead of scheduling a task to wake it up, enough state could be passed
+ * down the chain to wake up the parent kqueue directly.  Make this code
+ * functional first.
+ */
+void
+knote(struct knlist *list, long hint, int lockflags)
+{
+ struct kqueue *kq;
+ struct knote *kn;
+ int error;
+
+ if (list == NULL)
+ return;
+
+ KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
+
+ if ((lockflags & KNF_LISTLOCKED) == 0)
+ list->kl_lock(list->kl_lockarg);
+
+ /*
+	 * If we unlock the list lock (and set KN_INFLUX), we can eliminate
+	 * the kqueue scheduling, but this will introduce four
+	 * lock/unlock pairs for each knote to test.  If we do, continue to
+	 * use SLIST_FOREACH; SLIST_FOREACH_SAFE is not safe in our case,
+	 * as it is only safe when removing the current item, which we are
+	 * not doing.
+ */
+ SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
+ kq = kn->kn_kq;
+ if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
+ KQ_LOCK(kq);
+ if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+ KQ_UNLOCK(kq);
+ } else if ((lockflags & KNF_NOKQLOCK) != 0) {
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ error = kn->kn_fop->f_event(kn, hint);
+ KQ_LOCK(kq);
+ kn->kn_status &= ~KN_INFLUX;
+ if (error)
+ KNOTE_ACTIVATE(kn, 1);
+ KQ_UNLOCK_FLUX(kq);
+ } else {
+ kn->kn_status |= KN_HASKQLOCK;
+ if (kn->kn_fop->f_event(kn, hint))
+ KNOTE_ACTIVATE(kn, 1);
+ kn->kn_status &= ~KN_HASKQLOCK;
+ KQ_UNLOCK(kq);
+ }
+ }
+ kq = NULL;
+ }
+ if ((lockflags & KNF_LISTLOCKED) == 0)
+ list->kl_unlock(list->kl_lockarg);
+}
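+
+/*
+ * Illustrative driver-side sketch of feeding the function above.  "sc" is
+ * a hypothetical softc carrying its own mutex and a struct selinfo:
+ *
+ *	knlist_init_mtx(&sc->sc_rsel.si_note, &sc->sc_mtx);
+ *	...
+ *	mtx_lock(&sc->sc_mtx);
+ *	(when data becomes available)
+ *	KNOTE_LOCKED(&sc->sc_rsel.si_note, 0);
+ *	mtx_unlock(&sc->sc_mtx);
+ */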
+
+/*
+ * add a knote to a knlist
+ */
+void
+knlist_add(struct knlist *knl, struct knote *kn, int islocked)
+{
+ KNL_ASSERT_LOCK(knl, islocked);
+ KQ_NOTOWNED(kn->kn_kq);
+ KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
+ (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
+ if (!islocked)
+ knl->kl_lock(knl->kl_lockarg);
+ SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
+ if (!islocked)
+ knl->kl_unlock(knl->kl_lockarg);
+ KQ_LOCK(kn->kn_kq);
+ kn->kn_knlist = knl;
+ kn->kn_status &= ~KN_DETACHED;
+ KQ_UNLOCK(kn->kn_kq);
+}
+
+static void
+knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked)
+{
+ KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
+ KNL_ASSERT_LOCK(knl, knlislocked);
+ mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
+ if (!kqislocked)
+ KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
+ ("knlist_remove called w/o knote being KN_INFLUX or already removed"));
+ if (!knlislocked)
+ knl->kl_lock(knl->kl_lockarg);
+ SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
+ kn->kn_knlist = NULL;
+ if (!knlislocked)
+ knl->kl_unlock(knl->kl_lockarg);
+ if (!kqislocked)
+ KQ_LOCK(kn->kn_kq);
+ kn->kn_status |= KN_DETACHED;
+ if (!kqislocked)
+ KQ_UNLOCK(kn->kn_kq);
+}
+
+/*
+ * remove knote from the specified knlist
+ */
+void
+knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
+{
+
+ knlist_remove_kq(knl, kn, islocked, 0);
+}
+
+/*
+ * remove knote from the specified knlist while in f_event handler.
+ */
+void
+knlist_remove_inevent(struct knlist *knl, struct knote *kn)
+{
+
+ knlist_remove_kq(knl, kn, 1,
+ (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK);
+}
+
+int
+knlist_empty(struct knlist *knl)
+{
+
+ KNL_ASSERT_LOCKED(knl);
+ return SLIST_EMPTY(&knl->kl_list);
+}
+
+static struct mtx knlist_lock;
+MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
+ MTX_DEF);
+static void knlist_mtx_lock(void *arg);
+static void knlist_mtx_unlock(void *arg);
+
+static void
+knlist_mtx_lock(void *arg)
+{
+
+ mtx_lock((struct mtx *)arg);
+}
+
+static void
+knlist_mtx_unlock(void *arg)
+{
+
+ mtx_unlock((struct mtx *)arg);
+}
+
+static void
+knlist_mtx_assert_locked(void *arg)
+{
+
+ mtx_assert((struct mtx *)arg, MA_OWNED);
+}
+
+static void
+knlist_mtx_assert_unlocked(void *arg)
+{
+
+ mtx_assert((struct mtx *)arg, MA_NOTOWNED);
+}
+
+static void
+knlist_rw_rlock(void *arg)
+{
+
+ rw_rlock((struct rwlock *)arg);
+}
+
+static void
+knlist_rw_runlock(void *arg)
+{
+
+ rw_runlock((struct rwlock *)arg);
+}
+
+static void
+knlist_rw_assert_locked(void *arg)
+{
+
+ rw_assert((struct rwlock *)arg, RA_LOCKED);
+}
+
+static void
+knlist_rw_assert_unlocked(void *arg)
+{
+
+ rw_assert((struct rwlock *)arg, RA_UNLOCKED);
+}
+
+void
+knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
+ void (*kl_unlock)(void *),
+ void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *))
+{
+
+ if (lock == NULL)
+ knl->kl_lockarg = &knlist_lock;
+ else
+ knl->kl_lockarg = lock;
+
+ if (kl_lock == NULL)
+ knl->kl_lock = knlist_mtx_lock;
+ else
+ knl->kl_lock = kl_lock;
+ if (kl_unlock == NULL)
+ knl->kl_unlock = knlist_mtx_unlock;
+ else
+ knl->kl_unlock = kl_unlock;
+ if (kl_assert_locked == NULL)
+ knl->kl_assert_locked = knlist_mtx_assert_locked;
+ else
+ knl->kl_assert_locked = kl_assert_locked;
+ if (kl_assert_unlocked == NULL)
+ knl->kl_assert_unlocked = knlist_mtx_assert_unlocked;
+ else
+ knl->kl_assert_unlocked = kl_assert_unlocked;
+
+ SLIST_INIT(&knl->kl_list);
+}
+
+void
+knlist_init_mtx(struct knlist *knl, struct mtx *lock)
+{
+
+ knlist_init(knl, lock, NULL, NULL, NULL, NULL);
+}
+
+void
+knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
+{
+
+ knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock,
+ knlist_rw_assert_locked, knlist_rw_assert_unlocked);
+}
+
+void
+knlist_destroy(struct knlist *knl)
+{
+
+#ifdef INVARIANTS
+ /*
+ * if we run across this error, we need to find the offending
+ * driver and have it call knlist_clear or knlist_delete.
+ */
+ if (!SLIST_EMPTY(&knl->kl_list))
+ printf("WARNING: destroying knlist w/ knotes on it!\n");
+#endif
+
+ knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL;
+ SLIST_INIT(&knl->kl_list);
+}
+
+/*
+ * Even if we are locked, we may need to drop the lock to allow any influx
+ * knotes time to "settle".
+ */
+void
+knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
+{
+ struct knote *kn, *kn2;
+ struct kqueue *kq;
+
+ if (islocked)
+ KNL_ASSERT_LOCKED(knl);
+ else {
+ KNL_ASSERT_UNLOCKED(knl);
+again: /* need to reacquire lock since we have dropped it */
+ knl->kl_lock(knl->kl_lockarg);
+ }
+
+ SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
+ kq = kn->kn_kq;
+ KQ_LOCK(kq);
+ if ((kn->kn_status & KN_INFLUX)) {
+ KQ_UNLOCK(kq);
+ continue;
+ }
+ knlist_remove_kq(knl, kn, 1, 1);
+ if (killkn) {
+ kn->kn_status |= KN_INFLUX | KN_DETACHED;
+ KQ_UNLOCK(kq);
+ knote_drop(kn, td);
+ } else {
+ /* Make sure cleared knotes disappear soon */
+ kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+ KQ_UNLOCK(kq);
+ }
+ kq = NULL;
+ }
+
+ if (!SLIST_EMPTY(&knl->kl_list)) {
+ /* there are still KN_INFLUX remaining */
+ kn = SLIST_FIRST(&knl->kl_list);
+ kq = kn->kn_kq;
+ KQ_LOCK(kq);
+ KASSERT(kn->kn_status & KN_INFLUX,
+ ("knote removed w/o list lock"));
+ knl->kl_unlock(knl->kl_lockarg);
+ kq->kq_state |= KQ_FLUXWAIT;
+ msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
+ kq = NULL;
+ goto again;
+ }
+
+ if (islocked)
+ KNL_ASSERT_LOCKED(knl);
+ else {
+ knl->kl_unlock(knl->kl_lockarg);
+ KNL_ASSERT_UNLOCKED(knl);
+ }
+}
+
+/*
+ * Remove all knotes referencing a specified fd; this must be called with
+ * the FILEDESC lock held.  This prevents a race where a new fd comes along,
+ * occupies the entry, and we end up attaching a knote to the wrong fd.
+ */
+void
+knote_fdclose(struct thread *td, int fd)
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+ struct kqueue *kq;
+ struct knote *kn;
+ int influx;
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ /*
+ * We shouldn't have to worry about new kevents appearing on fd
+ * since filedesc is locked.
+ */
+ SLIST_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
+ KQ_LOCK(kq);
+
+again:
+ influx = 0;
+ while (kq->kq_knlistsize > fd &&
+ (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
+ if (kn->kn_status & KN_INFLUX) {
+ /* someone else might be waiting on our knote */
+ if (influx)
+ wakeup(kq);
+ kq->kq_state |= KQ_FLUXWAIT;
+ msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
+ goto again;
+ }
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ if (!(kn->kn_status & KN_DETACHED))
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ influx = 1;
+ KQ_LOCK(kq);
+ }
+ KQ_UNLOCK_FLUX(kq);
+ }
+}
+
+static int
+knote_attach(struct knote *kn, struct kqueue *kq)
+{
+ struct klist *list;
+
+ KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
+ KQ_OWNED(kq);
+
+ if (kn->kn_fop->f_isfd) {
+ if (kn->kn_id >= kq->kq_knlistsize)
+ return ENOMEM;
+ list = &kq->kq_knlist[kn->kn_id];
+ } else {
+ if (kq->kq_knhash == NULL)
+ return ENOMEM;
+ list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
+ }
+
+ SLIST_INSERT_HEAD(list, kn, kn_link);
+
+ return 0;
+}
+
+/*
+ * The knote must already have been detached using the f_detach method.
+ * No lock needs to be held; it is assumed that the KN_INFLUX flag is set
+ * to prevent other removal.
+ */
+static void
+knote_drop(struct knote *kn, struct thread *td)
+{
+ struct kqueue *kq;
+ struct klist *list;
+
+ kq = kn->kn_kq;
+
+ KQ_NOTOWNED(kq);
+ KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
+ ("knote_drop called without KN_INFLUX set in kn_status"));
+
+ KQ_LOCK(kq);
+ if (kn->kn_fop->f_isfd)
+ list = &kq->kq_knlist[kn->kn_id];
+ else
+ list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
+
+ if (!SLIST_EMPTY(list))
+ SLIST_REMOVE(list, kn, knote, kn_link);
+ if (kn->kn_status & KN_QUEUED)
+ knote_dequeue(kn);
+ KQ_UNLOCK_FLUX(kq);
+
+ if (kn->kn_fop->f_isfd) {
+ fdrop(kn->kn_fp, td);
+ kn->kn_fp = NULL;
+ }
+ kqueue_fo_release(kn->kn_kevent.filter);
+ kn->kn_fop = NULL;
+ knote_free(kn);
+}
+
+static void
+knote_enqueue(struct knote *kn)
+{
+ struct kqueue *kq = kn->kn_kq;
+
+ KQ_OWNED(kn->kn_kq);
+ KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
+
+ TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
+ kn->kn_status |= KN_QUEUED;
+ kq->kq_count++;
+ kqueue_wakeup(kq);
+}
+
+static void
+knote_dequeue(struct knote *kn)
+{
+ struct kqueue *kq = kn->kn_kq;
+
+ KQ_OWNED(kn->kn_kq);
+ KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
+
+ TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
+ kn->kn_status &= ~KN_QUEUED;
+ kq->kq_count--;
+}
+
+static void
+knote_init(void)
+{
+
+ knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
+
+static struct knote *
+knote_alloc(int waitok)
+{
+ return ((struct knote *)uma_zalloc(knote_zone,
+ (waitok ? M_WAITOK : M_NOWAIT)|M_ZERO));
+}
+
+static void
+knote_free(struct knote *kn)
+{
+ if (kn != NULL)
+ uma_zfree(knote_zone, kn);
+}
+
+/*
+ * Register the kev w/ the kq specified by fd.
+ */
+int
+kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
+{
+ struct kqueue *kq;
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = fget(td, fd, cap_rights_init(&rights, CAP_POST_EVENT), &fp);
+ if (error != 0)
+ return (error);
+ if ((error = kqueue_acquire(fp, &kq)) != 0)
+ goto noacquire;
+
+ error = kqueue_register(kq, kev, td, waitok);
+
+ kqueue_release(kq, 0);
+
+noacquire:
+ fdrop(fp, td);
+
+ return error;
+}
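+
+/*
+ * Illustrative in-kernel caller sketch (hypothetical; "kqfd" and "cookie"
+ * are not defined in this file).  A kernel consumer holding no locks can
+ * post an event to a process's kqueue through the function above:
+ *
+ *	struct kevent kev;
+ *
+ *	EV_SET(&kev, (uintptr_t)cookie, EVFILT_USER, EV_ADD, 0, 0, NULL);
+ *	error = kqfd_register(kqfd, &kev, td, 1);
+ */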
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
new file mode 100644
index 0000000..45f732b
--- /dev/null
+++ b/sys/kern/kern_exec.c
@@ -0,0 +1,1496 @@
+/*-
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+#include "opt_vm.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/systm.h>
+#include <sys/capability.h>
+#include <sys/eventhandler.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/filedesc.h>
+#include <sys/fcntl.h>
+#include <sys/acct.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/imgact_elf.h>
+#include <sys/wait.h>
+#include <sys/malloc.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/pioctl.h>
+#include <sys/namei.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/sf_buf.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/shm.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_pager.h>
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
+#include <machine/reg.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+dtrace_execexit_func_t dtrace_fasttrap_exec;
+#endif
+
+SDT_PROVIDER_DECLARE(proc);
+SDT_PROBE_DEFINE1(proc, kernel, , exec, exec, "char *");
+SDT_PROBE_DEFINE1(proc, kernel, , exec_failure, exec-failure, "int");
+SDT_PROBE_DEFINE1(proc, kernel, , exec_success, exec-success, "char *");
+
+MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
+
+static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
+static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
+static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
+static int do_execve(struct thread *td, struct image_args *args,
+ struct mac *mac_p);
+
+/* XXX This should be vm_size_t. */
+SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
+ NULL, 0, sysctl_kern_ps_strings, "LU", "");
+
+/* XXX This should be vm_size_t. */
+SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD|
+ CTLFLAG_CAPRD, NULL, 0, sysctl_kern_usrstack, "LU", "");
+
+SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD,
+ NULL, 0, sysctl_kern_stackprot, "I", "");
+
+u_long ps_arg_cache_limit = PAGE_SIZE / 16;
+SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
+ &ps_arg_cache_limit, 0, "");
+
+static int map_at_zero = 0;
+TUNABLE_INT("security.bsd.map_at_zero", &map_at_zero);
+SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RW, &map_at_zero, 0,
+ "Permit processes to map an object at virtual address 0.");
+
+static int
+sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
+{
+ struct proc *p;
+ int error;
+
+ p = curproc;
+#ifdef SCTL_MASK32
+ if (req->flags & SCTL_MASK32) {
+ unsigned int val;
+ val = (unsigned int)p->p_sysent->sv_psstrings;
+ error = SYSCTL_OUT(req, &val, sizeof(val));
+ } else
+#endif
+ error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
+ sizeof(p->p_sysent->sv_psstrings));
+	return (error);
+}
+
+static int
+sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
+{
+ struct proc *p;
+ int error;
+
+ p = curproc;
+#ifdef SCTL_MASK32
+ if (req->flags & SCTL_MASK32) {
+ unsigned int val;
+ val = (unsigned int)p->p_sysent->sv_usrstack;
+ error = SYSCTL_OUT(req, &val, sizeof(val));
+ } else
+#endif
+ error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
+ sizeof(p->p_sysent->sv_usrstack));
+	return (error);
+}
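+
+/*
+ * For reference, a hypothetical userland sketch that reads the OID exported
+ * above with sysctlbyname(3).  Guarded out; it is not part of the kernel
+ * build.
+ */
+#if 0
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+static unsigned long
+example_read_usrstack(void)
+{
+	unsigned long usrstack;
+	size_t len = sizeof(usrstack);
+
+	if (sysctlbyname("kern.usrstack", &usrstack, &len, NULL, 0) != 0)
+		return (0);
+	return (usrstack);
+}
+#endif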
+
+static int
+sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
+{
+ struct proc *p;
+
+ p = curproc;
+ return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
+ sizeof(p->p_sysent->sv_stackprot)));
+}
+
+/*
+ * Each of the items is a pointer to a `const struct execsw', hence the
+ * double pointer here.
+ */
+static const struct execsw **execsw;
+
+#ifndef _SYS_SYSPROTO_H_
+struct execve_args {
+ char *fname;
+ char **argv;
+ char **envv;
+};
+#endif
+
+int
+sys_execve(td, uap)
+ struct thread *td;
+ struct execve_args /* {
+ char *fname;
+ char **argv;
+ char **envv;
+ } */ *uap;
+{
+ int error;
+ struct image_args args;
+
+ error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
+ uap->argv, uap->envv);
+ if (error == 0)
+ error = kern_execve(td, &args, NULL);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct fexecve_args {
+ int fd;
+ char **argv;
+ char **envv;
+};
+#endif
+int
+sys_fexecve(struct thread *td, struct fexecve_args *uap)
+{
+ int error;
+ struct image_args args;
+
+ error = exec_copyin_args(&args, NULL, UIO_SYSSPACE,
+ uap->argv, uap->envv);
+ if (error == 0) {
+ args.fd = uap->fd;
+ error = kern_execve(td, &args, NULL);
+ }
+ return (error);
+}
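+
+/*
+ * For reference, a hypothetical userland sketch of the descriptor-based
+ * path handled above: the binary is opened with O_EXEC and passed to
+ * fexecve(2).  The path and argument vector are placeholders.  Guarded
+ * out; it is not part of the kernel build.
+ */
+#if 0
+#include <fcntl.h>
+#include <unistd.h>
+
+static void
+example_fexecve(void)
+{
+	char *argv[] = { "prog", NULL };
+	char *envv[] = { NULL };
+	int fd;
+
+	fd = open("/path/to/prog", O_EXEC);
+	if (fd != -1)
+		(void)fexecve(fd, argv, envv);
+}
+#endif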
+
+#ifndef _SYS_SYSPROTO_H_
+struct __mac_execve_args {
+ char *fname;
+ char **argv;
+ char **envv;
+ struct mac *mac_p;
+};
+#endif
+
+int
+sys___mac_execve(td, uap)
+ struct thread *td;
+ struct __mac_execve_args /* {
+ char *fname;
+ char **argv;
+ char **envv;
+ struct mac *mac_p;
+ } */ *uap;
+{
+#ifdef MAC
+ int error;
+ struct image_args args;
+
+ error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
+ uap->argv, uap->envv);
+ if (error == 0)
+ error = kern_execve(td, &args, uap->mac_p);
+ return (error);
+#else
+ return (ENOSYS);
+#endif
+}
+
+/*
+ * XXX: kern_execve has the astonishing property of not always returning to
+ * the caller. If sufficiently bad things happen during the call to
+ * do_execve(), it can end up calling exit1(); as a result, callers must
+ * avoid doing anything which they might need to undo (e.g., allocating
+ * memory).
+ */
+int
+kern_execve(td, args, mac_p)
+ struct thread *td;
+ struct image_args *args;
+ struct mac *mac_p;
+{
+ struct proc *p = td->td_proc;
+ int error;
+
+ AUDIT_ARG_ARGV(args->begin_argv, args->argc,
+ args->begin_envv - args->begin_argv);
+ AUDIT_ARG_ENVV(args->begin_envv, args->envc,
+ args->endp - args->begin_envv);
+ if (p->p_flag & P_HADTHREADS) {
+ PROC_LOCK(p);
+ if (thread_single(SINGLE_BOUNDARY)) {
+ PROC_UNLOCK(p);
+ exec_free_args(args);
+ return (ERESTART); /* Try again later. */
+ }
+ PROC_UNLOCK(p);
+ }
+
+ error = do_execve(td, args, mac_p);
+
+ if (p->p_flag & P_HADTHREADS) {
+ PROC_LOCK(p);
+ /*
+		 * If successful, we upgrade to SINGLE_EXIT state to
+		 * force the other threads to exit.
+ */
+ if (error == 0)
+ thread_single(SINGLE_EXIT);
+ else
+ thread_single_end();
+ PROC_UNLOCK(p);
+ }
+
+ return (error);
+}
+
+/*
+ * In-kernel implementation of execve(). All arguments are assumed to be
+ * userspace pointers from the passed thread.
+ */
+static int
+do_execve(td, args, mac_p)
+ struct thread *td;
+ struct image_args *args;
+ struct mac *mac_p;
+{
+ struct proc *p = td->td_proc;
+ struct nameidata nd;
+ struct ucred *newcred = NULL, *oldcred;
+ struct uidinfo *euip;
+ register_t *stack_base;
+ int error, i;
+ struct image_params image_params, *imgp;
+ struct vattr attr;
+ int (*img_first)(struct image_params *);
+ struct pargs *oldargs = NULL, *newargs = NULL;
+ struct sigacts *oldsigacts, *newsigacts;
+#ifdef KTRACE
+ struct vnode *tracevp = NULL;
+ struct ucred *tracecred = NULL;
+#endif
+ struct vnode *textvp = NULL, *binvp = NULL;
+ cap_rights_t rights;
+ int credential_changing;
+ int textset;
+#ifdef MAC
+ struct label *interpvplabel = NULL;
+ int will_transition;
+#endif
+#ifdef HWPMC_HOOKS
+ struct pmckern_procexec pe;
+#endif
+ static const char fexecv_proc_title[] = "(fexecv)";
+
+ imgp = &image_params;
+
+ /*
+ * Lock the process and set the P_INEXEC flag to indicate that
+ * it should be left alone until we're done here. This is
+ * necessary to avoid race conditions - e.g. in ptrace() -
+ * that might allow a local user to illicitly obtain elevated
+ * privileges.
+ */
+ PROC_LOCK(p);
+ KASSERT((p->p_flag & P_INEXEC) == 0,
+ ("%s(): process already has P_INEXEC flag", __func__));
+ p->p_flag |= P_INEXEC;
+ PROC_UNLOCK(p);
+
+ /*
+ * Initialize part of the common data
+ */
+ imgp->proc = p;
+ imgp->execlabel = NULL;
+ imgp->attr = &attr;
+ imgp->entry_addr = 0;
+ imgp->reloc_base = 0;
+ imgp->vmspace_destroyed = 0;
+ imgp->interpreted = 0;
+ imgp->opened = 0;
+ imgp->interpreter_name = NULL;
+ imgp->auxargs = NULL;
+ imgp->vp = NULL;
+ imgp->object = NULL;
+ imgp->firstpage = NULL;
+ imgp->ps_strings = 0;
+ imgp->auxarg_size = 0;
+ imgp->args = args;
+ imgp->execpath = imgp->freepath = NULL;
+ imgp->execpathp = 0;
+ imgp->canary = 0;
+ imgp->canarylen = 0;
+ imgp->pagesizes = 0;
+ imgp->pagesizeslen = 0;
+ imgp->stack_prot = 0;
+
+#ifdef MAC
+ error = mac_execve_enter(imgp, mac_p);
+ if (error)
+ goto exec_fail;
+#endif
+
+ imgp->image_header = NULL;
+
+ /*
+ * Translate the file name. namei() returns a vnode pointer
+	 * in ni_vp among other things.
+ *
+ * XXXAUDIT: It would be desirable to also audit the name of the
+ * interpreter if this is an interpreted binary.
+ */
+ if (args->fname != NULL) {
+ NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME
+ | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
+ }
+
+	SDT_PROBE(proc, kernel, , exec, args->fname, 0, 0, 0, 0);
+
+interpret:
+ if (args->fname != NULL) {
+#ifdef CAPABILITY_MODE
+ /*
+ * While capability mode can't reach this point via direct
+ * path arguments to execve(), we also don't allow
+ * interpreters to be used in capability mode (for now).
+ * Catch indirect lookups and return a permissions error.
+ */
+ if (IN_CAPABILITY_MODE(td)) {
+ error = ECAPMODE;
+ goto exec_fail;
+ }
+#endif
+ error = namei(&nd);
+ if (error)
+ goto exec_fail;
+
+ binvp = nd.ni_vp;
+ imgp->vp = binvp;
+ } else {
+ AUDIT_ARG_FD(args->fd);
+ /*
+ * Descriptors opened only with O_EXEC or O_RDONLY are allowed.
+ */
+ error = fgetvp_exec(td, args->fd,
+ cap_rights_init(&rights, CAP_FEXECVE), &binvp);
+ if (error)
+ goto exec_fail;
+ vn_lock(binvp, LK_EXCLUSIVE | LK_RETRY);
+ AUDIT_ARG_VNODE1(binvp);
+ imgp->vp = binvp;
+ }
+
+ /*
+ * Check file permissions (also 'opens' file)
+ */
+ error = exec_check_permissions(imgp);
+ if (error)
+ goto exec_fail_dealloc;
+
+ imgp->object = imgp->vp->v_object;
+ if (imgp->object != NULL)
+ vm_object_reference(imgp->object);
+
+ /*
+ * Set VV_TEXT now so no one can write to the executable while we're
+ * activating it.
+ *
+ * Remember if this was set before and unset it in case this is not
+ * actually an executable image.
+ */
+ textset = VOP_IS_TEXT(imgp->vp);
+ VOP_SET_TEXT(imgp->vp);
+
+ error = exec_map_first_page(imgp);
+ if (error)
+ goto exec_fail_dealloc;
+
+ imgp->proc->p_osrel = 0;
+ /*
+ * If the current process has a special image activator it
+ * wants to try first, call it. For example, emulating shell
+ * scripts differently.
+ */
+ error = -1;
+ if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
+ error = img_first(imgp);
+
+ /*
+ * Loop through the list of image activators, calling each one.
+ * An activator returns -1 if there is no match, 0 on success,
+ * and an error otherwise.
+ */
+ for (i = 0; error == -1 && execsw[i]; ++i) {
+ if (execsw[i]->ex_imgact == NULL ||
+ execsw[i]->ex_imgact == img_first) {
+ continue;
+ }
+ error = (*execsw[i]->ex_imgact)(imgp);
+ }
+
+ if (error) {
+ if (error == -1) {
+ if (textset == 0)
+ VOP_UNSET_TEXT(imgp->vp);
+ error = ENOEXEC;
+ }
+ goto exec_fail_dealloc;
+ }
+
+ /*
+ * Special interpreter operation, cleanup and loop up to try to
+ * activate the interpreter.
+ */
+ if (imgp->interpreted) {
+ exec_unmap_first_page(imgp);
+ /*
+ * VV_TEXT needs to be unset for scripts. There is a short
+ * period before we determine that something is a script where
+ * VV_TEXT will be set. The vnode lock is held over this
+ * entire period so nothing should illegitimately be blocked.
+ */
+ VOP_UNSET_TEXT(imgp->vp);
+ /* free name buffer and old vnode */
+ if (args->fname != NULL)
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+#ifdef MAC
+ mac_execve_interpreter_enter(binvp, &interpvplabel);
+#endif
+ if (imgp->opened) {
+ VOP_CLOSE(binvp, FREAD, td->td_ucred, td);
+ imgp->opened = 0;
+ }
+ vput(binvp);
+ vm_object_deallocate(imgp->object);
+ imgp->object = NULL;
+ /* set new name to that of the interpreter */
+ NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
+ UIO_SYSSPACE, imgp->interpreter_name, td);
+ args->fname = imgp->interpreter_name;
+ goto interpret;
+ }
+
+ /*
+ * NB: We unlock the vnode here because it is believed that none
+ * of the sv_copyout_strings/sv_fixup operations require the vnode.
+ */
+ VOP_UNLOCK(imgp->vp, 0);
+
+ /*
+	 * Do our best to calculate the full path to the image file.
+ */
+ if (imgp->auxargs != NULL &&
+ ((args->fname != NULL && args->fname[0] == '/') ||
+ vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0))
+ imgp->execpath = args->fname;
+
+ /*
+ * Copy out strings (args and env) and initialize stack base
+ */
+ if (p->p_sysent->sv_copyout_strings)
+ stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
+ else
+ stack_base = exec_copyout_strings(imgp);
+
+ /*
+	 * If a custom stack fixup routine is present for this process,
+	 * let it do the stack setup.  Otherwise, push the argument count
+	 * as the first item on the stack.
+ */
+ if (p->p_sysent->sv_fixup != NULL)
+ (*p->p_sysent->sv_fixup)(&stack_base, imgp);
+ else
+ suword(--stack_base, imgp->args->argc);
+
+ /*
+ * For security and other reasons, the file descriptor table cannot
+ * be shared after an exec.
+ */
+ fdunshare(p, td);
+
+ /*
+ * Malloc things before we need locks.
+ */
+ newcred = crget();
+ euip = uifind(attr.va_uid);
+ i = imgp->args->begin_envv - imgp->args->begin_argv;
+ /* Cache arguments if they fit inside our allowance */
+ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
+ newargs = pargs_alloc(i);
+ bcopy(imgp->args->begin_argv, newargs->ar_args, i);
+ }
+
+ /* close files on exec */
+ fdcloseexec(td);
+ vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
+
+ /* Get a reference to the vnode prior to locking the proc */
+ VREF(binvp);
+
+ /*
+ * For security and other reasons, signal handlers cannot
+ * be shared after an exec. The new process gets a copy of the old
+ * handlers. In execsigs(), the new process will have its signals
+ * reset.
+ */
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+ if (sigacts_shared(p->p_sigacts)) {
+ oldsigacts = p->p_sigacts;
+ PROC_UNLOCK(p);
+ newsigacts = sigacts_alloc();
+ sigacts_copy(newsigacts, oldsigacts);
+ PROC_LOCK(p);
+ p->p_sigacts = newsigacts;
+ } else
+ oldsigacts = NULL;
+
+ /* Stop profiling */
+ stopprofclock(p);
+
+ /* reset caught signals */
+ execsigs(p);
+
+ /* name this process - nameiexec(p, ndp) */
+ bzero(p->p_comm, sizeof(p->p_comm));
+ if (args->fname)
+ bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
+ min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
+ else if (vn_commname(binvp, p->p_comm, sizeof(p->p_comm)) != 0)
+ bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
+ bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
+#ifdef KTR
+ sched_clear_tdname(td);
+#endif
+
+ /*
+ * mark as execed, wakeup the process that vforked (if any) and tell
+ * it that it now has its own resources back
+ */
+ p->p_flag |= P_EXEC;
+ if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
+ p->p_flag &= ~(P_PPWAIT | P_PPTRACE);
+ cv_broadcast(&p->p_pwait);
+ }
+
+ /*
+ * Implement image setuid/setgid.
+ *
+ * Don't honor setuid/setgid if the filesystem prohibits it or if
+ * the process is being traced.
+ *
+	 * We disable setuid/setgid/etc in capability mode on the basis
+ * that most setugid applications are not written with that
+ * environment in mind, and will therefore almost certainly operate
+ * incorrectly. In principle there's no reason that setugid
+ * applications might not be useful in capability mode, so we may want
+ * to reconsider this conservative design choice in the future.
+ *
+ * XXXMAC: For the time being, use NOSUID to also prohibit
+ * transitions on the file system.
+ */
+ credential_changing = 0;
+ credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid !=
+ attr.va_uid;
+ credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid !=
+ attr.va_gid;
+#ifdef MAC
+ will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
+ interpvplabel, imgp);
+ credential_changing |= will_transition;
+#endif
+
+ if (credential_changing &&
+#ifdef CAPABILITY_MODE
+ ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
+#endif
+ (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
+ (p->p_flag & P_TRACED) == 0) {
+ /*
+ * Turn off syscall tracing for set-id programs, except for
+ * root. Record any set-id flags first to make sure that
+ * we do not regain any tracing during a possible block.
+ */
+ setsugid(p);
+
+#ifdef KTRACE
+ if (p->p_tracecred != NULL &&
+ priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED, 0))
+ ktrprocexec(p, &tracecred, &tracevp);
+#endif
+ /*
+ * Close any file descriptors 0..2 that reference procfs,
+ * then make sure file descriptors 0..2 are in use.
+ *
+ * setugidsafety() may call closef() and then pfind()
+ * which may grab the process lock.
+ * fdcheckstd() may call falloc() which may block to
+ * allocate memory, so temporarily drop the process lock.
+ */
+ PROC_UNLOCK(p);
+ VOP_UNLOCK(imgp->vp, 0);
+ setugidsafety(td);
+ error = fdcheckstd(td);
+ vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
+ if (error != 0)
+ goto done1;
+ PROC_LOCK(p);
+ /*
+ * Set the new credentials.
+ */
+ if (attr.va_mode & S_ISUID)
+ change_euid(newcred, euip);
+ if (attr.va_mode & S_ISGID)
+ change_egid(newcred, attr.va_gid);
+#ifdef MAC
+ if (will_transition) {
+ mac_vnode_execve_transition(oldcred, newcred, imgp->vp,
+ interpvplabel, imgp);
+ }
+#endif
+ /*
+ * Implement correct POSIX saved-id behavior.
+ *
+ * XXXMAC: Note that the current logic will save the
+ * uid and gid if a MAC domain transition occurs, even
+ * though maybe it shouldn't.
+ */
+ change_svuid(newcred, newcred->cr_uid);
+ change_svgid(newcred, newcred->cr_gid);
+ p->p_ucred = newcred;
+ newcred = NULL;
+ } else {
+ if (oldcred->cr_uid == oldcred->cr_ruid &&
+ oldcred->cr_gid == oldcred->cr_rgid)
+ p->p_flag &= ~P_SUGID;
+ /*
+ * Implement correct POSIX saved-id behavior.
+ *
+ * XXX: It's not clear that the existing behavior is
+ * POSIX-compliant. A number of sources indicate that the
+ * saved uid/gid should only be updated if the new ruid is
+ * not equal to the old ruid, or the new euid is not equal
+ * to the old euid and the new euid is not equal to the old
+ * ruid. The FreeBSD code always updates the saved uid/gid.
+ * Also, this code uses the new (replaced) euid and egid as
+ * the source, which may or may not be the right ones to use.
+ */
+ if (oldcred->cr_svuid != oldcred->cr_uid ||
+ oldcred->cr_svgid != oldcred->cr_gid) {
+ change_svuid(newcred, newcred->cr_uid);
+ change_svgid(newcred, newcred->cr_gid);
+ p->p_ucred = newcred;
+ newcred = NULL;
+ }
+ }
+
+ /*
+ * Store the vp for use in procfs. This vnode was referenced prior
+ * to locking the proc lock.
+ */
+ textvp = p->p_textvp;
+ p->p_textvp = binvp;
+
+#ifdef KDTRACE_HOOKS
+ /*
+ * Tell the DTrace fasttrap provider about the exec if it
+ * has declared an interest.
+ */
+ if (dtrace_fasttrap_exec)
+ dtrace_fasttrap_exec(p);
+#endif
+
+ /*
+ * Notify others that we exec'd, and clear the P_INEXEC flag
+ * as we're now a bona fide freshly-execed process.
+ */
+ KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
+ p->p_flag &= ~P_INEXEC;
+
+ /* clear "fork but no exec" flag, as we _are_ execing */
+ p->p_acflag &= ~AFORK;
+
+ /*
+ * Free any previous argument cache and replace it with
+ * the new argument cache, if any.
+ */
+ oldargs = p->p_args;
+ p->p_args = newargs;
+ newargs = NULL;
+
+#ifdef HWPMC_HOOKS
+ /*
+ * Check if system-wide sampling is in effect or if the
+ * current process is using PMCs. If so, do exec() time
+ * processing. This processing needs to happen AFTER the
+ * P_INEXEC flag is cleared.
+ *
+ * The proc lock needs to be released before taking the PMC
+ * SX.
+ */
+ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
+ PROC_UNLOCK(p);
+ VOP_UNLOCK(imgp->vp, 0);
+ pe.pm_credentialschanged = credential_changing;
+ pe.pm_entryaddr = imgp->entry_addr;
+
+ PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
+ vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
+ } else
+ PROC_UNLOCK(p);
+#else /* !HWPMC_HOOKS */
+ PROC_UNLOCK(p);
+#endif
+
+ /* Set values passed into the program in registers. */
+ if (p->p_sysent->sv_setregs)
+ (*p->p_sysent->sv_setregs)(td, imgp,
+ (u_long)(uintptr_t)stack_base);
+ else
+ exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base);
+
+ vfs_mark_atime(imgp->vp, td->td_ucred);
+
+ SDT_PROBE(proc, kernel, , exec_success, args->fname, 0, 0, 0, 0);
+
+done1:
+ /*
+ * Free any resources malloc'd earlier that we didn't use.
+ */
+ uifree(euip);
+ if (newcred == NULL)
+ crfree(oldcred);
+ else
+ crfree(newcred);
+ VOP_UNLOCK(imgp->vp, 0);
+
+ /*
+ * Handle deferred decrement of ref counts.
+ */
+ if (textvp != NULL)
+ vrele(textvp);
+ if (binvp && error != 0)
+ vrele(binvp);
+#ifdef KTRACE
+ if (tracevp != NULL)
+ vrele(tracevp);
+ if (tracecred != NULL)
+ crfree(tracecred);
+#endif
+ vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
+ pargs_drop(oldargs);
+ pargs_drop(newargs);
+ if (oldsigacts != NULL)
+ sigacts_free(oldsigacts);
+
+exec_fail_dealloc:
+
+ /*
+ * free various allocated resources
+ */
+ if (imgp->firstpage != NULL)
+ exec_unmap_first_page(imgp);
+
+ if (imgp->vp != NULL) {
+ if (args->fname)
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (imgp->opened)
+ VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
+ vput(imgp->vp);
+ }
+
+ if (imgp->object != NULL)
+ vm_object_deallocate(imgp->object);
+
+ free(imgp->freepath, M_TEMP);
+
+ if (error == 0) {
+ PROC_LOCK(p);
+ td->td_dbgflags |= TDB_EXEC;
+ PROC_UNLOCK(p);
+
+ /*
+ * Stop the process here if its stop event mask has
+ * the S_EXEC bit set.
+ */
+ STOPEVENT(p, S_EXEC, 0);
+ goto done2;
+ }
+
+exec_fail:
+ /* we're done here, clear P_INEXEC */
+ PROC_LOCK(p);
+ p->p_flag &= ~P_INEXEC;
+ PROC_UNLOCK(p);
+
+ SDT_PROBE(proc, kernel, , exec_failure, error, 0, 0, 0, 0);
+
+done2:
+#ifdef MAC
+ mac_execve_exit(imgp);
+ mac_execve_interpreter_exit(interpvplabel);
+#endif
+ exec_free_args(args);
+
+ if (error && imgp->vmspace_destroyed) {
+		/* Sorry, there is no process to return to; exit gracefully. */
+ exit1(td, W_EXITCODE(0, SIGABRT));
+ /* NOT REACHED */
+ }
+
+#ifdef KTRACE
+ if (error == 0)
+ ktrprocctor(p);
+#endif
+
+ return (error);
+}
+
+int
+exec_map_first_page(imgp)
+ struct image_params *imgp;
+{
+ int rv, i;
+ int initial_pagein;
+ vm_page_t ma[VM_INITIAL_PAGEIN];
+ vm_object_t object;
+
+ if (imgp->firstpage != NULL)
+ exec_unmap_first_page(imgp);
+
+ object = imgp->vp->v_object;
+ if (object == NULL)
+ return (EACCES);
+ VM_OBJECT_WLOCK(object);
+#if VM_NRESERVLEVEL > 0
+ if ((object->flags & OBJ_COLORED) == 0) {
+ object->flags |= OBJ_COLORED;
+ object->pg_color = 0;
+ }
+#endif
+ ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL);
+ if (ma[0]->valid != VM_PAGE_BITS_ALL) {
+ initial_pagein = VM_INITIAL_PAGEIN;
+ if (initial_pagein > object->size)
+ initial_pagein = object->size;
+ for (i = 1; i < initial_pagein; i++) {
+ if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) {
+ if (ma[i]->valid)
+ break;
+ if (vm_page_tryxbusy(ma[i]))
+ break;
+ } else {
+ ma[i] = vm_page_alloc(object, i,
+ VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED);
+ if (ma[i] == NULL)
+ break;
+ }
+ }
+ initial_pagein = i;
+ rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
+ ma[0] = vm_page_lookup(object, 0);
+ if ((rv != VM_PAGER_OK) || (ma[0] == NULL)) {
+ if (ma[0] != NULL) {
+ vm_page_lock(ma[0]);
+ vm_page_free(ma[0]);
+ vm_page_unlock(ma[0]);
+ }
+ VM_OBJECT_WUNLOCK(object);
+ return (EIO);
+ }
+ }
+ vm_page_xunbusy(ma[0]);
+ vm_page_lock(ma[0]);
+ vm_page_hold(ma[0]);
+ vm_page_unlock(ma[0]);
+ VM_OBJECT_WUNLOCK(object);
+
+ imgp->firstpage = sf_buf_alloc(ma[0], 0);
+ imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
+
+ return (0);
+}
+
+void
+exec_unmap_first_page(imgp)
+ struct image_params *imgp;
+{
+ vm_page_t m;
+
+ if (imgp->firstpage != NULL) {
+ m = sf_buf_page(imgp->firstpage);
+ sf_buf_free(imgp->firstpage);
+ imgp->firstpage = NULL;
+ vm_page_lock(m);
+ vm_page_unhold(m);
+ vm_page_unlock(m);
+ }
+}
+
+/*
+ * Destroy old address space, and allocate a new stack
+ * The new stack is only SGROWSIZ large because it is grown
+ * automatically in trap.c.
+ */
+int
+exec_new_vmspace(imgp, sv)
+ struct image_params *imgp;
+ struct sysentvec *sv;
+{
+ int error;
+ struct proc *p = imgp->proc;
+ struct vmspace *vmspace = p->p_vmspace;
+ vm_object_t obj;
+ vm_offset_t sv_minuser, stack_addr;
+ vm_map_t map;
+ u_long ssiz;
+
+ imgp->vmspace_destroyed = 1;
+ imgp->sysent = sv;
+
+ /* May be called with Giant held */
+ EVENTHANDLER_INVOKE(process_exec, p, imgp);
+
+ /*
+	 * Blow away the entire process VM if the address space is not
+	 * shared; otherwise, create a new VM space so that other threads
+	 * are not disrupted.
+ */
+ map = &vmspace->vm_map;
+ if (map_at_zero)
+ sv_minuser = sv->sv_minuser;
+ else
+ sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
+ if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser &&
+ vm_map_max(map) == sv->sv_maxuser) {
+ shmexit(vmspace);
+ pmap_remove_pages(vmspace_pmap(vmspace));
+ vm_map_remove(map, vm_map_min(map), vm_map_max(map));
+ } else {
+ error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
+ if (error)
+ return (error);
+ vmspace = p->p_vmspace;
+ map = &vmspace->vm_map;
+ }
+
+ /* Map a shared page */
+ obj = sv->sv_shared_page_obj;
+ if (obj != NULL) {
+ vm_object_reference(obj);
+ error = vm_map_fixed(map, obj, 0,
+ sv->sv_shared_page_base, sv->sv_shared_page_len,
+ VM_PROT_READ | VM_PROT_EXECUTE,
+ VM_PROT_READ | VM_PROT_EXECUTE,
+ MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
+ if (error) {
+ vm_object_deallocate(obj);
+ return (error);
+ }
+ }
+
+ /* Allocate a new stack */
+ if (sv->sv_maxssiz != NULL)
+ ssiz = *sv->sv_maxssiz;
+ else
+ ssiz = maxssiz;
+ stack_addr = sv->sv_usrstack - ssiz;
+ error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
+ obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
+ sv->sv_stackprot,
+ VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
+ if (error)
+ return (error);
+
+#ifdef __ia64__
+ /* Allocate a new register stack */
+ stack_addr = IA64_BACKINGSTORE;
+ error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
+ sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
+ if (error)
+ return (error);
+#endif
+
+	/*
+	 * vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
+ * VM_STACK case, but they are still used to monitor the size of the
+ * process stack so we can check the stack rlimit.
+ */
+ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
+ vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz;
+
+ return (0);
+}
+
+/*
+ * Copy in the argument and environment strings from the old process address
+ * space into the temporary string buffer.
+ */
+int
+exec_copyin_args(struct image_args *args, char *fname,
+ enum uio_seg segflg, char **argv, char **envv)
+{
+ char *argp, *envp;
+ int error;
+ size_t length;
+
+ bzero(args, sizeof(*args));
+ if (argv == NULL)
+ return (EFAULT);
+
+ /*
+ * Allocate demand-paged memory for the file name, argument, and
+ * environment strings.
+ */
+ error = exec_alloc_args(args);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Copy the file name.
+ */
+ if (fname != NULL) {
+ args->fname = args->buf;
+ error = (segflg == UIO_SYSSPACE) ?
+ copystr(fname, args->fname, PATH_MAX, &length) :
+ copyinstr(fname, args->fname, PATH_MAX, &length);
+ if (error != 0)
+ goto err_exit;
+ } else
+ length = 0;
+
+ args->begin_argv = args->buf + length;
+ args->endp = args->begin_argv;
+ args->stringspace = ARG_MAX;
+
+ /*
+ * extract arguments first
+ */
+ while ((argp = (caddr_t) (intptr_t) fuword(argv++))) {
+ if (argp == (caddr_t) -1) {
+ error = EFAULT;
+ goto err_exit;
+ }
+ if ((error = copyinstr(argp, args->endp,
+ args->stringspace, &length))) {
+ if (error == ENAMETOOLONG)
+ error = E2BIG;
+ goto err_exit;
+ }
+ args->stringspace -= length;
+ args->endp += length;
+ args->argc++;
+ }
+
+ args->begin_envv = args->endp;
+
+ /*
+ * extract environment strings
+ */
+ if (envv) {
+ while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
+ if (envp == (caddr_t)-1) {
+ error = EFAULT;
+ goto err_exit;
+ }
+ if ((error = copyinstr(envp, args->endp,
+ args->stringspace, &length))) {
+ if (error == ENAMETOOLONG)
+ error = E2BIG;
+ goto err_exit;
+ }
+ args->stringspace -= length;
+ args->endp += length;
+ args->envc++;
+ }
+ }
+
+ return (0);
+
+err_exit:
+ exec_free_args(args);
+ return (error);
+}
+
+/*
+ * Allocate temporary demand-paged, zero-filled memory for the file name,
+ * argument, and environment strings. Returns zero if the allocation succeeds
+ * and ENOMEM otherwise.
+ */
+int
+exec_alloc_args(struct image_args *args)
+{
+
+ args->buf = (char *)kmap_alloc_wait(exec_map, PATH_MAX + ARG_MAX);
+ return (args->buf != NULL ? 0 : ENOMEM);
+}
+
+void
+exec_free_args(struct image_args *args)
+{
+
+ if (args->buf != NULL) {
+ kmap_free_wakeup(exec_map, (vm_offset_t)args->buf,
+ PATH_MAX + ARG_MAX);
+ args->buf = NULL;
+ }
+ if (args->fname_buf != NULL) {
+ free(args->fname_buf, M_TEMP);
+ args->fname_buf = NULL;
+ }
+}
+
+/*
+ * Copy strings out to the new process address space, constructing new arg
+ * and env vector tables. Return a pointer to the base so that it can be used
+ * as the initial stack pointer.
+ */
+register_t *
+exec_copyout_strings(imgp)
+ struct image_params *imgp;
+{
+ int argc, envc;
+ char **vectp;
+ char *stringp, *destp;
+ register_t *stack_base;
+ struct ps_strings *arginfo;
+ struct proc *p;
+ size_t execpath_len;
+ int szsigcode, szps;
+ char canary[sizeof(long) * 8];
+
+ szps = sizeof(pagesizes[0]) * MAXPAGESIZES;
+ /*
+ * Calculate string base and vector table pointers.
+ * Also deal with signal trampoline code for this exec type.
+ */
+ if (imgp->execpath != NULL && imgp->auxargs != NULL)
+ execpath_len = strlen(imgp->execpath) + 1;
+ else
+ execpath_len = 0;
+ p = imgp->proc;
+ szsigcode = 0;
+ arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
+ if (p->p_sysent->sv_sigcode_base == 0) {
+ if (p->p_sysent->sv_szsigcode != NULL)
+ szsigcode = *(p->p_sysent->sv_szsigcode);
+ }
+ destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
+ roundup(execpath_len, sizeof(char *)) -
+ roundup(sizeof(canary), sizeof(char *)) -
+ roundup(szps, sizeof(char *)) -
+ roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
+
+ /*
+ * install sigcode
+ */
+ if (szsigcode != 0)
+ copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
+ szsigcode), szsigcode);
+
+ /*
+ * Copy the image path for the rtld.
+ */
+ if (execpath_len != 0) {
+ imgp->execpathp = (uintptr_t)arginfo - szsigcode - execpath_len;
+ copyout(imgp->execpath, (void *)imgp->execpathp,
+ execpath_len);
+ }
+
+ /*
+ * Prepare the canary for SSP.
+ */
+ arc4rand(canary, sizeof(canary), 0);
+ imgp->canary = (uintptr_t)arginfo - szsigcode - execpath_len -
+ sizeof(canary);
+ copyout(canary, (void *)imgp->canary, sizeof(canary));
+ imgp->canarylen = sizeof(canary);
+
+ /*
+ * Prepare the pagesizes array.
+ */
+ imgp->pagesizes = (uintptr_t)arginfo - szsigcode - execpath_len -
+ roundup(sizeof(canary), sizeof(char *)) - szps;
+ copyout(pagesizes, (void *)imgp->pagesizes, szps);
+ imgp->pagesizeslen = szps;
+
+ /*
+ * If we have a valid auxargs ptr, prepare some room
+ * on the stack.
+ */
+ if (imgp->auxargs) {
+ /*
+		 * 'AT_COUNT*2' is the size of the ELF auxargs data.  This
+		 * default is kept for backward compatibility.
+ */
+ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
+ (AT_COUNT * 2);
+ /*
+ * The '+ 2' is for the null pointers at the end of each of
+		 * the arg and env vector sets, and imgp->auxarg_size is room
+		 * for the arguments of the runtime loader.
+ */
+ vectp = (char **)(destp - (imgp->args->argc +
+ imgp->args->envc + 2 + imgp->auxarg_size)
+ * sizeof(char *));
+ } else {
+ /*
+ * The '+ 2' is for the null pointers at the end of each of
+ * the arg and env vector sets
+ */
+ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) *
+ sizeof(char *));
+ }
+
+ /*
+ * vectp also becomes our initial stack base
+ */
+ stack_base = (register_t *)vectp;
+
+ stringp = imgp->args->begin_argv;
+ argc = imgp->args->argc;
+ envc = imgp->args->envc;
+
+ /*
+ * Copy out strings - arguments and environment.
+ */
+ copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
+
+ /*
+ * Fill in "ps_strings" struct for ps, w, etc.
+ */
+ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
+ suword32(&arginfo->ps_nargvstr, argc);
+
+ /*
+ * Fill in argument portion of vector table.
+ */
+ for (; argc > 0; --argc) {
+ suword(vectp++, (long)(intptr_t)destp);
+ while (*stringp++ != 0)
+ destp++;
+ destp++;
+ }
+
+ /* a null vector table pointer separates the argp's from the envp's */
+ suword(vectp++, 0);
+
+ suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
+ suword32(&arginfo->ps_nenvstr, envc);
+
+ /*
+ * Fill in environment portion of vector table.
+ */
+ for (; envc > 0; --envc) {
+ suword(vectp++, (long)(intptr_t)destp);
+ while (*stringp++ != 0)
+ destp++;
+ destp++;
+ }
+
+ /* end of vector table is a null pointer */
+ suword(vectp, 0);
+
+ return (stack_base);
+}
+
+/*
+ * Check permissions of file to execute.
+ * Called with imgp->vp locked.
+ * Return 0 for success or error code on failure.
+ */
+int
+exec_check_permissions(imgp)
+ struct image_params *imgp;
+{
+ struct vnode *vp = imgp->vp;
+ struct vattr *attr = imgp->attr;
+ struct thread *td;
+ int error, writecount;
+
+ td = curthread;
+
+ /* Get file attributes */
+ error = VOP_GETATTR(vp, attr, td->td_ucred);
+ if (error)
+ return (error);
+
+#ifdef MAC
+ error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp);
+ if (error)
+ return (error);
+#endif
+
+ /*
+ * 1) Check if file execution is disabled for the filesystem that
+ * this file resides on.
+ * 2) Ensure that at least one execute bit is on. Otherwise, a
+ * privileged user will always succeed, and we don't want this
+ * to happen unless the file really is executable.
+ * 3) Ensure that the file is a regular file.
+ */
+ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
+ (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 ||
+ (attr->va_type != VREG))
+ return (EACCES);
+
+ /*
+ * Zero length files can't be exec'd
+ */
+ if (attr->va_size == 0)
+ return (ENOEXEC);
+
+ /*
+ * Check for execute permission to file based on current credentials.
+ */
+ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
+ if (error)
+ return (error);
+
+ /*
+ * Check number of open-for-writes on the file and deny execution
+ * if there are any.
+ */
+ error = VOP_GET_WRITECOUNT(vp, &writecount);
+ if (error != 0)
+ return (error);
+ if (writecount != 0)
+ return (ETXTBSY);
+
+ /*
+ * Call filesystem specific open routine (which does nothing in the
+ * general case).
+ */
+ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
+ if (error == 0)
+ imgp->opened = 1;
+ return (error);
+}
+
+/*
+ * Exec handler registration
+ */
+int
+exec_register(execsw_arg)
+ const struct execsw *execsw_arg;
+{
+ const struct execsw **es, **xs, **newexecsw;
+ int count = 2; /* New slot and trailing NULL */
+
+ if (execsw)
+ for (es = execsw; *es; es++)
+ count++;
+ newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
+ if (newexecsw == NULL)
+ return (ENOMEM);
+ xs = newexecsw;
+ if (execsw)
+ for (es = execsw; *es; es++)
+ *xs++ = *es;
+ *xs++ = execsw_arg;
+ *xs = NULL;
+ if (execsw)
+ free(execsw, M_TEMP);
+ execsw = newexecsw;
+ return (0);
+}
+
+int
+exec_unregister(execsw_arg)
+ const struct execsw *execsw_arg;
+{
+ const struct execsw **es, **xs, **newexecsw;
+ int count = 1;
+
+ if (execsw == NULL)
+ panic("unregister with no handlers left?\n");
+
+ for (es = execsw; *es; es++) {
+ if (*es == execsw_arg)
+ break;
+ }
+ if (*es == NULL)
+ return (ENOENT);
+ for (es = execsw; *es; es++)
+ if (*es != execsw_arg)
+ count++;
+ newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
+ if (newexecsw == NULL)
+ return (ENOMEM);
+ xs = newexecsw;
+ for (es = execsw; *es; es++)
+ if (*es != execsw_arg)
+ *xs++ = *es;
+ *xs = NULL;
+ if (execsw)
+ free(execsw, M_TEMP);
+ execsw = newexecsw;
+ return (0);
+}
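+
+/*
+ * A minimal, hypothetical sketch of an image activator plugging into the
+ * execsw[] table managed above.  The EXEC_SET() macro from <sys/imgact.h>
+ * arranges for exec_register()/exec_unregister() to run at module load and
+ * unload.  All names here are placeholders; guarded out so it is never
+ * compiled.
+ */
+#if 0
+static int
+exec_example_imgact(struct image_params *imgp)
+{
+
+	/* Returning -1 tells the activator loop to try the next handler. */
+	return (-1);
+}
+
+static struct execsw example_execsw = {
+	.ex_imgact = exec_example_imgact,
+	.ex_name = "example"
+};
+EXEC_SET(example, example_execsw);
+#endif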
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
new file mode 100644
index 0000000..f0be10e
--- /dev/null
+++ b/sys/kern/kern_exit.c
@@ -0,0 +1,1261 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+#include "opt_procdesc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/capability.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/procdesc.h>
+#include <sys/pioctl.h>
+#include <sys/jail.h>
+#include <sys/tty.h>
+#include <sys/wait.h>
+#include <sys/vmmeter.h>
+#include <sys/vnode.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sbuf.h>
+#include <sys/signalvar.h>
+#include <sys/sched.h>
+#include <sys/sx.h>
+#include <sys/syscallsubr.h>
+#include <sys/syslog.h>
+#include <sys/ptrace.h>
+#include <sys/acct.h> /* for acct_process() function prototype */
+#include <sys/filedesc.h>
+#include <sys/sdt.h>
+#include <sys/shm.h>
+#include <sys/sem.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+dtrace_execexit_func_t dtrace_fasttrap_exit;
+#endif
+
+SDT_PROVIDER_DECLARE(proc);
+SDT_PROBE_DEFINE1(proc, kernel, , exit, exit, "int");
+
+/* Hook for NFS teardown procedure. */
+void (*nlminfo_release_p)(struct proc *p);
+
+static void
+clear_orphan(struct proc *p)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ if (p->p_flag & P_ORPHAN) {
+ LIST_REMOVE(p, p_orphan);
+ p->p_flag &= ~P_ORPHAN;
+ }
+}
+
+/*
+ * exit -- death of process.
+ */
+void
+sys_sys_exit(struct thread *td, struct sys_exit_args *uap)
+{
+
+ exit1(td, W_EXITCODE(uap->rval, 0));
+ /* NOTREACHED */
+}
+
+/*
+ * Exit: deallocate address space and other resources, change proc state to
+ * zombie, and unlink proc from allproc and parent's lists. Save exit status
+ * and rusage for wait(). Check for child processes and orphan them.
+ */
+void
+exit1(struct thread *td, int rv)
+{
+ struct proc *p, *nq, *q;
+ struct vnode *vtmp;
+ struct vnode *ttyvp = NULL;
+ struct plimit *plim;
+
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ p = td->td_proc;
+ /*
+ * XXX in case we're rebooting we just let init die in order to
+ * work around an unsolved stack overflow seen very late during
+ * shutdown on sparc64 when the gmirror worker process exists.
+ */
+ if (p == initproc && rebooting == 0) {
+ printf("init died (signal %d, exit %d)\n",
+ WTERMSIG(rv), WEXITSTATUS(rv));
+ panic("Going nowhere without my init!");
+ }
+
+ /*
+ * MUST abort all other threads before proceeding past here.
+ */
+ PROC_LOCK(p);
+ while (p->p_flag & P_HADTHREADS) {
+ /*
+ * First check if some other thread got here before us.
+ * If so, act appropriately: exit or suspend.
+ */
+ thread_suspend_check(0);
+
+ /*
+ * Kill off the other threads. This requires
+ * some co-operation from other parts of the kernel
+ * so it may not be instantaneous. With this state set
+ * any thread entering the kernel from userspace will
+ * thread_exit() in trap(). Any thread attempting to
+ * sleep will return immediately with EINTR or EWOULDBLOCK
+ * which will hopefully force them to back out to userland
+ * freeing resources as they go. Any thread attempting
+ * to return to userland will thread_exit() from userret().
+ * thread_exit() will unsuspend us when the last of the
+ * other threads exits.
+ * If there is already a thread singler after resumption,
+ * calling thread_single will fail; in that case, we just
+		 * re-check all suspension requests; the thread should
+ * either be suspended there or exit.
+ */
+ if (!thread_single(SINGLE_EXIT))
+ break;
+
+ /*
+ * All other activity in this process is now stopped.
+ * Threading support has been turned off.
+ */
+ }
+ KASSERT(p->p_numthreads == 1,
+ ("exit1: proc %p exiting with %d threads", p, p->p_numthreads));
+ racct_sub(p, RACCT_NTHR, 1);
+ /*
+ * Wakeup anyone in procfs' PIOCWAIT. They should have a hold
+ * on our vmspace, so we should block below until they have
+ * released their reference to us. Note that if they have
+ * requested S_EXIT stops we will block here until they ack
+ * via PIOCCONT.
+ */
+ _STOPEVENT(p, S_EXIT, rv);
+
+ /*
+ * Ignore any pending request to stop due to a stop signal.
+ * Once P_WEXIT is set, future requests will be ignored as
+ * well.
+ */
+ p->p_flag &= ~P_STOPPED_SIG;
+ KASSERT(!P_SHOULDSTOP(p), ("exiting process is stopped"));
+
+ /*
+ * Note that we are exiting and do another wakeup of anyone in
+ * PIOCWAIT in case they aren't listening for S_EXIT stops or
+ * decided to wait again after we told them we are exiting.
+ */
+ p->p_flag |= P_WEXIT;
+ wakeup(&p->p_stype);
+
+ /*
+ * Wait for any processes that have a hold on our vmspace to
+ * release their reference.
+ */
+ while (p->p_lock > 0)
+ msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0);
+
+ p->p_xstat = rv; /* Let event handler change exit status */
+ PROC_UNLOCK(p);
+ /* Drain the limit callout while we don't have the proc locked */
+ callout_drain(&p->p_limco);
+
+#ifdef AUDIT
+ /*
+ * The Sun BSM exit token contains two components: an exit status as
+ * passed to exit(), and a return value to indicate what sort of exit
+ * it was. The exit status is WEXITSTATUS(rv), but it's not clear
+ * what the return value is.
+ */
+ AUDIT_ARG_EXIT(WEXITSTATUS(rv), 0);
+ AUDIT_SYSCALL_EXIT(0, td);
+#endif
+
+ /* Are we a task leader? */
+ if (p == p->p_leader) {
+ mtx_lock(&ppeers_lock);
+ q = p->p_peers;
+ while (q != NULL) {
+ PROC_LOCK(q);
+ kern_psignal(q, SIGKILL);
+ PROC_UNLOCK(q);
+ q = q->p_peers;
+ }
+ while (p->p_peers != NULL)
+ msleep(p, &ppeers_lock, PWAIT, "exit1", 0);
+ mtx_unlock(&ppeers_lock);
+ }
+
+ /*
+ * Check if any loadable modules need anything done at process exit.
+ * E.g. SYSV IPC stuff
+ * XXX what if one of these generates an error?
+ */
+ EVENTHANDLER_INVOKE(process_exit, p);
+
+ /*
+ * If parent is waiting for us to exit or exec,
+ * P_PPWAIT is set; we will wakeup the parent below.
+ */
+ PROC_LOCK(p);
+ rv = p->p_xstat; /* Event handler could change exit status */
+ stopprofclock(p);
+ p->p_flag &= ~(P_TRACED | P_PPWAIT | P_PPTRACE);
+
+ /*
+ * Stop the real interval timer. If the handler is currently
+ * executing, prevent it from rearming itself and let it finish.
+ */
+ if (timevalisset(&p->p_realtimer.it_value) &&
+ callout_stop(&p->p_itcallout) == 0) {
+ timevalclear(&p->p_realtimer.it_interval);
+ msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0);
+ KASSERT(!timevalisset(&p->p_realtimer.it_value),
+ ("realtime timer is still armed"));
+ }
+ PROC_UNLOCK(p);
+
+ /*
+ * Reset any sigio structures pointing to us as a result of
+ * F_SETOWN with our pid.
+ */
+ funsetownlst(&p->p_sigiolst);
+
+ /*
+ * If this process has an nlminfo data area (for lockd), release it
+ */
+ if (nlminfo_release_p != NULL && p->p_nlminfo != NULL)
+ (*nlminfo_release_p)(p);
+
+ /*
+ * Close open files and release open-file table.
+ * This may block!
+ */
+ fdescfree(td);
+
+ /*
+ * If this thread tickled GEOM, we need to wait for the giggling to
+ * stop before we return to userland
+ */
+ if (td->td_pflags & TDP_GEOM)
+ g_waitidle();
+
+ /*
+ * Remove ourself from our leader's peer list and wake our leader.
+ */
+ mtx_lock(&ppeers_lock);
+ if (p->p_leader->p_peers) {
+ q = p->p_leader;
+ while (q->p_peers != p)
+ q = q->p_peers;
+ q->p_peers = p->p_peers;
+ wakeup(p->p_leader);
+ }
+ mtx_unlock(&ppeers_lock);
+
+ vmspace_exit(td);
+
+ sx_xlock(&proctree_lock);
+ if (SESS_LEADER(p)) {
+ struct session *sp = p->p_session;
+ struct tty *tp;
+
+ /*
+ * s_ttyp is not zero'd; we use this to indicate that
+ * the session once had a controlling terminal. (for
+ * logging and informational purposes)
+ */
+ SESS_LOCK(sp);
+ ttyvp = sp->s_ttyvp;
+ tp = sp->s_ttyp;
+ sp->s_ttyvp = NULL;
+ sp->s_ttydp = NULL;
+ sp->s_leader = NULL;
+ SESS_UNLOCK(sp);
+
+ /*
+ * Signal foreground pgrp and revoke access to
+ * controlling terminal if it has not been revoked
+ * already.
+ *
+ * Because the TTY may have been revoked in the mean
+ * time and could already have a new session associated
+ * with it, make sure we don't send a SIGHUP to a
+ * foreground process group that does not belong to this
+ * session.
+ */
+
+ if (tp != NULL) {
+ tty_lock(tp);
+ if (tp->t_session == sp)
+ tty_signal_pgrp(tp, SIGHUP);
+ tty_unlock(tp);
+ }
+
+ if (ttyvp != NULL) {
+ sx_xunlock(&proctree_lock);
+ if (vn_lock(ttyvp, LK_EXCLUSIVE) == 0) {
+ VOP_REVOKE(ttyvp, REVOKEALL);
+ VOP_UNLOCK(ttyvp, 0);
+ }
+ sx_xlock(&proctree_lock);
+ }
+ }
+ fixjobc(p, p->p_pgrp, 0);
+ sx_xunlock(&proctree_lock);
+ (void)acct_process(td);
+
+ /* Release the TTY now we've unlocked everything. */
+ if (ttyvp != NULL)
+ vrele(ttyvp);
+#ifdef KTRACE
+ ktrprocexit(td);
+#endif
+ /*
+ * Release reference to text vnode
+ */
+ if ((vtmp = p->p_textvp) != NULL) {
+ p->p_textvp = NULL;
+ vrele(vtmp);
+ }
+
+ /*
+ * Release our limits structure.
+ */
+ PROC_LOCK(p);
+ plim = p->p_limit;
+ p->p_limit = NULL;
+ PROC_UNLOCK(p);
+ lim_free(plim);
+
+ tidhash_remove(td);
+
+ /*
+ * Remove proc from allproc queue and pidhash chain.
+ * Place onto zombproc. Unlink from parent's child list.
+ */
+ sx_xlock(&allproc_lock);
+ LIST_REMOVE(p, p_list);
+ LIST_INSERT_HEAD(&zombproc, p, p_list);
+ LIST_REMOVE(p, p_hash);
+ sx_xunlock(&allproc_lock);
+
+ /*
+ * Call machine-dependent code to release any
+ * machine-dependent resources other than the address space.
+ * The address space is released by "vmspace_exitfree(p)" in
+ * vm_waitproc().
+ */
+ cpu_exit(td);
+
+ WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid);
+
+ /*
+ * Reparent all of our children to init.
+ */
+ sx_xlock(&proctree_lock);
+ q = LIST_FIRST(&p->p_children);
+ if (q != NULL) /* only need this if any child is S_ZOMB */
+ wakeup(initproc);
+ for (; q != NULL; q = nq) {
+ nq = LIST_NEXT(q, p_sibling);
+ PROC_LOCK(q);
+ proc_reparent(q, initproc);
+ q->p_sigparent = SIGCHLD;
+ /*
+ * Traced processes are killed
+ * since their existence means someone is screwing up.
+ */
+ if (q->p_flag & P_TRACED) {
+ struct thread *temp;
+
+ /*
+ * Since q was found on our children list, the
+ * proc_reparent() call moved q to the orphan
+ * list due to present P_TRACED flag. Clear
+ * orphan link for q now while q is locked.
+ */
+ clear_orphan(q);
+ q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE);
+ FOREACH_THREAD_IN_PROC(q, temp)
+ temp->td_dbgflags &= ~TDB_SUSPEND;
+ kern_psignal(q, SIGKILL);
+ }
+ PROC_UNLOCK(q);
+ }
+
+ /*
+ * Also get rid of our orphans.
+ */
+ while ((q = LIST_FIRST(&p->p_orphans)) != NULL) {
+ PROC_LOCK(q);
+ clear_orphan(q);
+ PROC_UNLOCK(q);
+ }
+
+ /* Save exit status. */
+ PROC_LOCK(p);
+ p->p_xthread = td;
+
+ /* Tell the prison that we are gone. */
+ prison_proc_free(p->p_ucred->cr_prison);
+
+#ifdef KDTRACE_HOOKS
+ /*
+ * Tell the DTrace fasttrap provider about the exit if it
+ * has declared an interest.
+ */
+ if (dtrace_fasttrap_exit)
+ dtrace_fasttrap_exit(p);
+#endif
+
+ /*
+ * Notify interested parties of our demise.
+ */
+ KNOTE_LOCKED(&p->p_klist, NOTE_EXIT);
+
+#ifdef KDTRACE_HOOKS
+ int reason = CLD_EXITED;
+ if (WCOREDUMP(rv))
+ reason = CLD_DUMPED;
+ else if (WIFSIGNALED(rv))
+ reason = CLD_KILLED;
+ SDT_PROBE(proc, kernel, , exit, reason, 0, 0, 0, 0);
+#endif
+
+ /*
+ * Just delete all entries in the p_klist. At this point we won't
+ * report any more events, and there are nasty race conditions that
+ * can beat us if we don't.
+ */
+ knlist_clear(&p->p_klist, 1);
+
+ /*
+ * If this is a process with a descriptor, we may not need to deliver
+ * a signal to the parent. proctree_lock is held over
+ * procdesc_exit() to serialize concurrent calls to close() and
+ * exit().
+ */
+#ifdef PROCDESC
+ if (p->p_procdesc == NULL || procdesc_exit(p)) {
+#endif
+ /*
+ * Notify parent that we're gone. If parent has the
+ * PS_NOCLDWAIT flag set, or if the handler is set to SIG_IGN,
+ * notify process 1 instead (and hope it will handle this
+ * situation).
+ */
+ PROC_LOCK(p->p_pptr);
+ mtx_lock(&p->p_pptr->p_sigacts->ps_mtx);
+ if (p->p_pptr->p_sigacts->ps_flag &
+ (PS_NOCLDWAIT | PS_CLDSIGIGN)) {
+ struct proc *pp;
+
+ mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
+ pp = p->p_pptr;
+ PROC_UNLOCK(pp);
+ proc_reparent(p, initproc);
+ p->p_sigparent = SIGCHLD;
+ PROC_LOCK(p->p_pptr);
+
+ /*
+		 * Notify the parent so that, if it was blocked in wait(2)
+		 * or waitpid(2) on our pid, it will continue.
+ */
+ wakeup(pp);
+ } else
+ mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
+
+ if (p->p_pptr == initproc)
+ kern_psignal(p->p_pptr, SIGCHLD);
+ else if (p->p_sigparent != 0) {
+ if (p->p_sigparent == SIGCHLD)
+ childproc_exited(p);
+ else /* LINUX thread */
+ kern_psignal(p->p_pptr, p->p_sigparent);
+ }
+#ifdef PROCDESC
+ } else
+ PROC_LOCK(p->p_pptr);
+#endif
+ sx_xunlock(&proctree_lock);
+
+ /*
+	 * The state PRS_ZOMBIE prevents other processes from sending
+	 * signals to the process.  To avoid a memory leak, we free the
+	 * memory for the signal queue at the time the state is set.
+ */
+ sigqueue_flush(&p->p_sigqueue);
+ sigqueue_flush(&td->td_sigqueue);
+
+ /*
+ * We have to wait until after acquiring all locks before
+ * changing p_state. We need to avoid all possible context
+ * switches (including ones from blocking on a mutex) while
+ * marked as a zombie. We also have to set the zombie state
+ * before we release the parent process' proc lock to avoid
+ * a lost wakeup. So, we first call wakeup, then we grab the
+ * sched lock, update the state, and release the parent process'
+ * proc lock.
+ */
+ wakeup(p->p_pptr);
+ cv_broadcast(&p->p_pwait);
+ sched_exit(p->p_pptr, td);
+ PROC_SLOCK(p);
+ p->p_state = PRS_ZOMBIE;
+ PROC_UNLOCK(p->p_pptr);
+
+ /*
+ * Hopefully no one will try to deliver a signal to the process this
+ * late in the game.
+ */
+ knlist_destroy(&p->p_klist);
+
+ /*
+ * Save our children's rusage information in our exit rusage.
+ */
+ ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux);
+
+ /*
+ * Make sure the scheduler takes this thread out of its tables etc.
+ * This will also release this thread's reference to the ucred.
+ * Other thread parts to release include pcb bits and such.
+ */
+ thread_exit();
+}
+
+
+#ifndef _SYS_SYSPROTO_H_
+struct abort2_args {
+ char *why;
+ int nargs;
+ void **args;
+};
+#endif
+
+int
+sys_abort2(struct thread *td, struct abort2_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct sbuf *sb;
+ void *uargs[16];
+ int error, i, sig;
+
+ /*
+	 * Do it right now so we can log either a proper call of abort2() or
+	 * note that an invalid argument was passed.  512 is big enough to
+ * handle 16 arguments' descriptions with additional comments.
+ */
+ sb = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN);
+ sbuf_clear(sb);
+ sbuf_printf(sb, "%s(pid %d uid %d) aborted: ",
+ p->p_comm, p->p_pid, td->td_ucred->cr_uid);
+ /*
+	 * Since we can't return from abort2(), send SIGKILL in cases where
+	 * abort2() was called improperly.
+ */
+ sig = SIGKILL;
+	/* Prevent DoS attacks from user space. */
+ if (uap->nargs < 0 || uap->nargs > 16)
+ goto out;
+ if (uap->nargs > 0) {
+ if (uap->args == NULL)
+ goto out;
+ error = copyin(uap->args, uargs, uap->nargs * sizeof(void *));
+ if (error != 0)
+ goto out;
+ }
+ /*
+	 * Limit the size of the 'reason' string to 128.  It will fit even
+	 * when the maximal number of arguments is logged.
+ */
+ if (uap->why != NULL) {
+ error = sbuf_copyin(sb, uap->why, 128);
+ if (error < 0)
+ goto out;
+ } else {
+ sbuf_printf(sb, "(null)");
+ }
+ if (uap->nargs > 0) {
+ sbuf_printf(sb, "(");
+		for (i = 0; i < uap->nargs; i++)
+ sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]);
+ sbuf_printf(sb, ")");
+ }
+ /*
+	 * Final stage: the arguments were proper, the string has been
+	 * copied from userspace successfully, and copying the pointers
+	 * from user space succeeded.
+ */
+ sig = SIGABRT;
+out:
+ if (sig == SIGKILL) {
+ sbuf_trim(sb);
+ sbuf_printf(sb, " (Reason text inaccessible)");
+ }
+ sbuf_cat(sb, "\n");
+ sbuf_finish(sb);
+ log(LOG_INFO, "%s", sbuf_data(sb));
+ sbuf_delete(sb);
+ exit1(td, W_EXITCODE(0, sig));
+ return (0);
+}
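+
+/*
+ * For reference, a hypothetical userland sketch of abort2(2) as handled
+ * above: a short reason string plus up to 16 pointer arguments are logged
+ * before the process is killed.  The values are placeholders.  Guarded
+ * out; it is not part of the kernel build.
+ */
+#if 0
+#include <stdlib.h>
+
+static void
+example_abort2(void *ctx)
+{
+	void *args[1];
+
+	args[0] = ctx;
+	abort2("invariant violated", 1, args);
+}
+#endif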
+
+
+#ifdef COMPAT_43
+/*
+ * The dirty work is handled by kern_wait().
+ */
+int
+owait(struct thread *td, struct owait_args *uap __unused)
+{
+ int error, status;
+
+ error = kern_wait(td, WAIT_ANY, &status, 0, NULL);
+ if (error == 0)
+ td->td_retval[1] = status;
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+/*
+ * The dirty work is handled by kern_wait().
+ */
+int
+sys_wait4(struct thread *td, struct wait4_args *uap)
+{
+ struct rusage ru, *rup;
+ int error, status;
+
+ if (uap->rusage != NULL)
+ rup = &ru;
+ else
+ rup = NULL;
+ error = kern_wait(td, uap->pid, &status, uap->options, rup);
+ if (uap->status != NULL && error == 0)
+ error = copyout(&status, uap->status, sizeof(status));
+ if (uap->rusage != NULL && error == 0)
+ error = copyout(&ru, uap->rusage, sizeof(struct rusage));
+ return (error);
+}
+
+int
+sys_wait6(struct thread *td, struct wait6_args *uap)
+{
+ struct __wrusage wru, *wrup;
+ siginfo_t si, *sip;
+ idtype_t idtype;
+ id_t id;
+ int error, status;
+
+ idtype = uap->idtype;
+ id = uap->id;
+
+ if (uap->wrusage != NULL)
+ wrup = &wru;
+ else
+ wrup = NULL;
+
+ if (uap->info != NULL) {
+ sip = &si;
+ bzero(sip, sizeof(*sip));
+ } else
+ sip = NULL;
+
+ /*
+ * We expect all callers of wait6() to know about WEXITED and
+ * WTRAPPED.
+ */
+ error = kern_wait6(td, idtype, id, &status, uap->options, wrup, sip);
+
+ if (uap->status != NULL && error == 0)
+ error = copyout(&status, uap->status, sizeof(status));
+ if (uap->wrusage != NULL && error == 0)
+ error = copyout(&wru, uap->wrusage, sizeof(wru));
+ if (uap->info != NULL && error == 0)
+ error = copyout(&si, uap->info, sizeof(si));
+ return (error);
+}
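+
+/*
+ * For reference, a hypothetical userland sketch of wait6(2) as handled
+ * above.  Unlike wait4(), the caller must request WEXITED (or WTRAPPED)
+ * explicitly.  The child pid is a placeholder.  Guarded out; it is not
+ * part of the kernel build.
+ */
+#if 0
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <signal.h>
+
+static pid_t
+example_wait6(pid_t child)
+{
+	siginfo_t si;
+	int status;
+
+	return (wait6(P_PID, (id_t)child, &status, WEXITED, NULL, &si));
+}
+#endif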
+
+/*
+ * Reap the remains of a zombie process and optionally return status and
+ * rusage.  Asserts that both the proctree_lock and the process lock are held
+ * on entry, and releases them as part of its work.
+ */
+void
+proc_reap(struct thread *td, struct proc *p, int *status, int options)
+{
+ struct proc *q, *t;
+
+ sx_assert(&proctree_lock, SA_XLOCKED);
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE"));
+
+ q = td->td_proc;
+
+ PROC_SUNLOCK(p);
+ td->td_retval[0] = p->p_pid;
+ if (status)
+ *status = p->p_xstat; /* convert to int */
+ if (options & WNOWAIT) {
+ /*
+ * Only poll, returning the status. Caller does not wish to
+ * release the proc struct just yet.
+ */
+ PROC_UNLOCK(p);
+ sx_xunlock(&proctree_lock);
+ return;
+ }
+
+ PROC_LOCK(q);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(q);
+ PROC_UNLOCK(p);
+
+ /*
+ * If we got the child via a ptrace 'attach', we need to give it back
+ * to the old parent.
+ */
+ if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) {
+ PROC_LOCK(p);
+ proc_reparent(p, t);
+ p->p_oppid = 0;
+ PROC_UNLOCK(p);
+ pksignal(t, SIGCHLD, p->p_ksi);
+ wakeup(t);
+ cv_broadcast(&p->p_pwait);
+ PROC_UNLOCK(t);
+ sx_xunlock(&proctree_lock);
+ return;
+ }
+
+ /*
+ * Remove other references to this process to ensure we have an
+ * exclusive reference.
+ */
+ sx_xlock(&allproc_lock);
+ LIST_REMOVE(p, p_list); /* off zombproc */
+ sx_xunlock(&allproc_lock);
+ LIST_REMOVE(p, p_sibling);
+ PROC_LOCK(p);
+ clear_orphan(p);
+ PROC_UNLOCK(p);
+ leavepgrp(p);
+#ifdef PROCDESC
+ if (p->p_procdesc != NULL)
+ procdesc_reap(p);
+#endif
+ sx_xunlock(&proctree_lock);
+
+ /*
+ * As a side effect of this lock, we know that all other writes to
+ * this proc are visible now, so no more locking is needed for p.
+ */
+ PROC_LOCK(p);
+ p->p_xstat = 0; /* XXX: why? */
+ PROC_UNLOCK(p);
+ PROC_LOCK(q);
+ ruadd(&q->p_stats->p_cru, &q->p_crux, &p->p_ru, &p->p_rux);
+ PROC_UNLOCK(q);
+
+ /*
+ * Decrement the count of procs running with this uid.
+ */
+ (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0);
+
+ /*
+ * Destroy resource accounting information associated with the process.
+ */
+#ifdef RACCT
+ PROC_LOCK(p);
+ racct_sub(p, RACCT_NPROC, 1);
+ PROC_UNLOCK(p);
+#endif
+ racct_proc_exit(p);
+
+ /*
+ * Free credentials, arguments, and sigacts.
+ */
+ crfree(p->p_ucred);
+ p->p_ucred = NULL;
+ pargs_drop(p->p_args);
+ p->p_args = NULL;
+ sigacts_free(p->p_sigacts);
+ p->p_sigacts = NULL;
+
+ /*
+ * Do any thread-system specific cleanups.
+ */
+ thread_wait(p);
+
+ /*
+ * Give vm and machine-dependent layer a chance to free anything that
+ * cpu_exit couldn't release while still running in process context.
+ */
+ vm_waitproc(p);
+#ifdef MAC
+ mac_proc_destroy(p);
+#endif
+ KASSERT(FIRST_THREAD_IN_PROC(p),
+ ("proc_reap: no residual thread!"));
+ uma_zfree(proc_zone, p);
+ sx_xlock(&allproc_lock);
+ nprocs--;
+ sx_xunlock(&allproc_lock);
+}
+
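+/*
+ * Helper for kern_wait6(): check whether process 'p' matches the (idtype, id)
+ * pair and the wait options. Returns 0 if 'p' does not match or may not be
+ * waited for, 1 if 'p' matches but is not yet reapable (the process lock is
+ * dropped before returning), and -1 if 'p' was a zombie and has been reaped
+ * via proc_reap(), in which case the wait is complete and the proctree_lock
+ * has been dropped as well.
+ */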
+static int
+proc_to_reap(struct thread *td, struct proc *p, idtype_t idtype, id_t id,
+ int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo)
+{
+ struct proc *q;
+ struct rusage *rup;
+
+ sx_assert(&proctree_lock, SA_XLOCKED);
+
+ q = td->td_proc;
+ PROC_LOCK(p);
+
+ switch (idtype) {
+ case P_ALL:
+ break;
+ case P_PID:
+ if (p->p_pid != (pid_t)id) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ break;
+ case P_PGID:
+ if (p->p_pgid != (pid_t)id) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ break;
+ case P_SID:
+ if (p->p_session->s_sid != (pid_t)id) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ break;
+ case P_UID:
+ if (p->p_ucred->cr_uid != (uid_t)id) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ break;
+ case P_GID:
+ if (p->p_ucred->cr_gid != (gid_t)id) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ break;
+ case P_JAILID:
+ if (p->p_ucred->cr_prison->pr_id != (int)id) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ break;
+ /*
+ * It seems that the thread structures get zeroed out
+ * at process exit. This makes it impossible to
+ * support P_SETID, P_CID or P_CPUID.
+ */
+ default:
+ PROC_UNLOCK(p);
+ return (0);
+ }
+
+ if (p_canwait(td, p)) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+
+ if (((options & WEXITED) == 0) && (p->p_state == PRS_ZOMBIE)) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+
+ /*
+ * This special case handles a kthread spawned by linux_clone
+ * (see linux_misc.c). The linux_wait4 and linux_waitpid
+ * functions need to be able to distinguish between waiting
+ * on a process and waiting on a thread. It is a thread if
+ * p_sigparent is not SIGCHLD, and the WLINUXCLONE option
+ * signifies we want to wait for threads and not processes.
+ */
+ if ((p->p_sigparent != SIGCHLD) ^
+ ((options & WLINUXCLONE) != 0)) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+
+ PROC_SLOCK(p);
+
+ if (siginfo != NULL) {
+ bzero(siginfo, sizeof(*siginfo));
+ siginfo->si_errno = 0;
+
+ /*
+ * SUSv4 requires that the si_signo value is always
+ * SIGCHLD. Obey it even though the rfork(2) interface
+ * allows a different signal to be requested for child
+ * exit notification.
+ */
+ siginfo->si_signo = SIGCHLD;
+
+ /*
+ * This is still a rough estimate. We will fix the
+ * cases TRAPPED, STOPPED, and CONTINUED later.
+ */
+ if (WCOREDUMP(p->p_xstat))
+ siginfo->si_code = CLD_DUMPED;
+ else if (WIFSIGNALED(p->p_xstat))
+ siginfo->si_code = CLD_KILLED;
+ else
+ siginfo->si_code = CLD_EXITED;
+
+ siginfo->si_pid = p->p_pid;
+ siginfo->si_uid = p->p_ucred->cr_uid;
+ siginfo->si_status = p->p_xstat;
+
+ /*
+ * The si_addr field would be useful additional
+ * detail, but apparently the PC value may be lost
+ * when we reach this point. bzero() above sets
+ * siginfo->si_addr to NULL.
+ */
+ }
+
+ /*
+ * There should be no reason to limit resource usage info to
+ * exited processes only. A snapshot of the resources used
+ * by a stopped process may be exactly what is needed.
+ */
+ if (wrusage != NULL) {
+ rup = &wrusage->wru_self;
+ *rup = p->p_ru;
+ calcru(p, &rup->ru_utime, &rup->ru_stime);
+
+ rup = &wrusage->wru_children;
+ *rup = p->p_stats->p_cru;
+ calccru(p, &rup->ru_utime, &rup->ru_stime);
+ }
+
+ if (p->p_state == PRS_ZOMBIE) {
+ proc_reap(td, p, status, options);
+ return (-1);
+ }
+ PROC_SUNLOCK(p);
+ PROC_UNLOCK(p);
+ return (1);
+}
+
+int
+kern_wait(struct thread *td, pid_t pid, int *status, int options,
+ struct rusage *rusage)
+{
+ struct __wrusage wru, *wrup;
+ idtype_t idtype;
+ id_t id;
+ int ret;
+
+ /*
+ * Translate the special pid values into the (idtype, pid)
+ * pair for kern_wait6. The WAIT_MYPGRP case is handled by
+ * kern_wait6() on its own.
+ */
+ if (pid == WAIT_ANY) {
+ idtype = P_ALL;
+ id = 0;
+ } else if (pid < 0) {
+ idtype = P_PGID;
+ id = (id_t)-pid;
+ } else {
+ idtype = P_PID;
+ id = (id_t)pid;
+ }
+
+ if (rusage != NULL)
+ wrup = &wru;
+ else
+ wrup = NULL;
+
+ /*
+ * For backward compatibility we implicitly add flags WEXITED
+ * and WTRAPPED here.
+ */
+ options |= WEXITED | WTRAPPED;
+ ret = kern_wait6(td, idtype, id, status, options, wrup, NULL);
+ if (rusage != NULL)
+ *rusage = wru.wru_self;
+ return (ret);
+}
+
+int
+kern_wait6(struct thread *td, idtype_t idtype, id_t id, int *status,
+ int options, struct __wrusage *wrusage, siginfo_t *siginfo)
+{
+ struct proc *p, *q;
+ int error, nfound, ret;
+
+ AUDIT_ARG_VALUE((int)idtype); /* XXX - This is likely wrong! */
+ AUDIT_ARG_PID((pid_t)id); /* XXX - This may be wrong! */
+ AUDIT_ARG_VALUE(options);
+
+ q = td->td_proc;
+
+ if ((pid_t)id == WAIT_MYPGRP && (idtype == P_PID || idtype == P_PGID)) {
+ PROC_LOCK(q);
+ id = (id_t)q->p_pgid;
+ PROC_UNLOCK(q);
+ idtype = P_PGID;
+ }
+
+ /* If we don't know the option, return EINVAL. */
+ if ((options & ~(WUNTRACED | WNOHANG | WCONTINUED | WNOWAIT |
+ WEXITED | WTRAPPED | WLINUXCLONE)) != 0)
+ return (EINVAL);
+ if ((options & (WEXITED | WUNTRACED | WCONTINUED | WTRAPPED)) == 0) {
+ /*
+ * We will be unable to find any matching processes,
+ * because there are no known events to look for.
+ * Prefer to return error instead of blocking
+ * indefinitely.
+ */
+ return (EINVAL);
+ }
+
+loop:
+ if (q->p_flag & P_STATCHILD) {
+ PROC_LOCK(q);
+ q->p_flag &= ~P_STATCHILD;
+ PROC_UNLOCK(q);
+ }
+ nfound = 0;
+ sx_xlock(&proctree_lock);
+ LIST_FOREACH(p, &q->p_children, p_sibling) {
+ ret = proc_to_reap(td, p, idtype, id, status, options,
+ wrusage, siginfo);
+ if (ret == 0)
+ continue;
+ else if (ret == 1)
+ nfound++;
+ else
+ return (0);
+
+ PROC_LOCK(p);
+ PROC_SLOCK(p);
+
+ if ((options & WTRAPPED) != 0 &&
+ (p->p_flag & P_TRACED) != 0 &&
+ (p->p_flag & (P_STOPPED_TRACE | P_STOPPED_SIG)) != 0 &&
+ (p->p_suspcount == p->p_numthreads) &&
+ ((p->p_flag & P_WAITED) == 0)) {
+ PROC_SUNLOCK(p);
+ if ((options & WNOWAIT) == 0)
+ p->p_flag |= P_WAITED;
+ sx_xunlock(&proctree_lock);
+ td->td_retval[0] = p->p_pid;
+
+ if (status != NULL)
+ *status = W_STOPCODE(p->p_xstat);
+ if (siginfo != NULL) {
+ siginfo->si_status = p->p_xstat;
+ siginfo->si_code = CLD_TRAPPED;
+ }
+ if ((options & WNOWAIT) == 0) {
+ PROC_LOCK(q);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(q);
+ }
+
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ if ((options & WUNTRACED) != 0 &&
+ (p->p_flag & P_STOPPED_SIG) != 0 &&
+ (p->p_suspcount == p->p_numthreads) &&
+ ((p->p_flag & P_WAITED) == 0)) {
+ PROC_SUNLOCK(p);
+ if ((options & WNOWAIT) == 0)
+ p->p_flag |= P_WAITED;
+ sx_xunlock(&proctree_lock);
+ td->td_retval[0] = p->p_pid;
+
+ if (status != NULL)
+ *status = W_STOPCODE(p->p_xstat);
+ if (siginfo != NULL) {
+ siginfo->si_status = p->p_xstat;
+ siginfo->si_code = CLD_STOPPED;
+ }
+ if ((options & WNOWAIT) == 0) {
+ PROC_LOCK(q);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(q);
+ }
+
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ PROC_SUNLOCK(p);
+ if ((options & WCONTINUED) != 0 &&
+ (p->p_flag & P_CONTINUED) != 0) {
+ sx_xunlock(&proctree_lock);
+ td->td_retval[0] = p->p_pid;
+ if ((options & WNOWAIT) == 0) {
+ p->p_flag &= ~P_CONTINUED;
+ PROC_LOCK(q);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(q);
+ }
+ PROC_UNLOCK(p);
+
+ if (status != NULL)
+ *status = SIGCONT;
+ if (siginfo != NULL) {
+ siginfo->si_status = SIGCONT;
+ siginfo->si_code = CLD_CONTINUED;
+ }
+ return (0);
+ }
+ PROC_UNLOCK(p);
+ }
+
+ /*
+ * Look in the orphans list too, to allow the parent to
+ * collect its child's exit status even if the child is being
+ * debugged.
+ *
+ * A debugger detaches from the parent upon a successful
+ * switch-over from parent to child. At that point, due to
+ * the re-parenting, the parent loses the child to the debugger
+ * and a wait4(2) call would report that it has no children to
+ * wait for. By maintaining a list of orphans we allow the
+ * parent to successfully wait until the child becomes a zombie.
+ */
+ LIST_FOREACH(p, &q->p_orphans, p_orphan) {
+ ret = proc_to_reap(td, p, idtype, id, status, options,
+ wrusage, siginfo);
+ if (ret == 0)
+ continue;
+ else if (ret == 1)
+ nfound++;
+ else
+ return (0);
+ }
+ if (nfound == 0) {
+ sx_xunlock(&proctree_lock);
+ return (ECHILD);
+ }
+ if (options & WNOHANG) {
+ sx_xunlock(&proctree_lock);
+ td->td_retval[0] = 0;
+ return (0);
+ }
+ PROC_LOCK(q);
+ sx_xunlock(&proctree_lock);
+ if (q->p_flag & P_STATCHILD) {
+ q->p_flag &= ~P_STATCHILD;
+ error = 0;
+ } else
+ error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0);
+ PROC_UNLOCK(q);
+ if (error)
+ return (error);
+ goto loop;
+}
+
+/*
+ * Make process 'parent' the new parent of process 'child'.
+ * Must be called with an exclusive hold of proctree lock.
+ */
+void
+proc_reparent(struct proc *child, struct proc *parent)
+{
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ PROC_LOCK_ASSERT(child, MA_OWNED);
+ if (child->p_pptr == parent)
+ return;
+
+ PROC_LOCK(child->p_pptr);
+ sigqueue_take(child->p_ksi);
+ PROC_UNLOCK(child->p_pptr);
+ LIST_REMOVE(child, p_sibling);
+ LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
+
+ clear_orphan(child);
+ if (child->p_flag & P_TRACED) {
+ LIST_INSERT_HEAD(&child->p_pptr->p_orphans, child, p_orphan);
+ child->p_flag |= P_ORPHAN;
+ }
+
+ child->p_pptr = parent;
+}
diff --git a/sys/kern/kern_fail.c b/sys/kern/kern_fail.c
new file mode 100644
index 0000000..85d81e8
--- /dev/null
+++ b/sys/kern/kern_fail.c
@@ -0,0 +1,611 @@
+/*-
+ * Copyright (c) 2009 Isilon Inc http://www.isilon.com/
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/**
+ * @file
+ *
+ * fail(9) Facility.
+ *
+ * @ingroup failpoint_private
+ */
+/**
+ * @defgroup failpoint fail(9) Facility
+ *
+ * Failpoints allow for injecting fake errors into running code on the fly,
+ * without modifying code or recompiling with flags. Failpoints are always
+ * present, and are very efficient when disabled. Failpoints are described
+ * in man fail(9).
+ */
+/**
+ * @defgroup failpoint_private Private fail(9) Implementation functions
+ *
+ * Private implementations for the actual failpoint code.
+ *
+ * @ingroup failpoint
+ */
+/**
+ * @addtogroup failpoint_private
+ * @{
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/ctype.h>
+#include <sys/errno.h>
+#include <sys/fail.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+
+#include <machine/stdarg.h>
+
+#ifdef ILOG_DEFINE_FOR_FILE
+ILOG_DEFINE_FOR_FILE(L_ISI_FAIL_POINT, L_ILOG, fail_point);
+#endif
+
+static MALLOC_DEFINE(M_FAIL_POINT, "Fail Points", "fail points system");
+#define fp_free(ptr) free(ptr, M_FAIL_POINT)
+#define fp_malloc(size, flags) malloc((size), M_FAIL_POINT, (flags))
+
+static struct mtx g_fp_mtx;
+MTX_SYSINIT(g_fp_mtx, &g_fp_mtx, "fail point mtx", MTX_DEF);
+#define FP_LOCK() mtx_lock(&g_fp_mtx)
+#define FP_UNLOCK() mtx_unlock(&g_fp_mtx)
+
+/**
+ * Failpoint types.
+ * Don't change these without changing fail_type_strings in fail.c.
+ * @ingroup failpoint_private
+ */
+enum fail_point_t {
+ FAIL_POINT_OFF, /**< don't fail */
+ FAIL_POINT_PANIC, /**< panic */
+ FAIL_POINT_RETURN, /**< return an errorcode */
+ FAIL_POINT_BREAK, /**< break into the debugger */
+ FAIL_POINT_PRINT, /**< print a message */
+ FAIL_POINT_SLEEP, /**< sleep for some msecs */
+ FAIL_POINT_NUMTYPES
+};
+
+static struct {
+ const char *name;
+ int nmlen;
+} fail_type_strings[] = {
+#define FP_TYPE_NM_LEN(s) { s, sizeof(s) - 1 }
+ [FAIL_POINT_OFF] = FP_TYPE_NM_LEN("off"),
+ [FAIL_POINT_PANIC] = FP_TYPE_NM_LEN("panic"),
+ [FAIL_POINT_RETURN] = FP_TYPE_NM_LEN("return"),
+ [FAIL_POINT_BREAK] = FP_TYPE_NM_LEN("break"),
+ [FAIL_POINT_PRINT] = FP_TYPE_NM_LEN("print"),
+ [FAIL_POINT_SLEEP] = FP_TYPE_NM_LEN("sleep"),
+};
+
+/**
+ * Internal structure tracking a single term of a complete failpoint.
+ * @ingroup failpoint_private
+ */
+struct fail_point_entry {
+ enum fail_point_t fe_type; /**< type of entry */
+ int fe_arg; /**< argument to type (e.g. return value) */
+ int fe_prob; /**< likelihood of firing in millionths */
+ int fe_count; /**< number of times to fire, 0 means always */
+ pid_t fe_pid; /**< only fail for this process */
+ TAILQ_ENTRY(fail_point_entry) fe_entries; /**< next entry in fail point */
+};
+
+static inline void
+fail_point_sleep(struct fail_point *fp, struct fail_point_entry *ent,
+ int msecs, enum fail_point_return_code *pret)
+{
+ /* convert from millisecs to ticks, rounding up */
+ int timo = ((msecs * hz) + 999) / 1000;
+
+ if (timo > 0) {
+ if (fp->fp_sleep_fn == NULL) {
+ msleep(fp, &g_fp_mtx, PWAIT, "failpt", timo);
+ } else {
+ timeout(fp->fp_sleep_fn, fp->fp_sleep_arg, timo);
+ *pret = FAIL_POINT_RC_QUEUED;
+ }
+ }
+}
+
+
+/**
+ * Defines expressing the equivalent of probability one (100%).
+ */
+enum {
+ PROB_MAX = 1000000, /* probability between zero and this number */
+ PROB_DIGITS = 6, /* number of zeros in the above number */
+};
+
+static char *parse_fail_point(struct fail_point_entries *, char *);
+static char *parse_term(struct fail_point_entries *, char *);
+static char *parse_number(int *out_units, int *out_decimal, char *);
+static char *parse_type(struct fail_point_entry *, char *);
+static void free_entry(struct fail_point_entries *, struct fail_point_entry *);
+static void clear_entries(struct fail_point_entries *);
+
+/**
+ * Initialize a fail_point. The name is formed in a printf-like fashion
+ * from "fmt" and subsequent arguments. This function is generally used
+ * for custom failpoints located at odd places in the sysctl tree, and is
+ * not explicitly needed for standard in-line-declared failpoints.
+ *
+ * @ingroup failpoint
+ */
+void
+fail_point_init(struct fail_point *fp, const char *fmt, ...)
+{
+ va_list ap;
+ char *name;
+ int n;
+
+ TAILQ_INIT(&fp->fp_entries);
+ fp->fp_flags = 0;
+
+ /* Figure out the size of the name. */
+ va_start(ap, fmt);
+ n = vsnprintf(NULL, 0, fmt, ap);
+ va_end(ap);
+
+ /* Allocate the name and fill it in. */
+ name = fp_malloc(n + 1, M_WAITOK);
+ if (name != NULL) {
+ va_start(ap, fmt);
+ vsnprintf(name, n + 1, fmt, ap);
+ va_end(ap);
+ }
+ fp->fp_name = name;
+ fp->fp_location = "";
+ fp->fp_flags |= FAIL_POINT_DYNAMIC_NAME;
+ fp->fp_sleep_fn = NULL;
+ fp->fp_sleep_arg = NULL;
+}
+
+/**
+ * Free the resources held by a fail_point.
+ *
+ * @ingroup failpoint
+ */
+void
+fail_point_destroy(struct fail_point *fp)
+{
+
+ if ((fp->fp_flags & FAIL_POINT_DYNAMIC_NAME) != 0) {
+ fp_free(__DECONST(void *, fp->fp_name));
+ fp->fp_name = NULL;
+ }
+ fp->fp_flags = 0;
+ clear_entries(&fp->fp_entries);
+}
+
+/**
+ * This does the real work of evaluating a fail point. If the fail point tells
+ * us to return a value, this function returns 1 and fills in 'return_value'
+ * (return_value is allowed to be null). If the fail point tells us to panic,
+ * we never return. Otherwise we just return 0 after doing some work, which
+ * means "keep going".
+ */
+enum fail_point_return_code
+fail_point_eval_nontrivial(struct fail_point *fp, int *return_value)
+{
+ enum fail_point_return_code ret = FAIL_POINT_RC_CONTINUE;
+ struct fail_point_entry *ent, *next;
+ int msecs;
+
+ FP_LOCK();
+
+ TAILQ_FOREACH_SAFE(ent, &fp->fp_entries, fe_entries, next) {
+ int cont = 0; /* don't continue by default */
+
+ if (ent->fe_prob < PROB_MAX &&
+ ent->fe_prob < random() % PROB_MAX)
+ continue;
+ if (ent->fe_pid != NO_PID && ent->fe_pid != curproc->p_pid)
+ continue;
+
+ switch (ent->fe_type) {
+ case FAIL_POINT_PANIC:
+ panic("fail point %s panicking", fp->fp_name);
+ /* NOTREACHED */
+
+ case FAIL_POINT_RETURN:
+ if (return_value != NULL)
+ *return_value = ent->fe_arg;
+ ret = FAIL_POINT_RC_RETURN;
+ break;
+
+ case FAIL_POINT_BREAK:
+ printf("fail point %s breaking to debugger\n",
+ fp->fp_name);
+ breakpoint();
+ break;
+
+ case FAIL_POINT_PRINT:
+ printf("fail point %s executing\n", fp->fp_name);
+ cont = ent->fe_arg;
+ break;
+
+ case FAIL_POINT_SLEEP:
+ /*
+ * Free the entry now if necessary, since
+ * we're about to drop the mutex and sleep.
+ */
+ msecs = ent->fe_arg;
+ if (ent->fe_count > 0 && --ent->fe_count == 0) {
+ free_entry(&fp->fp_entries, ent);
+ ent = NULL;
+ }
+
+ if (msecs)
+ fail_point_sleep(fp, ent, msecs, &ret);
+ break;
+
+ default:
+ break;
+ }
+
+ if (ent != NULL && ent->fe_count > 0 && --ent->fe_count == 0)
+ free_entry(&fp->fp_entries, ent);
+ if (cont == 0)
+ break;
+ }
+
+ /* Get rid of "off"s at the end. */
+ while ((ent = TAILQ_LAST(&fp->fp_entries, fail_point_entries)) &&
+ ent->fe_type == FAIL_POINT_OFF)
+ free_entry(&fp->fp_entries, ent);
+
+ FP_UNLOCK();
+
+ return (ret);
+}
+
+/**
+ * Translate internal fail_point structure into human-readable text.
+ */
+static void
+fail_point_get(struct fail_point *fp, struct sbuf *sb)
+{
+ struct fail_point_entry *ent;
+
+ FP_LOCK();
+
+ TAILQ_FOREACH(ent, &fp->fp_entries, fe_entries) {
+ if (ent->fe_prob < PROB_MAX) {
+ int decimal = ent->fe_prob % (PROB_MAX / 100);
+ int units = ent->fe_prob / (PROB_MAX / 100);
+ sbuf_printf(sb, "%d", units);
+ if (decimal) {
+ int digits = PROB_DIGITS - 2;
+ while (!(decimal % 10)) {
+ digits--;
+ decimal /= 10;
+ }
+ sbuf_printf(sb, ".%0*d", digits, decimal);
+ }
+ sbuf_printf(sb, "%%");
+ }
+ if (ent->fe_count > 0)
+ sbuf_printf(sb, "%d*", ent->fe_count);
+ sbuf_printf(sb, "%s", fail_type_strings[ent->fe_type].name);
+ if (ent->fe_arg)
+ sbuf_printf(sb, "(%d)", ent->fe_arg);
+ if (ent->fe_pid != NO_PID)
+ sbuf_printf(sb, "[pid %d]", ent->fe_pid);
+ if (TAILQ_NEXT(ent, fe_entries))
+ sbuf_printf(sb, "->");
+ }
+ if (TAILQ_EMPTY(&fp->fp_entries))
+ sbuf_printf(sb, "off");
+
+ FP_UNLOCK();
+}
+
+/**
+ * Set an internal fail_point structure from a human-readable failpoint string
+ * in a lock-safe manner.
+ */
+static int
+fail_point_set(struct fail_point *fp, char *buf)
+{
+ int error = 0;
+ struct fail_point_entry *ent, *ent_next;
+ struct fail_point_entries new_entries;
+
+ /* Parse new entries. */
+ TAILQ_INIT(&new_entries);
+ if (!parse_fail_point(&new_entries, buf)) {
+ clear_entries(&new_entries);
+ error = EINVAL;
+ goto end;
+ }
+
+ FP_LOCK();
+
+ /* Move new entries in. */
+ TAILQ_SWAP(&fp->fp_entries, &new_entries, fail_point_entry, fe_entries);
+ clear_entries(&new_entries);
+
+ /* Get rid of useless zero probability entries. */
+ TAILQ_FOREACH_SAFE(ent, &fp->fp_entries, fe_entries, ent_next) {
+ if (ent->fe_prob == 0)
+ free_entry(&fp->fp_entries, ent);
+ }
+
+ /* Get rid of "off"s at the end. */
+ while ((ent = TAILQ_LAST(&fp->fp_entries, fail_point_entries)) &&
+ ent->fe_type == FAIL_POINT_OFF)
+ free_entry(&fp->fp_entries, ent);
+
+ FP_UNLOCK();
+
+ end:
+#ifdef IWARNING
+ if (error)
+ IWARNING("Failed to set %s %s to %s",
+ fp->fp_name, fp->fp_location, buf);
+ else
+ INOTICE("Set %s %s to %s",
+ fp->fp_name, fp->fp_location, buf);
+#endif /* IWARNING */
+
+ return (error);
+}
+
+#define MAX_FAIL_POINT_BUF 1023
+
+/**
+ * Handle kernel failpoint set/get.
+ */
+int
+fail_point_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct fail_point *fp = arg1;
+ char *buf = NULL;
+ struct sbuf sb;
+ int error;
+
+ /* Retrieving */
+ sbuf_new(&sb, NULL, 128, SBUF_AUTOEXTEND);
+ fail_point_get(fp, &sb);
+ sbuf_trim(&sb);
+ sbuf_finish(&sb);
+ error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb));
+ sbuf_delete(&sb);
+
+ /* Setting */
+ if (!error && req->newptr) {
+ if (req->newlen > MAX_FAIL_POINT_BUF) {
+ error = EINVAL;
+ goto out;
+ }
+
+ buf = fp_malloc(req->newlen + 1, M_WAITOK);
+
+ error = SYSCTL_IN(req, buf, req->newlen);
+ if (error)
+ goto out;
+ buf[req->newlen] = '\0';
+
+ error = fail_point_set(fp, buf);
+ }
+
+out:
+ fp_free(buf);
+ return (error);
+}
+
+/**
+ * Internal helper function to translate a human-readable failpoint string
+ * into an internally-parsable fail_point structure.
+ */
+static char *
+parse_fail_point(struct fail_point_entries *ents, char *p)
+{
+ /* <fail_point> ::
+ * <term> ( "->" <term> )*
+ */
+ p = parse_term(ents, p);
+ if (p == NULL)
+ return (NULL);
+ while (*p != '\0') {
+ if (p[0] != '-' || p[1] != '>')
+ return (NULL);
+ p = parse_term(ents, p + 2);
+ if (p == NULL)
+ return (NULL);
+ }
+ return (p);
+}
+
+/**
+ * Internal helper function to parse an individual term from a failpoint.
+ */
+static char *
+parse_term(struct fail_point_entries *ents, char *p)
+{
+ struct fail_point_entry *ent;
+
+ ent = fp_malloc(sizeof *ent, M_WAITOK | M_ZERO);
+ ent->fe_prob = PROB_MAX;
+ ent->fe_pid = NO_PID;
+ TAILQ_INSERT_TAIL(ents, ent, fe_entries);
+
+ /*
+ * <term> ::
+ * ( (<float> "%") | (<integer> "*" ) )*
+ * <type>
+ * [ "(" <integer> ")" ]
+ * [ "[pid " <integer> "]" ]
+ */
+
+ /* ( (<float> "%") | (<integer> "*" ) )* */
+ while (isdigit(*p) || *p == '.') {
+ int units, decimal;
+
+ p = parse_number(&units, &decimal, p);
+ if (p == NULL)
+ return (NULL);
+
+ if (*p == '%') {
+ if (units > 100) /* prevent overflow early */
+ units = 100;
+ ent->fe_prob = units * (PROB_MAX / 100) + decimal;
+ if (ent->fe_prob > PROB_MAX)
+ ent->fe_prob = PROB_MAX;
+ } else if (*p == '*') {
+ if (!units || decimal)
+ return (NULL);
+ ent->fe_count = units;
+ } else
+ return (NULL);
+ p++;
+ }
+
+ /* <type> */
+ p = parse_type(ent, p);
+ if (p == NULL)
+ return (NULL);
+ if (*p == '\0')
+ return (p);
+
+ /* [ "(" <integer> ")" ] */
+ if (*p != '(')
+ return (p);
+ p++;
+ if (!isdigit(*p) && *p != '-')
+ return (NULL);
+ ent->fe_arg = strtol(p, &p, 0);
+ if (*p++ != ')')
+ return (NULL);
+
+ /* [ "[pid " <integer> "]" ] */
+#define PID_STRING "[pid "
+ if (strncmp(p, PID_STRING, sizeof(PID_STRING) - 1) != 0)
+ return (p);
+ p += sizeof(PID_STRING) - 1;
+ if (!isdigit(*p))
+ return (NULL);
+ ent->fe_pid = strtol(p, &p, 0);
+ if (*p++ != ']')
+ return (NULL);
+
+ return (p);
+}
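+
+/*
+ * For illustration (a non-normative sketch derived from the grammar above;
+ * see fail(9) for the authoritative description), strings such as the
+ * following are accepted, typically written to a debug.fail_point.* sysctl:
+ *
+ *	"off"				never fail
+ *	"return(5)"			always fail, returning error 5
+ *	"1.5%return(35)->sleep(100)"	return 35 with 1.5% probability,
+ *					otherwise sleep for 100ms
+ *	"2*return(5)[pid 42]"		fail twice, only for pid 42
+ */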
+
+/**
+ * Internal helper function to parse a numeric for a failpoint term.
+ */
+static char *
+parse_number(int *out_units, int *out_decimal, char *p)
+{
+ char *old_p;
+
+ /*
+ * <number> ::
+ * <integer> [ "." <integer> ] |
+ * "." <integer>
+ */
+
+ /* whole part */
+ old_p = p;
+ *out_units = strtol(p, &p, 10);
+ if (p == old_p && *p != '.')
+ return (NULL);
+
+ /* fractional part */
+ *out_decimal = 0;
+ if (*p == '.') {
+ int digits = 0;
+ p++;
+ while (isdigit(*p)) {
+ int digit = *p - '0';
+ if (digits < PROB_DIGITS - 2)
+ *out_decimal = *out_decimal * 10 + digit;
+ else if (digits == PROB_DIGITS - 2 && digit >= 5)
+ (*out_decimal)++;
+ digits++;
+ p++;
+ }
+ if (!digits) /* need at least one digit after '.' */
+ return (NULL);
+ while (digits++ < PROB_DIGITS - 2) /* add implicit zeros */
+ *out_decimal *= 10;
+ }
+
+ return (p); /* success */
+}
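+
+/*
+ * Worked example (derived from the code above): parsing "2.5" yields
+ * *out_units == 2 and *out_decimal == 5000; parse_term() then computes
+ * fe_prob = 2 * (PROB_MAX / 100) + 5000 = 25000, i.e. 2.5% of PROB_MAX.
+ */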
+
+/**
+ * Internal helper function to parse an individual type for a failpoint term.
+ */
+static char *
+parse_type(struct fail_point_entry *ent, char *beg)
+{
+ enum fail_point_t type;
+ int len;
+
+ for (type = FAIL_POINT_OFF; type < FAIL_POINT_NUMTYPES; type++) {
+ len = fail_type_strings[type].nmlen;
+ if (strncmp(fail_type_strings[type].name, beg, len) == 0) {
+ ent->fe_type = type;
+ return (beg + len);
+ }
+ }
+ return (NULL);
+}
+
+/**
+ * Internal helper function to free an individual failpoint term.
+ */
+static void
+free_entry(struct fail_point_entries *ents, struct fail_point_entry *ent)
+{
+ TAILQ_REMOVE(ents, ent, fe_entries);
+ fp_free(ent);
+}
+
+/**
+ * Internal helper function to clear out all failpoint terms for a single
+ * failpoint.
+ */
+static void
+clear_entries(struct fail_point_entries *ents)
+{
+ struct fail_point_entry *ent, *ent_next;
+
+ TAILQ_FOREACH_SAFE(ent, ents, fe_entries, ent_next)
+ fp_free(ent);
+ TAILQ_INIT(ents);
+}
+
+/* The fail point sysctl tree. */
+SYSCTL_NODE(_debug, OID_AUTO, fail_point, CTLFLAG_RW, 0, "fail points");
diff --git a/sys/kern/kern_ffclock.c b/sys/kern/kern_ffclock.c
new file mode 100644
index 0000000..07441cd
--- /dev/null
+++ b/sys/kern/kern_ffclock.c
@@ -0,0 +1,479 @@
+/*-
+ * Copyright (c) 2011 The University of Melbourne
+ * All rights reserved.
+ *
+ * This software was developed by Julien Ridoux at the University of Melbourne
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ffclock.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/timeffc.h>
+
+#ifdef FFCLOCK
+
+FEATURE(ffclock, "Feed-forward clock support");
+
+extern struct ffclock_estimate ffclock_estimate;
+extern struct bintime ffclock_boottime;
+extern int8_t ffclock_updated;
+extern struct mtx ffclock_mtx;
+
+/*
+ * Feed-forward clock absolute time. This should be the preferred way to read
+ * the feed-forward clock for "wall-clock" type time. The flags allow composing
+ * various flavours of absolute time (e.g. with or without leap seconds taken
+ * into account). If valid pointers are provided, the ffcounter value and an
+ * upper bound on the clock error associated with the bintime are returned.
+ * NOTE: use ffclock_convert_abs() to defer the conversion of an ffcounter
+ * value read earlier.
+ */
+void
+ffclock_abstime(ffcounter *ffcount, struct bintime *bt,
+ struct bintime *error_bound, uint32_t flags)
+{
+ struct ffclock_estimate cest;
+ ffcounter ffc;
+ ffcounter update_ffcount;
+ ffcounter ffdelta_error;
+
+ /* Get counter and corresponding time. */
+ if ((flags & FFCLOCK_FAST) == FFCLOCK_FAST)
+ ffclock_last_tick(&ffc, bt, flags);
+ else {
+ ffclock_read_counter(&ffc);
+ ffclock_convert_abs(ffc, bt, flags);
+ }
+
+ /* Current ffclock estimate, use update_ffcount as generation number. */
+ do {
+ update_ffcount = ffclock_estimate.update_ffcount;
+ bcopy(&ffclock_estimate, &cest, sizeof(struct ffclock_estimate));
+ } while (update_ffcount != ffclock_estimate.update_ffcount);
+
+ /*
+ * Leap second adjustment. Total as seen by synchronisation algorithm
+ * since it started. cest.leapsec_next is the ffcounter prediction of
+ * when the next leapsecond occurs.
+ */
+ if ((flags & FFCLOCK_LEAPSEC) == FFCLOCK_LEAPSEC) {
+ bt->sec -= cest.leapsec_total;
+ if (ffc > cest.leapsec_next)
+ bt->sec -= cest.leapsec;
+ }
+
+ /* Boot time adjustment, for uptime/monotonic clocks. */
+ if ((flags & FFCLOCK_UPTIME) == FFCLOCK_UPTIME) {
+ bintime_sub(bt, &ffclock_boottime);
+ }
+
+ /* Compute error bound if a valid pointer has been passed. */
+ if (error_bound) {
+ ffdelta_error = ffc - cest.update_ffcount;
+ ffclock_convert_diff(ffdelta_error, error_bound);
+ /* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s] */
+ bintime_mul(error_bound, cest.errb_rate *
+ (uint64_t)18446744073709LL);
+ /* 18446744073 = int(2^64 / 1e9), since err_abs in [ns] */
+ bintime_addx(error_bound, cest.errb_abs *
+ (uint64_t)18446744073LL);
+ }
+
+ if (ffcount)
+ *ffcount = ffc;
+}
+
+/*
+ * Feed-forward difference clock. This should be the preferred way to convert a
+ * time interval in ffcounter values into a time interval in seconds. If a valid
+ * pointer is passed, an upper bound on the error in computing the time interval
+ * in seconds is provided.
+ */
+void
+ffclock_difftime(ffcounter ffdelta, struct bintime *bt,
+ struct bintime *error_bound)
+{
+ ffcounter update_ffcount;
+ uint32_t err_rate;
+
+ ffclock_convert_diff(ffdelta, bt);
+
+ if (error_bound) {
+ do {
+ update_ffcount = ffclock_estimate.update_ffcount;
+ err_rate = ffclock_estimate.errb_rate;
+ } while (update_ffcount != ffclock_estimate.update_ffcount);
+
+ ffclock_convert_diff(ffdelta, error_bound);
+ /* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s] */
+ bintime_mul(error_bound, err_rate * (uint64_t)18446744073709LL);
+ }
+}
+
+/*
+ * Create a new kern.sysclock sysctl node, which will be home to some generic
+ * sysclock configuration variables. Feed-forward clock specific variables will
+ * live under the ffclock subnode.
+ */
+
+SYSCTL_NODE(_kern, OID_AUTO, sysclock, CTLFLAG_RW, 0,
+ "System clock related configuration");
+SYSCTL_NODE(_kern_sysclock, OID_AUTO, ffclock, CTLFLAG_RW, 0,
+ "Feed-forward clock configuration");
+
+static char *sysclocks[] = {"feedback", "feed-forward"};
+#define MAX_SYSCLOCK_NAME_LEN 16
+#define NUM_SYSCLOCKS (sizeof(sysclocks) / sizeof(*sysclocks))
+
+static int ffclock_version = 2;
+SYSCTL_INT(_kern_sysclock_ffclock, OID_AUTO, version, CTLFLAG_RD,
+ &ffclock_version, 0, "Feed-forward clock kernel version");
+
+/* List available sysclocks. */
+static int
+sysctl_kern_sysclock_available(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf *s;
+ int clk, error;
+
+ s = sbuf_new_for_sysctl(NULL, NULL,
+ MAX_SYSCLOCK_NAME_LEN * NUM_SYSCLOCKS, req);
+ if (s == NULL)
+ return (ENOMEM);
+
+ for (clk = 0; clk < NUM_SYSCLOCKS; clk++) {
+ sbuf_cat(s, sysclocks[clk]);
+ if (clk + 1 < NUM_SYSCLOCKS)
+ sbuf_cat(s, " ");
+ }
+ error = sbuf_finish(s);
+ sbuf_delete(s);
+
+ return (error);
+}
+
+SYSCTL_PROC(_kern_sysclock, OID_AUTO, available, CTLTYPE_STRING | CTLFLAG_RD,
+ 0, 0, sysctl_kern_sysclock_available, "A",
+ "List of available system clocks");
+
+/*
+ * Return the name of the active system clock if read, or attempt to change
+ * the active system clock to the user specified one if written to. The active
+ * system clock is read when calling any of the [get]{bin,nano,micro}[up]time()
+ * functions.
+ */
+static int
+sysctl_kern_sysclock_active(SYSCTL_HANDLER_ARGS)
+{
+ char newclock[MAX_SYSCLOCK_NAME_LEN];
+ int clk, error;
+
+ if (req->newptr == NULL) {
+ /* Return the name of the current active sysclock. */
+ strlcpy(newclock, sysclocks[sysclock_active], sizeof(newclock));
+ error = sysctl_handle_string(oidp, newclock,
+ sizeof(newclock), req);
+ } else {
+ /* Change the active sysclock to the user specified one. */
+ error = EINVAL;
+ for (clk = 0; clk < NUM_SYSCLOCKS; clk++) {
+ if (strncmp((char *)req->newptr, sysclocks[clk],
+ strlen(sysclocks[clk])) == 0) {
+ sysclock_active = clk;
+ error = 0;
+ break;
+ }
+ }
+ }
+
+ return (error);
+}
+
+SYSCTL_PROC(_kern_sysclock, OID_AUTO, active, CTLTYPE_STRING | CTLFLAG_RW,
+ 0, 0, sysctl_kern_sysclock_active, "A",
+ "Name of the active system clock which is currently serving time");
+
+static int sysctl_kern_ffclock_ffcounter_bypass = 0;
+SYSCTL_INT(_kern_sysclock_ffclock, OID_AUTO, ffcounter_bypass, CTLFLAG_RW,
+ &sysctl_kern_ffclock_ffcounter_bypass, 0,
+ "Use reliable hardware timecounter as the feed-forward counter");
+
+/*
+ * High level functions to access the Feed-Forward Clock.
+ */
+void
+ffclock_bintime(struct bintime *bt)
+{
+
+ ffclock_abstime(NULL, bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC);
+}
+
+void
+ffclock_nanotime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+ffclock_microtime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_LEAPSEC);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+ffclock_getbintime(struct bintime *bt)
+{
+
+ ffclock_abstime(NULL, bt, NULL,
+ FFCLOCK_LERP | FFCLOCK_LEAPSEC | FFCLOCK_FAST);
+}
+
+void
+ffclock_getnanotime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ ffclock_abstime(NULL, &bt, NULL,
+ FFCLOCK_LERP | FFCLOCK_LEAPSEC | FFCLOCK_FAST);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+ffclock_getmicrotime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ ffclock_abstime(NULL, &bt, NULL,
+ FFCLOCK_LERP | FFCLOCK_LEAPSEC | FFCLOCK_FAST);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+ffclock_binuptime(struct bintime *bt)
+{
+
+ ffclock_abstime(NULL, bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME);
+}
+
+void
+ffclock_nanouptime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+ffclock_microuptime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ ffclock_abstime(NULL, &bt, NULL, FFCLOCK_LERP | FFCLOCK_UPTIME);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+ffclock_getbinuptime(struct bintime *bt)
+{
+
+ ffclock_abstime(NULL, bt, NULL,
+ FFCLOCK_LERP | FFCLOCK_UPTIME | FFCLOCK_FAST);
+}
+
+void
+ffclock_getnanouptime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ ffclock_abstime(NULL, &bt, NULL,
+ FFCLOCK_LERP | FFCLOCK_UPTIME | FFCLOCK_FAST);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+ffclock_getmicrouptime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ ffclock_abstime(NULL, &bt, NULL,
+ FFCLOCK_LERP | FFCLOCK_UPTIME | FFCLOCK_FAST);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+ffclock_bindifftime(ffcounter ffdelta, struct bintime *bt)
+{
+
+ ffclock_difftime(ffdelta, bt, NULL);
+}
+
+void
+ffclock_nanodifftime(ffcounter ffdelta, struct timespec *tsp)
+{
+ struct bintime bt;
+
+ ffclock_difftime(ffdelta, &bt, NULL);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+ffclock_microdifftime(ffcounter ffdelta, struct timeval *tvp)
+{
+ struct bintime bt;
+
+ ffclock_difftime(ffdelta, &bt, NULL);
+ bintime2timeval(&bt, tvp);
+}
+
+/*
+ * System call allowing userland applications to retrieve the current value of
+ * the Feed-Forward Clock counter.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ffclock_getcounter_args {
+ ffcounter *ffcount;
+};
+#endif
+/* ARGSUSED */
+int
+sys_ffclock_getcounter(struct thread *td, struct ffclock_getcounter_args *uap)
+{
+ ffcounter ffcount;
+ int error;
+
+ ffcount = 0;
+ ffclock_read_counter(&ffcount);
+ if (ffcount == 0)
+ return (EAGAIN);
+ error = copyout(&ffcount, uap->ffcount, sizeof(ffcounter));
+
+ return (error);
+}
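+
+/*
+ * Hypothetical userland sketch (assumes an ffclock_getcounter(2) libc
+ * wrapper for this syscall; names are illustrative only):
+ *
+ *	ffcounter ffc;
+ *	if (ffclock_getcounter(&ffc) == -1)
+ *		err(1, "ffclock_getcounter");
+ */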
+
+/*
+ * System call allowing the synchronisation daemon to push new feed-forward clock
+ * estimates to the kernel. Acquire ffclock_mtx to prevent concurrent updates
+ * and ensure data consistency.
+ * NOTE: ffclock_updated signals the fftimehands that new estimates are
+ * available. The updated estimates are picked up by the fftimehands on next
+ * tick, which could take as long as 1/hz seconds (if ticks are not missed).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ffclock_setestimate_args {
+ struct ffclock_estimate *cest;
+};
+#endif
+/* ARGSUSED */
+int
+sys_ffclock_setestimate(struct thread *td, struct ffclock_setestimate_args *uap)
+{
+ struct ffclock_estimate cest;
+ int error;
+
+ /* Reuse of PRIV_CLOCK_SETTIME. */
+ if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0)
+ return (error);
+
+ if ((error = copyin(uap->cest, &cest, sizeof(struct ffclock_estimate)))
+ != 0)
+ return (error);
+
+ mtx_lock(&ffclock_mtx);
+ memcpy(&ffclock_estimate, &cest, sizeof(struct ffclock_estimate));
+ ffclock_updated++;
+ mtx_unlock(&ffclock_mtx);
+ return (error);
+}
+
+/*
+ * System call allowing userland applications to retrieve the clock estimates
+ * stored within the kernel. It is useful to kickstart the synchronisation
+ * daemon with the kernel's knowledge of the hardware timecounter.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ffclock_getestimate_args {
+ struct ffclock_estimate *cest;
+};
+#endif
+/* ARGSUSED */
+int
+sys_ffclock_getestimate(struct thread *td, struct ffclock_getestimate_args *uap)
+{
+ struct ffclock_estimate cest;
+ int error;
+
+ mtx_lock(&ffclock_mtx);
+ memcpy(&cest, &ffclock_estimate, sizeof(struct ffclock_estimate));
+ mtx_unlock(&ffclock_mtx);
+ error = copyout(&cest, uap->cest, sizeof(struct ffclock_estimate));
+ return (error);
+}
+
+#else /* !FFCLOCK */
+
+int
+sys_ffclock_getcounter(struct thread *td, struct ffclock_getcounter_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_ffclock_setestimate(struct thread *td, struct ffclock_setestimate_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_ffclock_getestimate(struct thread *td, struct ffclock_getestimate_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+#endif /* FFCLOCK */
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
new file mode 100644
index 0000000..9cd1da9
--- /dev/null
+++ b/sys/kern/kern_fork.c
@@ -0,0 +1,1052 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+#include "opt_kstack_pages.h"
+#include "opt_procdesc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/eventhandler.h>
+#include <sys/fcntl.h>
+#include <sys/filedesc.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/sysctl.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/procdesc.h>
+#include <sys/pioctl.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/syscall.h>
+#include <sys/vmmeter.h>
+#include <sys/vnode.h>
+#include <sys/acct.h>
+#include <sys/ktr.h>
+#include <sys/ktrace.h>
+#include <sys/unistd.h>
+#include <sys/sdt.h>
+#include <sys/sx.h>
+#include <sys/sysent.h>
+#include <sys/signalvar.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_extern.h>
+#include <vm/uma.h>
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+dtrace_fork_func_t dtrace_fasttrap_fork;
+#endif
+
+SDT_PROVIDER_DECLARE(proc);
+SDT_PROBE_DEFINE3(proc, kernel, , create, create, "struct proc *",
+ "struct proc *", "int");
+
+#ifndef _SYS_SYSPROTO_H_
+struct fork_args {
+ int dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+sys_fork(struct thread *td, struct fork_args *uap)
+{
+ int error;
+ struct proc *p2;
+
+ error = fork1(td, RFFDG | RFPROC, 0, &p2, NULL, 0);
+ if (error == 0) {
+ td->td_retval[0] = p2->p_pid;
+ td->td_retval[1] = 0;
+ }
+ return (error);
+}
+
+/* ARGSUSED */
+int
+sys_pdfork(struct thread *td, struct pdfork_args *uap)
+{
+#ifdef PROCDESC
+ int error, fd;
+ struct proc *p2;
+
+ /*
+ * It is necessary to return fd by reference because 0 is a valid file
+ * descriptor number, and the child needs to be able to distinguish
+ * itself from the parent using the return value.
+ */
+ error = fork1(td, RFFDG | RFPROC | RFPROCDESC, 0, &p2,
+ &fd, uap->flags);
+ if (error == 0) {
+ td->td_retval[0] = p2->p_pid;
+ td->td_retval[1] = 0;
+ error = copyout(&fd, uap->fdp, sizeof(fd));
+ }
+ return (error);
+#else
+ return (ENOSYS);
+#endif
+}
+
+/* ARGSUSED */
+int
+sys_vfork(struct thread *td, struct vfork_args *uap)
+{
+ int error, flags;
+ struct proc *p2;
+
+ flags = RFFDG | RFPROC | RFPPWAIT | RFMEM;
+ error = fork1(td, flags, 0, &p2, NULL, 0);
+ if (error == 0) {
+ td->td_retval[0] = p2->p_pid;
+ td->td_retval[1] = 0;
+ }
+ return (error);
+}
+
+int
+sys_rfork(struct thread *td, struct rfork_args *uap)
+{
+ struct proc *p2;
+ int error;
+
+ /* Don't allow kernel-only flags. */
+ if ((uap->flags & RFKERNELONLY) != 0)
+ return (EINVAL);
+
+ AUDIT_ARG_FFLAGS(uap->flags);
+ error = fork1(td, uap->flags, 0, &p2, NULL, 0);
+ if (error == 0) {
+ td->td_retval[0] = p2 ? p2->p_pid : 0;
+ td->td_retval[1] = 0;
+ }
+ return (error);
+}
+
+int nprocs = 1; /* process 0 */
+int lastpid = 0;
+SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
+ "Last used PID");
+
+/*
+ * Random component to lastpid generation. We mix in a random factor to make
+ * it a little harder to predict. We sanity check the modulus value here so
+ * we don't have to do it in critical paths. Don't let it be too small or we
+ * pointlessly waste entropy, and don't let it be impossibly large. Using a
+ * modulus that is too big causes a LOT more process table scans and slows
+ * down fork processing as the pidchecked caching is defeated.
+ */
+static int randompid = 0;
+
+static int
+sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
+{
+ int error, pid;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error != 0)
+ return(error);
+ sx_xlock(&allproc_lock);
+ pid = randompid;
+ error = sysctl_handle_int(oidp, &pid, 0, req);
+ if (error == 0 && req->newptr != NULL) {
+ if (pid < 0 || pid > pid_max - 100) /* out of range */
+ pid = pid_max - 100;
+ else if (pid < 2) /* NOP */
+ pid = 0;
+ else if (pid < 100) /* Make it reasonable */
+ pid = 100;
+ randompid = pid;
+ }
+ sx_xunlock(&allproc_lock);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
+ 0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
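+
+/*
+ * Illustrative usage (an assumption, not part of this change): setting the
+ * modulus from userland, e.g. "sysctl kern.randompid=1000", makes
+ * fork_findpid() below start its search at lastpid + 1 + (arc4random() % 1000)
+ * instead of lastpid + 1.
+ */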
+
+static int
+fork_findpid(int flags)
+{
+ struct proc *p;
+ int trypid;
+ static int pidchecked = 0;
+
+ /*
+ * Requires allproc_lock in order to iterate over the list
+ * of processes, and proctree_lock to access p_pgrp.
+ */
+ sx_assert(&allproc_lock, SX_LOCKED);
+ sx_assert(&proctree_lock, SX_LOCKED);
+
+ /*
+ * Find an unused process ID. We remember a range of unused IDs
+ * ready to use (from lastpid+1 through pidchecked-1).
+ *
+ * If RFHIGHPID is set (used during system boot), do not allocate
+ * low-numbered pids.
+ */
+ trypid = lastpid + 1;
+ if (flags & RFHIGHPID) {
+ if (trypid < 10)
+ trypid = 10;
+ } else {
+ if (randompid)
+ trypid += arc4random() % randompid;
+ }
+retry:
+ /*
+ * If the process ID prototype has wrapped around,
+ * restart somewhat above 0, as the low-numbered procs
+ * tend to include daemons that don't exit.
+ */
+ if (trypid >= pid_max) {
+ trypid = trypid % pid_max;
+ if (trypid < 100)
+ trypid += 100;
+ pidchecked = 0;
+ }
+ if (trypid >= pidchecked) {
+ int doingzomb = 0;
+
+ pidchecked = PID_MAX;
+ /*
+ * Scan the active and zombie procs to check whether this pid
+ * is in use. Remember the lowest pid that's greater
+ * than trypid, so we can avoid checking for a while.
+ */
+ p = LIST_FIRST(&allproc);
+again:
+ for (; p != NULL; p = LIST_NEXT(p, p_list)) {
+ while (p->p_pid == trypid ||
+ (p->p_pgrp != NULL &&
+ (p->p_pgrp->pg_id == trypid ||
+ (p->p_session != NULL &&
+ p->p_session->s_sid == trypid)))) {
+ trypid++;
+ if (trypid >= pidchecked)
+ goto retry;
+ }
+ if (p->p_pid > trypid && pidchecked > p->p_pid)
+ pidchecked = p->p_pid;
+ if (p->p_pgrp != NULL) {
+ if (p->p_pgrp->pg_id > trypid &&
+ pidchecked > p->p_pgrp->pg_id)
+ pidchecked = p->p_pgrp->pg_id;
+ if (p->p_session != NULL &&
+ p->p_session->s_sid > trypid &&
+ pidchecked > p->p_session->s_sid)
+ pidchecked = p->p_session->s_sid;
+ }
+ }
+ if (!doingzomb) {
+ doingzomb = 1;
+ p = LIST_FIRST(&zombproc);
+ goto again;
+ }
+ }
+
+ /*
+ * RFHIGHPID does not mess with the lastpid counter during boot.
+ */
+ if (flags & RFHIGHPID)
+ pidchecked = 0;
+ else
+ lastpid = trypid;
+
+ return (trypid);
+}
+
+static int
+fork_norfproc(struct thread *td, int flags)
+{
+ int error;
+ struct proc *p1;
+
+ KASSERT((flags & RFPROC) == 0,
+ ("fork_norfproc called with RFPROC set"));
+ p1 = td->td_proc;
+
+ if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
+ (flags & (RFCFDG | RFFDG))) {
+ PROC_LOCK(p1);
+ if (thread_single(SINGLE_BOUNDARY)) {
+ PROC_UNLOCK(p1);
+ return (ERESTART);
+ }
+ PROC_UNLOCK(p1);
+ }
+
+ error = vm_forkproc(td, NULL, NULL, NULL, flags);
+ if (error)
+ goto fail;
+
+ /*
+ * Close all file descriptors.
+ */
+ if (flags & RFCFDG) {
+ struct filedesc *fdtmp;
+ fdtmp = fdinit(td->td_proc->p_fd);
+ fdescfree(td);
+ p1->p_fd = fdtmp;
+ }
+
+ /*
+ * Unshare file descriptors (from parent).
+ */
+ if (flags & RFFDG)
+ fdunshare(p1, td);
+
+fail:
+ if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
+ (flags & (RFCFDG | RFFDG))) {
+ PROC_LOCK(p1);
+ thread_single_end();
+ PROC_UNLOCK(p1);
+ }
+ return (error);
+}
+
+static void
+do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
+ struct vmspace *vm2, int pdflags)
+{
+ struct proc *p1, *pptr;
+ int p2_held, trypid;
+ struct filedesc *fd;
+ struct filedesc_to_leader *fdtol;
+ struct sigacts *newsigacts;
+
+ sx_assert(&proctree_lock, SX_SLOCKED);
+ sx_assert(&allproc_lock, SX_XLOCKED);
+
+ p2_held = 0;
+ p1 = td->td_proc;
+
+ /*
+ * Increment the nprocs resource before blocking can occur. There
+ * are hard limits on the number of processes that can run.
+ */
+ nprocs++;
+
+ trypid = fork_findpid(flags);
+
+ sx_sunlock(&proctree_lock);
+
+ p2->p_state = PRS_NEW; /* protect against others */
+ p2->p_pid = trypid;
+ AUDIT_ARG_PID(p2->p_pid);
+ LIST_INSERT_HEAD(&allproc, p2, p_list);
+ LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
+ tidhash_add(td2);
+ PROC_LOCK(p2);
+ PROC_LOCK(p1);
+
+ sx_xunlock(&allproc_lock);
+
+ bcopy(&p1->p_startcopy, &p2->p_startcopy,
+ __rangeof(struct proc, p_startcopy, p_endcopy));
+ pargs_hold(p2->p_args);
+ PROC_UNLOCK(p1);
+
+ bzero(&p2->p_startzero,
+ __rangeof(struct proc, p_startzero, p_endzero));
+
+ p2->p_ucred = crhold(td->td_ucred);
+
+ /* Tell the prison that we exist. */
+ prison_proc_hold(p2->p_ucred->cr_prison);
+
+ PROC_UNLOCK(p2);
+
+ /*
+ * Malloc things while we don't hold any locks.
+ */
+ if (flags & RFSIGSHARE)
+ newsigacts = NULL;
+ else
+ newsigacts = sigacts_alloc();
+
+ /*
+ * Copy filedesc.
+ */
+ if (flags & RFCFDG) {
+ fd = fdinit(p1->p_fd);
+ fdtol = NULL;
+ } else if (flags & RFFDG) {
+ fd = fdcopy(p1->p_fd);
+ fdtol = NULL;
+ } else {
+ fd = fdshare(p1->p_fd);
+ if (p1->p_fdtol == NULL)
+ p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL,
+ p1->p_leader);
+ if ((flags & RFTHREAD) != 0) {
+ /*
+ * Shared file descriptor table, and shared
+ * process leaders.
+ */
+ fdtol = p1->p_fdtol;
+ FILEDESC_XLOCK(p1->p_fd);
+ fdtol->fdl_refcount++;
+ FILEDESC_XUNLOCK(p1->p_fd);
+ } else {
+ /*
+ * Shared file descriptor table, and different
+ * process leaders.
+ */
+ fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
+ p1->p_fd, p2);
+ }
+ }
+ /*
+ * Make a proc table entry for the new process.
+ * Start by zeroing the section of proc that is zero-initialized,
+ * then copy the section that is copied directly from the parent.
+ */
+
+ PROC_LOCK(p2);
+ PROC_LOCK(p1);
+
+ bzero(&td2->td_startzero,
+ __rangeof(struct thread, td_startzero, td_endzero));
+
+ bcopy(&td->td_startcopy, &td2->td_startcopy,
+ __rangeof(struct thread, td_startcopy, td_endcopy));
+
+ bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
+ td2->td_sigstk = td->td_sigstk;
+ td2->td_flags = TDF_INMEM;
+ td2->td_lend_user_pri = PRI_MAX;
+
+#ifdef VIMAGE
+ td2->td_vnet = NULL;
+ td2->td_vnet_lpush = NULL;
+#endif
+
+ /*
+ * Allow the scheduler to initialize the child.
+ */
+ thread_lock(td);
+ sched_fork(td, td2);
+ thread_unlock(td);
+
+ /*
+ * Duplicate sub-structures as needed.
+ * Increase reference counts on shared objects.
+ */
+ p2->p_flag = P_INMEM;
+ p2->p_swtick = ticks;
+ if (p1->p_flag & P_PROFIL)
+ startprofclock(p2);
+ td2->td_ucred = crhold(p2->p_ucred);
+
+ if (flags & RFSIGSHARE) {
+ p2->p_sigacts = sigacts_hold(p1->p_sigacts);
+ } else {
+ sigacts_copy(newsigacts, p1->p_sigacts);
+ p2->p_sigacts = newsigacts;
+ }
+
+ if (flags & RFTSIGZMB)
+ p2->p_sigparent = RFTSIGNUM(flags);
+ else if (flags & RFLINUXTHPN)
+ p2->p_sigparent = SIGUSR1;
+ else
+ p2->p_sigparent = SIGCHLD;
+
+ p2->p_textvp = p1->p_textvp;
+ p2->p_fd = fd;
+ p2->p_fdtol = fdtol;
+
+ /*
+ * p_limit is copy-on-write. Bump its refcount.
+ */
+ lim_fork(p1, p2);
+
+ pstats_fork(p1->p_stats, p2->p_stats);
+
+ PROC_UNLOCK(p1);
+ PROC_UNLOCK(p2);
+
+ /* Bump references to the text vnode (for procfs). */
+ if (p2->p_textvp)
+ vref(p2->p_textvp);
+
+ /*
+ * Set up linkage for kernel based threading.
+ */
+ if ((flags & RFTHREAD) != 0) {
+ mtx_lock(&ppeers_lock);
+ p2->p_peers = p1->p_peers;
+ p1->p_peers = p2;
+ p2->p_leader = p1->p_leader;
+ mtx_unlock(&ppeers_lock);
+ PROC_LOCK(p1->p_leader);
+ if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
+ PROC_UNLOCK(p1->p_leader);
+ /*
+ * The task leader is exiting, so process p1 is
+ * going to be killed shortly. Since p1 obviously
+ * isn't dead yet, we know that the leader is either
+ * sending SIGKILL's to all the processes in this
+ * task or is sleeping waiting for all the peers to
+ * exit. We let p1 complete the fork, but we need
+ * to go ahead and kill the new process p2 since
+ * the task leader may not get a chance to send
+ * SIGKILL to it. We leave it on the list so that
+ * the task leader will wait for this new process
+ * to commit suicide.
+ */
+ PROC_LOCK(p2);
+ kern_psignal(p2, SIGKILL);
+ PROC_UNLOCK(p2);
+ } else
+ PROC_UNLOCK(p1->p_leader);
+ } else {
+ p2->p_peers = NULL;
+ p2->p_leader = p2;
+ }
+
+ sx_xlock(&proctree_lock);
+ PGRP_LOCK(p1->p_pgrp);
+ PROC_LOCK(p2);
+ PROC_LOCK(p1);
+
+ /*
+ * Preserve some more flags in subprocess. P_PROFIL has already
+ * been preserved.
+ */
+ p2->p_flag |= p1->p_flag & P_SUGID;
+ td2->td_pflags |= td->td_pflags & TDP_ALTSTACK;
+ SESS_LOCK(p1->p_session);
+ if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
+ p2->p_flag |= P_CONTROLT;
+ SESS_UNLOCK(p1->p_session);
+ if (flags & RFPPWAIT)
+ p2->p_flag |= P_PPWAIT;
+
+ p2->p_pgrp = p1->p_pgrp;
+ LIST_INSERT_AFTER(p1, p2, p_pglist);
+ PGRP_UNLOCK(p1->p_pgrp);
+ LIST_INIT(&p2->p_children);
+ LIST_INIT(&p2->p_orphans);
+
+ callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0);
+
+ /*
+ * If PF_FORK is set, the child process inherits the
+ * procfs ioctl flags from its parent.
+ */
+ if (p1->p_pfsflags & PF_FORK) {
+ p2->p_stops = p1->p_stops;
+ p2->p_pfsflags = p1->p_pfsflags;
+ }
+
+ /*
+ * This begins the section where we must prevent the parent
+ * from being swapped.
+ */
+ _PHOLD(p1);
+ PROC_UNLOCK(p1);
+
+ /*
+ * Attach the new process to its parent.
+ *
+ * If RFNOWAIT is set, the newly created process becomes a child
+ * of init. This effectively disassociates the child from the
+ * parent.
+ */
+ if (flags & RFNOWAIT)
+ pptr = initproc;
+ else
+ pptr = p1;
+ p2->p_pptr = pptr;
+ LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
+ sx_xunlock(&proctree_lock);
+
+ /* Inform accounting that we have forked. */
+ p2->p_acflag = AFORK;
+ PROC_UNLOCK(p2);
+
+#ifdef KTRACE
+ ktrprocfork(p1, p2);
+#endif
+
+ /*
+ * Finish creating the child process. It will return via a different
+ * execution path later (i.e., directly into user mode).
+ */
+ vm_forkproc(td, p2, td2, vm2, flags);
+
+ if (flags == (RFFDG | RFPROC)) {
+ PCPU_INC(cnt.v_forks);
+ PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize +
+ p2->p_vmspace->vm_ssize);
+ } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
+ PCPU_INC(cnt.v_vforks);
+ PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
+ p2->p_vmspace->vm_ssize);
+ } else if (p1 == &proc0) {
+ PCPU_INC(cnt.v_kthreads);
+ PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
+ p2->p_vmspace->vm_ssize);
+ } else {
+ PCPU_INC(cnt.v_rforks);
+ PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
+ p2->p_vmspace->vm_ssize);
+ }
+
+#ifdef PROCDESC
+ /*
+ * Associate the process descriptor with the process before anything
+ * can happen that might cause that process to need the descriptor.
+ * However, don't do this until after fork(2) can no longer fail.
+ */
+ if (flags & RFPROCDESC)
+ procdesc_new(p2, pdflags);
+#endif
+
+ /*
+ * Both processes are set up, now check if any loadable modules want
+ * to adjust anything.
+ */
+ EVENTHANDLER_INVOKE(process_fork, p1, p2, flags);
+
+ /*
+ * Set the child start time and mark the process as being complete.
+ */
+ PROC_LOCK(p2);
+ PROC_LOCK(p1);
+ microuptime(&p2->p_stats->p_start);
+ PROC_SLOCK(p2);
+ p2->p_state = PRS_NORMAL;
+ PROC_SUNLOCK(p2);
+
+#ifdef KDTRACE_HOOKS
+ /*
+ * Tell the DTrace fasttrap provider about the new process
+ * if it has registered an interest. We have to do this only after
+ * p_state is PRS_NORMAL since the fasttrap module will use pfind()
+ * later on.
+ */
+ if (dtrace_fasttrap_fork)
+ dtrace_fasttrap_fork(p1, p2);
+#endif
+ if ((p1->p_flag & (P_TRACED | P_FOLLOWFORK)) == (P_TRACED |
+ P_FOLLOWFORK)) {
+ /*
+ * Arrange for debugger to receive the fork event.
+ *
+ * We can report PL_FLAG_FORKED regardless of
+ * P_FOLLOWFORK settings, but it does not make sense
+ * for a runaway child.
+ */
+ td->td_dbgflags |= TDB_FORK;
+ td->td_dbg_forked = p2->p_pid;
+ td2->td_dbgflags |= TDB_STOPATFORK;
+ _PHOLD(p2);
+ p2_held = 1;
+ }
+ if (flags & RFPPWAIT) {
+ td->td_pflags |= TDP_RFPPWAIT;
+ td->td_rfppwait_p = p2;
+ }
+ PROC_UNLOCK(p2);
+ if ((flags & RFSTOPPED) == 0) {
+ /*
+ * If RFSTOPPED not requested, make child runnable and
+ * add to run queue.
+ */
+ thread_lock(td2);
+ TD_SET_CAN_RUN(td2);
+ sched_add(td2, SRQ_BORING);
+ thread_unlock(td2);
+ }
+
+ /*
+ * Now can be swapped.
+ */
+ _PRELE(p1);
+ PROC_UNLOCK(p1);
+
+ /*
+ * Tell any interested parties about the new process.
+ */
+ knote_fork(&p1->p_klist, p2->p_pid);
+ SDT_PROBE(proc, kernel, , create, p2, p1, flags, 0, 0);
+
+ /*
+ * Wait until debugger is attached to child.
+ */
+ PROC_LOCK(p2);
+ while ((td2->td_dbgflags & TDB_STOPATFORK) != 0)
+ cv_wait(&p2->p_dbgwait, &p2->p_mtx);
+ if (p2_held)
+ _PRELE(p2);
+ PROC_UNLOCK(p2);
+}
+
+int
+fork1(struct thread *td, int flags, int pages, struct proc **procp,
+ int *procdescp, int pdflags)
+{
+ struct proc *p1;
+ struct proc *newproc;
+ int ok;
+ struct thread *td2;
+ struct vmspace *vm2;
+ vm_ooffset_t mem_charged;
+ int error;
+ static int curfail;
+ static struct timeval lastfail;
+#ifdef PROCDESC
+ struct file *fp_procdesc = NULL;
+#endif
+
+ /* Check for the undefined or unimplemented flags. */
+ if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0)
+ return (EINVAL);
+
+ /* Signal value requires RFTSIGZMB. */
+ if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0)
+ return (EINVAL);
+
+ /* Can't copy and clear. */
+ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
+ return (EINVAL);
+
+ /* Check the validity of the signal number. */
+ if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG)
+ return (EINVAL);
+
+#ifdef PROCDESC
+ if ((flags & RFPROCDESC) != 0) {
+ /* Can't get a process descriptor without also creating a process. */
+ if ((flags & RFPROC) == 0)
+ return (EINVAL);
+
+ /* Must provide a place to put a procdesc if creating one. */
+ if (procdescp == NULL)
+ return (EINVAL);
+ }
+#endif
+
+ p1 = td->td_proc;
+
+ /*
+ * Here we don't create a new process, but we divorce
+ * certain parts of a process from itself.
+ */
+ if ((flags & RFPROC) == 0) {
+ *procp = NULL;
+ return (fork_norfproc(td, flags));
+ }
+
+#ifdef PROCDESC
+ /*
+ * If required, create a process descriptor in the parent first; we
+ * will abandon it if something goes wrong. We don't finit() until
+ * later.
+ */
+ if (flags & RFPROCDESC) {
+ error = falloc(td, &fp_procdesc, procdescp, 0);
+ if (error != 0)
+ return (error);
+ }
+#endif
+
+ mem_charged = 0;
+ vm2 = NULL;
+ if (pages == 0)
+ pages = KSTACK_PAGES;
+ /* Allocate new proc. */
+ newproc = uma_zalloc(proc_zone, M_WAITOK);
+ td2 = FIRST_THREAD_IN_PROC(newproc);
+ if (td2 == NULL) {
+ td2 = thread_alloc(pages);
+ if (td2 == NULL) {
+ error = ENOMEM;
+ goto fail1;
+ }
+ proc_linkup(newproc, td2);
+ } else {
+ if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) {
+ if (td2->td_kstack != 0)
+ vm_thread_dispose(td2);
+ if (!thread_alloc_stack(td2, pages)) {
+ error = ENOMEM;
+ goto fail1;
+ }
+ }
+ }
+
+ if ((flags & RFMEM) == 0) {
+ vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
+ if (vm2 == NULL) {
+ error = ENOMEM;
+ goto fail1;
+ }
+ if (!swap_reserve(mem_charged)) {
+ /*
+ * The swap reservation failed. The accounting
+ * from the entries of the copied vm2 will be
+ * subtracted in vmspace_free(), so force the
+ * reservation there.
+ */
+ swap_reserve_force(mem_charged);
+ error = ENOMEM;
+ goto fail1;
+ }
+ } else
+ vm2 = NULL;
+
+ /*
+ * XXX: This is ugly; when we copy resource usage, we need to bump
+ * per-cred resource counters.
+ */
+ newproc->p_ucred = p1->p_ucred;
+
+ /*
+ * Initialize resource accounting for the child process.
+ */
+ error = racct_proc_fork(p1, newproc);
+ if (error != 0) {
+ error = EAGAIN;
+ goto fail1;
+ }
+
+#ifdef MAC
+ mac_proc_init(newproc);
+#endif
+ knlist_init_mtx(&newproc->p_klist, &newproc->p_mtx);
+ STAILQ_INIT(&newproc->p_ktr);
+
+ /* We have to lock the process tree while we look for a pid. */
+ sx_slock(&proctree_lock);
+
+ /*
+ * Although process entries are dynamically created, we still keep
+ * a global limit on the maximum number we will create. Don't allow
+ * a nonprivileged user to use the last ten processes; don't let root
+ * exceed the limit. The variable nprocs is the current number of
+ * processes, maxproc is the limit.
+ */
+ sx_xlock(&allproc_lock);
+ if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred,
+ PRIV_MAXPROC, 0) != 0) || nprocs >= maxproc) {
+ error = EAGAIN;
+ goto fail;
+ }
+
+ /*
+ * Increment the count of procs running with this uid. Don't allow
+ * a nonprivileged user to exceed their current limit.
+ *
+ * XXXRW: Can we avoid privilege here if it's not needed?
+ */
+ error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0);
+ if (error == 0)
+ ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0);
+ else {
+ PROC_LOCK(p1);
+ ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
+ lim_cur(p1, RLIMIT_NPROC));
+ PROC_UNLOCK(p1);
+ }
+ if (ok) {
+ do_fork(td, flags, newproc, td2, vm2, pdflags);
+
+ /*
+ * Return child proc pointer to parent.
+ */
+ *procp = newproc;
+#ifdef PROCDESC
+ if (flags & RFPROCDESC) {
+ procdesc_finit(newproc->p_procdesc, fp_procdesc);
+ fdrop(fp_procdesc, td);
+ }
+#endif
+ racct_proc_fork_done(newproc);
+ return (0);
+ }
+
+ error = EAGAIN;
+fail:
+ sx_sunlock(&proctree_lock);
+ if (ppsratecheck(&lastfail, &curfail, 1))
+ printf("maxproc limit exceeded by uid %u (pid %d); see tuning(7) and login.conf(5)\n",
+ td->td_ucred->cr_ruid, p1->p_pid);
+ sx_xunlock(&allproc_lock);
+#ifdef MAC
+ mac_proc_destroy(newproc);
+#endif
+ racct_proc_exit(newproc);
+fail1:
+ if (vm2 != NULL)
+ vmspace_free(vm2);
+ uma_zfree(proc_zone, newproc);
+#ifdef PROCDESC
+ if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) {
+ fdclose(td->td_proc->p_fd, fp_procdesc, *procdescp, td);
+ fdrop(fp_procdesc, td);
+ }
+#endif
+ pause("fork", hz / 2);
+ return (error);
+}
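+
+/*
+ * Editor's illustrative sketch (not part of this change): the flag
+ * combinations below mirror the accounting cases in do_fork() above.
+ * The wrapper name and calling context are hypothetical; the real
+ * callers are the fork(2)/vfork(2)/rfork(2) syscall glue.
+ */
+#if 0
+static int
+example_fork1_calls(struct thread *td)
+{
+	struct proc *p2;
+	int error;
+
+	/* fork(2): copy the descriptor table and create a new process. */
+	error = fork1(td, RFFDG | RFPROC, 0, &p2, NULL, 0);
+	if (error != 0)
+		return (error);
+
+	/* vfork(2)-style: share the address space, parent waits via RFPPWAIT. */
+	return (fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, 0, &p2, NULL, 0));
+}
+#endif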
+
+/*
+ * Handle the return of a child process from fork1(). This function
+ * is called from the MD fork_trampoline() entry point.
+ */
+void
+fork_exit(void (*callout)(void *, struct trapframe *), void *arg,
+ struct trapframe *frame)
+{
+ struct proc *p;
+ struct thread *td;
+ struct thread *dtd;
+
+ td = curthread;
+ p = td->td_proc;
+ KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));
+
+ CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)",
+ td, td->td_sched, p->p_pid, td->td_name);
+
+ sched_fork_exit(td);
+ /*
+ * Processes normally resume in mi_switch() after being
+ * cpu_switch()'ed to, but when children start up they arrive here
+ * instead, so we must do much the same things as mi_switch() would.
+ */
+ if ((dtd = PCPU_GET(deadthread))) {
+ PCPU_SET(deadthread, NULL);
+ thread_stash(dtd);
+ }
+ thread_unlock(td);
+
+ /*
+ * cpu_set_fork_handler intercepts this function call to
+ * have it call a non-returning function instead, keeping the thread in kernel mode.
+ * initproc has its own fork handler, but it does return.
+ */
+ KASSERT(callout != NULL, ("NULL callout in fork_exit"));
+ callout(arg, frame);
+
+ /*
+ * Check if a kernel thread misbehaved and returned from its main
+ * function.
+ */
+ if (p->p_flag & P_KTHREAD) {
+ printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
+ td->td_name, p->p_pid);
+ kproc_exit(0);
+ }
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ if (p->p_sysent->sv_schedtail != NULL)
+ (p->p_sysent->sv_schedtail)(td);
+}
+
+/*
+ * Simplified back end of syscall(), used when returning from fork()
+ * directly into user mode. Giant is not held on entry, and must not
+ * be held on return. This function is passed in to fork_exit() as the
+ * first parameter and is called when returning to a new userland process.
+ */
+void
+fork_return(struct thread *td, struct trapframe *frame)
+{
+ struct proc *p, *dbg;
+
+ if (td->td_dbgflags & TDB_STOPATFORK) {
+ p = td->td_proc;
+ sx_xlock(&proctree_lock);
+ PROC_LOCK(p);
+ if ((p->p_pptr->p_flag & (P_TRACED | P_FOLLOWFORK)) ==
+ (P_TRACED | P_FOLLOWFORK)) {
+ /*
+ * If debugger still wants auto-attach for the
+ * parent's children, do it now.
+ */
+ dbg = p->p_pptr->p_pptr;
+ p->p_flag |= P_TRACED;
+ p->p_oppid = p->p_pptr->p_pid;
+ proc_reparent(p, dbg);
+ sx_xunlock(&proctree_lock);
+ td->td_dbgflags |= TDB_CHILD;
+ ptracestop(td, SIGSTOP);
+ td->td_dbgflags &= ~TDB_CHILD;
+ } else {
+ /*
+ * ... otherwise clear the request.
+ */
+ sx_xunlock(&proctree_lock);
+ td->td_dbgflags &= ~TDB_STOPATFORK;
+ cv_broadcast(&p->p_dbgwait);
+ }
+ PROC_UNLOCK(p);
+ }
+
+ userret(td, frame);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_SYSRET))
+ ktrsysret(SYS_fork, 0, 0);
+#endif
+}
diff --git a/sys/kern/kern_gzio.c b/sys/kern/kern_gzio.c
new file mode 100644
index 0000000..15dc301
--- /dev/null
+++ b/sys/kern/kern_gzio.c
@@ -0,0 +1,400 @@
+/*
+ * $Id: kern_gzio.c,v 1.6 2008-10-18 22:54:45 lbazinet Exp $
+ *
+ * core_gzip.c -- gzip routines used in compressing user process cores
+ *
+ * This file is derived from src/lib/libz/gzio.c in FreeBSD.
+ */
+
+/* gzio.c -- IO on .gz files
+ * Copyright (C) 1995-1998 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ */
+
+/* @(#) $FreeBSD$ */
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/vnode.h>
+#include <sys/syslog.h>
+#include <sys/endian.h>
+#include <net/zutil.h>
+#include <sys/libkern.h>
+
+#include <sys/mount.h>
+
+#define GZ_HEADER_LEN 10
+
+#ifndef Z_BUFSIZE
+# ifdef MAXSEG_64K
+# define Z_BUFSIZE 4096 /* minimize memory usage for 16-bit DOS */
+# else
+# define Z_BUFSIZE 16384
+# endif
+#endif
+#ifndef Z_PRINTF_BUFSIZE
+# define Z_PRINTF_BUFSIZE 4096
+#endif
+
+#define ALLOC(size) malloc(size, M_TEMP, M_WAITOK | M_ZERO)
+#define TRYFREE(p) {if (p) free(p, M_TEMP);}
+
+static int gz_magic[2] = {0x1f, 0x8b}; /* gzip magic header */
+
+/* gzip flag byte */
+#define ASCII_FLAG 0x01 /* bit 0 set: file probably ascii text */
+#define HEAD_CRC 0x02 /* bit 1 set: header CRC present */
+#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
+#define ORIG_NAME 0x08 /* bit 3 set: original file name present */
+#define COMMENT 0x10 /* bit 4 set: file comment present */
+#define RESERVED 0xE0 /* bits 5..7: reserved */
+
+typedef struct gz_stream {
+ z_stream stream;
+ int z_err; /* error code for last stream operation */
+ int z_eof; /* set if end of input file */
+ struct vnode *file; /* vnode pointer of .gz file */
+ Byte *inbuf; /* input buffer */
+ Byte *outbuf; /* output buffer */
+ uLong crc; /* crc32 of uncompressed data */
+ char *msg; /* error message */
+ char *path; /* path name for debugging only */
+ int transparent; /* 1 if input file is not a .gz file */
+ char mode; /* 'w' or 'r' */
+ long startpos; /* start of compressed data in file (header skipped) */
+ off_t outoff; /* current offset in output file */
+ int flags;
+} gz_stream;
+
+
+local int do_flush OF((gzFile file, int flush));
+local int destroy OF((gz_stream *s));
+local void putU32 OF((gz_stream *file, uint32_t x));
+local void *gz_alloc OF((void *notused, u_int items, u_int size));
+local void gz_free OF((void *notused, void *ptr));
+
+/* ===========================================================================
+ Opens a gzip (.gz) stream for writing. The mode parameter is as in
+ fopen ("wb"); only write mode is supported here. The output file is
+ given by the vnode vp; the path argument is kept for debugging only.
+ gz_open returns NULL if the arguments are invalid or if there was
+ insufficient memory to allocate the compression state.
+*/
+gzFile gz_open (path, mode, vp)
+ const char *path;
+ const char *mode;
+ struct vnode *vp;
+{
+ int err;
+ int level = Z_DEFAULT_COMPRESSION; /* compression level */
+ int strategy = Z_DEFAULT_STRATEGY; /* compression strategy */
+ const char *p = mode;
+ gz_stream *s;
+ char fmode[80]; /* copy of mode, without the compression level */
+ char *m = fmode;
+ ssize_t resid;
+ int error;
+ char buf[GZ_HEADER_LEN + 1];
+
+ if (!path || !mode) return Z_NULL;
+
+ s = (gz_stream *)ALLOC(sizeof(gz_stream));
+ if (!s) return Z_NULL;
+
+ s->stream.zalloc = (alloc_func)gz_alloc;
+ s->stream.zfree = (free_func)gz_free;
+ s->stream.opaque = (voidpf)0;
+ s->stream.next_in = s->inbuf = Z_NULL;
+ s->stream.next_out = s->outbuf = Z_NULL;
+ s->stream.avail_in = s->stream.avail_out = 0;
+ s->file = NULL;
+ s->z_err = Z_OK;
+ s->z_eof = 0;
+ s->crc = 0;
+ s->msg = NULL;
+ s->transparent = 0;
+ s->outoff = 0;
+ s->flags = 0;
+
+ s->path = (char*)ALLOC(strlen(path)+1);
+ if (s->path == NULL) {
+ return destroy(s), (gzFile)Z_NULL;
+ }
+ strcpy(s->path, path); /* do this early for debugging */
+
+ s->mode = '\0';
+ do {
+ if (*p == 'r') s->mode = 'r';
+ if (*p == 'w' || *p == 'a') s->mode = 'w';
+ if (*p >= '0' && *p <= '9') {
+ level = *p - '0';
+ } else if (*p == 'f') {
+ strategy = Z_FILTERED;
+ } else if (*p == 'h') {
+ strategy = Z_HUFFMAN_ONLY;
+ } else {
+ *m++ = *p; /* copy the mode */
+ }
+ } while (*p++ && m != fmode + sizeof(fmode));
+
+ if (s->mode != 'w') {
+ log(LOG_ERR, "gz_open: mode is not w (%c)\n", s->mode);
+ return destroy(s), (gzFile)Z_NULL;
+ }
+
+ err = deflateInit2(&(s->stream), level,
+ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, strategy);
+ /* windowBits is passed < 0 to suppress zlib header */
+
+ s->stream.next_out = s->outbuf = (Byte*)ALLOC(Z_BUFSIZE);
+ if (err != Z_OK || s->outbuf == Z_NULL) {
+ return destroy(s), (gzFile)Z_NULL;
+ }
+
+ s->stream.avail_out = Z_BUFSIZE;
+ s->file = vp;
+
+ /* Write a very simple .gz header:
+ */
+ snprintf(buf, sizeof(buf), "%c%c%c%c%c%c%c%c%c%c", gz_magic[0],
+ gz_magic[1], Z_DEFLATED, 0 /*flags*/, 0,0,0,0 /*time*/,
+ 0 /*xflags*/, OS_CODE);
+
+ if ((error = vn_rdwr(UIO_WRITE, s->file, buf, GZ_HEADER_LEN, s->outoff,
+ UIO_SYSSPACE, IO_UNIT, curproc->p_ucred,
+ NOCRED, &resid, curthread))) {
+ s->outoff += GZ_HEADER_LEN - resid;
+ return destroy(s), (gzFile)Z_NULL;
+ }
+ s->outoff += GZ_HEADER_LEN;
+ s->startpos = 10L;
+
+ return (gzFile)s;
+}
+
+
+/* ===========================================================================
+ Clean up, then free the given gz_stream. Returns a zlib error code.
+ Try freeing in the reverse order of allocations.
+*/
+local int destroy (s)
+ gz_stream *s;
+{
+ int err = Z_OK;
+
+ if (!s) return Z_STREAM_ERROR;
+
+ TRYFREE(s->msg);
+
+ if (s->stream.state != NULL) {
+ if (s->mode == 'w') {
+ err = deflateEnd(&(s->stream));
+ }
+ }
+ if (s->z_err < 0) err = s->z_err;
+
+ TRYFREE(s->inbuf);
+ TRYFREE(s->outbuf);
+ TRYFREE(s->path);
+ TRYFREE(s);
+ return err;
+}
+
+
+/* ===========================================================================
+ Writes the given number of uncompressed bytes into the compressed file.
+ gzwrite returns the number of bytes actually written (0 in case of error).
+*/
+int ZEXPORT gzwrite (file, buf, len)
+ gzFile file;
+ const voidp buf;
+ unsigned len;
+{
+ gz_stream *s = (gz_stream*)file;
+ off_t curoff;
+ size_t resid;
+ int error;
+
+ if (s == NULL || s->mode != 'w') return Z_STREAM_ERROR;
+
+ s->stream.next_in = (Bytef*)buf;
+ s->stream.avail_in = len;
+
+ curoff = s->outoff;
+ while (s->stream.avail_in != 0) {
+
+ if (s->stream.avail_out == 0) {
+
+ s->stream.next_out = s->outbuf;
+ error = vn_rdwr_inchunks(UIO_WRITE, s->file, s->outbuf, Z_BUFSIZE,
+ curoff, UIO_SYSSPACE, IO_UNIT,
+ curproc->p_ucred, NOCRED, &resid, curthread);
+ if (error) {
+ log(LOG_ERR, "gzwrite: vn_rdwr return %d\n", error);
+ curoff += Z_BUFSIZE - resid;
+ s->z_err = Z_ERRNO;
+ break;
+ }
+ curoff += Z_BUFSIZE;
+ s->stream.avail_out = Z_BUFSIZE;
+ }
+ s->z_err = deflate(&(s->stream), Z_NO_FLUSH);
+ if (s->z_err != Z_OK) {
+ log(LOG_ERR,
+ "gzwrite: deflate returned error %d\n", s->z_err);
+ break;
+ }
+ }
+
+ s->crc = ~crc32_raw(buf, len, ~s->crc);
+ s->outoff = curoff;
+
+ return (int)(len - s->stream.avail_in);
+}
+
+
+/* ===========================================================================
+ Flushes all pending output into the compressed file. The parameter
+ flush is as in the deflate() function.
+*/
+local int do_flush (file, flush)
+ gzFile file;
+ int flush;
+{
+ uInt len;
+ int done = 0;
+ gz_stream *s = (gz_stream*)file;
+ off_t curoff = s->outoff;
+ size_t resid;
+ int error;
+
+ if (s == NULL || s->mode != 'w') return Z_STREAM_ERROR;
+
+ if (s->stream.avail_in) {
+ log(LOG_WARNING, "do_flush: avail_in non-zero on entry\n");
+ }
+
+ s->stream.avail_in = 0; /* should be zero already anyway */
+
+ for (;;) {
+ len = Z_BUFSIZE - s->stream.avail_out;
+
+ if (len != 0) {
+ error = vn_rdwr_inchunks(UIO_WRITE, s->file, s->outbuf, len, curoff,
+ UIO_SYSSPACE, IO_UNIT, curproc->p_ucred,
+ NOCRED, &resid, curthread);
+ if (error) {
+ s->z_err = Z_ERRNO;
+ s->outoff = curoff + len - resid;
+ return Z_ERRNO;
+ }
+ s->stream.next_out = s->outbuf;
+ s->stream.avail_out = Z_BUFSIZE;
+ curoff += len;
+ }
+ if (done) break;
+ s->z_err = deflate(&(s->stream), flush);
+
+ /* Ignore the second of two consecutive flushes: */
+ if (len == 0 && s->z_err == Z_BUF_ERROR) s->z_err = Z_OK;
+
+ /* deflate has finished flushing only when it hasn't used up
+ * all the available space in the output buffer:
+ */
+ done = (s->stream.avail_out != 0 || s->z_err == Z_STREAM_END);
+
+ if (s->z_err != Z_OK && s->z_err != Z_STREAM_END) break;
+ }
+ s->outoff = curoff;
+
+ return s->z_err == Z_STREAM_END ? Z_OK : s->z_err;
+}
+
+int ZEXPORT gzflush (file, flush)
+ gzFile file;
+ int flush;
+{
+ gz_stream *s = (gz_stream*)file;
+ int err = do_flush (file, flush);
+
+ if (err) return err;
+ return s->z_err == Z_STREAM_END ? Z_OK : s->z_err;
+}
+
+
+/* ===========================================================================
+ Outputs a 32-bit value in LSB (little-endian) order to the given file
+*/
+local void putU32 (s, x)
+ gz_stream *s;
+ uint32_t x;
+{
+ uint32_t xx;
+ off_t curoff = s->outoff;
+ ssize_t resid;
+
+#if BYTE_ORDER == BIG_ENDIAN
+ xx = bswap32(x);
+#else
+ xx = x;
+#endif
+ vn_rdwr(UIO_WRITE, s->file, (caddr_t)&xx, sizeof(xx), curoff,
+ UIO_SYSSPACE, IO_UNIT, curproc->p_ucred,
+ NOCRED, &resid, curthread);
+ s->outoff += sizeof(xx) - resid;
+}
+
+
+/* ===========================================================================
+ Flushes all pending output if necessary, closes the compressed file
+ and deallocates all the (de)compression state.
+*/
+int ZEXPORT gzclose (file)
+ gzFile file;
+{
+ int err;
+ gz_stream *s = (gz_stream*)file;
+
+ if (s == NULL) return Z_STREAM_ERROR;
+
+ if (s->mode == 'w') {
+ err = do_flush (file, Z_FINISH);
+ if (err != Z_OK) {
+ log(LOG_ERR, "gzclose: do_flush failed (err %d)\n", err);
+ return destroy((gz_stream*)file);
+ }
+#if 0
+ printf("gzclose: putting crc: %lld total: %lld\n",
+ (long long)s->crc, (long long)s->stream.total_in);
+ printf("sizeof uLong = %d\n", (int)sizeof(uLong));
+#endif
+ putU32 (s, s->crc);
+ putU32 (s, (uint32_t) s->stream.total_in);
+ }
+ return destroy((gz_stream*)file);
+}
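+
+/*
+ * Editor's illustrative sketch (not part of this change): the intended
+ * calling sequence for the routines above when compressing a core image.
+ * The vnode and data buffer are assumed to be supplied by the coredump
+ * path; error mapping is simplified.
+ */
+#if 0
+static int
+example_gz_core(struct vnode *vp, void *buf, unsigned len)
+{
+	gzFile gz;
+
+	gz = gz_open("core.gz", "wb", vp);	/* write-only gzip stream */
+	if (gz == Z_NULL)
+		return (ENOMEM);
+	if (gzwrite(gz, buf, len) != (int)len)	/* short count on error */
+		log(LOG_ERR, "example_gz_core: short gzwrite\n");
+	if (gzclose(gz) != Z_OK)		/* flush, emit CRC and length */
+		return (EIO);
+	return (0);
+}
+#endif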
+
+/*
+ * Space allocation and freeing routines for use by zlib routines when called
+ * from gzip modules.
+ */
+static void *
+gz_alloc(void *notused __unused, u_int items, u_int size)
+{
+ void *ptr;
+
+ ptr = malloc(items * size, M_TEMP, M_NOWAIT | M_ZERO);
+ return ptr;
+}
+
+static void
+gz_free(void *opaque __unused, void *ptr)
+{
+ free(ptr, M_TEMP);
+}
+
diff --git a/sys/kern/kern_hhook.c b/sys/kern/kern_hhook.c
new file mode 100644
index 0000000..321e1a9
--- /dev/null
+++ b/sys/kern/kern_hhook.c
@@ -0,0 +1,521 @@
+/*-
+ * Copyright (c) 2010,2013 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Lawrence Stewart while studying at the Centre
+ * for Advanced Internet Architectures, Swinburne University of Technology,
+ * made possible in part by grants from the FreeBSD Foundation and Cisco
+ * University Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/hhook.h>
+#include <sys/khelp.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/module_khelp.h>
+#include <sys/osd.h>
+#include <sys/queue.h>
+#include <sys/refcount.h>
+#include <sys/systm.h>
+
+#include <net/vnet.h>
+
+struct hhook {
+ hhook_func_t hhk_func;
+ struct helper *hhk_helper;
+ void *hhk_udata;
+ STAILQ_ENTRY(hhook) hhk_next;
+};
+
+static MALLOC_DEFINE(M_HHOOK, "hhook", "Helper hooks are linked off hhook_head lists");
+
+LIST_HEAD(hhookheadhead, hhook_head);
+struct hhookheadhead hhook_head_list;
+VNET_DEFINE(struct hhookheadhead, hhook_vhead_list);
+#define V_hhook_vhead_list VNET(hhook_vhead_list)
+
+static struct mtx hhook_head_list_lock;
+MTX_SYSINIT(hhookheadlistlock, &hhook_head_list_lock, "hhook_head list lock",
+ MTX_DEF);
+
+/* Protected by hhook_head_list_lock. */
+static uint32_t n_hhookheads;
+
+/* Private function prototypes. */
+static void hhook_head_destroy(struct hhook_head *hhh);
+void khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags);
+
+#define HHHLIST_LOCK() mtx_lock(&hhook_head_list_lock)
+#define HHHLIST_UNLOCK() mtx_unlock(&hhook_head_list_lock)
+#define HHHLIST_LOCK_ASSERT() mtx_assert(&hhook_head_list_lock, MA_OWNED)
+
+#define HHH_LOCK_INIT(hhh) rm_init(&(hhh)->hhh_lock, "hhook_head rm lock")
+#define HHH_LOCK_DESTROY(hhh) rm_destroy(&(hhh)->hhh_lock)
+#define HHH_WLOCK(hhh) rm_wlock(&(hhh)->hhh_lock)
+#define HHH_WUNLOCK(hhh) rm_wunlock(&(hhh)->hhh_lock)
+#define HHH_RLOCK(hhh, rmpt) rm_rlock(&(hhh)->hhh_lock, (rmpt))
+#define HHH_RUNLOCK(hhh, rmpt) rm_runlock(&(hhh)->hhh_lock, (rmpt))
+
+/*
+ * Run all helper hook functions for a given hook point.
+ */
+void
+hhook_run_hooks(struct hhook_head *hhh, void *ctx_data, struct osd *hosd)
+{
+ struct hhook *hhk;
+ void *hdata;
+ struct rm_priotracker rmpt;
+
+ KASSERT(hhh->hhh_refcount > 0, ("hhook_head %p refcount is 0", hhh));
+
+ HHH_RLOCK(hhh, &rmpt);
+ STAILQ_FOREACH(hhk, &hhh->hhh_hooks, hhk_next) {
+ if (hhk->hhk_helper->h_flags & HELPER_NEEDS_OSD) {
+ hdata = osd_get(OSD_KHELP, hosd, hhk->hhk_helper->h_id);
+ if (hdata == NULL)
+ continue;
+ } else
+ hdata = NULL;
+
+ /*
+ * XXXLAS: We currently ignore the int returned by the hook,
+ * but will likely want to handle it in future to allow hhook to
+ * be used like pfil and effect changes at the hhook calling
+ * site e.g. we could define a new hook type of HHOOK_TYPE_PFIL
+ * and standardise what particular return values mean and set
+ * the context data to pass exactly the same information as pfil
+ * hooks currently receive, thus replicating pfil with hhook.
+ */
+ hhk->hhk_func(hhh->hhh_type, hhh->hhh_id, hhk->hhk_udata,
+ ctx_data, hdata, hosd);
+ }
+ HHH_RUNLOCK(hhh, &rmpt);
+}
+
+/*
+ * Register a new helper hook function with a helper hook point.
+ */
+int
+hhook_add_hook(struct hhook_head *hhh, struct hookinfo *hki, uint32_t flags)
+{
+ struct hhook *hhk, *tmp;
+ int error;
+
+ error = 0;
+
+ if (hhh == NULL)
+ return (ENOENT);
+
+ hhk = malloc(sizeof(struct hhook), M_HHOOK,
+ M_ZERO | ((flags & HHOOK_WAITOK) ? M_WAITOK : M_NOWAIT));
+
+ if (hhk == NULL)
+ return (ENOMEM);
+
+ hhk->hhk_helper = hki->hook_helper;
+ hhk->hhk_func = hki->hook_func;
+ hhk->hhk_udata = hki->hook_udata;
+
+ HHH_WLOCK(hhh);
+ STAILQ_FOREACH(tmp, &hhh->hhh_hooks, hhk_next) {
+ if (tmp->hhk_func == hki->hook_func &&
+ tmp->hhk_udata == hki->hook_udata) {
+ /* The helper hook function is already registered. */
+ error = EEXIST;
+ break;
+ }
+ }
+
+ if (!error) {
+ STAILQ_INSERT_TAIL(&hhh->hhh_hooks, hhk, hhk_next);
+ hhh->hhh_nhooks++;
+ } else
+ free(hhk, M_HHOOK);
+
+ HHH_WUNLOCK(hhh);
+
+ return (error);
+}
+
+/*
+ * Register a helper hook function with a helper hook point (including all
+ * virtual instances of the hook point if it is virtualised).
+ *
+ * The logic is unfortunately far more complex than for
+ * hhook_remove_hook_lookup() because hhook_add_hook() can call malloc() with
+ * M_WAITOK and thus we cannot call hhook_add_hook() with the
+ * hhook_head_list_lock held.
+ *
+ * The logic assembles an array of hhook_head structs that correspond to the
+ * helper hook point being hooked and bumps the refcount on each (all done with
+ * the hhook_head_list_lock held). The hhook_head_list_lock is then dropped, and
+ * hhook_add_hook() is called and the refcount dropped for each hhook_head
+ * struct in the array.
+ */
+int
+hhook_add_hook_lookup(struct hookinfo *hki, uint32_t flags)
+{
+ struct hhook_head **heads_to_hook, *hhh;
+ int error, i, n_heads_to_hook;
+
+tryagain:
+ error = i = 0;
+ /*
+ * Accessing n_hhookheads without hhook_head_list_lock held opens up a
+ * race with hhook_head_register() which we are unlikely to lose, but
+ * nonetheless have to cope with - hence the complex goto logic.
+ */
+ n_heads_to_hook = n_hhookheads;
+ heads_to_hook = malloc(n_heads_to_hook * sizeof(struct hhook_head *),
+ M_HHOOK, flags & HHOOK_WAITOK ? M_WAITOK : M_NOWAIT);
+ if (heads_to_hook == NULL)
+ return (ENOMEM);
+
+ HHHLIST_LOCK();
+ LIST_FOREACH(hhh, &hhook_head_list, hhh_next) {
+ if (hhh->hhh_type == hki->hook_type &&
+ hhh->hhh_id == hki->hook_id) {
+ if (i < n_heads_to_hook) {
+ heads_to_hook[i] = hhh;
+ refcount_acquire(&heads_to_hook[i]->hhh_refcount);
+ i++;
+ } else {
+ /*
+ * We raced with hhook_head_register() which
+ * inserted a hhook_head that we need to hook
+ * but did not malloc space for. Abort this run
+ * and try again.
+ */
+ for (i--; i >= 0; i--)
+ refcount_release(&heads_to_hook[i]->hhh_refcount);
+ free(heads_to_hook, M_HHOOK);
+ HHHLIST_UNLOCK();
+ goto tryagain;
+ }
+ }
+ }
+ HHHLIST_UNLOCK();
+
+ for (i--; i >= 0; i--) {
+ if (!error)
+ error = hhook_add_hook(heads_to_hook[i], hki, flags);
+ refcount_release(&heads_to_hook[i]->hhh_refcount);
+ }
+
+ free(heads_to_hook, M_HHOOK);
+
+ return (error);
+}
+
+/*
+ * Remove a helper hook function from a helper hook point.
+ */
+int
+hhook_remove_hook(struct hhook_head *hhh, struct hookinfo *hki)
+{
+ struct hhook *tmp;
+
+ if (hhh == NULL)
+ return (ENOENT);
+
+ HHH_WLOCK(hhh);
+ STAILQ_FOREACH(tmp, &hhh->hhh_hooks, hhk_next) {
+ if (tmp->hhk_func == hki->hook_func &&
+ tmp->hhk_udata == hki->hook_udata) {
+ STAILQ_REMOVE(&hhh->hhh_hooks, tmp, hhook, hhk_next);
+ free(tmp, M_HHOOK);
+ hhh->hhh_nhooks--;
+ break;
+ }
+ }
+ HHH_WUNLOCK(hhh);
+
+ return (0);
+}
+
+/*
+ * Remove a helper hook function from a helper hook point (including all
+ * virtual instances of the hook point if it is virtualised).
+ */
+int
+hhook_remove_hook_lookup(struct hookinfo *hki)
+{
+ struct hhook_head *hhh;
+
+ HHHLIST_LOCK();
+ LIST_FOREACH(hhh, &hhook_head_list, hhh_next) {
+ if (hhh->hhh_type == hki->hook_type &&
+ hhh->hhh_id == hki->hook_id)
+ hhook_remove_hook(hhh, hki);
+ }
+ HHHLIST_UNLOCK();
+
+ return (0);
+}
+
+/*
+ * Register a new helper hook point.
+ */
+int
+hhook_head_register(int32_t hhook_type, int32_t hhook_id, struct hhook_head **hhh,
+ uint32_t flags)
+{
+ struct hhook_head *tmphhh;
+
+ tmphhh = hhook_head_get(hhook_type, hhook_id);
+
+ if (tmphhh != NULL) {
+ /* Hook point previously registered. */
+ hhook_head_release(tmphhh);
+ return (EEXIST);
+ }
+
+ tmphhh = malloc(sizeof(struct hhook_head), M_HHOOK,
+ M_ZERO | ((flags & HHOOK_WAITOK) ? M_WAITOK : M_NOWAIT));
+
+ if (tmphhh == NULL)
+ return (ENOMEM);
+
+ tmphhh->hhh_type = hhook_type;
+ tmphhh->hhh_id = hhook_id;
+ tmphhh->hhh_nhooks = 0;
+ STAILQ_INIT(&tmphhh->hhh_hooks);
+ HHH_LOCK_INIT(tmphhh);
+ refcount_init(&tmphhh->hhh_refcount, 1);
+
+ HHHLIST_LOCK();
+ if (flags & HHOOK_HEADISINVNET) {
+ tmphhh->hhh_flags |= HHH_ISINVNET;
+#ifdef VIMAGE
+ KASSERT(curvnet != NULL, ("curvnet is NULL"));
+ tmphhh->hhh_vid = (uintptr_t)curvnet;
+ LIST_INSERT_HEAD(&V_hhook_vhead_list, tmphhh, hhh_vnext);
+#endif
+ }
+ LIST_INSERT_HEAD(&hhook_head_list, tmphhh, hhh_next);
+ n_hhookheads++;
+ HHHLIST_UNLOCK();
+
+ khelp_new_hhook_registered(tmphhh, flags);
+
+ if (hhh != NULL)
+ *hhh = tmphhh;
+ else
+ refcount_release(&tmphhh->hhh_refcount);
+
+ return (0);
+}
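+
+/*
+ * Editor's illustrative sketch (not part of this change): how a subsystem
+ * might export a hook point with hhook_head_register() and later invoke any
+ * attached helper hooks with hhook_run_hooks(). The type/id values are
+ * hypothetical placeholders; helper hook functions are normally attached
+ * through the khelp(9) framework, which fills in struct hookinfo.
+ */
+#if 0
+#define	EXAMPLE_HHOOK_TYPE	1	/* hypothetical hook point type */
+#define	EXAMPLE_HHOOK_ID	1	/* hypothetical hook point id */
+
+static struct hhook_head *example_hhh;
+
+static void
+example_subsys_init(void)
+{
+
+	/* Export the hook point once, e.g. from a SYSINIT. */
+	(void)hhook_head_register(EXAMPLE_HHOOK_TYPE, EXAMPLE_HHOOK_ID,
+	    &example_hhh, HHOOK_WAITOK);
+}
+
+static void
+example_subsys_codepath(void *ctx_data, struct osd *hosd)
+{
+
+	/* Run any helper hook functions attached to the hook point. */
+	if (example_hhh != NULL)
+		hhook_run_hooks(example_hhh, ctx_data, hosd);
+}
+#endif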
+
+static void
+hhook_head_destroy(struct hhook_head *hhh)
+{
+ struct hhook *tmp, *tmp2;
+
+ HHHLIST_LOCK_ASSERT();
+ KASSERT(n_hhookheads > 0, ("n_hhookheads should be > 0"));
+
+ LIST_REMOVE(hhh, hhh_next);
+#ifdef VIMAGE
+ if (hhook_head_is_virtualised(hhh) == HHOOK_HEADISINVNET)
+ LIST_REMOVE(hhh, hhh_vnext);
+#endif
+ HHH_WLOCK(hhh);
+ STAILQ_FOREACH_SAFE(tmp, &hhh->hhh_hooks, hhk_next, tmp2)
+ free(tmp, M_HHOOK);
+ HHH_WUNLOCK(hhh);
+ HHH_LOCK_DESTROY(hhh);
+ free(hhh, M_HHOOK);
+ n_hhookheads--;
+}
+
+/*
+ * Remove a helper hook point.
+ */
+int
+hhook_head_deregister(struct hhook_head *hhh)
+{
+ int error;
+
+ error = 0;
+
+ HHHLIST_LOCK();
+ if (hhh == NULL)
+ error = ENOENT;
+ else if (hhh->hhh_refcount > 1)
+ error = EBUSY;
+ else
+ hhook_head_destroy(hhh);
+ HHHLIST_UNLOCK();
+
+ return (error);
+}
+
+/*
+ * Remove a helper hook point via a hhook_head lookup.
+ */
+int
+hhook_head_deregister_lookup(int32_t hhook_type, int32_t hhook_id)
+{
+ struct hhook_head *hhh;
+ int error;
+
+ hhh = hhook_head_get(hhook_type, hhook_id);
+ error = hhook_head_deregister(hhh);
+
+ if (error == EBUSY)
+ hhook_head_release(hhh);
+
+ return (error);
+}
+
+/*
+ * Lookup and return the hhook_head struct associated with the specified type
+ * and id, or NULL if not found. If found, the hhook_head's refcount is bumped.
+ */
+struct hhook_head *
+hhook_head_get(int32_t hhook_type, int32_t hhook_id)
+{
+ struct hhook_head *hhh;
+
+ HHHLIST_LOCK();
+ LIST_FOREACH(hhh, &hhook_head_list, hhh_next) {
+ if (hhh->hhh_type == hhook_type && hhh->hhh_id == hhook_id) {
+#ifdef VIMAGE
+ if (hhook_head_is_virtualised(hhh) ==
+ HHOOK_HEADISINVNET) {
+ KASSERT(curvnet != NULL, ("curvnet is NULL"));
+ if (hhh->hhh_vid != (uintptr_t)curvnet)
+ continue;
+ }
+#endif
+ refcount_acquire(&hhh->hhh_refcount);
+ break;
+ }
+ }
+ HHHLIST_UNLOCK();
+
+ return (hhh);
+}
+
+void
+hhook_head_release(struct hhook_head *hhh)
+{
+
+ refcount_release(&hhh->hhh_refcount);
+}
+
+/*
+ * Check the hhook_head private flags and return the appropriate public
+ * representation of the flag to the caller. The function is implemented in a
+ * way that allows us to cope with other subsystems becoming virtualised in the
+ * future.
+ */
+uint32_t
+hhook_head_is_virtualised(struct hhook_head *hhh)
+{
+ uint32_t ret;
+
+ ret = 0;
+
+ if (hhh != NULL) {
+ if (hhh->hhh_flags & HHH_ISINVNET)
+ ret = HHOOK_HEADISINVNET;
+ }
+
+ return (ret);
+}
+
+uint32_t
+hhook_head_is_virtualised_lookup(int32_t hook_type, int32_t hook_id)
+{
+ struct hhook_head *hhh;
+ uint32_t ret;
+
+ hhh = hhook_head_get(hook_type, hook_id);
+
+ if (hhh == NULL)
+ return (0);
+
+ ret = hhook_head_is_virtualised(hhh);
+ hhook_head_release(hhh);
+
+ return (ret);
+}
+
+/*
+ * Vnet created and being initialised.
+ */
+static void
+hhook_vnet_init(const void *unused __unused)
+{
+
+ LIST_INIT(&V_hhook_vhead_list);
+}
+
+/*
+ * Vnet being torn down and destroyed.
+ */
+static void
+hhook_vnet_uninit(const void *unused __unused)
+{
+ struct hhook_head *hhh, *tmphhh;
+
+ /*
+ * If subsystems which export helper hook points use the hhook KPI
+ * correctly, the loop below should have no work to do because the
+ * subsystem should have already called hhook_head_deregister().
+ */
+ HHHLIST_LOCK();
+ LIST_FOREACH_SAFE(hhh, &V_hhook_vhead_list, hhh_vnext, tmphhh) {
+ printf("%s: hhook_head type=%d, id=%d cleanup required\n",
+ __func__, hhh->hhh_type, hhh->hhh_id);
+ hhook_head_destroy(hhh);
+ }
+ HHHLIST_UNLOCK();
+}
+
+
+/*
+ * When a vnet is created and being initialised, init the V_hhook_vhead_list.
+ */
+VNET_SYSINIT(hhook_vnet_init, SI_SUB_MBUF, SI_ORDER_FIRST,
+ hhook_vnet_init, NULL);
+
+/*
+ * The hhook KPI provides a mechanism for subsystems which export helper hook
+ * points to clean up on vnet tear down, but in case the KPI is misused,
+ * provide a function to clean up and free memory for a vnet being destroyed.
+ */
+VNET_SYSUNINIT(hhook_vnet_uninit, SI_SUB_MBUF, SI_ORDER_ANY,
+ hhook_vnet_uninit, NULL);
diff --git a/sys/kern/kern_idle.c b/sys/kern/kern_idle.c
new file mode 100644
index 0000000..f412d17
--- /dev/null
+++ b/sys/kern/kern_idle.c
@@ -0,0 +1,86 @@
+/*-
+ * Copyright (C) 2000-2004 The FreeBSD Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/unistd.h>
+#ifdef SMP
+#include <sys/smp.h>
+#endif
+
+static void idle_setup(void *dummy);
+SYSINIT(idle_setup, SI_SUB_SCHED_IDLE, SI_ORDER_FIRST, idle_setup, NULL);
+
+/*
+ * Set up per-cpu idle process contexts. The APs shouldn't be running or
+ * accessing their idle processes at this point, so don't bother with
+ * locking.
+ */
+static void
+idle_setup(void *dummy)
+{
+#ifdef SMP
+ struct pcpu *pc;
+#endif
+ struct proc *p;
+ struct thread *td;
+ int error;
+
+ p = NULL; /* start with no idle process */
+#ifdef SMP
+ STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
+#endif
+#ifdef SMP
+ error = kproc_kthread_add(sched_idletd, NULL, &p, &td,
+ RFSTOPPED | RFHIGHPID, 0, "idle", "idle: cpu%d", pc->pc_cpuid);
+ pc->pc_idlethread = td;
+#else
+ error = kproc_kthread_add(sched_idletd, NULL, &p, &td,
+ RFSTOPPED | RFHIGHPID, 0, "idle", "idle");
+ PCPU_SET(idlethread, td);
+#endif
+ if (error)
+ panic("idle_setup: kproc_create error %d\n", error);
+
+ thread_lock(td);
+ TD_SET_CAN_RUN(td);
+ td->td_flags |= TDF_IDLETD | TDF_NOLOAD;
+ sched_class(td, PRI_IDLE);
+ sched_prio(td, PRI_MAX_IDLE);
+ thread_unlock(td);
+#ifdef SMP
+ }
+#endif
+}
diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c
new file mode 100644
index 0000000..f4b04c3
--- /dev/null
+++ b/sys/kern/kern_intr.c
@@ -0,0 +1,1943 @@
+/*-
+ * Copyright (c) 1997, Stefan Esser <se@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/cpuset.h>
+#include <sys/rtprio.h>
+#include <sys/systm.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/ktr.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/random.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/unistd.h>
+#include <sys/vmmeter.h>
+#include <machine/atomic.h>
+#include <machine/cpu.h>
+#include <machine/md_var.h>
+#include <machine/stdarg.h>
+#ifdef DDB
+#include <ddb/ddb.h>
+#include <ddb/db_sym.h>
+#endif
+
+/*
+ * Describe an interrupt thread. There is one of these per interrupt event.
+ */
+struct intr_thread {
+ struct intr_event *it_event;
+ struct thread *it_thread; /* Kernel thread. */
+ int it_flags; /* (j) IT_* flags. */
+ int it_need; /* Needs service. */
+};
+
+/* Interrupt thread flags kept in it_flags */
+#define IT_DEAD 0x000001 /* Thread is waiting to exit. */
+#define IT_WAIT 0x000002 /* Thread is waiting for completion. */
+
+struct intr_entropy {
+ struct thread *td;
+ uintptr_t event;
+};
+
+struct intr_event *clk_intr_event;
+struct intr_event *tty_intr_event;
+void *vm_ih;
+struct proc *intrproc;
+
+static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads");
+
+static int intr_storm_threshold = 1000;
+TUNABLE_INT("hw.intr_storm_threshold", &intr_storm_threshold);
+SYSCTL_INT(_hw, OID_AUTO, intr_storm_threshold, CTLFLAG_RW,
+ &intr_storm_threshold, 0,
+ "Number of consecutive interrupts before storm protection is enabled");
+static TAILQ_HEAD(, intr_event) event_list =
+ TAILQ_HEAD_INITIALIZER(event_list);
+static struct mtx event_lock;
+MTX_SYSINIT(intr_event_list, &event_lock, "intr event list", MTX_DEF);
+
+static void intr_event_update(struct intr_event *ie);
+#ifdef INTR_FILTER
+static int intr_event_schedule_thread(struct intr_event *ie,
+ struct intr_thread *ithd);
+static int intr_filter_loop(struct intr_event *ie,
+ struct trapframe *frame, struct intr_thread **ithd);
+static struct intr_thread *ithread_create(const char *name,
+ struct intr_handler *ih);
+#else
+static int intr_event_schedule_thread(struct intr_event *ie);
+static struct intr_thread *ithread_create(const char *name);
+#endif
+static void ithread_destroy(struct intr_thread *ithread);
+static void ithread_execute_handlers(struct proc *p,
+ struct intr_event *ie);
+#ifdef INTR_FILTER
+static void priv_ithread_execute_handler(struct proc *p,
+ struct intr_handler *ih);
+#endif
+static void ithread_loop(void *);
+static void ithread_update(struct intr_thread *ithd);
+static void start_softintr(void *);
+
+/* Map an interrupt type to an ithread priority. */
+u_char
+intr_priority(enum intr_type flags)
+{
+ u_char pri;
+
+ flags &= (INTR_TYPE_TTY | INTR_TYPE_BIO | INTR_TYPE_NET |
+ INTR_TYPE_CAM | INTR_TYPE_MISC | INTR_TYPE_CLK | INTR_TYPE_AV);
+ switch (flags) {
+ case INTR_TYPE_TTY:
+ pri = PI_TTY;
+ break;
+ case INTR_TYPE_BIO:
+ pri = PI_DISK;
+ break;
+ case INTR_TYPE_NET:
+ pri = PI_NET;
+ break;
+ case INTR_TYPE_CAM:
+ pri = PI_DISK;
+ break;
+ case INTR_TYPE_AV:
+ pri = PI_AV;
+ break;
+ case INTR_TYPE_CLK:
+ pri = PI_REALTIME;
+ break;
+ case INTR_TYPE_MISC:
+ pri = PI_DULL; /* don't care */
+ break;
+ default:
+ /* We didn't specify an interrupt level. */
+ panic("intr_priority: no interrupt type in flags");
+ }
+
+ return pri;
+}
+
+/*
+ * Update an ithread based on the associated intr_event.
+ */
+static void
+ithread_update(struct intr_thread *ithd)
+{
+ struct intr_event *ie;
+ struct thread *td;
+ u_char pri;
+
+ ie = ithd->it_event;
+ td = ithd->it_thread;
+
+ /* Determine the overall priority of this event. */
+ if (TAILQ_EMPTY(&ie->ie_handlers))
+ pri = PRI_MAX_ITHD;
+ else
+ pri = TAILQ_FIRST(&ie->ie_handlers)->ih_pri;
+
+ /* Update name and priority. */
+ strlcpy(td->td_name, ie->ie_fullname, sizeof(td->td_name));
+#ifdef KTR
+ sched_clear_tdname(td);
+#endif
+ thread_lock(td);
+ sched_prio(td, pri);
+ thread_unlock(td);
+}
+
+/*
+ * Regenerate the full name of an interrupt event and update its priority.
+ */
+static void
+intr_event_update(struct intr_event *ie)
+{
+ struct intr_handler *ih;
+ char *last;
+ int missed, space;
+
+ /* Start off with no entropy and just the name of the event. */
+ mtx_assert(&ie->ie_lock, MA_OWNED);
+ strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname));
+ ie->ie_flags &= ~IE_ENTROPY;
+ missed = 0;
+ space = 1;
+
+ /* Run through all the handlers updating values. */
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
+ if (strlen(ie->ie_fullname) + strlen(ih->ih_name) + 1 <
+ sizeof(ie->ie_fullname)) {
+ strcat(ie->ie_fullname, " ");
+ strcat(ie->ie_fullname, ih->ih_name);
+ space = 0;
+ } else
+ missed++;
+ if (ih->ih_flags & IH_ENTROPY)
+ ie->ie_flags |= IE_ENTROPY;
+ }
+
+ /*
+ * If the handler names were too long, add +'s to indicate missing
+ * names. If we run out of room and still have +'s to add, change
+ * the last character from a + to a *.
+ */
+ last = &ie->ie_fullname[sizeof(ie->ie_fullname) - 2];
+ while (missed-- > 0) {
+ if (strlen(ie->ie_fullname) + 1 == sizeof(ie->ie_fullname)) {
+ if (*last == '+') {
+ *last = '*';
+ break;
+ } else
+ *last = '+';
+ } else if (space) {
+ strcat(ie->ie_fullname, " +");
+ space = 0;
+ } else
+ strcat(ie->ie_fullname, "+");
+ }
+
+ /*
+ * If this event has an ithread, update its priority and
+ * name.
+ */
+ if (ie->ie_thread != NULL)
+ ithread_update(ie->ie_thread);
+ CTR2(KTR_INTR, "%s: updated %s", __func__, ie->ie_fullname);
+}
+
+int
+intr_event_create(struct intr_event **event, void *source, int flags, int irq,
+ void (*pre_ithread)(void *), void (*post_ithread)(void *),
+ void (*post_filter)(void *), int (*assign_cpu)(void *, u_char),
+ const char *fmt, ...)
+{
+ struct intr_event *ie;
+ va_list ap;
+
+ /* The only valid flag during creation is IE_SOFT. */
+ if ((flags & ~IE_SOFT) != 0)
+ return (EINVAL);
+ ie = malloc(sizeof(struct intr_event), M_ITHREAD, M_WAITOK | M_ZERO);
+ ie->ie_source = source;
+ ie->ie_pre_ithread = pre_ithread;
+ ie->ie_post_ithread = post_ithread;
+ ie->ie_post_filter = post_filter;
+ ie->ie_assign_cpu = assign_cpu;
+ ie->ie_flags = flags;
+ ie->ie_irq = irq;
+ ie->ie_cpu = NOCPU;
+ TAILQ_INIT(&ie->ie_handlers);
+ mtx_init(&ie->ie_lock, "intr event", NULL, MTX_DEF);
+
+ va_start(ap, fmt);
+ vsnprintf(ie->ie_name, sizeof(ie->ie_name), fmt, ap);
+ va_end(ap);
+ strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname));
+ mtx_lock(&event_lock);
+ TAILQ_INSERT_TAIL(&event_list, ie, ie_list);
+ mtx_unlock(&event_lock);
+ if (event != NULL)
+ *event = ie;
+ CTR2(KTR_INTR, "%s: created %s", __func__, ie->ie_name);
+ return (0);
+}
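+
+/*
+ * Editor's illustrative sketch (not part of this change): the typical
+ * pairing of intr_event_create() with intr_event_add_handler(). The
+ * device softc, handler body and IRQ number are hypothetical
+ * placeholders; MD interrupt code normally supplies the pre/post
+ * ithread, post_filter and assign_cpu callbacks passed as NULL here.
+ */
+#if 0
+static void
+example_intr(void *arg)
+{
+
+	/* Threaded interrupt work for the hypothetical device goes here. */
+}
+
+static int
+example_setup_irq(void *sc, int irq)
+{
+	struct intr_event *ie;
+	void *cookie;
+	int error;
+
+	error = intr_event_create(&ie, sc, 0, irq, NULL, NULL, NULL, NULL,
+	    "irq%d:", irq);
+	if (error != 0)
+		return (error);
+	/* No filter, so example_intr runs in the event's ithread. */
+	return (intr_event_add_handler(ie, "example", NULL, example_intr,
+	    sc, intr_priority(INTR_TYPE_MISC), INTR_MPSAFE, &cookie));
+}
+#endif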
+
+/*
+ * Bind an interrupt event to the specified CPU. Note that not all
+ * platforms support binding an interrupt to a CPU. For those
+ * platforms this request will fail. For supported platforms, any
+ * associated ithreads as well as the primary interrupt context will
+ * be bound to the specified CPU. Using a cpu id of NOCPU unbinds
+ * the interrupt event.
+ */
+int
+intr_event_bind(struct intr_event *ie, u_char cpu)
+{
+ cpuset_t mask;
+ lwpid_t id;
+ int error;
+
+ /* Need a CPU to bind to. */
+ if (cpu != NOCPU && CPU_ABSENT(cpu))
+ return (EINVAL);
+
+ if (ie->ie_assign_cpu == NULL)
+ return (EOPNOTSUPP);
+
+ error = priv_check(curthread, PRIV_SCHED_CPUSET_INTR);
+ if (error)
+ return (error);
+
+ /*
+ * If we have any ithreads try to set their mask first to verify
+ * permissions, etc.
+ */
+ mtx_lock(&ie->ie_lock);
+ if (ie->ie_thread != NULL) {
+ CPU_ZERO(&mask);
+ if (cpu == NOCPU)
+ CPU_COPY(cpuset_root, &mask);
+ else
+ CPU_SET(cpu, &mask);
+ id = ie->ie_thread->it_thread->td_tid;
+ mtx_unlock(&ie->ie_lock);
+ error = cpuset_setthread(id, &mask);
+ if (error)
+ return (error);
+ } else
+ mtx_unlock(&ie->ie_lock);
+ error = ie->ie_assign_cpu(ie->ie_source, cpu);
+ if (error) {
+ mtx_lock(&ie->ie_lock);
+ if (ie->ie_thread != NULL) {
+ CPU_ZERO(&mask);
+ if (ie->ie_cpu == NOCPU)
+ CPU_COPY(cpuset_root, &mask);
+ else
+ CPU_SET(ie->ie_cpu, &mask);
+ id = ie->ie_thread->it_thread->td_tid;
+ mtx_unlock(&ie->ie_lock);
+ (void)cpuset_setthread(id, &mask);
+ } else
+ mtx_unlock(&ie->ie_lock);
+ return (error);
+ }
+
+ mtx_lock(&ie->ie_lock);
+ ie->ie_cpu = cpu;
+ mtx_unlock(&ie->ie_lock);
+
+ return (error);
+}
+
+static struct intr_event *
+intr_lookup(int irq)
+{
+ struct intr_event *ie;
+
+ mtx_lock(&event_lock);
+ TAILQ_FOREACH(ie, &event_list, ie_list)
+ if (ie->ie_irq == irq &&
+ (ie->ie_flags & IE_SOFT) == 0 &&
+ TAILQ_FIRST(&ie->ie_handlers) != NULL)
+ break;
+ mtx_unlock(&event_lock);
+ return (ie);
+}
+
+int
+intr_setaffinity(int irq, void *m)
+{
+ struct intr_event *ie;
+ cpuset_t *mask;
+ u_char cpu;
+ int n;
+
+ mask = m;
+ cpu = NOCPU;
+ /*
+ * If we're setting all cpus we can unbind. Otherwise make sure
+ * only one cpu is in the set.
+ */
+ if (CPU_CMP(cpuset_root, mask)) {
+ for (n = 0; n < CPU_SETSIZE; n++) {
+ if (!CPU_ISSET(n, mask))
+ continue;
+ if (cpu != NOCPU)
+ return (EINVAL);
+ cpu = (u_char)n;
+ }
+ }
+ ie = intr_lookup(irq);
+ if (ie == NULL)
+ return (ESRCH);
+ return (intr_event_bind(ie, cpu));
+}
+
+int
+intr_getaffinity(int irq, void *m)
+{
+ struct intr_event *ie;
+ cpuset_t *mask;
+
+ mask = m;
+ ie = intr_lookup(irq);
+ if (ie == NULL)
+ return (ESRCH);
+ CPU_ZERO(mask);
+ mtx_lock(&ie->ie_lock);
+ if (ie->ie_cpu == NOCPU)
+ CPU_COPY(cpuset_root, mask);
+ else
+ CPU_SET(ie->ie_cpu, mask);
+ mtx_unlock(&ie->ie_lock);
+ return (0);
+}
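+
+/*
+ * Editor's illustrative sketch (not part of this change): pinning an
+ * IRQ to a single CPU through intr_setaffinity(). The IRQ and CPU
+ * numbers are placeholders supplied by the caller.
+ */
+#if 0
+static int
+example_pin_irq(int irq, int cpu)
+{
+	cpuset_t mask;
+
+	CPU_ZERO(&mask);
+	CPU_SET(cpu, &mask);	/* one CPU in the set binds; cpuset_root unbinds */
+	return (intr_setaffinity(irq, &mask));
+}
+#endif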
+
+int
+intr_event_destroy(struct intr_event *ie)
+{
+
+ mtx_lock(&event_lock);
+ mtx_lock(&ie->ie_lock);
+ if (!TAILQ_EMPTY(&ie->ie_handlers)) {
+ mtx_unlock(&ie->ie_lock);
+ mtx_unlock(&event_lock);
+ return (EBUSY);
+ }
+ TAILQ_REMOVE(&event_list, ie, ie_list);
+#ifndef notyet
+ if (ie->ie_thread != NULL) {
+ ithread_destroy(ie->ie_thread);
+ ie->ie_thread = NULL;
+ }
+#endif
+ mtx_unlock(&ie->ie_lock);
+ mtx_unlock(&event_lock);
+ mtx_destroy(&ie->ie_lock);
+ free(ie, M_ITHREAD);
+ return (0);
+}
+
+#ifndef INTR_FILTER
+static struct intr_thread *
+ithread_create(const char *name)
+{
+ struct intr_thread *ithd;
+ struct thread *td;
+ int error;
+
+ ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO);
+
+ error = kproc_kthread_add(ithread_loop, ithd, &intrproc,
+ &td, RFSTOPPED | RFHIGHPID,
+ 0, "intr", "%s", name);
+ if (error)
+ panic("kproc_create() failed with %d", error);
+ thread_lock(td);
+ sched_class(td, PRI_ITHD);
+ TD_SET_IWAIT(td);
+ thread_unlock(td);
+ td->td_pflags |= TDP_ITHREAD;
+ ithd->it_thread = td;
+ CTR2(KTR_INTR, "%s: created %s", __func__, name);
+ return (ithd);
+}
+#else
+static struct intr_thread *
+ithread_create(const char *name, struct intr_handler *ih)
+{
+ struct intr_thread *ithd;
+ struct thread *td;
+ int error;
+
+ ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO);
+
+ error = kproc_kthread_add(ithread_loop, ih, &intrproc,
+ &td, RFSTOPPED | RFHIGHPID,
+ 0, "intr", "%s", name);
+ if (error)
+ panic("kproc_create() failed with %d", error);
+ thread_lock(td);
+ sched_class(td, PRI_ITHD);
+ TD_SET_IWAIT(td);
+ thread_unlock(td);
+ td->td_pflags |= TDP_ITHREAD;
+ ithd->it_thread = td;
+ CTR2(KTR_INTR, "%s: created %s", __func__, name);
+ return (ithd);
+}
+#endif
+
+static void
+ithread_destroy(struct intr_thread *ithread)
+{
+ struct thread *td;
+
+ CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name);
+ td = ithread->it_thread;
+ thread_lock(td);
+ ithread->it_flags |= IT_DEAD;
+ if (TD_AWAITING_INTR(td)) {
+ TD_CLR_IWAIT(td);
+ sched_add(td, SRQ_INTR);
+ }
+ thread_unlock(td);
+}
+
+#ifndef INTR_FILTER
+int
+intr_event_add_handler(struct intr_event *ie, const char *name,
+ driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri,
+ enum intr_type flags, void **cookiep)
+{
+ struct intr_handler *ih, *temp_ih;
+ struct intr_thread *it;
+
+ if (ie == NULL || name == NULL || (handler == NULL && filter == NULL))
+ return (EINVAL);
+
+ /* Allocate and populate an interrupt handler structure. */
+ ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO);
+ ih->ih_filter = filter;
+ ih->ih_handler = handler;
+ ih->ih_argument = arg;
+ strlcpy(ih->ih_name, name, sizeof(ih->ih_name));
+ ih->ih_event = ie;
+ ih->ih_pri = pri;
+ if (flags & INTR_EXCL)
+ ih->ih_flags = IH_EXCLUSIVE;
+ if (flags & INTR_MPSAFE)
+ ih->ih_flags |= IH_MPSAFE;
+ if (flags & INTR_ENTROPY)
+ ih->ih_flags |= IH_ENTROPY;
+
+ /* We can only have one exclusive handler in an event. */
+ mtx_lock(&ie->ie_lock);
+ if (!TAILQ_EMPTY(&ie->ie_handlers)) {
+ if ((flags & INTR_EXCL) ||
+ (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) {
+ mtx_unlock(&ie->ie_lock);
+ free(ih, M_ITHREAD);
+ return (EINVAL);
+ }
+ }
+
+ /* Create a thread if we need one. */
+ while (ie->ie_thread == NULL && handler != NULL) {
+ if (ie->ie_flags & IE_ADDING_THREAD)
+ msleep(ie, &ie->ie_lock, 0, "ithread", 0);
+ else {
+ ie->ie_flags |= IE_ADDING_THREAD;
+ mtx_unlock(&ie->ie_lock);
+ it = ithread_create("intr: newborn");
+ mtx_lock(&ie->ie_lock);
+ ie->ie_flags &= ~IE_ADDING_THREAD;
+ ie->ie_thread = it;
+ it->it_event = ie;
+ ithread_update(it);
+ wakeup(ie);
+ }
+ }
+
+ /* Add the new handler to the event in priority order. */
+ TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) {
+ if (temp_ih->ih_pri > ih->ih_pri)
+ break;
+ }
+ if (temp_ih == NULL)
+ TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next);
+ else
+ TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next);
+ intr_event_update(ie);
+
+ CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name,
+ ie->ie_name);
+ mtx_unlock(&ie->ie_lock);
+
+ if (cookiep != NULL)
+ *cookiep = ih;
+ return (0);
+}
+#else
+int
+intr_event_add_handler(struct intr_event *ie, const char *name,
+ driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri,
+ enum intr_type flags, void **cookiep)
+{
+ struct intr_handler *ih, *temp_ih;
+ struct intr_thread *it;
+
+ if (ie == NULL || name == NULL || (handler == NULL && filter == NULL))
+ return (EINVAL);
+
+ /* Allocate and populate an interrupt handler structure. */
+ ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO);
+ ih->ih_filter = filter;
+ ih->ih_handler = handler;
+ ih->ih_argument = arg;
+ strlcpy(ih->ih_name, name, sizeof(ih->ih_name));
+ ih->ih_event = ie;
+ ih->ih_pri = pri;
+ if (flags & INTR_EXCL)
+ ih->ih_flags = IH_EXCLUSIVE;
+ if (flags & INTR_MPSAFE)
+ ih->ih_flags |= IH_MPSAFE;
+ if (flags & INTR_ENTROPY)
+ ih->ih_flags |= IH_ENTROPY;
+
+ /* We can only have one exclusive handler in an event. */
+ mtx_lock(&ie->ie_lock);
+ if (!TAILQ_EMPTY(&ie->ie_handlers)) {
+ if ((flags & INTR_EXCL) ||
+ (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) {
+ mtx_unlock(&ie->ie_lock);
+ free(ih, M_ITHREAD);
+ return (EINVAL);
+ }
+ }
+
+ /* For filtered handlers, create a private ithread to run on. */
+ if (filter != NULL && handler != NULL) {
+ mtx_unlock(&ie->ie_lock);
+ it = ithread_create("intr: newborn", ih);
+ mtx_lock(&ie->ie_lock);
+ it->it_event = ie;
+ ih->ih_thread = it;
+ ithread_update(it); /* XXX - do we really need this?!?!? */
+ } else { /* Create the global per-event thread if we need one. */
+ while (ie->ie_thread == NULL && handler != NULL) {
+ if (ie->ie_flags & IE_ADDING_THREAD)
+ msleep(ie, &ie->ie_lock, 0, "ithread", 0);
+ else {
+ ie->ie_flags |= IE_ADDING_THREAD;
+ mtx_unlock(&ie->ie_lock);
+ it = ithread_create("intr: newborn", ih);
+ mtx_lock(&ie->ie_lock);
+ ie->ie_flags &= ~IE_ADDING_THREAD;
+ ie->ie_thread = it;
+ it->it_event = ie;
+ ithread_update(it);
+ wakeup(ie);
+ }
+ }
+ }
+
+ /* Add the new handler to the event in priority order. */
+ TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) {
+ if (temp_ih->ih_pri > ih->ih_pri)
+ break;
+ }
+ if (temp_ih == NULL)
+ TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next);
+ else
+ TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next);
+ intr_event_update(ie);
+
+ CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name,
+ ie->ie_name);
+ mtx_unlock(&ie->ie_lock);
+
+ if (cookiep != NULL)
+ *cookiep = ih;
+ return (0);
+}
+#endif
+
+/*
+ * Append a description preceded by a ':' to the name of the specified
+ * interrupt handler.
+ */
+int
+intr_event_describe_handler(struct intr_event *ie, void *cookie,
+ const char *descr)
+{
+ struct intr_handler *ih;
+ size_t space;
+ char *start;
+
+ mtx_lock(&ie->ie_lock);
+#ifdef INVARIANTS
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
+ if (ih == cookie)
+ break;
+ }
+ if (ih == NULL) {
+ mtx_unlock(&ie->ie_lock);
+ panic("handler %p not found in interrupt event %p", cookie, ie);
+ }
+#endif
+ ih = cookie;
+
+ /*
+ * Look for an existing description by checking for an
+ * existing ":". This assumes device names do not include
+ * colons. If one is found, prepare to insert the new
+ * description at that point. If one is not found, find the
+ * end of the name to use as the insertion point.
+ */
+ start = strchr(ih->ih_name, ':');
+ if (start == NULL)
+ start = strchr(ih->ih_name, 0);
+
+ /*
+ * See if there is enough remaining room in the string for the
+ * description + ":". The "- 1" leaves room for the trailing
+ * '\0'. The "+ 1" accounts for the colon.
+ */
+ space = sizeof(ih->ih_name) - (start - ih->ih_name) - 1;
+ if (strlen(descr) + 1 > space) {
+ mtx_unlock(&ie->ie_lock);
+ return (ENOSPC);
+ }
+
+ /* Append a colon followed by the description. */
+ *start = ':';
+ strcpy(start + 1, descr);
+ intr_event_update(ie);
+ mtx_unlock(&ie->ie_lock);
+ return (0);
+}
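+
+/*
+ * Editor's illustrative sketch (not part of this change): appending a
+ * per-handler description. "ie" and "cookie" are assumed to come from
+ * earlier intr_event_create()/intr_event_add_handler() calls, and the
+ * handler/description names are hypothetical.
+ */
+#if 0
+static void
+example_describe(struct intr_event *ie, void *cookie)
+{
+
+	/* A handler added as "em0" would now show up as "em0:rx0". */
+	(void)intr_event_describe_handler(ie, cookie, "rx0");
+}
+#endif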
+
+/*
+ * Return the ie_source field from the intr_event an intr_handler is
+ * associated with.
+ */
+void *
+intr_handler_source(void *cookie)
+{
+ struct intr_handler *ih;
+ struct intr_event *ie;
+
+ ih = (struct intr_handler *)cookie;
+ if (ih == NULL)
+ return (NULL);
+ ie = ih->ih_event;
+ KASSERT(ie != NULL,
+ ("interrupt handler \"%s\" has a NULL interrupt event",
+ ih->ih_name));
+ return (ie->ie_source);
+}
+
+/*
+ * Sleep until an ithread finishes executing an interrupt handler.
+ *
+ * XXX Doesn't currently handle interrupt filters or fast interrupt
+ * handlers.  This is intended for compatibility with Linux drivers
+ * only.  Do not use in BSD code.
+ */
+void
+_intr_drain(int irq)
+{
+ struct intr_event *ie;
+ struct intr_thread *ithd;
+ struct thread *td;
+
+ ie = intr_lookup(irq);
+ if (ie == NULL)
+ return;
+ if (ie->ie_thread == NULL)
+ return;
+ ithd = ie->ie_thread;
+ td = ithd->it_thread;
+ /*
+ * We set the flag and wait for it to be cleared to avoid
+ * long delays with potentially busy interrupt handlers
+ * were we to only sample TD_AWAITING_INTR() every tick.
+ */
+ thread_lock(td);
+ if (!TD_AWAITING_INTR(td)) {
+ ithd->it_flags |= IT_WAIT;
+ while (ithd->it_flags & IT_WAIT) {
+ thread_unlock(td);
+ pause("idrain", 1);
+ thread_lock(td);
+ }
+ }
+ thread_unlock(td);
+ return;
+}
+
+#ifndef INTR_FILTER
+int
+intr_event_remove_handler(void *cookie)
+{
+ struct intr_handler *handler = (struct intr_handler *)cookie;
+ struct intr_event *ie;
+#ifdef INVARIANTS
+ struct intr_handler *ih;
+#endif
+#ifdef notyet
+ int dead;
+#endif
+
+ if (handler == NULL)
+ return (EINVAL);
+ ie = handler->ih_event;
+ KASSERT(ie != NULL,
+ ("interrupt handler \"%s\" has a NULL interrupt event",
+ handler->ih_name));
+ mtx_lock(&ie->ie_lock);
+ CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name,
+ ie->ie_name);
+#ifdef INVARIANTS
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next)
+ if (ih == handler)
+ goto ok;
+ mtx_unlock(&ie->ie_lock);
+ panic("interrupt handler \"%s\" not found in interrupt event \"%s\"",
+	    handler->ih_name, ie->ie_name);
+ok:
+#endif
+ /*
+ * If there is no ithread, then just remove the handler and return.
+ * XXX: Note that an INTR_FAST handler might be running on another
+ * CPU!
+ */
+ if (ie->ie_thread == NULL) {
+ TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
+ mtx_unlock(&ie->ie_lock);
+ free(handler, M_ITHREAD);
+ return (0);
+ }
+
+ /*
+ * If the interrupt thread is already running, then just mark this
+ * handler as being dead and let the ithread do the actual removal.
+ *
+ * During a cold boot while cold is set, msleep() does not sleep,
+ * so we have to remove the handler here rather than letting the
+ * thread do it.
+ */
+ thread_lock(ie->ie_thread->it_thread);
+ if (!TD_AWAITING_INTR(ie->ie_thread->it_thread) && !cold) {
+ handler->ih_flags |= IH_DEAD;
+
+ /*
+ * Ensure that the thread will process the handler list
+ * again and remove this handler if it has already passed
+ * it on the list.
+ */
+ atomic_store_rel_int(&ie->ie_thread->it_need, 1);
+ } else
+ TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
+ thread_unlock(ie->ie_thread->it_thread);
+ while (handler->ih_flags & IH_DEAD)
+ msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0);
+ intr_event_update(ie);
+#ifdef notyet
+ /*
+	 * XXX: This could be bad in the case of ppbus(4).  Also, I think
+ * this could lead to races of stale data when servicing an
+ * interrupt.
+ */
+ dead = 1;
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
+ if (!(ih->ih_flags & IH_FAST)) {
+ dead = 0;
+ break;
+ }
+ }
+ if (dead) {
+ ithread_destroy(ie->ie_thread);
+ ie->ie_thread = NULL;
+ }
+#endif
+ mtx_unlock(&ie->ie_lock);
+ free(handler, M_ITHREAD);
+ return (0);
+}
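+
+/*
+ * Usage sketch (hypothetical): the cookie handed out by
+ * intr_event_add_handler() is what a driver passes back down at detach
+ * time, normally via bus_teardown_intr(9), which ends up here.
+ *
+ *	static int
+ *	foo_detach(device_t dev)
+ *	{
+ *		struct foo_softc *sc = device_get_softc(dev);
+ *
+ *		// Blocks until any in-progress ithread pass over this
+ *		// handler has completed (the IH_DEAD dance above).
+ *		bus_teardown_intr(dev, sc->sc_irq_res, sc->sc_intrhand);
+ *		...
+ *	}
+ */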
+
+static int
+intr_event_schedule_thread(struct intr_event *ie)
+{
+ struct intr_entropy entropy;
+ struct intr_thread *it;
+ struct thread *td;
+ struct thread *ctd;
+ struct proc *p;
+
+ /*
+ * If no ithread or no handlers, then we have a stray interrupt.
+ */
+ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) ||
+ ie->ie_thread == NULL)
+ return (EINVAL);
+
+ ctd = curthread;
+ it = ie->ie_thread;
+ td = it->it_thread;
+ p = td->td_proc;
+
+ /*
+ * If any of the handlers for this ithread claim to be good
+ * sources of entropy, then gather some.
+ */
+ if (harvest.interrupt && ie->ie_flags & IE_ENTROPY) {
+ CTR3(KTR_INTR, "%s: pid %d (%s) gathering entropy", __func__,
+ p->p_pid, td->td_name);
+ entropy.event = (uintptr_t)ie;
+ entropy.td = ctd;
+ random_harvest(&entropy, sizeof(entropy), 2, 0,
+ RANDOM_INTERRUPT);
+ }
+
+ KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name));
+
+ /*
+ * Set it_need to tell the thread to keep running if it is already
+ * running. Then, lock the thread and see if we actually need to
+ * put it on the runqueue.
+ */
+ atomic_store_rel_int(&it->it_need, 1);
+ thread_lock(td);
+ if (TD_AWAITING_INTR(td)) {
+ CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
+ td->td_name);
+ TD_CLR_IWAIT(td);
+ sched_add(td, SRQ_INTR);
+ } else {
+ CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d",
+ __func__, p->p_pid, td->td_name, it->it_need, td->td_state);
+ }
+ thread_unlock(td);
+
+ return (0);
+}
+#else
+int
+intr_event_remove_handler(void *cookie)
+{
+ struct intr_handler *handler = (struct intr_handler *)cookie;
+ struct intr_event *ie;
+ struct intr_thread *it;
+#ifdef INVARIANTS
+ struct intr_handler *ih;
+#endif
+#ifdef notyet
+ int dead;
+#endif
+
+ if (handler == NULL)
+ return (EINVAL);
+ ie = handler->ih_event;
+ KASSERT(ie != NULL,
+ ("interrupt handler \"%s\" has a NULL interrupt event",
+ handler->ih_name));
+ mtx_lock(&ie->ie_lock);
+ CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name,
+ ie->ie_name);
+#ifdef INVARIANTS
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next)
+ if (ih == handler)
+ goto ok;
+ mtx_unlock(&ie->ie_lock);
+ panic("interrupt handler \"%s\" not found in interrupt event \"%s\"",
+	    handler->ih_name, ie->ie_name);
+ok:
+#endif
+ /*
+ * If there are no ithreads (per event and per handler), then
+ * just remove the handler and return.
+ * XXX: Note that an INTR_FAST handler might be running on another CPU!
+ */
+ if (ie->ie_thread == NULL && handler->ih_thread == NULL) {
+ TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
+ mtx_unlock(&ie->ie_lock);
+ free(handler, M_ITHREAD);
+ return (0);
+ }
+
+ /* Private or global ithread? */
+ it = (handler->ih_thread) ? handler->ih_thread : ie->ie_thread;
+ /*
+ * If the interrupt thread is already running, then just mark this
+ * handler as being dead and let the ithread do the actual removal.
+ *
+ * During a cold boot while cold is set, msleep() does not sleep,
+ * so we have to remove the handler here rather than letting the
+ * thread do it.
+ */
+ thread_lock(it->it_thread);
+ if (!TD_AWAITING_INTR(it->it_thread) && !cold) {
+ handler->ih_flags |= IH_DEAD;
+
+ /*
+ * Ensure that the thread will process the handler list
+ * again and remove this handler if it has already passed
+ * it on the list.
+ */
+ atomic_store_rel_int(&it->it_need, 1);
+ } else
+ TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
+ thread_unlock(it->it_thread);
+ while (handler->ih_flags & IH_DEAD)
+ msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0);
+ /*
+ * At this point, the handler has been disconnected from the event,
+ * so we can kill the private ithread if any.
+ */
+ if (handler->ih_thread) {
+ ithread_destroy(handler->ih_thread);
+ handler->ih_thread = NULL;
+ }
+ intr_event_update(ie);
+#ifdef notyet
+ /*
+	 * XXX: This could be bad in the case of ppbus(4).  Also, I think
+ * this could lead to races of stale data when servicing an
+ * interrupt.
+ */
+ dead = 1;
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
+		if (ih->ih_handler != NULL) {
+ dead = 0;
+ break;
+ }
+ }
+ if (dead) {
+ ithread_destroy(ie->ie_thread);
+ ie->ie_thread = NULL;
+ }
+#endif
+ mtx_unlock(&ie->ie_lock);
+ free(handler, M_ITHREAD);
+ return (0);
+}
+
+static int
+intr_event_schedule_thread(struct intr_event *ie, struct intr_thread *it)
+{
+ struct intr_entropy entropy;
+ struct thread *td;
+ struct thread *ctd;
+ struct proc *p;
+
+ /*
+ * If no ithread or no handlers, then we have a stray interrupt.
+ */
+ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) || it == NULL)
+ return (EINVAL);
+
+ ctd = curthread;
+ td = it->it_thread;
+ p = td->td_proc;
+
+ /*
+ * If any of the handlers for this ithread claim to be good
+ * sources of entropy, then gather some.
+ */
+ if (harvest.interrupt && ie->ie_flags & IE_ENTROPY) {
+ CTR3(KTR_INTR, "%s: pid %d (%s) gathering entropy", __func__,
+ p->p_pid, td->td_name);
+ entropy.event = (uintptr_t)ie;
+ entropy.td = ctd;
+ random_harvest(&entropy, sizeof(entropy), 2, 0,
+ RANDOM_INTERRUPT);
+ }
+
+ KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name));
+
+ /*
+ * Set it_need to tell the thread to keep running if it is already
+ * running. Then, lock the thread and see if we actually need to
+ * put it on the runqueue.
+ */
+ atomic_store_rel_int(&it->it_need, 1);
+ thread_lock(td);
+ if (TD_AWAITING_INTR(td)) {
+ CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
+ td->td_name);
+ TD_CLR_IWAIT(td);
+ sched_add(td, SRQ_INTR);
+ } else {
+ CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d",
+ __func__, p->p_pid, td->td_name, it->it_need, td->td_state);
+ }
+ thread_unlock(td);
+
+ return (0);
+}
+#endif
+
+/*
+ * Allow interrupt event binding for software interrupt handlers -- a no-op,
+ * since interrupts are generated in software rather than being directed by
+ * a PIC.
+ */
+static int
+swi_assign_cpu(void *arg, u_char cpu)
+{
+
+ return (0);
+}
+
+/*
+ * Add a software interrupt handler to a specified event. If a given event
+ * is not specified, then a new event is created.
+ */
+int
+swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler,
+ void *arg, int pri, enum intr_type flags, void **cookiep)
+{
+ struct intr_event *ie;
+ int error;
+
+ if (flags & INTR_ENTROPY)
+ return (EINVAL);
+
+ ie = (eventp != NULL) ? *eventp : NULL;
+
+ if (ie != NULL) {
+ if (!(ie->ie_flags & IE_SOFT))
+ return (EINVAL);
+ } else {
+ error = intr_event_create(&ie, NULL, IE_SOFT, 0,
+ NULL, NULL, NULL, swi_assign_cpu, "swi%d:", pri);
+ if (error)
+ return (error);
+ if (eventp != NULL)
+ *eventp = ie;
+ }
+ error = intr_event_add_handler(ie, name, NULL, handler, arg,
+ PI_SWI(pri), flags, cookiep);
+ return (error);
+}
+
+/*
+ * Schedule a software interrupt thread.
+ */
+void
+swi_sched(void *cookie, int flags)
+{
+ struct intr_handler *ih = (struct intr_handler *)cookie;
+ struct intr_event *ie = ih->ih_event;
+ struct intr_entropy entropy;
+ int error;
+
+ CTR3(KTR_INTR, "swi_sched: %s %s need=%d", ie->ie_name, ih->ih_name,
+ ih->ih_need);
+
+ if (harvest.swi) {
+ CTR2(KTR_INTR, "swi_sched: pid %d (%s) gathering entropy",
+ curproc->p_pid, curthread->td_name);
+ entropy.event = (uintptr_t)ih;
+ entropy.td = curthread;
+ random_harvest(&entropy, sizeof(entropy), 1, 0,
+ RANDOM_SWI);
+ }
+
+ /*
+ * Set ih_need for this handler so that if the ithread is already
+ * running it will execute this handler on the next pass. Otherwise,
+ * it will execute it the next time it runs.
+ */
+ atomic_store_rel_int(&ih->ih_need, 1);
+
+ if (!(flags & SWI_DELAY)) {
+ PCPU_INC(cnt.v_soft);
+#ifdef INTR_FILTER
+ error = intr_event_schedule_thread(ie, ie->ie_thread);
+#else
+ error = intr_event_schedule_thread(ie);
+#endif
+ KASSERT(error == 0, ("stray software interrupt"));
+ }
+}
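+
+/*
+ * Usage sketch (hypothetical subsystem): a typical consumer registers a
+ * software interrupt handler once at initialization time and then schedules
+ * it from interrupt or driver context.  The names below are invented for
+ * the example.
+ *
+ *	static struct intr_event *foo_swi_ie;
+ *	static void *foo_swi_cookie;
+ *
+ *	static void
+ *	foo_swi_handler(void *arg)
+ *	{
+ *		// Runs in the swi ithread at PI_SWI(SWI_CLOCK) priority.
+ *	}
+ *
+ *	// At initialization:
+ *	swi_add(&foo_swi_ie, "foo", foo_swi_handler, NULL, SWI_CLOCK,
+ *	    INTR_MPSAFE, &foo_swi_cookie);
+ *
+ *	// Later, from a context that must defer work:
+ *	swi_sched(foo_swi_cookie, 0);
+ */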
+
+/*
+ * Remove a software interrupt handler. Currently this code does not
+ * remove the associated interrupt event if it becomes empty. Calling code
+ * may do so manually via intr_event_destroy(), but that's not really
+ * an optimal interface.
+ */
+int
+swi_remove(void *cookie)
+{
+
+ return (intr_event_remove_handler(cookie));
+}
+
+#ifdef INTR_FILTER
+static void
+priv_ithread_execute_handler(struct proc *p, struct intr_handler *ih)
+{
+ struct intr_event *ie;
+
+ ie = ih->ih_event;
+ /*
+ * If this handler is marked for death, remove it from
+ * the list of handlers and wake up the sleeper.
+ */
+ if (ih->ih_flags & IH_DEAD) {
+ mtx_lock(&ie->ie_lock);
+ TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next);
+ ih->ih_flags &= ~IH_DEAD;
+ wakeup(ih);
+ mtx_unlock(&ie->ie_lock);
+ return;
+ }
+
+ /* Execute this handler. */
+ CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x",
+ __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument,
+ ih->ih_name, ih->ih_flags);
+
+ if (!(ih->ih_flags & IH_MPSAFE))
+ mtx_lock(&Giant);
+ ih->ih_handler(ih->ih_argument);
+ if (!(ih->ih_flags & IH_MPSAFE))
+ mtx_unlock(&Giant);
+}
+#endif
+
+/*
+ * This is a public function for use by drivers that mux interrupt
+ * handlers for child devices from their interrupt handler.
+ */
+void
+intr_event_execute_handlers(struct proc *p, struct intr_event *ie)
+{
+ struct intr_handler *ih, *ihn;
+
+ TAILQ_FOREACH_SAFE(ih, &ie->ie_handlers, ih_next, ihn) {
+ /*
+ * If this handler is marked for death, remove it from
+ * the list of handlers and wake up the sleeper.
+ */
+ if (ih->ih_flags & IH_DEAD) {
+ mtx_lock(&ie->ie_lock);
+ TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next);
+ ih->ih_flags &= ~IH_DEAD;
+ wakeup(ih);
+ mtx_unlock(&ie->ie_lock);
+ continue;
+ }
+
+ /* Skip filter only handlers */
+ if (ih->ih_handler == NULL)
+ continue;
+
+ /*
+ * For software interrupt threads, we only execute
+ * handlers that have their need flag set. Hardware
+ * interrupt threads always invoke all of their handlers.
+ */
+ if (ie->ie_flags & IE_SOFT) {
+ if (atomic_load_acq_int(&ih->ih_need) == 0)
+ continue;
+ else
+ atomic_store_rel_int(&ih->ih_need, 0);
+ }
+
+ /* Execute this handler. */
+ CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x",
+ __func__, p->p_pid, (void *)ih->ih_handler,
+ ih->ih_argument, ih->ih_name, ih->ih_flags);
+
+ if (!(ih->ih_flags & IH_MPSAFE))
+ mtx_lock(&Giant);
+ ih->ih_handler(ih->ih_argument);
+ if (!(ih->ih_flags & IH_MPSAFE))
+ mtx_unlock(&Giant);
+ }
+}
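+
+/*
+ * Usage sketch (hypothetical bus driver): a parent device that
+ * demultiplexes one hardware interrupt for several children can keep its
+ * own intr_event, add the children's handlers to it, and run them from its
+ * real interrupt handler via this function.  Names are illustrative only.
+ *
+ *	static void
+ *	mux_intr(void *arg)
+ *	{
+ *		struct mux_softc *sc = arg;
+ *
+ *		if (MUX_READ_STATUS(sc) == 0)
+ *			return;
+ *		intr_event_execute_handlers(curproc, sc->sc_child_ie);
+ *		MUX_ACK(sc);
+ *	}
+ */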
+
+static void
+ithread_execute_handlers(struct proc *p, struct intr_event *ie)
+{
+
+ /* Interrupt handlers should not sleep. */
+ if (!(ie->ie_flags & IE_SOFT))
+ THREAD_NO_SLEEPING();
+ intr_event_execute_handlers(p, ie);
+ if (!(ie->ie_flags & IE_SOFT))
+ THREAD_SLEEPING_OK();
+
+ /*
+ * Interrupt storm handling:
+ *
+ * If this interrupt source is currently storming, then throttle
+ * it to only fire the handler once per clock tick.
+ *
+ * If this interrupt source is not currently storming, but the
+ * number of back to back interrupts exceeds the storm threshold,
+ * then enter storming mode.
+ */
+ if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold &&
+ !(ie->ie_flags & IE_SOFT)) {
+ /* Report the message only once every second. */
+ if (ppsratecheck(&ie->ie_warntm, &ie->ie_warncnt, 1)) {
+ printf(
+ "interrupt storm detected on \"%s\"; throttling interrupt source\n",
+ ie->ie_name);
+ }
+ pause("istorm", 1);
+ } else
+ ie->ie_count++;
+
+ /*
+ * Now that all the handlers have had a chance to run, reenable
+ * the interrupt source.
+ */
+ if (ie->ie_post_ithread != NULL)
+ ie->ie_post_ithread(ie->ie_source);
+}
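+
+/*
+ * Tuning sketch: the threshold compared against ie_count above is exposed
+ * as a writable sysctl (hw.intr_storm_threshold in stock FreeBSD, declared
+ * earlier in this file); setting it to 0 disables storm protection.  A
+ * userland consumer could adjust it like this (illustrative only):
+ *
+ *	#include <sys/types.h>
+ *	#include <sys/sysctl.h>
+ *
+ *	int threshold = 0;	// 0 == disable storm throttling
+ *	sysctlbyname("hw.intr_storm_threshold", NULL, NULL,
+ *	    &threshold, sizeof(threshold));
+ */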
+
+#ifndef INTR_FILTER
+/*
+ * This is the main code for interrupt threads.
+ */
+static void
+ithread_loop(void *arg)
+{
+ struct intr_thread *ithd;
+ struct intr_event *ie;
+ struct thread *td;
+ struct proc *p;
+ int wake;
+
+ td = curthread;
+ p = td->td_proc;
+ ithd = (struct intr_thread *)arg;
+ KASSERT(ithd->it_thread == td,
+ ("%s: ithread and proc linkage out of sync", __func__));
+ ie = ithd->it_event;
+ ie->ie_count = 0;
+ wake = 0;
+
+ /*
+ * As long as we have interrupts outstanding, go through the
+ * list of handlers, giving each one a go at it.
+ */
+ for (;;) {
+ /*
+ * If we are an orphaned thread, then just die.
+ */
+ if (ithd->it_flags & IT_DEAD) {
+ CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__,
+ p->p_pid, td->td_name);
+ free(ithd, M_ITHREAD);
+ kthread_exit();
+ }
+
+ /*
+ * Service interrupts. If another interrupt arrives while
+ * we are running, it will set it_need to note that we
+ * should make another pass.
+ */
+ while (atomic_load_acq_int(&ithd->it_need) != 0) {
+ /*
+ * This might need a full read and write barrier
+ * to make sure that this write posts before any
+ * of the memory or device accesses in the
+ * handlers.
+ */
+ atomic_store_rel_int(&ithd->it_need, 0);
+ ithread_execute_handlers(p, ie);
+ }
+ WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread");
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ /*
+ * Processed all our interrupts. Now get the sched
+ * lock. This may take a while and it_need may get
+ * set again, so we have to check it again.
+ */
+ thread_lock(td);
+ if ((atomic_load_acq_int(&ithd->it_need) == 0) &&
+ !(ithd->it_flags & (IT_DEAD | IT_WAIT))) {
+ TD_SET_IWAIT(td);
+ ie->ie_count = 0;
+ mi_switch(SW_VOL | SWT_IWAIT, NULL);
+ }
+ if (ithd->it_flags & IT_WAIT) {
+ wake = 1;
+ ithd->it_flags &= ~IT_WAIT;
+ }
+ thread_unlock(td);
+ if (wake) {
+ wakeup(ithd);
+ wake = 0;
+ }
+ }
+}
+
+/*
+ * Main interrupt handling body.
+ *
+ * Input:
+ * o ie: the event connected to this interrupt.
+ * o frame: some archs (e.g. i386) pass a frame to some
+ *          handlers as their main argument.
+ * Return value:
+ * o 0: everything ok.
+ * o EINVAL: stray interrupt.
+ */
+int
+intr_event_handle(struct intr_event *ie, struct trapframe *frame)
+{
+ struct intr_handler *ih;
+ struct trapframe *oldframe;
+ struct thread *td;
+ int error, ret, thread;
+
+ td = curthread;
+
+ /* An interrupt with no event or handlers is a stray interrupt. */
+ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers))
+ return (EINVAL);
+
+ /*
+ * Execute fast interrupt handlers directly.
+ * To support clock handlers, if a handler registers
+ * with a NULL argument, then we pass it a pointer to
+ * a trapframe as its argument.
+ */
+ td->td_intr_nesting_level++;
+ thread = 0;
+ ret = 0;
+ critical_enter();
+ oldframe = td->td_intr_frame;
+ td->td_intr_frame = frame;
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
+ if (ih->ih_filter == NULL) {
+ thread = 1;
+ continue;
+ }
+ CTR4(KTR_INTR, "%s: exec %p(%p) for %s", __func__,
+ ih->ih_filter, ih->ih_argument == NULL ? frame :
+ ih->ih_argument, ih->ih_name);
+ if (ih->ih_argument == NULL)
+ ret = ih->ih_filter(frame);
+ else
+ ret = ih->ih_filter(ih->ih_argument);
+ KASSERT(ret == FILTER_STRAY ||
+ ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 &&
+ (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0),
+ ("%s: incorrect return value %#x from %s", __func__, ret,
+ ih->ih_name));
+
+ /*
+ * Wrapper handler special handling:
+ *
+ * in some particular cases (like pccard and pccbb),
+ * the _real_ device handler is wrapped in a couple of
+ * functions - a filter wrapper and an ithread wrapper.
+ * In this case (and just in this case), the filter wrapper
+ * could ask the system to schedule the ithread and mask
+ * the interrupt source if the wrapped handler is composed
+ * of just an ithread handler.
+ *
+ * TODO: write a generic wrapper to avoid people rolling
+ * their own
+ */
+ if (!thread) {
+ if (ret == FILTER_SCHEDULE_THREAD)
+ thread = 1;
+ }
+ }
+ td->td_intr_frame = oldframe;
+
+ if (thread) {
+ if (ie->ie_pre_ithread != NULL)
+ ie->ie_pre_ithread(ie->ie_source);
+ } else {
+ if (ie->ie_post_filter != NULL)
+ ie->ie_post_filter(ie->ie_source);
+ }
+
+ /* Schedule the ithread if needed. */
+ if (thread) {
+ error = intr_event_schedule_thread(ie);
+#ifndef XEN
+ KASSERT(error == 0, ("bad stray interrupt"));
+#else
+ if (error != 0)
+ log(LOG_WARNING, "bad stray interrupt");
+#endif
+ }
+ critical_exit();
+ td->td_intr_nesting_level--;
+ return (0);
+}
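+
+/*
+ * Usage sketch (machine-dependent side, hypothetical): the MD interrupt
+ * dispatch code typically maps the vector to its interrupt-source
+ * structure and hands the associated event plus trapframe to this
+ * function, falling back to stray-interrupt accounting when it returns
+ * EINVAL.
+ *
+ *	void
+ *	md_handle_intr(u_int vector, struct trapframe *frame)
+ *	{
+ *		struct md_intsrc *isrc = md_intr_lookup(vector);
+ *
+ *		if (isrc == NULL ||
+ *		    intr_event_handle(isrc->is_event, frame) != 0)
+ *			md_stray_interrupt(vector);
+ *	}
+ */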
+#else
+/*
+ * This is the main code for interrupt threads.
+ */
+static void
+ithread_loop(void *arg)
+{
+ struct intr_thread *ithd;
+ struct intr_handler *ih;
+ struct intr_event *ie;
+ struct thread *td;
+ struct proc *p;
+ int priv;
+ int wake;
+
+ td = curthread;
+ p = td->td_proc;
+ ih = (struct intr_handler *)arg;
+ priv = (ih->ih_thread != NULL) ? 1 : 0;
+ ithd = (priv) ? ih->ih_thread : ih->ih_event->ie_thread;
+ KASSERT(ithd->it_thread == td,
+ ("%s: ithread and proc linkage out of sync", __func__));
+ ie = ithd->it_event;
+ ie->ie_count = 0;
+ wake = 0;
+
+ /*
+ * As long as we have interrupts outstanding, go through the
+ * list of handlers, giving each one a go at it.
+ */
+ for (;;) {
+ /*
+ * If we are an orphaned thread, then just die.
+ */
+ if (ithd->it_flags & IT_DEAD) {
+ CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__,
+ p->p_pid, td->td_name);
+ free(ithd, M_ITHREAD);
+ kthread_exit();
+ }
+
+ /*
+ * Service interrupts. If another interrupt arrives while
+ * we are running, it will set it_need to note that we
+ * should make another pass.
+ */
+ while (atomic_load_acq_int(&ithd->it_need) != 0) {
+ /*
+ * This might need a full read and write barrier
+ * to make sure that this write posts before any
+ * of the memory or device accesses in the
+ * handlers.
+ */
+ atomic_store_rel_int(&ithd->it_need, 0);
+ if (priv)
+ priv_ithread_execute_handler(p, ih);
+ else
+ ithread_execute_handlers(p, ie);
+ }
+ WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread");
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ /*
+ * Processed all our interrupts. Now get the sched
+ * lock. This may take a while and it_need may get
+ * set again, so we have to check it again.
+ */
+ thread_lock(td);
+ if ((atomic_load_acq_int(&ithd->it_need) == 0) &&
+ !(ithd->it_flags & (IT_DEAD | IT_WAIT))) {
+ TD_SET_IWAIT(td);
+ ie->ie_count = 0;
+ mi_switch(SW_VOL | SWT_IWAIT, NULL);
+ }
+ if (ithd->it_flags & IT_WAIT) {
+ wake = 1;
+ ithd->it_flags &= ~IT_WAIT;
+ }
+ thread_unlock(td);
+ if (wake) {
+ wakeup(ithd);
+ wake = 0;
+ }
+ }
+}
+
+/*
+ * Main loop for interrupt filter.
+ *
+ * Some architectures (i386, amd64 and arm) require the optional frame
+ * parameter, and use it as the main argument for fast handler execution
+ * when ih_argument == NULL.
+ *
+ * Return value:
+ * o FILTER_STRAY: No filter recognized the event, and no
+ * filter-less handler is registered on this
+ * line.
+ * o FILTER_HANDLED: A filter claimed the event and served it.
+ * o FILTER_SCHEDULE_THREAD: No filter claimed the event, but there's at
+ * least one filter-less handler on this line.
+ * o FILTER_HANDLED |
+ * FILTER_SCHEDULE_THREAD: A filter claimed the event, and asked for
+ * scheduling the per-handler ithread.
+ *
+ * In case an ithread has to be scheduled, in *ithd there will be a
+ * pointer to a struct intr_thread containing the thread to be
+ * scheduled.
+ */
+
+static int
+intr_filter_loop(struct intr_event *ie, struct trapframe *frame,
+ struct intr_thread **ithd)
+{
+ struct intr_handler *ih;
+ void *arg;
+ int ret, thread_only;
+
+ ret = 0;
+ thread_only = 0;
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
+ /*
+ * Execute fast interrupt handlers directly.
+ * To support clock handlers, if a handler registers
+ * with a NULL argument, then we pass it a pointer to
+ * a trapframe as its argument.
+ */
+ arg = ((ih->ih_argument == NULL) ? frame : ih->ih_argument);
+
+ CTR5(KTR_INTR, "%s: exec %p/%p(%p) for %s", __func__,
+ ih->ih_filter, ih->ih_handler, arg, ih->ih_name);
+
+ if (ih->ih_filter != NULL)
+ ret = ih->ih_filter(arg);
+ else {
+ thread_only = 1;
+ continue;
+ }
+ KASSERT(ret == FILTER_STRAY ||
+ ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 &&
+ (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0),
+ ("%s: incorrect return value %#x from %s", __func__, ret,
+ ih->ih_name));
+ if (ret & FILTER_STRAY)
+ continue;
+ else {
+ *ithd = ih->ih_thread;
+ return (ret);
+ }
+ }
+
+ /*
+ * No filters handled the interrupt and we have at least
+ * one handler without a filter. In this case, we schedule
+ * all of the filter-less handlers to run in the ithread.
+ */
+ if (thread_only) {
+ *ithd = ie->ie_thread;
+ return (FILTER_SCHEDULE_THREAD);
+ }
+ return (FILTER_STRAY);
+}
+
+/*
+ * Main interrupt handling body.
+ *
+ * Input:
+ * o ie: the event connected to this interrupt.
+ * o frame: some archs (e.g. i386) pass a frame to some
+ *          handlers as their main argument.
+ * Return value:
+ * o 0: everything ok.
+ * o EINVAL: stray interrupt.
+ */
+int
+intr_event_handle(struct intr_event *ie, struct trapframe *frame)
+{
+ struct intr_thread *ithd;
+ struct trapframe *oldframe;
+ struct thread *td;
+ int thread;
+
+ ithd = NULL;
+ td = curthread;
+
+ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers))
+ return (EINVAL);
+
+ td->td_intr_nesting_level++;
+ thread = 0;
+ critical_enter();
+ oldframe = td->td_intr_frame;
+ td->td_intr_frame = frame;
+ thread = intr_filter_loop(ie, frame, &ithd);
+ if (thread & FILTER_HANDLED) {
+ if (ie->ie_post_filter != NULL)
+ ie->ie_post_filter(ie->ie_source);
+ } else {
+ if (ie->ie_pre_ithread != NULL)
+ ie->ie_pre_ithread(ie->ie_source);
+ }
+ td->td_intr_frame = oldframe;
+ critical_exit();
+
+ /* Interrupt storm logic */
+ if (thread & FILTER_STRAY) {
+ ie->ie_count++;
+ if (ie->ie_count < intr_storm_threshold)
+ printf("Interrupt stray detection not present\n");
+ }
+
+ /* Schedule an ithread if needed. */
+ if (thread & FILTER_SCHEDULE_THREAD) {
+ if (intr_event_schedule_thread(ie, ithd) != 0)
+ panic("%s: impossible stray interrupt", __func__);
+ }
+ td->td_intr_nesting_level--;
+ return (0);
+}
+#endif
+
+#ifdef DDB
+/*
+ * Dump details about an interrupt handler
+ */
+static void
+db_dump_intrhand(struct intr_handler *ih)
+{
+ int comma;
+
+ db_printf("\t%-10s ", ih->ih_name);
+ switch (ih->ih_pri) {
+ case PI_REALTIME:
+ db_printf("CLK ");
+ break;
+ case PI_AV:
+ db_printf("AV ");
+ break;
+ case PI_TTY:
+ db_printf("TTY ");
+ break;
+ case PI_NET:
+ db_printf("NET ");
+ break;
+ case PI_DISK:
+ db_printf("DISK");
+ break;
+ case PI_DULL:
+ db_printf("DULL");
+ break;
+ default:
+ if (ih->ih_pri >= PI_SOFT)
+ db_printf("SWI ");
+ else
+ db_printf("%4u", ih->ih_pri);
+ break;
+ }
+ db_printf(" ");
+ if (ih->ih_filter != NULL) {
+ db_printf("[F]");
+ db_printsym((uintptr_t)ih->ih_filter, DB_STGY_PROC);
+ }
+ if (ih->ih_handler != NULL) {
+ if (ih->ih_filter != NULL)
+ db_printf(",");
+ db_printf("[H]");
+ db_printsym((uintptr_t)ih->ih_handler, DB_STGY_PROC);
+ }
+ db_printf("(%p)", ih->ih_argument);
+ if (ih->ih_need ||
+ (ih->ih_flags & (IH_EXCLUSIVE | IH_ENTROPY | IH_DEAD |
+ IH_MPSAFE)) != 0) {
+ db_printf(" {");
+ comma = 0;
+ if (ih->ih_flags & IH_EXCLUSIVE) {
+ if (comma)
+ db_printf(", ");
+ db_printf("EXCL");
+ comma = 1;
+ }
+ if (ih->ih_flags & IH_ENTROPY) {
+ if (comma)
+ db_printf(", ");
+ db_printf("ENTROPY");
+ comma = 1;
+ }
+ if (ih->ih_flags & IH_DEAD) {
+ if (comma)
+ db_printf(", ");
+ db_printf("DEAD");
+ comma = 1;
+ }
+ if (ih->ih_flags & IH_MPSAFE) {
+ if (comma)
+ db_printf(", ");
+ db_printf("MPSAFE");
+ comma = 1;
+ }
+ if (ih->ih_need) {
+ if (comma)
+ db_printf(", ");
+ db_printf("NEED");
+ }
+ db_printf("}");
+ }
+ db_printf("\n");
+}
+
+/*
+ * Dump details about an interrupt event.
+ */
+void
+db_dump_intr_event(struct intr_event *ie, int handlers)
+{
+ struct intr_handler *ih;
+ struct intr_thread *it;
+ int comma;
+
+ db_printf("%s ", ie->ie_fullname);
+ it = ie->ie_thread;
+ if (it != NULL)
+ db_printf("(pid %d)", it->it_thread->td_proc->p_pid);
+ else
+ db_printf("(no thread)");
+ if ((ie->ie_flags & (IE_SOFT | IE_ENTROPY | IE_ADDING_THREAD)) != 0 ||
+ (it != NULL && it->it_need)) {
+ db_printf(" {");
+ comma = 0;
+ if (ie->ie_flags & IE_SOFT) {
+ db_printf("SOFT");
+ comma = 1;
+ }
+ if (ie->ie_flags & IE_ENTROPY) {
+ if (comma)
+ db_printf(", ");
+ db_printf("ENTROPY");
+ comma = 1;
+ }
+ if (ie->ie_flags & IE_ADDING_THREAD) {
+ if (comma)
+ db_printf(", ");
+ db_printf("ADDING_THREAD");
+ comma = 1;
+ }
+ if (it != NULL && it->it_need) {
+ if (comma)
+ db_printf(", ");
+ db_printf("NEED");
+ }
+ db_printf("}");
+ }
+ db_printf("\n");
+
+ if (handlers)
+ TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next)
+ db_dump_intrhand(ih);
+}
+
+/*
+ * Dump data about interrupt handlers
+ */
+DB_SHOW_COMMAND(intr, db_show_intr)
+{
+ struct intr_event *ie;
+ int all, verbose;
+
+ verbose = strchr(modif, 'v') != NULL;
+ all = strchr(modif, 'a') != NULL;
+ TAILQ_FOREACH(ie, &event_list, ie_list) {
+ if (!all && TAILQ_EMPTY(&ie->ie_handlers))
+ continue;
+ db_dump_intr_event(ie, verbose);
+ if (db_pager_quit)
+ break;
+ }
+}
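+
+/*
+ * Usage sketch: from the in-kernel debugger the command registered above
+ * is invoked as "show intr"; the modifiers parsed from "modif" select
+ * all-events and verbose output, e.g.:
+ *
+ *	db> show intr		(events with handlers only)
+ *	db> show intr/av	(all events, including handler details)
+ */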
+#endif /* DDB */
+
+/*
+ * Start standard software interrupt threads
+ */
+static void
+start_softintr(void *dummy)
+{
+
+ if (swi_add(NULL, "vm", swi_vm, NULL, SWI_VM, INTR_MPSAFE, &vm_ih))
+ panic("died while creating vm swi ithread");
+}
+SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr,
+ NULL);
+
+/*
+ * Sysctls used by systat and others: hw.intrnames and hw.intrcnt.
+ * The data for this is machine dependent, and the declarations are in
+ * machine dependent code.  The layout of intrnames and intrcnt, however,
+ * is machine independent.
+ *
+ * We do not know the length of intrcnt and intrnames at compile time, so
+ * calculate things at run time.
+ */
+static int
+sysctl_intrnames(SYSCTL_HANDLER_ARGS)
+{
+ return (sysctl_handle_opaque(oidp, intrnames, sintrnames, req));
+}
+
+SYSCTL_PROC(_hw, OID_AUTO, intrnames, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sysctl_intrnames, "", "Interrupt Names");
+
+static int
+sysctl_intrcnt(SYSCTL_HANDLER_ARGS)
+{
+#ifdef SCTL_MASK32
+ uint32_t *intrcnt32;
+ unsigned i;
+ int error;
+
+ if (req->flags & SCTL_MASK32) {
+ if (!req->oldptr)
+ return (sysctl_handle_opaque(oidp, NULL, sintrcnt / 2, req));
+ intrcnt32 = malloc(sintrcnt / 2, M_TEMP, M_NOWAIT);
+ if (intrcnt32 == NULL)
+ return (ENOMEM);
+ for (i = 0; i < sintrcnt / sizeof (u_long); i++)
+ intrcnt32[i] = intrcnt[i];
+ error = sysctl_handle_opaque(oidp, intrcnt32, sintrcnt / 2, req);
+ free(intrcnt32, M_TEMP);
+ return (error);
+ }
+#endif
+ return (sysctl_handle_opaque(oidp, intrcnt, sintrcnt, req));
+}
+
+SYSCTL_PROC(_hw, OID_AUTO, intrcnt, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sysctl_intrcnt, "", "Interrupt Counts");
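+
+/*
+ * Usage sketch (userland, illustrative): tools such as vmstat(8) and
+ * systat(1) read these two opaque sysctls and walk them in parallel;
+ * intrnames is a sequence of NUL-terminated strings and intrcnt an array
+ * of u_long counters of matching length.
+ *
+ *	size_t nameslen, cntlen;
+ *
+ *	// First query the sizes, then fetch into malloc'ed buffers.
+ *	sysctlbyname("hw.intrnames", NULL, &nameslen, NULL, 0);
+ *	sysctlbyname("hw.intrcnt", NULL, &cntlen, NULL, 0);
+ */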
+
+#ifdef DDB
+/*
+ * DDB command to dump the interrupt statistics.
+ */
+DB_SHOW_COMMAND(intrcnt, db_show_intrcnt)
+{
+ u_long *i;
+ char *cp;
+ u_int j;
+
+ cp = intrnames;
+ j = 0;
+ for (i = intrcnt; j < (sintrcnt / sizeof(u_long)) && !db_pager_quit;
+ i++, j++) {
+ if (*cp == '\0')
+ break;
+ if (*i != 0)
+ db_printf("%s\t%lu\n", cp, *i);
+ cp += strlen(cp) + 1;
+ }
+}
+#endif
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
new file mode 100644
index 0000000..331b0e1
--- /dev/null
+++ b/sys/kern/kern_jail.c
@@ -0,0 +1,4677 @@
+/*-
+ * Copyright (c) 1999 Poul-Henning Kamp.
+ * Copyright (c) 2008 Bjoern A. Zeeb.
+ * Copyright (c) 2009 James Gritton.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_ddb.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/sysproto.h>
+#include <sys/malloc.h>
+#include <sys/osd.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/taskqueue.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/racct.h>
+#include <sys/refcount.h>
+#include <sys/sx.h>
+#include <sys/sysent.h>
+#include <sys/namei.h>
+#include <sys/mount.h>
+#include <sys/queue.h>
+#include <sys/socket.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include <net/if.h>
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#ifdef INET6
+#include <netinet6/in6_var.h>
+#endif /* INET6 */
+#endif /* DDB */
+
+#include <security/mac/mac_framework.h>
+
+#define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000"
+
+MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
+static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
+
+/* Keep struct prison prison0 and some code in kern_jail_set() readable. */
+#ifdef INET
+#ifdef INET6
+#define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL
+#else
+#define _PR_IP_SADDRSEL PR_IP4_SADDRSEL
+#endif
+#else /* !INET */
+#ifdef INET6
+#define _PR_IP_SADDRSEL PR_IP6_SADDRSEL
+#else
+#define _PR_IP_SADDRSEL 0
+#endif
+#endif
+
+/* prison0 describes what is "real" about the system. */
+struct prison prison0 = {
+ .pr_id = 0,
+ .pr_name = "0",
+ .pr_ref = 1,
+ .pr_uref = 1,
+ .pr_path = "/",
+ .pr_securelevel = -1,
+ .pr_devfs_rsnum = 0,
+ .pr_childmax = JAIL_MAX,
+ .pr_hostuuid = DEFAULT_HOSTUUID,
+ .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children),
+#ifdef VIMAGE
+ .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
+#else
+ .pr_flags = PR_HOST|_PR_IP_SADDRSEL,
+#endif
+ .pr_allow = PR_ALLOW_ALL,
+};
+MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
+
+/* allprison, allprison_racct and lastprid are protected by allprison_lock. */
+struct sx allprison_lock;
+SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
+struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
+LIST_HEAD(, prison_racct) allprison_racct;
+int lastprid = 0;
+
+static int do_jail_attach(struct thread *td, struct prison *pr);
+static void prison_complete(void *context, int pending);
+static void prison_deref(struct prison *pr, int flags);
+static char *prison_path(struct prison *pr1, struct prison *pr2);
+static void prison_remove_one(struct prison *pr);
+#ifdef RACCT
+static void prison_racct_attach(struct prison *pr);
+static void prison_racct_modify(struct prison *pr);
+static void prison_racct_detach(struct prison *pr);
+#endif
+#ifdef INET
+static int _prison_check_ip4(struct prison *pr, struct in_addr *ia);
+static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4);
+#endif
+#ifdef INET6
+static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6);
+static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6);
+#endif
+
+/* Flags for prison_deref */
+#define PD_DEREF 0x01
+#define PD_DEUREF 0x02
+#define PD_LOCKED 0x04
+#define PD_LIST_SLOCKED 0x08
+#define PD_LIST_XLOCKED 0x10
+
+/*
+ * Parameter names corresponding to PR_* flag values. Size values are for kvm
+ * as we cannot figure out the size of a sparse array, or an array without a
+ * terminating entry.
+ */
+static char *pr_flag_names[] = {
+ [0] = "persist",
+#ifdef INET
+ [7] = "ip4.saddrsel",
+#endif
+#ifdef INET6
+ [8] = "ip6.saddrsel",
+#endif
+};
+const size_t pr_flag_names_size = sizeof(pr_flag_names);
+
+static char *pr_flag_nonames[] = {
+ [0] = "nopersist",
+#ifdef INET
+ [7] = "ip4.nosaddrsel",
+#endif
+#ifdef INET6
+ [8] = "ip6.nosaddrsel",
+#endif
+};
+const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames);
+
+struct jailsys_flags {
+ const char *name;
+ unsigned disable;
+ unsigned new;
+} pr_flag_jailsys[] = {
+ { "host", 0, PR_HOST },
+#ifdef VIMAGE
+ { "vnet", 0, PR_VNET },
+#endif
+#ifdef INET
+ { "ip4", PR_IP4_USER | PR_IP4_DISABLE, PR_IP4_USER },
+#endif
+#ifdef INET6
+ { "ip6", PR_IP6_USER | PR_IP6_DISABLE, PR_IP6_USER },
+#endif
+};
+const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
+
+static char *pr_allow_names[] = {
+ "allow.set_hostname",
+ "allow.sysvipc",
+ "allow.raw_sockets",
+ "allow.chflags",
+ "allow.mount",
+ "allow.quotas",
+ "allow.socket_af",
+ "allow.mount.devfs",
+ "allow.mount.nullfs",
+ "allow.mount.zfs",
+ "allow.mount.procfs",
+ "allow.mount.tmpfs",
+};
+const size_t pr_allow_names_size = sizeof(pr_allow_names);
+
+static char *pr_allow_nonames[] = {
+ "allow.noset_hostname",
+ "allow.nosysvipc",
+ "allow.noraw_sockets",
+ "allow.nochflags",
+ "allow.nomount",
+ "allow.noquotas",
+ "allow.nosocket_af",
+ "allow.mount.nodevfs",
+ "allow.mount.nonullfs",
+ "allow.mount.nozfs",
+ "allow.mount.noprocfs",
+ "allow.mount.notmpfs",
+};
+const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames);
+
+#define JAIL_DEFAULT_ALLOW PR_ALLOW_SET_HOSTNAME
+#define JAIL_DEFAULT_ENFORCE_STATFS 2
+#define JAIL_DEFAULT_DEVFS_RSNUM 0
+static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
+static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
+static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
+#if defined(INET) || defined(INET6)
+static unsigned jail_max_af_ips = 255;
+#endif
+
+#ifdef INET
+static int
+qcmp_v4(const void *ip1, const void *ip2)
+{
+ in_addr_t iaa, iab;
+
+ /*
+	 * We need to compare in host byte order (HBO) here so the list is
+	 * sorted as callers expect.  Sorting NBO addresses gives you
+ * interesting results. If you do not understand, do not try.
+ */
+ iaa = ntohl(((const struct in_addr *)ip1)->s_addr);
+ iab = ntohl(((const struct in_addr *)ip2)->s_addr);
+
+ /*
+	 * Do not simply return the difference of the two numbers; an int is
+	 * not wide enough to hold it.
+ */
+ if (iaa > iab)
+ return (1);
+ else if (iaa < iab)
+ return (-1);
+ else
+ return (0);
+}
+#endif
+
+#ifdef INET6
+static int
+qcmp_v6(const void *ip1, const void *ip2)
+{
+ const struct in6_addr *ia6a, *ia6b;
+ int i, rc;
+
+ ia6a = (const struct in6_addr *)ip1;
+ ia6b = (const struct in6_addr *)ip2;
+
+ rc = 0;
+ for (i = 0; rc == 0 && i < sizeof(struct in6_addr); i++) {
+ if (ia6a->s6_addr[i] > ia6b->s6_addr[i])
+ rc = 1;
+ else if (ia6a->s6_addr[i] < ia6b->s6_addr[i])
+ rc = -1;
+ }
+ return (rc);
+}
+#endif
+
+/*
+ * struct jail_args {
+ * struct jail *jail;
+ * };
+ */
+int
+sys_jail(struct thread *td, struct jail_args *uap)
+{
+ uint32_t version;
+ int error;
+ struct jail j;
+
+ error = copyin(uap->jail, &version, sizeof(uint32_t));
+ if (error)
+ return (error);
+
+ switch (version) {
+ case 0:
+ {
+ struct jail_v0 j0;
+
+ /* FreeBSD single IPv4 jails. */
+ bzero(&j, sizeof(struct jail));
+ error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
+ if (error)
+ return (error);
+ j.version = j0.version;
+ j.path = j0.path;
+ j.hostname = j0.hostname;
+ j.ip4s = j0.ip_number;
+ break;
+ }
+
+ case 1:
+ /*
+ * Version 1 was used by multi-IPv4 jail implementations
+ * that never made it into the official kernel.
+ */
+ return (EINVAL);
+
+ case 2: /* JAIL_API_VERSION */
+		/* FreeBSD multi-IPv4/IPv6, noIP jails. */
+ error = copyin(uap->jail, &j, sizeof(struct jail));
+ if (error)
+ return (error);
+ break;
+
+ default:
+ /* Sci-Fi jails are not supported, sorry. */
+ return (EINVAL);
+ }
+ return (kern_jail(td, &j));
+}
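+
+/*
+ * Usage sketch (userland, illustrative): a current (version 2) caller of
+ * jail(2) fills in struct jail and lets kern_jail() translate it into the
+ * iovec form consumed by kern_jail_set().  Field values here are made up.
+ *
+ *	struct jail j = {
+ *		.version = JAIL_API_VERSION,
+ *		.path = "/var/jail/www",
+ *		.hostname = "www.example.org",
+ *		.jailname = "www",
+ *		.ip4s = 0,
+ *		.ip6s = 0,
+ *	};
+ *	int jid = jail(&j);
+ */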
+
+int
+kern_jail(struct thread *td, struct jail *j)
+{
+ struct iovec optiov[2 * (4
+ + sizeof(pr_allow_names) / sizeof(pr_allow_names[0])
+#ifdef INET
+ + 1
+#endif
+#ifdef INET6
+ + 1
+#endif
+ )];
+ struct uio opt;
+ char *u_path, *u_hostname, *u_name;
+#ifdef INET
+ uint32_t ip4s;
+ struct in_addr *u_ip4;
+#endif
+#ifdef INET6
+ struct in6_addr *u_ip6;
+#endif
+ size_t tmplen;
+ int error, enforce_statfs, fi;
+
+ bzero(&optiov, sizeof(optiov));
+ opt.uio_iov = optiov;
+ opt.uio_iovcnt = 0;
+ opt.uio_offset = -1;
+ opt.uio_resid = -1;
+ opt.uio_segflg = UIO_SYSSPACE;
+ opt.uio_rw = UIO_READ;
+ opt.uio_td = td;
+
+ /* Set permissions for top-level jails from sysctls. */
+ if (!jailed(td->td_ucred)) {
+ for (fi = 0; fi < sizeof(pr_allow_names) /
+ sizeof(pr_allow_names[0]); fi++) {
+ optiov[opt.uio_iovcnt].iov_base =
+ (jail_default_allow & (1 << fi))
+ ? pr_allow_names[fi] : pr_allow_nonames[fi];
+ optiov[opt.uio_iovcnt].iov_len =
+ strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
+ opt.uio_iovcnt += 2;
+ }
+ optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
+ optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
+ opt.uio_iovcnt++;
+ enforce_statfs = jail_default_enforce_statfs;
+ optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
+ optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
+ opt.uio_iovcnt++;
+ }
+
+ tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
+#ifdef INET
+ ip4s = (j->version == 0) ? 1 : j->ip4s;
+ if (ip4s > jail_max_af_ips)
+ return (EINVAL);
+ tmplen += ip4s * sizeof(struct in_addr);
+#else
+ if (j->ip4s > 0)
+ return (EINVAL);
+#endif
+#ifdef INET6
+ if (j->ip6s > jail_max_af_ips)
+ return (EINVAL);
+ tmplen += j->ip6s * sizeof(struct in6_addr);
+#else
+ if (j->ip6s > 0)
+ return (EINVAL);
+#endif
+ u_path = malloc(tmplen, M_TEMP, M_WAITOK);
+ u_hostname = u_path + MAXPATHLEN;
+ u_name = u_hostname + MAXHOSTNAMELEN;
+#ifdef INET
+ u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
+#endif
+#ifdef INET6
+#ifdef INET
+ u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
+#else
+ u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
+#endif
+#endif
+ optiov[opt.uio_iovcnt].iov_base = "path";
+ optiov[opt.uio_iovcnt].iov_len = sizeof("path");
+ opt.uio_iovcnt++;
+ optiov[opt.uio_iovcnt].iov_base = u_path;
+ error = copyinstr(j->path, u_path, MAXPATHLEN,
+ &optiov[opt.uio_iovcnt].iov_len);
+ if (error) {
+ free(u_path, M_TEMP);
+ return (error);
+ }
+ opt.uio_iovcnt++;
+ optiov[opt.uio_iovcnt].iov_base = "host.hostname";
+ optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
+ opt.uio_iovcnt++;
+ optiov[opt.uio_iovcnt].iov_base = u_hostname;
+ error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
+ &optiov[opt.uio_iovcnt].iov_len);
+ if (error) {
+ free(u_path, M_TEMP);
+ return (error);
+ }
+ opt.uio_iovcnt++;
+ if (j->jailname != NULL) {
+ optiov[opt.uio_iovcnt].iov_base = "name";
+ optiov[opt.uio_iovcnt].iov_len = sizeof("name");
+ opt.uio_iovcnt++;
+ optiov[opt.uio_iovcnt].iov_base = u_name;
+ error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
+ &optiov[opt.uio_iovcnt].iov_len);
+ if (error) {
+ free(u_path, M_TEMP);
+ return (error);
+ }
+ opt.uio_iovcnt++;
+ }
+#ifdef INET
+ optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
+ optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
+ opt.uio_iovcnt++;
+ optiov[opt.uio_iovcnt].iov_base = u_ip4;
+ optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
+ if (j->version == 0)
+ u_ip4->s_addr = j->ip4s;
+ else {
+ error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
+ if (error) {
+ free(u_path, M_TEMP);
+ return (error);
+ }
+ }
+ opt.uio_iovcnt++;
+#endif
+#ifdef INET6
+ optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
+ optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
+ opt.uio_iovcnt++;
+ optiov[opt.uio_iovcnt].iov_base = u_ip6;
+ optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
+ error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
+ if (error) {
+ free(u_path, M_TEMP);
+ return (error);
+ }
+ opt.uio_iovcnt++;
+#endif
+ KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]),
+ ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
+ error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
+ free(u_path, M_TEMP);
+ return (error);
+}
+
+/*
+ * struct jail_set_args {
+ * struct iovec *iovp;
+ * unsigned int iovcnt;
+ * int flags;
+ * };
+ */
+int
+sys_jail_set(struct thread *td, struct jail_set_args *uap)
+{
+ struct uio *auio;
+ int error;
+
+ /* Check that we have an even number of iovecs. */
+ if (uap->iovcnt & 1)
+ return (EINVAL);
+
+ error = copyinuio(uap->iovp, uap->iovcnt, &auio);
+ if (error)
+ return (error);
+ error = kern_jail_set(td, auio, uap->flags);
+ free(auio, M_IOV);
+ return (error);
+}
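+
+/*
+ * Usage sketch (userland, illustrative): jail_set(2) takes name/value
+ * iovec pairs, which is also the form libjail(3) builds internally.
+ * Boolean parameters such as "persist" are passed with a NULL value of
+ * length 0.  Values below are invented for the example.
+ *
+ *	struct iovec iov[4];
+ *	int jid;
+ *
+ *	iov[0].iov_base = "name";	iov[0].iov_len = sizeof("name");
+ *	iov[1].iov_base = "www";	iov[1].iov_len = sizeof("www");
+ *	iov[2].iov_base = "persist";	iov[2].iov_len = sizeof("persist");
+ *	iov[3].iov_base = NULL;		iov[3].iov_len = 0;
+ *	jid = jail_set(iov, 4, JAIL_CREATE);
+ */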
+
+int
+kern_jail_set(struct thread *td, struct uio *optuio, int flags)
+{
+ struct nameidata nd;
+#ifdef INET
+ struct in_addr *ip4;
+#endif
+#ifdef INET6
+ struct in6_addr *ip6;
+#endif
+ struct vfsopt *opt;
+ struct vfsoptlist *opts;
+ struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
+ struct vnode *root;
+ char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
+ char *g_path;
+#if defined(INET) || defined(INET6)
+ struct prison *tppr;
+ void *op;
+#endif
+ unsigned long hid;
+ size_t namelen, onamelen;
+ int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos;
+ int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
+ int fi, jid, jsys, len, level;
+ int childmax, rsnum, slevel;
+ int fullpath_disabled;
+#if defined(INET) || defined(INET6)
+ int ii, ij;
+#endif
+#ifdef INET
+ int ip4s, redo_ip4;
+#endif
+#ifdef INET6
+ int ip6s, redo_ip6;
+#endif
+ uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
+ unsigned tallow;
+ char numbuf[12];
+
+ error = priv_check(td, PRIV_JAIL_SET);
+ if (!error && (flags & JAIL_ATTACH))
+ error = priv_check(td, PRIV_JAIL_ATTACH);
+ if (error)
+ return (error);
+ mypr = ppr = td->td_ucred->cr_prison;
+ if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
+ return (EPERM);
+ if (flags & ~JAIL_SET_MASK)
+ return (EINVAL);
+
+ /*
+ * Check all the parameters before committing to anything. Not all
+ * errors can be caught early, but we may as well try. Also, this
+ * takes care of some expensive stuff (path lookup) before getting
+ * the allprison lock.
+ *
+ * XXX Jails are not filesystems, and jail parameters are not mount
+ * options. But it makes more sense to re-use the vfsopt code
+ * than duplicate it under a different name.
+ */
+ error = vfs_buildopts(optuio, &opts);
+ if (error)
+ return (error);
+#ifdef INET
+ ip4 = NULL;
+#endif
+#ifdef INET6
+ ip6 = NULL;
+#endif
+ g_path = NULL;
+
+ error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
+ if (error == ENOENT)
+ jid = 0;
+ else if (error != 0)
+ goto done_free;
+
+ error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
+ if (error == ENOENT)
+ gotslevel = 0;
+ else if (error != 0)
+ goto done_free;
+ else
+ gotslevel = 1;
+
+ error =
+ vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
+ if (error == ENOENT)
+ gotchildmax = 0;
+ else if (error != 0)
+ goto done_free;
+ else
+ gotchildmax = 1;
+
+ error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
+ if (error == ENOENT)
+ gotenforce = 0;
+ else if (error != 0)
+ goto done_free;
+ else if (enforce < 0 || enforce > 2) {
+ error = EINVAL;
+ goto done_free;
+ } else
+ gotenforce = 1;
+
+ error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
+ if (error == ENOENT)
+ gotrsnum = 0;
+ else if (error != 0)
+ goto done_free;
+ else
+ gotrsnum = 1;
+
+ pr_flags = ch_flags = 0;
+ for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
+ fi++) {
+ if (pr_flag_names[fi] == NULL)
+ continue;
+ vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi);
+ vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi);
+ }
+ ch_flags |= pr_flags;
+ for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
+ fi++) {
+ error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys,
+ sizeof(jsys));
+ if (error == ENOENT)
+ continue;
+ if (error != 0)
+ goto done_free;
+ switch (jsys) {
+ case JAIL_SYS_DISABLE:
+ if (!pr_flag_jailsys[fi].disable) {
+ error = EINVAL;
+ goto done_free;
+ }
+ pr_flags |= pr_flag_jailsys[fi].disable;
+ break;
+ case JAIL_SYS_NEW:
+ pr_flags |= pr_flag_jailsys[fi].new;
+ break;
+ case JAIL_SYS_INHERIT:
+ break;
+ default:
+ error = EINVAL;
+ goto done_free;
+ }
+ ch_flags |=
+ pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable;
+ }
+ if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
+ && !(pr_flags & PR_PERSIST)) {
+ error = EINVAL;
+ vfs_opterror(opts, "new jail must persist or attach");
+ goto done_errmsg;
+ }
+#ifdef VIMAGE
+ if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
+ error = EINVAL;
+ vfs_opterror(opts, "vnet cannot be changed after creation");
+ goto done_errmsg;
+ }
+#endif
+#ifdef INET
+ if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
+ error = EINVAL;
+ vfs_opterror(opts, "ip4 cannot be changed after creation");
+ goto done_errmsg;
+ }
+#endif
+#ifdef INET6
+ if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
+ error = EINVAL;
+ vfs_opterror(opts, "ip6 cannot be changed after creation");
+ goto done_errmsg;
+ }
+#endif
+
+ pr_allow = ch_allow = 0;
+ for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
+ fi++) {
+ vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi);
+ vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi);
+ }
+ ch_allow |= pr_allow;
+
+ error = vfs_getopt(opts, "name", (void **)&name, &len);
+ if (error == ENOENT)
+ name = NULL;
+ else if (error != 0)
+ goto done_free;
+ else {
+ if (len == 0 || name[len - 1] != '\0') {
+ error = EINVAL;
+ goto done_free;
+ }
+ if (len > MAXHOSTNAMELEN) {
+ error = ENAMETOOLONG;
+ goto done_free;
+ }
+ }
+
+ error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
+ if (error == ENOENT)
+ host = NULL;
+ else if (error != 0)
+ goto done_free;
+ else {
+ ch_flags |= PR_HOST;
+ pr_flags |= PR_HOST;
+ if (len == 0 || host[len - 1] != '\0') {
+ error = EINVAL;
+ goto done_free;
+ }
+ if (len > MAXHOSTNAMELEN) {
+ error = ENAMETOOLONG;
+ goto done_free;
+ }
+ }
+
+ error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
+ if (error == ENOENT)
+ domain = NULL;
+ else if (error != 0)
+ goto done_free;
+ else {
+ ch_flags |= PR_HOST;
+ pr_flags |= PR_HOST;
+ if (len == 0 || domain[len - 1] != '\0') {
+ error = EINVAL;
+ goto done_free;
+ }
+ if (len > MAXHOSTNAMELEN) {
+ error = ENAMETOOLONG;
+ goto done_free;
+ }
+ }
+
+ error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
+ if (error == ENOENT)
+ uuid = NULL;
+ else if (error != 0)
+ goto done_free;
+ else {
+ ch_flags |= PR_HOST;
+ pr_flags |= PR_HOST;
+ if (len == 0 || uuid[len - 1] != '\0') {
+ error = EINVAL;
+ goto done_free;
+ }
+ if (len > HOSTUUIDLEN) {
+ error = ENAMETOOLONG;
+ goto done_free;
+ }
+ }
+
+#ifdef COMPAT_FREEBSD32
+ if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
+ uint32_t hid32;
+
+ error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
+ hid = hid32;
+ } else
+#endif
+ error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
+ if (error == ENOENT)
+ gothid = 0;
+ else if (error != 0)
+ goto done_free;
+ else {
+ gothid = 1;
+ ch_flags |= PR_HOST;
+ pr_flags |= PR_HOST;
+ }
+
+#ifdef INET
+ error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
+ if (error == ENOENT)
+ ip4s = (pr_flags & PR_IP4_DISABLE) ? 0 : -1;
+ else if (error != 0)
+ goto done_free;
+ else if (ip4s & (sizeof(*ip4) - 1)) {
+ error = EINVAL;
+ goto done_free;
+ } else {
+ ch_flags |= PR_IP4_USER | PR_IP4_DISABLE;
+ if (ip4s == 0)
+ pr_flags |= PR_IP4_USER | PR_IP4_DISABLE;
+ else {
+ pr_flags = (pr_flags & ~PR_IP4_DISABLE) | PR_IP4_USER;
+ ip4s /= sizeof(*ip4);
+ if (ip4s > jail_max_af_ips) {
+ error = EINVAL;
+ vfs_opterror(opts, "too many IPv4 addresses");
+ goto done_errmsg;
+ }
+ ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
+ bcopy(op, ip4, ip4s * sizeof(*ip4));
+ /*
+			 * All IP addresses except ip[0] are sorted, preserving
+			 * the primary IP address as given from userland.
+			 * This special IP is used for unbound outgoing
+			 * connections as well as for "loopback" traffic in case
+			 * source address selection cannot find a more fitting
+			 * address to connect from.
+ */
+ if (ip4s > 1)
+ qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4);
+ /*
+ * Check for duplicate addresses and do some simple
+ * zero and broadcast checks. If users give other bogus
+ * addresses it is their problem.
+ *
+ * We do not have to care about byte order for these
+ * checks so we will do them in NBO.
+ */
+ for (ii = 0; ii < ip4s; ii++) {
+ if (ip4[ii].s_addr == INADDR_ANY ||
+ ip4[ii].s_addr == INADDR_BROADCAST) {
+ error = EINVAL;
+ goto done_free;
+ }
+ if ((ii+1) < ip4s &&
+ (ip4[0].s_addr == ip4[ii+1].s_addr ||
+ ip4[ii].s_addr == ip4[ii+1].s_addr)) {
+ error = EINVAL;
+ goto done_free;
+ }
+ }
+ }
+ }
+#endif
+
+#ifdef INET6
+ error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
+ if (error == ENOENT)
+ ip6s = (pr_flags & PR_IP6_DISABLE) ? 0 : -1;
+ else if (error != 0)
+ goto done_free;
+ else if (ip6s & (sizeof(*ip6) - 1)) {
+ error = EINVAL;
+ goto done_free;
+ } else {
+ ch_flags |= PR_IP6_USER | PR_IP6_DISABLE;
+ if (ip6s == 0)
+ pr_flags |= PR_IP6_USER | PR_IP6_DISABLE;
+ else {
+ pr_flags = (pr_flags & ~PR_IP6_DISABLE) | PR_IP6_USER;
+ ip6s /= sizeof(*ip6);
+ if (ip6s > jail_max_af_ips) {
+ error = EINVAL;
+ vfs_opterror(opts, "too many IPv6 addresses");
+ goto done_errmsg;
+ }
+ ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
+ bcopy(op, ip6, ip6s * sizeof(*ip6));
+ if (ip6s > 1)
+ qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6);
+ for (ii = 0; ii < ip6s; ii++) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
+ error = EINVAL;
+ goto done_free;
+ }
+ if ((ii+1) < ip6s &&
+ (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
+ IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
+ {
+ error = EINVAL;
+ goto done_free;
+ }
+ }
+ }
+ }
+#endif
+
+#if defined(VIMAGE) && (defined(INET) || defined(INET6))
+ if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
+ error = EINVAL;
+ vfs_opterror(opts,
+ "vnet jails cannot have IP address restrictions");
+ goto done_errmsg;
+ }
+#endif
+
+ fullpath_disabled = 0;
+ root = NULL;
+ error = vfs_getopt(opts, "path", (void **)&path, &len);
+ if (error == ENOENT)
+ path = NULL;
+ else if (error != 0)
+ goto done_free;
+ else {
+ if (flags & JAIL_UPDATE) {
+ error = EINVAL;
+ vfs_opterror(opts,
+ "path cannot be changed after creation");
+ goto done_errmsg;
+ }
+ if (len == 0 || path[len - 1] != '\0') {
+ error = EINVAL;
+ goto done_free;
+ }
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
+ path, td);
+ error = namei(&nd);
+ if (error)
+ goto done_free;
+ root = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ strlcpy(g_path, path, MAXPATHLEN);
+ error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
+ if (error == 0)
+ path = g_path;
+ else if (error == ENODEV) {
+ /* proceed if sysctl debug.disablefullpath == 1 */
+ fullpath_disabled = 1;
+ if (len < 2 || (len == 2 && path[0] == '/'))
+ path = NULL;
+ } else {
+ /* exit on other errors */
+ goto done_free;
+ }
+ if (root->v_type != VDIR) {
+ error = ENOTDIR;
+ vput(root);
+ goto done_free;
+ }
+ VOP_UNLOCK(root, 0);
+ if (fullpath_disabled) {
+ /* Leave room for a real-root full pathname. */
+ if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
+ ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
+ error = ENAMETOOLONG;
+ goto done_free;
+ }
+ }
+ }
+
+ /*
+ * Grab the allprison lock before letting modules check their
+ * parameters. Once we have it, do not let go so we'll have a
+ * consistent view of the OSD list.
+ */
+ sx_xlock(&allprison_lock);
+ error = osd_jail_call(NULL, PR_METHOD_CHECK, opts);
+ if (error)
+ goto done_unlock_list;
+
+ /* By now, all parameters should have been noted. */
+ TAILQ_FOREACH(opt, opts, link) {
+ if (!opt->seen && strcmp(opt->name, "errmsg")) {
+ error = EINVAL;
+ vfs_opterror(opts, "unknown parameter: %s", opt->name);
+ goto done_unlock_list;
+ }
+ }
+
+ /*
+ * See if we are creating a new record or updating an existing one.
+ * This abuses the file error codes ENOENT and EEXIST.
+ */
+ cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
+ if (!cuflags) {
+ error = EINVAL;
+ vfs_opterror(opts, "no valid operation (create or update)");
+ goto done_unlock_list;
+ }
+ pr = NULL;
+ namelc = NULL;
+ if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
+ namelc = strrchr(name, '.');
+ jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
+ if (*p != '\0')
+ jid = 0;
+ }
+ if (jid != 0) {
+ /*
+ * See if a requested jid already exists. There is an
+ * information leak here if the jid exists but is not within
+ * the caller's jail hierarchy. Jail creators will get EEXIST
+ * even though they cannot see the jail, and CREATE | UPDATE
+ * will return ENOENT which is not normally a valid error.
+ */
+ if (jid < 0) {
+ error = EINVAL;
+ vfs_opterror(opts, "negative jid");
+ goto done_unlock_list;
+ }
+ pr = prison_find(jid);
+ if (pr != NULL) {
+ ppr = pr->pr_parent;
+ /* Create: jid must not exist. */
+ if (cuflags == JAIL_CREATE) {
+ mtx_unlock(&pr->pr_mtx);
+ error = EEXIST;
+ vfs_opterror(opts, "jail %d already exists",
+ jid);
+ goto done_unlock_list;
+ }
+ if (!prison_ischild(mypr, pr)) {
+ mtx_unlock(&pr->pr_mtx);
+ pr = NULL;
+ } else if (pr->pr_uref == 0) {
+ if (!(flags & JAIL_DYING)) {
+ mtx_unlock(&pr->pr_mtx);
+ error = ENOENT;
+ vfs_opterror(opts, "jail %d is dying",
+ jid);
+ goto done_unlock_list;
+ } else if ((flags & JAIL_ATTACH) ||
+ (pr_flags & PR_PERSIST)) {
+ /*
+ * A dying jail might be resurrected
+ * (via attach or persist), but first
+ * it must determine if another jail
+ * has claimed its name. Accomplish
+ * this by implicitly re-setting the
+ * name.
+ */
+ if (name == NULL)
+ name = prison_name(mypr, pr);
+ }
+ }
+ }
+ if (pr == NULL) {
+ /* Update: jid must exist. */
+ if (cuflags == JAIL_UPDATE) {
+ error = ENOENT;
+ vfs_opterror(opts, "jail %d not found", jid);
+ goto done_unlock_list;
+ }
+ }
+ }
+ /*
+ * If the caller provided a name, look for a jail by that name.
+ * This has different semantics for creates and updates keyed by jid
+ * (where the name must not already exist in a different jail),
+ * and updates keyed by the name itself (where the name must exist
+ * because that is the jail being updated).
+ */
+ if (name != NULL) {
+ namelc = strrchr(name, '.');
+ if (namelc == NULL)
+ namelc = name;
+ else {
+ /*
+ * This is a hierarchical name. Split it into the
+ * parent and child names, and make sure the parent
+ * exists or matches an already found jail.
+ */
+ *namelc = '\0';
+ if (pr != NULL) {
+ if (strncmp(name, ppr->pr_name, namelc - name)
+ || ppr->pr_name[namelc - name] != '\0') {
+ mtx_unlock(&pr->pr_mtx);
+ error = EINVAL;
+ vfs_opterror(opts,
+ "cannot change jail's parent");
+ goto done_unlock_list;
+ }
+ } else {
+ ppr = prison_find_name(mypr, name);
+ if (ppr == NULL) {
+ error = ENOENT;
+ vfs_opterror(opts,
+ "jail \"%s\" not found", name);
+ goto done_unlock_list;
+ }
+ mtx_unlock(&ppr->pr_mtx);
+ }
+ name = ++namelc;
+ }
+ if (name[0] != '\0') {
+ namelen =
+ (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
+ name_again:
+ deadpr = NULL;
+ FOREACH_PRISON_CHILD(ppr, tpr) {
+ if (tpr != pr && tpr->pr_ref > 0 &&
+ !strcmp(tpr->pr_name + namelen, name)) {
+ if (pr == NULL &&
+ cuflags != JAIL_CREATE) {
+ mtx_lock(&tpr->pr_mtx);
+ if (tpr->pr_ref > 0) {
+ /*
+ * Use this jail
+ * for updates.
+ */
+ if (tpr->pr_uref > 0) {
+ pr = tpr;
+ break;
+ }
+ deadpr = tpr;
+ }
+ mtx_unlock(&tpr->pr_mtx);
+ } else if (tpr->pr_uref > 0) {
+ /*
+ * Create, or update(jid):
+ * name must not exist in an
+ * active sibling jail.
+ */
+ error = EEXIST;
+ if (pr != NULL)
+ mtx_unlock(&pr->pr_mtx);
+ vfs_opterror(opts,
+ "jail \"%s\" already exists",
+ name);
+ goto done_unlock_list;
+ }
+ }
+ }
+ /* If no active jail is found, use a dying one. */
+ if (deadpr != NULL && pr == NULL) {
+ if (flags & JAIL_DYING) {
+ mtx_lock(&deadpr->pr_mtx);
+ if (deadpr->pr_ref == 0) {
+ mtx_unlock(&deadpr->pr_mtx);
+ goto name_again;
+ }
+ pr = deadpr;
+ } else if (cuflags == JAIL_UPDATE) {
+ error = ENOENT;
+ vfs_opterror(opts,
+ "jail \"%s\" is dying", name);
+ goto done_unlock_list;
+ }
+ }
+ /* Update: name must exist if no jid. */
+ else if (cuflags == JAIL_UPDATE && pr == NULL) {
+ error = ENOENT;
+ vfs_opterror(opts, "jail \"%s\" not found",
+ name);
+ goto done_unlock_list;
+ }
+ }
+ }
+ /* Update: must provide a jid or name. */
+ else if (cuflags == JAIL_UPDATE && pr == NULL) {
+ error = ENOENT;
+ vfs_opterror(opts, "update specified no jail");
+ goto done_unlock_list;
+ }
+
+ /* If there's no prison to update, create a new one and link it in. */
+ if (pr == NULL) {
+ for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
+ if (tpr->pr_childcount >= tpr->pr_childmax) {
+ error = EPERM;
+ vfs_opterror(opts, "prison limit exceeded");
+ goto done_unlock_list;
+ }
+ created = 1;
+ mtx_lock(&ppr->pr_mtx);
+ if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) {
+ mtx_unlock(&ppr->pr_mtx);
+ error = ENOENT;
+ vfs_opterror(opts, "parent jail went away!");
+ goto done_unlock_list;
+ }
+ ppr->pr_ref++;
+ ppr->pr_uref++;
+ mtx_unlock(&ppr->pr_mtx);
+ pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
+ if (jid == 0) {
+ /* Find the next free jid. */
+ jid = lastprid + 1;
+ findnext:
+ if (jid == JAIL_MAX)
+ jid = 1;
+ TAILQ_FOREACH(tpr, &allprison, pr_list) {
+ if (tpr->pr_id < jid)
+ continue;
+ if (tpr->pr_id > jid || tpr->pr_ref == 0) {
+ TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
+ break;
+ }
+ if (jid == lastprid) {
+ error = EAGAIN;
+ vfs_opterror(opts,
+ "no available jail IDs");
+ free(pr, M_PRISON);
+ prison_deref(ppr, PD_DEREF |
+ PD_DEUREF | PD_LIST_XLOCKED);
+ goto done_releroot;
+ }
+ jid++;
+ goto findnext;
+ }
+ lastprid = jid;
+ } else {
+ /*
+ * The caller supplied a jid (one that does not already exist),
+ * so just find where to insert it.
+ */
+ TAILQ_FOREACH(tpr, &allprison, pr_list)
+ if (tpr->pr_id >= jid) {
+ TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
+ break;
+ }
+ }
+ if (tpr == NULL)
+ TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
+ LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
+ for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
+ tpr->pr_childcount++;
+
+ pr->pr_parent = ppr;
+ pr->pr_id = jid;
+
+ /* Set some default values, and inherit some from the parent. */
+ if (name == NULL)
+ name = "";
+ if (path == NULL) {
+ path = "/";
+ root = mypr->pr_root;
+ vref(root);
+ }
+ strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
+ pr->pr_flags |= PR_HOST;
+#if defined(INET) || defined(INET6)
+#ifdef VIMAGE
+ if (!(pr_flags & PR_VNET))
+#endif
+ {
+#ifdef INET
+ if (!(ch_flags & PR_IP4_USER))
+ pr->pr_flags |=
+ PR_IP4 | PR_IP4_USER | PR_IP4_DISABLE;
+ else if (!(pr_flags & PR_IP4_USER)) {
+ pr->pr_flags |= ppr->pr_flags & PR_IP4;
+ if (ppr->pr_ip4 != NULL) {
+ pr->pr_ip4s = ppr->pr_ip4s;
+ pr->pr_ip4 = malloc(pr->pr_ip4s *
+ sizeof(struct in_addr), M_PRISON,
+ M_WAITOK);
+ bcopy(ppr->pr_ip4, pr->pr_ip4,
+ pr->pr_ip4s * sizeof(*pr->pr_ip4));
+ }
+ }
+#endif
+#ifdef INET6
+ if (!(ch_flags & PR_IP6_USER))
+ pr->pr_flags |=
+ PR_IP6 | PR_IP6_USER | PR_IP6_DISABLE;
+ else if (!(pr_flags & PR_IP6_USER)) {
+ pr->pr_flags |= ppr->pr_flags & PR_IP6;
+ if (ppr->pr_ip6 != NULL) {
+ pr->pr_ip6s = ppr->pr_ip6s;
+ pr->pr_ip6 = malloc(pr->pr_ip6s *
+ sizeof(struct in6_addr), M_PRISON,
+ M_WAITOK);
+ bcopy(ppr->pr_ip6, pr->pr_ip6,
+ pr->pr_ip6s * sizeof(*pr->pr_ip6));
+ }
+ }
+#endif
+ }
+#endif
+ /* Source address selection is always on by default. */
+ pr->pr_flags |= _PR_IP_SADDRSEL;
+
+ pr->pr_securelevel = ppr->pr_securelevel;
+ pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
+ pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
+ pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
+
+ LIST_INIT(&pr->pr_children);
+ mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
+
+#ifdef VIMAGE
+ /* Allocate a new vnet if specified. */
+ pr->pr_vnet = (pr_flags & PR_VNET)
+ ? vnet_alloc() : ppr->pr_vnet;
+#endif
+ /*
+ * Allocate a dedicated cpuset for each jail.
+ * Unlike other initial settings, this may return an error.
+ */
+ error = cpuset_create_root(ppr, &pr->pr_cpuset);
+ if (error) {
+ prison_deref(pr, PD_LIST_XLOCKED);
+ goto done_releroot;
+ }
+
+ mtx_lock(&pr->pr_mtx);
+ /*
+ * New prisons do not yet have a reference, because we do not
+ * want others to see the incomplete prison once the
+ * allprison_lock is downgraded.
+ */
+ } else {
+ created = 0;
+ /*
+ * Grab a reference for existing prisons, to ensure they
+ * continue to exist for the duration of the call.
+ */
+ pr->pr_ref++;
+#if defined(VIMAGE) && (defined(INET) || defined(INET6))
+ if ((pr->pr_flags & PR_VNET) &&
+ (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
+ error = EINVAL;
+ vfs_opterror(opts,
+ "vnet jails cannot have IP address restrictions");
+ goto done_deref_locked;
+ }
+#endif
+#ifdef INET
+ if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
+ error = EINVAL;
+ vfs_opterror(opts,
+ "ip4 cannot be changed after creation");
+ goto done_deref_locked;
+ }
+#endif
+#ifdef INET6
+ if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
+ error = EINVAL;
+ vfs_opterror(opts,
+ "ip6 cannot be changed after creation");
+ goto done_deref_locked;
+ }
+#endif
+ }
+
+ /* Do final error checking before setting anything. */
+ if (gotslevel) {
+ if (slevel < ppr->pr_securelevel) {
+ error = EPERM;
+ goto done_deref_locked;
+ }
+ }
+ if (gotchildmax) {
+ if (childmax >= ppr->pr_childmax) {
+ error = EPERM;
+ goto done_deref_locked;
+ }
+ }
+ if (gotenforce) {
+ if (enforce < ppr->pr_enforce_statfs) {
+ error = EPERM;
+ goto done_deref_locked;
+ }
+ }
+ if (gotrsnum) {
+ /*
+ * devfs_rsnum is a uint16_t
+ */
+ if (rsnum < 0 || rsnum > 65535) {
+ error = EINVAL;
+ goto done_deref_locked;
+ }
+ /*
+ * Nested jails always inherit parent's devfs ruleset
+ */
+ if (jailed(td->td_ucred)) {
+ if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
+ error = EPERM;
+ goto done_deref_locked;
+ } else
+ rsnum = ppr->pr_devfs_rsnum;
+ }
+ }
+#ifdef INET
+ if (ip4s > 0) {
+ if (ppr->pr_flags & PR_IP4) {
+ /*
+ * Make sure the new set of IP addresses is a
+ * subset of the parent's list. Don't worry
+ * about the parent being unlocked, as any
+ * setting is done with allprison_lock held.
+ */
+ for (ij = 0; ij < ppr->pr_ip4s; ij++)
+ if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
+ break;
+ if (ij == ppr->pr_ip4s) {
+ error = EPERM;
+ goto done_deref_locked;
+ }
+ if (ip4s > 1) {
+ for (ii = ij = 1; ii < ip4s; ii++) {
+ if (ip4[ii].s_addr ==
+ ppr->pr_ip4[0].s_addr)
+ continue;
+ for (; ij < ppr->pr_ip4s; ij++)
+ if (ip4[ii].s_addr ==
+ ppr->pr_ip4[ij].s_addr)
+ break;
+ if (ij == ppr->pr_ip4s)
+ break;
+ }
+ if (ij == ppr->pr_ip4s) {
+ error = EPERM;
+ goto done_deref_locked;
+ }
+ }
+ }
+ /*
+ * Check for conflicting IP addresses. We permit them
+ * if there is no more than one IP on each jail. If
+ * there is a duplicate on a jail with more than one
+ * IP, stop checking and return an error.
+ */
+ tppr = ppr;
+#ifdef VIMAGE
+ for (; tppr != &prison0; tppr = tppr->pr_parent)
+ if (tppr->pr_flags & PR_VNET)
+ break;
+#endif
+ FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
+ if (tpr == pr ||
+#ifdef VIMAGE
+ (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
+#endif
+ tpr->pr_uref == 0) {
+ descend = 0;
+ continue;
+ }
+ if (!(tpr->pr_flags & PR_IP4_USER))
+ continue;
+ descend = 0;
+ if (tpr->pr_ip4 == NULL ||
+ (ip4s == 1 && tpr->pr_ip4s == 1))
+ continue;
+ for (ii = 0; ii < ip4s; ii++) {
+ if (_prison_check_ip4(tpr, &ip4[ii]) == 0) {
+ error = EADDRINUSE;
+ vfs_opterror(opts,
+ "IPv4 addresses clash");
+ goto done_deref_locked;
+ }
+ }
+ }
+ }
+#endif
+#ifdef INET6
+ if (ip6s > 0) {
+ if (ppr->pr_flags & PR_IP6) {
+ /*
+ * Make sure the new set of IP addresses is a
+ * subset of the parent's list.
+ */
+ for (ij = 0; ij < ppr->pr_ip6s; ij++)
+ if (IN6_ARE_ADDR_EQUAL(&ip6[0],
+ &ppr->pr_ip6[ij]))
+ break;
+ if (ij == ppr->pr_ip6s) {
+ error = EPERM;
+ goto done_deref_locked;
+ }
+ if (ip6s > 1) {
+ for (ii = ij = 1; ii < ip6s; ii++) {
+ if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
+ &ppr->pr_ip6[0]))
+ continue;
+ for (; ij < ppr->pr_ip6s; ij++)
+ if (IN6_ARE_ADDR_EQUAL(
+ &ip6[ii], &ppr->pr_ip6[ij]))
+ break;
+ if (ij == ppr->pr_ip6s)
+ break;
+ }
+ if (ij == ppr->pr_ip6s) {
+ error = EPERM;
+ goto done_deref_locked;
+ }
+ }
+ }
+ /* Check for conflicting IP addresses. */
+ tppr = ppr;
+#ifdef VIMAGE
+ for (; tppr != &prison0; tppr = tppr->pr_parent)
+ if (tppr->pr_flags & PR_VNET)
+ break;
+#endif
+ FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
+ if (tpr == pr ||
+#ifdef VIMAGE
+ (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
+#endif
+ tpr->pr_uref == 0) {
+ descend = 0;
+ continue;
+ }
+ if (!(tpr->pr_flags & PR_IP6_USER))
+ continue;
+ descend = 0;
+ if (tpr->pr_ip6 == NULL ||
+ (ip6s == 1 && tpr->pr_ip6s == 1))
+ continue;
+ for (ii = 0; ii < ip6s; ii++) {
+ if (_prison_check_ip6(tpr, &ip6[ii]) == 0) {
+ error = EADDRINUSE;
+ vfs_opterror(opts,
+ "IPv6 addresses clash");
+ goto done_deref_locked;
+ }
+ }
+ }
+ }
+#endif
+ onamelen = namelen = 0;
+ if (name != NULL) {
+ /* Give a default name of the jid. */
+ if (name[0] == '\0')
+ snprintf(name = numbuf, sizeof(numbuf), "%d", jid);
+ else if (*namelc == '0' || (strtoul(namelc, &p, 10) != jid &&
+ *p == '\0')) {
+ error = EINVAL;
+ vfs_opterror(opts,
+ "name cannot be numeric (unless it is the jid)");
+ goto done_deref_locked;
+ }
+ /*
+ * Make sure the name isn't too long for the prison or its
+ * children.
+ */
+ onamelen = strlen(pr->pr_name);
+ namelen = strlen(name);
+ if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) {
+ error = ENAMETOOLONG;
+ goto done_deref_locked;
+ }
+ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
+ if (strlen(tpr->pr_name) + (namelen - onamelen) >=
+ sizeof(pr->pr_name)) {
+ error = ENAMETOOLONG;
+ goto done_deref_locked;
+ }
+ }
+ }
+ if (pr_allow & ~ppr->pr_allow) {
+ error = EPERM;
+ goto done_deref_locked;
+ }
+
+ /* Set the parameters of the prison. */
+#ifdef INET
+ redo_ip4 = 0;
+ if (pr_flags & PR_IP4_USER) {
+ pr->pr_flags |= PR_IP4;
+ free(pr->pr_ip4, M_PRISON);
+ pr->pr_ip4s = ip4s;
+ pr->pr_ip4 = ip4;
+ ip4 = NULL;
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
+#ifdef VIMAGE
+ if (tpr->pr_flags & PR_VNET) {
+ descend = 0;
+ continue;
+ }
+#endif
+ if (prison_restrict_ip4(tpr, NULL)) {
+ redo_ip4 = 1;
+ descend = 0;
+ }
+ }
+ }
+#endif
+#ifdef INET6
+ redo_ip6 = 0;
+ if (pr_flags & PR_IP6_USER) {
+ pr->pr_flags |= PR_IP6;
+ free(pr->pr_ip6, M_PRISON);
+ pr->pr_ip6s = ip6s;
+ pr->pr_ip6 = ip6;
+ ip6 = NULL;
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
+#ifdef VIMAGE
+ if (tpr->pr_flags & PR_VNET) {
+ descend = 0;
+ continue;
+ }
+#endif
+ if (prison_restrict_ip6(tpr, NULL)) {
+ redo_ip6 = 1;
+ descend = 0;
+ }
+ }
+ }
+#endif
+ if (gotslevel) {
+ pr->pr_securelevel = slevel;
+ /* Set all child jails to be at least this level. */
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
+ if (tpr->pr_securelevel < slevel)
+ tpr->pr_securelevel = slevel;
+ }
+ if (gotchildmax) {
+ pr->pr_childmax = childmax;
+ /* Set all child jails to under this limit. */
+ FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
+ if (tpr->pr_childmax > childmax - level)
+ tpr->pr_childmax = childmax > level
+ ? childmax - level : 0;
+ }
+ if (gotenforce) {
+ pr->pr_enforce_statfs = enforce;
+ /* Pass this restriction on to the children. */
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
+ if (tpr->pr_enforce_statfs < enforce)
+ tpr->pr_enforce_statfs = enforce;
+ }
+ if (gotrsnum) {
+ pr->pr_devfs_rsnum = rsnum;
+ /* Pass this restriction on to the children. */
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
+ tpr->pr_devfs_rsnum = rsnum;
+ }
+ if (name != NULL) {
+ if (ppr == &prison0)
+ strlcpy(pr->pr_name, name, sizeof(pr->pr_name));
+ else
+ snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
+ ppr->pr_name, name);
+ /* Change this component of child names. */
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
+ bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
+ strlen(tpr->pr_name + onamelen) + 1);
+ bcopy(pr->pr_name, tpr->pr_name, namelen);
+ }
+ }
+ if (path != NULL) {
+ /* Try to keep a real-rooted full pathname. */
+ if (fullpath_disabled && path[0] == '/' &&
+ strcmp(mypr->pr_path, "/"))
+ snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
+ mypr->pr_path, path);
+ else
+ strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
+ pr->pr_root = root;
+ }
+ if (PR_HOST & ch_flags & ~pr_flags) {
+ if (pr->pr_flags & PR_HOST) {
+ /*
+ * Copy the parent's host info. As with pr_ip4 above,
+ * the lack of a lock on the parent is not a problem;
+ * it is always set with allprison_lock at least
+ * shared, and is held exclusively here.
+ */
+ strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
+ sizeof(pr->pr_hostname));
+ strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
+ sizeof(pr->pr_domainname));
+ strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
+ sizeof(pr->pr_hostuuid));
+ pr->pr_hostid = pr->pr_parent->pr_hostid;
+ }
+ } else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
+ /* Set this prison, and any descendants without PR_HOST. */
+ if (host != NULL)
+ strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
+ if (domain != NULL)
+ strlcpy(pr->pr_domainname, domain,
+ sizeof(pr->pr_domainname));
+ if (uuid != NULL)
+ strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
+ if (gothid)
+ pr->pr_hostid = hid;
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
+ if (tpr->pr_flags & PR_HOST)
+ descend = 0;
+ else {
+ if (host != NULL)
+ strlcpy(tpr->pr_hostname,
+ pr->pr_hostname,
+ sizeof(tpr->pr_hostname));
+ if (domain != NULL)
+ strlcpy(tpr->pr_domainname,
+ pr->pr_domainname,
+ sizeof(tpr->pr_domainname));
+ if (uuid != NULL)
+ strlcpy(tpr->pr_hostuuid,
+ pr->pr_hostuuid,
+ sizeof(tpr->pr_hostuuid));
+ if (gothid)
+ tpr->pr_hostid = hid;
+ }
+ }
+ }
+ if ((tallow = ch_allow & ~pr_allow)) {
+ /* Clear allow bits in all children. */
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
+ tpr->pr_allow &= ~tallow;
+ }
+ pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
+ /*
+ * Persistent prisons get an extra reference, and prisons losing their
+ * persist flag lose that reference. Only do this for existing prisons
+ * for now, so new ones will remain unseen until after the module
+ * handlers have completed.
+ */
+ if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
+ if (pr_flags & PR_PERSIST) {
+ pr->pr_ref++;
+ pr->pr_uref++;
+ } else {
+ pr->pr_ref--;
+ pr->pr_uref--;
+ }
+ }
+ pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
+ mtx_unlock(&pr->pr_mtx);
+
+#ifdef RACCT
+ if (created)
+ prison_racct_attach(pr);
+#endif
+
+ /*
+ * Locks may have prevented a complete restriction of child IP
+ * addresses. If so, allocate some more memory and try again.
+ */
+#ifdef INET
+ while (redo_ip4) {
+ ip4s = pr->pr_ip4s;
+ ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
+ mtx_lock(&pr->pr_mtx);
+ redo_ip4 = 0;
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
+#ifdef VIMAGE
+ if (tpr->pr_flags & PR_VNET) {
+ descend = 0;
+ continue;
+ }
+#endif
+ if (prison_restrict_ip4(tpr, ip4)) {
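+ /* Either the spare buffer was handed off, or none was available and another pass is needed. */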
+ if (ip4 != NULL)
+ ip4 = NULL;
+ else
+ redo_ip4 = 1;
+ }
+ }
+ mtx_unlock(&pr->pr_mtx);
+ }
+#endif
+#ifdef INET6
+ while (redo_ip6) {
+ ip6s = pr->pr_ip6s;
+ ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
+ mtx_lock(&pr->pr_mtx);
+ redo_ip6 = 0;
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
+#ifdef VIMAGE
+ if (tpr->pr_flags & PR_VNET) {
+ descend = 0;
+ continue;
+ }
+#endif
+ if (prison_restrict_ip6(tpr, ip6)) {
+ if (ip6 != NULL)
+ ip6 = NULL;
+ else
+ redo_ip6 = 1;
+ }
+ }
+ mtx_unlock(&pr->pr_mtx);
+ }
+#endif
+
+ /* Let the modules do their work. */
+ sx_downgrade(&allprison_lock);
+ if (created) {
+ error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
+ if (error) {
+ prison_deref(pr, PD_LIST_SLOCKED);
+ goto done_errmsg;
+ }
+ }
+ error = osd_jail_call(pr, PR_METHOD_SET, opts);
+ if (error) {
+ prison_deref(pr, created
+ ? PD_LIST_SLOCKED
+ : PD_DEREF | PD_LIST_SLOCKED);
+ goto done_errmsg;
+ }
+
+ /* Attach this process to the prison if requested. */
+ if (flags & JAIL_ATTACH) {
+ mtx_lock(&pr->pr_mtx);
+ error = do_jail_attach(td, pr);
+ if (error) {
+ vfs_opterror(opts, "attach failed");
+ if (!created)
+ prison_deref(pr, PD_DEREF);
+ goto done_errmsg;
+ }
+ }
+
+#ifdef RACCT
+ if (!created) {
+ sx_sunlock(&allprison_lock);
+ prison_racct_modify(pr);
+ sx_slock(&allprison_lock);
+ }
+#endif
+
+ td->td_retval[0] = pr->pr_id;
+
+ /*
+ * Now that it is all there, drop the temporary reference from existing
+ * prisons, or add a reference to newly created persistent prisons
+ * (which was not done earlier so that the prison would not be publicly
+ * visible).
+ */
+ if (!created) {
+ prison_deref(pr, (flags & JAIL_ATTACH)
+ ? PD_DEREF
+ : PD_DEREF | PD_LIST_SLOCKED);
+ } else {
+ if (pr_flags & PR_PERSIST) {
+ mtx_lock(&pr->pr_mtx);
+ pr->pr_ref++;
+ pr->pr_uref++;
+ mtx_unlock(&pr->pr_mtx);
+ }
+ if (!(flags & JAIL_ATTACH))
+ sx_sunlock(&allprison_lock);
+ }
+
+ goto done_errmsg;
+
+ done_deref_locked:
+ prison_deref(pr, created
+ ? PD_LOCKED | PD_LIST_XLOCKED
+ : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
+ goto done_releroot;
+ done_unlock_list:
+ sx_xunlock(&allprison_lock);
+ done_releroot:
+ if (root != NULL)
+ vrele(root);
+ done_errmsg:
+ if (error) {
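+ /* Copy the error message back into the caller's "errmsg" option, if one was supplied. */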
+ vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
+ if (errmsg_len > 0) {
+ errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
+ if (errmsg_pos > 0) {
+ if (optuio->uio_segflg == UIO_SYSSPACE)
+ bcopy(errmsg,
+ optuio->uio_iov[errmsg_pos].iov_base,
+ errmsg_len);
+ else
+ copyout(errmsg,
+ optuio->uio_iov[errmsg_pos].iov_base,
+ errmsg_len);
+ }
+ }
+ }
+ done_free:
+#ifdef INET
+ free(ip4, M_PRISON);
+#endif
+#ifdef INET6
+ free(ip6, M_PRISON);
+#endif
+ if (g_path != NULL)
+ free(g_path, M_TEMP);
+ vfs_freeopts(opts);
+ return (error);
+}
+
+
+/*
+ * struct jail_get_args {
+ * struct iovec *iovp;
+ * unsigned int iovcnt;
+ * int flags;
+ * };
+ */
+int
+sys_jail_get(struct thread *td, struct jail_get_args *uap)
+{
+ struct uio *auio;
+ int error;
+
+ /* Check that we have an even number of iovecs. */
+ if (uap->iovcnt & 1)
+ return (EINVAL);
+
+ error = copyinuio(uap->iovp, uap->iovcnt, &auio);
+ if (error)
+ return (error);
+ error = kern_jail_get(td, auio, uap->flags);
+ if (error == 0)
+ error = copyout(auio->uio_iov, uap->iovp,
+ uap->iovcnt * sizeof (struct iovec));
+ free(auio, M_IOV);
+ return (error);
+}
+
+int
+kern_jail_get(struct thread *td, struct uio *optuio, int flags)
+{
+ struct prison *pr, *mypr;
+ struct vfsopt *opt;
+ struct vfsoptlist *opts;
+ char *errmsg, *name;
+ int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos;
+
+ if (flags & ~JAIL_GET_MASK)
+ return (EINVAL);
+
+ /* Get the parameter list. */
+ error = vfs_buildopts(optuio, &opts);
+ if (error)
+ return (error);
+ errmsg_pos = vfs_getopt_pos(opts, "errmsg");
+ mypr = td->td_ucred->cr_prison;
+
+ /*
+ * Find the prison specified by one of: lastjid, jid, name.
+ */
+ sx_slock(&allprison_lock);
+ error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
+ if (error == 0) {
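+ /* lastjid: find the next prison visible to the caller with a higher jid. */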
+ TAILQ_FOREACH(pr, &allprison, pr_list) {
+ if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
+ mtx_lock(&pr->pr_mtx);
+ if (pr->pr_ref > 0 &&
+ (pr->pr_uref > 0 || (flags & JAIL_DYING)))
+ break;
+ mtx_unlock(&pr->pr_mtx);
+ }
+ }
+ if (pr != NULL)
+ goto found_prison;
+ error = ENOENT;
+ vfs_opterror(opts, "no jail after %d", jid);
+ goto done_unlock_list;
+ } else if (error != ENOENT)
+ goto done_unlock_list;
+
+ error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
+ if (error == 0) {
+ if (jid != 0) {
+ pr = prison_find_child(mypr, jid);
+ if (pr != NULL) {
+ if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
+ mtx_unlock(&pr->pr_mtx);
+ error = ENOENT;
+ vfs_opterror(opts, "jail %d is dying",
+ jid);
+ goto done_unlock_list;
+ }
+ goto found_prison;
+ }
+ error = ENOENT;
+ vfs_opterror(opts, "jail %d not found", jid);
+ goto done_unlock_list;
+ }
+ } else if (error != ENOENT)
+ goto done_unlock_list;
+
+ error = vfs_getopt(opts, "name", (void **)&name, &len);
+ if (error == 0) {
+ if (len == 0 || name[len - 1] != '\0') {
+ error = EINVAL;
+ goto done_unlock_list;
+ }
+ pr = prison_find_name(mypr, name);
+ if (pr != NULL) {
+ if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
+ mtx_unlock(&pr->pr_mtx);
+ error = ENOENT;
+ vfs_opterror(opts, "jail \"%s\" is dying",
+ name);
+ goto done_unlock_list;
+ }
+ goto found_prison;
+ }
+ error = ENOENT;
+ vfs_opterror(opts, "jail \"%s\" not found", name);
+ goto done_unlock_list;
+ } else if (error != ENOENT)
+ goto done_unlock_list;
+
+ vfs_opterror(opts, "no jail specified");
+ error = ENOENT;
+ goto done_unlock_list;
+
+ found_prison:
+ /* Get the parameters of the prison. */
+ pr->pr_ref++;
+ locked = PD_LOCKED;
+ td->td_retval[0] = pr->pr_id;
+ error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
+ error = vfs_setopt(opts, "parent", &i, sizeof(i));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopts(opts, "name", prison_name(mypr, pr));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
+ sizeof(pr->pr_cpuset->cs_id));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopts(opts, "path", prison_path(mypr, pr));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+#ifdef INET
+ error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
+ pr->pr_ip4s * sizeof(*pr->pr_ip4));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+#endif
+#ifdef INET6
+ error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
+ pr->pr_ip6s * sizeof(*pr->pr_ip6));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+#endif
+ error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
+ sizeof(pr->pr_securelevel));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
+ sizeof(pr->pr_childcount));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
+ sizeof(pr->pr_childmax));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+#ifdef COMPAT_FREEBSD32
+ if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
+ uint32_t hid32 = pr->pr_hostid;
+
+ error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
+ } else
+#endif
+ error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
+ sizeof(pr->pr_hostid));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
+ sizeof(pr->pr_enforce_statfs));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
+ sizeof(pr->pr_devfs_rsnum));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
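+ /* Report each boolean flag together with its "no"-prefixed inverse. */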
+ for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
+ fi++) {
+ if (pr_flag_names[fi] == NULL)
+ continue;
+ i = (pr->pr_flags & (1 << fi)) ? 1 : 0;
+ error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ i = !i;
+ error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ }
+ for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
+ fi++) {
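+ /* Map the disable/new flag bits onto JAIL_SYS_DISABLE, JAIL_SYS_NEW, or JAIL_SYS_INHERIT. */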
+ i = pr->pr_flags &
+ (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
+ i = pr_flag_jailsys[fi].disable &&
+ (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE
+ : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW
+ : JAIL_SYS_INHERIT;
+ error =
+ vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ }
+ for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
+ fi++) {
+ if (pr_allow_names[fi] == NULL)
+ continue;
+ i = (pr->pr_allow & (1 << fi)) ? 1 : 0;
+ error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ i = !i;
+ error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ }
+ i = (pr->pr_uref == 0);
+ error = vfs_setopt(opts, "dying", &i, sizeof(i));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ i = !i;
+ error = vfs_setopt(opts, "nodying", &i, sizeof(i));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+
+ /* Get the module parameters. */
+ mtx_unlock(&pr->pr_mtx);
+ locked = 0;
+ error = osd_jail_call(pr, PR_METHOD_GET, opts);
+ if (error)
+ goto done_deref;
+ prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
+
+ /* By now, all parameters should have been noted. */
+ TAILQ_FOREACH(opt, opts, link) {
+ if (!opt->seen && strcmp(opt->name, "errmsg")) {
+ error = EINVAL;
+ vfs_opterror(opts, "unknown parameter: %s", opt->name);
+ goto done_errmsg;
+ }
+ }
+
+ /* Write the fetched parameters back to userspace. */
+ error = 0;
+ TAILQ_FOREACH(opt, opts, link) {
+ if (opt->pos >= 0 && opt->pos != errmsg_pos) {
+ pos = 2 * opt->pos + 1;
+ optuio->uio_iov[pos].iov_len = opt->len;
+ if (opt->value != NULL) {
+ if (optuio->uio_segflg == UIO_SYSSPACE) {
+ bcopy(opt->value,
+ optuio->uio_iov[pos].iov_base,
+ opt->len);
+ } else {
+ error = copyout(opt->value,
+ optuio->uio_iov[pos].iov_base,
+ opt->len);
+ if (error)
+ break;
+ }
+ }
+ }
+ }
+ goto done_errmsg;
+
+ done_deref:
+ prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
+ goto done_errmsg;
+
+ done_unlock_list:
+ sx_sunlock(&allprison_lock);
+ done_errmsg:
+ if (error && errmsg_pos >= 0) {
+ vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
+ errmsg_pos = 2 * errmsg_pos + 1;
+ if (errmsg_len > 0) {
+ if (optuio->uio_segflg == UIO_SYSSPACE)
+ bcopy(errmsg,
+ optuio->uio_iov[errmsg_pos].iov_base,
+ errmsg_len);
+ else
+ copyout(errmsg,
+ optuio->uio_iov[errmsg_pos].iov_base,
+ errmsg_len);
+ }
+ }
+ vfs_freeopts(opts);
+ return (error);
+}
+
+
+/*
+ * struct jail_remove_args {
+ * int jid;
+ * };
+ */
+int
+sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
+{
+ struct prison *pr, *cpr, *lpr, *tpr;
+ int descend, error;
+
+ error = priv_check(td, PRIV_JAIL_REMOVE);
+ if (error)
+ return (error);
+
+ sx_xlock(&allprison_lock);
+ pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
+ if (pr == NULL) {
+ sx_xunlock(&allprison_lock);
+ return (EINVAL);
+ }
+
+ /* Remove all descendants of this prison, then remove this prison. */
+ pr->pr_ref++;
+ pr->pr_flags |= PR_REMOVE;
+ if (!LIST_EMPTY(&pr->pr_children)) {
+ mtx_unlock(&pr->pr_mtx);
+ lpr = NULL;
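+ /* Removal of each marked child is deferred one iteration, as prison_remove_one drops the allprison lock. */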
+ FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
+ mtx_lock(&cpr->pr_mtx);
+ if (cpr->pr_ref > 0) {
+ tpr = cpr;
+ cpr->pr_ref++;
+ cpr->pr_flags |= PR_REMOVE;
+ } else {
+ /* Already removed - do not do it again. */
+ tpr = NULL;
+ }
+ mtx_unlock(&cpr->pr_mtx);
+ if (lpr != NULL) {
+ mtx_lock(&lpr->pr_mtx);
+ prison_remove_one(lpr);
+ sx_xlock(&allprison_lock);
+ }
+ lpr = tpr;
+ }
+ if (lpr != NULL) {
+ mtx_lock(&lpr->pr_mtx);
+ prison_remove_one(lpr);
+ sx_xlock(&allprison_lock);
+ }
+ mtx_lock(&pr->pr_mtx);
+ }
+ prison_remove_one(pr);
+ return (0);
+}
+
+static void
+prison_remove_one(struct prison *pr)
+{
+ struct proc *p;
+ int deuref;
+
+ /* If the prison was persistent, it is not anymore. */
+ deuref = 0;
+ if (pr->pr_flags & PR_PERSIST) {
+ pr->pr_ref--;
+ deuref = PD_DEUREF;
+ pr->pr_flags &= ~PR_PERSIST;
+ }
+
+ /*
+ * jail_remove added a reference. If that's the only one, remove
+ * the prison now.
+ */
+ KASSERT(pr->pr_ref > 0,
+ ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
+ if (pr->pr_ref == 1) {
+ prison_deref(pr,
+ deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
+ return;
+ }
+
+ mtx_unlock(&pr->pr_mtx);
+ sx_xunlock(&allprison_lock);
+ /*
+ * Kill all processes unfortunate enough to be attached to this prison.
+ */
+ sx_slock(&allproc_lock);
+ LIST_FOREACH(p, &allproc, p_list) {
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NEW && p->p_ucred &&
+ p->p_ucred->cr_prison == pr)
+ kern_psignal(p, SIGKILL);
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ /* Remove the temporary reference added by jail_remove. */
+ prison_deref(pr, deuref | PD_DEREF);
+}
+
+
+/*
+ * struct jail_attach_args {
+ * int jid;
+ * };
+ */
+int
+sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
+{
+ struct prison *pr;
+ int error;
+
+ error = priv_check(td, PRIV_JAIL_ATTACH);
+ if (error)
+ return (error);
+
+ sx_slock(&allprison_lock);
+ pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
+ if (pr == NULL) {
+ sx_sunlock(&allprison_lock);
+ return (EINVAL);
+ }
+
+ /*
+ * Do not allow a process to attach to a prison that is not
+ * considered to be "alive".
+ */
+ if (pr->pr_uref == 0) {
+ mtx_unlock(&pr->pr_mtx);
+ sx_sunlock(&allprison_lock);
+ return (EINVAL);
+ }
+
+ return (do_jail_attach(td, pr));
+}
+
+static int
+do_jail_attach(struct thread *td, struct prison *pr)
+{
+ struct prison *ppr;
+ struct proc *p;
+ struct ucred *newcred, *oldcred;
+ int error;
+
+ /*
+ * XXX: Note that there is a slight race here if two threads
+ * in the same privileged process attempt to attach to two
+ * different jails at the same time. It is important for
+ * user processes not to do this, or they might end up with
+ * a process root from one prison, but attached to the jail
+ * of another.
+ */
+ pr->pr_ref++;
+ pr->pr_uref++;
+ mtx_unlock(&pr->pr_mtx);
+
+ /* Let modules do whatever they need to prepare for attaching. */
+ error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
+ if (error) {
+ prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
+ return (error);
+ }
+ sx_sunlock(&allprison_lock);
+
+ /*
+ * Reparent the newly attached process to this jail.
+ */
+ ppr = td->td_ucred->cr_prison;
+ p = td->td_proc;
+ error = cpuset_setproc_update_set(p, pr->pr_cpuset);
+ if (error)
+ goto e_revert_osd;
+
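+ /* Move the process's root and current directories into the prison root. */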
+ vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
+ if ((error = change_dir(pr->pr_root, td)) != 0)
+ goto e_unlock;
+#ifdef MAC
+ if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
+ goto e_unlock;
+#endif
+ VOP_UNLOCK(pr->pr_root, 0);
+ if ((error = change_root(pr->pr_root, td)))
+ goto e_revert_osd;
+
+ newcred = crget();
+ PROC_LOCK(p);
+ oldcred = p->p_ucred;
+ setsugid(p);
+ crcopy(newcred, oldcred);
+ newcred->cr_prison = pr;
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+#ifdef RACCT
+ racct_proc_ucred_changed(p, oldcred, newcred);
+#endif
+ crfree(oldcred);
+ prison_deref(ppr, PD_DEREF | PD_DEUREF);
+ return (0);
+ e_unlock:
+ VOP_UNLOCK(pr->pr_root, 0);
+ e_revert_osd:
+ /* Tell modules this thread is still in its old jail after all. */
+ (void)osd_jail_call(ppr, PR_METHOD_ATTACH, td);
+ prison_deref(pr, PD_DEREF | PD_DEUREF);
+ return (error);
+}
+
+
+/*
+ * Returns a locked prison instance, or NULL on failure.
+ */
+struct prison *
+prison_find(int prid)
+{
+ struct prison *pr;
+
+ sx_assert(&allprison_lock, SX_LOCKED);
+ TAILQ_FOREACH(pr, &allprison, pr_list) {
+ if (pr->pr_id == prid) {
+ mtx_lock(&pr->pr_mtx);
+ if (pr->pr_ref > 0)
+ return (pr);
+ mtx_unlock(&pr->pr_mtx);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Find a prison that is a descendant of mypr. Returns a locked prison or NULL.
+ */
+struct prison *
+prison_find_child(struct prison *mypr, int prid)
+{
+ struct prison *pr;
+ int descend;
+
+ sx_assert(&allprison_lock, SX_LOCKED);
+ FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
+ if (pr->pr_id == prid) {
+ mtx_lock(&pr->pr_mtx);
+ if (pr->pr_ref > 0)
+ return (pr);
+ mtx_unlock(&pr->pr_mtx);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Look for the name relative to mypr. Returns a locked prison or NULL.
+ */
+struct prison *
+prison_find_name(struct prison *mypr, const char *name)
+{
+ struct prison *pr, *deadpr;
+ size_t mylen;
+ int descend;
+
+ sx_assert(&allprison_lock, SX_LOCKED);
+ mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
+ again:
+ deadpr = NULL;
+ FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
+ if (!strcmp(pr->pr_name + mylen, name)) {
+ mtx_lock(&pr->pr_mtx);
+ if (pr->pr_ref > 0) {
+ if (pr->pr_uref > 0)
+ return (pr);
+ deadpr = pr;
+ }
+ mtx_unlock(&pr->pr_mtx);
+ }
+ }
+ /* There was no valid prison - perhaps there was a dying one. */
+ if (deadpr != NULL) {
+ mtx_lock(&deadpr->pr_mtx);
+ if (deadpr->pr_ref == 0) {
+ mtx_unlock(&deadpr->pr_mtx);
+ goto again;
+ }
+ }
+ return (deadpr);
+}
+
+/*
+ * See if a prison has the specific flag set.
+ */
+int
+prison_flag(struct ucred *cred, unsigned flag)
+{
+
+ /* This is an atomic read, so no locking is necessary. */
+ return (cred->cr_prison->pr_flags & flag);
+}
+
+int
+prison_allow(struct ucred *cred, unsigned flag)
+{
+
+ /* This is an atomic read, so no locking is necessary. */
+ return (cred->cr_prison->pr_allow & flag);
+}
+
+/*
+ * Remove a prison reference. If that was the last reference, remove the
+ * prison itself - but not in this context in case there are locks held.
+ */
+void
+prison_free_locked(struct prison *pr)
+{
+
+ mtx_assert(&pr->pr_mtx, MA_OWNED);
+ pr->pr_ref--;
+ if (pr->pr_ref == 0) {
+ mtx_unlock(&pr->pr_mtx);
+ TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
+ taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
+ return;
+ }
+ mtx_unlock(&pr->pr_mtx);
+}
+
+void
+prison_free(struct prison *pr)
+{
+
+ mtx_lock(&pr->pr_mtx);
+ prison_free_locked(pr);
+}
+
+static void
+prison_complete(void *context, int pending)
+{
+
+ prison_deref((struct prison *)context, 0);
+}
+
+/*
+ * Remove a prison reference (usually). This internal version assumes no
+ * mutexes are held, except perhaps the prison itself. If there are no more
+ * references, release and delist the prison. On completion, the prison lock
+ * and the allprison lock are both unlocked.
+ */
+static void
+prison_deref(struct prison *pr, int flags)
+{
+ struct prison *ppr, *tpr;
+
+ if (!(flags & PD_LOCKED))
+ mtx_lock(&pr->pr_mtx);
+ for (;;) {
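+ /* Drop the requested references; free and delist the prison if none remain. */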
+ if (flags & PD_DEUREF) {
+ pr->pr_uref--;
+ KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0"));
+ }
+ if (flags & PD_DEREF)
+ pr->pr_ref--;
+ /* If the prison still has references, nothing else to do. */
+ if (pr->pr_ref > 0) {
+ mtx_unlock(&pr->pr_mtx);
+ if (flags & PD_LIST_SLOCKED)
+ sx_sunlock(&allprison_lock);
+ else if (flags & PD_LIST_XLOCKED)
+ sx_xunlock(&allprison_lock);
+ return;
+ }
+
+ mtx_unlock(&pr->pr_mtx);
+ if (flags & PD_LIST_SLOCKED) {
+ if (!sx_try_upgrade(&allprison_lock)) {
+ sx_sunlock(&allprison_lock);
+ sx_xlock(&allprison_lock);
+ }
+ } else if (!(flags & PD_LIST_XLOCKED))
+ sx_xlock(&allprison_lock);
+
+ TAILQ_REMOVE(&allprison, pr, pr_list);
+ LIST_REMOVE(pr, pr_sibling);
+ ppr = pr->pr_parent;
+ for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
+ tpr->pr_childcount--;
+ sx_xunlock(&allprison_lock);
+
+#ifdef VIMAGE
+ if (pr->pr_vnet != ppr->pr_vnet)
+ vnet_destroy(pr->pr_vnet);
+#endif
+ if (pr->pr_root != NULL)
+ vrele(pr->pr_root);
+ mtx_destroy(&pr->pr_mtx);
+#ifdef INET
+ free(pr->pr_ip4, M_PRISON);
+#endif
+#ifdef INET6
+ free(pr->pr_ip6, M_PRISON);
+#endif
+ if (pr->pr_cpuset != NULL)
+ cpuset_rel(pr->pr_cpuset);
+ osd_jail_exit(pr);
+#ifdef RACCT
+ prison_racct_detach(pr);
+#endif
+ free(pr, M_PRISON);
+
+ /* Removing a prison frees a reference on its parent. */
+ pr = ppr;
+ mtx_lock(&pr->pr_mtx);
+ flags = PD_DEREF | PD_DEUREF;
+ }
+}
+
+void
+prison_hold_locked(struct prison *pr)
+{
+
+ mtx_assert(&pr->pr_mtx, MA_OWNED);
+ KASSERT(pr->pr_ref > 0,
+ ("Trying to hold dead prison (jid=%d).", pr->pr_id));
+ pr->pr_ref++;
+}
+
+void
+prison_hold(struct prison *pr)
+{
+
+ mtx_lock(&pr->pr_mtx);
+ prison_hold_locked(pr);
+ mtx_unlock(&pr->pr_mtx);
+}
+
+void
+prison_proc_hold(struct prison *pr)
+{
+
+ mtx_lock(&pr->pr_mtx);
+ KASSERT(pr->pr_uref > 0,
+ ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
+ pr->pr_uref++;
+ mtx_unlock(&pr->pr_mtx);
+}
+
+void
+prison_proc_free(struct prison *pr)
+{
+
+ mtx_lock(&pr->pr_mtx);
+ KASSERT(pr->pr_uref > 0,
+ ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
+ prison_deref(pr, PD_DEUREF | PD_LOCKED);
+}
+
+
+#ifdef INET
+/*
+ * Restrict a prison's IP address list with its parent's, possibly replacing
+ * it. Return true if the replacement buffer was used (or would have been).
+ */
+static int
+prison_restrict_ip4(struct prison *pr, struct in_addr *newip4)
+{
+ int ii, ij, used;
+ struct prison *ppr;
+
+ ppr = pr->pr_parent;
+ if (!(pr->pr_flags & PR_IP4_USER)) {
+ /* This has no user settings, so just copy the parent's list. */
+ if (pr->pr_ip4s < ppr->pr_ip4s) {
+ /*
+ * There's no room for the parent's list. Use the
+ * new list buffer, which is assumed to be big enough
+ * (if it was passed). If there's no buffer, try to
+ * allocate one.
+ */
+ used = 1;
+ if (newip4 == NULL) {
+ newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4),
+ M_PRISON, M_NOWAIT);
+ if (newip4 != NULL)
+ used = 0;
+ }
+ if (newip4 != NULL) {
+ bcopy(ppr->pr_ip4, newip4,
+ ppr->pr_ip4s * sizeof(*newip4));
+ free(pr->pr_ip4, M_PRISON);
+ pr->pr_ip4 = newip4;
+ pr->pr_ip4s = ppr->pr_ip4s;
+ }
+ return (used);
+ }
+ pr->pr_ip4s = ppr->pr_ip4s;
+ if (pr->pr_ip4s > 0)
+ bcopy(ppr->pr_ip4, pr->pr_ip4,
+ pr->pr_ip4s * sizeof(*newip4));
+ else if (pr->pr_ip4 != NULL) {
+ free(pr->pr_ip4, M_PRISON);
+ pr->pr_ip4 = NULL;
+ }
+ } else if (pr->pr_ip4s > 0) {
+ /* Remove addresses that aren't in the parent. */
+ for (ij = 0; ij < ppr->pr_ip4s; ij++)
+ if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
+ break;
+ if (ij < ppr->pr_ip4s)
+ ii = 1;
+ else {
+ bcopy(pr->pr_ip4 + 1, pr->pr_ip4,
+ --pr->pr_ip4s * sizeof(*pr->pr_ip4));
+ ii = 0;
+ }
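+ /* Merge-walk both sorted lists, deleting addresses not found in the parent. */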
+ for (ij = 1; ii < pr->pr_ip4s; ) {
+ if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) {
+ ii++;
+ continue;
+ }
+ switch (ij >= ppr->pr_ip4s ? -1 :
+ qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) {
+ case -1:
+ bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii,
+ (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4));
+ break;
+ case 0:
+ ii++;
+ ij++;
+ break;
+ case 1:
+ ij++;
+ break;
+ }
+ }
+ if (pr->pr_ip4s == 0) {
+ pr->pr_flags |= PR_IP4_DISABLE;
+ free(pr->pr_ip4, M_PRISON);
+ pr->pr_ip4 = NULL;
+ }
+ }
+ return (0);
+}
+
+/*
+ * Pass back primary IPv4 address of this jail.
+ *
+ * If not restricted return success but do not alter the address. Caller has
+ * to make sure to initialize it correctly (e.g. INADDR_ANY).
+ *
+ * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
+ * Address returned in NBO.
+ */
+int
+prison_get_ip4(struct ucred *cred, struct in_addr *ia)
+{
+ struct prison *pr;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
+
+ pr = cred->cr_prison;
+ if (!(pr->pr_flags & PR_IP4))
+ return (0);
+ mtx_lock(&pr->pr_mtx);
+ if (!(pr->pr_flags & PR_IP4)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ if (pr->pr_ip4 == NULL) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EAFNOSUPPORT);
+ }
+
+ ia->s_addr = pr->pr_ip4[0].s_addr;
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+}
+
+/*
+ * Return 1 if we should do proper source address selection or are not jailed.
+ * We will return 0 if we should bypass source address selection in favour
+ * of the primary jail IPv4 address. Only in this case is *ia updated and
+ * returned in NBO.
+ * Return EAFNOSUPPORT if this jail does not allow IPv4.
+ */
+int
+prison_saddrsel_ip4(struct ucred *cred, struct in_addr *ia)
+{
+ struct prison *pr;
+ struct in_addr lia;
+ int error;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
+
+ if (!jailed(cred))
+ return (1);
+
+ pr = cred->cr_prison;
+ if (pr->pr_flags & PR_IP4_SADDRSEL)
+ return (1);
+
+ lia.s_addr = INADDR_ANY;
+ error = prison_get_ip4(cred, &lia);
+ if (error)
+ return (error);
+ if (lia.s_addr == INADDR_ANY)
+ return (1);
+
+ ia->s_addr = lia.s_addr;
+ return (0);
+}
+
+/*
+ * Return true if pr1 and pr2 have the same IPv4 address restrictions.
+ */
+int
+prison_equal_ip4(struct prison *pr1, struct prison *pr2)
+{
+
+ if (pr1 == pr2)
+ return (1);
+
+ /*
+ * No need to lock since the PR_IP4_USER flag can't be altered for
+ * existing prisons.
+ */
+ while (pr1 != &prison0 &&
+#ifdef VIMAGE
+ !(pr1->pr_flags & PR_VNET) &&
+#endif
+ !(pr1->pr_flags & PR_IP4_USER))
+ pr1 = pr1->pr_parent;
+ while (pr2 != &prison0 &&
+#ifdef VIMAGE
+ !(pr2->pr_flags & PR_VNET) &&
+#endif
+ !(pr2->pr_flags & PR_IP4_USER))
+ pr2 = pr2->pr_parent;
+ return (pr1 == pr2);
+}
+
+/*
+ * Make sure our (source) address is set to something meaningful to this
+ * jail.
+ *
+ * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
+ * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
+ * doesn't allow IPv4. The address is passed in and returned in NBO.
+ */
+int
+prison_local_ip4(struct ucred *cred, struct in_addr *ia)
+{
+ struct prison *pr;
+ struct in_addr ia0;
+ int error;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
+
+ pr = cred->cr_prison;
+ if (!(pr->pr_flags & PR_IP4))
+ return (0);
+ mtx_lock(&pr->pr_mtx);
+ if (!(pr->pr_flags & PR_IP4)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ if (pr->pr_ip4 == NULL) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EAFNOSUPPORT);
+ }
+
+ ia0.s_addr = ntohl(ia->s_addr);
+ if (ia0.s_addr == INADDR_LOOPBACK) {
+ ia->s_addr = pr->pr_ip4[0].s_addr;
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+
+ if (ia0.s_addr == INADDR_ANY) {
+ /*
+ * In case there is only 1 IPv4 address, bind directly.
+ */
+ if (pr->pr_ip4s == 1)
+ ia->s_addr = pr->pr_ip4[0].s_addr;
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+
+ error = _prison_check_ip4(pr, ia);
+ mtx_unlock(&pr->pr_mtx);
+ return (error);
+}
+
+/*
+ * Rewrite destination address in case we will connect to loopback address.
+ *
+ * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
+ * The address is passed in and returned in NBO.
+ */
+int
+prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
+{
+ struct prison *pr;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
+
+ pr = cred->cr_prison;
+ if (!(pr->pr_flags & PR_IP4))
+ return (0);
+ mtx_lock(&pr->pr_mtx);
+ if (!(pr->pr_flags & PR_IP4)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ if (pr->pr_ip4 == NULL) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EAFNOSUPPORT);
+ }
+
+ if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
+ ia->s_addr = pr->pr_ip4[0].s_addr;
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+
+ /*
+ * Return success because nothing had to be changed.
+ */
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+}
+
+/*
+ * Check if given address belongs to the jail referenced by cred/prison.
+ *
+ * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
+ * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
+ * doesn't allow IPv4. The address is passed in NBO.
+ */
+static int
+_prison_check_ip4(struct prison *pr, struct in_addr *ia)
+{
+ int i, a, z, d;
+
+ /*
+ * Check the primary IP.
+ */
+ if (pr->pr_ip4[0].s_addr == ia->s_addr)
+ return (0);
+
+ /*
+ * All the other IPs are sorted so we can do a binary search.
+ */
+ a = 0;
+ z = pr->pr_ip4s - 2;
+ while (a <= z) {
+ i = (a + z) / 2;
+ d = qcmp_v4(&pr->pr_ip4[i+1], ia);
+ if (d > 0)
+ z = i - 1;
+ else if (d < 0)
+ a = i + 1;
+ else
+ return (0);
+ }
+
+ return (EADDRNOTAVAIL);
+}
+
+int
+prison_check_ip4(struct ucred *cred, struct in_addr *ia)
+{
+ struct prison *pr;
+ int error;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
+
+ pr = cred->cr_prison;
+ if (!(pr->pr_flags & PR_IP4))
+ return (0);
+ mtx_lock(&pr->pr_mtx);
+ if (!(pr->pr_flags & PR_IP4)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ if (pr->pr_ip4 == NULL) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EAFNOSUPPORT);
+ }
+
+ error = _prison_check_ip4(pr, ia);
+ mtx_unlock(&pr->pr_mtx);
+ return (error);
+}
+#endif
+
+#ifdef INET6
+static int
+prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6)
+{
+ int ii, ij, used;
+ struct prison *ppr;
+
+ ppr = pr->pr_parent;
+ if (!(pr->pr_flags & PR_IP6_USER)) {
+ /* This has no user settings, so just copy the parent's list. */
+ if (pr->pr_ip6s < ppr->pr_ip6s) {
+ /*
+ * There's no room for the parent's list. Use the
+ * new list buffer, which is assumed to be big enough
+ * (if it was passed). If there's no buffer, try to
+ * allocate one.
+ */
+ used = 1;
+ if (newip6 == NULL) {
+ newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6),
+ M_PRISON, M_NOWAIT);
+ if (newip6 != NULL)
+ used = 0;
+ }
+ if (newip6 != NULL) {
+ bcopy(ppr->pr_ip6, newip6,
+ ppr->pr_ip6s * sizeof(*newip6));
+ free(pr->pr_ip6, M_PRISON);
+ pr->pr_ip6 = newip6;
+ pr->pr_ip6s = ppr->pr_ip6s;
+ }
+ return (used);
+ }
+ pr->pr_ip6s = ppr->pr_ip6s;
+ if (pr->pr_ip6s > 0)
+ bcopy(ppr->pr_ip6, pr->pr_ip6,
+ pr->pr_ip6s * sizeof(*newip6));
+ else if (pr->pr_ip6 != NULL) {
+ free(pr->pr_ip6, M_PRISON);
+ pr->pr_ip6 = NULL;
+ }
+ } else if (pr->pr_ip6s > 0) {
+ /* Remove addresses that aren't in the parent. */
+ for (ij = 0; ij < ppr->pr_ip6s; ij++)
+ if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0],
+ &ppr->pr_ip6[ij]))
+ break;
+ if (ij < ppr->pr_ip6s)
+ ii = 1;
+ else {
+ bcopy(pr->pr_ip6 + 1, pr->pr_ip6,
+ --pr->pr_ip6s * sizeof(*pr->pr_ip6));
+ ii = 0;
+ }
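+ /* Merge-walk both sorted lists, deleting addresses not found in the parent. */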
+ for (ij = 1; ii < pr->pr_ip6s; ) {
+ if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii],
+ &ppr->pr_ip6[0])) {
+ ii++;
+ continue;
+ }
+ switch (ij >= ppr->pr_ip6s ? -1 :
+ qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
+ case -1:
+ bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
+ (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6));
+ break;
+ case 0:
+ ii++;
+ ij++;
+ break;
+ case 1:
+ ij++;
+ break;
+ }
+ }
+ if (pr->pr_ip6s == 0) {
+ pr->pr_flags |= PR_IP6_DISABLE;
+ free(pr->pr_ip6, M_PRISON);
+ pr->pr_ip6 = NULL;
+ }
+ }
+ return (0);
+}
+
+/*
+ * Pass back primary IPv6 address for this jail.
+ *
+ * If not restricted return success but do not alter the address. Caller has
+ * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
+ *
+ * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
+ */
+int
+prison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
+{
+ struct prison *pr;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
+
+ pr = cred->cr_prison;
+ if (!(pr->pr_flags & PR_IP6))
+ return (0);
+ mtx_lock(&pr->pr_mtx);
+ if (!(pr->pr_flags & PR_IP6)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ if (pr->pr_ip6 == NULL) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EAFNOSUPPORT);
+ }
+
+ bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+}
+
+/*
+ * Return 1 if we should do proper source address selection or are not jailed.
+ * We will return 0 if we should bypass source address selection in favour
+ * of the primary jail IPv6 address. Only in this case is *ia6 updated and
+ * returned in NBO.
+ * Return EAFNOSUPPORT if this jail does not allow IPv6.
+ */
+int
+prison_saddrsel_ip6(struct ucred *cred, struct in6_addr *ia6)
+{
+ struct prison *pr;
+ struct in6_addr lia6;
+ int error;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
+
+ if (!jailed(cred))
+ return (1);
+
+ pr = cred->cr_prison;
+ if (pr->pr_flags & PR_IP6_SADDRSEL)
+ return (1);
+
+ lia6 = in6addr_any;
+ error = prison_get_ip6(cred, &lia6);
+ if (error)
+ return (error);
+ if (IN6_IS_ADDR_UNSPECIFIED(&lia6))
+ return (1);
+
+ bcopy(&lia6, ia6, sizeof(struct in6_addr));
+ return (0);
+}
+
+/*
+ * Return true if pr1 and pr2 have the same IPv6 address restrictions.
+ */
+int
+prison_equal_ip6(struct prison *pr1, struct prison *pr2)
+{
+
+ if (pr1 == pr2)
+ return (1);
+
+ while (pr1 != &prison0 &&
+#ifdef VIMAGE
+ !(pr1->pr_flags & PR_VNET) &&
+#endif
+ !(pr1->pr_flags & PR_IP6_USER))
+ pr1 = pr1->pr_parent;
+ while (pr2 != &prison0 &&
+#ifdef VIMAGE
+ !(pr2->pr_flags & PR_VNET) &&
+#endif
+ !(pr2->pr_flags & PR_IP6_USER))
+ pr2 = pr2->pr_parent;
+ return (pr1 == pr2);
+}
+
+/*
+ * Make sure our (source) address is set to something meaningful to this jail.
+ *
+ * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0)
+ * when needed while binding.
+ *
+ * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
+ * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
+ * doesn't allow IPv6.
+ */
+int
+prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
+{
+ struct prison *pr;
+ int error;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
+
+ pr = cred->cr_prison;
+ if (!(pr->pr_flags & PR_IP6))
+ return (0);
+ mtx_lock(&pr->pr_mtx);
+ if (!(pr->pr_flags & PR_IP6)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ if (pr->pr_ip6 == NULL) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EAFNOSUPPORT);
+ }
+
+ if (IN6_IS_ADDR_LOOPBACK(ia6)) {
+ bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+
+ if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
+ /*
+ * In case there is only 1 IPv6 address, and v6only is true,
+ * then bind directly.
+ */
+ if (v6only != 0 && pr->pr_ip6s == 1)
+ bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+
+ error = _prison_check_ip6(pr, ia6);
+ mtx_unlock(&pr->pr_mtx);
+ return (error);
+}
+
+/*
+ * Rewrite destination address in case we will connect to loopback address.
+ *
+ * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
+ */
+int
+prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
+{
+ struct prison *pr;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
+
+ pr = cred->cr_prison;
+ if (!(pr->pr_flags & PR_IP6))
+ return (0);
+ mtx_lock(&pr->pr_mtx);
+ if (!(pr->pr_flags & PR_IP6)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ if (pr->pr_ip6 == NULL) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EAFNOSUPPORT);
+ }
+
+ if (IN6_IS_ADDR_LOOPBACK(ia6)) {
+ bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+
+ /*
+ * Return success because nothing had to be changed.
+ */
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+}
+
+/*
+ * Check if given address belongs to the jail referenced by cred/prison.
+ *
+ * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
+ * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
+ * doesn't allow IPv6.
+ */
+static int
+_prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
+{
+ int i, a, z, d;
+
+ /*
+ * Check the primary IP.
+ */
+ if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
+ return (0);
+
+ /*
+ * All the other IPs are sorted so we can do a binary search.
+ */
+ a = 0;
+ z = pr->pr_ip6s - 2;
+ while (a <= z) {
+ i = (a + z) / 2;
+ d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
+ if (d > 0)
+ z = i - 1;
+ else if (d < 0)
+ a = i + 1;
+ else
+ return (0);
+ }
+
+ return (EADDRNOTAVAIL);
+}
+
+int
+prison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
+{
+ struct prison *pr;
+ int error;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
+
+ pr = cred->cr_prison;
+ if (!(pr->pr_flags & PR_IP6))
+ return (0);
+ mtx_lock(&pr->pr_mtx);
+ if (!(pr->pr_flags & PR_IP6)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+ }
+ if (pr->pr_ip6 == NULL) {
+ mtx_unlock(&pr->pr_mtx);
+ return (EAFNOSUPPORT);
+ }
+
+ error = _prison_check_ip6(pr, ia6);
+ mtx_unlock(&pr->pr_mtx);
+ return (error);
+}
+#endif
+
+/*
+ * Check if a jail supports the given address family.
+ *
+ * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
+ * if not.
+ */
+int
+prison_check_af(struct ucred *cred, int af)
+{
+ struct prison *pr;
+ int error;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+
+ pr = cred->cr_prison;
+#ifdef VIMAGE
+ /* Prisons with their own network stack are not limited. */
+ if (prison_owns_vnet(cred))
+ return (0);
+#endif
+
+ error = 0;
+ switch (af)
+ {
+#ifdef INET
+ case AF_INET:
+ if (pr->pr_flags & PR_IP4)
+ {
+ mtx_lock(&pr->pr_mtx);
+ if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
+ error = EAFNOSUPPORT;
+ mtx_unlock(&pr->pr_mtx);
+ }
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ if (pr->pr_flags & PR_IP6)
+ {
+ mtx_lock(&pr->pr_mtx);
+ if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
+ error = EAFNOSUPPORT;
+ mtx_unlock(&pr->pr_mtx);
+ }
+ break;
+#endif
+ case AF_LOCAL:
+ case AF_ROUTE:
+ break;
+ default:
+ if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
+ error = EAFNOSUPPORT;
+ }
+ return (error);
+}
+
+/*
+ * Check if given address belongs to the jail referenced by cred (wrapper to
+ * prison_check_ip[46]).
+ *
+ * Returns 0 if jail doesn't restrict the address family or if address belongs
+ * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
+ * the jail doesn't allow the address family. The IPv4 address is passed in NBO.
+ */
+int
+prison_if(struct ucred *cred, struct sockaddr *sa)
+{
+#ifdef INET
+ struct sockaddr_in *sai;
+#endif
+#ifdef INET6
+ struct sockaddr_in6 *sai6;
+#endif
+ int error;
+
+ KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
+ KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
+
+#ifdef VIMAGE
+ if (prison_owns_vnet(cred))
+ return (0);
+#endif
+
+ error = 0;
+ switch (sa->sa_family)
+ {
+#ifdef INET
+ case AF_INET:
+ sai = (struct sockaddr_in *)sa;
+ error = prison_check_ip4(cred, &sai->sin_addr);
+ break;
+#endif
+#ifdef INET6
+ case AF_INET6:
+ sai6 = (struct sockaddr_in6 *)sa;
+ error = prison_check_ip6(cred, &sai6->sin6_addr);
+ break;
+#endif
+ default:
+ if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
+ error = EAFNOSUPPORT;
+ }
+ return (error);
+}
+
+/*
+ * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
+ */
+int
+prison_check(struct ucred *cred1, struct ucred *cred2)
+{
+
+ return ((cred1->cr_prison == cred2->cr_prison ||
+ prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
+}
+
+/*
+ * Return 1 if p2 is a child of p1, otherwise 0.
+ */
+int
+prison_ischild(struct prison *pr1, struct prison *pr2)
+{
+
+ for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
+ if (pr1 == pr2)
+ return (1);
+ return (0);
+}
+
+/*
+ * Return 1 if the passed credential is in a jail, otherwise 0.
+ */
+int
+jailed(struct ucred *cred)
+{
+
+ return (cred->cr_prison != &prison0);
+}
+
+/*
+ * Return 1 if the passed credential is in a jail and that jail does not
+ * have its own virtual network stack, otherwise 0.
+ */
+int
+jailed_without_vnet(struct ucred *cred)
+{
+
+ if (!jailed(cred))
+ return (0);
+#ifdef VIMAGE
+ if (prison_owns_vnet(cred))
+ return (0);
+#endif
+
+ return (1);
+}
+
+/*
+ * Return the correct hostname (domainname, et al) for the passed credential.
+ */
+void
+getcredhostname(struct ucred *cred, char *buf, size_t size)
+{
+ struct prison *pr;
+
+ /*
+ * A NULL credential can be used to shortcut to the physical
+ * system's hostname.
+ */
+ pr = (cred != NULL) ? cred->cr_prison : &prison0;
+ mtx_lock(&pr->pr_mtx);
+ strlcpy(buf, pr->pr_hostname, size);
+ mtx_unlock(&pr->pr_mtx);
+}
+
+void
+getcreddomainname(struct ucred *cred, char *buf, size_t size)
+{
+
+ mtx_lock(&cred->cr_prison->pr_mtx);
+ strlcpy(buf, cred->cr_prison->pr_domainname, size);
+ mtx_unlock(&cred->cr_prison->pr_mtx);
+}
+
+void
+getcredhostuuid(struct ucred *cred, char *buf, size_t size)
+{
+
+ mtx_lock(&cred->cr_prison->pr_mtx);
+ strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
+ mtx_unlock(&cred->cr_prison->pr_mtx);
+}
+
+void
+getcredhostid(struct ucred *cred, unsigned long *hostid)
+{
+
+ mtx_lock(&cred->cr_prison->pr_mtx);
+ *hostid = cred->cr_prison->pr_hostid;
+ mtx_unlock(&cred->cr_prison->pr_mtx);
+}
+
+#ifdef VIMAGE
+/*
+ * Determine whether the prison represented by cred owns
+ * its vnet rather than having it inherited.
+ *
+ * Returns 1 in case the prison owns the vnet, 0 otherwise.
+ */
+int
+prison_owns_vnet(struct ucred *cred)
+{
+
+ /*
+ * vnets cannot be added/removed after jail creation,
+ * so no need to lock here.
+ */
+ return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0);
+}
+#endif
+
+/*
+ * Determine whether the subject represented by cred can "see"
+ * status of a mount point.
+ * Returns: 0 for permitted, ENOENT otherwise.
+ * XXX: This function should be called cr_canseemount() and should be
+ * placed in kern_prot.c.
+ */
+int
+prison_canseemount(struct ucred *cred, struct mount *mp)
+{
+ struct prison *pr;
+ struct statfs *sp;
+ size_t len;
+
+ pr = cred->cr_prison;
+ if (pr->pr_enforce_statfs == 0)
+ return (0);
+ if (pr->pr_root->v_mount == mp)
+ return (0);
+ if (pr->pr_enforce_statfs == 2)
+ return (ENOENT);
+ /*
+ * If jail's chroot directory is set to "/" we should be able to see
+ * all mount-points from inside a jail.
+ * This is an ugly check, but it is the only situation in which the jail's
+ * directory ends with '/'.
+ */
+ if (strcmp(pr->pr_path, "/") == 0)
+ return (0);
+ len = strlen(pr->pr_path);
+ sp = &mp->mnt_stat;
+ if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
+ return (ENOENT);
+ /*
+ * Be sure that we don't have situation where jail's root directory
+ * is "/some/path" and mount point is "/some/pathpath".
+ */
+ if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
+ return (ENOENT);
+ return (0);
+}
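/*
 * Illustrative sketch of the path test above (hypothetical helper, not
 * code from this patch): a mount point is visible when the jail's root
 * path is a prefix of it and the next character is '\0' or '/', which is
 * what rejects "/some/pathpath" for a jail rooted at "/some/path".
 */
#include <string.h>

static int
path_is_under(const char *root, const char *path)
{
	size_t len;

	if (strcmp(root, "/") == 0)		/* everything is under "/" */
		return (1);
	len = strlen(root);
	if (strncmp(root, path, len) != 0)
		return (0);
	return (path[len] == '\0' || path[len] == '/');
}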
+
+void
+prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
+{
+ char jpath[MAXPATHLEN];
+ struct prison *pr;
+ size_t len;
+
+ pr = cred->cr_prison;
+ if (pr->pr_enforce_statfs == 0)
+ return;
+ if (prison_canseemount(cred, mp) != 0) {
+ bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
+ strlcpy(sp->f_mntonname, "[restricted]",
+ sizeof(sp->f_mntonname));
+ return;
+ }
+ if (pr->pr_root->v_mount == mp) {
+ /*
+ * Clear the current buffer data, so we are sure nothing from
+ * the valid path is left there.
+ */
+ bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
+ *sp->f_mntonname = '/';
+ return;
+ }
+ /*
+ * If jail's chroot directory is set to "/" we should be able to see
+ * all mount-points from inside a jail.
+ */
+ if (strcmp(pr->pr_path, "/") == 0)
+ return;
+ len = strlen(pr->pr_path);
+ strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
+ /*
+ * Clear the current buffer data, so we are sure nothing from
+ * the valid path is left there.
+ */
+ bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
+ if (*jpath == '\0') {
+ /* Should never happen. */
+ *sp->f_mntonname = '/';
+ } else {
+ strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
+ }
+}
+
+/*
+ * Check whether permission for a specific privilege is granted within jail.
+ * We have a specific list of accepted privileges; the rest are denied.
+ */
+int
+prison_priv_check(struct ucred *cred, int priv)
+{
+
+ if (!jailed(cred))
+ return (0);
+
+#ifdef VIMAGE
+ /*
+ * Privileges specific to prisons with a virtual network stack.
+ * There might be a duplicate entry here in case the privilege
+ * is only granted conditionally in the legacy jail case.
+ */
+ switch (priv) {
+#ifdef notyet
+ /*
+ * NFS-specific privileges.
+ */
+ case PRIV_NFS_DAEMON:
+ case PRIV_NFS_LOCKD:
+#endif
+ /*
+ * Network stack privileges.
+ */
+ case PRIV_NET_BRIDGE:
+ case PRIV_NET_GRE:
+ case PRIV_NET_BPF:
+ case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */
+ case PRIV_NET_ROUTE:
+ case PRIV_NET_TAP:
+ case PRIV_NET_SETIFMTU:
+ case PRIV_NET_SETIFFLAGS:
+ case PRIV_NET_SETIFCAP:
+ case PRIV_NET_SETIFDESCR:
+ case PRIV_NET_SETIFNAME:
+ case PRIV_NET_SETIFMETRIC:
+ case PRIV_NET_SETIFPHYS:
+ case PRIV_NET_SETIFMAC:
+ case PRIV_NET_ADDMULTI:
+ case PRIV_NET_DELMULTI:
+ case PRIV_NET_HWIOCTL:
+ case PRIV_NET_SETLLADDR:
+ case PRIV_NET_ADDIFGROUP:
+ case PRIV_NET_DELIFGROUP:
+ case PRIV_NET_IFCREATE:
+ case PRIV_NET_IFDESTROY:
+ case PRIV_NET_ADDIFADDR:
+ case PRIV_NET_DELIFADDR:
+ case PRIV_NET_LAGG:
+ case PRIV_NET_GIF:
+ case PRIV_NET_SETIFVNET:
+ case PRIV_NET_SETIFFIB:
+
+ /*
+ * 802.11-related privileges.
+ */
+ case PRIV_NET80211_GETKEY:
+#ifdef notyet
+ case PRIV_NET80211_MANAGE: /* XXX-BZ discuss with sam@ */
+#endif
+
+#ifdef notyet
+ /*
+ * AppleTalk privileges.
+ */
+ case PRIV_NETATALK_RESERVEDPORT:
+
+ /*
+ * ATM privileges.
+ */
+ case PRIV_NETATM_CFG:
+ case PRIV_NETATM_ADD:
+ case PRIV_NETATM_DEL:
+ case PRIV_NETATM_SET:
+
+ /*
+ * Bluetooth privileges.
+ */
+ case PRIV_NETBLUETOOTH_RAW:
+#endif
+
+ /*
+ * Netgraph and netgraph module privileges.
+ */
+ case PRIV_NETGRAPH_CONTROL:
+#ifdef notyet
+ case PRIV_NETGRAPH_TTY:
+#endif
+
+ /*
+ * IPv4 and IPv6 privileges.
+ */
+ case PRIV_NETINET_IPFW:
+ case PRIV_NETINET_DIVERT:
+ case PRIV_NETINET_PF:
+ case PRIV_NETINET_DUMMYNET:
+ case PRIV_NETINET_CARP:
+ case PRIV_NETINET_MROUTE:
+ case PRIV_NETINET_RAW:
+ case PRIV_NETINET_ADDRCTRL6:
+ case PRIV_NETINET_ND6:
+ case PRIV_NETINET_SCOPE6:
+ case PRIV_NETINET_ALIFETIME6:
+ case PRIV_NETINET_IPSEC:
+ case PRIV_NETINET_BINDANY:
+
+#ifdef notyet
+ /*
+ * IPX/SPX privileges.
+ */
+ case PRIV_NETIPX_RESERVEDPORT:
+ case PRIV_NETIPX_RAW:
+
+ /*
+ * NCP privileges.
+ */
+ case PRIV_NETNCP:
+
+ /*
+ * SMB privileges.
+ */
+ case PRIV_NETSMB:
+#endif
+
+ /*
+ * No default: or deny here.
+ * If the privilege is not permitted, fall through to the next switch().
+ */
+ if (cred->cr_prison->pr_flags & PR_VNET)
+ return (0);
+ }
+#endif /* VIMAGE */
+
+ switch (priv) {
+
+ /*
+ * Allow ktrace privileges for root in jail.
+ */
+ case PRIV_KTRACE:
+
+#if 0
+ /*
+ * Allow jailed processes to configure audit identity and
+ * submit audit records (login, etc). In the future we may
+ * want to further refine the relationship between audit and
+ * jail.
+ */
+ case PRIV_AUDIT_GETAUDIT:
+ case PRIV_AUDIT_SETAUDIT:
+ case PRIV_AUDIT_SUBMIT:
+#endif
+
+ /*
+ * Allow jailed processes to manipulate process UNIX
+ * credentials in any way they see fit.
+ */
+ case PRIV_CRED_SETUID:
+ case PRIV_CRED_SETEUID:
+ case PRIV_CRED_SETGID:
+ case PRIV_CRED_SETEGID:
+ case PRIV_CRED_SETGROUPS:
+ case PRIV_CRED_SETREUID:
+ case PRIV_CRED_SETREGID:
+ case PRIV_CRED_SETRESUID:
+ case PRIV_CRED_SETRESGID:
+
+ /*
+ * Jail implements visibility constraints already, so allow
+ * jailed root to override uid/gid-based constraints.
+ */
+ case PRIV_SEEOTHERGIDS:
+ case PRIV_SEEOTHERUIDS:
+
+ /*
+ * Jail implements inter-process debugging limits already, so
+ * allow jailed root various debugging privileges.
+ */
+ case PRIV_DEBUG_DIFFCRED:
+ case PRIV_DEBUG_SUGID:
+ case PRIV_DEBUG_UNPRIV:
+
+ /*
+ * Allow jail to set various resource limits and login
+ * properties, and for now, exceed process resource limits.
+ */
+ case PRIV_PROC_LIMIT:
+ case PRIV_PROC_SETLOGIN:
+ case PRIV_PROC_SETRLIMIT:
+
+ /*
+ * System V and POSIX IPC privileges are granted in jail.
+ */
+ case PRIV_IPC_READ:
+ case PRIV_IPC_WRITE:
+ case PRIV_IPC_ADMIN:
+ case PRIV_IPC_MSGSIZE:
+ case PRIV_MQ_ADMIN:
+
+ /*
+ * Jail operations within a jail work on child jails.
+ */
+ case PRIV_JAIL_ATTACH:
+ case PRIV_JAIL_SET:
+ case PRIV_JAIL_REMOVE:
+
+ /*
+ * Jail implements its own inter-process limits, so allow
+ * root processes in jail to change scheduling on other
+ * processes in the same jail. Likewise for signalling.
+ */
+ case PRIV_SCHED_DIFFCRED:
+ case PRIV_SCHED_CPUSET:
+ case PRIV_SIGNAL_DIFFCRED:
+ case PRIV_SIGNAL_SUGID:
+
+ /*
+ * Allow jailed processes to write to sysctls marked as jail
+ * writable.
+ */
+ case PRIV_SYSCTL_WRITEJAIL:
+
+ /*
+ * Allow root in jail to manage a variety of quota
+ * properties. These should likely be conditional on a
+ * configuration option.
+ */
+ case PRIV_VFS_GETQUOTA:
+ case PRIV_VFS_SETQUOTA:
+
+ /*
+ * Since Jail relies on chroot() to implement file system
+ * protections, grant many VFS privileges to root in jail.
+ * Be careful to exclude mount-related and NFS-related
+ * privileges.
+ */
+ case PRIV_VFS_READ:
+ case PRIV_VFS_WRITE:
+ case PRIV_VFS_ADMIN:
+ case PRIV_VFS_EXEC:
+ case PRIV_VFS_LOOKUP:
+ case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */
+ case PRIV_VFS_CHFLAGS_DEV:
+ case PRIV_VFS_CHOWN:
+ case PRIV_VFS_CHROOT:
+ case PRIV_VFS_RETAINSUGID:
+ case PRIV_VFS_FCHROOT:
+ case PRIV_VFS_LINK:
+ case PRIV_VFS_SETGID:
+ case PRIV_VFS_STAT:
+ case PRIV_VFS_STICKYFILE:
+
+ /*
+ * As in the non-jail case, non-root users are expected to be
+ * able to read kernel/physical memory (provided /dev/[k]mem
+ * exists in the jail and they have permission to access it).
+ */
+ case PRIV_KMEM_READ:
+ return (0);
+
+ /*
+ * Depending on the global setting, allow privilege of
+ * setting system flags.
+ */
+ case PRIV_VFS_SYSFLAGS:
+ if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
+ return (0);
+ else
+ return (EPERM);
+
+ /*
+ * Depending on the global setting, allow privilege of
+ * mounting/unmounting file systems.
+ */
+ case PRIV_VFS_MOUNT:
+ case PRIV_VFS_UNMOUNT:
+ case PRIV_VFS_MOUNT_NONUSER:
+ case PRIV_VFS_MOUNT_OWNER:
+ if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT &&
+ cred->cr_prison->pr_enforce_statfs < 2)
+ return (0);
+ else
+ return (EPERM);
+
+ /*
+ * Allow jailed root to bind reserved ports and reuse in-use
+ * ports.
+ */
+ case PRIV_NETINET_RESERVEDPORT:
+ case PRIV_NETINET_REUSEPORT:
+ return (0);
+
+ /*
+ * Allow jailed root to set certain IPv4/6 (option) headers.
+ */
+ case PRIV_NETINET_SETHDROPTS:
+ return (0);
+
+ /*
+ * Conditionally allow creating raw sockets in jail.
+ */
+ case PRIV_NETINET_RAW:
+ if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
+ return (0);
+ else
+ return (EPERM);
+
+ /*
+ * Since jail implements its own visibility limits on netstat
+ * sysctls, allow getcred. This allows identd to work in
+ * jail.
+ */
+ case PRIV_NETINET_GETCRED:
+ return (0);
+
+ /*
+ * Allow jailed root to set loginclass.
+ */
+ case PRIV_PROC_SETLOGINCLASS:
+ return (0);
+
+ default:
+ /*
+ * In all remaining cases, deny the privilege request. This
+ * includes almost all network privileges and many system
+ * configuration privileges.
+ */
+ return (EPERM);
+ }
+}
+
+/*
+ * Return the part of pr2's name that is relative to pr1, or the whole name
+ * if it does not directly follow.
+ */
+char *
+prison_name(struct prison *pr1, struct prison *pr2)
+{
+ char *name;
+
+ /* Jails see themselves as "0" (if they see themselves at all). */
+ if (pr1 == pr2)
+ return "0";
+ name = pr2->pr_name;
+ if (prison_ischild(pr1, pr2)) {
+ /*
+ * pr1 isn't locked (and allprison_lock may not be either)
+ * so its length can't be counted on. But the number of dots
+ * can be counted on - and counted.
+ */
+ for (; pr1 != &prison0; pr1 = pr1->pr_parent)
+ name = strchr(name, '.') + 1;
+ }
+ return (name);
+}
+
+/*
+ * Return the part of pr2's path that is relative to pr1, or the whole path
+ * if it does not directly follow.
+ */
+static char *
+prison_path(struct prison *pr1, struct prison *pr2)
+{
+ char *path1, *path2;
+ int len1;
+
+ path1 = pr1->pr_path;
+ path2 = pr2->pr_path;
+ if (!strcmp(path1, "/"))
+ return (path2);
+ len1 = strlen(path1);
+ if (strncmp(path1, path2, len1))
+ return (path2);
+ if (path2[len1] == '\0')
+ return "/";
+ if (path2[len1] == '/')
+ return (path2 + len1);
+ return (path2);
+}
+
+
+/*
+ * Jail-related sysctls.
+ */
+static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
+ "Jails");
+
+static int
+sysctl_jail_list(SYSCTL_HANDLER_ARGS)
+{
+ struct xprison *xp;
+ struct prison *pr, *cpr;
+#ifdef INET
+ struct in_addr *ip4 = NULL;
+ int ip4s = 0;
+#endif
+#ifdef INET6
+ struct in6_addr *ip6 = NULL;
+ int ip6s = 0;
+#endif
+ int descend, error;
+
+ xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
+ pr = req->td->td_ucred->cr_prison;
+ error = 0;
+ sx_slock(&allprison_lock);
+ FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
+#if defined(INET) || defined(INET6)
+ again:
+#endif
+ mtx_lock(&cpr->pr_mtx);
+#ifdef INET
+ if (cpr->pr_ip4s > 0) {
+ if (ip4s < cpr->pr_ip4s) {
+ ip4s = cpr->pr_ip4s;
+ mtx_unlock(&cpr->pr_mtx);
+ ip4 = realloc(ip4, ip4s *
+ sizeof(struct in_addr), M_TEMP, M_WAITOK);
+ goto again;
+ }
+ bcopy(cpr->pr_ip4, ip4,
+ cpr->pr_ip4s * sizeof(struct in_addr));
+ }
+#endif
+#ifdef INET6
+ if (cpr->pr_ip6s > 0) {
+ if (ip6s < cpr->pr_ip6s) {
+ ip6s = cpr->pr_ip6s;
+ mtx_unlock(&cpr->pr_mtx);
+ ip6 = realloc(ip6, ip6s *
+ sizeof(struct in6_addr), M_TEMP, M_WAITOK);
+ goto again;
+ }
+ bcopy(cpr->pr_ip6, ip6,
+ cpr->pr_ip6s * sizeof(struct in6_addr));
+ }
+#endif
+ if (cpr->pr_ref == 0) {
+ mtx_unlock(&cpr->pr_mtx);
+ continue;
+ }
+ bzero(xp, sizeof(*xp));
+ xp->pr_version = XPRISON_VERSION;
+ xp->pr_id = cpr->pr_id;
+ xp->pr_state = cpr->pr_uref > 0
+ ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
+ strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
+ strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
+ strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
+#ifdef INET
+ xp->pr_ip4s = cpr->pr_ip4s;
+#endif
+#ifdef INET6
+ xp->pr_ip6s = cpr->pr_ip6s;
+#endif
+ mtx_unlock(&cpr->pr_mtx);
+ error = SYSCTL_OUT(req, xp, sizeof(*xp));
+ if (error)
+ break;
+#ifdef INET
+ if (xp->pr_ip4s > 0) {
+ error = SYSCTL_OUT(req, ip4,
+ xp->pr_ip4s * sizeof(struct in_addr));
+ if (error)
+ break;
+ }
+#endif
+#ifdef INET6
+ if (xp->pr_ip6s > 0) {
+ error = SYSCTL_OUT(req, ip6,
+ xp->pr_ip6s * sizeof(struct in6_addr));
+ if (error)
+ break;
+ }
+#endif
+ }
+ sx_sunlock(&allprison_lock);
+ free(xp, M_TEMP);
+#ifdef INET
+ free(ip4, M_TEMP);
+#endif
+#ifdef INET6
+ free(ip6, M_TEMP);
+#endif
+ return (error);
+}
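/*
 * Illustrative sketch of the buffer-sizing loop above (hypothetical
 * userspace names, not code from this patch): the address buffer cannot
 * be grown while the prison mutex is held because the allocator may
 * sleep, so the code drops the lock, grows the buffer, and retries from
 * "again", since the counts may have changed in the meantime.
 */
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct counted { pthread_mutex_t lock; int *data; int count; };

static int *
snapshot_counted(struct counted *c, int **bufp, int *bufcntp)
{
again:
	pthread_mutex_lock(&c->lock);
	if (*bufcntp < c->count) {
		int want = c->count;
		int *p;

		/* A blocking allocator must not run with the lock held. */
		pthread_mutex_unlock(&c->lock);
		p = realloc(*bufp, want * sizeof(int));
		if (p == NULL)
			return (NULL);
		*bufp = p;
		*bufcntp = want;
		goto again;	/* the count may have grown meanwhile */
	}
	if (c->count > 0)
		memcpy(*bufp, c->data, c->count * sizeof(int));
	pthread_mutex_unlock(&c->lock);
	return (*bufp);
}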
+
+SYSCTL_OID(_security_jail, OID_AUTO, list,
+ CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_jail_list, "S", "List of active jails");
+
+static int
+sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
+{
+ int error, injail;
+
+ injail = jailed(req->td->td_ucred);
+ error = SYSCTL_OUT(req, &injail, sizeof(injail));
+
+ return (error);
+}
+
+SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
+ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_jail_jailed, "I", "Process in jail?");
+
+static int
+sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
+{
+ int error, havevnet;
+#ifdef VIMAGE
+ struct ucred *cred = req->td->td_ucred;
+
+ havevnet = jailed(cred) && prison_owns_vnet(cred);
+#else
+ havevnet = 0;
+#endif
+ error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
+
+ return (error);
+}
+
+SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
+ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_jail_vnet, "I", "Jail owns VNET?");
+
+#if defined(INET) || defined(INET6)
+SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
+ &jail_max_af_ips, 0,
+ "Number of IP addresses a jail may have at most per address family");
+#endif
+
+/*
+ * Default parameters for jail(2) compatibility. For historical reasons,
+ * the sysctl names have varying similarity to the parameter names. Prisons
+ * just see their own parameters, and can't change them.
+ */
+static int
+sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
+{
+ struct prison *pr;
+ int allow, error, i;
+
+ pr = req->td->td_ucred->cr_prison;
+ allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
+
+ /* Get the current flag value, and convert it to a boolean. */
+ i = (allow & arg2) ? 1 : 0;
+ if (arg1 != NULL)
+ i = !i;
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ i = i ? arg2 : 0;
+ if (arg1 != NULL)
+ i ^= arg2;
+ /*
+ * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
+ * for writing.
+ */
+ mtx_lock(&prison0.pr_mtx);
+ jail_default_allow = (jail_default_allow & ~arg2) | i;
+ mtx_unlock(&prison0.pr_mtx);
+ return (0);
+}
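/*
 * Illustrative sketch of the bit/boolean conversion done by the handler
 * above (hypothetical helpers, not code from this patch): arg2 selects
 * the PR_ALLOW_* bit, and a non-NULL arg1 inverts the sense, so "allow"
 * bits can back sysctls with "deny"-style names.
 */
static int
allow_flag_to_bool(unsigned allow, unsigned bit, int invert)
{
	int i = (allow & bit) ? 1 : 0;

	return (invert ? !i : i);
}

static unsigned
allow_bool_to_flags(unsigned allow, unsigned bit, int invert, int val)
{
	unsigned i = val ? bit : 0;

	if (invert)
		i ^= bit;	/* flip the sense back before merging */
	return ((allow & ~bit) | i);
}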
+
+SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
+ "Processes in jail can set their hostnames");
+SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
+ "Processes in jail are limited to creating UNIX/IP/route sockets only");
+SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
+ "Processes in jail can use System V IPC primitives");
+SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
+ "Prison root can create raw sockets");
+SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
+ "Processes in jail can alter system file flags");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount/unmount jail-friendly file systems");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_devfs_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT_DEVFS, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount the devfs file system");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_nullfs_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT_NULLFS, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount the nullfs file system");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount the procfs file system");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount the tmpfs file system");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_zfs_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT_ZFS, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount the zfs file system");
+
+static int
+sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
+{
+ struct prison *pr;
+ int level, error;
+
+ pr = req->td->td_ucred->cr_prison;
+ level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
+ error = sysctl_handle_int(oidp, &level, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ *(int *)arg1 = level;
+ return (0);
+}
+
+SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
+ sysctl_jail_default_level, "I",
+ "Processes in jail cannot see all mounted file systems");
+
+SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
+ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
+ sysctl_jail_default_level, "I",
+ "Ruleset for the devfs filesystem in jail");
+
+/*
+ * Nodes to describe jail parameters. Maximum length of string parameters
+ * is returned in the string itself, and the other parameters exist merely
+ * to make themselves and their types known.
+ */
+SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
+ "Jail parameters");
+
+int
+sysctl_jail_param(SYSCTL_HANDLER_ARGS)
+{
+ int i;
+ long l;
+ size_t s;
+ char numbuf[12];
+
+ switch (oidp->oid_kind & CTLTYPE)
+ {
+ case CTLTYPE_LONG:
+ case CTLTYPE_ULONG:
+ l = 0;
+#ifdef SCTL_MASK32
+ if (!(req->flags & SCTL_MASK32))
+#endif
+ return (SYSCTL_OUT(req, &l, sizeof(l)));
+ case CTLTYPE_INT:
+ case CTLTYPE_UINT:
+ i = 0;
+ return (SYSCTL_OUT(req, &i, sizeof(i)));
+ case CTLTYPE_STRING:
+ snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
+ return
+ (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
+ case CTLTYPE_STRUCT:
+ s = (size_t)arg2;
+ return (SYSCTL_OUT(req, &s, sizeof(s)));
+ }
+ return (0);
+}
+
+SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
+SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
+SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
+SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
+SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
+ "I", "Jail secure level");
+SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
+ "I", "Jail cannot see all mounted file systems");
+SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
+ "I", "Ruleset for in-jail devfs mounts");
+SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail persistence");
+#ifdef VIMAGE
+SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
+ "E,jailsys", "Virtual network stack");
+#endif
+SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
+ "B", "Jail is in the process of shutting down");
+
+SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
+SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
+ "I", "Current number of child jails");
+SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
+ "I", "Maximum number of child jails");
+
+SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
+SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
+ "Jail hostname");
+SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
+ "Jail NIS domainname");
+SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
+ "Jail host UUID");
+SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
+ "LU", "Jail host ID");
+
+SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
+SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
+
+#ifdef INET
+SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
+ "Jail IPv4 address virtualization");
+SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
+ "S,in_addr,a", "Jail IPv4 addresses");
+SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Do (not) use IPv4 source address selection rather than the "
+ "primary jail IPv4 address.");
+#endif
+#ifdef INET6
+SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
+ "Jail IPv6 address virtualization");
+SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
+ "S,in6_addr,a", "Jail IPv6 addresses");
+SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Do (not) use IPv6 source address selection rather than the "
+ "primary jail IPv6 address.");
+#endif
+
+SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
+SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may set hostname");
+SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may use SYSV IPC");
+SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may create raw sockets");
+SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may alter system file flags");
+SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may set file quotas");
+SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
+
+SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
+SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount/unmount jail-friendly file systems in general");
+SYSCTL_JAIL_PARAM(_allow_mount, devfs, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount the devfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount the nullfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount the procfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount the tmpfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount the zfs file system");
+
+void
+prison_racct_foreach(void (*callback)(struct racct *racct,
+ void *arg2, void *arg3), void *arg2, void *arg3)
+{
+ struct prison_racct *prr;
+
+ sx_slock(&allprison_lock);
+ LIST_FOREACH(prr, &allprison_racct, prr_next)
+ (callback)(prr->prr_racct, arg2, arg3);
+ sx_sunlock(&allprison_lock);
+}
+
+static struct prison_racct *
+prison_racct_find_locked(const char *name)
+{
+ struct prison_racct *prr;
+
+ sx_assert(&allprison_lock, SA_XLOCKED);
+
+ if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
+ return (NULL);
+
+ LIST_FOREACH(prr, &allprison_racct, prr_next) {
+ if (strcmp(name, prr->prr_name) != 0)
+ continue;
+
+ /* Found prison_racct with a matching name? */
+ prison_racct_hold(prr);
+ return (prr);
+ }
+
+ /* Add new prison_racct. */
+ prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
+ racct_create(&prr->prr_racct);
+
+ strcpy(prr->prr_name, name);
+ refcount_init(&prr->prr_refcount, 1);
+ LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
+
+ return (prr);
+}
+
+struct prison_racct *
+prison_racct_find(const char *name)
+{
+ struct prison_racct *prr;
+
+ sx_xlock(&allprison_lock);
+ prr = prison_racct_find_locked(name);
+ sx_xunlock(&allprison_lock);
+ return (prr);
+}
+
+void
+prison_racct_hold(struct prison_racct *prr)
+{
+
+ refcount_acquire(&prr->prr_refcount);
+}
+
+static void
+prison_racct_free_locked(struct prison_racct *prr)
+{
+
+ sx_assert(&allprison_lock, SA_XLOCKED);
+
+ if (refcount_release(&prr->prr_refcount)) {
+ racct_destroy(&prr->prr_racct);
+ LIST_REMOVE(prr, prr_next);
+ free(prr, M_PRISON_RACCT);
+ }
+}
+
+void
+prison_racct_free(struct prison_racct *prr)
+{
+ int old;
+
+ sx_assert(&allprison_lock, SA_UNLOCKED);
+
+ old = prr->prr_refcount;
+ if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1))
+ return;
+
+ sx_xlock(&allprison_lock);
+ prison_racct_free_locked(prr);
+ sx_xunlock(&allprison_lock);
+}
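/*
 * Illustrative sketch of the release pattern above (C11 atomics,
 * hypothetical names, not code from this patch): decrement without the
 * lock while more than one reference remains, and take the list lock
 * only when this may be the final release that has to unlink and free.
 */
#include <stdatomic.h>
#include <pthread.h>

struct refobj { _Atomic unsigned refs; };

static pthread_mutex_t refobj_list_lock = PTHREAD_MUTEX_INITIALIZER;

static void
refobj_destroy_locked(struct refobj *o)
{

	/* Unlink from the containing list and free; elided in this sketch. */
	(void)o;
}

static void
refobj_release(struct refobj *o)
{
	unsigned old = atomic_load(&o->refs);

	/* Fast path: not the last reference, no lock needed. */
	while (old > 1) {
		if (atomic_compare_exchange_weak(&o->refs, &old, old - 1))
			return;
		/* A failed exchange reloaded 'old'; retry. */
	}

	/* Slow path: possibly the last reference. */
	pthread_mutex_lock(&refobj_list_lock);
	if (atomic_fetch_sub(&o->refs, 1) == 1)
		refobj_destroy_locked(o);
	pthread_mutex_unlock(&refobj_list_lock);
}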
+
+#ifdef RACCT
+static void
+prison_racct_attach(struct prison *pr)
+{
+ struct prison_racct *prr;
+
+ sx_assert(&allprison_lock, SA_XLOCKED);
+
+ prr = prison_racct_find_locked(pr->pr_name);
+ KASSERT(prr != NULL, ("cannot find prison_racct"));
+
+ pr->pr_prison_racct = prr;
+}
+
+/*
+ * Handle jail renaming. From the racct point of view, renaming means
+ * moving from one prison_racct to another.
+ */
+static void
+prison_racct_modify(struct prison *pr)
+{
+ struct proc *p;
+ struct ucred *cred;
+ struct prison_racct *oldprr;
+
+ sx_slock(&allproc_lock);
+ sx_xlock(&allprison_lock);
+
+ if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
+ sx_xunlock(&allprison_lock);
+ sx_sunlock(&allproc_lock);
+ return;
+ }
+
+ oldprr = pr->pr_prison_racct;
+ pr->pr_prison_racct = NULL;
+
+ prison_racct_attach(pr);
+
+ /*
+ * Move resource utilisation records.
+ */
+ racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
+
+ /*
+ * Force rctl to reattach rules to processes.
+ */
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ cred = crhold(p->p_ucred);
+ PROC_UNLOCK(p);
+ racct_proc_ucred_changed(p, cred, cred);
+ crfree(cred);
+ }
+
+ sx_sunlock(&allproc_lock);
+ prison_racct_free_locked(oldprr);
+ sx_xunlock(&allprison_lock);
+}
+
+static void
+prison_racct_detach(struct prison *pr)
+{
+
+ sx_assert(&allprison_lock, SA_UNLOCKED);
+
+ if (pr->pr_prison_racct == NULL)
+ return;
+ prison_racct_free(pr->pr_prison_racct);
+ pr->pr_prison_racct = NULL;
+}
+#endif /* RACCT */
+
+#ifdef DDB
+
+static void
+db_show_prison(struct prison *pr)
+{
+ int fi;
+#if defined(INET) || defined(INET6)
+ int ii;
+#endif
+ unsigned jsf;
+#ifdef INET6
+ char ip6buf[INET6_ADDRSTRLEN];
+#endif
+
+ db_printf("prison %p:\n", pr);
+ db_printf(" jid = %d\n", pr->pr_id);
+ db_printf(" name = %s\n", pr->pr_name);
+ db_printf(" parent = %p\n", pr->pr_parent);
+ db_printf(" ref = %d\n", pr->pr_ref);
+ db_printf(" uref = %d\n", pr->pr_uref);
+ db_printf(" path = %s\n", pr->pr_path);
+ db_printf(" cpuset = %d\n", pr->pr_cpuset
+ ? pr->pr_cpuset->cs_id : -1);
+#ifdef VIMAGE
+ db_printf(" vnet = %p\n", pr->pr_vnet);
+#endif
+ db_printf(" root = %p\n", pr->pr_root);
+ db_printf(" securelevel = %d\n", pr->pr_securelevel);
+ db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum);
+ db_printf(" children.max = %d\n", pr->pr_childmax);
+ db_printf(" children.cur = %d\n", pr->pr_childcount);
+ db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children));
+ db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling));
+ db_printf(" flags = 0x%x", pr->pr_flags);
+ for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
+ fi++)
+ if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi)))
+ db_printf(" %s", pr_flag_names[fi]);
+ for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
+ fi++) {
+ jsf = pr->pr_flags &
+ (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
+ db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name,
+ pr_flag_jailsys[fi].disable &&
+ (jsf == pr_flag_jailsys[fi].disable) ? "disable"
+ : (jsf == pr_flag_jailsys[fi].new) ? "new"
+ : "inherit");
+ }
+ db_printf(" allow = 0x%x", pr->pr_allow);
+ for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
+ fi++)
+ if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi)))
+ db_printf(" %s", pr_allow_names[fi]);
+ db_printf("\n");
+ db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs);
+ db_printf(" host.hostname = %s\n", pr->pr_hostname);
+ db_printf(" host.domainname = %s\n", pr->pr_domainname);
+ db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid);
+ db_printf(" host.hostid = %lu\n", pr->pr_hostid);
+#ifdef INET
+ db_printf(" ip4s = %d\n", pr->pr_ip4s);
+ for (ii = 0; ii < pr->pr_ip4s; ii++)
+ db_printf(" %s %s\n",
+ ii == 0 ? "ip4.addr =" : " ",
+ inet_ntoa(pr->pr_ip4[ii]));
+#endif
+#ifdef INET6
+ db_printf(" ip6s = %d\n", pr->pr_ip6s);
+ for (ii = 0; ii < pr->pr_ip6s; ii++)
+ db_printf(" %s %s\n",
+ ii == 0 ? "ip6.addr =" : " ",
+ ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
+#endif
+}
+
+DB_SHOW_COMMAND(prison, db_show_prison_command)
+{
+ struct prison *pr;
+
+ if (!have_addr) {
+ /*
+ * Show all prisons in the list, and prison0 which is not
+ * listed.
+ */
+ db_show_prison(&prison0);
+ if (!db_pager_quit) {
+ TAILQ_FOREACH(pr, &allprison, pr_list) {
+ db_show_prison(pr);
+ if (db_pager_quit)
+ break;
+ }
+ }
+ return;
+ }
+
+ if (addr == 0)
+ pr = &prison0;
+ else {
+ /* Look for a prison with the ID and with references. */
+ TAILQ_FOREACH(pr, &allprison, pr_list)
+ if (pr->pr_id == addr && pr->pr_ref > 0)
+ break;
+ if (pr == NULL)
+ /* Look again, without requiring a reference. */
+ TAILQ_FOREACH(pr, &allprison, pr_list)
+ if (pr->pr_id == addr)
+ break;
+ if (pr == NULL)
+ /* Assume address points to a valid prison. */
+ pr = (struct prison *)addr;
+ }
+ db_show_prison(pr);
+}
+
+#endif /* DDB */
diff --git a/sys/kern/kern_khelp.c b/sys/kern/kern_khelp.c
new file mode 100644
index 0000000..50751e9
--- /dev/null
+++ b/sys/kern/kern_khelp.c
@@ -0,0 +1,372 @@
+/*-
+ * Copyright (c) 2010,2013 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Lawrence Stewart while studying at the Centre
+ * for Advanced Internet Architectures, Swinburne University of Technology,
+ * made possible in part by grants from the FreeBSD Foundation and Cisco
+ * University Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/hhook.h>
+#include <sys/khelp.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/module_khelp.h>
+#include <sys/osd.h>
+#include <sys/queue.h>
+#include <sys/refcount.h>
+#include <sys/rwlock.h>
+#include <sys/systm.h>
+
+static struct rwlock khelp_list_lock;
+RW_SYSINIT(khelplistlock, &khelp_list_lock, "helper list lock");
+
+static TAILQ_HEAD(helper_head, helper) helpers = TAILQ_HEAD_INITIALIZER(helpers);
+
+/* Private function prototypes. */
+static inline void khelp_remove_osd(struct helper *h, struct osd *hosd);
+void khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags);
+
+#define KHELP_LIST_WLOCK() rw_wlock(&khelp_list_lock)
+#define KHELP_LIST_WUNLOCK() rw_wunlock(&khelp_list_lock)
+#define KHELP_LIST_RLOCK() rw_rlock(&khelp_list_lock)
+#define KHELP_LIST_RUNLOCK() rw_runlock(&khelp_list_lock)
+#define KHELP_LIST_LOCK_ASSERT() rw_assert(&khelp_list_lock, RA_LOCKED)
+
+int
+khelp_register_helper(struct helper *h)
+{
+ struct helper *tmph;
+ int error, i, inserted;
+
+ error = inserted = 0;
+ refcount_init(&h->h_refcount, 0);
+ h->h_id = osd_register(OSD_KHELP, NULL, NULL);
+
+ /* It's only safe to add the hooks after osd_register(). */
+ for (i = 0; i < h->h_nhooks && !error; i++) {
+ /* We don't require the module to assign hook_helper. */
+ h->h_hooks[i].hook_helper = h;
+ error = hhook_add_hook_lookup(&h->h_hooks[i], HHOOK_WAITOK);
+ if (error)
+ printf("%s: \"%s\" khelp module unable to "
+ "hook type %d id %d due to error %d\n", __func__,
+ h->h_name, h->h_hooks[i].hook_type,
+ h->h_hooks[i].hook_id, error);
+ }
+
+ if (error) {
+ for (i--; i >= 0; i--)
+ hhook_remove_hook_lookup(&h->h_hooks[i]);
+ osd_deregister(OSD_KHELP, h->h_id);
+ } else {
+ KHELP_LIST_WLOCK();
+ /*
+ * Keep list of helpers sorted in descending h_id order. Due to
+ * the way osd_set() works, a sorted list ensures
+ * khelp_init_osd() will operate with improved efficiency.
+ */
+ TAILQ_FOREACH(tmph, &helpers, h_next) {
+ if (tmph->h_id < h->h_id) {
+ TAILQ_INSERT_BEFORE(tmph, h, h_next);
+ inserted = 1;
+ break;
+ }
+ }
+
+ if (!inserted)
+ TAILQ_INSERT_TAIL(&helpers, h, h_next);
+ KHELP_LIST_WUNLOCK();
+ }
+
+ return (error);
+}
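/*
 * Illustrative sketch of the sorted insert above (hypothetical element
 * type, not code from this patch): walk the tail queue and insert before
 * the first element with a smaller id, falling back to an append, which
 * keeps the list in descending id order.
 */
#include <sys/queue.h>

struct elem {
	int id;
	TAILQ_ENTRY(elem) link;
};
TAILQ_HEAD(elem_head, elem);

static void
insert_desc(struct elem_head *head, struct elem *e)
{
	struct elem *cur;

	TAILQ_FOREACH(cur, head, link) {
		if (cur->id < e->id) {
			TAILQ_INSERT_BEFORE(cur, e, link);
			return;
		}
	}
	/* Nothing smaller was found; appending keeps the order. */
	TAILQ_INSERT_TAIL(head, e, link);
}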
+
+int
+khelp_deregister_helper(struct helper *h)
+{
+ struct helper *tmph;
+ int error, i;
+
+ KHELP_LIST_WLOCK();
+ if (h->h_refcount > 0)
+ error = EBUSY;
+ else {
+ error = ENOENT;
+ TAILQ_FOREACH(tmph, &helpers, h_next) {
+ if (tmph == h) {
+ TAILQ_REMOVE(&helpers, h, h_next);
+ error = 0;
+ break;
+ }
+ }
+ }
+ KHELP_LIST_WUNLOCK();
+
+ if (!error) {
+ for (i = 0; i < h->h_nhooks; i++)
+ hhook_remove_hook_lookup(&h->h_hooks[i]);
+ osd_deregister(OSD_KHELP, h->h_id);
+ }
+
+ return (error);
+}
+
+int
+khelp_init_osd(uint32_t classes, struct osd *hosd)
+{
+ struct helper *h;
+ void *hdata;
+ int error;
+
+ KASSERT(hosd != NULL, ("struct osd not initialised!"));
+
+ error = 0;
+
+ KHELP_LIST_RLOCK();
+ TAILQ_FOREACH(h, &helpers, h_next) {
+ /* If helper is correct class and needs to store OSD... */
+ if (h->h_classes & classes && h->h_flags & HELPER_NEEDS_OSD) {
+ hdata = uma_zalloc(h->h_zone, M_NOWAIT);
+ if (hdata == NULL) {
+ error = ENOMEM;
+ break;
+ }
+ osd_set(OSD_KHELP, hosd, h->h_id, hdata);
+ refcount_acquire(&h->h_refcount);
+ }
+ }
+
+ if (error) {
+ /* Delete OSD that was assigned prior to the error. */
+ TAILQ_FOREACH(h, &helpers, h_next) {
+ if (h->h_classes & classes)
+ khelp_remove_osd(h, hosd);
+ }
+ }
+ KHELP_LIST_RUNLOCK();
+
+ return (error);
+}
+
+int
+khelp_destroy_osd(struct osd *hosd)
+{
+ struct helper *h;
+ int error;
+
+ KASSERT(hosd != NULL, ("struct osd not initialised!"));
+
+ error = 0;
+
+ KHELP_LIST_RLOCK();
+ /*
+ * Clean up all khelp related OSD.
+ *
+ * XXXLAS: Would be nice to use something like osd_exit() here but it
+ * doesn't have the right semantics for this purpose.
+ */
+ TAILQ_FOREACH(h, &helpers, h_next)
+ khelp_remove_osd(h, hosd);
+ KHELP_LIST_RUNLOCK();
+
+ return (error);
+}
+
+static inline void
+khelp_remove_osd(struct helper *h, struct osd *hosd)
+{
+ void *hdata;
+
+ if (h->h_flags & HELPER_NEEDS_OSD) {
+ /*
+ * If the current helper uses OSD and calling osd_get()
+ * on the helper's h_id returns non-NULL, the helper has
+ * OSD attached to 'hosd' which needs to be cleaned up.
+ */
+ hdata = osd_get(OSD_KHELP, hosd, h->h_id);
+ if (hdata != NULL) {
+ uma_zfree(h->h_zone, hdata);
+ osd_del(OSD_KHELP, hosd, h->h_id);
+ refcount_release(&h->h_refcount);
+ }
+ }
+}
+
+void *
+khelp_get_osd(struct osd *hosd, int32_t id)
+{
+
+ return (osd_get(OSD_KHELP, hosd, id));
+}
+
+int32_t
+khelp_get_id(char *hname)
+{
+ struct helper *h;
+ int32_t id;
+
+ id = -1;
+
+ KHELP_LIST_RLOCK();
+ TAILQ_FOREACH(h, &helpers, h_next) {
+ if (strncmp(h->h_name, hname, HELPER_NAME_MAXLEN) == 0) {
+ id = h->h_id;
+ break;
+ }
+ }
+ KHELP_LIST_RUNLOCK();
+
+ return (id);
+}
+
+int
+khelp_add_hhook(struct hookinfo *hki, uint32_t flags)
+{
+ int error;
+
+ /*
+ * XXXLAS: Should probably include the functionality to update the
+ * helper's h_hooks struct member.
+ */
+ error = hhook_add_hook_lookup(hki, flags);
+
+ return (error);
+}
+
+int
+khelp_remove_hhook(struct hookinfo *hki)
+{
+ int error;
+
+ /*
+ * XXXLAS: Should probably include the functionality to update the
+ * helper's h_hooks struct member.
+ */
+ error = hhook_remove_hook_lookup(hki);
+
+ return (error);
+}
+
+/*
+ * Private KPI between hhook and khelp that allows khelp modules to insert hook
+ * functions into hhook points which register after the modules were loaded.
+ */
+void
+khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags)
+{
+ struct helper *h;
+ int error, i;
+
+ KHELP_LIST_RLOCK();
+ TAILQ_FOREACH(h, &helpers, h_next) {
+ for (i = 0; i < h->h_nhooks; i++) {
+ if (hhh->hhh_type != h->h_hooks[i].hook_type ||
+ hhh->hhh_id != h->h_hooks[i].hook_id)
+ continue;
+ error = hhook_add_hook(hhh, &h->h_hooks[i], flags);
+ if (error) {
+ printf("%s: \"%s\" khelp module unable to "
+ "hook type %d id %d due to error %d\n",
+ __func__, h->h_name,
+ h->h_hooks[i].hook_type,
+ h->h_hooks[i].hook_id, error);
+ error = 0;
+ }
+ }
+ }
+ KHELP_LIST_RUNLOCK();
+}
+
+int
+khelp_modevent(module_t mod, int event_type, void *data)
+{
+ struct khelp_modevent_data *kmd;
+ int error;
+
+ kmd = (struct khelp_modevent_data *)data;
+ error = 0;
+
+ switch(event_type) {
+ case MOD_LOAD:
+ if (kmd->helper->h_flags & HELPER_NEEDS_OSD) {
+ if (kmd->uma_zsize <= 0) {
+ printf("Use KHELP_DECLARE_MOD_UMA() instead!\n");
+ error = EDOOFUS;
+ break;
+ }
+ kmd->helper->h_zone = uma_zcreate(kmd->name,
+ kmd->uma_zsize, kmd->umactor, kmd->umadtor, NULL,
+ NULL, 0, 0);
+ if (kmd->helper->h_zone == NULL) {
+ error = ENOMEM;
+ break;
+ }
+ }
+ strlcpy(kmd->helper->h_name, kmd->name, HELPER_NAME_MAXLEN);
+ kmd->helper->h_hooks = kmd->hooks;
+ kmd->helper->h_nhooks = kmd->nhooks;
+ if (kmd->helper->mod_init != NULL)
+ error = kmd->helper->mod_init();
+ if (!error)
+ error = khelp_register_helper(kmd->helper);
+ break;
+
+ case MOD_QUIESCE:
+ case MOD_SHUTDOWN:
+ case MOD_UNLOAD:
+ error = khelp_deregister_helper(kmd->helper);
+ if (!error) {
+ if (kmd->helper->h_flags & HELPER_NEEDS_OSD)
+ uma_zdestroy(kmd->helper->h_zone);
+ if (kmd->helper->mod_destroy != NULL)
+ kmd->helper->mod_destroy();
+ } else if (error == ENOENT)
+ /* Do nothing and allow unload if helper not in list. */
+ error = 0;
+ else if (error == EBUSY)
+ printf("Khelp module \"%s\" can't unload until its "
+ "refcount drops from %d to 0.\n", kmd->name,
+ kmd->helper->h_refcount);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
diff --git a/sys/kern/kern_kthread.c b/sys/kern/kern_kthread.c
new file mode 100644
index 0000000..969c513
--- /dev/null
+++ b/sys/kern/kern_kthread.c
@@ -0,0 +1,466 @@
+/*-
+ * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/signalvar.h>
+#include <sys/sx.h>
+#include <sys/unistd.h>
+#include <sys/wait.h>
+#include <sys/sched.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+#include <machine/stdarg.h>
+
+/*
+ * Start a kernel process. This is called after a fork() call in
+ * mi_startup() in the file kern/init_main.c.
+ *
+ * This function is used to start "internal" daemons and intended
+ * to be called from SYSINIT().
+ */
+void
+kproc_start(udata)
+ const void *udata;
+{
+ const struct kproc_desc *kp = udata;
+ int error;
+
+ error = kproc_create((void (*)(void *))kp->func, NULL,
+ kp->global_procpp, 0, 0, "%s", kp->arg0);
+ if (error)
+ panic("kproc_start: %s: error %d", kp->arg0, error);
+}
+
+/*
+ * Create a kernel process/thread/whatever. It shares its address space
+ * with proc0 - ie: kernel only.
+ *
+ * func is the function to start.
+ * arg is the parameter to pass to function on first startup.
+ * newpp is the return value pointing to the thread's struct proc.
+ * flags are flags to fork1 (in unistd.h)
+ * fmt and following will be *printf'd into (*newpp)->p_comm (for ps, etc.).
+ */
+int
+kproc_create(void (*func)(void *), void *arg,
+ struct proc **newpp, int flags, int pages, const char *fmt, ...)
+{
+ int error;
+ va_list ap;
+ struct thread *td;
+ struct proc *p2;
+
+ if (!proc0.p_stats)
+ panic("kproc_create called too soon");
+
+ error = fork1(&thread0, RFMEM | RFFDG | RFPROC | RFSTOPPED | flags,
+ pages, &p2, NULL, 0);
+ if (error)
+ return error;
+
+ /* save a global descriptor, if desired */
+ if (newpp != NULL)
+ *newpp = p2;
+
+ /* this is a non-swapped system process */
+ PROC_LOCK(p2);
+ td = FIRST_THREAD_IN_PROC(p2);
+ p2->p_flag |= P_SYSTEM | P_KTHREAD;
+ td->td_pflags |= TDP_KTHREAD;
+ mtx_lock(&p2->p_sigacts->ps_mtx);
+ p2->p_sigacts->ps_flag |= PS_NOCLDWAIT;
+ mtx_unlock(&p2->p_sigacts->ps_mtx);
+ PROC_UNLOCK(p2);
+
+ /* set up arg0 for 'ps', et al */
+ va_start(ap, fmt);
+ vsnprintf(p2->p_comm, sizeof(p2->p_comm), fmt, ap);
+ va_end(ap);
+ /* set up arg0 for 'ps', et al */
+ va_start(ap, fmt);
+ vsnprintf(td->td_name, sizeof(td->td_name), fmt, ap);
+ va_end(ap);
+#ifdef KTR
+ sched_clear_tdname(td);
+#endif
+
+ /* call the processes' main()... */
+ cpu_set_fork_handler(td, func, arg);
+
+ /* Avoid inheriting affinity from a random parent. */
+ cpuset_setthread(td->td_tid, cpuset_root);
+ thread_lock(td);
+ TD_SET_CAN_RUN(td);
+ sched_prio(td, PVM);
+ sched_user_prio(td, PUSER);
+
+ /* Delay putting it on the run queue until now. */
+ if (!(flags & RFSTOPPED))
+ sched_add(td, SRQ_BORING);
+ thread_unlock(td);
+
+ return 0;
+}
+
+void
+kproc_exit(int ecode)
+{
+ struct thread *td;
+ struct proc *p;
+
+ td = curthread;
+ p = td->td_proc;
+
+ /*
+ * Reparent curthread from proc0 to init so that the zombie
+ * is harvested.
+ */
+ sx_xlock(&proctree_lock);
+ PROC_LOCK(p);
+ proc_reparent(p, initproc);
+ PROC_UNLOCK(p);
+ sx_xunlock(&proctree_lock);
+
+ /*
+ * Wakeup anyone waiting for us to exit.
+ */
+ wakeup(p);
+
+ /* Buh-bye! */
+ exit1(td, W_EXITCODE(ecode, 0));
+}
+
+/*
+ * Advise a kernel process to suspend (or resume) in its main loop.
+ * Participation is voluntary.
+ */
+int
+kproc_suspend(struct proc *p, int timo)
+{
+ /*
+ * Make sure this is indeed a system process and we can safely
+ * use the p_siglist field.
+ */
+ PROC_LOCK(p);
+ if ((p->p_flag & P_KTHREAD) == 0) {
+ PROC_UNLOCK(p);
+ return (EINVAL);
+ }
+ SIGADDSET(p->p_siglist, SIGSTOP);
+ wakeup(p);
+ return msleep(&p->p_siglist, &p->p_mtx, PPAUSE | PDROP, "suspkp", timo);
+}
+
+int
+kproc_resume(struct proc *p)
+{
+ /*
+ * Make sure this is indeed a system process and we can safely
+ * use the p_siglist field.
+ */
+ PROC_LOCK(p);
+ if ((p->p_flag & P_KTHREAD) == 0) {
+ PROC_UNLOCK(p);
+ return (EINVAL);
+ }
+ SIGDELSET(p->p_siglist, SIGSTOP);
+ PROC_UNLOCK(p);
+ wakeup(&p->p_siglist);
+ return (0);
+}
+
+void
+kproc_suspend_check(struct proc *p)
+{
+ PROC_LOCK(p);
+ while (SIGISMEMBER(p->p_siglist, SIGSTOP)) {
+ wakeup(&p->p_siglist);
+ msleep(&p->p_siglist, &p->p_mtx, PPAUSE, "kpsusp", 0);
+ }
+ PROC_UNLOCK(p);
+}
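/*
 * Illustrative sketch of how a daemon participates in the voluntary
 * suspend protocol above (hypothetical daemon, not code from this patch):
 * kproc_suspend() only takes effect once the process reaches its own
 * kproc_suspend_check() call at a safe point in its loop.
 */
static void
example_kproc_main(void *arg)
{

	(void)arg;
	for (;;) {
		/* Park here while someone holds us suspended. */
		kproc_suspend_check(curproc);

		/* ... perform one unit of the daemon's work ... */

		/* Wait a tick for more work; details elided. */
		pause("exwait", hz);
	}
}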
+
+
+/*
+ * Start a kernel thread.
+ *
+ * This function is used to start "internal" daemons and intended
+ * to be called from SYSINIT().
+ */
+
+void
+kthread_start(udata)
+ const void *udata;
+{
+ const struct kthread_desc *kp = udata;
+ int error;
+
+ error = kthread_add((void (*)(void *))kp->func, NULL,
+ NULL, kp->global_threadpp, 0, 0, "%s", kp->arg0);
+ if (error)
+ panic("kthread_start: %s: error %d", kp->arg0, error);
+}
+
+/*
+ * Create a kernel thread. It shares its address space
+ * with proc0 - ie: kernel only.
+ *
+ * func is the function to start.
+ * arg is the parameter to pass to function on first startup.
+ * newtdp is the return value pointing to the thread's struct thread.
+ * ** XXX fix this --> flags are flags to fork1 (in unistd.h)
+ * fmt and following will be *printf'd into (*newtd)->td_name (for ps, etc.).
+ */
+int
+kthread_add(void (*func)(void *), void *arg, struct proc *p,
+ struct thread **newtdp, int flags, int pages, const char *fmt, ...)
+{
+ va_list ap;
+ struct thread *newtd, *oldtd;
+
+ if (!proc0.p_stats)
+ panic("kthread_add called too soon");
+
+ /* If no process supplied, put it on proc0 */
+ if (p == NULL)
+ p = &proc0;
+
+ /* Initialize our new td */
+ newtd = thread_alloc(pages);
+ if (newtd == NULL)
+ return (ENOMEM);
+
+ PROC_LOCK(p);
+ oldtd = FIRST_THREAD_IN_PROC(p);
+
+ bzero(&newtd->td_startzero,
+ __rangeof(struct thread, td_startzero, td_endzero));
+ bcopy(&oldtd->td_startcopy, &newtd->td_startcopy,
+ __rangeof(struct thread, td_startcopy, td_endcopy));
+
+ /* set up arg0 for 'ps', et al */
+ va_start(ap, fmt);
+ vsnprintf(newtd->td_name, sizeof(newtd->td_name), fmt, ap);
+ va_end(ap);
+
+ newtd->td_proc = p; /* needed for cpu_set_upcall */
+
+ /* XXX optimise this probably? */
+ /* On x86 (and probably the others too) it is way too full of junk */
+ /* Needs a better name */
+ cpu_set_upcall(newtd, oldtd);
+ /* put the designated function(arg) as the resume context */
+ cpu_set_fork_handler(newtd, func, arg);
+
+ newtd->td_pflags |= TDP_KTHREAD;
+ newtd->td_ucred = crhold(p->p_ucred);
+
+ /* this code almost the same as create_thread() in kern_thr.c */
+ p->p_flag |= P_HADTHREADS;
+ thread_link(newtd, p);
+ thread_lock(oldtd);
+ /* let the scheduler know about these things. */
+ sched_fork_thread(oldtd, newtd);
+ TD_SET_CAN_RUN(newtd);
+ thread_unlock(oldtd);
+ PROC_UNLOCK(p);
+
+ tidhash_add(newtd);
+
+ /* Avoid inheriting affinity from a random parent. */
+ cpuset_setthread(newtd->td_tid, cpuset_root);
+
+ /* Delay putting it on the run queue until now. */
+ if (!(flags & RFSTOPPED)) {
+ thread_lock(newtd);
+ sched_add(newtd, SRQ_BORING);
+ thread_unlock(newtd);
+ }
+ if (newtdp)
+ *newtdp = newtd;
+ return 0;
+}
+
+void
+kthread_exit(void)
+{
+ struct proc *p;
+
+ p = curthread->td_proc;
+
+ /* A module may be waiting for us to exit. */
+ wakeup(curthread);
+
+ /*
+ * The last exiting thread in a kernel process must tear down
+ * the whole process.
+ */
+ rw_wlock(&tidhash_lock);
+ PROC_LOCK(p);
+ if (p->p_numthreads == 1) {
+ PROC_UNLOCK(p);
+ rw_wunlock(&tidhash_lock);
+ kproc_exit(0);
+ }
+ LIST_REMOVE(curthread, td_hash);
+ rw_wunlock(&tidhash_lock);
+ PROC_SLOCK(p);
+ thread_exit();
+}
+
+/*
+ * Advise a kernel process to suspend (or resume) in its main loop.
+ * Participation is voluntary.
+ */
+int
+kthread_suspend(struct thread *td, int timo)
+{
+ struct proc *p;
+
+ p = td->td_proc;
+
+ /*
+ * td_pflags should not be read by any thread other than
+ * curthread, but as long as this flag is invariant during the
+ * thread's lifetime, it is OK to check its state.
+ */
+ if ((td->td_pflags & TDP_KTHREAD) == 0)
+ return (EINVAL);
+
+ /*
+ * The caller of the primitive should have already checked that the
+ * thread is up and running, thus not being blocked by other
+ * conditions.
+ */
+ PROC_LOCK(p);
+ thread_lock(td);
+ td->td_flags |= TDF_KTH_SUSP;
+ thread_unlock(td);
+ return (msleep(&td->td_flags, &p->p_mtx, PPAUSE | PDROP, "suspkt",
+ timo));
+}
+
+/*
+ * Resume a thread previously put asleep with kthread_suspend().
+ */
+int
+kthread_resume(struct thread *td)
+{
+ struct proc *p;
+
+ p = td->td_proc;
+
+ /*
+ * td_pflags should not be read by any thread other than
+ * curthread, but as long as this flag is invariant during the
+ * thread's lifetime, it is OK to check its state.
+ */
+ if ((td->td_pflags & TDP_KTHREAD) == 0)
+ return (EINVAL);
+
+ PROC_LOCK(p);
+ thread_lock(td);
+ td->td_flags &= ~TDF_KTH_SUSP;
+ thread_unlock(td);
+ wakeup(&td->td_flags);
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+/*
+ * Used by the thread to poll whether it should yield/sleep
+ * and to notify the caller that it has happened.
+ */
+void
+kthread_suspend_check()
+{
+ struct proc *p;
+ struct thread *td;
+
+ td = curthread;
+ p = td->td_proc;
+
+ if ((td->td_pflags & TDP_KTHREAD) == 0)
+ panic("%s: curthread is not a valid kthread", __func__);
+
+ /*
+ * As long as the double-lock protection is used when accessing the
+ * TDF_KTH_SUSP flag, synchronizing the read operation via proc mutex
+ * is fine.
+ */
+ PROC_LOCK(p);
+ while (td->td_flags & TDF_KTH_SUSP) {
+ wakeup(&td->td_flags);
+ msleep(&td->td_flags, &p->p_mtx, PPAUSE, "ktsusp", 0);
+ }
+ PROC_UNLOCK(p);
+}
+
+int
+kproc_kthread_add(void (*func)(void *), void *arg,
+ struct proc **procptr, struct thread **tdptr,
+ int flags, int pages, const char *procname, const char *fmt, ...)
+{
+ int error;
+ va_list ap;
+ char buf[100];
+ struct thread *td;
+
+ if (*procptr == 0) {
+ error = kproc_create(func, arg,
+ procptr, flags, pages, "%s", procname);
+ if (error)
+ return (error);
+ td = FIRST_THREAD_IN_PROC(*procptr);
+ if (tdptr)
+ *tdptr = td;
+ va_start(ap, fmt);
+ vsnprintf(td->td_name, sizeof(td->td_name), fmt, ap);
+ va_end(ap);
+#ifdef KTR
+ sched_clear_tdname(td);
+#endif
+ return (0);
+ }
+ va_start(ap, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+ error = kthread_add(func, arg, *procptr,
+ tdptr, flags, pages, "%s", buf);
+ return (error);
+}
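/*
 * Illustrative usage sketch for kproc_kthread_add() (hypothetical worker
 * functions and variables, not code from this patch): the first call
 * creates the shared process and names its initial thread, the second
 * call just adds another thread to that process.  Worker bodies elided.
 */
static struct proc *example_proc;
static struct thread *example_up_td, *example_down_td;

static void example_up_main(void *arg);
static void example_down_main(void *arg);

static void
example_threads_start(void *dummy __unused)
{

	kproc_kthread_add(example_up_main, NULL, &example_proc,
	    &example_up_td, 0, 0, "example", "ex_up");
	kproc_kthread_add(example_down_main, NULL, &example_proc,
	    &example_down_td, 0, 0, "example", "ex_down");
}
SYSINIT(example_threads, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    example_threads_start, NULL);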
diff --git a/sys/kern/kern_ktr.c b/sys/kern/kern_ktr.c
new file mode 100644
index 0000000..3202b9b
--- /dev/null
+++ b/sys/kern/kern_ktr.c
@@ -0,0 +1,495 @@
+/*-
+ * Copyright (c) 2000 John Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This module holds the global variables used by KTR and the ktr_tracepoint()
+ * function that does the actual tracing.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_ktr.h"
+#include "opt_alq.h"
+
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/alq.h>
+#include <sys/cons.h>
+#include <sys/cpuset.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+
+#include <machine/cpu.h>
+#ifdef __sparc64__
+#include <machine/ktr.h>
+#endif
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#include <ddb/db_output.h>
+#endif
+
+#ifndef KTR_BOOT_ENTRIES
+#define KTR_BOOT_ENTRIES 1024
+#endif
+
+#ifndef KTR_ENTRIES
+#define KTR_ENTRIES 1024
+#endif
+
+/* Limit the allocations to something manageable. */
+#define KTR_ENTRIES_MAX (8 * 1024 * 1024)
+
+#ifndef KTR_MASK
+#define KTR_MASK (0)
+#endif
+
+#ifndef KTR_CPUMASK
+#define KTR_CPUMASK CPUSET_FSET
+#endif
+
+#ifndef KTR_TIME
+#define KTR_TIME get_cyclecount()
+#endif
+
+#ifndef KTR_CPU
+#define KTR_CPU PCPU_GET(cpuid)
+#endif
+
+static MALLOC_DEFINE(M_KTR, "KTR", "KTR");
+
+FEATURE(ktr, "Kernel support for KTR kernel tracing facility");
+
+volatile int ktr_idx = 0;
+int ktr_mask = KTR_MASK;
+int ktr_compile = KTR_COMPILE;
+int ktr_entries = KTR_BOOT_ENTRIES;
+int ktr_version = KTR_VERSION;
+struct ktr_entry ktr_buf_init[KTR_BOOT_ENTRIES];
+struct ktr_entry *ktr_buf = ktr_buf_init;
+cpuset_t ktr_cpumask = CPUSET_T_INITIALIZER(KTR_CPUMASK);
+static char ktr_cpumask_str[CPUSETBUFSIZ];
+
+TUNABLE_INT("debug.ktr.mask", &ktr_mask);
+
+TUNABLE_STR("debug.ktr.cpumask", ktr_cpumask_str, sizeof(ktr_cpumask_str));
+
+static SYSCTL_NODE(_debug, OID_AUTO, ktr, CTLFLAG_RD, 0, "KTR options");
+
+SYSCTL_INT(_debug_ktr, OID_AUTO, version, CTLFLAG_RD,
+ &ktr_version, 0, "Version of the KTR interface");
+
+SYSCTL_UINT(_debug_ktr, OID_AUTO, compile, CTLFLAG_RD,
+ &ktr_compile, 0, "Bitmask of KTR event classes compiled into the kernel");
+
+static void
+ktr_cpumask_initializer(void *dummy __unused)
+{
+
+ /*
+ * TUNABLE_STR() fetches the tunable at SI_ORDER_MIDDLE, so by the time
+ * this SYSINIT runs the string has already been filled in, if it was set.
+ */
+ if (ktr_cpumask_str[0] != '\0' &&
+ cpusetobj_strscan(&ktr_cpumask, ktr_cpumask_str) == -1)
+ CPU_FILL(&ktr_cpumask);
+}
+SYSINIT(ktr_cpumask_initializer, SI_SUB_TUNABLES, SI_ORDER_ANY,
+ ktr_cpumask_initializer, NULL);
+
+static int
+sysctl_debug_ktr_cpumask(SYSCTL_HANDLER_ARGS)
+{
+ char lktr_cpumask_str[CPUSETBUFSIZ];
+ cpuset_t imask;
+ int error;
+
+ cpusetobj_strprint(lktr_cpumask_str, &ktr_cpumask);
+ error = sysctl_handle_string(oidp, lktr_cpumask_str,
+ sizeof(lktr_cpumask_str), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (cpusetobj_strscan(&imask, lktr_cpumask_str) == -1)
+ return (EINVAL);
+ CPU_COPY(&imask, &ktr_cpumask);
+
+ return (error);
+}
+SYSCTL_PROC(_debug_ktr, OID_AUTO, cpumask,
+ CTLFLAG_RW | CTLFLAG_MPSAFE | CTLTYPE_STRING, NULL, 0,
+ sysctl_debug_ktr_cpumask, "S",
+ "Bitmask of CPUs on which KTR logging is enabled");
+
+static int
+sysctl_debug_ktr_clear(SYSCTL_HANDLER_ARGS)
+{
+ int clear, error;
+
+ clear = 0;
+ error = sysctl_handle_int(oidp, &clear, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ if (clear) {
+ bzero(ktr_buf, sizeof(*ktr_buf) * ktr_entries);
+ ktr_idx = 0;
+ }
+
+ return (error);
+}
+SYSCTL_PROC(_debug_ktr, OID_AUTO, clear, CTLTYPE_INT|CTLFLAG_RW, 0, 0,
+ sysctl_debug_ktr_clear, "I", "Clear KTR Buffer");
+
+/*
+ * This is a sysctl proc so that it is serialized as !MPSAFE along with
+ * the other ktr sysctl procs.
+ */
+static int
+sysctl_debug_ktr_mask(SYSCTL_HANDLER_ARGS)
+{
+ int mask, error;
+
+ mask = ktr_mask;
+ error = sysctl_handle_int(oidp, &mask, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ ktr_mask = mask;
+ return (error);
+}
+
+SYSCTL_PROC(_debug_ktr, OID_AUTO, mask, CTLTYPE_UINT|CTLFLAG_RW, 0, 0,
+ sysctl_debug_ktr_mask, "IU",
+ "Bitmask of KTR event classes for which logging is enabled");
+
+#if KTR_ENTRIES > KTR_BOOT_ENTRIES
+/*
+ * A simplified version of sysctl_debug_ktr_entries.
+ * No need to care about SMP, scheduling, etc.
+ */
+static void
+ktr_entries_initializer(void *dummy __unused)
+{
+ int mask;
+
+ /* Temporarily disable ktr in case malloc() is being traced. */
+ mask = ktr_mask;
+ ktr_mask = 0;
+ ktr_buf = malloc(sizeof(*ktr_buf) * KTR_ENTRIES, M_KTR,
+ M_WAITOK | M_ZERO);
+ memcpy(ktr_buf, ktr_buf_init + ktr_idx,
+ (KTR_BOOT_ENTRIES - ktr_idx) * sizeof(*ktr_buf));
+ if (ktr_idx != 0)
+ memcpy(ktr_buf + KTR_BOOT_ENTRIES - ktr_idx, ktr_buf_init,
+ ktr_idx * sizeof(*ktr_buf));
+ ktr_entries = KTR_ENTRIES;
+ ktr_mask = mask;
+}
+SYSINIT(ktr_entries_initializer, SI_SUB_KMEM, SI_ORDER_ANY,
+ ktr_entries_initializer, NULL);
+#endif
+
+static int
+sysctl_debug_ktr_entries(SYSCTL_HANDLER_ARGS)
+{
+ int entries, error, mask;
+ struct ktr_entry *buf, *oldbuf;
+
+ entries = ktr_entries;
+ error = sysctl_handle_int(oidp, &entries, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (entries > KTR_ENTRIES_MAX)
+ return (ERANGE);
+ /* Disable ktr temporarily. */
+ mask = ktr_mask;
+ atomic_store_rel_int(&ktr_mask, 0);
+ /* Wait for threads to go idle. */
+ if ((error = quiesce_all_cpus("ktrent", PCATCH)) != 0) {
+ ktr_mask = mask;
+ return (error);
+ }
+ if (ktr_buf != ktr_buf_init)
+ oldbuf = ktr_buf;
+ else
+ oldbuf = NULL;
+ /* Allocate a new buffer. */
+ buf = malloc(sizeof(*buf) * entries, M_KTR, M_WAITOK | M_ZERO);
+ /* Install the new buffer and restart ktr. */
+ ktr_buf = buf;
+ ktr_entries = entries;
+ ktr_idx = 0;
+ atomic_store_rel_int(&ktr_mask, mask);
+ if (oldbuf != NULL)
+ free(oldbuf, M_KTR);
+
+ return (error);
+}
+
+SYSCTL_PROC(_debug_ktr, OID_AUTO, entries, CTLTYPE_INT|CTLFLAG_RW, 0, 0,
+ sysctl_debug_ktr_entries, "I", "Number of entries in the KTR buffer");
+
+#ifdef KTR_VERBOSE
+int ktr_verbose = KTR_VERBOSE;
+TUNABLE_INT("debug.ktr.verbose", &ktr_verbose);
+SYSCTL_INT(_debug_ktr, OID_AUTO, verbose, CTLFLAG_RW, &ktr_verbose, 0, "");
+#endif
+
+#ifdef KTR_ALQ
+struct alq *ktr_alq;
+char ktr_alq_file[MAXPATHLEN] = "/tmp/ktr.out";
+int ktr_alq_cnt = 0;
+int ktr_alq_depth = KTR_ENTRIES;
+int ktr_alq_enabled = 0;
+int ktr_alq_failed = 0;
+int ktr_alq_max = 0;
+
+SYSCTL_INT(_debug_ktr, OID_AUTO, alq_max, CTLFLAG_RW, &ktr_alq_max, 0,
+ "Maximum number of entries to write");
+SYSCTL_INT(_debug_ktr, OID_AUTO, alq_cnt, CTLFLAG_RD, &ktr_alq_cnt, 0,
+ "Current number of written entries");
+SYSCTL_INT(_debug_ktr, OID_AUTO, alq_failed, CTLFLAG_RD, &ktr_alq_failed, 0,
+ "Number of times we overran the buffer");
+SYSCTL_INT(_debug_ktr, OID_AUTO, alq_depth, CTLFLAG_RW, &ktr_alq_depth, 0,
+ "Number of items in the write buffer");
+SYSCTL_STRING(_debug_ktr, OID_AUTO, alq_file, CTLFLAG_RW, ktr_alq_file,
+ sizeof(ktr_alq_file), "KTR logging file");
+
+static int
+sysctl_debug_ktr_alq_enable(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ int enable;
+
+ enable = ktr_alq_enabled;
+
+ error = sysctl_handle_int(oidp, &enable, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ if (enable) {
+ if (ktr_alq_enabled)
+ return (0);
+ error = alq_open(&ktr_alq, (const char *)ktr_alq_file,
+ req->td->td_ucred, ALQ_DEFAULT_CMODE,
+ sizeof(struct ktr_entry), ktr_alq_depth);
+ if (error == 0) {
+ ktr_alq_cnt = 0;
+ ktr_alq_failed = 0;
+ ktr_alq_enabled = 1;
+ }
+ } else {
+ if (ktr_alq_enabled == 0)
+ return (0);
+ ktr_alq_enabled = 0;
+ alq_close(ktr_alq);
+ ktr_alq = NULL;
+ }
+
+ return (error);
+}
+SYSCTL_PROC(_debug_ktr, OID_AUTO, alq_enable,
+ CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_debug_ktr_alq_enable,
+ "I", "Enable KTR logging");
+#endif
+
+void
+ktr_tracepoint(u_int mask, const char *file, int line, const char *format,
+ u_long arg1, u_long arg2, u_long arg3, u_long arg4, u_long arg5,
+ u_long arg6)
+{
+ struct ktr_entry *entry;
+#ifdef KTR_ALQ
+ struct ale *ale = NULL;
+#endif
+ int newindex, saveindex;
+#if defined(KTR_VERBOSE) || defined(KTR_ALQ)
+ struct thread *td;
+#endif
+ int cpu;
+
+ if (panicstr)
+ return;
+ if ((ktr_mask & mask) == 0 || ktr_buf == NULL)
+ return;
+ cpu = KTR_CPU;
+ if (!CPU_ISSET(cpu, &ktr_cpumask))
+ return;
+#if defined(KTR_VERBOSE) || defined(KTR_ALQ)
+ td = curthread;
+ if (td->td_pflags & TDP_INKTR)
+ return;
+ td->td_pflags |= TDP_INKTR;
+#endif
+#ifdef KTR_ALQ
+ if (ktr_alq_enabled) {
+ if (td->td_critnest == 0 &&
+ (td->td_flags & TDF_IDLETD) == 0 &&
+ td != ald_thread) {
+ if (ktr_alq_max && ktr_alq_cnt > ktr_alq_max)
+ goto done;
+ if ((ale = alq_get(ktr_alq, ALQ_NOWAIT)) == NULL) {
+ ktr_alq_failed++;
+ goto done;
+ }
+ ktr_alq_cnt++;
+ entry = (struct ktr_entry *)ale->ae_data;
+ } else {
+ goto done;
+ }
+ } else
+#endif
+ {
+ do {
+ saveindex = ktr_idx;
+ newindex = (saveindex + 1) % ktr_entries;
+ } while (atomic_cmpset_rel_int(&ktr_idx, saveindex, newindex) == 0);
+ entry = &ktr_buf[saveindex];
+ }
+ entry->ktr_timestamp = KTR_TIME;
+ entry->ktr_cpu = cpu;
+ entry->ktr_thread = curthread;
+ if (file != NULL)
+ while (strncmp(file, "../", 3) == 0)
+ file += 3;
+ entry->ktr_file = file;
+ entry->ktr_line = line;
+#ifdef KTR_VERBOSE
+ if (ktr_verbose) {
+#ifdef SMP
+ printf("cpu%d ", cpu);
+#endif
+ if (ktr_verbose > 1) {
+ printf("%s.%d\t", entry->ktr_file,
+ entry->ktr_line);
+ }
+ printf(format, arg1, arg2, arg3, arg4, arg5, arg6);
+ printf("\n");
+ }
+#endif
+ entry->ktr_desc = format;
+ entry->ktr_parms[0] = arg1;
+ entry->ktr_parms[1] = arg2;
+ entry->ktr_parms[2] = arg3;
+ entry->ktr_parms[3] = arg4;
+ entry->ktr_parms[4] = arg5;
+ entry->ktr_parms[5] = arg6;
+#ifdef KTR_ALQ
+ if (ktr_alq_enabled && ale)
+ alq_post(ktr_alq, ale);
+done:
+#endif
+#if defined(KTR_VERBOSE) || defined(KTR_ALQ)
+ td->td_pflags &= ~TDP_INKTR;
+#endif
+}
+
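ktr_tracepoint() is normally reached through the CTR0()..CTR6() macros from <sys/ktr.h>, which compile to nothing unless the event class is present in KTR_COMPILE and are then filtered at run time by ktr_mask and ktr_cpumask. Below is a hedged sketch of an instrumentation site; the class, message and fields are examples and do not come from this file.

/* Sketch only: kernel code logging an event through KTR. */
#include <sys/param.h>
#include <sys/ktr.h>
#include <sys/proc.h>

static void
example_trace(struct proc *p)
{

	CTR2(KTR_PROC, "example: pid %d flags 0x%x", p->p_pid, p->p_flag);
}
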
+#ifdef DDB
+
+struct tstate {
+ int cur;
+ int first;
+};
+static struct tstate tstate;
+static int db_ktr_verbose;
+static int db_mach_vtrace(void);
+
+DB_SHOW_COMMAND(ktr, db_ktr_all)
+{
+
+ tstate.cur = (ktr_idx - 1) % ktr_entries;
+ tstate.first = -1;
+ db_ktr_verbose = 0;
+ db_ktr_verbose |= (strchr(modif, 'v') != NULL) ? 2 : 0;
+ db_ktr_verbose |= (strchr(modif, 'V') != NULL) ? 1 : 0; /* just the timestamp, please */
+ if (strchr(modif, 'a') != NULL) {
+ db_disable_pager();
+ while (cncheckc() != -1)
+ if (db_mach_vtrace() == 0)
+ break;
+ } else {
+ while (!db_pager_quit)
+ if (db_mach_vtrace() == 0)
+ break;
+ }
+}
+
+static int
+db_mach_vtrace(void)
+{
+ struct ktr_entry *kp;
+
+ if (tstate.cur == tstate.first || ktr_buf == NULL) {
+ db_printf("--- End of trace buffer ---\n");
+ return (0);
+ }
+ kp = &ktr_buf[tstate.cur];
+
+ /* Skip over unused entries. */
+ if (kp->ktr_desc == NULL) {
+ db_printf("--- End of trace buffer ---\n");
+ return (0);
+ }
+ db_printf("%d (%p", tstate.cur, kp->ktr_thread);
+#ifdef SMP
+ db_printf(":cpu%d", kp->ktr_cpu);
+#endif
+ db_printf(")");
+ if (db_ktr_verbose >= 1) {
+ db_printf(" %10.10lld", (long long)kp->ktr_timestamp);
+ }
+ if (db_ktr_verbose >= 2) {
+ db_printf(" %s.%d", kp->ktr_file, kp->ktr_line);
+ }
+ db_printf(": ");
+ db_printf(kp->ktr_desc, kp->ktr_parms[0], kp->ktr_parms[1],
+ kp->ktr_parms[2], kp->ktr_parms[3], kp->ktr_parms[4],
+ kp->ktr_parms[5]);
+ db_printf("\n");
+
+ if (tstate.first == -1)
+ tstate.first = tstate.cur;
+
+ if (--tstate.cur < 0)
+ tstate.cur = ktr_entries - 1;
+
+ return (1);
+}
+
+#endif /* DDB */
diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c
new file mode 100644
index 0000000..3b34fb0
--- /dev/null
+++ b/sys/kern/kern_ktrace.c
@@ -0,0 +1,1269 @@
+/*-
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California.
+ * Copyright (c) 2005 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/ktrace.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+
+#include <security/mac/mac_framework.h>
+
+/*
+ * The ktrace facility allows the tracing of certain key events in user space
+ * processes, such as system calls, signal delivery, context switches, and
+ * user generated events using utrace(2). It works by streaming event
+ * records and data to a vnode associated with the process using the
+ * ktrace(2) system call. In general, records can be written directly from
+ * the context that generates the event. One important exception to this is
+ * during a context switch, where sleeping is not permitted. To handle this
+ * case, trace events are generated using in-kernel ktr_request records, and
+ * then delivered to disk at a convenient moment -- either immediately, the
+ * next traceable event, at system call return, or at process exit.
+ *
+ * When dealing with multiple threads or processes writing to the same event
+ * log, ordering guarantees are weak: specifically, if an event has multiple
+ * records (i.e., system call enter and return), they may be interlaced with
+ * records from another event. Process and thread ID information is provided
+ * in the record, and user applications can de-interlace events if required.
+ */
+
+static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE");
+
+#ifdef KTRACE
+
+FEATURE(ktrace, "Kernel support for system-call tracing");
+
+#ifndef KTRACE_REQUEST_POOL
+#define KTRACE_REQUEST_POOL 100
+#endif
+
+struct ktr_request {
+ struct ktr_header ktr_header;
+ void *ktr_buffer;
+ union {
+ struct ktr_proc_ctor ktr_proc_ctor;
+ struct ktr_cap_fail ktr_cap_fail;
+ struct ktr_syscall ktr_syscall;
+ struct ktr_sysret ktr_sysret;
+ struct ktr_genio ktr_genio;
+ struct ktr_psig ktr_psig;
+ struct ktr_csw ktr_csw;
+ struct ktr_fault ktr_fault;
+ struct ktr_faultend ktr_faultend;
+ } ktr_data;
+ STAILQ_ENTRY(ktr_request) ktr_list;
+};
+
+static int data_lengths[] = {
+ 0, /* none */
+ offsetof(struct ktr_syscall, ktr_args), /* KTR_SYSCALL */
+ sizeof(struct ktr_sysret), /* KTR_SYSRET */
+ 0, /* KTR_NAMEI */
+ sizeof(struct ktr_genio), /* KTR_GENIO */
+ sizeof(struct ktr_psig), /* KTR_PSIG */
+ sizeof(struct ktr_csw), /* KTR_CSW */
+ 0, /* KTR_USER */
+ 0, /* KTR_STRUCT */
+ 0, /* KTR_SYSCTL */
+ sizeof(struct ktr_proc_ctor), /* KTR_PROCCTOR */
+ 0, /* KTR_PROCDTOR */
+ sizeof(struct ktr_cap_fail), /* KTR_CAPFAIL */
+ sizeof(struct ktr_fault), /* KTR_FAULT */
+ sizeof(struct ktr_faultend), /* KTR_FAULTEND */
+};
+
+static STAILQ_HEAD(, ktr_request) ktr_free;
+
+static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options");
+
+static u_int ktr_requestpool = KTRACE_REQUEST_POOL;
+TUNABLE_INT("kern.ktrace.request_pool", &ktr_requestpool);
+
+static u_int ktr_geniosize = PAGE_SIZE;
+TUNABLE_INT("kern.ktrace.genio_size", &ktr_geniosize);
+SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize,
+ 0, "Maximum size of genio event payload");
+
+static int print_message = 1;
+static struct mtx ktrace_mtx;
+static struct sx ktrace_sx;
+
+static void ktrace_init(void *dummy);
+static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
+static u_int ktrace_resize_pool(u_int oldsize, u_int newsize);
+static struct ktr_request *ktr_getrequest_entered(struct thread *td, int type);
+static struct ktr_request *ktr_getrequest(int type);
+static void ktr_submitrequest(struct thread *td, struct ktr_request *req);
+static void ktr_freeproc(struct proc *p, struct ucred **uc,
+ struct vnode **vp);
+static void ktr_freerequest(struct ktr_request *req);
+static void ktr_freerequest_locked(struct ktr_request *req);
+static void ktr_writerequest(struct thread *td, struct ktr_request *req);
+static int ktrcanset(struct thread *,struct proc *);
+static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *);
+static int ktrops(struct thread *,struct proc *,int,int,struct vnode *);
+static void ktrprocctor_entered(struct thread *, struct proc *);
+
+/*
+ * ktrace itself generates events, such as context switches, which we do not
+ * wish to trace. Maintain a flag, TDP_INKTRACE, on each thread to determine
+ * whether or not it is in a region where tracing of events should be
+ * suppressed.
+ */
+static void
+ktrace_enter(struct thread *td)
+{
+
+ KASSERT(!(td->td_pflags & TDP_INKTRACE), ("ktrace_enter: flag set"));
+ td->td_pflags |= TDP_INKTRACE;
+}
+
+static void
+ktrace_exit(struct thread *td)
+{
+
+ KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_exit: flag not set"));
+ td->td_pflags &= ~TDP_INKTRACE;
+}
+
+static void
+ktrace_assert(struct thread *td)
+{
+
+ KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_assert: flag not set"));
+}
+
+static void
+ktrace_init(void *dummy)
+{
+ struct ktr_request *req;
+ int i;
+
+ mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET);
+ sx_init(&ktrace_sx, "ktrace_sx");
+ STAILQ_INIT(&ktr_free);
+ for (i = 0; i < ktr_requestpool; i++) {
+ req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK);
+ STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
+ }
+}
+SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL);
+
+static int
+sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS)
+{
+ struct thread *td;
+ u_int newsize, oldsize, wantsize;
+ int error;
+
+ /* Handle easy read-only case first to avoid warnings from GCC. */
+ if (!req->newptr) {
+ oldsize = ktr_requestpool;
+ return (SYSCTL_OUT(req, &oldsize, sizeof(u_int)));
+ }
+
+ error = SYSCTL_IN(req, &wantsize, sizeof(u_int));
+ if (error)
+ return (error);
+ td = curthread;
+ ktrace_enter(td);
+ oldsize = ktr_requestpool;
+ newsize = ktrace_resize_pool(oldsize, wantsize);
+ ktrace_exit(td);
+ error = SYSCTL_OUT(req, &oldsize, sizeof(u_int));
+ if (error)
+ return (error);
+ if (wantsize > oldsize && newsize < wantsize)
+ return (ENOSPC);
+ return (0);
+}
+SYSCTL_PROC(_kern_ktrace, OID_AUTO, request_pool, CTLTYPE_UINT|CTLFLAG_RW,
+ &ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU",
+ "Pool buffer size for ktrace(1)");
+
+static u_int
+ktrace_resize_pool(u_int oldsize, u_int newsize)
+{
+ STAILQ_HEAD(, ktr_request) ktr_new;
+ struct ktr_request *req;
+ int bound;
+
+ print_message = 1;
+ bound = newsize - oldsize;
+ if (bound == 0)
+ return (ktr_requestpool);
+ if (bound < 0) {
+ mtx_lock(&ktrace_mtx);
+ /* Shrink pool down to newsize if possible. */
+ while (bound++ < 0) {
+ req = STAILQ_FIRST(&ktr_free);
+ if (req == NULL)
+ break;
+ STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
+ ktr_requestpool--;
+ free(req, M_KTRACE);
+ }
+ } else {
+ /* Grow pool up to newsize. */
+ STAILQ_INIT(&ktr_new);
+ while (bound-- > 0) {
+ req = malloc(sizeof(struct ktr_request), M_KTRACE,
+ M_WAITOK);
+ STAILQ_INSERT_HEAD(&ktr_new, req, ktr_list);
+ }
+ mtx_lock(&ktrace_mtx);
+ STAILQ_CONCAT(&ktr_free, &ktr_new);
+ ktr_requestpool += (newsize - oldsize);
+ }
+ mtx_unlock(&ktrace_mtx);
+ return (ktr_requestpool);
+}
+
+/* ktr_getrequest() assumes that ktr_comm[] is the same size as td_name[]. */
+CTASSERT(sizeof(((struct ktr_header *)NULL)->ktr_comm) ==
+ (sizeof((struct thread *)NULL)->td_name));
+
+static struct ktr_request *
+ktr_getrequest_entered(struct thread *td, int type)
+{
+ struct ktr_request *req;
+ struct proc *p = td->td_proc;
+ int pm;
+
+ mtx_lock(&ktrace_mtx);
+ if (!KTRCHECK(td, type)) {
+ mtx_unlock(&ktrace_mtx);
+ return (NULL);
+ }
+ req = STAILQ_FIRST(&ktr_free);
+ if (req != NULL) {
+ STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
+ req->ktr_header.ktr_type = type;
+ if (p->p_traceflag & KTRFAC_DROP) {
+ req->ktr_header.ktr_type |= KTR_DROP;
+ p->p_traceflag &= ~KTRFAC_DROP;
+ }
+ mtx_unlock(&ktrace_mtx);
+ microtime(&req->ktr_header.ktr_time);
+ req->ktr_header.ktr_pid = p->p_pid;
+ req->ktr_header.ktr_tid = td->td_tid;
+ bcopy(td->td_name, req->ktr_header.ktr_comm,
+ sizeof(req->ktr_header.ktr_comm));
+ req->ktr_buffer = NULL;
+ req->ktr_header.ktr_len = 0;
+ } else {
+ p->p_traceflag |= KTRFAC_DROP;
+ pm = print_message;
+ print_message = 0;
+ mtx_unlock(&ktrace_mtx);
+ if (pm)
+ printf("Out of ktrace request objects.\n");
+ }
+ return (req);
+}
+
+static struct ktr_request *
+ktr_getrequest(int type)
+{
+ struct thread *td = curthread;
+ struct ktr_request *req;
+
+ ktrace_enter(td);
+ req = ktr_getrequest_entered(td, type);
+ if (req == NULL)
+ ktrace_exit(td);
+
+ return (req);
+}
+
+/*
+ * Some trace generation environments don't permit direct access to VFS,
+ * such as during a context switch where sleeping is not allowed. Under these
+ * circumstances, queue a request to the thread to be written asynchronously
+ * later.
+ */
+static void
+ktr_enqueuerequest(struct thread *td, struct ktr_request *req)
+{
+
+ mtx_lock(&ktrace_mtx);
+ STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list);
+ mtx_unlock(&ktrace_mtx);
+}
+
+/*
+ * Drain any pending ktrace records from the per-thread queue to disk. This
+ * is used both internally before committing other records, and also on
+ * system call return. We drain all the ones we can find at the time when
+ * drain is requested, but don't keep draining after that as those events
+ * may be approximately "after" the current event.
+ */
+static void
+ktr_drain(struct thread *td)
+{
+ struct ktr_request *queued_req;
+ STAILQ_HEAD(, ktr_request) local_queue;
+
+ ktrace_assert(td);
+ sx_assert(&ktrace_sx, SX_XLOCKED);
+
+ STAILQ_INIT(&local_queue);
+
+ if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) {
+ mtx_lock(&ktrace_mtx);
+ STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr);
+ mtx_unlock(&ktrace_mtx);
+
+ while ((queued_req = STAILQ_FIRST(&local_queue))) {
+ STAILQ_REMOVE_HEAD(&local_queue, ktr_list);
+ ktr_writerequest(td, queued_req);
+ ktr_freerequest(queued_req);
+ }
+ }
+}
+
+/*
+ * Submit a trace record for immediate commit to disk -- to be used only
+ * where entering VFS is OK. First drain any pending records that may have
+ * been cached in the thread.
+ */
+static void
+ktr_submitrequest(struct thread *td, struct ktr_request *req)
+{
+
+ ktrace_assert(td);
+
+ sx_xlock(&ktrace_sx);
+ ktr_drain(td);
+ ktr_writerequest(td, req);
+ ktr_freerequest(req);
+ sx_xunlock(&ktrace_sx);
+ ktrace_exit(td);
+}
+
+static void
+ktr_freerequest(struct ktr_request *req)
+{
+
+ mtx_lock(&ktrace_mtx);
+ ktr_freerequest_locked(req);
+ mtx_unlock(&ktrace_mtx);
+}
+
+static void
+ktr_freerequest_locked(struct ktr_request *req)
+{
+
+ mtx_assert(&ktrace_mtx, MA_OWNED);
+ if (req->ktr_buffer != NULL)
+ free(req->ktr_buffer, M_KTRACE);
+ STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
+}
+
+/*
+ * Disable tracing for a process and release all associated resources.
+ * The caller is responsible for releasing a reference on the returned
+ * vnode and credentials.
+ */
+static void
+ktr_freeproc(struct proc *p, struct ucred **uc, struct vnode **vp)
+{
+ struct ktr_request *req;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ mtx_assert(&ktrace_mtx, MA_OWNED);
+ *uc = p->p_tracecred;
+ p->p_tracecred = NULL;
+ if (vp != NULL)
+ *vp = p->p_tracevp;
+ p->p_tracevp = NULL;
+ p->p_traceflag = 0;
+ while ((req = STAILQ_FIRST(&p->p_ktr)) != NULL) {
+ STAILQ_REMOVE_HEAD(&p->p_ktr, ktr_list);
+ ktr_freerequest_locked(req);
+ }
+}
+
+void
+ktrsyscall(int code, int narg, register_t args[])
+{
+ struct ktr_request *req;
+ struct ktr_syscall *ktp;
+ size_t buflen;
+ char *buf = NULL;
+
+ buflen = sizeof(register_t) * narg;
+ if (buflen > 0) {
+ buf = malloc(buflen, M_KTRACE, M_WAITOK);
+ bcopy(args, buf, buflen);
+ }
+ req = ktr_getrequest(KTR_SYSCALL);
+ if (req == NULL) {
+ if (buf != NULL)
+ free(buf, M_KTRACE);
+ return;
+ }
+ ktp = &req->ktr_data.ktr_syscall;
+ ktp->ktr_code = code;
+ ktp->ktr_narg = narg;
+ if (buflen > 0) {
+ req->ktr_header.ktr_len = buflen;
+ req->ktr_buffer = buf;
+ }
+ ktr_submitrequest(curthread, req);
+}
+
+void
+ktrsysret(int code, int error, register_t retval)
+{
+ struct ktr_request *req;
+ struct ktr_sysret *ktp;
+
+ req = ktr_getrequest(KTR_SYSRET);
+ if (req == NULL)
+ return;
+ ktp = &req->ktr_data.ktr_sysret;
+ ktp->ktr_code = code;
+ ktp->ktr_error = error;
+ ktp->ktr_retval = ((error == 0) ? retval: 0); /* what about val2 ? */
+ ktr_submitrequest(curthread, req);
+}
+
+/*
+ * When a setuid process execs, disable tracing.
+ *
+ * XXX: We toss any pending asynchronous records.
+ */
+void
+ktrprocexec(struct proc *p, struct ucred **uc, struct vnode **vp)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ mtx_lock(&ktrace_mtx);
+ ktr_freeproc(p, uc, vp);
+ mtx_unlock(&ktrace_mtx);
+}
+
+/*
+ * When a process exits, drain per-process asynchronous trace records
+ * and disable tracing.
+ */
+void
+ktrprocexit(struct thread *td)
+{
+ struct ktr_request *req;
+ struct proc *p;
+ struct ucred *cred;
+ struct vnode *vp;
+
+ p = td->td_proc;
+ if (p->p_traceflag == 0)
+ return;
+
+ ktrace_enter(td);
+ req = ktr_getrequest_entered(td, KTR_PROCDTOR);
+ if (req != NULL)
+ ktr_enqueuerequest(td, req);
+ sx_xlock(&ktrace_sx);
+ ktr_drain(td);
+ sx_xunlock(&ktrace_sx);
+ PROC_LOCK(p);
+ mtx_lock(&ktrace_mtx);
+ ktr_freeproc(p, &cred, &vp);
+ mtx_unlock(&ktrace_mtx);
+ PROC_UNLOCK(p);
+ if (vp != NULL)
+ vrele(vp);
+ if (cred != NULL)
+ crfree(cred);
+ ktrace_exit(td);
+}
+
+static void
+ktrprocctor_entered(struct thread *td, struct proc *p)
+{
+ struct ktr_proc_ctor *ktp;
+ struct ktr_request *req;
+ struct thread *td2;
+
+ ktrace_assert(td);
+ td2 = FIRST_THREAD_IN_PROC(p);
+ req = ktr_getrequest_entered(td2, KTR_PROCCTOR);
+ if (req == NULL)
+ return;
+ ktp = &req->ktr_data.ktr_proc_ctor;
+ ktp->sv_flags = p->p_sysent->sv_flags;
+ ktr_enqueuerequest(td2, req);
+}
+
+void
+ktrprocctor(struct proc *p)
+{
+ struct thread *td = curthread;
+
+ if ((p->p_traceflag & KTRFAC_MASK) == 0)
+ return;
+
+ ktrace_enter(td);
+ ktrprocctor_entered(td, p);
+ ktrace_exit(td);
+}
+
+/*
+ * When a process forks, enable tracing in the new process if needed.
+ */
+void
+ktrprocfork(struct proc *p1, struct proc *p2)
+{
+
+ PROC_LOCK(p1);
+ mtx_lock(&ktrace_mtx);
+ KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode"));
+ if (p1->p_traceflag & KTRFAC_INHERIT) {
+ p2->p_traceflag = p1->p_traceflag;
+ if ((p2->p_tracevp = p1->p_tracevp) != NULL) {
+ VREF(p2->p_tracevp);
+ KASSERT(p1->p_tracecred != NULL,
+ ("ktrace vnode with no cred"));
+ p2->p_tracecred = crhold(p1->p_tracecred);
+ }
+ }
+ mtx_unlock(&ktrace_mtx);
+ PROC_UNLOCK(p1);
+
+ ktrprocctor(p2);
+}
+
+/*
+ * When a thread returns, drain any asynchronous records generated by the
+ * system call.
+ */
+void
+ktruserret(struct thread *td)
+{
+
+ ktrace_enter(td);
+ sx_xlock(&ktrace_sx);
+ ktr_drain(td);
+ sx_xunlock(&ktrace_sx);
+ ktrace_exit(td);
+}
+
+void
+ktrnamei(char *path)
+{
+ struct ktr_request *req;
+ int namelen;
+ char *buf = NULL;
+
+ namelen = strlen(path);
+ if (namelen > 0) {
+ buf = malloc(namelen, M_KTRACE, M_WAITOK);
+ bcopy(path, buf, namelen);
+ }
+ req = ktr_getrequest(KTR_NAMEI);
+ if (req == NULL) {
+ if (buf != NULL)
+ free(buf, M_KTRACE);
+ return;
+ }
+ if (namelen > 0) {
+ req->ktr_header.ktr_len = namelen;
+ req->ktr_buffer = buf;
+ }
+ ktr_submitrequest(curthread, req);
+}
+
+void
+ktrsysctl(int *name, u_int namelen)
+{
+ struct ktr_request *req;
+ u_int mib[CTL_MAXNAME + 2];
+ char *mibname;
+ size_t mibnamelen;
+ int error;
+
+ /* Lookup name of mib. */
+ KASSERT(namelen <= CTL_MAXNAME, ("sysctl MIB too long"));
+ mib[0] = 0;
+ mib[1] = 1;
+ bcopy(name, mib + 2, namelen * sizeof(*name));
+ mibnamelen = 128;
+ mibname = malloc(mibnamelen, M_KTRACE, M_WAITOK);
+ error = kernel_sysctl(curthread, mib, namelen + 2, mibname, &mibnamelen,
+ NULL, 0, &mibnamelen, 0);
+ if (error) {
+ free(mibname, M_KTRACE);
+ return;
+ }
+ req = ktr_getrequest(KTR_SYSCTL);
+ if (req == NULL) {
+ free(mibname, M_KTRACE);
+ return;
+ }
+ req->ktr_header.ktr_len = mibnamelen;
+ req->ktr_buffer = mibname;
+ ktr_submitrequest(curthread, req);
+}
+
+void
+ktrgenio(int fd, enum uio_rw rw, struct uio *uio, int error)
+{
+ struct ktr_request *req;
+ struct ktr_genio *ktg;
+ int datalen;
+ char *buf;
+
+ if (error) {
+ free(uio, M_IOV);
+ return;
+ }
+ uio->uio_offset = 0;
+ uio->uio_rw = UIO_WRITE;
+ datalen = MIN(uio->uio_resid, ktr_geniosize);
+ buf = malloc(datalen, M_KTRACE, M_WAITOK);
+ error = uiomove(buf, datalen, uio);
+ free(uio, M_IOV);
+ if (error) {
+ free(buf, M_KTRACE);
+ return;
+ }
+ req = ktr_getrequest(KTR_GENIO);
+ if (req == NULL) {
+ free(buf, M_KTRACE);
+ return;
+ }
+ ktg = &req->ktr_data.ktr_genio;
+ ktg->ktr_fd = fd;
+ ktg->ktr_rw = rw;
+ req->ktr_header.ktr_len = datalen;
+ req->ktr_buffer = buf;
+ ktr_submitrequest(curthread, req);
+}
+
+void
+ktrpsig(int sig, sig_t action, sigset_t *mask, int code)
+{
+ struct thread *td = curthread;
+ struct ktr_request *req;
+ struct ktr_psig *kp;
+
+ req = ktr_getrequest(KTR_PSIG);
+ if (req == NULL)
+ return;
+ kp = &req->ktr_data.ktr_psig;
+ kp->signo = (char)sig;
+ kp->action = action;
+ kp->mask = *mask;
+ kp->code = code;
+ ktr_enqueuerequest(td, req);
+ ktrace_exit(td);
+}
+
+void
+ktrcsw(int out, int user, const char *wmesg)
+{
+ struct thread *td = curthread;
+ struct ktr_request *req;
+ struct ktr_csw *kc;
+
+ req = ktr_getrequest(KTR_CSW);
+ if (req == NULL)
+ return;
+ kc = &req->ktr_data.ktr_csw;
+ kc->out = out;
+ kc->user = user;
+ if (wmesg != NULL)
+ strlcpy(kc->wmesg, wmesg, sizeof(kc->wmesg));
+ else
+ bzero(kc->wmesg, sizeof(kc->wmesg));
+ ktr_enqueuerequest(td, req);
+ ktrace_exit(td);
+}
+
+void
+ktrstruct(const char *name, void *data, size_t datalen)
+{
+ struct ktr_request *req;
+ char *buf = NULL;
+ size_t buflen;
+
+ if (!data)
+ datalen = 0;
+ buflen = strlen(name) + 1 + datalen;
+ buf = malloc(buflen, M_KTRACE, M_WAITOK);
+ strcpy(buf, name);
+ bcopy(data, buf + strlen(name) + 1, datalen);
+ if ((req = ktr_getrequest(KTR_STRUCT)) == NULL) {
+ free(buf, M_KTRACE);
+ return;
+ }
+ req->ktr_buffer = buf;
+ req->ktr_header.ktr_len = buflen;
+ ktr_submitrequest(curthread, req);
+}
+
+void
+ktrcapfail(enum ktr_cap_fail_type type, const cap_rights_t *needed,
+ const cap_rights_t *held)
+{
+ struct thread *td = curthread;
+ struct ktr_request *req;
+ struct ktr_cap_fail *kcf;
+
+ req = ktr_getrequest(KTR_CAPFAIL);
+ if (req == NULL)
+ return;
+ kcf = &req->ktr_data.ktr_cap_fail;
+ kcf->cap_type = type;
+ kcf->cap_needed = *needed;
+ kcf->cap_held = *held;
+ ktr_enqueuerequest(td, req);
+ ktrace_exit(td);
+}
+
+void
+ktrfault(vm_offset_t vaddr, int type)
+{
+ struct thread *td = curthread;
+ struct ktr_request *req;
+ struct ktr_fault *kf;
+
+ req = ktr_getrequest(KTR_FAULT);
+ if (req == NULL)
+ return;
+ kf = &req->ktr_data.ktr_fault;
+ kf->vaddr = vaddr;
+ kf->type = type;
+ ktr_enqueuerequest(td, req);
+ ktrace_exit(td);
+}
+
+void
+ktrfaultend(int result)
+{
+ struct thread *td = curthread;
+ struct ktr_request *req;
+ struct ktr_faultend *kf;
+
+ req = ktr_getrequest(KTR_FAULTEND);
+ if (req == NULL)
+ return;
+ kf = &req->ktr_data.ktr_faultend;
+ kf->result = result;
+ ktr_enqueuerequest(td, req);
+ ktrace_exit(td);
+}
+#endif /* KTRACE */
+
+/* Interface and common routines */
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktrace_args {
+ char *fname;
+ int ops;
+ int facs;
+ int pid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_ktrace(struct thread *td, struct ktrace_args *uap)
+{
+#ifdef KTRACE
+ struct vnode *vp = NULL;
+ struct proc *p;
+ struct pgrp *pg;
+ int facs = uap->facs & ~KTRFAC_ROOT;
+ int ops = KTROP(uap->ops);
+ int descend = uap->ops & KTRFLAG_DESCEND;
+ int nfound, ret = 0;
+ int flags, error = 0;
+ struct nameidata nd;
+ struct ucred *cred;
+
+ /*
+ * Need something to (un)trace.
+ */
+ if (ops != KTROP_CLEARFILE && facs == 0)
+ return (EINVAL);
+
+ ktrace_enter(td);
+ if (ops != KTROP_CLEAR) {
+ /*
+ * an operation which requires a file argument.
+ */
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->fname, td);
+ flags = FREAD | FWRITE | O_NOFOLLOW;
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error) {
+ ktrace_exit(td);
+ return (error);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ VOP_UNLOCK(vp, 0);
+ if (vp->v_type != VREG) {
+ (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
+ ktrace_exit(td);
+ return (EACCES);
+ }
+ }
+ /*
+ * Clear all uses of the tracefile.
+ */
+ if (ops == KTROP_CLEARFILE) {
+ int vrele_count;
+
+ vrele_count = 0;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_tracevp == vp) {
+ if (ktrcanset(td, p)) {
+ mtx_lock(&ktrace_mtx);
+ ktr_freeproc(p, &cred, NULL);
+ mtx_unlock(&ktrace_mtx);
+ vrele_count++;
+ crfree(cred);
+ } else
+ error = EPERM;
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ if (vrele_count > 0) {
+ while (vrele_count-- > 0)
+ vrele(vp);
+ }
+ goto done;
+ }
+ /*
+ * do it
+ */
+ sx_slock(&proctree_lock);
+ if (uap->pid < 0) {
+ /*
+ * by process group
+ */
+ pg = pgfind(-uap->pid);
+ if (pg == NULL) {
+ sx_sunlock(&proctree_lock);
+ error = ESRCH;
+ goto done;
+ }
+ /*
+ * ktrops() may call vrele(). Lock pg_members
+ * by the proctree_lock rather than pg_mtx.
+ */
+ PGRP_UNLOCK(pg);
+ nfound = 0;
+ LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW ||
+ p_cansee(td, p) != 0) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ nfound++;
+ if (descend)
+ ret |= ktrsetchildren(td, p, ops, facs, vp);
+ else
+ ret |= ktrops(td, p, ops, facs, vp);
+ }
+ if (nfound == 0) {
+ sx_sunlock(&proctree_lock);
+ error = ESRCH;
+ goto done;
+ }
+ } else {
+ /*
+ * by pid
+ */
+ p = pfind(uap->pid);
+ if (p == NULL)
+ error = ESRCH;
+ else
+ error = p_cansee(td, p);
+ if (error) {
+ if (p != NULL)
+ PROC_UNLOCK(p);
+ sx_sunlock(&proctree_lock);
+ goto done;
+ }
+ if (descend)
+ ret |= ktrsetchildren(td, p, ops, facs, vp);
+ else
+ ret |= ktrops(td, p, ops, facs, vp);
+ }
+ sx_sunlock(&proctree_lock);
+ if (!ret)
+ error = EPERM;
+done:
+ if (vp != NULL)
+ (void) vn_close(vp, FWRITE, td->td_ucred, td);
+ ktrace_exit(td);
+ return (error);
+#else /* !KTRACE */
+ return (ENOSYS);
+#endif /* KTRACE */
+}
+
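sys_ktrace() above is the entry point behind the ktrace(1) utility. The minimal, hedged userland example below enables system-call tracing of the calling process and then switches it off again; the resulting ktrace.out is read with kdump(1).

/* Illustrative use of ktrace(2): trace this process's own system calls. */
#include <sys/param.h>
#include <sys/time.h>
#include <sys/uio.h>
#include <sys/ktrace.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{

	if (ktrace("ktrace.out", KTROP_SET,
	    KTRFAC_SYSCALL | KTRFAC_SYSRET, getpid()) == -1) {
		perror("ktrace");
		return (1);
	}
	printf("hello, traced world\n");
	(void)ktrace("ktrace.out", KTROP_CLEAR,
	    KTRFAC_SYSCALL | KTRFAC_SYSRET, getpid());
	return (0);
}

KTROP_CLEARFILE, handled in the function above, is the heavier operation that detaches every process still writing to a given trace vnode.
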
+/* ARGSUSED */
+int
+sys_utrace(struct thread *td, struct utrace_args *uap)
+{
+
+#ifdef KTRACE
+ struct ktr_request *req;
+ void *cp;
+ int error;
+
+ if (!KTRPOINT(td, KTR_USER))
+ return (0);
+ if (uap->len > KTR_USER_MAXLEN)
+ return (EINVAL);
+ cp = malloc(uap->len, M_KTRACE, M_WAITOK);
+ error = copyin(uap->addr, cp, uap->len);
+ if (error) {
+ free(cp, M_KTRACE);
+ return (error);
+ }
+ req = ktr_getrequest(KTR_USER);
+ if (req == NULL) {
+ free(cp, M_KTRACE);
+ return (ENOMEM);
+ }
+ req->ktr_buffer = cp;
+ req->ktr_header.ktr_len = uap->len;
+ ktr_submitrequest(td, req);
+ return (0);
+#else /* !KTRACE */
+ return (ENOSYS);
+#endif /* KTRACE */
+}
+
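sys_utrace() lets a traced program inject its own application-defined records (KTR_USER) into the ktrace stream; kdump(1) prints them as USER events. A hedged example follows, assuming the documented utrace(2) prototype; the record layout is invented and entirely up to the application, subject only to the KTR_USER_MAXLEN cap enforced above.

/* Illustrative use of utrace(2): emit a custom marker record. */
#include <sys/param.h>
#include <sys/time.h>
#include <sys/uio.h>
#include <sys/ktrace.h>
#include <string.h>

struct example_utrace_rec {	/* application-defined payload */
	char	tag[8];
	int	value;
};

static void
example_marker(int value)
{
	struct example_utrace_rec rec;

	memset(&rec, 0, sizeof(rec));
	strlcpy(rec.tag, "MARK", sizeof(rec.tag));
	rec.value = value;
	/* A no-op unless the process is ktrace'd with the user facility. */
	(void)utrace(&rec, sizeof(rec));
}
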
+#ifdef KTRACE
+static int
+ktrops(struct thread *td, struct proc *p, int ops, int facs,
+ struct vnode *vp)
+{
+ struct vnode *tracevp = NULL;
+ struct ucred *tracecred = NULL;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (!ktrcanset(td, p)) {
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ if (p->p_flag & P_WEXIT) {
+ /* If the process is exiting, just ignore it. */
+ PROC_UNLOCK(p);
+ return (1);
+ }
+ mtx_lock(&ktrace_mtx);
+ if (ops == KTROP_SET) {
+ if (p->p_tracevp != vp) {
+ /*
+ * if trace file already in use, relinquish below
+ */
+ tracevp = p->p_tracevp;
+ VREF(vp);
+ p->p_tracevp = vp;
+ }
+ if (p->p_tracecred != td->td_ucred) {
+ tracecred = p->p_tracecred;
+ p->p_tracecred = crhold(td->td_ucred);
+ }
+ p->p_traceflag |= facs;
+ if (priv_check(td, PRIV_KTRACE) == 0)
+ p->p_traceflag |= KTRFAC_ROOT;
+ } else {
+ /* KTROP_CLEAR */
+ if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0)
+ /* no more tracing */
+ ktr_freeproc(p, &tracecred, &tracevp);
+ }
+ mtx_unlock(&ktrace_mtx);
+ if ((p->p_traceflag & KTRFAC_MASK) != 0)
+ ktrprocctor_entered(td, p);
+ PROC_UNLOCK(p);
+ if (tracevp != NULL)
+ vrele(tracevp);
+ if (tracecred != NULL)
+ crfree(tracecred);
+
+ return (1);
+}
+
+static int
+ktrsetchildren(struct thread *td, struct proc *top, int ops, int facs,
+ struct vnode *vp)
+{
+ struct proc *p;
+ int ret = 0;
+
+ p = top;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ sx_assert(&proctree_lock, SX_LOCKED);
+ for (;;) {
+ ret |= ktrops(td, p, ops, facs, vp);
+ /*
+ * If this process has children, descend to them next,
+ * otherwise do any siblings, and if done with this level,
+ * follow back up the tree (but not past top).
+ */
+ if (!LIST_EMPTY(&p->p_children))
+ p = LIST_FIRST(&p->p_children);
+ else for (;;) {
+ if (p == top)
+ return (ret);
+ if (LIST_NEXT(p, p_sibling)) {
+ p = LIST_NEXT(p, p_sibling);
+ break;
+ }
+ p = p->p_pptr;
+ }
+ PROC_LOCK(p);
+ }
+ /*NOTREACHED*/
+}
+
+static void
+ktr_writerequest(struct thread *td, struct ktr_request *req)
+{
+ struct ktr_header *kth;
+ struct vnode *vp;
+ struct proc *p;
+ struct ucred *cred;
+ struct uio auio;
+ struct iovec aiov[3];
+ struct mount *mp;
+ int datalen, buflen, vrele_count;
+ int error;
+
+ /*
+ * We hold the vnode and credential for use in I/O in case ktrace is
+ * disabled on the process as we write out the request.
+ *
+ * XXXRW: This is not ideal: we could end up performing a write after
+ * the vnode has been closed.
+ */
+ mtx_lock(&ktrace_mtx);
+ vp = td->td_proc->p_tracevp;
+ cred = td->td_proc->p_tracecred;
+
+ /*
+ * If vp is NULL, the vp has been cleared out from under this
+ * request, so just drop it. Make sure the credential and vnode are
+ * in sync: we should have both or neither.
+ */
+ if (vp == NULL) {
+ KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL"));
+ mtx_unlock(&ktrace_mtx);
+ return;
+ }
+ VREF(vp);
+ KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL"));
+ crhold(cred);
+ mtx_unlock(&ktrace_mtx);
+
+ kth = &req->ktr_header;
+ KASSERT(((u_short)kth->ktr_type & ~KTR_DROP) <
+ sizeof(data_lengths) / sizeof(data_lengths[0]),
+ ("data_lengths array overflow"));
+ datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP];
+ buflen = kth->ktr_len;
+ auio.uio_iov = &aiov[0];
+ auio.uio_offset = 0;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ aiov[0].iov_base = (caddr_t)kth;
+ aiov[0].iov_len = sizeof(struct ktr_header);
+ auio.uio_resid = sizeof(struct ktr_header);
+ auio.uio_iovcnt = 1;
+ auio.uio_td = td;
+ if (datalen != 0) {
+ aiov[1].iov_base = (caddr_t)&req->ktr_data;
+ aiov[1].iov_len = datalen;
+ auio.uio_resid += datalen;
+ auio.uio_iovcnt++;
+ kth->ktr_len += datalen;
+ }
+ if (buflen != 0) {
+ KASSERT(req->ktr_buffer != NULL, ("ktrace: nothing to write"));
+ aiov[auio.uio_iovcnt].iov_base = req->ktr_buffer;
+ aiov[auio.uio_iovcnt].iov_len = buflen;
+ auio.uio_resid += buflen;
+ auio.uio_iovcnt++;
+ }
+
+ vn_start_write(vp, &mp, V_WAIT);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef MAC
+ error = mac_vnode_check_write(cred, NOCRED, vp);
+ if (error == 0)
+#endif
+ error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ crfree(cred);
+ if (!error) {
+ vrele(vp);
+ return;
+ }
+
+ /*
+ * If error encountered, give up tracing on this vnode. We defer
+ * all the vrele()'s on the vnode until after we are finished walking
+ * the various lists to avoid needlessly holding locks.
+ * NB: at this point we still hold the vnode reference that must
+ * not go away as we need the valid vnode to compare with. Thus let
+ * vrele_count start at 1 and the reference will be freed
+ * by the loop at the end after our last use of vp.
+ */
+ log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n",
+ error);
+ vrele_count = 1;
+ /*
+ * First, clear this vnode from being used by any processes in the
+ * system.
+ * XXX - If one process gets an EPERM writing to the vnode, should
+ * we really do this? Other processes might have suitable
+ * credentials for the operation.
+ */
+ cred = NULL;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_tracevp == vp) {
+ mtx_lock(&ktrace_mtx);
+ ktr_freeproc(p, &cred, NULL);
+ mtx_unlock(&ktrace_mtx);
+ vrele_count++;
+ }
+ PROC_UNLOCK(p);
+ if (cred != NULL) {
+ crfree(cred);
+ cred = NULL;
+ }
+ }
+ sx_sunlock(&allproc_lock);
+
+ while (vrele_count-- > 0)
+ vrele(vp);
+}
+
+/*
+ * Return true if caller has permission to set the ktracing state
+ * of target. Essentially, the target can't possess any
+ * more permissions than the caller. KTRFAC_ROOT signifies that
+ * root previously set the tracing status on the target process, and
+ * so, only root may further change it.
+ */
+static int
+ktrcanset(struct thread *td, struct proc *targetp)
+{
+
+ PROC_LOCK_ASSERT(targetp, MA_OWNED);
+ if (targetp->p_traceflag & KTRFAC_ROOT &&
+ priv_check(td, PRIV_KTRACE))
+ return (0);
+
+ if (p_candebug(td, targetp) != 0)
+ return (0);
+
+ return (1);
+}
+
+#endif /* KTRACE */
diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c
new file mode 100644
index 0000000..7d32260
--- /dev/null
+++ b/sys/kern/kern_linker.c
@@ -0,0 +1,2162 @@
+/*-
+ * Copyright (c) 1997-2000 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_kld.h"
+#include "opt_hwpmc_hooks.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/sysproto.h>
+#include <sys/sysent.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/module.h>
+#include <sys/mount.h>
+#include <sys/linker.h>
+#include <sys/eventhandler.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/libkern.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+
+#include <net/vnet.h>
+
+#include <security/mac/mac_framework.h>
+
+#include "linker_if.h"
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
+#ifdef KLD_DEBUG
+int kld_debug = 0;
+SYSCTL_INT(_debug, OID_AUTO, kld_debug, CTLFLAG_RW | CTLFLAG_TUN,
+ &kld_debug, 0, "Set various levels of KLD debug");
+TUNABLE_INT("debug.kld_debug", &kld_debug);
+#endif
+
+/*
+ * static char *linker_search_path(const char *name, struct mod_depend
+ * *verinfo);
+ */
+static const char *linker_basename(const char *path);
+
+/*
+ * Find a currently loaded file given its filename.
+ */
+static linker_file_t linker_find_file_by_name(const char* _filename);
+
+/*
+ * Find a currently loaded file given its file id.
+ */
+static linker_file_t linker_find_file_by_id(int _fileid);
+
+/* Metadata from the static kernel */
+SET_DECLARE(modmetadata_set, struct mod_metadata);
+
+MALLOC_DEFINE(M_LINKER, "linker", "kernel linker");
+
+linker_file_t linker_kernel_file;
+
+static struct sx kld_sx; /* kernel linker lock */
+
+/*
+ * Load counter used by clients to determine if a linker file has been
+ * re-loaded. This counter is incremented for each file load.
+ */
+static int loadcnt;
+
+static linker_class_list_t classes;
+static linker_file_list_t linker_files;
+static int next_file_id = 1;
+static int linker_no_more_classes = 0;
+
+#define LINKER_GET_NEXT_FILE_ID(a) do { \
+ linker_file_t lftmp; \
+ \
+ if (!cold) \
+ sx_assert(&kld_sx, SA_XLOCKED); \
+retry: \
+ TAILQ_FOREACH(lftmp, &linker_files, link) { \
+ if (next_file_id == lftmp->id) { \
+ next_file_id++; \
+ goto retry; \
+ } \
+ } \
+ (a) = next_file_id; \
+} while(0)
+
+
+/* XXX wrong name; we're looking at version provision tags here, not modules */
+typedef TAILQ_HEAD(, modlist) modlisthead_t;
+struct modlist {
+ TAILQ_ENTRY(modlist) link; /* chain together all modules */
+ linker_file_t container;
+ const char *name;
+ int version;
+};
+typedef struct modlist *modlist_t;
+static modlisthead_t found_modules;
+
+static int linker_file_add_dependency(linker_file_t file,
+ linker_file_t dep);
+static caddr_t linker_file_lookup_symbol_internal(linker_file_t file,
+ const char* name, int deps);
+static int linker_load_module(const char *kldname,
+ const char *modname, struct linker_file *parent,
+ struct mod_depend *verinfo, struct linker_file **lfpp);
+static modlist_t modlist_lookup2(const char *name, struct mod_depend *verinfo);
+
+static void
+linker_init(void *arg)
+{
+
+ sx_init(&kld_sx, "kernel linker");
+ TAILQ_INIT(&classes);
+ TAILQ_INIT(&linker_files);
+}
+
+SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, 0);
+
+static void
+linker_stop_class_add(void *arg)
+{
+
+ linker_no_more_classes = 1;
+}
+
+SYSINIT(linker_class, SI_SUB_KLD, SI_ORDER_ANY, linker_stop_class_add, NULL);
+
+int
+linker_add_class(linker_class_t lc)
+{
+
+ /*
+ * We disallow any class registration past SI_ORDER_ANY
+ * of SI_SUB_KLD. We bump the reference count to keep the
+ * ops from being freed.
+ */
+ if (linker_no_more_classes == 1)
+ return (EPERM);
+ kobj_class_compile((kobj_class_t) lc);
+ ((kobj_class_t)lc)->refs++; /* XXX: kobj_mtx */
+ TAILQ_INSERT_TAIL(&classes, lc, link);
+ return (0);
+}
+
+static void
+linker_file_sysinit(linker_file_t lf)
+{
+ struct sysinit **start, **stop, **sipp, **xipp, *save;
+
+ KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n",
+ lf->filename));
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+
+ if (linker_file_lookup_set(lf, "sysinit_set", &start, &stop, NULL) != 0)
+ return;
+ /*
+ * Perform a bubble sort of the system initialization objects by
+ * their subsystem (primary key) and order (secondary key).
+ *
+ * Since some things care about execution order, this is the operation
+ * which ensures continued function.
+ */
+ for (sipp = start; sipp < stop; sipp++) {
+ for (xipp = sipp + 1; xipp < stop; xipp++) {
+ if ((*sipp)->subsystem < (*xipp)->subsystem ||
+ ((*sipp)->subsystem == (*xipp)->subsystem &&
+ (*sipp)->order <= (*xipp)->order))
+ continue; /* skip */
+ save = *sipp;
+ *sipp = *xipp;
+ *xipp = save;
+ }
+ }
+
+ /*
+ * Traverse the (now) ordered list of system initialization tasks.
+ * Perform each task, and continue on to the next task.
+ */
+ sx_xunlock(&kld_sx);
+ mtx_lock(&Giant);
+ for (sipp = start; sipp < stop; sipp++) {
+ if ((*sipp)->subsystem == SI_SUB_DUMMY)
+ continue; /* skip dummy task(s) */
+
+ /* Call function */
+ (*((*sipp)->func)) ((*sipp)->udata);
+ }
+ mtx_unlock(&Giant);
+ sx_xlock(&kld_sx);
+}
+
+static void
+linker_file_sysuninit(linker_file_t lf)
+{
+ struct sysinit **start, **stop, **sipp, **xipp, *save;
+
+ KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n",
+ lf->filename));
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+
+ if (linker_file_lookup_set(lf, "sysuninit_set", &start, &stop,
+ NULL) != 0)
+ return;
+
+ /*
+ * Perform a reverse bubble sort of the system initialization objects
+ * by their subsystem (primary key) and order (secondary key).
+ *
+ * Since some things care about execution order, this is the operation
+ * which ensures continued function.
+ */
+ for (sipp = start; sipp < stop; sipp++) {
+ for (xipp = sipp + 1; xipp < stop; xipp++) {
+ if ((*sipp)->subsystem > (*xipp)->subsystem ||
+ ((*sipp)->subsystem == (*xipp)->subsystem &&
+ (*sipp)->order >= (*xipp)->order))
+ continue; /* skip */
+ save = *sipp;
+ *sipp = *xipp;
+ *xipp = save;
+ }
+ }
+
+ /*
+ * Traverse the (now) ordered list of system initialization tasks.
+ * Perform each task, and continue on to the next task.
+ */
+ sx_xunlock(&kld_sx);
+ mtx_lock(&Giant);
+ for (sipp = start; sipp < stop; sipp++) {
+ if ((*sipp)->subsystem == SI_SUB_DUMMY)
+ continue; /* skip dummy task(s) */
+
+ /* Call function */
+ (*((*sipp)->func)) ((*sipp)->udata);
+ }
+ mtx_unlock(&Giant);
+ sx_xlock(&kld_sx);
+}
+
+static void
+linker_file_register_sysctls(linker_file_t lf)
+{
+ struct sysctl_oid **start, **stop, **oidp;
+
+ KLD_DPF(FILE,
+ ("linker_file_register_sysctls: registering SYSCTLs for %s\n",
+ lf->filename));
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+
+ if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0)
+ return;
+
+ sx_xunlock(&kld_sx);
+ sysctl_lock();
+ for (oidp = start; oidp < stop; oidp++)
+ sysctl_register_oid(*oidp);
+ sysctl_unlock();
+ sx_xlock(&kld_sx);
+}
+
+static void
+linker_file_unregister_sysctls(linker_file_t lf)
+{
+ struct sysctl_oid **start, **stop, **oidp;
+
+ KLD_DPF(FILE, ("linker_file_unregister_sysctls: unregistering SYSCTLs"
+ " for %s\n", lf->filename));
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+
+ if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0)
+ return;
+
+ sx_xunlock(&kld_sx);
+ sysctl_lock();
+ for (oidp = start; oidp < stop; oidp++)
+ sysctl_unregister_oid(*oidp);
+ sysctl_unlock();
+ sx_xlock(&kld_sx);
+}
+
+static int
+linker_file_register_modules(linker_file_t lf)
+{
+ struct mod_metadata **start, **stop, **mdp;
+ const moduledata_t *moddata;
+ int first_error, error;
+
+ KLD_DPF(FILE, ("linker_file_register_modules: registering modules"
+ " in %s\n", lf->filename));
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+
+ if (linker_file_lookup_set(lf, "modmetadata_set", &start,
+ &stop, NULL) != 0) {
+ /*
+ * This fallback should be unnecessary, but if we get booted
+ * from boot2 instead of loader and we are missing our
+ * metadata then we have to try the best we can.
+ */
+ if (lf == linker_kernel_file) {
+ start = SET_BEGIN(modmetadata_set);
+ stop = SET_LIMIT(modmetadata_set);
+ } else
+ return (0);
+ }
+ first_error = 0;
+ for (mdp = start; mdp < stop; mdp++) {
+ if ((*mdp)->md_type != MDT_MODULE)
+ continue;
+ moddata = (*mdp)->md_data;
+ KLD_DPF(FILE, ("Registering module %s in %s\n",
+ moddata->name, lf->filename));
+ error = module_register(moddata, lf);
+ if (error) {
+ printf("Module %s failed to register: %d\n",
+ moddata->name, error);
+ if (first_error == 0)
+ first_error = error;
+ }
+ }
+ return (first_error);
+}
+
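linker_file_register_modules() above walks a file's modmetadata_set and registers every MDT_MODULE entry it finds. Those entries are normally produced by the DECLARE_MODULE() macro in the module's own source; the hedged skeleton below, patterned after the customary KLD example, shows what such a module provides (all names are illustrative).

/*
 * Sketch only: a minimal loadable module whose metadata lands in
 * modmetadata_set and is registered by the code above.
 */
#include <sys/param.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/systm.h>

static int
example_modevent(module_t mod, int event, void *arg)
{

	switch (event) {
	case MOD_LOAD:
		printf("example module loaded\n");
		return (0);
	case MOD_UNLOAD:
		printf("example module unloaded\n");
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t example_mod = {
	"example",		/* module name */
	example_modevent,	/* event handler */
	NULL			/* extra data */
};

DECLARE_MODULE(example, example_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
MODULE_VERSION(example, 1);
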
+static void
+linker_init_kernel_modules(void)
+{
+
+ sx_xlock(&kld_sx);
+ linker_file_register_modules(linker_kernel_file);
+ sx_xunlock(&kld_sx);
+}
+
+SYSINIT(linker_kernel, SI_SUB_KLD, SI_ORDER_ANY, linker_init_kernel_modules,
+ 0);
+
+static int
+linker_load_file(const char *filename, linker_file_t *result)
+{
+ linker_class_t lc;
+ linker_file_t lf;
+ int foundfile, error, modules;
+
+ /* Refuse to load modules if securelevel raised */
+ if (prison0.pr_securelevel > 0)
+ return (EPERM);
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+ lf = linker_find_file_by_name(filename);
+ if (lf) {
+ KLD_DPF(FILE, ("linker_load_file: file %s is already loaded,"
+ " incrementing refs\n", filename));
+ *result = lf;
+ lf->refs++;
+ return (0);
+ }
+ foundfile = 0;
+ error = 0;
+
+ /*
+ * We do not need to protect (lock) classes here because there is
+ * no class registration past startup (SI_SUB_KLD, SI_ORDER_ANY)
+ * and there is no class deregistration mechanism at this time.
+ */
+ TAILQ_FOREACH(lc, &classes, link) {
+ KLD_DPF(FILE, ("linker_load_file: trying to load %s\n",
+ filename));
+ error = LINKER_LOAD_FILE(lc, filename, &lf);
+ /*
+ * If we got something other than ENOENT, then it exists but
+ * we cannot load it for some other reason.
+ */
+ if (error != ENOENT)
+ foundfile = 1;
+ if (lf) {
+ error = linker_file_register_modules(lf);
+ if (error == EEXIST) {
+ linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ return (error);
+ }
+ modules = !TAILQ_EMPTY(&lf->modules);
+ linker_file_register_sysctls(lf);
+ linker_file_sysinit(lf);
+ lf->flags |= LINKER_FILE_LINKED;
+
+ /*
+ * If all of the modules in this file failed
+ * to load, unload the file and return an
+ * error of ENOEXEC.
+ */
+ if (modules && TAILQ_EMPTY(&lf->modules)) {
+ linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ return (ENOEXEC);
+ }
+ *result = lf;
+ return (0);
+ }
+ }
+ /*
+ * Less than ideal, but tells the user whether it failed to load or
+ * the module was not found.
+ */
+ if (foundfile) {
+
+ /*
+ * If the last linker class tried did not recognize the file type,
+ * print a message before failing.
+ */
+ if (error == ENOSYS)
+ printf("linker_load_file: Unsupported file type\n");
+
+ /*
+ * Format not recognized or otherwise unloadable.
+ * When loading a module that is statically built into
+ * the kernel EEXIST percolates back up as the return
+ * value. Preserve this so that apps like sysinstall
+ * can recognize this special case and not post bogus
+ * dialog boxes.
+ */
+ if (error != EEXIST)
+ error = ENOEXEC;
+ } else
+ error = ENOENT; /* Nothing found */
+ return (error);
+}
+
+int
+linker_reference_module(const char *modname, struct mod_depend *verinfo,
+ linker_file_t *result)
+{
+ modlist_t mod;
+ int error;
+
+ sx_xlock(&kld_sx);
+ if ((mod = modlist_lookup2(modname, verinfo)) != NULL) {
+ *result = mod->container;
+ (*result)->refs++;
+ sx_xunlock(&kld_sx);
+ return (0);
+ }
+
+ error = linker_load_module(NULL, modname, NULL, verinfo, result);
+ sx_xunlock(&kld_sx);
+ return (error);
+}
+
+int
+linker_release_module(const char *modname, struct mod_depend *verinfo,
+ linker_file_t lf)
+{
+ modlist_t mod;
+ int error;
+
+ sx_xlock(&kld_sx);
+ if (lf == NULL) {
+ KASSERT(modname != NULL,
+ ("linker_release_module: no file or name"));
+ mod = modlist_lookup2(modname, verinfo);
+ if (mod == NULL) {
+ sx_xunlock(&kld_sx);
+ return (ESRCH);
+ }
+ lf = mod->container;
+ } else
+ KASSERT(modname == NULL && verinfo == NULL,
+ ("linker_release_module: both file and name"));
+ error = linker_file_unload(lf, LINKER_UNLOAD_NORMAL);
+ sx_xunlock(&kld_sx);
+ return (error);
+}
+
+static linker_file_t
+linker_find_file_by_name(const char *filename)
+{
+ linker_file_t lf;
+ char *koname;
+
+ koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK);
+ sprintf(koname, "%s.ko", filename);
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ if (strcmp(lf->filename, koname) == 0)
+ break;
+ if (strcmp(lf->filename, filename) == 0)
+ break;
+ }
+ free(koname, M_LINKER);
+ return (lf);
+}
+
+static linker_file_t
+linker_find_file_by_id(int fileid)
+{
+ linker_file_t lf;
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+ TAILQ_FOREACH(lf, &linker_files, link)
+ if (lf->id == fileid && lf->flags & LINKER_FILE_LINKED)
+ break;
+ return (lf);
+}
+
+int
+linker_file_foreach(linker_predicate_t *predicate, void *context)
+{
+ linker_file_t lf;
+ int retval = 0;
+
+ sx_xlock(&kld_sx);
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ retval = predicate(lf, context);
+ if (retval != 0)
+ break;
+ }
+ sx_xunlock(&kld_sx);
+ return (retval);
+}
+
+linker_file_t
+linker_make_file(const char *pathname, linker_class_t lc)
+{
+ linker_file_t lf;
+ const char *filename;
+
+ if (!cold)
+ sx_assert(&kld_sx, SA_XLOCKED);
+ filename = linker_basename(pathname);
+
+ KLD_DPF(FILE, ("linker_make_file: new file, filename='%s' for pathname='%s'\n", filename, pathname));
+ lf = (linker_file_t)kobj_create((kobj_class_t)lc, M_LINKER, M_WAITOK);
+ if (lf == NULL)
+ return (NULL);
+ lf->refs = 1;
+ lf->userrefs = 0;
+ lf->flags = 0;
+ lf->filename = strdup(filename, M_LINKER);
+ lf->pathname = strdup(pathname, M_LINKER);
+ LINKER_GET_NEXT_FILE_ID(lf->id);
+ lf->ndeps = 0;
+ lf->deps = NULL;
+ lf->loadcnt = ++loadcnt;
+ STAILQ_INIT(&lf->common);
+ TAILQ_INIT(&lf->modules);
+ TAILQ_INSERT_TAIL(&linker_files, lf, link);
+ return (lf);
+}
+
+int
+linker_file_unload(linker_file_t file, int flags)
+{
+ module_t mod, next;
+ modlist_t ml, nextml;
+ struct common_symbol *cp;
+ int error, i;
+
+ /* Refuse to unload modules if securelevel raised. */
+ if (prison0.pr_securelevel > 0)
+ return (EPERM);
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+ KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs));
+
+ /* Easy case of just dropping a reference. */
+ if (file->refs > 1) {
+ file->refs--;
+ return (0);
+ }
+
+ KLD_DPF(FILE, ("linker_file_unload: file is unloading,"
+ " informing modules\n"));
+
+ /*
+ * Quiesce all the modules to give them a chance to veto the unload.
+ */
+ MOD_SLOCK;
+ for (mod = TAILQ_FIRST(&file->modules); mod;
+ mod = module_getfnext(mod)) {
+
+ error = module_quiesce(mod);
+ if (error != 0 && flags != LINKER_UNLOAD_FORCE) {
+ KLD_DPF(FILE, ("linker_file_unload: module %s"
+ " vetoed unload\n", module_getname(mod)));
+ /*
+ * XXX: Do we need to tell all the quiesced modules
+ * that they can resume work now via a new module
+ * event?
+ */
+ MOD_SUNLOCK;
+ return (error);
+ }
+ }
+ MOD_SUNLOCK;
+
+ /*
+ * Inform any modules associated with this file that they are
+ * being unloaded.
+ */
+ MOD_XLOCK;
+ for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) {
+ next = module_getfnext(mod);
+ MOD_XUNLOCK;
+
+ /*
+ * Give the module a chance to veto the unload.
+ */
+ if ((error = module_unload(mod)) != 0) {
+#ifdef KLD_DEBUG
+ MOD_SLOCK;
+ KLD_DPF(FILE, ("linker_file_unload: module %s"
+ " failed unload\n", module_getname(mod)));
+ MOD_SUNLOCK;
+#endif
+ return (error);
+ }
+ MOD_XLOCK;
+ module_release(mod);
+ }
+ MOD_XUNLOCK;
+
+ TAILQ_FOREACH_SAFE(ml, &found_modules, link, nextml) {
+ if (ml->container == file) {
+ TAILQ_REMOVE(&found_modules, ml, link);
+ free(ml, M_LINKER);
+ }
+ }
+
+ /*
+ * Don't try to run SYSUNINITs if we are unloaded due to a
+ * link error.
+ */
+ if (file->flags & LINKER_FILE_LINKED) {
+ file->flags &= ~LINKER_FILE_LINKED;
+ linker_file_sysuninit(file);
+ linker_file_unregister_sysctls(file);
+ }
+ TAILQ_REMOVE(&linker_files, file, link);
+
+ if (file->deps) {
+ for (i = 0; i < file->ndeps; i++)
+ linker_file_unload(file->deps[i], flags);
+ free(file->deps, M_LINKER);
+ file->deps = NULL;
+ }
+ while ((cp = STAILQ_FIRST(&file->common)) != NULL) {
+ STAILQ_REMOVE_HEAD(&file->common, link);
+ free(cp, M_LINKER);
+ }
+
+ LINKER_UNLOAD(file);
+ if (file->filename) {
+ free(file->filename, M_LINKER);
+ file->filename = NULL;
+ }
+ if (file->pathname) {
+ free(file->pathname, M_LINKER);
+ file->pathname = NULL;
+ }
+ kobj_delete((kobj_t) file, M_LINKER);
+ return (0);
+}
+
+int
+linker_ctf_get(linker_file_t file, linker_ctf_t *lc)
+{
+ return (LINKER_CTF_GET(file, lc));
+}
+
+static int
+linker_file_add_dependency(linker_file_t file, linker_file_t dep)
+{
+ linker_file_t *newdeps;
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+ newdeps = malloc((file->ndeps + 1) * sizeof(linker_file_t *),
+ M_LINKER, M_WAITOK | M_ZERO);
+ if (newdeps == NULL)
+ return (ENOMEM);
+
+ if (file->deps) {
+ bcopy(file->deps, newdeps,
+ file->ndeps * sizeof(linker_file_t *));
+ free(file->deps, M_LINKER);
+ }
+ file->deps = newdeps;
+ file->deps[file->ndeps] = dep;
+ file->ndeps++;
+ KLD_DPF(FILE, ("linker_file_add_dependency:"
+ " adding %s as dependency for %s\n",
+ dep->filename, file->filename));
+ return (0);
+}
+
+/*
+ * Locate a linker set and its contents. This is a helper function to avoid
+ * linker_if.h exposure elsewhere. Note: firstp and lastp are really void **.
+ * This function is used in this file so we can avoid having lots of (void **)
+ * casts.
+ */
+int
+linker_file_lookup_set(linker_file_t file, const char *name,
+ void *firstp, void *lastp, int *countp)
+{
+
+ sx_assert(&kld_sx, SA_LOCKED);
+ return (LINKER_LOOKUP_SET(file, name, firstp, lastp, countp));
+}
+
+/*
+ * List all functions in a file.
+ */
+int
+linker_file_function_listall(linker_file_t lf,
+ linker_function_nameval_callback_t callback_func, void *arg)
+{
+ return (LINKER_EACH_FUNCTION_NAMEVAL(lf, callback_func, arg));
+}
+
+caddr_t
+linker_file_lookup_symbol(linker_file_t file, const char *name, int deps)
+{
+ caddr_t sym;
+ int locked;
+
+ locked = sx_xlocked(&kld_sx);
+ if (!locked)
+ sx_xlock(&kld_sx);
+ sym = linker_file_lookup_symbol_internal(file, name, deps);
+ if (!locked)
+ sx_xunlock(&kld_sx);
+ return (sym);
+}
+
+static caddr_t
+linker_file_lookup_symbol_internal(linker_file_t file, const char *name,
+ int deps)
+{
+ c_linker_sym_t sym;
+ linker_symval_t symval;
+ caddr_t address;
+ size_t common_size = 0;
+ int i;
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%p, name=%s, deps=%d\n",
+ file, name, deps));
+
+ if (LINKER_LOOKUP_SYMBOL(file, name, &sym) == 0) {
+ LINKER_SYMBOL_VALUES(file, sym, &symval);
+ if (symval.value == 0)
+ /*
+ * For commons, first look them up in the
+ * dependencies and only allocate space if not found
+ * there.
+ */
+ common_size = symval.size;
+ else {
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol"
+ ".value=%p\n", symval.value));
+ return (symval.value);
+ }
+ }
+ if (deps) {
+ for (i = 0; i < file->ndeps; i++) {
+ address = linker_file_lookup_symbol_internal(
+ file->deps[i], name, 0);
+ if (address) {
+ KLD_DPF(SYM, ("linker_file_lookup_symbol:"
+ " deps value=%p\n", address));
+ return (address);
+ }
+ }
+ }
+ if (common_size > 0) {
+ /*
+ * This is a common symbol which was not found in the
+ * dependencies. We maintain a simple common symbol table in
+ * the file object.
+ */
+ struct common_symbol *cp;
+
+ STAILQ_FOREACH(cp, &file->common, link) {
+ if (strcmp(cp->name, name) == 0) {
+ KLD_DPF(SYM, ("linker_file_lookup_symbol:"
+ " old common value=%p\n", cp->address));
+ return (cp->address);
+ }
+ }
+ /*
+ * Round the symbol size up to align.
+ */
+ common_size = (common_size + sizeof(int) - 1) & -sizeof(int);
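+ /*
+ * A single allocation holds the bookkeeping structure, the
+ * zeroed common storage and the symbol name, laid out as
+ * [struct common_symbol][common storage][name].
+ */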
+ cp = malloc(sizeof(struct common_symbol)
+ + common_size + strlen(name) + 1, M_LINKER,
+ M_WAITOK | M_ZERO);
+ cp->address = (caddr_t)(cp + 1);
+ cp->name = cp->address + common_size;
+ strcpy(cp->name, name);
+ bzero(cp->address, common_size);
+ STAILQ_INSERT_TAIL(&file->common, cp, link);
+
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: new common"
+ " value=%p\n", cp->address));
+ return (cp->address);
+ }
+ KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n"));
+ return (0);
+}
+
+/*
+ * Both DDB and stack(9) rely on the kernel linker to provide forward and
+ * backward lookup of symbols. However, DDB and sometimes stack(9) need to
+ * do this in a lockfree manner. We provide a set of internal helper
+ * routines to perform these operations without locks, and then wrappers that
+ * optionally lock.
+ *
+ * linker_debug_lookup() is under #ifdef DDB as it is currently only used
+ * by DDB.
+ */
+#ifdef DDB
+static int
+linker_debug_lookup(const char *symstr, c_linker_sym_t *sym)
+{
+ linker_file_t lf;
+
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ if (LINKER_LOOKUP_SYMBOL(lf, symstr, sym) == 0)
+ return (0);
+ }
+ return (ENOENT);
+}
+#endif
+
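+/*
+ * Walk every loaded file and return the symbol whose address most closely
+ * precedes "value", together with the offset of "value" from that symbol.
+ */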
+static int
+linker_debug_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp)
+{
+ linker_file_t lf;
+ c_linker_sym_t best, es;
+ u_long diff, bestdiff, off;
+
+ best = 0;
+ off = (uintptr_t)value;
+ bestdiff = off;
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ if (LINKER_SEARCH_SYMBOL(lf, value, &es, &diff) != 0)
+ continue;
+ if (es != 0 && diff < bestdiff) {
+ best = es;
+ bestdiff = diff;
+ }
+ if (bestdiff == 0)
+ break;
+ }
+ if (best) {
+ *sym = best;
+ *diffp = bestdiff;
+ return (0);
+ } else {
+ *sym = 0;
+ *diffp = off;
+ return (ENOENT);
+ }
+}
+
+static int
+linker_debug_symbol_values(c_linker_sym_t sym, linker_symval_t *symval)
+{
+ linker_file_t lf;
+
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ if (LINKER_SYMBOL_VALUES(lf, sym, symval) == 0)
+ return (0);
+ }
+ return (ENOENT);
+}
+
+static int
+linker_debug_search_symbol_name(caddr_t value, char *buf, u_int buflen,
+ long *offset)
+{
+ linker_symval_t symval;
+ c_linker_sym_t sym;
+ int error;
+
+ *offset = 0;
+ error = linker_debug_search_symbol(value, &sym, offset);
+ if (error)
+ return (error);
+ error = linker_debug_symbol_values(sym, &symval);
+ if (error)
+ return (error);
+ strlcpy(buf, symval.name, buflen);
+ return (0);
+}
+
+/*
+ * DDB Helpers. DDB has to look across multiple files with their own symbol
+ * tables and string tables.
+ *
+ * Note that we do not obey list locking protocols here. We really don't need
+ * DDB to hang because somebody's got the lock held. We'll take the chance
+ * that the files list is inconsistent instead.
+ */
+#ifdef DDB
+int
+linker_ddb_lookup(const char *symstr, c_linker_sym_t *sym)
+{
+
+ return (linker_debug_lookup(symstr, sym));
+}
+#endif
+
+int
+linker_ddb_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp)
+{
+
+ return (linker_debug_search_symbol(value, sym, diffp));
+}
+
+int
+linker_ddb_symbol_values(c_linker_sym_t sym, linker_symval_t *symval)
+{
+
+ return (linker_debug_symbol_values(sym, symval));
+}
+
+int
+linker_ddb_search_symbol_name(caddr_t value, char *buf, u_int buflen,
+ long *offset)
+{
+
+ return (linker_debug_search_symbol_name(value, buf, buflen, offset));
+}
+
+/*
+ * stack(9) helper for non-debugging environments. Unlike DDB helpers, we do
+ * obey locking protocols, and offer a significantly less complex interface.
+ */
+int
+linker_search_symbol_name(caddr_t value, char *buf, u_int buflen,
+ long *offset)
+{
+ int error;
+
+ sx_xlock(&kld_sx);
+ error = linker_debug_search_symbol_name(value, buf, buflen, offset);
+ sx_xunlock(&kld_sx);
+ return (error);
+}
+
+/*
+ * Syscalls.
+ */
+int
+kern_kldload(struct thread *td, const char *file, int *fileid)
+{
+ const char *kldname, *modname;
+ linker_file_t lf;
+ int error;
+
+ if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
+ return (error);
+
+ if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0)
+ return (error);
+
+ /*
+ * It is possible that a kldloaded module will attach a new ifnet,
+ * so the vnet context must be set when this occurs.
+ */
+ CURVNET_SET(TD_TO_VNET(td));
+
+ /*
+ * If the file name is not qualified (it contains neither a path
+ * separator nor a dot, as in kldname.ko or kldname.ver.ko),
+ * treat it as an interface (module) name.
+ */
+ if (strchr(file, '/') || strchr(file, '.')) {
+ kldname = file;
+ modname = NULL;
+ } else {
+ kldname = NULL;
+ modname = file;
+ }
+
+ sx_xlock(&kld_sx);
+ error = linker_load_module(kldname, modname, NULL, NULL, &lf);
+ if (error) {
+ sx_xunlock(&kld_sx);
+ goto done;
+ }
+ lf->userrefs++;
+ if (fileid != NULL)
+ *fileid = lf->id;
+
+ sx_downgrade(&kld_sx);
+ EVENTHANDLER_INVOKE(kld_load, lf);
+ sx_sunlock(&kld_sx);
+
+done:
+ CURVNET_RESTORE();
+ return (error);
+}
+
+int
+sys_kldload(struct thread *td, struct kldload_args *uap)
+{
+ char *pathname = NULL;
+ int error, fileid;
+
+ td->td_retval[0] = -1;
+
+ pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL);
+ if (error == 0) {
+ error = kern_kldload(td, pathname, &fileid);
+ if (error == 0)
+ td->td_retval[0] = fileid;
+ }
+ free(pathname, M_TEMP);
+ return (error);
+}
+
+int
+kern_kldunload(struct thread *td, int fileid, int flags)
+{
+ linker_file_t lf;
+ char *filename = NULL;
+ caddr_t address;
+ size_t size;
+ int error = 0;
+
+ if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
+ return (error);
+
+ if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0)
+ return (error);
+
+ CURVNET_SET(TD_TO_VNET(td));
+ sx_xlock(&kld_sx);
+ lf = linker_find_file_by_id(fileid);
+ if (lf) {
+ KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs));
+
+ EVENTHANDLER_INVOKE(kld_unload_try, lf, &error);
+ if (error != 0)
+ error = EBUSY;
+ else if (lf->userrefs == 0) {
+ /*
+ * XXX: maybe LINKER_UNLOAD_FORCE should override?
+ */
+ printf("kldunload: attempt to unload file that was"
+ " loaded by the kernel\n");
+ error = EBUSY;
+ } else {
+ /* Save data needed for the kld_unload callbacks. */
+ filename = strdup(lf->filename, M_TEMP);
+ address = lf->address;
+ size = lf->size;
+
+ lf->userrefs--;
+ error = linker_file_unload(lf, flags);
+ if (error)
+ lf->userrefs++;
+ }
+ } else
+ error = ENOENT;
+
+ if (error == 0) {
+ sx_downgrade(&kld_sx);
+ EVENTHANDLER_INVOKE(kld_unload, filename, address, size);
+ sx_sunlock(&kld_sx);
+ } else
+ sx_xunlock(&kld_sx);
+ free(filename, M_TEMP);
+
+ CURVNET_RESTORE();
+ return (error);
+}
+
+int
+sys_kldunload(struct thread *td, struct kldunload_args *uap)
+{
+
+ return (kern_kldunload(td, uap->fileid, LINKER_UNLOAD_NORMAL));
+}
+
+int
+sys_kldunloadf(struct thread *td, struct kldunloadf_args *uap)
+{
+
+ if (uap->flags != LINKER_UNLOAD_NORMAL &&
+ uap->flags != LINKER_UNLOAD_FORCE)
+ return (EINVAL);
+ return (kern_kldunload(td, uap->fileid, uap->flags));
+}
+
+int
+sys_kldfind(struct thread *td, struct kldfind_args *uap)
+{
+ char *pathname;
+ const char *filename;
+ linker_file_t lf;
+ int error;
+
+#ifdef MAC
+ error = mac_kld_check_stat(td->td_ucred);
+ if (error)
+ return (error);
+#endif
+
+ td->td_retval[0] = -1;
+
+ pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ if ((error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL)) != 0)
+ goto out;
+
+ filename = linker_basename(pathname);
+ sx_xlock(&kld_sx);
+ lf = linker_find_file_by_name(filename);
+ if (lf)
+ td->td_retval[0] = lf->id;
+ else
+ error = ENOENT;
+ sx_xunlock(&kld_sx);
+out:
+ free(pathname, M_TEMP);
+ return (error);
+}
+
+int
+sys_kldnext(struct thread *td, struct kldnext_args *uap)
+{
+ linker_file_t lf;
+ int error = 0;
+
+#ifdef MAC
+ error = mac_kld_check_stat(td->td_ucred);
+ if (error)
+ return (error);
+#endif
+
+ sx_xlock(&kld_sx);
+ if (uap->fileid == 0)
+ lf = TAILQ_FIRST(&linker_files);
+ else {
+ lf = linker_find_file_by_id(uap->fileid);
+ if (lf == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ lf = TAILQ_NEXT(lf, link);
+ }
+
+ /* Skip partially loaded files. */
+ while (lf != NULL && !(lf->flags & LINKER_FILE_LINKED))
+ lf = TAILQ_NEXT(lf, link);
+
+ if (lf)
+ td->td_retval[0] = lf->id;
+ else
+ td->td_retval[0] = 0;
+out:
+ sx_xunlock(&kld_sx);
+ return (error);
+}
+
+int
+sys_kldstat(struct thread *td, struct kldstat_args *uap)
+{
+ struct kld_file_stat stat;
+ int error, version;
+
+ /*
+ * Check the version of the user's structure.
+ */
+ if ((error = copyin(&uap->stat->version, &version, sizeof(version)))
+ != 0)
+ return (error);
+ if (version != sizeof(struct kld_file_stat_1) &&
+ version != sizeof(struct kld_file_stat))
+ return (EINVAL);
+
+ error = kern_kldstat(td, uap->fileid, &stat);
+ if (error != 0)
+ return (error);
+ return (copyout(&stat, uap->stat, version));
+}
+
+int
+kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat)
+{
+ linker_file_t lf;
+ int namelen;
+#ifdef MAC
+ int error;
+
+ error = mac_kld_check_stat(td->td_ucred);
+ if (error)
+ return (error);
+#endif
+
+ sx_xlock(&kld_sx);
+ lf = linker_find_file_by_id(fileid);
+ if (lf == NULL) {
+ sx_xunlock(&kld_sx);
+ return (ENOENT);
+ }
+
+ /* Version 1 fields: */
+ namelen = strlen(lf->filename) + 1;
+ if (namelen > MAXPATHLEN)
+ namelen = MAXPATHLEN;
+ bcopy(lf->filename, &stat->name[0], namelen);
+ stat->refs = lf->refs;
+ stat->id = lf->id;
+ stat->address = lf->address;
+ stat->size = lf->size;
+ /* Version 2 fields: */
+ namelen = strlen(lf->pathname) + 1;
+ if (namelen > MAXPATHLEN)
+ namelen = MAXPATHLEN;
+ bcopy(lf->pathname, &stat->pathname[0], namelen);
+ sx_xunlock(&kld_sx);
+
+ td->td_retval[0] = 0;
+ return (0);
+}
+
+int
+sys_kldfirstmod(struct thread *td, struct kldfirstmod_args *uap)
+{
+ linker_file_t lf;
+ module_t mp;
+ int error = 0;
+
+#ifdef MAC
+ error = mac_kld_check_stat(td->td_ucred);
+ if (error)
+ return (error);
+#endif
+
+ sx_xlock(&kld_sx);
+ lf = linker_find_file_by_id(uap->fileid);
+ if (lf) {
+ MOD_SLOCK;
+ mp = TAILQ_FIRST(&lf->modules);
+ if (mp != NULL)
+ td->td_retval[0] = module_getid(mp);
+ else
+ td->td_retval[0] = 0;
+ MOD_SUNLOCK;
+ } else
+ error = ENOENT;
+ sx_xunlock(&kld_sx);
+ return (error);
+}
+
+int
+sys_kldsym(struct thread *td, struct kldsym_args *uap)
+{
+ char *symstr = NULL;
+ c_linker_sym_t sym;
+ linker_symval_t symval;
+ linker_file_t lf;
+ struct kld_sym_lookup lookup;
+ int error = 0;
+
+#ifdef MAC
+ error = mac_kld_check_stat(td->td_ucred);
+ if (error)
+ return (error);
+#endif
+
+ if ((error = copyin(uap->data, &lookup, sizeof(lookup))) != 0)
+ return (error);
+ if (lookup.version != sizeof(lookup) ||
+ uap->cmd != KLDSYM_LOOKUP)
+ return (EINVAL);
+ symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ if ((error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) != 0)
+ goto out;
+ sx_xlock(&kld_sx);
+ if (uap->fileid != 0) {
+ lf = linker_find_file_by_id(uap->fileid);
+ if (lf == NULL)
+ error = ENOENT;
+ else if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 &&
+ LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) {
+ lookup.symvalue = (uintptr_t) symval.value;
+ lookup.symsize = symval.size;
+ error = copyout(&lookup, uap->data, sizeof(lookup));
+ } else
+ error = ENOENT;
+ } else {
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 &&
+ LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) {
+ lookup.symvalue = (uintptr_t)symval.value;
+ lookup.symsize = symval.size;
+ error = copyout(&lookup, uap->data,
+ sizeof(lookup));
+ break;
+ }
+ }
+ if (lf == NULL)
+ error = ENOENT;
+ }
+ sx_xunlock(&kld_sx);
+out:
+ free(symstr, M_TEMP);
+ return (error);
+}
+
+/*
+ * Preloaded module support
+ */
+
+static modlist_t
+modlist_lookup(const char *name, int ver)
+{
+ modlist_t mod;
+
+ TAILQ_FOREACH(mod, &found_modules, link) {
+ if (strcmp(mod->name, name) == 0 &&
+ (ver == 0 || mod->version == ver))
+ return (mod);
+ }
+ return (NULL);
+}
+
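+/*
+ * As modlist_lookup(), but select by version constraints: an exact match on
+ * the preferred version wins outright, otherwise the highest version within
+ * [md_ver_minimum, md_ver_maximum] is returned.
+ */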
+static modlist_t
+modlist_lookup2(const char *name, struct mod_depend *verinfo)
+{
+ modlist_t mod, bestmod;
+ int ver;
+
+ if (verinfo == NULL)
+ return (modlist_lookup(name, 0));
+ bestmod = NULL;
+ TAILQ_FOREACH(mod, &found_modules, link) {
+ if (strcmp(mod->name, name) != 0)
+ continue;
+ ver = mod->version;
+ if (ver == verinfo->md_ver_preferred)
+ return (mod);
+ if (ver >= verinfo->md_ver_minimum &&
+ ver <= verinfo->md_ver_maximum &&
+ (bestmod == NULL || ver > bestmod->version))
+ bestmod = mod;
+ }
+ return (bestmod);
+}
+
+static modlist_t
+modlist_newmodule(const char *modname, int version, linker_file_t container)
+{
+ modlist_t mod;
+
+ mod = malloc(sizeof(struct modlist), M_LINKER, M_NOWAIT | M_ZERO);
+ if (mod == NULL)
+ panic("no memory for module list");
+ mod->container = container;
+ mod->name = modname;
+ mod->version = version;
+ TAILQ_INSERT_TAIL(&found_modules, mod, link);
+ return (mod);
+}
+
+static void
+linker_addmodules(linker_file_t lf, struct mod_metadata **start,
+ struct mod_metadata **stop, int preload)
+{
+ struct mod_metadata *mp, **mdp;
+ const char *modname;
+ int ver;
+
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = *mdp;
+ if (mp->md_type != MDT_VERSION)
+ continue;
+ modname = mp->md_cval;
+ ver = ((struct mod_version *)mp->md_data)->mv_version;
+ if (modlist_lookup(modname, ver) != NULL) {
+ printf("module %s already present!\n", modname);
+ /* XXX what can we do? this is a build error. :-( */
+ continue;
+ }
+ modlist_newmodule(modname, ver, lf);
+ }
+}
+
+static void
+linker_preload(void *arg)
+{
+ caddr_t modptr;
+ const char *modname, *nmodname;
+ char *modtype;
+ linker_file_t lf, nlf;
+ linker_class_t lc;
+ int error;
+ linker_file_list_t loaded_files;
+ linker_file_list_t depended_files;
+ struct mod_metadata *mp, *nmp;
+ struct mod_metadata **start, **stop, **mdp, **nmdp;
+ struct mod_depend *verinfo;
+ int nver;
+ int resolves;
+ modlist_t mod;
+ struct sysinit **si_start, **si_stop;
+
+ TAILQ_INIT(&loaded_files);
+ TAILQ_INIT(&depended_files);
+ TAILQ_INIT(&found_modules);
+ error = 0;
+
+ modptr = NULL;
+ sx_xlock(&kld_sx);
+ while ((modptr = preload_search_next_name(modptr)) != NULL) {
+ modname = (char *)preload_search_info(modptr, MODINFO_NAME);
+ modtype = (char *)preload_search_info(modptr, MODINFO_TYPE);
+ if (modname == NULL) {
+ printf("Preloaded module at %p does not have a"
+ " name!\n", modptr);
+ continue;
+ }
+ if (modtype == NULL) {
+ printf("Preloaded module at %p does not have a type!\n",
+ modptr);
+ continue;
+ }
+ if (bootverbose)
+ printf("Preloaded %s \"%s\" at %p.\n", modtype, modname,
+ modptr);
+ lf = NULL;
+ TAILQ_FOREACH(lc, &classes, link) {
+ error = LINKER_LINK_PRELOAD(lc, modname, &lf);
+ if (!error)
+ break;
+ lf = NULL;
+ }
+ if (lf)
+ TAILQ_INSERT_TAIL(&loaded_files, lf, loaded);
+ }
+
+ /*
+ * First get a list of stuff in the kernel.
+ */
+ if (linker_file_lookup_set(linker_kernel_file, MDT_SETNAME, &start,
+ &stop, NULL) == 0)
+ linker_addmodules(linker_kernel_file, start, stop, 1);
+
+ /*
+ * This is a once-off kinky bubble sort to resolve relocation
+ * dependency requirements.
+ */
+restart:
+ TAILQ_FOREACH(lf, &loaded_files, loaded) {
+ error = linker_file_lookup_set(lf, MDT_SETNAME, &start,
+ &stop, NULL);
+ /*
+ * First, look to see if we would successfully link with this
+ * stuff.
+ */
+ resolves = 1; /* unless we know otherwise */
+ if (!error) {
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = *mdp;
+ if (mp->md_type != MDT_DEPEND)
+ continue;
+ modname = mp->md_cval;
+ verinfo = mp->md_data;
+ for (nmdp = start; nmdp < stop; nmdp++) {
+ nmp = *nmdp;
+ if (nmp->md_type != MDT_VERSION)
+ continue;
+ nmodname = nmp->md_cval;
+ if (strcmp(modname, nmodname) == 0)
+ break;
+ }
+ if (nmdp < stop) /* it's a self reference */
+ continue;
+
+ /*
+ * ok, the module isn't here yet, we
+ * are not finished
+ */
+ if (modlist_lookup2(modname, verinfo) == NULL)
+ resolves = 0;
+ }
+ }
+ /*
+ * OK, if we found our modules, we can link. So, "provide"
+ * the modules inside and add the file to the end of the link
+ * order list.
+ */
+ if (resolves) {
+ if (!error) {
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = *mdp;
+ if (mp->md_type != MDT_VERSION)
+ continue;
+ modname = mp->md_cval;
+ nver = ((struct mod_version *)
+ mp->md_data)->mv_version;
+ if (modlist_lookup(modname,
+ nver) != NULL) {
+ printf("module %s already"
+ " present!\n", modname);
+ TAILQ_REMOVE(&loaded_files,
+ lf, loaded);
+ linker_file_unload(lf,
+ LINKER_UNLOAD_FORCE);
+ /* we changed tailq next ptr */
+ goto restart;
+ }
+ modlist_newmodule(modname, nver, lf);
+ }
+ }
+ TAILQ_REMOVE(&loaded_files, lf, loaded);
+ TAILQ_INSERT_TAIL(&depended_files, lf, loaded);
+ /*
+ * Since we provided modules, we need to restart the
+ * sort so that the previous files that depend on us
+ * have a chance. Also, we've busted the tailq next
+ * pointer with the REMOVE.
+ */
+ goto restart;
+ }
+ }
+
+ /*
+ * At this point, we check to see what could not be resolved.
+ */
+ while ((lf = TAILQ_FIRST(&loaded_files)) != NULL) {
+ TAILQ_REMOVE(&loaded_files, lf, loaded);
+ printf("KLD file %s is missing dependencies\n", lf->filename);
+ linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ }
+
+ /*
+ * We made it. Finish off the linking in the order we determined.
+ */
+ TAILQ_FOREACH_SAFE(lf, &depended_files, loaded, nlf) {
+ if (linker_kernel_file) {
+ linker_kernel_file->refs++;
+ error = linker_file_add_dependency(lf,
+ linker_kernel_file);
+ if (error)
+ panic("cannot add dependency");
+ }
+ lf->userrefs++; /* so we can (try to) kldunload it */
+ error = linker_file_lookup_set(lf, MDT_SETNAME, &start,
+ &stop, NULL);
+ if (!error) {
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = *mdp;
+ if (mp->md_type != MDT_DEPEND)
+ continue;
+ modname = mp->md_cval;
+ verinfo = mp->md_data;
+ mod = modlist_lookup2(modname, verinfo);
+ if (mod == NULL) {
+ printf("KLD file %s - cannot find "
+ "dependency \"%s\"\n",
+ lf->filename, modname);
+ goto fail;
+ }
+ /* Don't count self-dependencies */
+ if (lf == mod->container)
+ continue;
+ mod->container->refs++;
+ error = linker_file_add_dependency(lf,
+ mod->container);
+ if (error)
+ panic("cannot add dependency");
+ }
+ }
+ /*
+ * Now do relocation etc. using the symbol search paths
+ * established by the dependencies.
+ */
+ error = LINKER_LINK_PRELOAD_FINISH(lf);
+ if (error) {
+ printf("KLD file %s - could not finalize loading\n",
+ lf->filename);
+ goto fail;
+ }
+ linker_file_register_modules(lf);
+ if (linker_file_lookup_set(lf, "sysinit_set", &si_start,
+ &si_stop, NULL) == 0)
+ sysinit_add(si_start, si_stop);
+ linker_file_register_sysctls(lf);
+ lf->flags |= LINKER_FILE_LINKED;
+ continue;
+fail:
+ TAILQ_REMOVE(&depended_files, lf, loaded);
+ linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ }
+ sx_xunlock(&kld_sx);
+ /* woohoo! we made it! */
+}
+
+SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, 0);
+
+/*
+ * Search for a not-loaded module by name.
+ *
+ * Modules may be found in the following locations:
+ *
+ * - preloaded (result is just the module name)
+ * - on disk (result is the full path to the module)
+ *
+ * If the module name is qualified in any way (contains a path, etc.), we
+ * simply return a copy of it.
+ *
+ * The search path can be manipulated via sysctl. Note that we use the ';'
+ * character as a separator to be consistent with the bootloader.
+ */
+
+static char linker_hintfile[] = "linker.hints";
+static char linker_path[MAXPATHLEN] = "/boot/kernel;/boot/modules";
+
+SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RW, linker_path,
+ sizeof(linker_path), "module load search path");
+
+TUNABLE_STR("module_path", linker_path, sizeof(linker_path));
+
+static char *linker_ext_list[] = {
+ "",
+ ".ko",
+ NULL
+};
+
+/*
+ * Check whether the file actually exists, either with or without an
+ * extension listed in linker_ext_list. (This should probably be made
+ * generic for the rest of the kernel.)
+ */
+static char *
+linker_lookup_file(const char *path, int pathlen, const char *name,
+ int namelen, struct vattr *vap)
+{
+ struct nameidata nd;
+ struct thread *td = curthread; /* XXX */
+ char *result, **cpp, *sep;
+ int error, len, extlen, reclen, flags;
+ enum vtype type;
+
+ extlen = 0;
+ for (cpp = linker_ext_list; *cpp; cpp++) {
+ len = strlen(*cpp);
+ if (len > extlen)
+ extlen = len;
+ }
+ extlen++; /* trailing '\0' */
+ sep = (path[pathlen - 1] != '/') ? "/" : "";
+
+ reclen = pathlen + strlen(sep) + namelen + extlen + 1;
+ result = malloc(reclen, M_LINKER, M_WAITOK);
+ for (cpp = linker_ext_list; *cpp; cpp++) {
+ snprintf(result, reclen, "%.*s%s%.*s%s", pathlen, path, sep,
+ namelen, name, *cpp);
+ /*
+ * Attempt to open the file, and return the path if
+ * we succeed and it's a regular file.
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, result, td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error == 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ type = nd.ni_vp->v_type;
+ if (vap)
+ VOP_GETATTR(nd.ni_vp, vap, td->td_ucred);
+ VOP_UNLOCK(nd.ni_vp, 0);
+ vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+ if (type == VREG)
+ return (result);
+ }
+ }
+ free(result, M_LINKER);
+ return (NULL);
+}
+
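+/*
+ * Round "ptr" up to the next int-aligned offset from "base"; used while
+ * parsing the packed records of the linker.hints file below.
+ */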
+#define INT_ALIGN(base, ptr) ptr = \
+ (base) + (((ptr) - (base) + sizeof(int) - 1) & ~(sizeof(int) - 1))
+
+/*
+ * Lookup KLD which contains requested module in the "linker.hints" file. If
+ * version specification is available, then try to find the best KLD.
+ * Otherwise just find the latest one.
+ */
+static char *
+linker_hints_lookup(const char *path, int pathlen, const char *modname,
+ int modnamelen, struct mod_depend *verinfo)
+{
+ struct thread *td = curthread; /* XXX */
+ struct ucred *cred = td ? td->td_ucred : NULL;
+ struct nameidata nd;
+ struct vattr vattr, mattr;
+ u_char *hints = NULL;
+ u_char *cp, *recptr, *bufend, *result, *best, *pathbuf, *sep;
+ int error, ival, bestver, *intp, found, flags, clen, blen;
+ ssize_t reclen;
+
+ result = NULL;
+ bestver = found = 0;
+
+ sep = (path[pathlen - 1] != '/') ? "/" : "";
+ reclen = imax(modnamelen, strlen(linker_hintfile)) + pathlen +
+ strlen(sep) + 1;
+ pathbuf = malloc(reclen, M_LINKER, M_WAITOK);
+ snprintf(pathbuf, reclen, "%.*s%s%s", pathlen, path, sep,
+ linker_hintfile);
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pathbuf, td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error)
+ goto bad;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp->v_type != VREG)
+ goto bad;
+ best = cp = NULL;
+ error = VOP_GETATTR(nd.ni_vp, &vattr, cred);
+ if (error)
+ goto bad;
+ /*
+ * XXX: we need to limit this number to some reasonable value
+ */
+ if (vattr.va_size > 100 * 1024) {
+ printf("hints file too large %ld\n", (long)vattr.va_size);
+ goto bad;
+ }
+ hints = malloc(vattr.va_size, M_TEMP, M_WAITOK);
+ if (hints == NULL)
+ goto bad;
+ error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)hints, vattr.va_size, 0,
+ UIO_SYSSPACE, IO_NODELOCKED, cred, NOCRED, &reclen, td);
+ if (error)
+ goto bad;
+ VOP_UNLOCK(nd.ni_vp, 0);
+ vn_close(nd.ni_vp, FREAD, cred, td);
+ nd.ni_vp = NULL;
+ if (reclen != 0) {
+ printf("can't read %zd\n", reclen);
+ goto bad;
+ }
+ intp = (int *)hints;
+ ival = *intp++;
+ if (ival != LINKER_HINTS_VERSION) {
+ printf("hints file version mismatch %d\n", ival);
+ goto bad;
+ }
+ bufend = hints + vattr.va_size;
+ recptr = (u_char *)intp;
+ clen = blen = 0;
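+ /*
+ * Each hints record consists of an int record length (not
+ * counting itself), an int record type and a payload. For
+ * MDT_VERSION records the payload is a counted module name,
+ * an int-aligned version number and a counted KLD file name.
+ */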
+ while (recptr < bufend && !found) {
+ intp = (int *)recptr;
+ reclen = *intp++;
+ ival = *intp++;
+ cp = (char *)intp;
+ switch (ival) {
+ case MDT_VERSION:
+ clen = *cp++;
+ if (clen != modnamelen || bcmp(cp, modname, clen) != 0)
+ break;
+ cp += clen;
+ INT_ALIGN(hints, cp);
+ ival = *(int *)cp;
+ cp += sizeof(int);
+ clen = *cp++;
+ if (verinfo == NULL ||
+ ival == verinfo->md_ver_preferred) {
+ found = 1;
+ break;
+ }
+ if (ival >= verinfo->md_ver_minimum &&
+ ival <= verinfo->md_ver_maximum &&
+ ival > bestver) {
+ bestver = ival;
+ best = cp;
+ blen = clen;
+ }
+ break;
+ default:
+ break;
+ }
+ recptr += reclen + sizeof(int);
+ }
+ /*
+ * Finally, check whether the KLD is actually in place.
+ */
+ if (found)
+ result = linker_lookup_file(path, pathlen, cp, clen, &mattr);
+ else if (best)
+ result = linker_lookup_file(path, pathlen, best, blen, &mattr);
+
+ /*
+ * The KLD is newer than the hints file. What should we do now?
+ */
+ if (result && timespeccmp(&mattr.va_mtime, &vattr.va_mtime, >))
+ printf("warning: KLD '%s' is newer than the linker.hints"
+ " file\n", result);
+bad:
+ free(pathbuf, M_LINKER);
+ if (hints)
+ free(hints, M_TEMP);
+ if (nd.ni_vp != NULL) {
+ VOP_UNLOCK(nd.ni_vp, 0);
+ vn_close(nd.ni_vp, FREAD, cred, td);
+ }
+ /*
+ * If nothing was found, or the hints file is absent, fall back
+ * to the old way of using "kldname[.ko]" as the module name.
+ */
+ if (!found && !bestver && result == NULL)
+ result = linker_lookup_file(path, pathlen, modname,
+ modnamelen, NULL);
+ return (result);
+}
+
+/*
+ * Look up the KLD which contains the requested module in all directories.
+ */
+static char *
+linker_search_module(const char *modname, int modnamelen,
+ struct mod_depend *verinfo)
+{
+ char *cp, *ep, *result;
+
+ /*
+ * traverse the linker path
+ */
+ for (cp = linker_path; *cp; cp = ep + 1) {
+ /* find the end of this component */
+ for (ep = cp; (*ep != 0) && (*ep != ';'); ep++);
+ result = linker_hints_lookup(cp, ep - cp, modname,
+ modnamelen, verinfo);
+ if (result != NULL)
+ return (result);
+ if (*ep == 0)
+ break;
+ }
+ return (NULL);
+}
+
+/*
+ * Search for a KLD by name in all directories listed in linker_path.
+ */
+static char *
+linker_search_kld(const char *name)
+{
+ char *cp, *ep, *result;
+ int len;
+
+ /* qualified at all? */
+ if (strchr(name, '/'))
+ return (strdup(name, M_LINKER));
+
+ /* traverse the linker path */
+ len = strlen(name);
+ for (ep = linker_path; *ep; ep++) {
+ cp = ep;
+ /* find the end of this component */
+ for (; *ep != 0 && *ep != ';'; ep++);
+ result = linker_lookup_file(cp, ep - cp, name, len, NULL);
+ if (result != NULL)
+ return (result);
+ }
+ return (NULL);
+}
+
+static const char *
+linker_basename(const char *path)
+{
+ const char *filename;
+
+ filename = strrchr(path, '/');
+ if (filename == NULL)
+ return (path);
+ if (filename[1])
+ filename++;
+ return (filename);
+}
+
+#ifdef HWPMC_HOOKS
+/*
+ * Inform hwpmc about the set of kernel modules currently loaded.
+ */
+void *
+linker_hwpmc_list_objects(void)
+{
+ linker_file_t lf;
+ struct pmckern_map_in *kobase;
+ int i, nmappings;
+
+ nmappings = 0;
+ sx_slock(&kld_sx);
+ TAILQ_FOREACH(lf, &linker_files, link)
+ nmappings++;
+
+ /* Allocate nmappings + 1 entries. */
+ kobase = malloc((nmappings + 1) * sizeof(struct pmckern_map_in),
+ M_LINKER, M_WAITOK | M_ZERO);
+ i = 0;
+ TAILQ_FOREACH(lf, &linker_files, link) {
+
+ /* Save the info for this linker file. */
+ kobase[i].pm_file = lf->filename;
+ kobase[i].pm_address = (uintptr_t)lf->address;
+ i++;
+ }
+ sx_sunlock(&kld_sx);
+
+ KASSERT(i > 0, ("linker_hpwmc_list_objects: no kernel objects?"));
+
+ /* The last entry of the malloced area consists of all zeros. */
+ KASSERT(kobase[i].pm_file == NULL,
+ ("linker_hwpmc_list_objects: last object not NULL"));
+
+ return ((void *)kobase);
+}
+#endif
+
+/*
+ * Find a file which contains the given module and load it; if "parent" is
+ * not NULL, register a reference to it.
+ */
+static int
+linker_load_module(const char *kldname, const char *modname,
+ struct linker_file *parent, struct mod_depend *verinfo,
+ struct linker_file **lfpp)
+{
+ linker_file_t lfdep;
+ const char *filename;
+ char *pathname;
+ int error;
+
+ sx_assert(&kld_sx, SA_XLOCKED);
+ if (modname == NULL) {
+ /*
+ * We have to load a KLD by its file name.
+ */
+ KASSERT(verinfo == NULL, ("linker_load_module: verinfo"
+ " is not NULL"));
+ pathname = linker_search_kld(kldname);
+ } else {
+ if (modlist_lookup2(modname, verinfo) != NULL)
+ return (EEXIST);
+ if (kldname != NULL)
+ pathname = strdup(kldname, M_LINKER);
+ else if (rootvnode == NULL)
+ pathname = NULL;
+ else
+ /*
+ * Need to find a KLD containing the required module.
+ */
+ pathname = linker_search_module(modname,
+ strlen(modname), verinfo);
+ }
+ if (pathname == NULL)
+ return (ENOENT);
+
+ /*
+ * Can't load more than one file with the same basename.
+ * XXX: Actually it should be possible to have multiple KLDs
+ * with the same basename but different paths, because they
+ * can provide different versions of the same modules.
+ */
+ filename = linker_basename(pathname);
+ if (linker_find_file_by_name(filename))
+ error = EEXIST;
+ else do {
+ error = linker_load_file(pathname, &lfdep);
+ if (error)
+ break;
+ if (modname && verinfo &&
+ modlist_lookup2(modname, verinfo) == NULL) {
+ linker_file_unload(lfdep, LINKER_UNLOAD_FORCE);
+ error = ENOENT;
+ break;
+ }
+ if (parent) {
+ error = linker_file_add_dependency(parent, lfdep);
+ if (error)
+ break;
+ }
+ if (lfpp)
+ *lfpp = lfdep;
+ } while (0);
+ free(pathname, M_LINKER);
+ return (error);
+}
+
+/*
+ * This routine is responsible for finding the dependencies of
+ * userland-initiated kldload(2) loads of files.
+ */
+int
+linker_load_dependencies(linker_file_t lf)
+{
+ linker_file_t lfdep;
+ struct mod_metadata **start, **stop, **mdp, **nmdp;
+ struct mod_metadata *mp, *nmp;
+ struct mod_depend *verinfo;
+ modlist_t mod;
+ const char *modname, *nmodname;
+ int ver, error = 0, count;
+
+ /*
+ * All files are dependent on /kernel.
+ */
+ sx_assert(&kld_sx, SA_XLOCKED);
+ if (linker_kernel_file) {
+ linker_kernel_file->refs++;
+ error = linker_file_add_dependency(lf, linker_kernel_file);
+ if (error)
+ return (error);
+ }
+ if (linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop,
+ &count) != 0)
+ return (0);
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = *mdp;
+ if (mp->md_type != MDT_VERSION)
+ continue;
+ modname = mp->md_cval;
+ ver = ((struct mod_version *)mp->md_data)->mv_version;
+ mod = modlist_lookup(modname, ver);
+ if (mod != NULL) {
+ printf("interface %s.%d already present in the KLD"
+ " '%s'!\n", modname, ver,
+ mod->container->filename);
+ return (EEXIST);
+ }
+ }
+
+ for (mdp = start; mdp < stop; mdp++) {
+ mp = *mdp;
+ if (mp->md_type != MDT_DEPEND)
+ continue;
+ modname = mp->md_cval;
+ verinfo = mp->md_data;
+ nmodname = NULL;
+ for (nmdp = start; nmdp < stop; nmdp++) {
+ nmp = *nmdp;
+ if (nmp->md_type != MDT_VERSION)
+ continue;
+ nmodname = nmp->md_cval;
+ if (strcmp(modname, nmodname) == 0)
+ break;
+ }
+ if (nmdp < stop)/* early exit, it's a self reference */
+ continue;
+ mod = modlist_lookup2(modname, verinfo);
+ if (mod) { /* woohoo, it's loaded already */
+ lfdep = mod->container;
+ lfdep->refs++;
+ error = linker_file_add_dependency(lf, lfdep);
+ if (error)
+ break;
+ continue;
+ }
+ error = linker_load_module(NULL, modname, lf, verinfo, NULL);
+ if (error) {
+ printf("KLD %s: depends on %s - not available or"
+ " version mismatch\n", lf->filename, modname);
+ break;
+ }
+ }
+
+ if (error)
+ return (error);
+ linker_addmodules(lf, start, stop, 0);
+ return (error);
+}
+
+static int
+sysctl_kern_function_list_iterate(const char *name, void *opaque)
+{
+ struct sysctl_req *req;
+
+ req = opaque;
+ return (SYSCTL_OUT(req, name, strlen(name) + 1));
+}
+
+/*
+ * Export a nul-separated, double-nul-terminated list of all function names
+ * in the kernel.
+ */
+static int
+sysctl_kern_function_list(SYSCTL_HANDLER_ARGS)
+{
+ linker_file_t lf;
+ int error;
+
+#ifdef MAC
+ error = mac_kld_check_stat(req->td->td_ucred);
+ if (error)
+ return (error);
+#endif
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sx_xlock(&kld_sx);
+ TAILQ_FOREACH(lf, &linker_files, link) {
+ error = LINKER_EACH_FUNCTION_NAME(lf,
+ sysctl_kern_function_list_iterate, req);
+ if (error) {
+ sx_xunlock(&kld_sx);
+ return (error);
+ }
+ }
+ sx_xunlock(&kld_sx);
+ return (SYSCTL_OUT(req, "", 1));
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, function_list, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sysctl_kern_function_list, "", "kernel function list");
diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c
new file mode 100644
index 0000000..87dca63
--- /dev/null
+++ b/sys/kern/kern_lock.c
@@ -0,0 +1,1505 @@
+/*-
+ * Copyright (c) 2008 Attilio Rao <attilio@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice(s), this list of conditions and the following disclaimer as
+ * the first lines of this file unmodified other than the possible
+ * addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice(s), this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include "opt_adaptive_lockmgrs.h"
+#include "opt_ddb.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kdb.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/lock_profile.h>
+#include <sys/lockmgr.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sleepqueue.h>
+#ifdef DEBUG_LOCKS
+#include <sys/stack.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <machine/cpu.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+PMC_SOFT_DECLARE( , , lock, failed);
+#endif
+
+CTASSERT(((LK_ADAPTIVE | LK_NOSHARE) & LO_CLASSFLAGS) ==
+ (LK_ADAPTIVE | LK_NOSHARE));
+CTASSERT(LK_UNLOCKED == (LK_UNLOCKED &
+ ~(LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS)));
+
+#define SQ_EXCLUSIVE_QUEUE 0
+#define SQ_SHARED_QUEUE 1
+
+#ifndef INVARIANTS
+#define _lockmgr_assert(lk, what, file, line)
+#define TD_LOCKS_INC(td)
+#define TD_LOCKS_DEC(td)
+#else
+#define TD_LOCKS_INC(td) ((td)->td_locks++)
+#define TD_LOCKS_DEC(td) ((td)->td_locks--)
+#endif
+#define TD_SLOCKS_INC(td) ((td)->td_lk_slocks++)
+#define TD_SLOCKS_DEC(td) ((td)->td_lk_slocks--)
+
+#ifndef DEBUG_LOCKS
+#define STACK_PRINT(lk)
+#define STACK_SAVE(lk)
+#define STACK_ZERO(lk)
+#else
+#define STACK_PRINT(lk) stack_print_ddb(&(lk)->lk_stack)
+#define STACK_SAVE(lk) stack_save(&(lk)->lk_stack)
+#define STACK_ZERO(lk) stack_zero(&(lk)->lk_stack)
+#endif
+
+#define LOCK_LOG2(lk, string, arg1, arg2) \
+ if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \
+ CTR2(KTR_LOCK, (string), (arg1), (arg2))
+#define LOCK_LOG3(lk, string, arg1, arg2, arg3) \
+ if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \
+ CTR3(KTR_LOCK, (string), (arg1), (arg2), (arg3))
+
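+/*
+ * Giant may be held (possibly recursively) when a lockmgr lock is acquired,
+ * so GIANT_SAVE() releases it before sleeping or spinning, remembering the
+ * recursion count in _i, and GIANT_RESTORE() reacquires it afterwards.
+ */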
+#define GIANT_DECLARE \
+ int _i = 0; \
+ WITNESS_SAVE_DECL(Giant)
+#define GIANT_RESTORE() do { \
+ if (_i > 0) { \
+ while (_i--) \
+ mtx_lock(&Giant); \
+ WITNESS_RESTORE(&Giant.lock_object, Giant); \
+ } \
+} while (0)
+#define GIANT_SAVE() do { \
+ if (mtx_owned(&Giant)) { \
+ WITNESS_SAVE(&Giant.lock_object, Giant); \
+ while (mtx_owned(&Giant)) { \
+ _i++; \
+ mtx_unlock(&Giant); \
+ } \
+ } \
+} while (0)
+
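+/*
+ * Shared acquisitions are permitted only while the lock is held in shared
+ * mode (or is unlocked), subject to the exclusive waiter/spinner checks
+ * below; a caller that already holds shared locks, or has TDP_DEADLKTREAT
+ * set, may jump ahead of exclusive waiters to avoid deadlock.
+ */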
+#define LK_CAN_SHARE(x) \
+ (((x) & LK_SHARE) && (((x) & LK_EXCLUSIVE_WAITERS) == 0 || \
+ ((x) & LK_EXCLUSIVE_SPINNERS) == 0 || \
+ curthread->td_lk_slocks || (curthread->td_pflags & TDP_DEADLKTREAT)))
+#define LK_TRYOP(x) \
+ ((x) & LK_NOWAIT)
+
+#define LK_CAN_WITNESS(x) \
+ (((x) & LK_NOWITNESS) == 0 && !LK_TRYOP(x))
+#define LK_TRYWIT(x) \
+ (LK_TRYOP(x) ? LOP_TRYLOCK : 0)
+
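+/*
+ * Adaptive spinning is attempted only if the lock was initialized with
+ * LK_ADAPTIVE and the caller did not request LK_SLEEPFAIL.
+ */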
+#define LK_CAN_ADAPT(lk, f) \
+ (((lk)->lock_object.lo_flags & LK_ADAPTIVE) != 0 && \
+ ((f) & LK_SLEEPFAIL) == 0)
+
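+/*
+ * The owner is encoded in lk_lock itself: lockmgr_disowned() checks for the
+ * LK_KERNPROC placeholder owner, while lockmgr_xlocked() checks whether
+ * curthread holds the lock exclusively, in both cases ignoring the waiter
+ * and spinner flag bits.
+ */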
+#define lockmgr_disowned(lk) \
+ (((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == LK_KERNPROC)
+
+#define lockmgr_xlocked(lk) \
+ (((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == (uintptr_t)curthread)
+
+static void assert_lockmgr(const struct lock_object *lock, int how);
+#ifdef DDB
+static void db_show_lockmgr(const struct lock_object *lock);
+#endif
+static void lock_lockmgr(struct lock_object *lock, int how);
+#ifdef KDTRACE_HOOKS
+static int owner_lockmgr(const struct lock_object *lock,
+ struct thread **owner);
+#endif
+static int unlock_lockmgr(struct lock_object *lock);
+
+struct lock_class lock_class_lockmgr = {
+ .lc_name = "lockmgr",
+ .lc_flags = LC_RECURSABLE | LC_SLEEPABLE | LC_SLEEPLOCK | LC_UPGRADABLE,
+ .lc_assert = assert_lockmgr,
+#ifdef DDB
+ .lc_ddb_show = db_show_lockmgr,
+#endif
+ .lc_lock = lock_lockmgr,
+ .lc_unlock = unlock_lockmgr,
+#ifdef KDTRACE_HOOKS
+ .lc_owner = owner_lockmgr,
+#endif
+};
+
+#ifdef ADAPTIVE_LOCKMGRS
+static u_int alk_retries = 10;
+static u_int alk_loops = 10000;
+static SYSCTL_NODE(_debug, OID_AUTO, lockmgr, CTLFLAG_RD, NULL,
+ "lockmgr debugging");
+SYSCTL_UINT(_debug_lockmgr, OID_AUTO, retries, CTLFLAG_RW, &alk_retries, 0, "");
+SYSCTL_UINT(_debug_lockmgr, OID_AUTO, loops, CTLFLAG_RW, &alk_loops, 0, "");
+#endif
+
+static __inline struct thread *
+lockmgr_xholder(const struct lock *lk)
+{
+ uintptr_t x;
+
+ x = lk->lk_lock;
+ return ((x & LK_SHARE) ? NULL : (struct thread *)LK_HOLDER(x));
+}
+
+/*
+ * This function assumes that the sleepqueue chain lock is held on entry and
+ * returns with it released. It also assumes the generic interlock is sane
+ * and has previously been checked. If LK_INTERLOCK is specified, the
+ * interlock is not reacquired after the sleep.
+ */
+static __inline int
+sleeplk(struct lock *lk, u_int flags, struct lock_object *ilk,
+ const char *wmesg, int pri, int timo, int queue)
+{
+ GIANT_DECLARE;
+ struct lock_class *class;
+ int catch, error;
+
+ class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL;
+ catch = pri & PCATCH;
+ pri &= PRIMASK;
+ error = 0;
+
+ LOCK_LOG3(lk, "%s: %p blocking on the %s sleepqueue", __func__, lk,
+ (queue == SQ_EXCLUSIVE_QUEUE) ? "exclusive" : "shared");
+
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
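+ /*
+ * Track how many sleepers on the exclusive queue requested
+ * LK_SLEEPFAIL; the wakeup paths use this count (as an upper
+ * bound) to detect when every exclusive waiter would fail
+ * immediately and the shared queue should be preferred.
+ */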
+ if (queue == SQ_EXCLUSIVE_QUEUE && (flags & LK_SLEEPFAIL) != 0)
+ lk->lk_exslpfail++;
+ GIANT_SAVE();
+ sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ?
+ SLEEPQ_INTERRUPTIBLE : 0), queue);
+ if ((flags & LK_TIMELOCK) && timo)
+ sleepq_set_timeout(&lk->lock_object, timo);
+
+ /*
+ * Decide which form of sleep to use based on the requested flags.
+ */
+ if ((flags & LK_TIMELOCK) && timo && catch)
+ error = sleepq_timedwait_sig(&lk->lock_object, pri);
+ else if ((flags & LK_TIMELOCK) && timo)
+ error = sleepq_timedwait(&lk->lock_object, pri);
+ else if (catch)
+ error = sleepq_wait_sig(&lk->lock_object, pri);
+ else
+ sleepq_wait(&lk->lock_object, pri);
+ GIANT_RESTORE();
+ if ((flags & LK_SLEEPFAIL) && error == 0)
+ error = ENOLCK;
+
+ return (error);
+}
+
+static __inline int
+wakeupshlk(struct lock *lk, const char *file, int line)
+{
+ uintptr_t v, x;
+ u_int realexslp;
+ int queue, wakeup_swapper;
+
+ WITNESS_UNLOCK(&lk->lock_object, 0, file, line);
+ LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, file, line);
+
+ wakeup_swapper = 0;
+ for (;;) {
+ x = lk->lk_lock;
+
+ /*
+ * If there is more than one shared lock held, just drop one
+ * and return.
+ */
+ if (LK_SHARERS(x) > 1) {
+ if (atomic_cmpset_rel_ptr(&lk->lk_lock, x,
+ x - LK_ONE_SHARER))
+ break;
+ continue;
+ }
+
+ /*
+ * If there are no waiters on the exclusive queue, drop the
+ * lock quickly.
+ */
+ if ((x & LK_ALL_WAITERS) == 0) {
+ MPASS((x & ~LK_EXCLUSIVE_SPINNERS) ==
+ LK_SHARERS_LOCK(1));
+ if (atomic_cmpset_rel_ptr(&lk->lk_lock, x, LK_UNLOCKED))
+ break;
+ continue;
+ }
+
+ /*
+ * We should have a sharer with waiters, so enter the hard
+ * path in order to handle wakeups correctly.
+ */
+ sleepq_lock(&lk->lock_object);
+ x = lk->lk_lock & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS);
+ v = LK_UNLOCKED;
+
+ /*
+ * If the lock has exclusive waiters, give them preference in
+ * order to avoid a deadlock with the shared runners-up.
+ * If interruptible sleeps left the exclusive queue empty,
+ * avoid starving the threads sleeping on the shared queue by
+ * giving them precedence and cleaning up the exclusive
+ * waiters bit anyway.
+ * Please note that the lk_exslpfail count may be inaccurate
+ * about the real number of waiters with the LK_SLEEPFAIL flag
+ * set, because LK_SLEEPFAIL may be used in conjunction with
+ * interruptible sleeps, so lk_exslpfail should be considered
+ * an upper bound, including the edge cases.
+ */
+ realexslp = sleepq_sleepcnt(&lk->lock_object,
+ SQ_EXCLUSIVE_QUEUE);
+ if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) {
+ if (lk->lk_exslpfail < realexslp) {
+ lk->lk_exslpfail = 0;
+ queue = SQ_EXCLUSIVE_QUEUE;
+ v |= (x & LK_SHARED_WAITERS);
+ } else {
+ lk->lk_exslpfail = 0;
+ LOCK_LOG2(lk,
+ "%s: %p has only LK_SLEEPFAIL sleepers",
+ __func__, lk);
+ LOCK_LOG2(lk,
+ "%s: %p waking up threads on the exclusive queue",
+ __func__, lk);
+ wakeup_swapper =
+ sleepq_broadcast(&lk->lock_object,
+ SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE);
+ queue = SQ_SHARED_QUEUE;
+ }
+
+ } else {
+
+ /*
+ * Exclusive waiters sleeping with LK_SLEEPFAIL on
+ * and using interruptible sleeps/timeouts may have
+ * left spurious lk_exslpfail counts behind, so clean
+ * them up anyway.
+ */
+ lk->lk_exslpfail = 0;
+ queue = SQ_SHARED_QUEUE;
+ }
+
+ if (!atomic_cmpset_rel_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x,
+ v)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+ LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue",
+ __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" :
+ "exclusive");
+ wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK,
+ 0, queue);
+ sleepq_release(&lk->lock_object);
+ break;
+ }
+
+ lock_profile_release_lock(&lk->lock_object);
+ TD_LOCKS_DEC(curthread);
+ TD_SLOCKS_DEC(curthread);
+ return (wakeup_swapper);
+}
+
+static void
+assert_lockmgr(const struct lock_object *lock, int what)
+{
+
+ panic("lockmgr locks do not support assertions");
+}
+
+static void
+lock_lockmgr(struct lock_object *lock, int how)
+{
+
+ panic("lockmgr locks do not support sleep interlocking");
+}
+
+static int
+unlock_lockmgr(struct lock_object *lock)
+{
+
+ panic("lockmgr locks do not support sleep interlocking");
+}
+
+#ifdef KDTRACE_HOOKS
+static int
+owner_lockmgr(const struct lock_object *lock, struct thread **owner)
+{
+
+ panic("lockmgr locks do not support owner inquiring");
+}
+#endif
+
+void
+lockinit(struct lock *lk, int pri, const char *wmesg, int timo, int flags)
+{
+ int iflags;
+
+ MPASS((flags & ~LK_INIT_MASK) == 0);
+ ASSERT_ATOMIC_LOAD_PTR(lk->lk_lock,
+ ("%s: lockmgr not aligned for %s: %p", __func__, wmesg,
+ &lk->lk_lock));
+
+ iflags = LO_SLEEPABLE | LO_UPGRADABLE;
+ if (flags & LK_CANRECURSE)
+ iflags |= LO_RECURSABLE;
+ if ((flags & LK_NODUP) == 0)
+ iflags |= LO_DUPOK;
+ if (flags & LK_NOPROFILE)
+ iflags |= LO_NOPROFILE;
+ if ((flags & LK_NOWITNESS) == 0)
+ iflags |= LO_WITNESS;
+ if (flags & LK_QUIET)
+ iflags |= LO_QUIET;
+ if (flags & LK_IS_VNODE)
+ iflags |= LO_IS_VNODE;
+ iflags |= flags & (LK_ADAPTIVE | LK_NOSHARE);
+
+ lock_init(&lk->lock_object, &lock_class_lockmgr, wmesg, NULL, iflags);
+ lk->lk_lock = LK_UNLOCKED;
+ lk->lk_recurse = 0;
+ lk->lk_exslpfail = 0;
+ lk->lk_timo = timo;
+ lk->lk_pri = pri;
+ STACK_ZERO(lk);
+}
+
+/*
+ * XXX: Gross hacks to manipulate external lock flags after
+ * initialization. Used for certain vnode and buf locks.
+ */
+void
+lockallowshare(struct lock *lk)
+{
+
+ lockmgr_assert(lk, KA_XLOCKED);
+ lk->lock_object.lo_flags &= ~LK_NOSHARE;
+}
+
+void
+lockallowrecurse(struct lock *lk)
+{
+
+ lockmgr_assert(lk, KA_XLOCKED);
+ lk->lock_object.lo_flags |= LO_RECURSABLE;
+}
+
+void
+lockdisablerecurse(struct lock *lk)
+{
+
+ lockmgr_assert(lk, KA_XLOCKED);
+ lk->lock_object.lo_flags &= ~LO_RECURSABLE;
+}
+
+void
+lockdestroy(struct lock *lk)
+{
+
+ KASSERT(lk->lk_lock == LK_UNLOCKED, ("lockmgr still held"));
+ KASSERT(lk->lk_recurse == 0, ("lockmgr still recursed"));
+ KASSERT(lk->lk_exslpfail == 0, ("lockmgr still exclusive waiters"));
+ lock_destroy(&lk->lock_object);
+}
+
+int
+__lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk,
+ const char *wmesg, int pri, int timo, const char *file, int line)
+{
+ GIANT_DECLARE;
+ struct lock_class *class;
+ const char *iwmesg;
+ uintptr_t tid, v, x;
+ u_int op, realexslp;
+ int error, ipri, itimo, queue, wakeup_swapper;
+#ifdef LOCK_PROFILING
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+#ifdef ADAPTIVE_LOCKMGRS
+ volatile struct thread *owner;
+ u_int i, spintries = 0;
+#endif
+
+ error = 0;
+ tid = (uintptr_t)curthread;
+ op = (flags & LK_TYPE_MASK);
+ iwmesg = (wmesg == LK_WMESG_DEFAULT) ? lk->lock_object.lo_name : wmesg;
+ ipri = (pri == LK_PRIO_DEFAULT) ? lk->lk_pri : pri;
+ itimo = (timo == LK_TIMO_DEFAULT) ? lk->lk_timo : timo;
+
+ MPASS((flags & ~LK_TOTAL_MASK) == 0);
+ KASSERT((op & (op - 1)) == 0,
+ ("%s: Invalid requested operation @ %s:%d", __func__, file, line));
+ KASSERT((flags & (LK_NOWAIT | LK_SLEEPFAIL)) == 0 ||
+ (op != LK_DOWNGRADE && op != LK_RELEASE),
+ ("%s: Invalid flags in regard of the operation desired @ %s:%d",
+ __func__, file, line));
+ KASSERT((flags & LK_INTERLOCK) == 0 || ilk != NULL,
+ ("%s: LK_INTERLOCK passed without valid interlock @ %s:%d",
+ __func__, file, line));
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("%s: idle thread %p on lockmgr %s @ %s:%d", __func__, curthread,
+ lk->lock_object.lo_name, file, line));
+
+ class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL;
+ if (panicstr != NULL) {
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ return (0);
+ }
+
+ if (lk->lock_object.lo_flags & LK_NOSHARE) {
+ switch (op) {
+ case LK_SHARED:
+ op = LK_EXCLUSIVE;
+ break;
+ case LK_UPGRADE:
+ case LK_DOWNGRADE:
+ _lockmgr_assert(lk, KA_XLOCKED | KA_NOTRECURSED,
+ file, line);
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ return (0);
+ }
+ }
+
+ wakeup_swapper = 0;
+ switch (op) {
+ case LK_SHARED:
+ if (LK_CAN_WITNESS(flags))
+ WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER,
+ file, line, flags & LK_INTERLOCK ? ilk : NULL);
+ for (;;) {
+ x = lk->lk_lock;
+
+ /*
+ * If no other thread has an exclusive lock, or
+ * no exclusive waiter is present, bump the count of
+ * sharers. Since we have to preserve the state of
+ * waiters, if we fail to acquire the shared lock
+ * loop back and retry.
+ */
+ if (LK_CAN_SHARE(x)) {
+ if (atomic_cmpset_acq_ptr(&lk->lk_lock, x,
+ x + LK_ONE_SHARER))
+ break;
+ continue;
+ }
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&lk->lock_object,
+ &contested, &waittime);
+
+ /*
+ * If the lock is already held by curthread in
+ * exclusive mode, avoid a deadlock.
+ */
+ if (LK_HOLDER(x) == tid) {
+ LOCK_LOG2(lk,
+ "%s: %p already held in exclusive mode",
+ __func__, lk);
+ error = EDEADLK;
+ break;
+ }
+
+ /*
+ * If the operation is not allowed to sleep, just give
+ * up and return.
+ */
+ if (LK_TRYOP(flags)) {
+ LOCK_LOG2(lk, "%s: %p fails the try operation",
+ __func__, lk);
+ error = EBUSY;
+ break;
+ }
+
+#ifdef ADAPTIVE_LOCKMGRS
+ /*
+ * If the owner is running on another CPU, spin until
+ * the owner stops running or the state of the lock
+ * changes. We need to handle two states here
+ * because, for a failed acquisition, the lock can be
+ * either held in exclusive mode or in shared mode
+ * (for the writer starvation avoidance technique).
+ */
+ if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 &&
+ LK_HOLDER(x) != LK_KERNPROC) {
+ owner = (struct thread *)LK_HOLDER(x);
+ if (LOCK_LOG_TEST(&lk->lock_object, 0))
+ CTR3(KTR_LOCK,
+ "%s: spinning on %p held by %p",
+ __func__, lk, owner);
+
+ /*
+ * If we are also holding an interlock, drop it
+ * in order to avoid a deadlock if the lockmgr
+ * owner is adaptively spinning on the
+ * interlock itself.
+ */
+ if (flags & LK_INTERLOCK) {
+ class->lc_unlock(ilk);
+ flags &= ~LK_INTERLOCK;
+ }
+ GIANT_SAVE();
+ while (LK_HOLDER(lk->lk_lock) ==
+ (uintptr_t)owner && TD_IS_RUNNING(owner))
+ cpu_spinwait();
+ GIANT_RESTORE();
+ continue;
+ } else if (LK_CAN_ADAPT(lk, flags) &&
+ (x & LK_SHARE) != 0 && LK_SHARERS(x) &&
+ spintries < alk_retries) {
+ if (flags & LK_INTERLOCK) {
+ class->lc_unlock(ilk);
+ flags &= ~LK_INTERLOCK;
+ }
+ GIANT_SAVE();
+ spintries++;
+ for (i = 0; i < alk_loops; i++) {
+ if (LOCK_LOG_TEST(&lk->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: shared spinning on %p with %u and %u",
+ __func__, lk, spintries, i);
+ x = lk->lk_lock;
+ if ((x & LK_SHARE) == 0 ||
+ LK_CAN_SHARE(x) != 0)
+ break;
+ cpu_spinwait();
+ }
+ GIANT_RESTORE();
+ if (i != alk_loops)
+ continue;
+ }
+#endif
+
+ /*
+ * Acquire the sleepqueue chain lock because we
+ * probably will need to manipulate the waiters flags.
+ */
+ sleepq_lock(&lk->lock_object);
+ x = lk->lk_lock;
+
+ /*
+ * If the lock can be acquired in shared mode, try
+ * again.
+ */
+ if (LK_CAN_SHARE(x)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+
+#ifdef ADAPTIVE_LOCKMGRS
+ /*
+ * The current lock owner might have started executing
+ * on another CPU (or the lock could have changed
+ * owner) while we were waiting on the sleepqueue
+ * chain lock. If so, drop the sleepqueue lock and try
+ * again.
+ */
+ if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 &&
+ LK_HOLDER(x) != LK_KERNPROC) {
+ owner = (struct thread *)LK_HOLDER(x);
+ if (TD_IS_RUNNING(owner)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+ }
+#endif
+
+ /*
+ * Try to set the LK_SHARED_WAITERS flag. If we fail,
+ * loop back and retry.
+ */
+ if ((x & LK_SHARED_WAITERS) == 0) {
+ if (!atomic_cmpset_acq_ptr(&lk->lk_lock, x,
+ x | LK_SHARED_WAITERS)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+ LOCK_LOG2(lk, "%s: %p set shared waiters flag",
+ __func__, lk);
+ }
+
+ /*
+ * Since we have been unable to acquire the
+ * shared lock and the shared waiters flag is set,
+ * we will sleep.
+ */
+ error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo,
+ SQ_SHARED_QUEUE);
+ flags &= ~LK_INTERLOCK;
+ if (error) {
+ LOCK_LOG3(lk,
+ "%s: interrupted sleep for %p with %d",
+ __func__, lk, error);
+ break;
+ }
+ LOCK_LOG2(lk, "%s: %p resuming from the sleep queue",
+ __func__, lk);
+ }
+ if (error == 0) {
+ lock_profile_obtain_lock_success(&lk->lock_object,
+ contested, waittime, file, line);
+ LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, file,
+ line);
+ WITNESS_LOCK(&lk->lock_object, LK_TRYWIT(flags), file,
+ line);
+ TD_LOCKS_INC(curthread);
+ TD_SLOCKS_INC(curthread);
+ STACK_SAVE(lk);
+ }
+ break;
+ case LK_UPGRADE:
+ _lockmgr_assert(lk, KA_SLOCKED, file, line);
+ v = lk->lk_lock;
+ x = v & LK_ALL_WAITERS;
+ v &= LK_EXCLUSIVE_SPINNERS;
+
+ /*
+ * Try to switch from one shared lock to an exclusive one.
+ * We need to preserve waiters flags during the operation.
+ */
+ if (atomic_cmpset_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x | v,
+ tid | x)) {
+ LOCK_LOG_LOCK("XUPGRADE", &lk->lock_object, 0, 0, file,
+ line);
+ WITNESS_UPGRADE(&lk->lock_object, LOP_EXCLUSIVE |
+ LK_TRYWIT(flags), file, line);
+ TD_SLOCKS_DEC(curthread);
+ break;
+ }
+
+ /*
+ * We have been unable to upgrade, so just
+ * give up the shared lock.
+ */
+ wakeup_swapper |= wakeupshlk(lk, file, line);
+
+ /* FALLTHROUGH */
+ case LK_EXCLUSIVE:
+ if (LK_CAN_WITNESS(flags))
+ WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER |
+ LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ?
+ ilk : NULL);
+
+ /*
+ * If curthread already holds the lock and this one is
+ * allowed to recurse, simply recurse on it.
+ */
+ if (lockmgr_xlocked(lk)) {
+ if ((flags & LK_CANRECURSE) == 0 &&
+ (lk->lock_object.lo_flags & LO_RECURSABLE) == 0) {
+
+ /*
+ * If this is a try operation, just give up
+ * and return rather than panic.
+ */
+ if (LK_TRYOP(flags)) {
+ LOCK_LOG2(lk,
+ "%s: %p fails the try operation",
+ __func__, lk);
+ error = EBUSY;
+ break;
+ }
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ panic("%s: recursing on non recursive lockmgr %s @ %s:%d\n",
+ __func__, iwmesg, file, line);
+ }
+ lk->lk_recurse++;
+ LOCK_LOG2(lk, "%s: %p recursing", __func__, lk);
+ LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0,
+ lk->lk_recurse, file, line);
+ WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE |
+ LK_TRYWIT(flags), file, line);
+ TD_LOCKS_INC(curthread);
+ break;
+ }
+
+ while (!atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED,
+ tid)) {
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&lk->lock_object,
+ &contested, &waittime);
+
+ /*
+ * If the operation is not allowed to sleep, just give up
+ * and return.
+ */
+ if (LK_TRYOP(flags)) {
+ LOCK_LOG2(lk, "%s: %p fails the try operation",
+ __func__, lk);
+ error = EBUSY;
+ break;
+ }
+
+#ifdef ADAPTIVE_LOCKMGRS
+ /*
+ * If the owner is running on another CPU, spin until
+ * the owner stops running or the state of the lock
+ * changes.
+ */
+ x = lk->lk_lock;
+ if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 &&
+ LK_HOLDER(x) != LK_KERNPROC) {
+ owner = (struct thread *)LK_HOLDER(x);
+ if (LOCK_LOG_TEST(&lk->lock_object, 0))
+ CTR3(KTR_LOCK,
+ "%s: spinning on %p held by %p",
+ __func__, lk, owner);
+
+ /*
+ * If we are also holding an interlock, drop it
+ * in order to avoid a deadlock if the lockmgr
+ * owner is adaptively spinning on the
+ * interlock itself.
+ */
+ if (flags & LK_INTERLOCK) {
+ class->lc_unlock(ilk);
+ flags &= ~LK_INTERLOCK;
+ }
+ GIANT_SAVE();
+ while (LK_HOLDER(lk->lk_lock) ==
+ (uintptr_t)owner && TD_IS_RUNNING(owner))
+ cpu_spinwait();
+ GIANT_RESTORE();
+ continue;
+ } else if (LK_CAN_ADAPT(lk, flags) &&
+ (x & LK_SHARE) != 0 && LK_SHARERS(x) &&
+ spintries < alk_retries) {
+ if ((x & LK_EXCLUSIVE_SPINNERS) == 0 &&
+ !atomic_cmpset_ptr(&lk->lk_lock, x,
+ x | LK_EXCLUSIVE_SPINNERS))
+ continue;
+ if (flags & LK_INTERLOCK) {
+ class->lc_unlock(ilk);
+ flags &= ~LK_INTERLOCK;
+ }
+ GIANT_SAVE();
+ spintries++;
+ for (i = 0; i < alk_loops; i++) {
+ if (LOCK_LOG_TEST(&lk->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: shared spinning on %p with %u and %u",
+ __func__, lk, spintries, i);
+ if ((lk->lk_lock &
+ LK_EXCLUSIVE_SPINNERS) == 0)
+ break;
+ cpu_spinwait();
+ }
+ GIANT_RESTORE();
+ if (i != alk_loops)
+ continue;
+ }
+#endif
+
+ /*
+ * Acquire the sleepqueue chain lock because we
+ * probably will need to manipulate the waiters flags.
+ */
+ sleepq_lock(&lk->lock_object);
+ x = lk->lk_lock;
+
+ /*
+ * If the lock has been released while we spun on
+ * the sleepqueue chain lock, just try again.
+ */
+ if (x == LK_UNLOCKED) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+
+#ifdef ADAPTIVE_LOCKMGRS
+ /*
+ * The current lock owner might have started executing
+ * on another CPU (or the lock could have changed
+ * owner) while we were waiting on the sleepqueue
+ * chain lock.  If so, drop the sleepqueue lock and try
+ * again.
+ */
+ if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 &&
+ LK_HOLDER(x) != LK_KERNPROC) {
+ owner = (struct thread *)LK_HOLDER(x);
+ if (TD_IS_RUNNING(owner)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+ }
+#endif
+
+ /*
+ * The lock can be in a state where there is a
+ * pending queue of waiters but still no owner.
+ * This happens when the lock is contested and an
+ * owner is going to claim the lock.
+ * If curthread is the one successfully acquiring it,
+ * claim lock ownership and return, preserving the
+ * waiters flags.
+ */
+ v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS);
+ if ((x & ~v) == LK_UNLOCKED) {
+ v &= ~LK_EXCLUSIVE_SPINNERS;
+ if (atomic_cmpset_acq_ptr(&lk->lk_lock, x,
+ tid | v)) {
+ sleepq_release(&lk->lock_object);
+ LOCK_LOG2(lk,
+ "%s: %p claimed by a new writer",
+ __func__, lk);
+ break;
+ }
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+
+ /*
+ * Try to set the LK_EXCLUSIVE_WAITERS flag. If we
+ * fail, loop back and retry.
+ */
+ if ((x & LK_EXCLUSIVE_WAITERS) == 0) {
+ if (!atomic_cmpset_ptr(&lk->lk_lock, x,
+ x | LK_EXCLUSIVE_WAITERS)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+ LOCK_LOG2(lk, "%s: %p set excl waiters flag",
+ __func__, lk);
+ }
+
+ /*
+ * Since we have been unable to acquire the
+ * exclusive lock and the exclusive waiters flag
+ * is set, we will sleep.
+ */
+ error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo,
+ SQ_EXCLUSIVE_QUEUE);
+ flags &= ~LK_INTERLOCK;
+ if (error) {
+ LOCK_LOG3(lk,
+ "%s: interrupted sleep for %p with %d",
+ __func__, lk, error);
+ break;
+ }
+ LOCK_LOG2(lk, "%s: %p resuming from the sleep queue",
+ __func__, lk);
+ }
+ if (error == 0) {
+ lock_profile_obtain_lock_success(&lk->lock_object,
+ contested, waittime, file, line);
+ LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0,
+ lk->lk_recurse, file, line);
+ WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE |
+ LK_TRYWIT(flags), file, line);
+ TD_LOCKS_INC(curthread);
+ STACK_SAVE(lk);
+ }
+ break;
+ case LK_DOWNGRADE:
+ _lockmgr_assert(lk, KA_XLOCKED, file, line);
+ LOCK_LOG_LOCK("XDOWNGRADE", &lk->lock_object, 0, 0, file, line);
+ WITNESS_DOWNGRADE(&lk->lock_object, 0, file, line);
+
+ /*
+ * Panic if the lock is recursed.
+ */
+ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) {
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ panic("%s: downgrade a recursed lockmgr %s @ %s:%d\n",
+ __func__, iwmesg, file, line);
+ }
+ TD_SLOCKS_INC(curthread);
+
+ /*
+ * In order to preserve waiters flags, just spin.
+ */
+ for (;;) {
+ x = lk->lk_lock;
+ MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0);
+ x &= LK_ALL_WAITERS;
+ if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x,
+ LK_SHARERS_LOCK(1) | x))
+ break;
+ cpu_spinwait();
+ }
+ break;
+ case LK_RELEASE:
+ _lockmgr_assert(lk, KA_LOCKED, file, line);
+ x = lk->lk_lock;
+
+ if ((x & LK_SHARE) == 0) {
+
+ /*
+ * As a first option, treat the lock as if it has no
+ * waiters.
+ * Fix up the tid variable if the lock has been disowned.
+ */
+ if (LK_HOLDER(x) == LK_KERNPROC)
+ tid = LK_KERNPROC;
+ else {
+ WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE,
+ file, line);
+ TD_LOCKS_DEC(curthread);
+ }
+ LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0,
+ lk->lk_recurse, file, line);
+
+ /*
+ * The lock is held in exclusive mode.
+ * If the lock is recursed also, then unrecurse it.
+ */
+ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) {
+ LOCK_LOG2(lk, "%s: %p unrecursing", __func__,
+ lk);
+ lk->lk_recurse--;
+ break;
+ }
+ if (tid != LK_KERNPROC)
+ lock_profile_release_lock(&lk->lock_object);
+
+ if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid,
+ LK_UNLOCKED))
+ break;
+
+ sleepq_lock(&lk->lock_object);
+ x = lk->lk_lock;
+ v = LK_UNLOCKED;
+
+ /*
+ * If the lock has exclusive waiters, give them
+ * preference in order to avoid a deadlock with
+ * shared runners-up.
+ * If interruptible sleeps left the exclusive queue
+ * empty, avoid starving the threads sleeping
+ * on the shared queue by giving them precedence
+ * and cleaning up the exclusive waiters bit anyway.
+ * Please note that the lk_exslpfail count may be lying
+ * about the real number of waiters with the
+ * LK_SLEEPFAIL flag on, because they may be used in
+ * conjunction with interruptible sleeps, so
+ * lk_exslpfail should be considered an upper-limit
+ * bound, including the edge cases.
+ */
+ MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0);
+ realexslp = sleepq_sleepcnt(&lk->lock_object,
+ SQ_EXCLUSIVE_QUEUE);
+ if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) {
+ if (lk->lk_exslpfail < realexslp) {
+ lk->lk_exslpfail = 0;
+ queue = SQ_EXCLUSIVE_QUEUE;
+ v |= (x & LK_SHARED_WAITERS);
+ } else {
+ lk->lk_exslpfail = 0;
+ LOCK_LOG2(lk,
+ "%s: %p has only LK_SLEEPFAIL sleepers",
+ __func__, lk);
+ LOCK_LOG2(lk,
+ "%s: %p waking up threads on the exclusive queue",
+ __func__, lk);
+ wakeup_swapper =
+ sleepq_broadcast(&lk->lock_object,
+ SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE);
+ queue = SQ_SHARED_QUEUE;
+ }
+ } else {
+
+ /*
+ * Exclusive waiters sleeping with LK_SLEEPFAIL
+ * on and using interruptible sleeps/timeout
+ * may have left spurious lk_exslpfail counts
+ * on, so clean them up anyway.
+ */
+ lk->lk_exslpfail = 0;
+ queue = SQ_SHARED_QUEUE;
+ }
+
+ LOCK_LOG3(lk,
+ "%s: %p waking up threads on the %s queue",
+ __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" :
+ "exclusive");
+ atomic_store_rel_ptr(&lk->lk_lock, v);
+ wakeup_swapper |= sleepq_broadcast(&lk->lock_object,
+ SLEEPQ_LK, 0, queue);
+ sleepq_release(&lk->lock_object);
+ break;
+ } else
+ wakeup_swapper = wakeupshlk(lk, file, line);
+ break;
+ case LK_DRAIN:
+ if (LK_CAN_WITNESS(flags))
+ WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER |
+ LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ?
+ ilk : NULL);
+
+ /*
+ * Trying to drain a lock we already own will result in a
+ * deadlock.
+ */
+ if (lockmgr_xlocked(lk)) {
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ panic("%s: draining %s with the lock held @ %s:%d\n",
+ __func__, iwmesg, file, line);
+ }
+
+ while (!atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) {
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&lk->lock_object,
+ &contested, &waittime);
+
+ /*
+ * If the lock is expected to not sleep just give up
+ * and return.
+ */
+ if (LK_TRYOP(flags)) {
+ LOCK_LOG2(lk, "%s: %p fails the try operation",
+ __func__, lk);
+ error = EBUSY;
+ break;
+ }
+
+ /*
+ * Acquire the sleepqueue chain lock because we
+ * probably will need to manipulate the waiters flags.
+ */
+ sleepq_lock(&lk->lock_object);
+ x = lk->lk_lock;
+
+ /*
+ * If the lock has been released while we spun on
+ * the sleepqueue chain lock, just try again.
+ */
+ if (x == LK_UNLOCKED) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+
+ v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS);
+ if ((x & ~v) == LK_UNLOCKED) {
+ v = (x & ~LK_EXCLUSIVE_SPINNERS);
+
+ /*
+ * If interruptible sleeps left the exclusive
+ * queue empty, avoid starving the
+ * threads sleeping on the shared queue by
+ * giving them precedence and cleaning up the
+ * exclusive waiters bit anyway.
+ * Please note that the lk_exslpfail count may be
+ * lying about the real number of waiters with
+ * the LK_SLEEPFAIL flag on, because they may
+ * be used in conjunction with interruptible
+ * sleeps, so lk_exslpfail should be considered
+ * an upper-limit bound, including the edge
+ * cases.
+ */
+ if (v & LK_EXCLUSIVE_WAITERS) {
+ queue = SQ_EXCLUSIVE_QUEUE;
+ v &= ~LK_EXCLUSIVE_WAITERS;
+ } else {
+
+ /*
+ * Exclusive waiters sleeping with
+ * LK_SLEEPFAIL on and using
+ * interruptible sleeps/timeout may
+ * have left spurious lk_exslpfail
+ * counts on, so clean them up anyway.
+ */
+ MPASS(v & LK_SHARED_WAITERS);
+ lk->lk_exslpfail = 0;
+ queue = SQ_SHARED_QUEUE;
+ v &= ~LK_SHARED_WAITERS;
+ }
+ if (queue == SQ_EXCLUSIVE_QUEUE) {
+ realexslp =
+ sleepq_sleepcnt(&lk->lock_object,
+ SQ_EXCLUSIVE_QUEUE);
+ if (lk->lk_exslpfail >= realexslp) {
+ lk->lk_exslpfail = 0;
+ queue = SQ_SHARED_QUEUE;
+ v &= ~LK_SHARED_WAITERS;
+ if (realexslp != 0) {
+ LOCK_LOG2(lk,
+ "%s: %p has only LK_SLEEPFAIL sleepers",
+ __func__, lk);
+ LOCK_LOG2(lk,
+ "%s: %p waking up threads on the exclusive queue",
+ __func__, lk);
+ wakeup_swapper =
+ sleepq_broadcast(
+ &lk->lock_object,
+ SLEEPQ_LK, 0,
+ SQ_EXCLUSIVE_QUEUE);
+ }
+ } else
+ lk->lk_exslpfail = 0;
+ }
+ if (!atomic_cmpset_ptr(&lk->lk_lock, x, v)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+ LOCK_LOG3(lk,
+ "%s: %p waking up all threads on the %s queue",
+ __func__, lk, queue == SQ_SHARED_QUEUE ?
+ "shared" : "exclusive");
+ wakeup_swapper |= sleepq_broadcast(
+ &lk->lock_object, SLEEPQ_LK, 0, queue);
+
+ /*
+ * If shared waiters have been woken up, we need
+ * to wait for one of them to acquire the lock
+ * before setting the exclusive waiters flag, in
+ * order to avoid a deadlock.
+ */
+ if (queue == SQ_SHARED_QUEUE) {
+ for (v = lk->lk_lock;
+ (v & LK_SHARE) && !LK_SHARERS(v);
+ v = lk->lk_lock)
+ cpu_spinwait();
+ }
+ }
+
+ /*
+ * Try to set the LK_EXCLUSIVE_WAITERS flag. If we
+ * fail, loop back and retry.
+ */
+ if ((x & LK_EXCLUSIVE_WAITERS) == 0) {
+ if (!atomic_cmpset_ptr(&lk->lk_lock, x,
+ x | LK_EXCLUSIVE_WAITERS)) {
+ sleepq_release(&lk->lock_object);
+ continue;
+ }
+ LOCK_LOG2(lk, "%s: %p set drain waiters flag",
+ __func__, lk);
+ }
+
+ /*
+ * Since we have been unable to acquire the
+ * exclusive lock and the exclusive waiters flag
+ * is set, we will sleep.
+ */
+ if (flags & LK_INTERLOCK) {
+ class->lc_unlock(ilk);
+ flags &= ~LK_INTERLOCK;
+ }
+ GIANT_SAVE();
+ sleepq_add(&lk->lock_object, NULL, iwmesg, SLEEPQ_LK,
+ SQ_EXCLUSIVE_QUEUE);
+ sleepq_wait(&lk->lock_object, ipri & PRIMASK);
+ GIANT_RESTORE();
+ LOCK_LOG2(lk, "%s: %p resuming from the sleep queue",
+ __func__, lk);
+ }
+
+ if (error == 0) {
+ lock_profile_obtain_lock_success(&lk->lock_object,
+ contested, waittime, file, line);
+ LOCK_LOG_LOCK("DRAIN", &lk->lock_object, 0,
+ lk->lk_recurse, file, line);
+ WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE |
+ LK_TRYWIT(flags), file, line);
+ TD_LOCKS_INC(curthread);
+ STACK_SAVE(lk);
+ }
+ break;
+ default:
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ panic("%s: unknown lockmgr request 0x%x\n", __func__, op);
+ }
+
+ if (flags & LK_INTERLOCK)
+ class->lc_unlock(ilk);
+ if (wakeup_swapper)
+ kick_proc0();
+
+ return (error);
+}
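+
+/*
+ * Note on usage: the request paths handled above are normally reached
+ * through the lockmgr() wrapper macro rather than by calling
+ * __lockmgr_args() directly.  A minimal sketch, assuming the usual
+ * lockinit()/lockmgr()/lockdestroy() interface (the wait message,
+ * priority and flags below are illustrative only):
+ *
+ *	struct lock lk;
+ *
+ *	lockinit(&lk, PVFS, "examplk", 0, 0);
+ *	if (lockmgr(&lk, LK_EXCLUSIVE, NULL) == 0) {
+ *		... exclusive section ...
+ *		lockmgr(&lk, LK_RELEASE, NULL);
+ *	}
+ *	lockdestroy(&lk);
+ */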
+
+void
+_lockmgr_disown(struct lock *lk, const char *file, int line)
+{
+ uintptr_t tid, x;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ tid = (uintptr_t)curthread;
+ _lockmgr_assert(lk, KA_XLOCKED, file, line);
+
+ /*
+ * Panic if the lock is recursed.
+ */
+ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk))
+ panic("%s: disown a recursed lockmgr @ %s:%d\n",
+ __func__, file, line);
+
+ /*
+ * If the owner is already LK_KERNPROC, just skip the whole operation.
+ */
+ if (LK_HOLDER(lk->lk_lock) != tid)
+ return;
+ lock_profile_release_lock(&lk->lock_object);
+ LOCK_LOG_LOCK("XDISOWN", &lk->lock_object, 0, 0, file, line);
+ WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line);
+ TD_LOCKS_DEC(curthread);
+ STACK_SAVE(lk);
+
+ /*
+ * In order to preserve waiters flags, just spin.
+ */
+ for (;;) {
+ x = lk->lk_lock;
+ MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0);
+ x &= LK_ALL_WAITERS;
+ if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x,
+ LK_KERNPROC | x))
+ return;
+ cpu_spinwait();
+ }
+}
+
+void
+lockmgr_printinfo(const struct lock *lk)
+{
+ struct thread *td;
+ uintptr_t x;
+
+ if (lk->lk_lock == LK_UNLOCKED)
+ printf("lock type %s: UNLOCKED\n", lk->lock_object.lo_name);
+ else if (lk->lk_lock & LK_SHARE)
+ printf("lock type %s: SHARED (count %ju)\n",
+ lk->lock_object.lo_name,
+ (uintmax_t)LK_SHARERS(lk->lk_lock));
+ else {
+ td = lockmgr_xholder(lk);
+ printf("lock type %s: EXCL by thread %p "
+ "(pid %d, %s, tid %d)\n", lk->lock_object.lo_name, td,
+ td->td_proc->p_pid, td->td_proc->p_comm, td->td_tid);
+ }
+
+ x = lk->lk_lock;
+ if (x & LK_EXCLUSIVE_WAITERS)
+ printf(" with exclusive waiters pending\n");
+ if (x & LK_SHARED_WAITERS)
+ printf(" with shared waiters pending\n");
+ if (x & LK_EXCLUSIVE_SPINNERS)
+ printf(" with exclusive spinners pending\n");
+
+ STACK_PRINT(lk);
+}
+
+int
+lockstatus(const struct lock *lk)
+{
+ uintptr_t v, x;
+ int ret;
+
+ ret = LK_SHARED;
+ x = lk->lk_lock;
+ v = LK_HOLDER(x);
+
+ if ((x & LK_SHARE) == 0) {
+ if (v == (uintptr_t)curthread || v == LK_KERNPROC)
+ ret = LK_EXCLUSIVE;
+ else
+ ret = LK_EXCLOTHER;
+ } else if (x == LK_UNLOCKED)
+ ret = 0;
+
+ return (ret);
+}
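+
+/*
+ * Usage sketch (hypothetical caller): given the return values computed
+ * above, LK_EXCLUSIVE means curthread (or LK_KERNPROC) holds the lock
+ * exclusively, LK_EXCLOTHER means another thread does, LK_SHARED means
+ * the lock is share-locked and 0 means it is not held at all:
+ *
+ *	if (lockstatus(&lk) != LK_EXCLUSIVE)
+ *		panic("example: lock not exclusively held");
+ */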
+
+#ifdef INVARIANT_SUPPORT
+
+FEATURE(invariant_support,
+ "Support for modules compiled with INVARIANTS option");
+
+#ifndef INVARIANTS
+#undef _lockmgr_assert
+#endif
+
+void
+_lockmgr_assert(const struct lock *lk, int what, const char *file, int line)
+{
+ int slocked = 0;
+
+ if (panicstr != NULL)
+ return;
+ switch (what) {
+ case KA_SLOCKED:
+ case KA_SLOCKED | KA_NOTRECURSED:
+ case KA_SLOCKED | KA_RECURSED:
+ slocked = 1;
+ case KA_LOCKED:
+ case KA_LOCKED | KA_NOTRECURSED:
+ case KA_LOCKED | KA_RECURSED:
+#ifdef WITNESS
+
+ /*
+ * We cannot trust WITNESS if the lock is held in exclusive
+ * mode and a call to lockmgr_disown() happened.
+ * Work around this by skipping the check if the lock is held
+ * in exclusive mode, even for the KA_LOCKED case.
+ */
+ if (slocked || (lk->lk_lock & LK_SHARE)) {
+ witness_assert(&lk->lock_object, what, file, line);
+ break;
+ }
+#endif
+ if (lk->lk_lock == LK_UNLOCKED ||
+ ((lk->lk_lock & LK_SHARE) == 0 && (slocked ||
+ (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk)))))
+ panic("Lock %s not %slocked @ %s:%d\n",
+ lk->lock_object.lo_name, slocked ? "share" : "",
+ file, line);
+
+ if ((lk->lk_lock & LK_SHARE) == 0) {
+ if (lockmgr_recursed(lk)) {
+ if (what & KA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ lk->lock_object.lo_name, file,
+ line);
+ } else if (what & KA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ lk->lock_object.lo_name, file, line);
+ }
+ break;
+ case KA_XLOCKED:
+ case KA_XLOCKED | KA_NOTRECURSED:
+ case KA_XLOCKED | KA_RECURSED:
+ if (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk))
+ panic("Lock %s not exclusively locked @ %s:%d\n",
+ lk->lock_object.lo_name, file, line);
+ if (lockmgr_recursed(lk)) {
+ if (what & KA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ lk->lock_object.lo_name, file, line);
+ } else if (what & KA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ lk->lock_object.lo_name, file, line);
+ break;
+ case KA_UNLOCKED:
+ if (lockmgr_xlocked(lk) || lockmgr_disowned(lk))
+ panic("Lock %s exclusively locked @ %s:%d\n",
+ lk->lock_object.lo_name, file, line);
+ break;
+ default:
+ panic("Unknown lockmgr assertion: %d @ %s:%d\n", what, file,
+ line);
+ }
+}
+#endif
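+
+/*
+ * Callers normally reach the assertion handler above through the
+ * lockmgr_assert() wrapper, e.g. (sketch only):
+ *
+ *	lockmgr_assert(&lk, KA_XLOCKED);
+ *
+ * which panics unless the lock is exclusively held by curthread or has
+ * been disowned to LK_KERNPROC.
+ */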
+
+#ifdef DDB
+int
+lockmgr_chain(struct thread *td, struct thread **ownerp)
+{
+ struct lock *lk;
+
+ lk = td->td_wchan;
+
+ if (LOCK_CLASS(&lk->lock_object) != &lock_class_lockmgr)
+ return (0);
+ db_printf("blocked on lockmgr %s", lk->lock_object.lo_name);
+ if (lk->lk_lock & LK_SHARE)
+ db_printf("SHARED (count %ju)\n",
+ (uintmax_t)LK_SHARERS(lk->lk_lock));
+ else
+ db_printf("EXCL\n");
+ *ownerp = lockmgr_xholder(lk);
+
+ return (1);
+}
+
+static void
+db_show_lockmgr(const struct lock_object *lock)
+{
+ struct thread *td;
+ const struct lock *lk;
+
+ lk = (const struct lock *)lock;
+
+ db_printf(" state: ");
+ if (lk->lk_lock == LK_UNLOCKED)
+ db_printf("UNLOCKED\n");
+ else if (lk->lk_lock & LK_SHARE)
+ db_printf("SLOCK: %ju\n", (uintmax_t)LK_SHARERS(lk->lk_lock));
+ else {
+ td = lockmgr_xholder(lk);
+ if (td == (struct thread *)LK_KERNPROC)
+ db_printf("XLOCK: LK_KERNPROC\n");
+ else
+ db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
+ td->td_tid, td->td_proc->p_pid,
+ td->td_proc->p_comm);
+ if (lockmgr_recursed(lk))
+ db_printf(" recursed: %d\n", lk->lk_recurse);
+ }
+ db_printf(" waiters: ");
+ switch (lk->lk_lock & LK_ALL_WAITERS) {
+ case LK_SHARED_WAITERS:
+ db_printf("shared\n");
+ break;
+ case LK_EXCLUSIVE_WAITERS:
+ db_printf("exclusive\n");
+ break;
+ case LK_ALL_WAITERS:
+ db_printf("shared and exclusive\n");
+ break;
+ default:
+ db_printf("none\n");
+ }
+ db_printf(" spinners: ");
+ if (lk->lk_lock & LK_EXCLUSIVE_SPINNERS)
+ db_printf("exclusive\n");
+ else
+ db_printf("none\n");
+}
+#endif
diff --git a/sys/kern/kern_lockf.c b/sys/kern/kern_lockf.c
new file mode 100644
index 0000000..6d6dc51
--- /dev/null
+++ b/sys/kern/kern_lockf.c
@@ -0,0 +1,2545 @@
+/*-
+ * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
+ * Authors: Doug Rabson <dfr@rabson.org>
+ * Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Scooter Morris at Genentech Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_debug_lockf.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/hash.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sx.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/fcntl.h>
+#include <sys/lockf.h>
+#include <sys/taskqueue.h>
+
+#ifdef LOCKF_DEBUG
+#include <sys/sysctl.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+
+static int lockf_debug = 0; /* control debug output */
+SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, "");
+#endif
+
+static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");
+
+struct owner_edge;
+struct owner_vertex;
+struct owner_vertex_list;
+struct owner_graph;
+
+#define NOLOCKF (struct lockf_entry *)0
+#define SELF 0x1
+#define OTHERS 0x2
+static void lf_init(void *);
+static int lf_hash_owner(caddr_t, struct flock *, int);
+static int lf_owner_matches(struct lock_owner *, caddr_t, struct flock *,
+ int);
+static struct lockf_entry *
+ lf_alloc_lock(struct lock_owner *);
+static int lf_free_lock(struct lockf_entry *);
+static int lf_clearlock(struct lockf *, struct lockf_entry *);
+static int lf_overlaps(struct lockf_entry *, struct lockf_entry *);
+static int lf_blocks(struct lockf_entry *, struct lockf_entry *);
+static void lf_free_edge(struct lockf_edge *);
+static struct lockf_edge *
+ lf_alloc_edge(void);
+static void lf_alloc_vertex(struct lockf_entry *);
+static int lf_add_edge(struct lockf_entry *, struct lockf_entry *);
+static void lf_remove_edge(struct lockf_edge *);
+static void lf_remove_outgoing(struct lockf_entry *);
+static void lf_remove_incoming(struct lockf_entry *);
+static int lf_add_outgoing(struct lockf *, struct lockf_entry *);
+static int lf_add_incoming(struct lockf *, struct lockf_entry *);
+static int lf_findoverlap(struct lockf_entry **, struct lockf_entry *,
+ int);
+static struct lockf_entry *
+ lf_getblock(struct lockf *, struct lockf_entry *);
+static int lf_getlock(struct lockf *, struct lockf_entry *, struct flock *);
+static void lf_insert_lock(struct lockf *, struct lockf_entry *);
+static void lf_wakeup_lock(struct lockf *, struct lockf_entry *);
+static void lf_update_dependancies(struct lockf *, struct lockf_entry *,
+ int all, struct lockf_entry_list *);
+static void lf_set_start(struct lockf *, struct lockf_entry *, off_t,
+ struct lockf_entry_list*);
+static void lf_set_end(struct lockf *, struct lockf_entry *, off_t,
+ struct lockf_entry_list*);
+static int lf_setlock(struct lockf *, struct lockf_entry *,
+ struct vnode *, void **cookiep);
+static int lf_cancel(struct lockf *, struct lockf_entry *, void *);
+static void lf_split(struct lockf *, struct lockf_entry *,
+ struct lockf_entry *, struct lockf_entry_list *);
+#ifdef LOCKF_DEBUG
+static int graph_reaches(struct owner_vertex *x, struct owner_vertex *y,
+ struct owner_vertex_list *path);
+static void graph_check(struct owner_graph *g, int checkorder);
+static void graph_print_vertices(struct owner_vertex_list *set);
+#endif
+static int graph_delta_forward(struct owner_graph *g,
+ struct owner_vertex *x, struct owner_vertex *y,
+ struct owner_vertex_list *delta);
+static int graph_delta_backward(struct owner_graph *g,
+ struct owner_vertex *x, struct owner_vertex *y,
+ struct owner_vertex_list *delta);
+static int graph_add_indices(int *indices, int n,
+ struct owner_vertex_list *set);
+static int graph_assign_indices(struct owner_graph *g, int *indices,
+ int nextunused, struct owner_vertex_list *set);
+static int graph_add_edge(struct owner_graph *g,
+ struct owner_vertex *x, struct owner_vertex *y);
+static void graph_remove_edge(struct owner_graph *g,
+ struct owner_vertex *x, struct owner_vertex *y);
+static struct owner_vertex *graph_alloc_vertex(struct owner_graph *g,
+ struct lock_owner *lo);
+static void graph_free_vertex(struct owner_graph *g,
+ struct owner_vertex *v);
+static struct owner_graph * graph_init(struct owner_graph *g);
+#ifdef LOCKF_DEBUG
+static void lf_print(char *, struct lockf_entry *);
+static void lf_printlist(char *, struct lockf_entry *);
+static void lf_print_owner(struct lock_owner *);
+#endif
+
+/*
+ * This structure is used to keep track of both local and remote lock
+ * owners. The lf_owner field of the struct lockf_entry points back at
+ * the lock owner structure. Each possible lock owner (local proc for
+ * POSIX fcntl locks, local file for BSD flock locks or <pid,sysid>
+ * pair for remote locks) is represented by a unique instance of
+ * struct lock_owner.
+ *
+ * If a lock owner has a lock that blocks some other lock or a lock
+ * that is waiting for some other lock, it also has a vertex in the
+ * owner_graph below.
+ *
+ * Locks:
+ * (s) locked by state->ls_lock
+ * (S) locked by lf_lock_states_lock
+ * (l) locked by lf_lock_owners_lock
+ * (g) locked by lf_owner_graph_lock
+ * (c) const until freeing
+ */
+#define LOCK_OWNER_HASH_SIZE 256
+
+struct lock_owner {
+ LIST_ENTRY(lock_owner) lo_link; /* (l) hash chain */
+ int lo_refs; /* (l) Number of locks referring to this */
+ int lo_flags; /* (c) Flags passed to lf_advlock */
+ caddr_t lo_id; /* (c) Id value passed to lf_advlock */
+ pid_t lo_pid; /* (c) Process Id of the lock owner */
+ int lo_sysid; /* (c) System Id of the lock owner */
+ struct owner_vertex *lo_vertex; /* (g) entry in deadlock graph */
+};
+
+LIST_HEAD(lock_owner_list, lock_owner);
+
+static struct sx lf_lock_states_lock;
+static struct lockf_list lf_lock_states; /* (S) */
+static struct sx lf_lock_owners_lock;
+static struct lock_owner_list lf_lock_owners[LOCK_OWNER_HASH_SIZE]; /* (l) */
+
+/*
+ * Structures for deadlock detection.
+ *
+ * We have two types of directed graph.  The first is the set of locks,
+ * both active and pending, on a vnode.  Within this graph, active locks
+ * are terminal nodes (i.e. they have no out-going
+ * edges).  Pending locks have out-going edges to each blocking active
+ * lock that prevents the lock from being granted and also to each
+ * older pending lock that would block them if it was active.  The
+ * graph for each vnode is naturally acyclic; new edges are only ever
+ * added to or from new nodes (either new pending locks, which only add
+ * out-going edges, or new active locks, which only add in-coming edges),
+ * so they cannot create loops in the lock graph.
+ *
+ * The second graph is a global graph of lock owners. Each lock owner
+ * is a vertex in that graph and an edge is added to the graph
+ * whenever an edge is added to a vnode graph, with end points
+ * corresponding to owner of the new pending lock and the owner of the
+ * lock upon which it waits. In order to prevent deadlock, we only add
+ * an edge to this graph if the new edge would not create a cycle.
+ *
+ * The lock owner graph is topologically sorted, i.e. if a node has
+ * any outgoing edges, then it has an order strictly less than any
+ * node to which it has an outgoing edge. We preserve this ordering
+ * (and detect cycles) on edge insertion using Algorithm PK from the
+ * paper "A Dynamic Topological Sort Algorithm for Directed Acyclic
+ * Graphs" (ACM Journal of Experimental Algorithms, Vol 11, Article
+ * No. 1.7)
+ */
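+
+/*
+ * Conceptual sketch (simplified; not the Algorithm PK code used by
+ * graph_add_edge() below): adding an owner edge x -> y is refused
+ * whenever x is already reachable from y, because the new edge would
+ * then close a cycle, i.e. a deadlock among lock owners:
+ *
+ *	if (reachable(y, x))		(hypothetical helper)
+ *		return (EDEADLK);
+ *	insert the edge and restore the topological order
+ *
+ * Algorithm PK reaches the same answer without a full graph search by
+ * only visiting vertices whose order lies between y's and x's.
+ */
+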
+struct owner_vertex;
+
+struct owner_edge {
+ LIST_ENTRY(owner_edge) e_outlink; /* (g) link from's out-edge list */
+ LIST_ENTRY(owner_edge) e_inlink; /* (g) link to's in-edge list */
+ int e_refs; /* (g) number of times added */
+ struct owner_vertex *e_from; /* (c) out-going from here */
+ struct owner_vertex *e_to; /* (c) in-coming to here */
+};
+LIST_HEAD(owner_edge_list, owner_edge);
+
+struct owner_vertex {
+ TAILQ_ENTRY(owner_vertex) v_link; /* (g) workspace for edge insertion */
+ uint32_t v_gen; /* (g) workspace for edge insertion */
+ int v_order; /* (g) order of vertex in graph */
+ struct owner_edge_list v_outedges;/* (g) list of out-edges */
+ struct owner_edge_list v_inedges; /* (g) list of in-edges */
+ struct lock_owner *v_owner; /* (c) corresponding lock owner */
+};
+TAILQ_HEAD(owner_vertex_list, owner_vertex);
+
+struct owner_graph {
+ struct owner_vertex** g_vertices; /* (g) pointers to vertices */
+ int g_size; /* (g) number of vertices */
+ int g_space; /* (g) space allocated for vertices */
+ int *g_indexbuf; /* (g) workspace for loop detection */
+ uint32_t g_gen; /* (g) increment when re-ordering */
+};
+
+static struct sx lf_owner_graph_lock;
+static struct owner_graph lf_owner_graph;
+
+/*
+ * Initialise various structures and locks.
+ */
+static void
+lf_init(void *dummy)
+{
+ int i;
+
+ sx_init(&lf_lock_states_lock, "lock states lock");
+ LIST_INIT(&lf_lock_states);
+
+ sx_init(&lf_lock_owners_lock, "lock owners lock");
+ for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++)
+ LIST_INIT(&lf_lock_owners[i]);
+
+ sx_init(&lf_owner_graph_lock, "owner graph lock");
+ graph_init(&lf_owner_graph);
+}
+SYSINIT(lf_init, SI_SUB_LOCK, SI_ORDER_FIRST, lf_init, NULL);
+
+/*
+ * Generate a hash value for a lock owner.
+ */
+static int
+lf_hash_owner(caddr_t id, struct flock *fl, int flags)
+{
+ uint32_t h;
+
+ if (flags & F_REMOTE) {
+ h = HASHSTEP(0, fl->l_pid);
+ h = HASHSTEP(h, fl->l_sysid);
+ } else if (flags & F_FLOCK) {
+ h = ((uintptr_t) id) >> 7;
+ } else {
+ struct proc *p = (struct proc *) id;
+ h = HASHSTEP(0, p->p_pid);
+ h = HASHSTEP(h, 0);
+ }
+
+ return (h % LOCK_OWNER_HASH_SIZE);
+}
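+
+/*
+ * In other words: remote locks hash the <pid, sysid> pair, BSD flock
+ * locks hash the id pointer itself (the local file, per the lock_owner
+ * description above) with the low bits shifted away, and POSIX locks
+ * hash the owning process's pid.  The result is reduced modulo
+ * LOCK_OWNER_HASH_SIZE to select a bucket in lf_lock_owners[].
+ */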
+
+/*
+ * Return true if a lock owner matches the details passed to
+ * lf_advlock.
+ */
+static int
+lf_owner_matches(struct lock_owner *lo, caddr_t id, struct flock *fl,
+ int flags)
+{
+ if (flags & F_REMOTE) {
+ return lo->lo_pid == fl->l_pid
+ && lo->lo_sysid == fl->l_sysid;
+ } else {
+ return lo->lo_id == id;
+ }
+}
+
+static struct lockf_entry *
+lf_alloc_lock(struct lock_owner *lo)
+{
+ struct lockf_entry *lf;
+
+ lf = malloc(sizeof(struct lockf_entry), M_LOCKF, M_WAITOK|M_ZERO);
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 4)
+ printf("Allocated lock %p\n", lf);
+#endif
+ if (lo) {
+ sx_xlock(&lf_lock_owners_lock);
+ lo->lo_refs++;
+ sx_xunlock(&lf_lock_owners_lock);
+ lf->lf_owner = lo;
+ }
+
+ return (lf);
+}
+
+static int
+lf_free_lock(struct lockf_entry *lock)
+{
+
+ KASSERT(lock->lf_refs > 0, ("lockf_entry negative ref count %p", lock));
+ if (--lock->lf_refs > 0)
+ return (0);
+ /*
+ * Adjust the lock_owner reference count and
+ * reclaim the entry if this is the last lock
+ * for that owner.
+ */
+ struct lock_owner *lo = lock->lf_owner;
+ if (lo) {
+ KASSERT(LIST_EMPTY(&lock->lf_outedges),
+ ("freeing lock with dependancies"));
+ KASSERT(LIST_EMPTY(&lock->lf_inedges),
+ ("freeing lock with dependants"));
+ sx_xlock(&lf_lock_owners_lock);
+ KASSERT(lo->lo_refs > 0, ("lock owner refcount"));
+ lo->lo_refs--;
+ if (lo->lo_refs == 0) {
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ printf("lf_free_lock: freeing lock owner %p\n",
+ lo);
+#endif
+ if (lo->lo_vertex) {
+ sx_xlock(&lf_owner_graph_lock);
+ graph_free_vertex(&lf_owner_graph,
+ lo->lo_vertex);
+ sx_xunlock(&lf_owner_graph_lock);
+ }
+ LIST_REMOVE(lo, lo_link);
+ free(lo, M_LOCKF);
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 4)
+ printf("Freed lock owner %p\n", lo);
+#endif
+ }
+ sx_unlock(&lf_lock_owners_lock);
+ }
+ if ((lock->lf_flags & F_REMOTE) && lock->lf_vnode) {
+ vrele(lock->lf_vnode);
+ lock->lf_vnode = NULL;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 4)
+ printf("Freed lock %p\n", lock);
+#endif
+ free(lock, M_LOCKF);
+ return (1);
+}
+
+/*
+ * Advisory record locking support
+ */
+int
+lf_advlockasync(struct vop_advlockasync_args *ap, struct lockf **statep,
+ u_quad_t size)
+{
+ struct lockf *state, *freestate = NULL;
+ struct flock *fl = ap->a_fl;
+ struct lockf_entry *lock;
+ struct vnode *vp = ap->a_vp;
+ caddr_t id = ap->a_id;
+ int flags = ap->a_flags;
+ int hash;
+ struct lock_owner *lo;
+ off_t start, end, oadd;
+ int error;
+
+ /*
+ * Handle the F_UNLCKSYS case first - no need to mess about
+ * creating a lock owner for this one.
+ */
+ if (ap->a_op == F_UNLCKSYS) {
+ lf_clearremotesys(fl->l_sysid);
+ return (0);
+ }
+
+ /*
+ * Convert the flock structure into a start and end.
+ */
+ switch (fl->l_whence) {
+
+ case SEEK_SET:
+ case SEEK_CUR:
+ /*
+ * Caller is responsible for adding any necessary offset
+ * when SEEK_CUR is used.
+ */
+ start = fl->l_start;
+ break;
+
+ case SEEK_END:
+ if (size > OFF_MAX ||
+ (fl->l_start > 0 && size > OFF_MAX - fl->l_start))
+ return (EOVERFLOW);
+ start = size + fl->l_start;
+ break;
+
+ default:
+ return (EINVAL);
+ }
+ if (start < 0)
+ return (EINVAL);
+ if (fl->l_len < 0) {
+ if (start == 0)
+ return (EINVAL);
+ end = start - 1;
+ start += fl->l_len;
+ if (start < 0)
+ return (EINVAL);
+ } else if (fl->l_len == 0) {
+ end = OFF_MAX;
+ } else {
+ oadd = fl->l_len - 1;
+ if (oadd > OFF_MAX - start)
+ return (EOVERFLOW);
+ end = start + oadd;
+ }
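+
+ /*
+ * Worked examples of the conversion above (illustrative values,
+ * assuming SEEK_SET and l_start = 100): l_len = 50 yields the
+ * range [100, 149], l_len = 0 locks to end of file as
+ * [100, OFF_MAX], and l_len = -10 yields [90, 99].
+ */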
+ /*
+ * Avoid the common case of unlocking when the inode has no locks.
+ */
+ VI_LOCK(vp);
+ if ((*statep) == NULL) {
+ if (ap->a_op != F_SETLK) {
+ fl->l_type = F_UNLCK;
+ VI_UNLOCK(vp);
+ return (0);
+ }
+ }
+ VI_UNLOCK(vp);
+
+ /*
+ * Map our arguments to an existing lock owner or create one
+ * if this is the first time we have seen this owner.
+ */
+ hash = lf_hash_owner(id, fl, flags);
+ sx_xlock(&lf_lock_owners_lock);
+ LIST_FOREACH(lo, &lf_lock_owners[hash], lo_link)
+ if (lf_owner_matches(lo, id, fl, flags))
+ break;
+ if (!lo) {
+ /*
+ * We initialise the lock owner with a reference
+ * count which matches the new lockf_entry
+ * structure created below.
+ */
+ lo = malloc(sizeof(struct lock_owner), M_LOCKF,
+ M_WAITOK|M_ZERO);
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 4)
+ printf("Allocated lock owner %p\n", lo);
+#endif
+
+ lo->lo_refs = 1;
+ lo->lo_flags = flags;
+ lo->lo_id = id;
+ if (flags & F_REMOTE) {
+ lo->lo_pid = fl->l_pid;
+ lo->lo_sysid = fl->l_sysid;
+ } else if (flags & F_FLOCK) {
+ lo->lo_pid = -1;
+ lo->lo_sysid = 0;
+ } else {
+ struct proc *p = (struct proc *) id;
+ lo->lo_pid = p->p_pid;
+ lo->lo_sysid = 0;
+ }
+ lo->lo_vertex = NULL;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1) {
+ printf("lf_advlockasync: new lock owner %p ", lo);
+ lf_print_owner(lo);
+ printf("\n");
+ }
+#endif
+
+ LIST_INSERT_HEAD(&lf_lock_owners[hash], lo, lo_link);
+ } else {
+ /*
+ * We have seen this lock owner before; increase its
+ * reference count to account for the new lockf_entry
+ * structure we create below.
+ */
+ lo->lo_refs++;
+ }
+ sx_xunlock(&lf_lock_owners_lock);
+
+ /*
+ * Create the lockf structure. We initialise the lf_owner
+ * field here instead of in lf_alloc_lock() to avoid paying
+ * the lf_lock_owners_lock tax twice.
+ */
+ lock = lf_alloc_lock(NULL);
+ lock->lf_refs = 1;
+ lock->lf_start = start;
+ lock->lf_end = end;
+ lock->lf_owner = lo;
+ lock->lf_vnode = vp;
+ if (flags & F_REMOTE) {
+ /*
+ * For remote locks, the caller may release its ref to
+ * the vnode at any time - we have to ref it here to
+ * prevent it from being recycled unexpectedly.
+ */
+ vref(vp);
+ }
+
+ /*
+ * XXX The problem is that VTOI is ufs specific, so it will
+ * break LOCKF_DEBUG for all filesystems other than UFS because
+ * it casts the vnode->data ptr to struct inode *.
+ */
+/* lock->lf_inode = VTOI(ap->a_vp); */
+ lock->lf_inode = (struct inode *)0;
+ lock->lf_type = fl->l_type;
+ LIST_INIT(&lock->lf_outedges);
+ LIST_INIT(&lock->lf_inedges);
+ lock->lf_async_task = ap->a_task;
+ lock->lf_flags = ap->a_flags;
+
+ /*
+ * Do the requested operation. First find our state structure
+ * and create a new one if necessary - the caller's *statep
+ * variable and the state's ls_threads count are protected by
+ * the vnode interlock.
+ */
+ VI_LOCK(vp);
+ if (vp->v_iflag & VI_DOOMED) {
+ VI_UNLOCK(vp);
+ lf_free_lock(lock);
+ return (ENOENT);
+ }
+
+ /*
+ * Allocate a state structure if necessary.
+ */
+ state = *statep;
+ if (state == NULL) {
+ struct lockf *ls;
+
+ VI_UNLOCK(vp);
+
+ ls = malloc(sizeof(struct lockf), M_LOCKF, M_WAITOK|M_ZERO);
+ sx_init(&ls->ls_lock, "ls_lock");
+ LIST_INIT(&ls->ls_active);
+ LIST_INIT(&ls->ls_pending);
+ ls->ls_threads = 1;
+
+ sx_xlock(&lf_lock_states_lock);
+ LIST_INSERT_HEAD(&lf_lock_states, ls, ls_link);
+ sx_xunlock(&lf_lock_states_lock);
+
+ /*
+ * Cope if we lost a race with some other thread while
+ * trying to allocate memory.
+ */
+ VI_LOCK(vp);
+ if (vp->v_iflag & VI_DOOMED) {
+ VI_UNLOCK(vp);
+ sx_xlock(&lf_lock_states_lock);
+ LIST_REMOVE(ls, ls_link);
+ sx_xunlock(&lf_lock_states_lock);
+ sx_destroy(&ls->ls_lock);
+ free(ls, M_LOCKF);
+ lf_free_lock(lock);
+ return (ENOENT);
+ }
+ if ((*statep) == NULL) {
+ state = *statep = ls;
+ VI_UNLOCK(vp);
+ } else {
+ state = *statep;
+ state->ls_threads++;
+ VI_UNLOCK(vp);
+
+ sx_xlock(&lf_lock_states_lock);
+ LIST_REMOVE(ls, ls_link);
+ sx_xunlock(&lf_lock_states_lock);
+ sx_destroy(&ls->ls_lock);
+ free(ls, M_LOCKF);
+ }
+ } else {
+ state->ls_threads++;
+ VI_UNLOCK(vp);
+ }
+
+ sx_xlock(&state->ls_lock);
+ /*
+ * Recheck the doomed vnode after state->ls_lock is
+ * locked. lf_purgelocks() requires that no new threads add
+ * pending locks when vnode is marked by VI_DOOMED flag.
+ */
+ VI_LOCK(vp);
+ if (vp->v_iflag & VI_DOOMED) {
+ state->ls_threads--;
+ wakeup(state);
+ VI_UNLOCK(vp);
+ sx_xunlock(&state->ls_lock);
+ lf_free_lock(lock);
+ return (ENOENT);
+ }
+ VI_UNLOCK(vp);
+
+ switch (ap->a_op) {
+ case F_SETLK:
+ error = lf_setlock(state, lock, vp, ap->a_cookiep);
+ break;
+
+ case F_UNLCK:
+ error = lf_clearlock(state, lock);
+ lf_free_lock(lock);
+ break;
+
+ case F_GETLK:
+ error = lf_getlock(state, lock, fl);
+ lf_free_lock(lock);
+ break;
+
+ case F_CANCEL:
+ if (ap->a_cookiep)
+ error = lf_cancel(state, lock, *ap->a_cookiep);
+ else
+ error = EINVAL;
+ lf_free_lock(lock);
+ break;
+
+ default:
+ lf_free_lock(lock);
+ error = EINVAL;
+ break;
+ }
+
+#ifdef INVARIANTS
+ /*
+ * Check for conditions that should never happen: the active
+ * lock list becoming disordered or containing mutually
+ * blocking locks. We also check the pending list for locks
+ * which should be active (i.e. have no out-going edges).
+ */
+ LIST_FOREACH(lock, &state->ls_active, lf_link) {
+ struct lockf_entry *lf;
+ if (LIST_NEXT(lock, lf_link))
+ KASSERT((lock->lf_start
+ <= LIST_NEXT(lock, lf_link)->lf_start),
+ ("locks disordered"));
+ LIST_FOREACH(lf, &state->ls_active, lf_link) {
+ if (lock == lf)
+ break;
+ KASSERT(!lf_blocks(lock, lf),
+ ("two conflicting active locks"));
+ if (lock->lf_owner == lf->lf_owner)
+ KASSERT(!lf_overlaps(lock, lf),
+ ("two overlapping locks from same owner"));
+ }
+ }
+ LIST_FOREACH(lock, &state->ls_pending, lf_link) {
+ KASSERT(!LIST_EMPTY(&lock->lf_outedges),
+ ("pending lock which should be active"));
+ }
+#endif
+ sx_xunlock(&state->ls_lock);
+
+ /*
+ * If we have removed the last active lock on the vnode and
+ * this is the last thread that was in-progress, we can free
+ * the state structure. We update the caller's pointer inside
+ * the vnode interlock but call free outside.
+ *
+ * XXX alternatively, keep the state structure around until
+ * the filesystem recycles - requires a callback from the
+ * filesystem.
+ */
+ VI_LOCK(vp);
+
+ state->ls_threads--;
+ wakeup(state);
+ if (LIST_EMPTY(&state->ls_active) && state->ls_threads == 0) {
+ KASSERT(LIST_EMPTY(&state->ls_pending),
+ ("freeing state with pending locks"));
+ freestate = state;
+ *statep = NULL;
+ }
+
+ VI_UNLOCK(vp);
+
+ if (freestate) {
+ sx_xlock(&lf_lock_states_lock);
+ LIST_REMOVE(freestate, ls_link);
+ sx_xunlock(&lf_lock_states_lock);
+ sx_destroy(&freestate->ls_lock);
+ free(freestate, M_LOCKF);
+ }
+ return (error);
+}
+
+int
+lf_advlock(struct vop_advlock_args *ap, struct lockf **statep, u_quad_t size)
+{
+ struct vop_advlockasync_args a;
+
+ a.a_vp = ap->a_vp;
+ a.a_id = ap->a_id;
+ a.a_op = ap->a_op;
+ a.a_fl = ap->a_fl;
+ a.a_flags = ap->a_flags;
+ a.a_task = NULL;
+ a.a_cookiep = NULL;
+
+ return (lf_advlockasync(&a, statep, size));
+}
+
+void
+lf_purgelocks(struct vnode *vp, struct lockf **statep)
+{
+ struct lockf *state;
+ struct lockf_entry *lock, *nlock;
+
+ /*
+ * For this to work correctly, the caller must ensure that no
+ * other threads enter the locking system for this vnode,
+ * e.g. by checking VI_DOOMED. We wake up any threads that are
+ * sleeping waiting for locks on this vnode and then free all
+ * the remaining locks.
+ */
+ VI_LOCK(vp);
+ KASSERT(vp->v_iflag & VI_DOOMED,
+ ("lf_purgelocks: vp %p has not vgone yet", vp));
+ state = *statep;
+ if (state) {
+ *statep = NULL;
+ state->ls_threads++;
+ VI_UNLOCK(vp);
+
+ sx_xlock(&state->ls_lock);
+ sx_xlock(&lf_owner_graph_lock);
+ LIST_FOREACH_SAFE(lock, &state->ls_pending, lf_link, nlock) {
+ LIST_REMOVE(lock, lf_link);
+ lf_remove_outgoing(lock);
+ lf_remove_incoming(lock);
+
+ /*
+ * If it is an async lock, we can just free it
+ * here, otherwise we let the sleeping thread
+ * free it.
+ */
+ if (lock->lf_async_task) {
+ lf_free_lock(lock);
+ } else {
+ lock->lf_flags |= F_INTR;
+ wakeup(lock);
+ }
+ }
+ sx_xunlock(&lf_owner_graph_lock);
+ sx_xunlock(&state->ls_lock);
+
+ /*
+ * Wait for all other threads, sleeping and otherwise,
+ * to leave.
+ */
+ VI_LOCK(vp);
+ while (state->ls_threads > 1)
+ msleep(state, VI_MTX(vp), 0, "purgelocks", 0);
+ VI_UNLOCK(vp);
+
+ /*
+ * We can just free all the active locks since they
+ * will have no dependencies (we removed them all
+ * above). We don't need to bother locking since we
+ * are the last thread using this state structure.
+ */
+ KASSERT(LIST_EMPTY(&state->ls_pending),
+ ("lock pending for %p", state));
+ LIST_FOREACH_SAFE(lock, &state->ls_active, lf_link, nlock) {
+ LIST_REMOVE(lock, lf_link);
+ lf_free_lock(lock);
+ }
+ sx_xlock(&lf_lock_states_lock);
+ LIST_REMOVE(state, ls_link);
+ sx_xunlock(&lf_lock_states_lock);
+ sx_destroy(&state->ls_lock);
+ free(state, M_LOCKF);
+ } else {
+ VI_UNLOCK(vp);
+ }
+}
+
+/*
+ * Return non-zero if locks 'x' and 'y' overlap.
+ */
+static int
+lf_overlaps(struct lockf_entry *x, struct lockf_entry *y)
+{
+
+ return (x->lf_start <= y->lf_end && x->lf_end >= y->lf_start);
+}
+
+/*
+ * Return non-zero if lock 'x' is blocked by lock 'y' (or vice versa).
+ */
+static int
+lf_blocks(struct lockf_entry *x, struct lockf_entry *y)
+{
+
+ return x->lf_owner != y->lf_owner
+ && (x->lf_type == F_WRLCK || y->lf_type == F_WRLCK)
+ && lf_overlaps(x, y);
+}
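+
+/*
+ * For example (illustrative ranges only): locks covering [0, 9] and
+ * [5, 14] overlap; they block each other only if they belong to
+ * different owners and at least one of them is a write (F_WRLCK) lock.
+ */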
+
+/*
+ * Allocate a lock edge from the free list
+ */
+static struct lockf_edge *
+lf_alloc_edge(void)
+{
+
+ return (malloc(sizeof(struct lockf_edge), M_LOCKF, M_WAITOK|M_ZERO));
+}
+
+/*
+ * Free a lock edge.
+ */
+static void
+lf_free_edge(struct lockf_edge *e)
+{
+
+ free(e, M_LOCKF);
+}
+
+
+/*
+ * Ensure that the lock's owner has a corresponding vertex in the
+ * owner graph.
+ */
+static void
+lf_alloc_vertex(struct lockf_entry *lock)
+{
+ struct owner_graph *g = &lf_owner_graph;
+
+ if (!lock->lf_owner->lo_vertex)
+ lock->lf_owner->lo_vertex =
+ graph_alloc_vertex(g, lock->lf_owner);
+}
+
+/*
+ * Attempt to record an edge from lock x to lock y. Return EDEADLK if
+ * the new edge would cause a cycle in the owner graph.
+ */
+static int
+lf_add_edge(struct lockf_entry *x, struct lockf_entry *y)
+{
+ struct owner_graph *g = &lf_owner_graph;
+ struct lockf_edge *e;
+ int error;
+
+#ifdef INVARIANTS
+ LIST_FOREACH(e, &x->lf_outedges, le_outlink)
+ KASSERT(e->le_to != y, ("adding lock edge twice"));
+#endif
+
+ /*
+ * Make sure the two owners have entries in the owner graph.
+ */
+ lf_alloc_vertex(x);
+ lf_alloc_vertex(y);
+
+ error = graph_add_edge(g, x->lf_owner->lo_vertex,
+ y->lf_owner->lo_vertex);
+ if (error)
+ return (error);
+
+ e = lf_alloc_edge();
+ LIST_INSERT_HEAD(&x->lf_outedges, e, le_outlink);
+ LIST_INSERT_HEAD(&y->lf_inedges, e, le_inlink);
+ e->le_from = x;
+ e->le_to = y;
+
+ return (0);
+}
+
+/*
+ * Remove an edge from the lock graph.
+ */
+static void
+lf_remove_edge(struct lockf_edge *e)
+{
+ struct owner_graph *g = &lf_owner_graph;
+ struct lockf_entry *x = e->le_from;
+ struct lockf_entry *y = e->le_to;
+
+ graph_remove_edge(g, x->lf_owner->lo_vertex, y->lf_owner->lo_vertex);
+ LIST_REMOVE(e, le_outlink);
+ LIST_REMOVE(e, le_inlink);
+ e->le_from = NULL;
+ e->le_to = NULL;
+ lf_free_edge(e);
+}
+
+/*
+ * Remove all out-going edges from lock x.
+ */
+static void
+lf_remove_outgoing(struct lockf_entry *x)
+{
+ struct lockf_edge *e;
+
+ while ((e = LIST_FIRST(&x->lf_outedges)) != NULL) {
+ lf_remove_edge(e);
+ }
+}
+
+/*
+ * Remove all in-coming edges from lock x.
+ */
+static void
+lf_remove_incoming(struct lockf_entry *x)
+{
+ struct lockf_edge *e;
+
+ while ((e = LIST_FIRST(&x->lf_inedges)) != NULL) {
+ lf_remove_edge(e);
+ }
+}
+
+/*
+ * Walk the list of locks for the file and create an out-going edge
+ * from lock to each blocking lock.
+ */
+static int
+lf_add_outgoing(struct lockf *state, struct lockf_entry *lock)
+{
+ struct lockf_entry *overlap;
+ int error;
+
+ LIST_FOREACH(overlap, &state->ls_active, lf_link) {
+ /*
+ * We may assume that the active list is sorted by
+ * lf_start.
+ */
+ if (overlap->lf_start > lock->lf_end)
+ break;
+ if (!lf_blocks(lock, overlap))
+ continue;
+
+ /*
+ * We've found a blocking lock. Add the corresponding
+ * edge to the graphs and see if it would cause a
+ * deadlock.
+ */
+ error = lf_add_edge(lock, overlap);
+
+ /*
+ * The only error that lf_add_edge returns is EDEADLK.
+ * Remove any edges we added and return the error.
+ */
+ if (error) {
+ lf_remove_outgoing(lock);
+ return (error);
+ }
+ }
+
+ /*
+ * We also need to add edges to sleeping locks that block
+ * us. This ensures that lf_wakeup_lock cannot grant two
+ * mutually blocking locks simultaneously and also enforces a
+ * 'first come, first served' fairness model. Note that this
+ * only happens if we are blocked by at least one active lock
+ * due to the call to lf_getblock in lf_setlock below.
+ */
+ LIST_FOREACH(overlap, &state->ls_pending, lf_link) {
+ if (!lf_blocks(lock, overlap))
+ continue;
+ /*
+ * We've found a blocking lock. Add the corresponding
+ * edge to the graphs and see if it would cause a
+ * deadlock.
+ */
+ error = lf_add_edge(lock, overlap);
+
+ /*
+ * The only error that lf_add_edge returns is EDEADLK.
+ * Remove any edges we added and return the error.
+ */
+ if (error) {
+ lf_remove_outgoing(lock);
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Walk the list of pending locks for the file and create an in-coming
+ * edge from lock to each blocking lock.
+ */
+static int
+lf_add_incoming(struct lockf *state, struct lockf_entry *lock)
+{
+ struct lockf_entry *overlap;
+ int error;
+
+ LIST_FOREACH(overlap, &state->ls_pending, lf_link) {
+ if (!lf_blocks(lock, overlap))
+ continue;
+
+ /*
+ * We've found a blocking lock. Add the corresponding
+ * edge to the graphs and see if it would cause a
+ * deadlock.
+ */
+ error = lf_add_edge(overlap, lock);
+
+ /*
+ * The only error that lf_add_edge returns is EDEADLK.
+ * Remove any edges we added and return the error.
+ */
+ if (error) {
+ lf_remove_incoming(lock);
+ return (error);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Insert lock into the active list, keeping list entries ordered by
+ * increasing values of lf_start.
+ */
+static void
+lf_insert_lock(struct lockf *state, struct lockf_entry *lock)
+{
+ struct lockf_entry *lf, *lfprev;
+
+ if (LIST_EMPTY(&state->ls_active)) {
+ LIST_INSERT_HEAD(&state->ls_active, lock, lf_link);
+ return;
+ }
+
+ lfprev = NULL;
+ LIST_FOREACH(lf, &state->ls_active, lf_link) {
+ if (lf->lf_start > lock->lf_start) {
+ LIST_INSERT_BEFORE(lf, lock, lf_link);
+ return;
+ }
+ lfprev = lf;
+ }
+ LIST_INSERT_AFTER(lfprev, lock, lf_link);
+}
+
+/*
+ * Wake up a sleeping lock and remove it from the pending list now
+ * that all its dependencies have been resolved. The caller should
+ * arrange for the lock to be added to the active list, adjusting any
+ * existing locks for the same owner as needed.
+ */
+static void
+lf_wakeup_lock(struct lockf *state, struct lockf_entry *wakelock)
+{
+
+ /*
+ * Remove from ls_pending list and wake up the caller
+ * or start the async notification, as appropriate.
+ */
+ LIST_REMOVE(wakelock, lf_link);
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_print("lf_wakeup_lock: awakening", wakelock);
+#endif /* LOCKF_DEBUG */
+ if (wakelock->lf_async_task) {
+ taskqueue_enqueue(taskqueue_thread, wakelock->lf_async_task);
+ } else {
+ wakeup(wakelock);
+ }
+}
+
+/*
+ * Re-check all dependent locks and remove edges to locks that we no
+ * longer block. If 'all' is non-zero, the lock has been removed and
+ * we must remove all the dependencies; otherwise it has simply been
+ * reduced but remains active. Any pending locks which have been
+ * unblocked are added to 'granted'.
+ */
+static void
+lf_update_dependancies(struct lockf *state, struct lockf_entry *lock, int all,
+ struct lockf_entry_list *granted)
+{
+ struct lockf_edge *e, *ne;
+ struct lockf_entry *deplock;
+
+ LIST_FOREACH_SAFE(e, &lock->lf_inedges, le_inlink, ne) {
+ deplock = e->le_from;
+ if (all || !lf_blocks(lock, deplock)) {
+ sx_xlock(&lf_owner_graph_lock);
+ lf_remove_edge(e);
+ sx_xunlock(&lf_owner_graph_lock);
+ if (LIST_EMPTY(&deplock->lf_outedges)) {
+ lf_wakeup_lock(state, deplock);
+ LIST_INSERT_HEAD(granted, deplock, lf_link);
+ }
+ }
+ }
+}
+
+/*
+ * Set the start of an existing active lock, updating dependencies and
+ * adding any newly woken locks to 'granted'.
+ */
+static void
+lf_set_start(struct lockf *state, struct lockf_entry *lock, off_t new_start,
+ struct lockf_entry_list *granted)
+{
+
+ KASSERT(new_start >= lock->lf_start, ("can't increase lock"));
+ lock->lf_start = new_start;
+ LIST_REMOVE(lock, lf_link);
+ lf_insert_lock(state, lock);
+ lf_update_dependancies(state, lock, FALSE, granted);
+}
+
+/*
+ * Set the end of an existing active lock, updating dependencies and
+ * adding any newly woken locks to 'granted'.
+ */
+static void
+lf_set_end(struct lockf *state, struct lockf_entry *lock, off_t new_end,
+ struct lockf_entry_list *granted)
+{
+
+ KASSERT(new_end <= lock->lf_end, ("can't increase lock"));
+ lock->lf_end = new_end;
+ lf_update_dependancies(state, lock, FALSE, granted);
+}
+
+/*
+ * Add a lock to the active list, updating or removing any current
+ * locks owned by the same owner and processing any pending locks that
+ * become unblocked as a result. This code is also used for unlock
+ * since the logic for updating existing locks is identical.
+ *
+ * Processing the new lock may unblock existing pending locks as a
+ * result of downgrading or unlocking.  We simply activate the newly
+ * granted locks by looping.
+ *
+ * Since the new lock already has its dependencies set up, we always
+ * add it to the list (unless it is an unlock request).  This may
+ * fragment the lock list in some pathological cases but it is probably
+ * not a real problem.
+ */
+static void
+lf_activate_lock(struct lockf *state, struct lockf_entry *lock)
+{
+ struct lockf_entry *overlap, *lf;
+ struct lockf_entry_list granted;
+ int ovcase;
+
+ LIST_INIT(&granted);
+ LIST_INSERT_HEAD(&granted, lock, lf_link);
+
+ while (!LIST_EMPTY(&granted)) {
+ lock = LIST_FIRST(&granted);
+ LIST_REMOVE(lock, lf_link);
+
+ /*
+ * Skip over locks owned by other processes. Handle
+ * any locks that overlap and are owned by ourselves.
+ */
+ overlap = LIST_FIRST(&state->ls_active);
+ for (;;) {
+ ovcase = lf_findoverlap(&overlap, lock, SELF);
+
+#ifdef LOCKF_DEBUG
+ if (ovcase && (lockf_debug & 2)) {
+ printf("lf_setlock: overlap %d", ovcase);
+ lf_print("", overlap);
+ }
+#endif
+ /*
+ * Six cases:
+ * 0) no overlap
+ * 1) overlap == lock
+ * 2) overlap contains lock
+ * 3) lock contains overlap
+ * 4) overlap starts before lock
+ * 5) overlap ends after lock
+ */
+ switch (ovcase) {
+ case 0: /* no overlap */
+ break;
+
+ case 1: /* overlap == lock */
+ /*
+ * We have already set up the
+ * dependents for the new lock, taking
+ * into account a possible downgrade
+ * or unlock. Remove the old lock.
+ */
+ LIST_REMOVE(overlap, lf_link);
+ lf_update_dependancies(state, overlap, TRUE,
+ &granted);
+ lf_free_lock(overlap);
+ break;
+
+ case 2: /* overlap contains lock */
+ /*
+ * Just split the existing lock.
+ */
+ lf_split(state, overlap, lock, &granted);
+ break;
+
+ case 3: /* lock contains overlap */
+ /*
+ * Delete the overlap and advance to
+ * the next entry in the list.
+ */
+ lf = LIST_NEXT(overlap, lf_link);
+ LIST_REMOVE(overlap, lf_link);
+ lf_update_dependancies(state, overlap, TRUE,
+ &granted);
+ lf_free_lock(overlap);
+ overlap = lf;
+ continue;
+
+ case 4: /* overlap starts before lock */
+ /*
+ * Just update the overlap end and
+ * move on.
+ */
+ lf_set_end(state, overlap, lock->lf_start - 1,
+ &granted);
+ overlap = LIST_NEXT(overlap, lf_link);
+ continue;
+
+ case 5: /* overlap ends after lock */
+ /*
+ * Change the start of overlap and
+ * re-insert.
+ */
+ lf_set_start(state, overlap, lock->lf_end + 1,
+ &granted);
+ break;
+ }
+ break;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1) {
+ if (lock->lf_type != F_UNLCK)
+ lf_print("lf_activate_lock: activated", lock);
+ else
+ lf_print("lf_activate_lock: unlocked", lock);
+ lf_printlist("lf_activate_lock", lock);
+ }
+#endif /* LOCKF_DEBUG */
+ if (lock->lf_type != F_UNLCK)
+ lf_insert_lock(state, lock);
+ }
+}
+
+/*
+ * Cancel a pending lock request, either as a result of a signal or a
+ * cancel request for an async lock.
+ */
+static void
+lf_cancel_lock(struct lockf *state, struct lockf_entry *lock)
+{
+ struct lockf_entry_list granted;
+
+ /*
+ * Note it is theoretically possible that cancelling this lock
+ * may allow some other pending lock to become
+ * active. Consider this case:
+ *
+ * Owner Action Result Dependencies
+ *
+ * A: lock [0..0] succeeds
+ * B: lock [2..2] succeeds
+ * C: lock [1..2] blocked C->B
+ * D: lock [0..1] blocked C->B,D->A,D->C
+ * A: unlock [0..0] C->B,D->C
+ * C: cancel [1..2]
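+ *
+ * When C cancels [1..2], removing the edge D->C leaves D with no
+ * out-going edges, so D's pending lock [0..1] can now be granted.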
+ */
+
+ LIST_REMOVE(lock, lf_link);
+
+ /*
+ * Removing out-going edges is simple.
+ */
+ sx_xlock(&lf_owner_graph_lock);
+ lf_remove_outgoing(lock);
+ sx_xunlock(&lf_owner_graph_lock);
+
+ /*
+ * Removing in-coming edges may allow some other lock to
+ * become active - we use lf_update_dependancies to figure
+ * this out.
+ */
+ LIST_INIT(&granted);
+ lf_update_dependancies(state, lock, TRUE, &granted);
+ lf_free_lock(lock);
+
+ /*
+ * Feed any newly active locks to lf_activate_lock.
+ */
+ while (!LIST_EMPTY(&granted)) {
+ lock = LIST_FIRST(&granted);
+ LIST_REMOVE(lock, lf_link);
+ lf_activate_lock(state, lock);
+ }
+}
+
+/*
+ * Set a byte-range lock.
+ */
+static int
+lf_setlock(struct lockf *state, struct lockf_entry *lock, struct vnode *vp,
+ void **cookiep)
+{
+ static char lockstr[] = "lockf";
+ int priority, error;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_print("lf_setlock", lock);
+#endif /* LOCKF_DEBUG */
+
+ /*
+ * Set the priority
+ */
+ priority = PLOCK;
+ if (lock->lf_type == F_WRLCK)
+ priority += 4;
+ if (!(lock->lf_flags & F_NOINTR))
+ priority |= PCATCH;
+ /*
+ * Scan lock list for this file looking for locks that would block us.
+ */
+ if (lf_getblock(state, lock)) {
+ /*
+ * Free the structure and return if nonblocking.
+ */
+ if ((lock->lf_flags & F_WAIT) == 0
+ && lock->lf_async_task == NULL) {
+ lf_free_lock(lock);
+ error = EAGAIN;
+ goto out;
+ }
+
+ /*
+ * For flock type locks, we must first remove
+ * any shared locks that we hold before we sleep
+ * waiting for an exclusive lock.
+ */
+ if ((lock->lf_flags & F_FLOCK) &&
+ lock->lf_type == F_WRLCK) {
+ lock->lf_type = F_UNLCK;
+ lf_activate_lock(state, lock);
+ lock->lf_type = F_WRLCK;
+ }
+
+ /*
+ * We are blocked. Create edges to each blocking lock,
+ * checking for deadlock using the owner graph. For
+ * simplicity, we run deadlock detection for all
+ * locks, posix and otherwise.
+ */
+ sx_xlock(&lf_owner_graph_lock);
+ error = lf_add_outgoing(state, lock);
+ sx_xunlock(&lf_owner_graph_lock);
+
+ if (error) {
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_print("lf_setlock: deadlock", lock);
+#endif
+ lf_free_lock(lock);
+ goto out;
+ }
+
+ /*
+ * We have added edges to everything that blocks
+ * us. Sleep until they all go away.
+ */
+ LIST_INSERT_HEAD(&state->ls_pending, lock, lf_link);
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1) {
+ struct lockf_edge *e;
+ LIST_FOREACH(e, &lock->lf_outedges, le_outlink) {
+ lf_print("lf_setlock: blocking on", e->le_to);
+ lf_printlist("lf_setlock", e->le_to);
+ }
+ }
+#endif /* LOCKF_DEBUG */
+
+ if ((lock->lf_flags & F_WAIT) == 0) {
+ /*
+ * The caller requested async notification -
+ * this callback happens when the blocking
+ * lock is released, allowing the caller to
+ * make another attempt to take the lock.
+ */
+ *cookiep = (void *) lock;
+ error = EINPROGRESS;
+ goto out;
+ }
+
+ lock->lf_refs++;
+ error = sx_sleep(lock, &state->ls_lock, priority, lockstr, 0);
+ if (lf_free_lock(lock)) {
+ error = EINTR;
+ goto out;
+ }
+
+ /*
+ * We may have been awakened by a signal and/or by a
+ * debugger continuing us (in which cases we must
+ * remove our lock graph edges) and/or by another
+ * process releasing a lock (in which case our edges
+ * have already been removed and we have been moved to
+ * the active list). We may also have been woken by
+ * lf_purgelocks which we report to the caller as
+ * EINTR. In that case, lf_purgelocks will have
+ * removed our lock graph edges.
+ *
+ * Note that it is possible to receive a signal after
+ * we were successfully woken (and moved to the active
+ * list) but before we resumed execution. In this
+ * case, our lf_outedges list will be clear. We
+ * pretend there was no error.
+ *
+ * Note also, if we have been sleeping long enough, we
+ * may now have incoming edges from some newer lock
+ * which is waiting behind us in the queue.
+ */
+ if (lock->lf_flags & F_INTR) {
+ error = EINTR;
+ lf_free_lock(lock);
+ goto out;
+ }
+ if (LIST_EMPTY(&lock->lf_outedges)) {
+ error = 0;
+ } else {
+ lf_cancel_lock(state, lock);
+ goto out;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1) {
+ lf_print("lf_setlock: granted", lock);
+ }
+#endif
+ goto out;
+ }
+ /*
+ * It looks like we are going to grant the lock. First add
+ * edges from any currently pending lock that the new lock
+ * would block.
+ */
+ sx_xlock(&lf_owner_graph_lock);
+ error = lf_add_incoming(state, lock);
+ sx_xunlock(&lf_owner_graph_lock);
+ if (error) {
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_print("lf_setlock: deadlock", lock);
+#endif
+ lf_free_lock(lock);
+ goto out;
+ }
+
+ /*
+ * No blocks!! Add the lock. Note that we will
+ * downgrade or upgrade any overlapping locks this
+ * process already owns.
+ */
+ lf_activate_lock(state, lock);
+ error = 0;
+out:
+ return (error);
+}
+
+/*
+ * Remove a byte-range lock on an inode.
+ *
+ * Generally, find the lock (or an overlap to that lock)
+ * and remove it (or shrink it), then wakeup anyone we can.
+ */
+static int
+lf_clearlock(struct lockf *state, struct lockf_entry *unlock)
+{
+ struct lockf_entry *overlap;
+
+ overlap = LIST_FIRST(&state->ls_active);
+
+ if (overlap == NOLOCKF)
+ return (0);
+#ifdef LOCKF_DEBUG
+ if (unlock->lf_type != F_UNLCK)
+ panic("lf_clearlock: bad type");
+ if (lockf_debug & 1)
+ lf_print("lf_clearlock", unlock);
+#endif /* LOCKF_DEBUG */
+
+ lf_activate_lock(state, unlock);
+
+ return (0);
+}
+
+/*
+ * Check whether there is a blocking lock, and if so return its
+ * details in '*fl'.
+ */
+static int
+lf_getlock(struct lockf *state, struct lockf_entry *lock, struct flock *fl)
+{
+ struct lockf_entry *block;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_print("lf_getlock", lock);
+#endif /* LOCKF_DEBUG */
+
+ if ((block = lf_getblock(state, lock))) {
+ fl->l_type = block->lf_type;
+ fl->l_whence = SEEK_SET;
+ fl->l_start = block->lf_start;
+ if (block->lf_end == OFF_MAX)
+ fl->l_len = 0;
+ else
+ fl->l_len = block->lf_end - block->lf_start + 1;
+ fl->l_pid = block->lf_owner->lo_pid;
+ fl->l_sysid = block->lf_owner->lo_sysid;
+ } else {
+ fl->l_type = F_UNLCK;
+ }
+ return (0);
+}
+
+/*
+ * Cancel an async lock request.
+ */
+static int
+lf_cancel(struct lockf *state, struct lockf_entry *lock, void *cookie)
+{
+ struct lockf_entry *reallock;
+
+ /*
+ * We need to match this request with an existing lock
+ * request.
+ */
+ LIST_FOREACH(reallock, &state->ls_pending, lf_link) {
+ if ((void *) reallock == cookie) {
+ /*
+ * Double-check that this lock looks right
+ * (maybe use a rolling ID for the cancel
+ * cookie instead?)
+ */
+ if (!(reallock->lf_vnode == lock->lf_vnode
+ && reallock->lf_start == lock->lf_start
+ && reallock->lf_end == lock->lf_end)) {
+ return (ENOENT);
+ }
+
+ /*
+ * Make sure this lock was async and then just
+ * remove it from its wait lists.
+ */
+ if (!reallock->lf_async_task) {
+ return (ENOENT);
+ }
+
+ /*
+ * Note that since any other thread must take
+ * state->ls_lock before it can possibly
+ * trigger the async callback, we are safe
+ * from a race with lf_wakeup_lock, i.e. we
+ * can free the lock (actually our caller does
+ * this).
+ */
+ lf_cancel_lock(state, reallock);
+ return (0);
+ }
+ }
+
+ /*
+ * We didn't find a matching lock - not much we can do here.
+ */
+ return (ENOENT);
+}
+
+/*
+ * Walk the list of locks for an inode and
+ * return the first blocking lock.
+ */
+static struct lockf_entry *
+lf_getblock(struct lockf *state, struct lockf_entry *lock)
+{
+ struct lockf_entry *overlap;
+
+ LIST_FOREACH(overlap, &state->ls_active, lf_link) {
+ /*
+ * We may assume that the active list is sorted by
+ * lf_start.
+ */
+ if (overlap->lf_start > lock->lf_end)
+ break;
+ if (!lf_blocks(lock, overlap))
+ continue;
+ return (overlap);
+ }
+ return (NOLOCKF);
+}
+
+/*
+ * Walk the list of locks for an inode to find an overlapping lock (if
+ * any) and return a classification of that overlap.
+ *
+ * Arguments:
+ * *overlap The place in the lock list to start looking
+ * lock The lock which is being tested
+ * type Pass 'SELF' to test only locks with the same
+ * owner as lock, or 'OTHERS' to test only locks
+ * with a different owner
+ *
+ * Returns one of six values:
+ * 0) no overlap
+ * 1) overlap == lock
+ * 2) overlap contains lock
+ * 3) lock contains overlap
+ * 4) overlap starts before lock
+ * 5) overlap ends after lock
+ *
+ * If there is an overlapping lock, '*overlap' is set to point at the
+ * overlapping lock.
+ *
+ * NOTE: this returns only the FIRST overlapping lock. There
+ * may be more than one.
+ */
+static int
+lf_findoverlap(struct lockf_entry **overlap, struct lockf_entry *lock, int type)
+{
+ struct lockf_entry *lf;
+ off_t start, end;
+ int res;
+
+ if ((*overlap) == NOLOCKF) {
+ return (0);
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ lf_print("lf_findoverlap: looking for overlap in", lock);
+#endif /* LOCKF_DEBUG */
+ start = lock->lf_start;
+ end = lock->lf_end;
+ res = 0;
+ while (*overlap) {
+ lf = *overlap;
+ if (lf->lf_start > end)
+ break;
+ if (((type & SELF) && lf->lf_owner != lock->lf_owner) ||
+ ((type & OTHERS) && lf->lf_owner == lock->lf_owner)) {
+ *overlap = LIST_NEXT(lf, lf_link);
+ continue;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ lf_print("\tchecking", lf);
+#endif /* LOCKF_DEBUG */
+ /*
+ * OK, check for overlap
+ *
+ * Six cases:
+ * 0) no overlap
+ * 1) overlap == lock
+ * 2) overlap contains lock
+ * 3) lock contains overlap
+ * 4) overlap starts before lock
+ * 5) overlap ends after lock
+ */
+ if (start > lf->lf_end) {
+ /* Case 0 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("no overlap\n");
+#endif /* LOCKF_DEBUG */
+ *overlap = LIST_NEXT(lf, lf_link);
+ continue;
+ }
+ if (lf->lf_start == start && lf->lf_end == end) {
+ /* Case 1 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap == lock\n");
+#endif /* LOCKF_DEBUG */
+ res = 1;
+ break;
+ }
+ if (lf->lf_start <= start && lf->lf_end >= end) {
+ /* Case 2 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap contains lock\n");
+#endif /* LOCKF_DEBUG */
+ res = 2;
+ break;
+ }
+ if (start <= lf->lf_start && end >= lf->lf_end) {
+ /* Case 3 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("lock contains overlap\n");
+#endif /* LOCKF_DEBUG */
+ res = 3;
+ break;
+ }
+ if (lf->lf_start < start && lf->lf_end >= start) {
+ /* Case 4 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap starts before lock\n");
+#endif /* LOCKF_DEBUG */
+ res = 4;
+ break;
+ }
+ if (lf->lf_start > start && lf->lf_end > end) {
+ /* Case 5 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap ends after lock\n");
+#endif /* LOCKF_DEBUG */
+ res = 5;
+ break;
+ }
+ panic("lf_findoverlap: default");
+ }
+ return (res);
+}
+
+/*
+ * Split the existing 'lock1', based on the extent of the lock
+ * described by 'lock2'. The existing lock should cover 'lock2'
+ * entirely.
+ *
+ * Any pending locks which have been unblocked are added to
+ * 'granted'.
+ */
+static void
+lf_split(struct lockf *state, struct lockf_entry *lock1,
+ struct lockf_entry *lock2, struct lockf_entry_list *granted)
+{
+ struct lockf_entry *splitlock;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2) {
+ lf_print("lf_split", lock1);
+ lf_print("splitting from", lock2);
+ }
+#endif /* LOCKF_DEBUG */
+ /*
+ * Check whether we need to split at all.
+ */
+ if (lock1->lf_start == lock2->lf_start) {
+ lf_set_start(state, lock1, lock2->lf_end + 1, granted);
+ return;
+ }
+ if (lock1->lf_end == lock2->lf_end) {
+ lf_set_end(state, lock1, lock2->lf_start - 1, granted);
+ return;
+ }
+ /*
+ * Make a new lock consisting of the last part of
+ * the encompassing lock.
+ */
+ splitlock = lf_alloc_lock(lock1->lf_owner);
+ memcpy(splitlock, lock1, sizeof *splitlock);
+ splitlock->lf_refs = 1;
+ if (splitlock->lf_flags & F_REMOTE)
+ vref(splitlock->lf_vnode);
+
+ /*
+ * This cannot cause a deadlock since any edges we would add
+ * to splitlock already exist in lock1. We must be sure to add
+ * necessary dependencies to splitlock before we reduce lock1
+ * otherwise we may accidentally grant a pending lock that
+ * was blocked by the tail end of lock1.
+ */
+ splitlock->lf_start = lock2->lf_end + 1;
+ LIST_INIT(&splitlock->lf_outedges);
+ LIST_INIT(&splitlock->lf_inedges);
+ sx_xlock(&lf_owner_graph_lock);
+ lf_add_incoming(state, splitlock);
+ sx_xunlock(&lf_owner_graph_lock);
+
+ lf_set_end(state, lock1, lock2->lf_start - 1, granted);
+
+ /*
+ * OK, now link it in
+ */
+ lf_insert_lock(state, splitlock);
+}
+
+struct lockdesc {
+ STAILQ_ENTRY(lockdesc) link;
+ struct vnode *vp;
+ struct flock fl;
+};
+STAILQ_HEAD(lockdesclist, lockdesc);
+
+int
+lf_iteratelocks_sysid(int sysid, lf_iterator *fn, void *arg)
+{
+ struct lockf *ls;
+ struct lockf_entry *lf;
+ struct lockdesc *ldesc;
+ struct lockdesclist locks;
+ int error;
+
+ /*
+ * In order to keep the locking simple, we iterate over the
+ * active lock lists to build a list of locks that need
+ * releasing. We then call the iterator for each one in turn.
+ *
+ * We take an extra reference to the vnode for the duration to
+ * make sure it doesn't go away before we are finished.
+ */
+ STAILQ_INIT(&locks);
+ sx_xlock(&lf_lock_states_lock);
+ LIST_FOREACH(ls, &lf_lock_states, ls_link) {
+ sx_xlock(&ls->ls_lock);
+ LIST_FOREACH(lf, &ls->ls_active, lf_link) {
+ if (lf->lf_owner->lo_sysid != sysid)
+ continue;
+
+ ldesc = malloc(sizeof(struct lockdesc), M_LOCKF,
+ M_WAITOK);
+ ldesc->vp = lf->lf_vnode;
+ vref(ldesc->vp);
+ ldesc->fl.l_start = lf->lf_start;
+ if (lf->lf_end == OFF_MAX)
+ ldesc->fl.l_len = 0;
+ else
+ ldesc->fl.l_len =
+ lf->lf_end - lf->lf_start + 1;
+ ldesc->fl.l_whence = SEEK_SET;
+ ldesc->fl.l_type = F_UNLCK;
+ ldesc->fl.l_pid = lf->lf_owner->lo_pid;
+ ldesc->fl.l_sysid = sysid;
+ STAILQ_INSERT_TAIL(&locks, ldesc, link);
+ }
+ sx_xunlock(&ls->ls_lock);
+ }
+ sx_xunlock(&lf_lock_states_lock);
+
+ /*
+ * Call the iterator function for each lock in turn. If the
+ * iterator returns an error code, just free the rest of the
+ * lockdesc structures.
+ */
+ error = 0;
+ while ((ldesc = STAILQ_FIRST(&locks)) != NULL) {
+ STAILQ_REMOVE_HEAD(&locks, link);
+ if (!error)
+ error = fn(ldesc->vp, &ldesc->fl, arg);
+ vrele(ldesc->vp);
+ free(ldesc, M_LOCKF);
+ }
+
+ return (error);
+}
+
+int
+lf_iteratelocks_vnode(struct vnode *vp, lf_iterator *fn, void *arg)
+{
+ struct lockf *ls;
+ struct lockf_entry *lf;
+ struct lockdesc *ldesc;
+ struct lockdesclist locks;
+ int error;
+
+ /*
+ * In order to keep the locking simple, we iterate over the
+ * active lock lists to build a list of locks that need
+ * releasing. We then call the iterator for each one in turn.
+ *
+ * We take an extra reference to the vnode for the duration to
+ * make sure it doesn't go away before we are finished.
+ */
+ STAILQ_INIT(&locks);
+ VI_LOCK(vp);
+ ls = vp->v_lockf;
+ if (!ls) {
+ VI_UNLOCK(vp);
+ return (0);
+ }
+ ls->ls_threads++;
+ VI_UNLOCK(vp);
+
+ sx_xlock(&ls->ls_lock);
+ LIST_FOREACH(lf, &ls->ls_active, lf_link) {
+ ldesc = malloc(sizeof(struct lockdesc), M_LOCKF,
+ M_WAITOK);
+ ldesc->vp = lf->lf_vnode;
+ vref(ldesc->vp);
+ ldesc->fl.l_start = lf->lf_start;
+ if (lf->lf_end == OFF_MAX)
+ ldesc->fl.l_len = 0;
+ else
+ ldesc->fl.l_len =
+ lf->lf_end - lf->lf_start + 1;
+ ldesc->fl.l_whence = SEEK_SET;
+ ldesc->fl.l_type = F_UNLCK;
+ ldesc->fl.l_pid = lf->lf_owner->lo_pid;
+ ldesc->fl.l_sysid = lf->lf_owner->lo_sysid;
+ STAILQ_INSERT_TAIL(&locks, ldesc, link);
+ }
+ sx_xunlock(&ls->ls_lock);
+ VI_LOCK(vp);
+ ls->ls_threads--;
+ wakeup(ls);
+ VI_UNLOCK(vp);
+
+ /*
+ * Call the iterator function for each lock in turn. If the
+ * iterator returns an error code, just free the rest of the
+ * lockdesc structures.
+ */
+ error = 0;
+ while ((ldesc = STAILQ_FIRST(&locks)) != NULL) {
+ STAILQ_REMOVE_HEAD(&locks, link);
+ if (!error)
+ error = fn(ldesc->vp, &ldesc->fl, arg);
+ vrele(ldesc->vp);
+ free(ldesc, M_LOCKF);
+ }
+
+ return (error);
+}
+
+static int
+lf_clearremotesys_iterator(struct vnode *vp, struct flock *fl, void *arg)
+{
+
+ VOP_ADVLOCK(vp, 0, F_UNLCK, fl, F_REMOTE);
+ return (0);
+}
+
+void
+lf_clearremotesys(int sysid)
+{
+
+ KASSERT(sysid != 0, ("Can't clear local locks with F_UNLCKSYS"));
+ lf_iteratelocks_sysid(sysid, lf_clearremotesys_iterator, NULL);
+}
+
+int
+lf_countlocks(int sysid)
+{
+ int i;
+ struct lock_owner *lo;
+ int count;
+
+ count = 0;
+ sx_xlock(&lf_lock_owners_lock);
+ for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++)
+ LIST_FOREACH(lo, &lf_lock_owners[i], lo_link)
+ if (lo->lo_sysid == sysid)
+ count += lo->lo_refs;
+ sx_xunlock(&lf_lock_owners_lock);
+
+ return (count);
+}
+
+#ifdef LOCKF_DEBUG
+
+/*
+ * Return non-zero if y is reachable from x using a brute force
+ * search. If reachable and path is non-null, return the route taken
+ * in path.
+ */
+static int
+graph_reaches(struct owner_vertex *x, struct owner_vertex *y,
+ struct owner_vertex_list *path)
+{
+ struct owner_edge *e;
+
+ if (x == y) {
+ if (path)
+ TAILQ_INSERT_HEAD(path, x, v_link);
+ return 1;
+ }
+
+ LIST_FOREACH(e, &x->v_outedges, e_outlink) {
+ if (graph_reaches(e->e_to, y, path)) {
+ if (path)
+ TAILQ_INSERT_HEAD(path, x, v_link);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Perform consistency checks on the graph. Make sure the values of
+ * v_order are correct. If checkorder is non-zero, check no vertex can
+ * reach any other vertex with a smaller order.
+ */
+static void
+graph_check(struct owner_graph *g, int checkorder)
+{
+ int i, j;
+
+ for (i = 0; i < g->g_size; i++) {
+ if (!g->g_vertices[i]->v_owner)
+ continue;
+ KASSERT(g->g_vertices[i]->v_order == i,
+ ("lock graph vertices disordered"));
+ if (checkorder) {
+ for (j = 0; j < i; j++) {
+ if (!g->g_vertices[j]->v_owner)
+ continue;
+ KASSERT(!graph_reaches(g->g_vertices[i],
+ g->g_vertices[j], NULL),
+ ("lock graph vertices disordered"));
+ }
+ }
+ }
+}
+
+static void
+graph_print_vertices(struct owner_vertex_list *set)
+{
+ struct owner_vertex *v;
+
+ printf("{ ");
+ TAILQ_FOREACH(v, set, v_link) {
+ printf("%d:", v->v_order);
+ lf_print_owner(v->v_owner);
+ if (TAILQ_NEXT(v, v_link))
+ printf(", ");
+ }
+ printf(" }\n");
+}
+
+#endif
+
+/*
+ * Calculate the sub-set of vertices v from the affected region [y..x]
+ * where v is reachable from y. Return -1 if a loop was detected
+ * (i.e. x is reachable from y); otherwise return the number of
+ * vertices in this subset.
+ */
+static int
+graph_delta_forward(struct owner_graph *g, struct owner_vertex *x,
+ struct owner_vertex *y, struct owner_vertex_list *delta)
+{
+ uint32_t gen;
+ struct owner_vertex *v;
+ struct owner_edge *e;
+ int n;
+
+ /*
+ * We start with a set containing just y. Then for each vertex
+ * v in the set so far unprocessed, we add each vertex that v
+ * has an out-edge to and that is within the affected region
+ * [y..x]. If we see the vertex x on our travels, stop
+ * immediately.
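+ *
+ * The 'delta' list doubles as the search queue: 'v' walks forward
+ * over entries appended during the scan, so the loop ends once every
+ * reachable vertex in the region has been visited.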
+ */
+ TAILQ_INIT(delta);
+ TAILQ_INSERT_TAIL(delta, y, v_link);
+ v = y;
+ n = 1;
+ gen = g->g_gen;
+ while (v) {
+ LIST_FOREACH(e, &v->v_outedges, e_outlink) {
+ if (e->e_to == x)
+ return -1;
+ if (e->e_to->v_order < x->v_order
+ && e->e_to->v_gen != gen) {
+ e->e_to->v_gen = gen;
+ TAILQ_INSERT_TAIL(delta, e->e_to, v_link);
+ n++;
+ }
+ }
+ v = TAILQ_NEXT(v, v_link);
+ }
+
+ return (n);
+}
+
+/*
+ * Calculate the sub-set of vertices v from the affected region [y..x]
+ * where v reaches x. Return the number of vertices in this subset.
+ */
+static int
+graph_delta_backward(struct owner_graph *g, struct owner_vertex *x,
+ struct owner_vertex *y, struct owner_vertex_list *delta)
+{
+ uint32_t gen;
+ struct owner_vertex *v;
+ struct owner_edge *e;
+ int n;
+
+ /*
+ * We start with a set containing just x. Then for each vertex
+ * v in the set so far unprocessed, we add each vertex that v
+ * has an in-edge from and that is within the affected region
+ * [y..x].
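+ *
+ * Here new vertices are prepended at the head of 'delta' while 'v'
+ * walks backwards from the tail with TAILQ_PREV, so every prepended
+ * entry is still visited.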
+ */
+ TAILQ_INIT(delta);
+ TAILQ_INSERT_TAIL(delta, x, v_link);
+ v = x;
+ n = 1;
+ gen = g->g_gen;
+ while (v) {
+ LIST_FOREACH(e, &v->v_inedges, e_inlink) {
+ if (e->e_from->v_order > y->v_order
+ && e->e_from->v_gen != gen) {
+ e->e_from->v_gen = gen;
+ TAILQ_INSERT_HEAD(delta, e->e_from, v_link);
+ n++;
+ }
+ }
+ v = TAILQ_PREV(v, owner_vertex_list, v_link);
+ }
+
+ return (n);
+}
+
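+/*
+ * Merge the vertex order values of 'set' into the sorted array
+ * 'indices', which already holds 'n' entries, using an insertion
+ * sort. Return the new number of entries.
+ */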
+static int
+graph_add_indices(int *indices, int n, struct owner_vertex_list *set)
+{
+ struct owner_vertex *v;
+ int i, j;
+
+ TAILQ_FOREACH(v, set, v_link) {
+ for (i = n;
+ i > 0 && indices[i - 1] > v->v_order; i--)
+ ;
+ for (j = n - 1; j >= i; j--)
+ indices[j + 1] = indices[j];
+ indices[i] = v->v_order;
+ n++;
+ }
+
+ return (n);
+}
+
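+/*
+ * Assign new order values to the vertices in 'set', taking them from
+ * 'indices' starting at 'nextunused'. The lowest-ordered vertex is
+ * pulled from the set each time so that the set's relative ordering
+ * is preserved. Return the next unused index.
+ */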
+static int
+graph_assign_indices(struct owner_graph *g, int *indices, int nextunused,
+ struct owner_vertex_list *set)
+{
+ struct owner_vertex *v, *vlowest;
+
+ while (!TAILQ_EMPTY(set)) {
+ vlowest = NULL;
+ TAILQ_FOREACH(v, set, v_link) {
+ if (!vlowest || v->v_order < vlowest->v_order)
+ vlowest = v;
+ }
+ TAILQ_REMOVE(set, vlowest, v_link);
+ vlowest->v_order = indices[nextunused];
+ g->g_vertices[vlowest->v_order] = vlowest;
+ nextunused++;
+ }
+
+ return (nextunused);
+}
+
+static int
+graph_add_edge(struct owner_graph *g, struct owner_vertex *x,
+ struct owner_vertex *y)
+{
+ struct owner_edge *e;
+ struct owner_vertex_list deltaF, deltaB;
+ int nF, nB, n, vi, i;
+ int *indices;
+
+ sx_assert(&lf_owner_graph_lock, SX_XLOCKED);
+
+ LIST_FOREACH(e, &x->v_outedges, e_outlink) {
+ if (e->e_to == y) {
+ e->e_refs++;
+ return (0);
+ }
+ }
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 8) {
+ printf("adding edge %d:", x->v_order);
+ lf_print_owner(x->v_owner);
+ printf(" -> %d:", y->v_order);
+ lf_print_owner(y->v_owner);
+ printf("\n");
+ }
+#endif
+ if (y->v_order < x->v_order) {
+ /*
+ * The new edge violates the order. First find the set
+ * of affected vertices reachable from y (deltaF) and
+ * the set of affected vertices that reach x
+ * (deltaB), using the graph generation number to
+ * detect whether we have visited a given vertex
+ * already. We re-order the graph so that each vertex
+ * in deltaB appears before each vertex in deltaF.
+ *
+ * If x is a member of deltaF, then the new edge would
+ * create a cycle. Otherwise, we may assume that
+ * deltaF and deltaB are disjoint.
+ */
+ g->g_gen++;
+ if (g->g_gen == 0) {
+ /*
+ * Generation wrap.
+ */
+ for (vi = 0; vi < g->g_size; vi++) {
+ g->g_vertices[vi]->v_gen = 0;
+ }
+ g->g_gen++;
+ }
+ nF = graph_delta_forward(g, x, y, &deltaF);
+ if (nF < 0) {
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 8) {
+ struct owner_vertex_list path;
+ printf("deadlock: ");
+ TAILQ_INIT(&path);
+ graph_reaches(y, x, &path);
+ graph_print_vertices(&path);
+ }
+#endif
+ return (EDEADLK);
+ }
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 8) {
+ printf("re-ordering graph vertices\n");
+ printf("deltaF = ");
+ graph_print_vertices(&deltaF);
+ }
+#endif
+
+ nB = graph_delta_backward(g, x, y, &deltaB);
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 8) {
+ printf("deltaB = ");
+ graph_print_vertices(&deltaB);
+ }
+#endif
+
+ /*
+ * We first build a set of vertex indices (vertex
+ * order values) that we may use, then we re-assign
+ * orders first to those vertices in deltaB, then to
+ * deltaF. Note that the contents of deltaF and deltaB
+ * may be partially disordered - we perform an
+ * insertion sort while building our index set.
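+ *
+ * Only the vertices in deltaF and deltaB are re-numbered; every
+ * other vertex keeps its existing order value.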
+ */
+ indices = g->g_indexbuf;
+ n = graph_add_indices(indices, 0, &deltaF);
+ graph_add_indices(indices, n, &deltaB);
+
+ /*
+ * We must also be sure to maintain the relative
+ * ordering of deltaF and deltaB when re-assigning
+ * vertices. We do this by iteratively removing the
+ * lowest ordered element from the set and assigning
+ * it the next value from our new ordering.
+ */
+ i = graph_assign_indices(g, indices, 0, &deltaB);
+ graph_assign_indices(g, indices, i, &deltaF);
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 8) {
+ struct owner_vertex_list set;
+ TAILQ_INIT(&set);
+ for (i = 0; i < nB + nF; i++)
+ TAILQ_INSERT_TAIL(&set,
+ g->g_vertices[indices[i]], v_link);
+ printf("new ordering = ");
+ graph_print_vertices(&set);
+ }
+#endif
+ }
+
+ KASSERT(x->v_order < y->v_order, ("Failed to re-order graph"));
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 8) {
+ graph_check(g, TRUE);
+ }
+#endif
+
+ e = malloc(sizeof(struct owner_edge), M_LOCKF, M_WAITOK);
+
+ LIST_INSERT_HEAD(&x->v_outedges, e, e_outlink);
+ LIST_INSERT_HEAD(&y->v_inedges, e, e_inlink);
+ e->e_refs = 1;
+ e->e_from = x;
+ e->e_to = y;
+
+ return (0);
+}
+
+/*
+ * Remove an edge x->y from the graph.
+ */
+static void
+graph_remove_edge(struct owner_graph *g, struct owner_vertex *x,
+ struct owner_vertex *y)
+{
+ struct owner_edge *e;
+
+ sx_assert(&lf_owner_graph_lock, SX_XLOCKED);
+
+ LIST_FOREACH(e, &x->v_outedges, e_outlink) {
+ if (e->e_to == y)
+ break;
+ }
+ KASSERT(e, ("Removing non-existent edge from deadlock graph"));
+
+ e->e_refs--;
+ if (e->e_refs == 0) {
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 8) {
+ printf("removing edge %d:", x->v_order);
+ lf_print_owner(x->v_owner);
+ printf(" -> %d:", y->v_order);
+ lf_print_owner(y->v_owner);
+ printf("\n");
+ }
+#endif
+ LIST_REMOVE(e, e_outlink);
+ LIST_REMOVE(e, e_inlink);
+ free(e, M_LOCKF);
+ }
+}
+
+/*
+ * Allocate a vertex for a new lock owner, growing the graph's vertex
+ * array and index buffer if necessary.
+ */
+static struct owner_vertex *
+graph_alloc_vertex(struct owner_graph *g, struct lock_owner *lo)
+{
+ struct owner_vertex *v;
+
+ sx_assert(&lf_owner_graph_lock, SX_XLOCKED);
+
+ v = malloc(sizeof(struct owner_vertex), M_LOCKF, M_WAITOK);
+ if (g->g_size == g->g_space) {
+ g->g_vertices = realloc(g->g_vertices,
+ 2 * g->g_space * sizeof(struct owner_vertex *),
+ M_LOCKF, M_WAITOK);
+ free(g->g_indexbuf, M_LOCKF);
+ g->g_indexbuf = malloc(2 * g->g_space * sizeof(int),
+ M_LOCKF, M_WAITOK);
+ g->g_space = 2 * g->g_space;
+ }
+ v->v_order = g->g_size;
+ v->v_gen = g->g_gen;
+ g->g_vertices[g->g_size] = v;
+ g->g_size++;
+
+ LIST_INIT(&v->v_outedges);
+ LIST_INIT(&v->v_inedges);
+ v->v_owner = lo;
+
+ return (v);
+}
+
+static void
+graph_free_vertex(struct owner_graph *g, struct owner_vertex *v)
+{
+ struct owner_vertex *w;
+ int i;
+
+ sx_assert(&lf_owner_graph_lock, SX_XLOCKED);
+
+ KASSERT(LIST_EMPTY(&v->v_outedges), ("Freeing vertex with edges"));
+ KASSERT(LIST_EMPTY(&v->v_inedges), ("Freeing vertex with edges"));
+
+ /*
+ * Remove from the graph's array and close up the gap,
+ * renumbering the other vertices.
+ */
+ for (i = v->v_order + 1; i < g->g_size; i++) {
+ w = g->g_vertices[i];
+ w->v_order--;
+ g->g_vertices[i - 1] = w;
+ }
+ g->g_size--;
+
+ free(v, M_LOCKF);
+}
+
+static struct owner_graph *
+graph_init(struct owner_graph *g)
+{
+
+ g->g_vertices = malloc(10 * sizeof(struct owner_vertex *),
+ M_LOCKF, M_WAITOK);
+ g->g_size = 0;
+ g->g_space = 10;
+ g->g_indexbuf = malloc(g->g_space * sizeof(int), M_LOCKF, M_WAITOK);
+ g->g_gen = 0;
+
+ return (g);
+}
+
+#ifdef LOCKF_DEBUG
+/*
+ * Print description of a lock owner
+ */
+static void
+lf_print_owner(struct lock_owner *lo)
+{
+
+ if (lo->lo_flags & F_REMOTE) {
+ printf("remote pid %d, system %d",
+ lo->lo_pid, lo->lo_sysid);
+ } else if (lo->lo_flags & F_FLOCK) {
+ printf("file %p", lo->lo_id);
+ } else {
+ printf("local pid %d", lo->lo_pid);
+ }
+}
+
+/*
+ * Print out a lock.
+ */
+static void
+lf_print(char *tag, struct lockf_entry *lock)
+{
+
+ printf("%s: lock %p for ", tag, (void *)lock);
+ lf_print_owner(lock->lf_owner);
+ if (lock->lf_inode != (struct inode *)0)
+ printf(" in ino %ju on dev <%s>,",
+ (uintmax_t)lock->lf_inode->i_number,
+ devtoname(lock->lf_inode->i_dev));
+ printf(" %s, start %jd, end ",
+ lock->lf_type == F_RDLCK ? "shared" :
+ lock->lf_type == F_WRLCK ? "exclusive" :
+ lock->lf_type == F_UNLCK ? "unlock" : "unknown",
+ (intmax_t)lock->lf_start);
+ if (lock->lf_end == OFF_MAX)
+ printf("EOF");
+ else
+ printf("%jd", (intmax_t)lock->lf_end);
+ if (!LIST_EMPTY(&lock->lf_outedges))
+ printf(" block %p\n",
+ (void *)LIST_FIRST(&lock->lf_outedges)->le_to);
+ else
+ printf("\n");
+}
+
+static void
+lf_printlist(char *tag, struct lockf_entry *lock)
+{
+ struct lockf_entry *lf, *blk;
+ struct lockf_edge *e;
+
+ if (lock->lf_inode == (struct inode *)0)
+ return;
+
+ printf("%s: Lock list for ino %ju on dev <%s>:\n",
+ tag, (uintmax_t)lock->lf_inode->i_number,
+ devtoname(lock->lf_inode->i_dev));
+ LIST_FOREACH(lf, &lock->lf_vnode->v_lockf->ls_active, lf_link) {
+ printf("\tlock %p for ",(void *)lf);
+ lf_print_owner(lock->lf_owner);
+ printf(", %s, start %jd, end %jd",
+ lf->lf_type == F_RDLCK ? "shared" :
+ lf->lf_type == F_WRLCK ? "exclusive" :
+ lf->lf_type == F_UNLCK ? "unlock" :
+ "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
+ LIST_FOREACH(e, &lf->lf_outedges, le_outlink) {
+ blk = e->le_to;
+ printf("\n\t\tlock request %p for ", (void *)blk);
+ lf_print_owner(blk->lf_owner);
+ printf(", %s, start %jd, end %jd",
+ blk->lf_type == F_RDLCK ? "shared" :
+ blk->lf_type == F_WRLCK ? "exclusive" :
+ blk->lf_type == F_UNLCK ? "unlock" :
+ "unknown", (intmax_t)blk->lf_start,
+ (intmax_t)blk->lf_end);
+ if (!LIST_EMPTY(&blk->lf_inedges))
+ panic("lf_printlist: bad list");
+ }
+ printf("\n");
+ }
+}
+#endif /* LOCKF_DEBUG */
diff --git a/sys/kern/kern_lockstat.c b/sys/kern/kern_lockstat.c
new file mode 100644
index 0000000..1f35893
--- /dev/null
+++ b/sys/kern/kern_lockstat.c
@@ -0,0 +1,64 @@
+/*-
+ * Copyright 2008-2009 Stacey Son <sson@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Backend for the lock tracing (lockstat) kernel support. This is required
+ * to allow a module to load even though DTrace kernel support may not be
+ * present.
+ *
+ */
+
+#include "opt_kdtrace.h"
+
+#ifdef KDTRACE_HOOKS
+
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/lockstat.h>
+
+/*
+ * The following must match the type definition of dtrace_probe. It is
+ * defined this way to avoid having to rely on CDDL code.
+ */
+uint32_t lockstat_probemap[LS_NPROBES];
+void (*lockstat_probe_func)(uint32_t, uintptr_t, uintptr_t,
+ uintptr_t, uintptr_t, uintptr_t);
+
+
+uint64_t
+lockstat_nsecs(void)
+{
+ struct bintime bt;
+ uint64_t ns;
+
+ binuptime(&bt);
+ ns = bt.sec * (uint64_t)1000000000;
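+ /*
+ * bt.frac is a 64-bit binary fraction of a second. Using only its
+ * top 32 bits keeps the multiplication by 10^9 from overflowing:
+ * ((frac >> 32) * 10^9) >> 32 is approximately frac * 10^9 / 2^64.
+ */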
+ ns += ((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32;
+ return (ns);
+}
+
+#endif /* KDTRACE_HOOKS */
diff --git a/sys/kern/kern_loginclass.c b/sys/kern/kern_loginclass.c
new file mode 100644
index 0000000..beac93b
--- /dev/null
+++ b/sys/kern/kern_loginclass.c
@@ -0,0 +1,238 @@
+/*-
+ * Copyright (c) 2011 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Edward Tomasz Napierala under sponsorship
+ * from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Processes may set their login class name using setloginclass(2).
+ * This is usually done through a call to setusercontext(3), by programs
+ * such as login(1), based on information from master.passwd(5). The
+ * kernel uses this information to enforce per-class resource limits.
+ * The current login class can be determined using id(1). The login
+ * class is inherited from the parent process during fork(2). If not
+ * set, it defaults to "default".
+ *
+ * The code in this file implements the setloginclass(2) and
+ * getloginclass(2) system calls and maintains class name storage and
+ * retrieval.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/loginclass.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/types.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/racct.h>
+#include <sys/refcount.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+
+static MALLOC_DEFINE(M_LOGINCLASS, "loginclass", "loginclass structures");
+
+LIST_HEAD(, loginclass) loginclasses;
+
+/*
+ * Lock protecting loginclasses list.
+ */
+static struct mtx loginclasses_lock;
+
+static void lc_init(void);
+SYSINIT(loginclass, SI_SUB_CPU, SI_ORDER_FIRST, lc_init, NULL);
+
+void
+loginclass_hold(struct loginclass *lc)
+{
+
+ refcount_acquire(&lc->lc_refcount);
+}
+
+void
+loginclass_free(struct loginclass *lc)
+{
+ int old;
+
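+ /*
+ * Fast path: if this is not the last reference, drop it using an
+ * atomic compare-and-swap without taking the list lock.
+ */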
+ old = lc->lc_refcount;
+ if (old > 1 && atomic_cmpset_int(&lc->lc_refcount, old, old - 1))
+ return;
+
+ mtx_lock(&loginclasses_lock);
+ if (refcount_release(&lc->lc_refcount)) {
+ racct_destroy(&lc->lc_racct);
+ LIST_REMOVE(lc, lc_next);
+ mtx_unlock(&loginclasses_lock);
+ free(lc, M_LOGINCLASS);
+
+ return;
+ }
+ mtx_unlock(&loginclasses_lock);
+}
+
+/*
+ * Return the loginclass structure with the given name. Not
+ * performance critical, as it's used mainly by setloginclass(2),
+ * which happens once per login session. Caller has to use
+ * loginclass_free() on the returned value when it's no longer
+ * needed.
+ */
+struct loginclass *
+loginclass_find(const char *name)
+{
+ struct loginclass *lc, *newlc;
+
+ if (name[0] == '\0' || strlen(name) >= MAXLOGNAME)
+ return (NULL);
+
+ newlc = malloc(sizeof(*newlc), M_LOGINCLASS, M_ZERO | M_WAITOK);
+ racct_create(&newlc->lc_racct);
+
+ mtx_lock(&loginclasses_lock);
+ LIST_FOREACH(lc, &loginclasses, lc_next) {
+ if (strcmp(name, lc->lc_name) != 0)
+ continue;
+
+ /* Found loginclass with a matching name? */
+ loginclass_hold(lc);
+ mtx_unlock(&loginclasses_lock);
+ racct_destroy(&newlc->lc_racct);
+ free(newlc, M_LOGINCLASS);
+ return (lc);
+ }
+
+ /* Add new loginclass. */
+ strcpy(newlc->lc_name, name);
+ refcount_init(&newlc->lc_refcount, 1);
+ LIST_INSERT_HEAD(&loginclasses, newlc, lc_next);
+ mtx_unlock(&loginclasses_lock);
+
+ return (newlc);
+}
+
+/*
+ * Get login class name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getloginclass_args {
+ char *namebuf;
+ size_t namelen;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getloginclass(struct thread *td, struct getloginclass_args *uap)
+{
+ int error = 0;
+ size_t lcnamelen;
+ struct proc *p;
+ struct loginclass *lc;
+
+ p = td->td_proc;
+ PROC_LOCK(p);
+ lc = p->p_ucred->cr_loginclass;
+ loginclass_hold(lc);
+ PROC_UNLOCK(p);
+
+ lcnamelen = strlen(lc->lc_name) + 1;
+ if (lcnamelen > uap->namelen)
+ error = ERANGE;
+ if (error == 0)
+ error = copyout(lc->lc_name, uap->namebuf, lcnamelen);
+ loginclass_free(lc);
+ return (error);
+}
+
+/*
+ * Set login class name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct setloginclass_args {
+ const char *namebuf;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setloginclass(struct thread *td, struct setloginclass_args *uap)
+{
+ struct proc *p = td->td_proc;
+ int error;
+ char lcname[MAXLOGNAME];
+ struct loginclass *newlc;
+ struct ucred *newcred, *oldcred;
+
+ error = priv_check(td, PRIV_PROC_SETLOGINCLASS);
+ if (error != 0)
+ return (error);
+ error = copyinstr(uap->namebuf, lcname, sizeof(lcname), NULL);
+ if (error != 0)
+ return (error);
+
+ newlc = loginclass_find(lcname);
+ if (newlc == NULL)
+ return (EINVAL);
+ newcred = crget();
+
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+ newcred->cr_loginclass = newlc;
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+#ifdef RACCT
+ racct_proc_ucred_changed(p, oldcred, newcred);
+#endif
+ loginclass_free(oldcred->cr_loginclass);
+ crfree(oldcred);
+
+ return (0);
+}
+
+void
+loginclass_racct_foreach(void (*callback)(struct racct *racct,
+ void *arg2, void *arg3), void *arg2, void *arg3)
+{
+ struct loginclass *lc;
+
+ mtx_lock(&loginclasses_lock);
+ LIST_FOREACH(lc, &loginclasses, lc_next)
+ (callback)(lc->lc_racct, arg2, arg3);
+ mtx_unlock(&loginclasses_lock);
+}
+
+static void
+lc_init(void)
+{
+
+ mtx_init(&loginclasses_lock, "loginclasses lock", NULL, MTX_DEF);
+}
diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c
new file mode 100644
index 0000000..9116433
--- /dev/null
+++ b/sys/kern/kern_malloc.c
@@ -0,0 +1,1100 @@
+/*-
+ * Copyright (c) 1987, 1991, 1993
+ * The Regents of the University of California.
+ * Copyright (c) 2005-2009 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94
+ */
+
+/*
+ * Kernel malloc(9) implementation -- general purpose kernel memory allocator
+ * based on memory types. Back end is implemented using the UMA(9) zone
+ * allocator. A set of fixed-size buckets are used for smaller allocations,
+ * and a special UMA allocation interface is used for larger allocations.
+ * Callers declare memory types, and statistics are maintained independently
+ * for each memory type. Statistics are maintained per-CPU for performance
+ * reasons. See malloc(9) and comments in malloc.h for a detailed
+ * description.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_kdtrace.h"
+#include "opt_vm.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/vmmeter.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+#include <sys/vmem.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+#include <vm/uma_int.h>
+#include <vm/uma_dbg.h>
+
+#ifdef DEBUG_MEMGUARD
+#include <vm/memguard.h>
+#endif
+#ifdef DEBUG_REDZONE
+#include <vm/redzone.h>
+#endif
+
+#if defined(INVARIANTS) && defined(__i386__)
+#include <machine/cpu.h>
+#endif
+
+#include <ddb/ddb.h>
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+
+dtrace_malloc_probe_func_t dtrace_malloc_probe;
+#endif
+
+/*
+ * When realloc() is called, if the new size is sufficiently smaller than
+ * the old size, realloc() will allocate a new, smaller block to avoid
+ * wasting memory. 'Sufficiently smaller' is defined as: newsize <=
+ * oldsize / 2^n, where REALLOC_FRACTION defines the value of 'n'.
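+ *
+ * For example, with the default value of 1, shrinking a 2048-byte
+ * allocation to 1500 bytes reuses the existing block, while shrinking
+ * it to 512 bytes allocates a new, smaller one.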
+ */
+#ifndef REALLOC_FRACTION
+#define REALLOC_FRACTION 1 /* new block if <= half the size */
+#endif
+
+/*
+ * Centrally define some common malloc types.
+ */
+MALLOC_DEFINE(M_CACHE, "cache", "Various Dynamically allocated caches");
+MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory");
+MALLOC_DEFINE(M_TEMP, "temp", "misc temporary data buffers");
+
+MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options");
+MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery");
+
+static struct malloc_type *kmemstatistics;
+static int kmemcount;
+
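+/*
+ * Requests of up to KMEM_ZMAX bytes are rounded up to the next multiple
+ * of KMEM_ZBASE; kmemsize[] then maps the rounded size, divided by
+ * KMEM_ZBASE, to the index of the kmemzones[] bucket used to satisfy it.
+ */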
+#define KMEM_ZSHIFT 4
+#define KMEM_ZBASE 16
+#define KMEM_ZMASK (KMEM_ZBASE - 1)
+
+#define KMEM_ZMAX PAGE_SIZE
+#define KMEM_ZSIZE (KMEM_ZMAX >> KMEM_ZSHIFT)
+static uint8_t kmemsize[KMEM_ZSIZE + 1];
+
+#ifndef MALLOC_DEBUG_MAXZONES
+#define MALLOC_DEBUG_MAXZONES 1
+#endif
+static int numzones = MALLOC_DEBUG_MAXZONES;
+
+/*
+ * Small malloc(9) memory allocations are allocated from a set of UMA buckets
+ * of various sizes.
+ *
+ * XXX: The comment here used to read "These won't be powers of two for
+ * long." It's possible that a significant amount of wasted memory could be
+ * recovered by tuning the sizes of these buckets.
+ */
+struct {
+ int kz_size;
+ char *kz_name;
+ uma_zone_t kz_zone[MALLOC_DEBUG_MAXZONES];
+} kmemzones[] = {
+ {16, "16", },
+ {32, "32", },
+ {64, "64", },
+ {128, "128", },
+ {256, "256", },
+ {512, "512", },
+ {1024, "1024", },
+ {2048, "2048", },
+ {4096, "4096", },
+#if PAGE_SIZE > 4096
+ {8192, "8192", },
+#if PAGE_SIZE > 8192
+ {16384, "16384", },
+#if PAGE_SIZE > 16384
+ {32768, "32768", },
+#if PAGE_SIZE > 32768
+ {65536, "65536", },
+#if PAGE_SIZE > 65536
+#error "Unsupported PAGE_SIZE"
+#endif /* 65536 */
+#endif /* 32768 */
+#endif /* 16384 */
+#endif /* 8192 */
+#endif /* 4096 */
+ {0, NULL},
+};
+
+/*
+ * Zone to allocate malloc type descriptions from. For ABI reasons, memory
+ * types are described by a data structure passed by the declaring code, but
+ * the malloc(9) implementation has its own data structure describing the
+ * type and statistics. This permits the malloc(9)-internal data structures
+ * to be modified without breaking binary-compiled kernel modules that
+ * declare malloc types.
+ */
+static uma_zone_t mt_zone;
+
+u_long vm_kmem_size;
+SYSCTL_ULONG(_vm, OID_AUTO, kmem_size, CTLFLAG_RDTUN, &vm_kmem_size, 0,
+ "Size of kernel memory");
+
+static u_long vm_kmem_size_min;
+SYSCTL_ULONG(_vm, OID_AUTO, kmem_size_min, CTLFLAG_RDTUN, &vm_kmem_size_min, 0,
+ "Minimum size of kernel memory");
+
+static u_long vm_kmem_size_max;
+SYSCTL_ULONG(_vm, OID_AUTO, kmem_size_max, CTLFLAG_RDTUN, &vm_kmem_size_max, 0,
+ "Maximum size of kernel memory");
+
+static u_int vm_kmem_size_scale;
+SYSCTL_UINT(_vm, OID_AUTO, kmem_size_scale, CTLFLAG_RDTUN, &vm_kmem_size_scale, 0,
+ "Scale factor for kernel memory size");
+
+static int sysctl_kmem_map_size(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vm, OID_AUTO, kmem_map_size,
+ CTLFLAG_RD | CTLTYPE_ULONG | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_kmem_map_size, "LU", "Current kmem allocation size");
+
+static int sysctl_kmem_map_free(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vm, OID_AUTO, kmem_map_free,
+ CTLFLAG_RD | CTLTYPE_ULONG | CTLFLAG_MPSAFE, NULL, 0,
+ sysctl_kmem_map_free, "LU", "Free space in kmem");
+
+/*
+ * The malloc_mtx protects the kmemstatistics linked list.
+ */
+struct mtx malloc_mtx;
+
+#ifdef MALLOC_PROFILE
+uint64_t krequests[KMEM_ZSIZE + 1];
+
+static int sysctl_kern_mprof(SYSCTL_HANDLER_ARGS);
+#endif
+
+static int sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS);
+
+/*
+ * time_uptime of the last malloc(9) failure (induced or real).
+ */
+static time_t t_malloc_fail;
+
+#if defined(MALLOC_MAKE_FAILURES) || (MALLOC_DEBUG_MAXZONES > 1)
+static SYSCTL_NODE(_debug, OID_AUTO, malloc, CTLFLAG_RD, 0,
+ "Kernel malloc debugging options");
+#endif
+
+/*
+ * malloc(9) fault injection -- cause malloc failures every (n) mallocs when
+ * the caller specifies M_NOWAIT. If set to 0, no failures are caused.
+ */
+#ifdef MALLOC_MAKE_FAILURES
+static int malloc_failure_rate;
+static int malloc_nowait_count;
+static int malloc_failure_count;
+SYSCTL_INT(_debug_malloc, OID_AUTO, failure_rate, CTLFLAG_RW,
+ &malloc_failure_rate, 0, "Every (n) mallocs with M_NOWAIT will fail");
+TUNABLE_INT("debug.malloc.failure_rate", &malloc_failure_rate);
+SYSCTL_INT(_debug_malloc, OID_AUTO, failure_count, CTLFLAG_RD,
+ &malloc_failure_count, 0, "Number of imposed M_NOWAIT malloc failures");
+#endif
+
+static int
+sysctl_kmem_map_size(SYSCTL_HANDLER_ARGS)
+{
+ u_long size;
+
+ size = vmem_size(kmem_arena, VMEM_ALLOC);
+ return (sysctl_handle_long(oidp, &size, 0, req));
+}
+
+static int
+sysctl_kmem_map_free(SYSCTL_HANDLER_ARGS)
+{
+ u_long size;
+
+ size = vmem_size(kmem_arena, VMEM_FREE);
+ return (sysctl_handle_long(oidp, &size, 0, req));
+}
+
+/*
+ * malloc(9) uma zone separation -- sub-page buffer overruns in one
+ * malloc type will affect only a subset of other malloc types.
+ */
+#if MALLOC_DEBUG_MAXZONES > 1
+static void
+tunable_set_numzones(void)
+{
+
+ TUNABLE_INT_FETCH("debug.malloc.numzones",
+ &numzones);
+
+ /* Sanity check the number of malloc uma zones. */
+ if (numzones <= 0)
+ numzones = 1;
+ if (numzones > MALLOC_DEBUG_MAXZONES)
+ numzones = MALLOC_DEBUG_MAXZONES;
+}
+SYSINIT(numzones, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_set_numzones, NULL);
+SYSCTL_INT(_debug_malloc, OID_AUTO, numzones, CTLFLAG_RDTUN,
+ &numzones, 0, "Number of malloc uma subzones");
+
+/*
+ * Any number that changes regularly is an okay choice for the
+ * offset. Build numbers are pretty good if you have them.
+ */
+static u_int zone_offset = __FreeBSD_version;
+TUNABLE_INT("debug.malloc.zone_offset", &zone_offset);
+SYSCTL_UINT(_debug_malloc, OID_AUTO, zone_offset, CTLFLAG_RDTUN,
+ &zone_offset, 0, "Separate malloc types by examining the "
+ "Nth character in the malloc type short description.");
+
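+/*
+ * Select the debug subzone for a malloc type from one character of its
+ * short description, so that a given type always maps to the same
+ * subzone.
+ */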
+static u_int
+mtp_get_subzone(const char *desc)
+{
+ size_t len;
+ u_int val;
+
+ if (desc == NULL || (len = strlen(desc)) == 0)
+ return (0);
+ val = desc[zone_offset % len];
+ return (val % numzones);
+}
+#elif MALLOC_DEBUG_MAXZONES == 0
+#error "MALLOC_DEBUG_MAXZONES must be positive."
+#else
+static inline u_int
+mtp_get_subzone(const char *desc)
+{
+
+ return (0);
+}
+#endif /* MALLOC_DEBUG_MAXZONES > 1 */
+
+int
+malloc_last_fail(void)
+{
+
+ return (time_uptime - t_malloc_fail);
+}
+
+/*
+ * An allocation has succeeded -- update malloc type statistics for the
+ * amount of the bucket size. Occurs within a critical section so that the
+ * thread isn't preempted and doesn't migrate while updating per-CPU
+ * statistics.
+ */
+static void
+malloc_type_zone_allocated(struct malloc_type *mtp, unsigned long size,
+ int zindx)
+{
+ struct malloc_type_internal *mtip;
+ struct malloc_type_stats *mtsp;
+
+ critical_enter();
+ mtip = mtp->ks_handle;
+ mtsp = &mtip->mti_stats[curcpu];
+ if (size > 0) {
+ mtsp->mts_memalloced += size;
+ mtsp->mts_numallocs++;
+ }
+ if (zindx != -1)
+ mtsp->mts_size |= 1 << zindx;
+
+#ifdef KDTRACE_HOOKS
+ if (dtrace_malloc_probe != NULL) {
+ uint32_t probe_id = mtip->mti_probes[DTMALLOC_PROBE_MALLOC];
+ if (probe_id != 0)
+ (dtrace_malloc_probe)(probe_id,
+ (uintptr_t) mtp, (uintptr_t) mtip,
+ (uintptr_t) mtsp, size, zindx);
+ }
+#endif
+
+ critical_exit();
+}
+
+void
+malloc_type_allocated(struct malloc_type *mtp, unsigned long size)
+{
+
+ if (size > 0)
+ malloc_type_zone_allocated(mtp, size, -1);
+}
+
+/*
+ * A free operation has occurred -- update malloc type statistics for the
+ * amount of the bucket size. Occurs within a critical section so that the
+ * thread isn't preempted and doesn't migrate while updating per-CPU
+ * statistics.
+ */
+void
+malloc_type_freed(struct malloc_type *mtp, unsigned long size)
+{
+ struct malloc_type_internal *mtip;
+ struct malloc_type_stats *mtsp;
+
+ critical_enter();
+ mtip = mtp->ks_handle;
+ mtsp = &mtip->mti_stats[curcpu];
+ mtsp->mts_memfreed += size;
+ mtsp->mts_numfrees++;
+
+#ifdef KDTRACE_HOOKS
+ if (dtrace_malloc_probe != NULL) {
+ uint32_t probe_id = mtip->mti_probes[DTMALLOC_PROBE_FREE];
+ if (probe_id != 0)
+ (dtrace_malloc_probe)(probe_id,
+ (uintptr_t) mtp, (uintptr_t) mtip,
+ (uintptr_t) mtsp, size, 0);
+ }
+#endif
+
+ critical_exit();
+}
+
+/*
+ * contigmalloc:
+ *
+ * Allocate a block of physically contiguous memory.
+ *
+ * If M_NOWAIT is set, this routine will not block and return NULL if
+ * the allocation fails.
+ */
+void *
+contigmalloc(unsigned long size, struct malloc_type *type, int flags,
+ vm_paddr_t low, vm_paddr_t high, unsigned long alignment,
+ vm_paddr_t boundary)
+{
+ void *ret;
+
+ ret = (void *)kmem_alloc_contig(kernel_arena, size, flags, low, high,
+ alignment, boundary, VM_MEMATTR_DEFAULT);
+ if (ret != NULL)
+ malloc_type_allocated(type, round_page(size));
+ return (ret);
+}
+
+/*
+ * contigfree:
+ *
+ * Free a block of memory allocated by contigmalloc.
+ *
+ * This routine may not block.
+ */
+void
+contigfree(void *addr, unsigned long size, struct malloc_type *type)
+{
+
+ kmem_free(kernel_arena, (vm_offset_t)addr, size);
+ malloc_type_freed(type, round_page(size));
+}
+
+/*
+ * malloc:
+ *
+ * Allocate a block of memory.
+ *
+ * If M_NOWAIT is set, this routine will not block and return NULL if
+ * the allocation fails.
+ */
+void *
+malloc(unsigned long size, struct malloc_type *mtp, int flags)
+{
+ int indx;
+ struct malloc_type_internal *mtip;
+ caddr_t va;
+ uma_zone_t zone;
+#if defined(DIAGNOSTIC) || defined(DEBUG_REDZONE)
+ unsigned long osize = size;
+#endif
+
+#ifdef INVARIANTS
+ KASSERT(mtp->ks_magic == M_MAGIC, ("malloc: bad malloc type magic"));
+ /*
+ * Check that exactly one of M_WAITOK or M_NOWAIT is specified.
+ */
+ indx = flags & (M_WAITOK | M_NOWAIT);
+ if (indx != M_NOWAIT && indx != M_WAITOK) {
+ static struct timeval lasterr;
+ static int curerr, once;
+ if (once == 0 && ppsratecheck(&lasterr, &curerr, 1)) {
+ printf("Bad malloc flags: %x\n", indx);
+ kdb_backtrace();
+ flags |= M_WAITOK;
+ once++;
+ }
+ }
+#endif
+#ifdef MALLOC_MAKE_FAILURES
+ if ((flags & M_NOWAIT) && (malloc_failure_rate != 0)) {
+ atomic_add_int(&malloc_nowait_count, 1);
+ if ((malloc_nowait_count % malloc_failure_rate) == 0) {
+ atomic_add_int(&malloc_failure_count, 1);
+ t_malloc_fail = time_uptime;
+ return (NULL);
+ }
+ }
+#endif
+ if (flags & M_WAITOK)
+ KASSERT(curthread->td_intr_nesting_level == 0,
+ ("malloc(M_WAITOK) in interrupt context"));
+
+#ifdef DEBUG_MEMGUARD
+ if (memguard_cmp_mtp(mtp, size)) {
+ va = memguard_alloc(size, flags);
+ if (va != NULL)
+ return (va);
+ /* This is unfortunate but should not be fatal. */
+ }
+#endif
+
+#ifdef DEBUG_REDZONE
+ size = redzone_size_ntor(size);
+#endif
+
+ if (size <= KMEM_ZMAX) {
+ mtip = mtp->ks_handle;
+ if (size & KMEM_ZMASK)
+ size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
+ indx = kmemsize[size >> KMEM_ZSHIFT];
+ KASSERT(mtip->mti_zone < numzones,
+ ("mti_zone %u out of range %d",
+ mtip->mti_zone, numzones));
+ zone = kmemzones[indx].kz_zone[mtip->mti_zone];
+#ifdef MALLOC_PROFILE
+ krequests[size >> KMEM_ZSHIFT]++;
+#endif
+ va = uma_zalloc(zone, flags);
+ if (va != NULL)
+ size = zone->uz_size;
+ malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx);
+ } else {
+ size = roundup(size, PAGE_SIZE);
+ zone = NULL;
+ va = uma_large_malloc(size, flags);
+ malloc_type_allocated(mtp, va == NULL ? 0 : size);
+ }
+ if (flags & M_WAITOK)
+ KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL"));
+ else if (va == NULL)
+ t_malloc_fail = time_uptime;
+#ifdef DIAGNOSTIC
+ if (va != NULL && !(flags & M_ZERO)) {
+ memset(va, 0x70, osize);
+ }
+#endif
+#ifdef DEBUG_REDZONE
+ if (va != NULL)
+ va = redzone_setup(va, osize);
+#endif
+ return ((void *) va);
+}
+
+/*
+ * free:
+ *
+ * Free a block of memory allocated by malloc.
+ *
+ * This routine may not block.
+ */
+void
+free(void *addr, struct malloc_type *mtp)
+{
+ uma_slab_t slab;
+ u_long size;
+
+ KASSERT(mtp->ks_magic == M_MAGIC, ("free: bad malloc type magic"));
+
+ /* free(NULL, ...) does nothing */
+ if (addr == NULL)
+ return;
+
+#ifdef DEBUG_MEMGUARD
+ if (is_memguard_addr(addr)) {
+ memguard_free(addr);
+ return;
+ }
+#endif
+
+#ifdef DEBUG_REDZONE
+ redzone_check(addr);
+ addr = redzone_addr_ntor(addr);
+#endif
+
+ slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK));
+
+ if (slab == NULL)
+ panic("free: address %p(%p) has not been allocated.\n",
+ addr, (void *)((u_long)addr & (~UMA_SLAB_MASK)));
+
+ if (!(slab->us_flags & UMA_SLAB_MALLOC)) {
+#ifdef INVARIANTS
+ struct malloc_type **mtpp = addr;
+#endif
+ size = slab->us_keg->uk_size;
+#ifdef INVARIANTS
+ /*
+ * Cache a pointer to the malloc_type that most recently freed
+ * this memory here. This way we know who is most likely to
+ * have stepped on it later.
+ *
+ * This code assumes that the size is a multiple of 8 bytes
+ * on 64-bit machines.
+ */
+ mtpp = (struct malloc_type **)
+ ((unsigned long)mtpp & ~UMA_ALIGN_PTR);
+ mtpp += (size - sizeof(struct malloc_type *)) /
+ sizeof(struct malloc_type *);
+ *mtpp = mtp;
+#endif
+ uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab);
+ } else {
+ size = slab->us_size;
+ uma_large_free(slab);
+ }
+ malloc_type_freed(mtp, size);
+}
+
+/*
+ * realloc: change the size of a memory block
+ */
+void *
+realloc(void *addr, unsigned long size, struct malloc_type *mtp, int flags)
+{
+ uma_slab_t slab;
+ unsigned long alloc;
+ void *newaddr;
+
+ KASSERT(mtp->ks_magic == M_MAGIC,
+ ("realloc: bad malloc type magic"));
+
+ /* realloc(NULL, ...) is equivalent to malloc(...) */
+ if (addr == NULL)
+ return (malloc(size, mtp, flags));
+
+ /*
+ * XXX: Should report free of old memory and alloc of new memory to
+ * per-CPU stats.
+ */
+
+#ifdef DEBUG_MEMGUARD
+ if (is_memguard_addr(addr))
+ return (memguard_realloc(addr, size, mtp, flags));
+#endif
+
+#ifdef DEBUG_REDZONE
+ slab = NULL;
+ alloc = redzone_get_size(addr);
+#else
+ slab = vtoslab((vm_offset_t)addr & ~(UMA_SLAB_MASK));
+
+ /* Sanity check */
+ KASSERT(slab != NULL,
+ ("realloc: address %p out of range", (void *)addr));
+
+ /* Get the size of the original block */
+ if (!(slab->us_flags & UMA_SLAB_MALLOC))
+ alloc = slab->us_keg->uk_size;
+ else
+ alloc = slab->us_size;
+
+ /* Reuse the original block if appropriate */
+ if (size <= alloc
+ && (size > (alloc >> REALLOC_FRACTION) || alloc == MINALLOCSIZE))
+ return (addr);
+#endif /* !DEBUG_REDZONE */
+
+ /* Allocate a new, bigger (or smaller) block */
+ if ((newaddr = malloc(size, mtp, flags)) == NULL)
+ return (NULL);
+
+ /* Copy over original contents */
+ bcopy(addr, newaddr, min(size, alloc));
+ free(addr, mtp);
+ return (newaddr);
+}
+
+/*
+ * reallocf: same as realloc() but free memory on failure.
+ */
+void *
+reallocf(void *addr, unsigned long size, struct malloc_type *mtp, int flags)
+{
+ void *mem;
+
+ if ((mem = realloc(addr, size, mtp, flags)) == NULL)
+ free(addr, mtp);
+ return (mem);
+}
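+
+/*
+ * Illustrative sketch: reallocf() avoids the classic leak in the
+ * "p = realloc(p, n)" idiom, where a failed grow would overwrite the
+ * only pointer to the still-allocated old block.  Because reallocf()
+ * frees the old block on failure, a NULL return leaves nothing to
+ * clean up ("buf" is a placeholder):
+ *
+ *	buf = reallocf(buf, newsize, M_TEMP, M_NOWAIT);
+ *	if (buf == NULL)
+ *		return (ENOMEM);
+ */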
+
+/*
+ * Wake the page daemon when we exhaust KVA. It will call the lowmem handler
+ * and uma_reclaim() callbacks in a context that is safe.
+ */
+static void
+kmem_reclaim(vmem_t *vm, int flags)
+{
+
+ pagedaemon_wakeup();
+}
+
+/*
+ * Initialize the kernel memory arena.
+ */
+void
+kmeminit(void)
+{
+ u_long mem_size, tmp;
+
+ /*
+ * Try to auto-tune the kernel memory size, so that it is
+ * more applicable for a wider range of machine sizes. The
+ * VM_KMEM_SIZE_MAX is dependent on the maximum KVA space
+ * available.
+ *
+ * Note that the kmem_map is also used by the zone allocator,
+ * so make sure that there is enough space.
+ */
+ vm_kmem_size = VM_KMEM_SIZE + nmbclusters * PAGE_SIZE;
+ mem_size = cnt.v_page_count;
+
+#if defined(VM_KMEM_SIZE_SCALE)
+ vm_kmem_size_scale = VM_KMEM_SIZE_SCALE;
+#endif
+ TUNABLE_INT_FETCH("vm.kmem_size_scale", &vm_kmem_size_scale);
+ if (vm_kmem_size_scale > 0 &&
+ (mem_size / vm_kmem_size_scale) > (vm_kmem_size / PAGE_SIZE))
+ vm_kmem_size = (mem_size / vm_kmem_size_scale) * PAGE_SIZE;
+
+#if defined(VM_KMEM_SIZE_MIN)
+ vm_kmem_size_min = VM_KMEM_SIZE_MIN;
+#endif
+ TUNABLE_ULONG_FETCH("vm.kmem_size_min", &vm_kmem_size_min);
+ if (vm_kmem_size_min > 0 && vm_kmem_size < vm_kmem_size_min) {
+ vm_kmem_size = vm_kmem_size_min;
+ }
+
+#if defined(VM_KMEM_SIZE_MAX)
+ vm_kmem_size_max = VM_KMEM_SIZE_MAX;
+#endif
+ TUNABLE_ULONG_FETCH("vm.kmem_size_max", &vm_kmem_size_max);
+ if (vm_kmem_size_max > 0 && vm_kmem_size >= vm_kmem_size_max)
+ vm_kmem_size = vm_kmem_size_max;
+
+ /* Allow final override from the kernel environment */
+ TUNABLE_ULONG_FETCH("vm.kmem_size", &vm_kmem_size);
+
+ /*
+ * Limit kmem virtual size to twice the physical memory.
+ * This allows for kmem map sparseness, but limits the size
+	 * to something sane.  Be careful not to overflow the 32-bit
+	 * ints while doing the check or the adjustment.
+ */
+ if (vm_kmem_size / 2 / PAGE_SIZE > mem_size)
+ vm_kmem_size = 2 * mem_size * PAGE_SIZE;
+
+ vm_kmem_size = round_page(vm_kmem_size);
+#ifdef DEBUG_MEMGUARD
+ tmp = memguard_fudge(vm_kmem_size, kernel_map);
+#else
+ tmp = vm_kmem_size;
+#endif
+ vmem_init(kmem_arena, "kmem arena", kva_alloc(tmp), tmp, PAGE_SIZE,
+ 0, 0);
+ vmem_set_reclaim(kmem_arena, kmem_reclaim);
+
+#ifdef DEBUG_MEMGUARD
+ /*
+ * Initialize MemGuard if support compiled in. MemGuard is a
+ * replacement allocator used for detecting tamper-after-free
+ * scenarios as they occur. It is only used for debugging.
+ */
+ memguard_init(kmem_arena);
+#endif
+}
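+
+/*
+ * Worked example of the tuning above (a sketch; the constants are
+ * machine-dependent): with 4GB of RAM (mem_size = 1048576 4K pages)
+ * and vm.kmem_size_scale = 3, the scaled size is (1048576 / 3) * 4096
+ * bytes, roughly 1.3GB.  That value replaces the static default only
+ * if it is larger, and is then subject to vm.kmem_size_min/max, an
+ * explicit vm.kmem_size override, and the twice-physical-memory cap.
+ */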
+
+/*
+ * Initialize the kernel memory allocator
+ */
+/* ARGSUSED*/
+static void
+mallocinit(void *dummy)
+{
+ int i;
+ uint8_t indx;
+
+ mtx_init(&malloc_mtx, "malloc", NULL, MTX_DEF);
+
+ kmeminit();
+
+ uma_startup2();
+
+ mt_zone = uma_zcreate("mt_zone", sizeof(struct malloc_type_internal),
+#ifdef INVARIANTS
+ mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini,
+#else
+ NULL, NULL, NULL, NULL,
+#endif
+ UMA_ALIGN_PTR, UMA_ZONE_MALLOC);
+ for (i = 0, indx = 0; kmemzones[indx].kz_size != 0; indx++) {
+ int size = kmemzones[indx].kz_size;
+ char *name = kmemzones[indx].kz_name;
+ int subzone;
+
+ for (subzone = 0; subzone < numzones; subzone++) {
+ kmemzones[indx].kz_zone[subzone] =
+ uma_zcreate(name, size,
+#ifdef INVARIANTS
+ mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini,
+#else
+ NULL, NULL, NULL, NULL,
+#endif
+ UMA_ALIGN_PTR, UMA_ZONE_MALLOC);
+ }
+		for (; i <= size; i += KMEM_ZBASE)
+ kmemsize[i >> KMEM_ZSHIFT] = indx;
+
+ }
+}
+SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, mallocinit, NULL);
+
+void
+malloc_init(void *data)
+{
+ struct malloc_type_internal *mtip;
+ struct malloc_type *mtp;
+
+ KASSERT(cnt.v_page_count != 0, ("malloc_register before vm_init"));
+
+ mtp = data;
+ if (mtp->ks_magic != M_MAGIC)
+ panic("malloc_init: bad malloc type magic");
+
+ mtip = uma_zalloc(mt_zone, M_WAITOK | M_ZERO);
+ mtp->ks_handle = mtip;
+ mtip->mti_zone = mtp_get_subzone(mtp->ks_shortdesc);
+
+ mtx_lock(&malloc_mtx);
+ mtp->ks_next = kmemstatistics;
+ kmemstatistics = mtp;
+ kmemcount++;
+ mtx_unlock(&malloc_mtx);
+}
+
+void
+malloc_uninit(void *data)
+{
+ struct malloc_type_internal *mtip;
+ struct malloc_type_stats *mtsp;
+ struct malloc_type *mtp, *temp;
+ uma_slab_t slab;
+ long temp_allocs, temp_bytes;
+ int i;
+
+ mtp = data;
+ KASSERT(mtp->ks_magic == M_MAGIC,
+ ("malloc_uninit: bad malloc type magic"));
+ KASSERT(mtp->ks_handle != NULL, ("malloc_deregister: cookie NULL"));
+
+ mtx_lock(&malloc_mtx);
+ mtip = mtp->ks_handle;
+ mtp->ks_handle = NULL;
+ if (mtp != kmemstatistics) {
+ for (temp = kmemstatistics; temp != NULL;
+ temp = temp->ks_next) {
+ if (temp->ks_next == mtp) {
+ temp->ks_next = mtp->ks_next;
+ break;
+ }
+ }
+ KASSERT(temp,
+ ("malloc_uninit: type '%s' not found", mtp->ks_shortdesc));
+ } else
+ kmemstatistics = mtp->ks_next;
+ kmemcount--;
+ mtx_unlock(&malloc_mtx);
+
+ /*
+ * Look for memory leaks.
+ */
+ temp_allocs = temp_bytes = 0;
+ for (i = 0; i < MAXCPU; i++) {
+ mtsp = &mtip->mti_stats[i];
+ temp_allocs += mtsp->mts_numallocs;
+ temp_allocs -= mtsp->mts_numfrees;
+ temp_bytes += mtsp->mts_memalloced;
+ temp_bytes -= mtsp->mts_memfreed;
+ }
+ if (temp_allocs > 0 || temp_bytes > 0) {
+ printf("Warning: memory type %s leaked memory on destroy "
+ "(%ld allocations, %ld bytes leaked).\n", mtp->ks_shortdesc,
+ temp_allocs, temp_bytes);
+ }
+
+ slab = vtoslab((vm_offset_t) mtip & (~UMA_SLAB_MASK));
+ uma_zfree_arg(mt_zone, mtip, slab);
+}
+
+struct malloc_type *
+malloc_desc2type(const char *desc)
+{
+ struct malloc_type *mtp;
+
+ mtx_assert(&malloc_mtx, MA_OWNED);
+ for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
+ if (strcmp(mtp->ks_shortdesc, desc) == 0)
+ return (mtp);
+ }
+ return (NULL);
+}
+
+static int
+sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS)
+{
+ struct malloc_type_stream_header mtsh;
+ struct malloc_type_internal *mtip;
+ struct malloc_type_header mth;
+ struct malloc_type *mtp;
+ int error, i;
+ struct sbuf sbuf;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+ mtx_lock(&malloc_mtx);
+
+ /*
+ * Insert stream header.
+ */
+ bzero(&mtsh, sizeof(mtsh));
+ mtsh.mtsh_version = MALLOC_TYPE_STREAM_VERSION;
+ mtsh.mtsh_maxcpus = MAXCPU;
+ mtsh.mtsh_count = kmemcount;
+ (void)sbuf_bcat(&sbuf, &mtsh, sizeof(mtsh));
+
+ /*
+ * Insert alternating sequence of type headers and type statistics.
+ */
+ for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
+ mtip = (struct malloc_type_internal *)mtp->ks_handle;
+
+ /*
+ * Insert type header.
+ */
+ bzero(&mth, sizeof(mth));
+ strlcpy(mth.mth_name, mtp->ks_shortdesc, MALLOC_MAX_NAME);
+ (void)sbuf_bcat(&sbuf, &mth, sizeof(mth));
+
+ /*
+ * Insert type statistics for each CPU.
+ */
+ for (i = 0; i < MAXCPU; i++) {
+ (void)sbuf_bcat(&sbuf, &mtip->mti_stats[i],
+ sizeof(mtip->mti_stats[i]));
+ }
+ }
+ mtx_unlock(&malloc_mtx);
+ error = sbuf_finish(&sbuf);
+ sbuf_delete(&sbuf);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, malloc_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
+ 0, 0, sysctl_kern_malloc_stats, "s,malloc_type_ustats",
+ "Return malloc types");
+
+SYSCTL_INT(_kern, OID_AUTO, malloc_count, CTLFLAG_RD, &kmemcount, 0,
+ "Count of kernel malloc types");
+
+void
+malloc_type_list(malloc_type_list_func_t *func, void *arg)
+{
+ struct malloc_type *mtp, **bufmtp;
+ int count, i;
+ size_t buflen;
+
+ mtx_lock(&malloc_mtx);
+restart:
+ mtx_assert(&malloc_mtx, MA_OWNED);
+ count = kmemcount;
+ mtx_unlock(&malloc_mtx);
+
+ buflen = sizeof(struct malloc_type *) * count;
+ bufmtp = malloc(buflen, M_TEMP, M_WAITOK);
+
+ mtx_lock(&malloc_mtx);
+
+ if (count < kmemcount) {
+ free(bufmtp, M_TEMP);
+ goto restart;
+ }
+
+ for (mtp = kmemstatistics, i = 0; mtp != NULL; mtp = mtp->ks_next, i++)
+ bufmtp[i] = mtp;
+
+ mtx_unlock(&malloc_mtx);
+
+ for (i = 0; i < count; i++)
+ (func)(bufmtp[i], arg);
+
+ free(bufmtp, M_TEMP);
+}
+
+#ifdef DDB
+DB_SHOW_COMMAND(malloc, db_show_malloc)
+{
+ struct malloc_type_internal *mtip;
+ struct malloc_type *mtp;
+ uint64_t allocs, frees;
+ uint64_t alloced, freed;
+ int i;
+
+ db_printf("%18s %12s %12s %12s\n", "Type", "InUse", "MemUse",
+ "Requests");
+ for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
+ mtip = (struct malloc_type_internal *)mtp->ks_handle;
+ allocs = 0;
+ frees = 0;
+ alloced = 0;
+ freed = 0;
+ for (i = 0; i < MAXCPU; i++) {
+ allocs += mtip->mti_stats[i].mts_numallocs;
+ frees += mtip->mti_stats[i].mts_numfrees;
+ alloced += mtip->mti_stats[i].mts_memalloced;
+ freed += mtip->mti_stats[i].mts_memfreed;
+ }
+ db_printf("%18s %12ju %12juK %12ju\n",
+ mtp->ks_shortdesc, allocs - frees,
+ (alloced - freed + 1023) / 1024, allocs);
+ if (db_pager_quit)
+ break;
+ }
+}
+
+#if MALLOC_DEBUG_MAXZONES > 1
+DB_SHOW_COMMAND(multizone_matches, db_show_multizone_matches)
+{
+ struct malloc_type_internal *mtip;
+ struct malloc_type *mtp;
+ u_int subzone;
+
+ if (!have_addr) {
+ db_printf("Usage: show multizone_matches <malloc type/addr>\n");
+ return;
+ }
+ mtp = (void *)addr;
+ if (mtp->ks_magic != M_MAGIC) {
+ db_printf("Magic %lx does not match expected %x\n",
+ mtp->ks_magic, M_MAGIC);
+ return;
+ }
+
+ mtip = mtp->ks_handle;
+ subzone = mtip->mti_zone;
+
+ for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
+ mtip = mtp->ks_handle;
+ if (mtip->mti_zone != subzone)
+ continue;
+ db_printf("%s\n", mtp->ks_shortdesc);
+ if (db_pager_quit)
+ break;
+ }
+}
+#endif /* MALLOC_DEBUG_MAXZONES > 1 */
+#endif /* DDB */
+
+#ifdef MALLOC_PROFILE
+
+static int
+sysctl_kern_mprof(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf sbuf;
+ uint64_t count;
+ uint64_t waste;
+ uint64_t mem;
+ int error;
+ int rsize;
+ int size;
+ int i;
+
+ waste = 0;
+ mem = 0;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+ sbuf_printf(&sbuf,
+ "\n Size Requests Real Size\n");
+ for (i = 0; i < KMEM_ZSIZE; i++) {
+ size = i << KMEM_ZSHIFT;
+ rsize = kmemzones[kmemsize[i]].kz_size;
+ count = (long long unsigned)krequests[i];
+
+ sbuf_printf(&sbuf, "%6d%28llu%11d\n", size,
+ (unsigned long long)count, rsize);
+
+ if ((rsize * count) > (size * count))
+ waste += (rsize * count) - (size * count);
+ mem += (rsize * count);
+ }
+ sbuf_printf(&sbuf,
+ "\nTotal memory used:\t%30llu\nTotal Memory wasted:\t%30llu\n",
+ (unsigned long long)mem, (unsigned long long)waste);
+ error = sbuf_finish(&sbuf);
+ sbuf_delete(&sbuf);
+ return (error);
+}
+
+SYSCTL_OID(_kern, OID_AUTO, mprof, CTLTYPE_STRING|CTLFLAG_RD,
+ NULL, 0, sysctl_kern_mprof, "A", "Malloc Profiling");
+#endif /* MALLOC_PROFILE */
diff --git a/sys/kern/kern_mbuf.c b/sys/kern/kern_mbuf.c
new file mode 100644
index 0000000..5d58942
--- /dev/null
+++ b/sys/kern/kern_mbuf.c
@@ -0,0 +1,694 @@
+/*-
+ * Copyright (c) 2004, 2005,
+ * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_param.h"
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/mbuf.h>
+#include <sys/domain.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/protosw.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/uma.h>
+#include <vm/uma_int.h>
+#include <vm/uma_dbg.h>
+
+/*
+ * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
+ * Zones.
+ *
+ * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
+ * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the
+ * administrator so desires.
+ *
+ * Mbufs are allocated from a UMA Master Zone called the Mbuf
+ * Zone.
+ *
+ * Additionally, FreeBSD provides a Packet Zone, which it
+ * configures as a Secondary Zone to the Mbuf Master Zone,
+ * thus sharing backend Slab kegs with the Mbuf Master Zone.
+ *
+ * Thus common-case allocations and locking are simplified:
+ *
+ * m_clget() m_getcl()
+ * | |
+ * | .------------>[(Packet Cache)] m_get(), m_gethdr()
+ * | | [ Packet ] |
+ * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ]
+ * [ Cluster Zone ] [ Zone ] [ Mbuf Master Zone ]
+ * | \________ |
+ * [ Cluster Keg ] \ /
+ * | [ Mbuf Keg ]
+ * [ Cluster Slabs ] |
+ * | [ Mbuf Slabs ]
+ * \____________(VM)_________________/
+ *
+ *
+ * Whenever an object is allocated with uma_zalloc() out of
+ * one of the Zones, its _ctor_ function is executed.  Likewise,
+ * for any deallocation through uma_zfree() the _dtor_ function
+ * is executed.
+ *
+ * Caches are per-CPU and are filled from the Master Zone.
+ *
+ * Whenever an object is allocated from the underlying global
+ * memory pool it gets pre-initialized with the _zinit_ functions.
+ * When the Kegs are overfull, objects get decommissioned with
+ * _zfini_ functions and freed back to the global memory pool.
+ *
+ */
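+
+/*
+ * Illustrative sketch of the two allocation paths drawn above: a
+ * caller wanting an mbuf with a 2K cluster attached can either take
+ * the combined path through the Packet Zone,
+ *
+ *	m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
+ *
+ * or allocate the pieces separately from the Mbuf and Cluster Zones,
+ *
+ *	m = m_gethdr(M_NOWAIT, MT_DATA);
+ *	m_clget(m, M_NOWAIT);
+ *
+ * checking m for NULL and m->m_flags for M_EXT, respectively, to
+ * detect failure.
+ */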
+
+int nmbufs; /* limits number of mbufs */
+int nmbclusters; /* limits number of mbuf clusters */
+int nmbjumbop; /* limits number of page size jumbo clusters */
+int nmbjumbo9; /* limits number of 9k jumbo clusters */
+int nmbjumbo16; /* limits number of 16k jumbo clusters */
+
+static quad_t maxmbufmem; /* overall real memory limit for all mbufs */
+
+SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN, &maxmbufmem, 0,
+ "Maximum real memory allocatable to various mbuf types");
+
+/*
+ * tunable_mbinit() has to be run before any mbuf allocations are done.
+ */
+static void
+tunable_mbinit(void *dummy)
+{
+ quad_t realmem;
+
+ /*
+ * The default limit for all mbuf related memory is 1/2 of all
+ * available kernel memory (physical or kmem).
+ * At most it can be 3/4 of available kernel memory.
+ */
+ realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size);
+ maxmbufmem = realmem / 2;
+ TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem);
+ if (maxmbufmem > realmem / 4 * 3)
+ maxmbufmem = realmem / 4 * 3;
+
+ TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
+ if (nmbclusters == 0)
+ nmbclusters = maxmbufmem / MCLBYTES / 4;
+
+ TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop);
+ if (nmbjumbop == 0)
+ nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4;
+
+ TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9);
+ if (nmbjumbo9 == 0)
+ nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6;
+
+ TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16);
+ if (nmbjumbo16 == 0)
+ nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6;
+
+ /*
+ * We need at least as many mbufs as we have clusters of
+ * the various types added together.
+ */
+ TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
+ if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16)
+ nmbufs = lmax(maxmbufmem / MSIZE / 5,
+ nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16);
+}
+SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL);
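+
+/*
+ * Worked example of the defaults above (a sketch, assuming 4K pages
+ * and at least as much kernel memory as RAM): with 8GB of physical
+ * memory, maxmbufmem defaults to 4GB; nmbclusters then defaults to
+ * 4GB / MCLBYTES(2048) / 4 = 524288 clusters, and nmbjumbop to
+ * 4GB / MJUMPAGESIZE(4096) / 4 = 262144 page-size jumbo clusters,
+ * unless the corresponding loader tunables override them.
+ */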
+
+static int
+sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
+{
+ int error, newnmbclusters;
+
+ newnmbclusters = nmbclusters;
+ error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newnmbclusters > nmbclusters &&
+ nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
+ nmbclusters = newnmbclusters;
+ nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
+ EVENTHANDLER_INVOKE(nmbclusters_change);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW,
+&nmbclusters, 0, sysctl_nmbclusters, "IU",
+ "Maximum number of mbuf clusters allowed");
+
+static int
+sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
+{
+ int error, newnmbjumbop;
+
+ newnmbjumbop = nmbjumbop;
+ error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newnmbjumbop > nmbjumbop &&
+ nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
+ nmbjumbop = newnmbjumbop;
+ nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW,
+&nmbjumbop, 0, sysctl_nmbjumbop, "IU",
+ "Maximum number of mbuf page size jumbo clusters allowed");
+
+static int
+sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
+{
+ int error, newnmbjumbo9;
+
+ newnmbjumbo9 = nmbjumbo9;
+ error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newnmbjumbo9 > nmbjumbo9 &&
+ nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
+ nmbjumbo9 = newnmbjumbo9;
+ nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW,
+&nmbjumbo9, 0, sysctl_nmbjumbo9, "IU",
+ "Maximum number of mbuf 9k jumbo clusters allowed");
+
+static int
+sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
+{
+ int error, newnmbjumbo16;
+
+ newnmbjumbo16 = nmbjumbo16;
+ error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newnmbjumbo16 > nmbjumbo16 &&
+ nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
+ nmbjumbo16 = newnmbjumbo16;
+ nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW,
+&nmbjumbo16, 0, sysctl_nmbjumbo16, "IU",
+ "Maximum number of mbuf 16k jumbo clusters allowed");
+
+static int
+sysctl_nmbufs(SYSCTL_HANDLER_ARGS)
+{
+ int error, newnmbufs;
+
+ newnmbufs = nmbufs;
+ error = sysctl_handle_int(oidp, &newnmbufs, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newnmbufs > nmbufs) {
+ nmbufs = newnmbufs;
+ nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
+ EVENTHANDLER_INVOKE(nmbufs_change);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT|CTLFLAG_RW,
+&nmbufs, 0, sysctl_nmbufs, "IU",
+ "Maximum number of mbufs allowed");
+
+/*
+ * Zones from which we allocate.
+ */
+uma_zone_t zone_mbuf;
+uma_zone_t zone_clust;
+uma_zone_t zone_pack;
+uma_zone_t zone_jumbop;
+uma_zone_t zone_jumbo9;
+uma_zone_t zone_jumbo16;
+uma_zone_t zone_ext_refcnt;
+
+/*
+ * Local prototypes.
+ */
+static int mb_ctor_mbuf(void *, int, void *, int);
+static int mb_ctor_clust(void *, int, void *, int);
+static int mb_ctor_pack(void *, int, void *, int);
+static void mb_dtor_mbuf(void *, int, void *);
+static void mb_dtor_clust(void *, int, void *);
+static void mb_dtor_pack(void *, int, void *);
+static int mb_zinit_pack(void *, int, int);
+static void mb_zfini_pack(void *, int);
+
+static void mb_reclaim(void *);
+static void *mbuf_jumbo_alloc(uma_zone_t, int, uint8_t *, int);
+
+/* Ensure that MSIZE is a power of 2. */
+CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
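+/*
+ * For example, with MSIZE = 256: (255 ^ 256) = 511, plus 1 is 512,
+ * shifted right once is 256 again.  A non-power-of-two such as 384
+ * would yield 128 instead and fail the assertion.
+ */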
+
+/*
+ * Initialize FreeBSD Network buffer allocation.
+ */
+static void
+mbuf_init(void *dummy)
+{
+
+ /*
+ * Configure UMA zones for Mbufs, Clusters, and Packets.
+ */
+ zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE,
+ mb_ctor_mbuf, mb_dtor_mbuf,
+#ifdef INVARIANTS
+ trash_init, trash_fini,
+#else
+ NULL, NULL,
+#endif
+ MSIZE - 1, UMA_ZONE_MAXBUCKET);
+ if (nmbufs > 0)
+ nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
+ uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached");
+
+ zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
+ mb_ctor_clust, mb_dtor_clust,
+#ifdef INVARIANTS
+ trash_init, trash_fini,
+#else
+ NULL, NULL,
+#endif
+ UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
+ if (nmbclusters > 0)
+ nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
+ uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached");
+
+ zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
+ mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);
+
+	/* Make jumbo frame zones too: page size, 9k and 16k. */
+ zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE,
+ mb_ctor_clust, mb_dtor_clust,
+#ifdef INVARIANTS
+ trash_init, trash_fini,
+#else
+ NULL, NULL,
+#endif
+ UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
+ if (nmbjumbop > 0)
+ nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
+ uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached");
+
+ zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
+ mb_ctor_clust, mb_dtor_clust,
+#ifdef INVARIANTS
+ trash_init, trash_fini,
+#else
+ NULL, NULL,
+#endif
+ UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
+ uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc);
+ if (nmbjumbo9 > 0)
+ nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
+ uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached");
+
+ zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
+ mb_ctor_clust, mb_dtor_clust,
+#ifdef INVARIANTS
+ trash_init, trash_fini,
+#else
+ NULL, NULL,
+#endif
+ UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
+ uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc);
+ if (nmbjumbo16 > 0)
+ nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
+ uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
+
+ zone_ext_refcnt = uma_zcreate(MBUF_EXTREFCNT_MEM_NAME, sizeof(u_int),
+ NULL, NULL,
+ NULL, NULL,
+ UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+
+ /* uma_prealloc() goes here... */
+
+ /*
+ * Hook event handler for low-memory situation, used to
+ * drain protocols and push data back to the caches (UMA
+ * later pushes it back to VM).
+ */
+ EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
+ EVENTHANDLER_PRI_FIRST);
+}
+SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);
+
+/*
+ * UMA backend page allocator for the jumbo frame zones.
+ *
+ * Allocates kernel virtual memory that is backed by contiguous physical
+ * pages.
+ */
+static void *
+mbuf_jumbo_alloc(uma_zone_t zone, int bytes, uint8_t *flags, int wait)
+{
+
+ /* Inform UMA that this allocator uses kernel_map/object. */
+ *flags = UMA_SLAB_KERNEL;
+ return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait,
+ (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT));
+}
+
+/*
+ * Constructor for Mbuf master zone.
+ *
+ * The 'arg' pointer points to a mb_args structure which
+ * contains call-specific information required to support the
+ * mbuf allocation API. See mbuf.h.
+ */
+static int
+mb_ctor_mbuf(void *mem, int size, void *arg, int how)
+{
+ struct mbuf *m;
+ struct mb_args *args;
+ int error;
+ int flags;
+ short type;
+
+#ifdef INVARIANTS
+ trash_ctor(mem, size, arg, how);
+#endif
+ args = (struct mb_args *)arg;
+ type = args->type;
+
+ /*
+ * The mbuf is initialized later. The caller has the
+ * responsibility to set up any MAC labels too.
+ */
+ if (type == MT_NOINIT)
+ return (0);
+
+ m = (struct mbuf *)mem;
+ flags = args->flags;
+
+ error = m_init(m, NULL, size, how, type, flags);
+
+ return (error);
+}
+
+/*
+ * The Mbuf master zone destructor.
+ */
+static void
+mb_dtor_mbuf(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+ unsigned long flags;
+
+ m = (struct mbuf *)mem;
+ flags = (unsigned long)arg;
+
+ if ((m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags))
+ m_tag_delete_chain(m, NULL);
+ KASSERT((m->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__));
+ KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
+#ifdef INVARIANTS
+ trash_dtor(mem, size, arg);
+#endif
+}
+
+/*
+ * The Mbuf Packet zone destructor.
+ */
+static void
+mb_dtor_pack(void *mem, int size, void *arg)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem;
+ if ((m->m_flags & M_PKTHDR) != 0)
+ m_tag_delete_chain(m, NULL);
+
+ /* Make sure we've got a clean cluster back. */
+ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
+ KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__));
+ KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__));
+ KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__));
+ KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__));
+ KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__));
+ KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__));
+ KASSERT(*m->m_ext.ref_cnt == 1, ("%s: ref_cnt != 1", __func__));
+#ifdef INVARIANTS
+ trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg);
+#endif
+ /*
+ * If there are processes blocked on zone_clust, waiting for pages
+	 * to be freed up, cause them to be woken up by draining the
+	 * packet zone.  We are exposed to a race here (in the check for
+ * the UMA_ZFLAG_FULL) where we might miss the flag set, but that
+ * is deliberate. We don't want to acquire the zone lock for every
+ * mbuf free.
+ */
+ if (uma_zone_exhausted_nolock(zone_clust))
+ zone_drain(zone_pack);
+}
+
+/*
+ * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor.
+ *
+ * Here the 'arg' pointer points to the Mbuf which we
+ * are configuring cluster storage for.  If 'arg' is
+ * NULL we allocate just the cluster without attaching
+ * it to an mbuf.  See mbuf.h.
+ */
+static int
+mb_ctor_clust(void *mem, int size, void *arg, int how)
+{
+ struct mbuf *m;
+ u_int *refcnt;
+ int type;
+ uma_zone_t zone;
+
+#ifdef INVARIANTS
+ trash_ctor(mem, size, arg, how);
+#endif
+ switch (size) {
+ case MCLBYTES:
+ type = EXT_CLUSTER;
+ zone = zone_clust;
+ break;
+#if MJUMPAGESIZE != MCLBYTES
+ case MJUMPAGESIZE:
+ type = EXT_JUMBOP;
+ zone = zone_jumbop;
+ break;
+#endif
+ case MJUM9BYTES:
+ type = EXT_JUMBO9;
+ zone = zone_jumbo9;
+ break;
+ case MJUM16BYTES:
+ type = EXT_JUMBO16;
+ zone = zone_jumbo16;
+ break;
+ default:
+ panic("unknown cluster size");
+ break;
+ }
+
+ m = (struct mbuf *)arg;
+ refcnt = uma_find_refcnt(zone, mem);
+ *refcnt = 1;
+ if (m != NULL) {
+ m->m_ext.ext_buf = (caddr_t)mem;
+ m->m_data = m->m_ext.ext_buf;
+ m->m_flags |= M_EXT;
+ m->m_ext.ext_free = NULL;
+ m->m_ext.ext_arg1 = NULL;
+ m->m_ext.ext_arg2 = NULL;
+ m->m_ext.ext_size = size;
+ m->m_ext.ext_type = type;
+ m->m_ext.ext_flags = 0;
+ m->m_ext.ref_cnt = refcnt;
+ }
+
+ return (0);
+}
+
+/*
+ * The Mbuf Cluster zone destructor.
+ */
+static void
+mb_dtor_clust(void *mem, int size, void *arg)
+{
+#ifdef INVARIANTS
+ uma_zone_t zone;
+
+ zone = m_getzone(size);
+ KASSERT(*(uma_find_refcnt(zone, mem)) <= 1,
+ ("%s: refcnt incorrect %u", __func__,
+ *(uma_find_refcnt(zone, mem))) );
+
+ trash_dtor(mem, size, arg);
+#endif
+}
+
+/*
+ * The Packet secondary zone's init routine, executed on the
+ * object's transition from mbuf keg slab to zone cache.
+ */
+static int
+mb_zinit_pack(void *mem, int size, int how)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem; /* m is virgin. */
+ if (uma_zalloc_arg(zone_clust, m, how) == NULL ||
+ m->m_ext.ext_buf == NULL)
+ return (ENOMEM);
+ m->m_ext.ext_type = EXT_PACKET; /* Override. */
+#ifdef INVARIANTS
+ trash_init(m->m_ext.ext_buf, MCLBYTES, how);
+#endif
+ return (0);
+}
+
+/*
+ * The Packet secondary zone's fini routine, executed on the
+ * object's transition from zone cache to keg slab.
+ */
+static void
+mb_zfini_pack(void *mem, int size)
+{
+ struct mbuf *m;
+
+ m = (struct mbuf *)mem;
+#ifdef INVARIANTS
+ trash_fini(m->m_ext.ext_buf, MCLBYTES);
+#endif
+ uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
+#ifdef INVARIANTS
+ trash_dtor(mem, size, NULL);
+#endif
+}
+
+/*
+ * The "packet" keg constructor.
+ */
+static int
+mb_ctor_pack(void *mem, int size, void *arg, int how)
+{
+ struct mbuf *m;
+ struct mb_args *args;
+ int error, flags;
+ short type;
+
+ m = (struct mbuf *)mem;
+ args = (struct mb_args *)arg;
+ flags = args->flags;
+ type = args->type;
+
+#ifdef INVARIANTS
+ trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how);
+#endif
+
+ error = m_init(m, NULL, size, how, type, flags);
+
+ /* m_ext is already initialized. */
+ m->m_data = m->m_ext.ext_buf;
+ m->m_flags = (flags | M_EXT);
+
+ return (error);
+}
+
+int
+m_pkthdr_init(struct mbuf *m, int how)
+{
+#ifdef MAC
+ int error;
+#endif
+ m->m_data = m->m_pktdat;
+ m->m_pkthdr.rcvif = NULL;
+ SLIST_INIT(&m->m_pkthdr.tags);
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.flowid = 0;
+ m->m_pkthdr.csum_flags = 0;
+ m->m_pkthdr.fibnum = 0;
+ m->m_pkthdr.cosqos = 0;
+ m->m_pkthdr.rsstype = 0;
+ m->m_pkthdr.l2hlen = 0;
+ m->m_pkthdr.l3hlen = 0;
+ m->m_pkthdr.l4hlen = 0;
+ m->m_pkthdr.l5hlen = 0;
+ m->m_pkthdr.PH_per.sixtyfour[0] = 0;
+ m->m_pkthdr.PH_loc.sixtyfour[0] = 0;
+#ifdef MAC
+ /* If the label init fails, fail the alloc */
+ error = mac_mbuf_init(m, how);
+ if (error)
+ return (error);
+#endif
+
+ return (0);
+}
+
+/*
+ * This is the protocol drain routine.
+ *
+ * No locks should be held when this is called. The drain routines have to
+ * presently acquire some locks which raises the possibility of lock order
+ * reversal.
+ */
+static void
+mb_reclaim(void *junk)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
+ "mb_reclaim()");
+
+ for (dp = domains; dp != NULL; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_drain != NULL)
+ (*pr->pr_drain)();
+}
diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c
new file mode 100644
index 0000000..c84d4b2
--- /dev/null
+++ b/sys/kern/kern_mib.c
@@ -0,0 +1,542 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Karels at Berkeley Software Design, Inc.
+ *
+ * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
+ * project, to make these variables more userfriendly.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_posix.h"
+#include "opt_config.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/sbuf.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/jail.h>
+#include <sys/smp.h>
+#include <sys/sx.h>
+#include <sys/unistd.h>
+
+SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0,
+ "Sysctl internal magic");
+SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW|CTLFLAG_CAPRD, 0,
+ "High kernel, proc, limits &c");
+SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0,
+ "Virtual memory");
+SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW, 0,
+ "File system");
+SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW, 0,
+ "Network, (see socket.h)");
+SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW, 0,
+ "Debugging");
+SYSCTL_NODE(_debug, OID_AUTO, sizeof, CTLFLAG_RW, 0,
+ "Sizeof various things");
+SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW, 0,
+ "hardware");
+SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW, 0,
+ "machine dependent");
+SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0,
+ "user-level");
+SYSCTL_NODE(, CTL_P1003_1B, p1003_1b, CTLFLAG_RW, 0,
+ "p1003_1b, (see p1003_1b.h)");
+
+SYSCTL_NODE(, OID_AUTO, compat, CTLFLAG_RW, 0,
+ "Compatibility code");
+SYSCTL_NODE(, OID_AUTO, security, CTLFLAG_RW, 0,
+ "Security");
+#ifdef REGRESSION
+SYSCTL_NODE(, OID_AUTO, regression, CTLFLAG_RW, 0,
+ "Regression test MIB");
+#endif
+
+SYSCTL_STRING(_kern, OID_AUTO, ident, CTLFLAG_RD|CTLFLAG_MPSAFE,
+ kern_ident, 0, "Kernel identifier");
+
+SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD|CTLFLAG_MPSAFE|
+ CTLFLAG_CAPRD, osrelease, 0, "Operating system release");
+
+SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD|CTLFLAG_CAPRD,
+ 0, BSD, "Operating system revision");
+
+SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD|CTLFLAG_MPSAFE,
+ version, 0, "Kernel version");
+
+SYSCTL_STRING(_kern, OID_AUTO, compiler_version, CTLFLAG_RD|CTLFLAG_MPSAFE,
+ compiler_version, 0, "Version of compiler used to compile kernel");
+
+SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD|CTLFLAG_MPSAFE|
+ CTLFLAG_CAPRD, ostype, 0, "Operating system type");
+
+/*
+ * NOTICE: The *userland* release date is available in
+ * /usr/include/osreldate.h
+ */
+SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD|CTLFLAG_CAPRD,
+ &osreldate, 0, "Kernel release date");
+
+SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RDTUN,
+ &maxproc, 0, "Maximum number of processes");
+
+SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW,
+ &maxprocperuid, 0, "Maximum processes allowed per userid");
+
+SYSCTL_INT(_kern, OID_AUTO, maxusers, CTLFLAG_RDTUN,
+ &maxusers, 0, "Hint for kernel tuning");
+
+SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD|CTLFLAG_CAPRD,
+ 0, ARG_MAX, "Maximum bytes of argument to execve(2)");
+
+SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD|CTLFLAG_CAPRD,
+ 0, _POSIX_VERSION, "Version of POSIX attempting to comply to");
+
+SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
+ &ngroups_max, 0,
+ "Maximum number of supplemental groups a user can belong to");
+
+SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD|CTLFLAG_CAPRD,
+ 0, 1, "Whether job control is available");
+
+#ifdef _POSIX_SAVED_IDS
+SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD,
+ 0, 1, "Whether saved set-group/user ID is available");
+#else
+SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD,
+ 0, 0, "Whether saved set-group/user ID is available");
+#endif
+
+char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */
+
+SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW,
+ kernelname, sizeof kernelname, "Name of kernel file booted");
+
+SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD|CTLFLAG_CAPRD,
+ &mp_ncpus, 0, "Number of active CPUs");
+
+SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD|CTLFLAG_CAPRD,
+ 0, BYTE_ORDER, "System byte order");
+
+SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD|CTLFLAG_CAPRD,
+ 0, PAGE_SIZE, "System memory page size");
+
+static int
+sysctl_kern_arnd(SYSCTL_HANDLER_ARGS)
+{
+ char buf[256];
+ size_t len;
+
+ len = req->oldlen;
+ if (len > sizeof(buf))
+ len = sizeof(buf);
+ arc4rand(buf, len, 0);
+ return (SYSCTL_OUT(req, buf, len));
+}
+
+SYSCTL_PROC(_kern, KERN_ARND, arandom,
+ CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, NULL, 0,
+ sysctl_kern_arnd, "", "arc4rand");
+
+static int
+sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
+{
+ u_long val;
+
+ val = ctob(physmem);
+ return (sysctl_handle_long(oidp, &val, 0, req));
+}
+
+SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG | CTLFLAG_RD,
+ 0, 0, sysctl_hw_physmem, "LU", "");
+
+static int
+sysctl_hw_realmem(SYSCTL_HANDLER_ARGS)
+{
+ u_long val;
+ val = ctob(realmem);
+ return (sysctl_handle_long(oidp, &val, 0, req));
+}
+SYSCTL_PROC(_hw, HW_REALMEM, realmem, CTLTYPE_ULONG | CTLFLAG_RD,
+ 0, 0, sysctl_hw_realmem, "LU", "");
+static int
+sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
+{
+ u_long val;
+
+ val = ctob(physmem - cnt.v_wire_count);
+ return (sysctl_handle_long(oidp, &val, 0, req));
+}
+
+SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG | CTLFLAG_RD,
+ 0, 0, sysctl_hw_usermem, "LU", "");
+
+SYSCTL_LONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &physmem, 0, "");
+
+u_long pagesizes[MAXPAGESIZES] = { PAGE_SIZE };
+
+static int
+sysctl_hw_pagesizes(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+#ifdef SCTL_MASK32
+ int i;
+ uint32_t pagesizes32[MAXPAGESIZES];
+
+ if (req->flags & SCTL_MASK32) {
+ /*
+ * Recreate the "pagesizes" array with 32-bit elements. Truncate
+ * any page size greater than UINT32_MAX to zero.
+ */
+ for (i = 0; i < MAXPAGESIZES; i++)
+ pagesizes32[i] = (uint32_t)pagesizes[i];
+
+ error = SYSCTL_OUT(req, pagesizes32, sizeof(pagesizes32));
+ } else
+#endif
+ error = SYSCTL_OUT(req, pagesizes, sizeof(pagesizes));
+ return (error);
+}
+SYSCTL_PROC(_hw, OID_AUTO, pagesizes, CTLTYPE_ULONG | CTLFLAG_RD,
+ NULL, 0, sysctl_hw_pagesizes, "LU", "Supported page sizes");
+
+#ifdef SCTL_MASK32
+int adaptive_machine_arch = 1;
+SYSCTL_INT(_debug, OID_AUTO, adaptive_machine_arch, CTLFLAG_RW,
+ &adaptive_machine_arch, 1,
+ "Adapt reported machine architecture to the ABI of the binary");
+#endif
+
+static int
+sysctl_hw_machine_arch(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ static const char machine_arch[] = MACHINE_ARCH;
+#ifdef SCTL_MASK32
+ static const char machine_arch32[] = MACHINE_ARCH32;
+
+ if ((req->flags & SCTL_MASK32) != 0 && adaptive_machine_arch)
+ error = SYSCTL_OUT(req, machine_arch32, sizeof(machine_arch32));
+ else
+#endif
+ error = SYSCTL_OUT(req, machine_arch, sizeof(machine_arch));
+ return (error);
+
+}
+SYSCTL_PROC(_hw, HW_MACHINE_ARCH, machine_arch, CTLTYPE_STRING | CTLFLAG_RD,
+ NULL, 0, sysctl_hw_machine_arch, "A", "System architecture");
+
+static int
+sysctl_hostname(SYSCTL_HANDLER_ARGS)
+{
+ struct prison *pr, *cpr;
+ size_t pr_offset;
+ char tmpname[MAXHOSTNAMELEN];
+ int descend, error, len;
+
+ /*
+ * This function can set: hostname domainname hostuuid.
+ * Keep that in mind when comments say "hostname".
+ */
+ pr_offset = (size_t)arg1;
+ len = arg2;
+ KASSERT(len <= sizeof(tmpname),
+ ("length %d too long for %s", len, __func__));
+
+ pr = req->td->td_ucred->cr_prison;
+ if (!(pr->pr_allow & PR_ALLOW_SET_HOSTNAME) && req->newptr)
+ return (EPERM);
+ /*
+ * Make a local copy of hostname to get/set so we don't have to hold
+ * the jail mutex during the sysctl copyin/copyout activities.
+ */
+ mtx_lock(&pr->pr_mtx);
+ bcopy((char *)pr + pr_offset, tmpname, len);
+ mtx_unlock(&pr->pr_mtx);
+
+ error = sysctl_handle_string(oidp, tmpname, len, req);
+
+ if (req->newptr != NULL && error == 0) {
+ /*
+ * Copy the locally set hostname to all jails that share
+ * this host info.
+ */
+ sx_slock(&allprison_lock);
+ while (!(pr->pr_flags & PR_HOST))
+ pr = pr->pr_parent;
+ mtx_lock(&pr->pr_mtx);
+ bcopy(tmpname, (char *)pr + pr_offset, len);
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
+ if (cpr->pr_flags & PR_HOST)
+ descend = 0;
+ else
+ bcopy(tmpname, (char *)cpr + pr_offset, len);
+ mtx_unlock(&pr->pr_mtx);
+ sx_sunlock(&allprison_lock);
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname,
+ CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
+ (void *)(offsetof(struct prison, pr_hostname)), MAXHOSTNAMELEN,
+ sysctl_hostname, "A", "Hostname");
+SYSCTL_PROC(_kern, KERN_NISDOMAINNAME, domainname,
+ CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
+ (void *)(offsetof(struct prison, pr_domainname)), MAXHOSTNAMELEN,
+ sysctl_hostname, "A", "Name of the current YP/NIS domain");
+SYSCTL_PROC(_kern, KERN_HOSTUUID, hostuuid,
+ CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
+ (void *)(offsetof(struct prison, pr_hostuuid)), HOSTUUIDLEN,
+ sysctl_hostname, "A", "Host UUID");
+
+static int regression_securelevel_nonmonotonic = 0;
+
+#ifdef REGRESSION
+SYSCTL_INT(_regression, OID_AUTO, securelevel_nonmonotonic, CTLFLAG_RW,
+ &regression_securelevel_nonmonotonic, 0, "securelevel may be lowered");
+#endif
+
+static int
+sysctl_kern_securelvl(SYSCTL_HANDLER_ARGS)
+{
+ struct prison *pr, *cpr;
+ int descend, error, level;
+
+ pr = req->td->td_ucred->cr_prison;
+
+ /*
+ * Reading the securelevel is easy, since the current jail's level
+ * is known to be at least as secure as any higher levels. Perform
+ * a lockless read since the securelevel is an integer.
+ */
+ level = pr->pr_securelevel;
+ error = sysctl_handle_int(oidp, &level, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ /* Permit update only if the new securelevel exceeds the old. */
+ sx_slock(&allprison_lock);
+ mtx_lock(&pr->pr_mtx);
+ if (!regression_securelevel_nonmonotonic &&
+ level < pr->pr_securelevel) {
+ mtx_unlock(&pr->pr_mtx);
+ sx_sunlock(&allprison_lock);
+ return (EPERM);
+ }
+ pr->pr_securelevel = level;
+ /*
+ * Set all child jails to be at least this level, but do not lower
+ * them (even if regression_securelevel_nonmonotonic).
+ */
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) {
+ if (cpr->pr_securelevel < level)
+ cpr->pr_securelevel = level;
+ }
+ mtx_unlock(&pr->pr_mtx);
+ sx_sunlock(&allprison_lock);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel,
+ CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, sysctl_kern_securelvl,
+ "I", "Current secure level");
+
+#ifdef INCLUDE_CONFIG_FILE
+/* Actual kernel configuration options. */
+extern char kernconfstring[];
+
+SYSCTL_STRING(_kern, OID_AUTO, conftxt, CTLFLAG_RD, kernconfstring, 0,
+ "Kernel configuration file");
+#endif
+
+static int
+sysctl_hostid(SYSCTL_HANDLER_ARGS)
+{
+ struct prison *pr, *cpr;
+ u_long tmpid;
+ int descend, error;
+
+ /*
+ * Like sysctl_hostname, except it operates on a u_long
+ * instead of a string, and is used only for hostid.
+ */
+ pr = req->td->td_ucred->cr_prison;
+ if (!(pr->pr_allow & PR_ALLOW_SET_HOSTNAME) && req->newptr)
+ return (EPERM);
+ tmpid = pr->pr_hostid;
+ error = sysctl_handle_long(oidp, &tmpid, 0, req);
+
+ if (req->newptr != NULL && error == 0) {
+ sx_slock(&allprison_lock);
+ while (!(pr->pr_flags & PR_HOST))
+ pr = pr->pr_parent;
+ mtx_lock(&pr->pr_mtx);
+ pr->pr_hostid = tmpid;
+ FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
+ if (cpr->pr_flags & PR_HOST)
+ descend = 0;
+ else
+ cpr->pr_hostid = tmpid;
+ mtx_unlock(&pr->pr_mtx);
+ sx_sunlock(&allprison_lock);
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_HOSTID, hostid,
+ CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
+ NULL, 0, sysctl_hostid, "LU", "Host ID");
+
+SYSCTL_NODE(_kern, OID_AUTO, features, CTLFLAG_RD, 0, "Kernel Features");
+
+#ifdef COMPAT_FREEBSD4
+FEATURE(compat_freebsd4, "Compatible with FreeBSD 4");
+#endif
+
+#ifdef COMPAT_FREEBSD5
+FEATURE(compat_freebsd5, "Compatible with FreeBSD 5");
+#endif
+
+#ifdef COMPAT_FREEBSD6
+FEATURE(compat_freebsd6, "Compatible with FreeBSD 6");
+#endif
+
+#ifdef COMPAT_FREEBSD7
+FEATURE(compat_freebsd7, "Compatible with FreeBSD 7");
+#endif
+
+/*
+ * This is really cheating. These actually live in the libc, something
+ * which I'm not quite sure is a good idea anyway, but in order for
+ * getnext and friends to actually work, we define dummies here.
+ *
+ * XXXRW: These probably should be CTLFLAG_CAPRD.
+ */
+SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD,
+ "", 0, "PATH that finds all the standard utilities");
+SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD,
+ 0, 0, "Max ibase/obase values in bc(1)");
+SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD,
+ 0, 0, "Max array size in bc(1)");
+SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD,
+ 0, 0, "Max scale value in bc(1)");
+SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD,
+ 0, 0, "Max string length in bc(1)");
+SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD,
+ 0, 0, "Maximum number of weights assigned to an LC_COLLATE locale entry");
+SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD,
+ 0, 0, "Max length (bytes) of a text-processing utility's input line");
+SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD,
+ 0, 0, "Maximum number of repeats of a regexp permitted");
+SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD,
+ 0, 0,
+ "The version of POSIX 1003.2 with which the system attempts to comply");
+SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD,
+ 0, 0, "Whether C development supports the C bindings option");
+SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD,
+ 0, 0, "Whether system supports the C development utilities option");
+SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD,
+ 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD,
+ 0, 0, "Whether system supports FORTRAN development utilities");
+SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD,
+ 0, 0, "Whether system supports FORTRAN runtime utilities");
+SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD,
+ 0, 0, "Whether system supports creation of locales");
+SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD,
+ 0, 0, "Whether system supports software development utilities");
+SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD,
+ 0, 0, "Whether system supports the user portability utilities");
+SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD,
+ 0, 0, "Min Maximum number of streams a process may have open at one time");
+SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD,
+ 0, 0, "Min Maximum number of types supported for timezone names");
+
+#include <sys/vnode.h>
+SYSCTL_INT(_debug_sizeof, OID_AUTO, vnode, CTLFLAG_RD,
+ 0, sizeof(struct vnode), "sizeof(struct vnode)");
+
+SYSCTL_INT(_debug_sizeof, OID_AUTO, proc, CTLFLAG_RD,
+ 0, sizeof(struct proc), "sizeof(struct proc)");
+
+static int
+sysctl_kern_pid_max(SYSCTL_HANDLER_ARGS)
+{
+ int error, pm;
+
+ pm = pid_max;
+ error = sysctl_handle_int(oidp, &pm, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ sx_xlock(&proctree_lock);
+ sx_xlock(&allproc_lock);
+
+ /*
+	 * Only permit values less than PID_MAX.
+	 * As a safety measure, do not allow pid_max to be limited too much.
+ */
+ if (pm < 300 || pm > PID_MAX)
+ error = EINVAL;
+ else
+ pid_max = pm;
+ sx_xunlock(&allproc_lock);
+ sx_xunlock(&proctree_lock);
+ return (error);
+}
+SYSCTL_PROC(_kern, OID_AUTO, pid_max, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_TUN |
+ CTLFLAG_MPSAFE, 0, 0, sysctl_kern_pid_max, "I",
+ "Maximum allowed pid");
+
+#include <sys/bio.h>
+#include <sys/buf.h>
+SYSCTL_INT(_debug_sizeof, OID_AUTO, bio, CTLFLAG_RD,
+ 0, sizeof(struct bio), "sizeof(struct bio)");
+SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD,
+ 0, sizeof(struct buf), "sizeof(struct buf)");
+
+#include <sys/user.h>
+SYSCTL_INT(_debug_sizeof, OID_AUTO, kinfo_proc, CTLFLAG_RD,
+ 0, sizeof(struct kinfo_proc), "sizeof(struct kinfo_proc)");
+
+/* XXX compatibility, remove for 6.0 */
+#include <sys/imgact.h>
+#include <sys/imgact_elf.h>
+SYSCTL_INT(_kern, OID_AUTO, fallback_elf_brand, CTLFLAG_RW,
+ &__elfN(fallback_brand), sizeof(__elfN(fallback_brand)),
+ "compatibility for kern.fallback_elf_brand");
diff --git a/sys/kern/kern_module.c b/sys/kern/kern_module.c
new file mode 100644
index 0000000..b769320
--- /dev/null
+++ b/sys/kern/kern_module.c
@@ -0,0 +1,523 @@
+/*-
+ * Copyright (c) 1997 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_compat.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+#include <sys/malloc.h>
+#include <sys/sysproto.h>
+#include <sys/sysent.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/reboot.h>
+#include <sys/sx.h>
+#include <sys/module.h>
+#include <sys/linker.h>
+
+static MALLOC_DEFINE(M_MODULE, "module", "module data structures");
+
+struct module {
+ TAILQ_ENTRY(module) link; /* chain together all modules */
+ TAILQ_ENTRY(module) flink; /* all modules in a file */
+ struct linker_file *file; /* file which contains this module */
+ int refs; /* reference count */
+ int id; /* unique id number */
+ char *name; /* module name */
+ modeventhand_t handler; /* event handler */
+ void *arg; /* argument for handler */
+ modspecific_t data; /* module specific data */
+};
+
+#define MOD_EVENT(mod, type) (mod)->handler((mod), (type), (mod)->arg)
+
+static TAILQ_HEAD(modulelist, module) modules;
+struct sx modules_sx;
+static int nextid = 1;
+static void module_shutdown(void *, int);
+
+static int
+modevent_nop(module_t mod, int what, void *arg)
+{
+
+ switch(what) {
+ case MOD_LOAD:
+ return (0);
+ case MOD_UNLOAD:
+ return (EBUSY);
+ default:
+ return (EOPNOTSUPP);
+ }
+}
+
+static void
+module_init(void *arg)
+{
+
+ sx_init(&modules_sx, "module subsystem sx lock");
+ TAILQ_INIT(&modules);
+ EVENTHANDLER_REGISTER(shutdown_final, module_shutdown, NULL,
+ SHUTDOWN_PRI_DEFAULT);
+}
+
+SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, 0);
+
+static void
+module_shutdown(void *arg1, int arg2)
+{
+ module_t mod;
+
+ if (arg2 & RB_NOSYNC)
+ return;
+ mtx_lock(&Giant);
+ MOD_SLOCK;
+ TAILQ_FOREACH_REVERSE(mod, &modules, modulelist, link)
+ MOD_EVENT(mod, MOD_SHUTDOWN);
+ MOD_SUNLOCK;
+ mtx_unlock(&Giant);
+}
+
+void
+module_register_init(const void *arg)
+{
+ const moduledata_t *data = (const moduledata_t *)arg;
+ int error;
+ module_t mod;
+
+ mtx_lock(&Giant);
+ MOD_SLOCK;
+ mod = module_lookupbyname(data->name);
+ if (mod == NULL)
+ panic("module_register_init: module named %s not found\n",
+ data->name);
+ MOD_SUNLOCK;
+ error = MOD_EVENT(mod, MOD_LOAD);
+ if (error) {
+ MOD_EVENT(mod, MOD_UNLOAD);
+ MOD_XLOCK;
+ module_release(mod);
+ MOD_XUNLOCK;
+ printf("module_register_init: MOD_LOAD (%s, %p, %p) error"
+ " %d\n", data->name, (void *)data->evhand, data->priv,
+ error);
+ } else {
+ MOD_XLOCK;
+ if (mod->file) {
+ /*
+ * Once a module is successfully loaded, move
+ * it to the head of the module list for this
+ * linker file. This resorts the list so that
+ * when the kernel linker iterates over the
+ * modules to unload them, it will unload them
+ * in the reverse order they were loaded.
+ */
+ TAILQ_REMOVE(&mod->file->modules, mod, flink);
+ TAILQ_INSERT_HEAD(&mod->file->modules, mod, flink);
+ }
+ MOD_XUNLOCK;
+ }
+ mtx_unlock(&Giant);
+}
+
+int
+module_register(const moduledata_t *data, linker_file_t container)
+{
+ size_t namelen;
+ module_t newmod;
+
+ MOD_XLOCK;
+ newmod = module_lookupbyname(data->name);
+ if (newmod != NULL) {
+ MOD_XUNLOCK;
+ printf("module_register: module %s already exists!\n",
+ data->name);
+ return (EEXIST);
+ }
+ namelen = strlen(data->name) + 1;
+ newmod = malloc(sizeof(struct module) + namelen, M_MODULE, M_WAITOK);
+ if (newmod == NULL) {
+ MOD_XUNLOCK;
+ return (ENOMEM);
+ }
+ newmod->refs = 1;
+ newmod->id = nextid++;
+ newmod->name = (char *)(newmod + 1);
+ strcpy(newmod->name, data->name);
+ newmod->handler = data->evhand ? data->evhand : modevent_nop;
+ newmod->arg = data->priv;
+ bzero(&newmod->data, sizeof(newmod->data));
+ TAILQ_INSERT_TAIL(&modules, newmod, link);
+
+ if (container)
+ TAILQ_INSERT_TAIL(&container->modules, newmod, flink);
+ newmod->file = container;
+ MOD_XUNLOCK;
+ return (0);
+}
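+
+/*
+ * Illustrative sketch of how a module normally reaches
+ * module_register(): a subsystem or driver declares a moduledata_t
+ * and a DECLARE_MODULE() (or a wrapper such as DRIVER_MODULE()), e.g.
+ * ("foo" is a placeholder):
+ *
+ *	static int
+ *	foo_modevent(module_t mod, int what, void *arg)
+ *	{
+ *		switch (what) {
+ *		case MOD_LOAD:
+ *		case MOD_UNLOAD:
+ *			return (0);
+ *		default:
+ *			return (EOPNOTSUPP);
+ *		}
+ *	}
+ *
+ *	static moduledata_t foo_mod = { "foo", foo_modevent, NULL };
+ *	DECLARE_MODULE(foo, foo_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
+ *
+ * The event handler is later invoked through MOD_EVENT() above for
+ * MOD_LOAD, MOD_UNLOAD, MOD_QUIESCE and MOD_SHUTDOWN.
+ */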
+
+void
+module_reference(module_t mod)
+{
+
+ MOD_XLOCK_ASSERT;
+
+ MOD_DPF(REFS, ("module_reference: before, refs=%d\n", mod->refs));
+ mod->refs++;
+}
+
+void
+module_release(module_t mod)
+{
+
+ MOD_XLOCK_ASSERT;
+
+ if (mod->refs <= 0)
+ panic("module_release: bad reference count");
+
+ MOD_DPF(REFS, ("module_release: before, refs=%d\n", mod->refs));
+
+ mod->refs--;
+ if (mod->refs == 0) {
+ TAILQ_REMOVE(&modules, mod, link);
+ if (mod->file)
+ TAILQ_REMOVE(&mod->file->modules, mod, flink);
+ free(mod, M_MODULE);
+ }
+}
+
+module_t
+module_lookupbyname(const char *name)
+{
+ module_t mod;
+ int err;
+
+ MOD_LOCK_ASSERT;
+
+ TAILQ_FOREACH(mod, &modules, link) {
+ err = strcmp(mod->name, name);
+ if (err == 0)
+ return (mod);
+ }
+ return (NULL);
+}
+
+module_t
+module_lookupbyid(int modid)
+{
+ module_t mod;
+
+ MOD_LOCK_ASSERT;
+
+ TAILQ_FOREACH(mod, &modules, link)
+ if (mod->id == modid)
+ return(mod);
+ return (NULL);
+}
+
+int
+module_quiesce(module_t mod)
+{
+ int error;
+
+ mtx_lock(&Giant);
+ error = MOD_EVENT(mod, MOD_QUIESCE);
+ mtx_unlock(&Giant);
+ if (error == EOPNOTSUPP || error == EINVAL)
+ error = 0;
+ return (error);
+}
+
+int
+module_unload(module_t mod)
+{
+ int error;
+
+ mtx_lock(&Giant);
+ error = MOD_EVENT(mod, MOD_UNLOAD);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+int
+module_getid(module_t mod)
+{
+
+ MOD_LOCK_ASSERT;
+ return (mod->id);
+}
+
+module_t
+module_getfnext(module_t mod)
+{
+
+ MOD_LOCK_ASSERT;
+ return (TAILQ_NEXT(mod, flink));
+}
+
+const char *
+module_getname(module_t mod)
+{
+
+ MOD_LOCK_ASSERT;
+ return (mod->name);
+}
+
+void
+module_setspecific(module_t mod, modspecific_t *datap)
+{
+
+ MOD_XLOCK_ASSERT;
+ mod->data = *datap;
+}
+
+linker_file_t
+module_file(module_t mod)
+{
+
+ return (mod->file);
+}
+
+/*
+ * Syscalls.
+ */
+int
+sys_modnext(struct thread *td, struct modnext_args *uap)
+{
+ module_t mod;
+ int error = 0;
+
+ td->td_retval[0] = -1;
+
+ MOD_SLOCK;
+ if (uap->modid == 0) {
+ mod = TAILQ_FIRST(&modules);
+ if (mod)
+ td->td_retval[0] = mod->id;
+ else
+ error = ENOENT;
+ goto done2;
+ }
+ mod = module_lookupbyid(uap->modid);
+ if (mod == NULL) {
+ error = ENOENT;
+ goto done2;
+ }
+ if (TAILQ_NEXT(mod, link))
+ td->td_retval[0] = TAILQ_NEXT(mod, link)->id;
+ else
+ td->td_retval[0] = 0;
+done2:
+ MOD_SUNLOCK;
+ return (error);
+}
+
+int
+sys_modfnext(struct thread *td, struct modfnext_args *uap)
+{
+ module_t mod;
+ int error;
+
+ td->td_retval[0] = -1;
+
+ MOD_SLOCK;
+ mod = module_lookupbyid(uap->modid);
+ if (mod == NULL) {
+ error = ENOENT;
+ } else {
+ error = 0;
+ if (TAILQ_NEXT(mod, flink))
+ td->td_retval[0] = TAILQ_NEXT(mod, flink)->id;
+ else
+ td->td_retval[0] = 0;
+ }
+ MOD_SUNLOCK;
+ return (error);
+}
+
+struct module_stat_v1 {
+ int version; /* set to sizeof(struct module_stat) */
+ char name[MAXMODNAME];
+ int refs;
+ int id;
+};
+
+int
+sys_modstat(struct thread *td, struct modstat_args *uap)
+{
+ module_t mod;
+ modspecific_t data;
+ int error = 0;
+ int id, namelen, refs, version;
+ struct module_stat *stat;
+ char *name;
+
+ MOD_SLOCK;
+ mod = module_lookupbyid(uap->modid);
+ if (mod == NULL) {
+ MOD_SUNLOCK;
+ return (ENOENT);
+ }
+ id = mod->id;
+ refs = mod->refs;
+ name = mod->name;
+ data = mod->data;
+ MOD_SUNLOCK;
+ stat = uap->stat;
+
+ /*
+ * Check the version of the user's structure.
+ */
+ if ((error = copyin(&stat->version, &version, sizeof(version))) != 0)
+ return (error);
+ if (version != sizeof(struct module_stat_v1)
+ && version != sizeof(struct module_stat))
+ return (EINVAL);
+ namelen = strlen(mod->name) + 1;
+ if (namelen > MAXMODNAME)
+ namelen = MAXMODNAME;
+ if ((error = copyout(name, &stat->name[0], namelen)) != 0)
+ return (error);
+
+ if ((error = copyout(&refs, &stat->refs, sizeof(int))) != 0)
+ return (error);
+ if ((error = copyout(&id, &stat->id, sizeof(int))) != 0)
+ return (error);
+
+ /*
+ * >v1 stat includes module data.
+ */
+ if (version == sizeof(struct module_stat))
+ if ((error = copyout(&data, &stat->data,
+ sizeof(data))) != 0)
+ return (error);
+ td->td_retval[0] = 0;
+ return (error);
+}
+
+int
+sys_modfind(struct thread *td, struct modfind_args *uap)
+{
+ int error = 0;
+ char name[MAXMODNAME];
+ module_t mod;
+
+ if ((error = copyinstr(uap->name, name, sizeof name, 0)) != 0)
+ return (error);
+
+ MOD_SLOCK;
+ mod = module_lookupbyname(name);
+ if (mod == NULL)
+ error = ENOENT;
+ else
+ td->td_retval[0] = module_getid(mod);
+ MOD_SUNLOCK;
+ return (error);
+}
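+
+/*
+ * Userland usage sketch (illustrative): the system calls above are
+ * reachable through the wrappers declared in <sys/module.h>.  Setting
+ * version to sizeof(struct module_stat) selects the full (post-v1)
+ * structure; the module name "example" passed to modfind() is
+ * hypothetical.
+ *
+ *	#include <sys/param.h>
+ *	#include <sys/module.h>
+ *	#include <stdio.h>
+ *
+ *	int
+ *	main(void)
+ *	{
+ *		struct module_stat stat;
+ *		int modid;
+ *
+ *		stat.version = sizeof(stat);
+ *		for (modid = modnext(0); modid > 0; modid = modnext(modid))
+ *			if (modstat(modid, &stat) == 0)
+ *				printf("%d\t%s\trefs %d\n", stat.id,
+ *				    stat.name, stat.refs);
+ *		if (modfind("example") == -1)
+ *			printf("example is not loaded\n");
+ *		return (0);
+ *	}
+ */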
+
+MODULE_VERSION(kernel, __FreeBSD_version);
+
+#ifdef COMPAT_FREEBSD32
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <compat/freebsd32/freebsd32_util.h>
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+
+typedef union modspecific32 {
+ int intval;
+ uint32_t uintval;
+ int longval;
+ uint32_t ulongval;
+} modspecific32_t;
+
+struct module_stat32 {
+ int version;
+ char name[MAXMODNAME];
+ int refs;
+ int id;
+ modspecific32_t data;
+};
+
+int
+freebsd32_modstat(struct thread *td, struct freebsd32_modstat_args *uap)
+{
+ module_t mod;
+ modspecific32_t data32;
+ int error = 0;
+ int id, namelen, refs, version;
+ struct module_stat32 *stat32;
+ char *name;
+
+ MOD_SLOCK;
+ mod = module_lookupbyid(uap->modid);
+ if (mod == NULL) {
+ MOD_SUNLOCK;
+ return (ENOENT);
+ }
+
+ id = mod->id;
+ refs = mod->refs;
+ name = mod->name;
+ CP(mod->data, data32, intval);
+ CP(mod->data, data32, uintval);
+ CP(mod->data, data32, longval);
+ CP(mod->data, data32, ulongval);
+ MOD_SUNLOCK;
+ stat32 = uap->stat;
+
+ if ((error = copyin(&stat32->version, &version, sizeof(version))) != 0)
+ return (error);
+ if (version != sizeof(struct module_stat_v1)
+ && version != sizeof(struct module_stat32))
+ return (EINVAL);
+ namelen = strlen(mod->name) + 1;
+ if (namelen > MAXMODNAME)
+ namelen = MAXMODNAME;
+ if ((error = copyout(name, &stat32->name[0], namelen)) != 0)
+ return (error);
+
+ if ((error = copyout(&refs, &stat32->refs, sizeof(int))) != 0)
+ return (error);
+ if ((error = copyout(&id, &stat32->id, sizeof(int))) != 0)
+ return (error);
+
+ /*
+ * >v1 stat includes module data.
+ */
+ if (version == sizeof(struct module_stat32))
+ if ((error = copyout(&data32, &stat32->data,
+ sizeof(data32))) != 0)
+ return (error);
+ td->td_retval[0] = 0;
+ return (error);
+}
+#endif
diff --git a/sys/kern/kern_mtxpool.c b/sys/kern/kern_mtxpool.c
new file mode 100644
index 0000000..23b41bb
--- /dev/null
+++ b/sys/kern/kern_mtxpool.c
@@ -0,0 +1,218 @@
+/*-
+ * Copyright (c) 2001 Matthew Dillon. All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* Mutex pool routines. These routines are designed to be used as short
+ * term leaf mutexes (e.g. the last mutex you might acquire other than
+ * calling msleep()). They operate using a shared pool. A mutex is chosen
+ * from the pool based on the supplied pointer (which may or may not be
+ * valid).
+ *
+ * Advantages:
+ * - no structural overhead. Mutexes can be associated with structures
+ * without adding bloat to the structures.
+ *	- mutexes can be obtained for invalid pointers, useful when using
+ *	  mutexes to interlock destructor operations.
+ * - no initialization/destructor overhead.
+ * - can be used with msleep.
+ *
+ * Disadvantages:
+ * - should generally only be used as leaf mutexes.
+ *	- pool/pool dependency ordering cannot be depended on.
+ *	- possible L1 cache mastership contention between cpus.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+
+
+static MALLOC_DEFINE(M_MTXPOOL, "mtx_pool", "mutex pool");
+
+/* Pool sizes must be a power of two */
+#ifndef MTX_POOL_LOCKBUILDER_SIZE
+#define MTX_POOL_LOCKBUILDER_SIZE 128
+#endif
+#ifndef MTX_POOL_SLEEP_SIZE
+#define MTX_POOL_SLEEP_SIZE 128
+#endif
+
+struct mtxpool_header {
+ int mtxpool_size;
+ int mtxpool_mask;
+ int mtxpool_shift;
+ int mtxpool_next;
+};
+
+struct mtx_pool {
+ struct mtxpool_header mtx_pool_header;
+ struct mtx mtx_pool_ary[1];
+};
+
+static struct mtx_pool_lockbuilder {
+ struct mtxpool_header mtx_pool_header;
+ struct mtx mtx_pool_ary[MTX_POOL_LOCKBUILDER_SIZE];
+} lockbuilder_pool;
+
+#define mtx_pool_size mtx_pool_header.mtxpool_size
+#define mtx_pool_mask mtx_pool_header.mtxpool_mask
+#define mtx_pool_shift mtx_pool_header.mtxpool_shift
+#define mtx_pool_next mtx_pool_header.mtxpool_next
+
+struct mtx_pool *mtxpool_sleep;
+struct mtx_pool *mtxpool_lockbuilder;
+
+#if UINTPTR_MAX == UINT64_MAX /* 64 bits */
+# define POINTER_BITS 64
+# define HASH_MULTIPLIER 11400714819323198485u /* (2^64)*(sqrt(5)-1)/2 */
+#else /* assume 32 bits */
+# define POINTER_BITS 32
+# define HASH_MULTIPLIER 2654435769u /* (2^32)*(sqrt(5)-1)/2 */
+#endif
+
+/*
+ * Return the (shared) pool mutex associated with the specified address.
+ * The returned mutex is a leaf level mutex, meaning that if you obtain it
+ * you cannot obtain any other mutexes until you release it. You can
+ * legally msleep() on the mutex.
+ */
+struct mtx *
+mtx_pool_find(struct mtx_pool *pool, void *ptr)
+{
+ int p;
+
+ KASSERT(pool != NULL, ("_mtx_pool_find(): null pool"));
+ /*
+ * Fibonacci hash, see Knuth's
+ * _Art of Computer Programming, Volume 3 / Sorting and Searching_
+ */
+ p = ((HASH_MULTIPLIER * (uintptr_t)ptr) >> pool->mtx_pool_shift) &
+ pool->mtx_pool_mask;
+ return (&pool->mtx_pool_ary[p]);
+}
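+
+/*
+ * Usage sketch (illustrative): hash an object pointer to a shared pool
+ * mutex and sleep on it while waiting for a condition.  The object "obj"
+ * and its "ready" flag are hypothetical; mtxpool_sleep is the dynamic
+ * pool created by mtx_pool_setup_dynamic() below.
+ *
+ *	struct mtx *mtxp;
+ *
+ *	mtxp = mtx_pool_find(mtxpool_sleep, obj);
+ *	mtx_lock(mtxp);
+ *	while (!obj->ready)
+ *		msleep(obj, mtxp, PWAIT, "objrdy", 0);
+ *	mtx_unlock(mtxp);
+ */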
+
+static void
+mtx_pool_initialize(struct mtx_pool *pool, const char *mtx_name, int pool_size,
+ int opts)
+{
+ int i, maskbits;
+
+ pool->mtx_pool_size = pool_size;
+ pool->mtx_pool_mask = pool_size - 1;
+ for (i = 1, maskbits = 0; (i & pool_size) == 0; i = i << 1)
+ maskbits++;
+ pool->mtx_pool_shift = POINTER_BITS - maskbits;
+ pool->mtx_pool_next = 0;
+ for (i = 0; i < pool_size; ++i)
+ mtx_init(&pool->mtx_pool_ary[i], mtx_name, NULL, opts);
+}
+
+struct mtx_pool *
+mtx_pool_create(const char *mtx_name, int pool_size, int opts)
+{
+ struct mtx_pool *pool;
+
+ if (pool_size <= 0 || !powerof2(pool_size)) {
+ printf("WARNING: %s pool size is not a power of 2.\n",
+ mtx_name);
+ pool_size = 128;
+ }
+ pool = malloc(sizeof (struct mtx_pool) +
+ ((pool_size - 1) * sizeof (struct mtx)),
+ M_MTXPOOL, M_WAITOK | M_ZERO);
+ mtx_pool_initialize(pool, mtx_name, pool_size, opts);
+ return pool;
+}
+
+void
+mtx_pool_destroy(struct mtx_pool **poolp)
+{
+ int i;
+ struct mtx_pool *pool = *poolp;
+
+ for (i = pool->mtx_pool_size - 1; i >= 0; --i)
+ mtx_destroy(&pool->mtx_pool_ary[i]);
+ free(pool, M_MTXPOOL);
+ *poolp = NULL;
+}
+
+static void
+mtx_pool_setup_static(void *dummy __unused)
+{
+ mtx_pool_initialize((struct mtx_pool *)&lockbuilder_pool,
+ "lockbuilder mtxpool", MTX_POOL_LOCKBUILDER_SIZE,
+ MTX_DEF | MTX_NOWITNESS | MTX_QUIET);
+ mtxpool_lockbuilder = (struct mtx_pool *)&lockbuilder_pool;
+}
+
+static void
+mtx_pool_setup_dynamic(void *dummy __unused)
+{
+ mtxpool_sleep = mtx_pool_create("sleep mtxpool",
+ MTX_POOL_SLEEP_SIZE, MTX_DEF);
+}
+
+/*
+ * Obtain a (shared) mutex from the pool. The returned mutex is a leaf
+ * level mutex, meaning that if you obtain it you cannot obtain any other
+ * mutexes until you release it. You can legally msleep() on the mutex.
+ */
+struct mtx *
+mtx_pool_alloc(struct mtx_pool *pool)
+{
+ int i;
+
+ KASSERT(pool != NULL, ("mtx_pool_alloc(): null pool"));
+ /*
+ * mtx_pool_next is unprotected against multiple accesses,
+ * but simultaneous access by two CPUs should not be very
+ * harmful.
+ */
+ i = pool->mtx_pool_next;
+ pool->mtx_pool_next = (i + 1) & pool->mtx_pool_mask;
+ return (&pool->mtx_pool_ary[i]);
+}
+
+/*
+ * The lockbuilder pool must be initialized early because the lockmgr
+ * and sx locks depend on it. The sx locks are used in the kernel
+ * memory allocator. The lockmgr subsystem is initialized by
+ * SYSINIT(..., SI_SUB_LOCKMGR, ...).
+ *
+ * We can't call malloc() to dynamically allocate the sleep pool
+ * until after kmeminit() has been called, which is done by
+ * SYSINIT(..., SI_SUB_KMEM, ...).
+ */
+SYSINIT(mtxpooli1, SI_SUB_MTX_POOL_STATIC, SI_ORDER_FIRST,
+ mtx_pool_setup_static, NULL);
+SYSINIT(mtxpooli2, SI_SUB_MTX_POOL_DYNAMIC, SI_ORDER_FIRST,
+ mtx_pool_setup_dynamic, NULL);
diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
new file mode 100644
index 0000000..cd1ed7d
--- /dev/null
+++ b/sys/kern/kern_mutex.c
@@ -0,0 +1,1009 @@
+/*-
+ * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Berkeley Software Design Inc's name may not be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
+ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
+ */
+
+/*
+ * Machine independent bits of mutex implementation.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_adaptive_mutexes.h"
+#include "opt_ddb.h"
+#include "opt_global.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
+#include "opt_sched.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/turnstile.h>
+#include <sys/vmmeter.h>
+#include <sys/lock_profile.h>
+
+#include <machine/atomic.h>
+#include <machine/bus.h>
+#include <machine/cpu.h>
+
+#include <ddb/ddb.h>
+
+#include <fs/devfs/devfs_int.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+#if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES)
+#define ADAPTIVE_MUTEXES
+#endif
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+PMC_SOFT_DEFINE( , , lock, failed);
+#endif
+
+/*
+ * Return the mutex address when the lock cookie address is provided.
+ * This functionality assumes that struct mtx* have a member named mtx_lock.
+ */
+#define mtxlock2mtx(c) (__containerof(c, struct mtx, mtx_lock))
+
+/*
+ * Internal utility macros.
+ */
+#define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED)
+
+#define mtx_destroyed(m) ((m)->mtx_lock == MTX_DESTROYED)
+
+#define mtx_owner(m) ((struct thread *)((m)->mtx_lock & ~MTX_FLAGMASK))
+
+static void assert_mtx(const struct lock_object *lock, int what);
+#ifdef DDB
+static void db_show_mtx(const struct lock_object *lock);
+#endif
+static void lock_mtx(struct lock_object *lock, int how);
+static void lock_spin(struct lock_object *lock, int how);
+#ifdef KDTRACE_HOOKS
+static int owner_mtx(const struct lock_object *lock,
+ struct thread **owner);
+#endif
+static int unlock_mtx(struct lock_object *lock);
+static int unlock_spin(struct lock_object *lock);
+
+/*
+ * Lock classes for sleep and spin mutexes.
+ */
+struct lock_class lock_class_mtx_sleep = {
+ .lc_name = "sleep mutex",
+ .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE,
+ .lc_assert = assert_mtx,
+#ifdef DDB
+ .lc_ddb_show = db_show_mtx,
+#endif
+ .lc_lock = lock_mtx,
+ .lc_unlock = unlock_mtx,
+#ifdef KDTRACE_HOOKS
+ .lc_owner = owner_mtx,
+#endif
+};
+struct lock_class lock_class_mtx_spin = {
+ .lc_name = "spin mutex",
+ .lc_flags = LC_SPINLOCK | LC_RECURSABLE,
+ .lc_assert = assert_mtx,
+#ifdef DDB
+ .lc_ddb_show = db_show_mtx,
+#endif
+ .lc_lock = lock_spin,
+ .lc_unlock = unlock_spin,
+#ifdef KDTRACE_HOOKS
+ .lc_owner = owner_mtx,
+#endif
+};
+
+/*
+ * System-wide mutexes
+ */
+struct mtx blocked_lock;
+struct mtx Giant;
+
+void
+assert_mtx(const struct lock_object *lock, int what)
+{
+
+ mtx_assert((const struct mtx *)lock, what);
+}
+
+void
+lock_mtx(struct lock_object *lock, int how)
+{
+
+ mtx_lock((struct mtx *)lock);
+}
+
+void
+lock_spin(struct lock_object *lock, int how)
+{
+
+ panic("spin locks can only use msleep_spin");
+}
+
+int
+unlock_mtx(struct lock_object *lock)
+{
+ struct mtx *m;
+
+ m = (struct mtx *)lock;
+ mtx_assert(m, MA_OWNED | MA_NOTRECURSED);
+ mtx_unlock(m);
+ return (0);
+}
+
+int
+unlock_spin(struct lock_object *lock)
+{
+
+ panic("spin locks can only use msleep_spin");
+}
+
+#ifdef KDTRACE_HOOKS
+int
+owner_mtx(const struct lock_object *lock, struct thread **owner)
+{
+ const struct mtx *m = (const struct mtx *)lock;
+
+ *owner = mtx_owner(m);
+ return (mtx_unowned(m) == 0);
+}
+#endif
+
+/*
+ * Function versions of the inlined __mtx_* macros. These are used by
+ * modules and can also be called from assembly language if needed.
+ */
+void
+__mtx_lock_flags(volatile uintptr_t *c, int opts, const char *file, int line)
+{
+ struct mtx *m;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ m = mtxlock2mtx(c);
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("mtx_lock() by idle thread %p on sleep mutex %s @ %s:%d",
+ curthread, m->lock_object.lo_name, file, line));
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("mtx_lock() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
+ ("mtx_lock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
+ file, line));
+ WITNESS_CHECKORDER(&m->lock_object, (opts & ~MTX_RECURSE) |
+ LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL);
+
+ __mtx_lock(m, curthread, opts, file, line);
+ LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
+ line);
+ WITNESS_LOCK(&m->lock_object, (opts & ~MTX_RECURSE) | LOP_EXCLUSIVE,
+ file, line);
+ curthread->td_locks++;
+}
+
+void
+__mtx_unlock_flags(volatile uintptr_t *c, int opts, const char *file, int line)
+{
+ struct mtx *m;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ m = mtxlock2mtx(c);
+
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("mtx_unlock() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
+ ("mtx_unlock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
+ file, line));
+ WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file,
+ line);
+ mtx_assert(m, MA_OWNED);
+
+ if (m->mtx_recurse == 0)
+ LOCKSTAT_PROFILE_RELEASE_LOCK(LS_MTX_UNLOCK_RELEASE, m);
+ __mtx_unlock(m, curthread, opts, file, line);
+ curthread->td_locks--;
+}
+
+void
+__mtx_lock_spin_flags(volatile uintptr_t *c, int opts, const char *file,
+ int line)
+{
+ struct mtx *m;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ m = mtxlock2mtx(c);
+
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("mtx_lock_spin() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
+ ("mtx_lock_spin() of sleep mutex %s @ %s:%d",
+ m->lock_object.lo_name, file, line));
+ if (mtx_owned(m))
+ KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 ||
+ (opts & MTX_RECURSE) != 0,
+ ("mtx_lock_spin: recursed on non-recursive mutex %s @ %s:%d\n",
+ m->lock_object.lo_name, file, line));
+ opts &= ~MTX_RECURSE;
+ WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE,
+ file, line, NULL);
+ __mtx_lock_spin(m, curthread, opts, file, line);
+ LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
+ line);
+ WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
+}
+
+void
+__mtx_unlock_spin_flags(volatile uintptr_t *c, int opts, const char *file,
+ int line)
+{
+ struct mtx *m;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ m = mtxlock2mtx(c);
+
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("mtx_unlock_spin() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
+ ("mtx_unlock_spin() of sleep mutex %s @ %s:%d",
+ m->lock_object.lo_name, file, line));
+ WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file,
+ line);
+ mtx_assert(m, MA_OWNED);
+
+ __mtx_unlock_spin(m);
+}
+
+/*
+ * The important part of mtx_trylock{,_flags}()
+ * Tries to acquire lock `m.' If this function is called on a mutex that
+ * is already owned, it will recursively acquire the lock.
+ */
+int
+_mtx_trylock_flags_(volatile uintptr_t *c, int opts, const char *file, int line)
+{
+ struct mtx *m;
+#ifdef LOCK_PROFILING
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+ int rval;
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+ m = mtxlock2mtx(c);
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("mtx_trylock() by idle thread %p on sleep mutex %s @ %s:%d",
+ curthread, m->lock_object.lo_name, file, line));
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("mtx_trylock() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
+ ("mtx_trylock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
+ file, line));
+
+ if (mtx_owned(m) && ((m->lock_object.lo_flags & LO_RECURSABLE) != 0 ||
+ (opts & MTX_RECURSE) != 0)) {
+ m->mtx_recurse++;
+ atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
+ rval = 1;
+ } else
+ rval = _mtx_obtain_lock(m, (uintptr_t)curthread);
+ opts &= ~MTX_RECURSE;
+
+ LOCK_LOG_TRY("LOCK", &m->lock_object, opts, rval, file, line);
+ if (rval) {
+ WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK,
+ file, line);
+ curthread->td_locks++;
+ if (m->mtx_recurse == 0)
+ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_LOCK_ACQUIRE,
+ m, contested, waittime, file, line);
+
+ }
+
+ return (rval);
+}
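+
+/*
+ * Usage sketch (illustrative): mtx_trylock() is the usual front end for
+ * the function above.  A common pattern is backing off to avoid a lock
+ * order reversal: while holding "a", try "b" and, if that fails, drop
+ * "a" and take the two locks in the documented order.  Both mutexes are
+ * hypothetical.
+ *
+ *	mtx_lock(&a);
+ *	if (!mtx_trylock(&b)) {
+ *		mtx_unlock(&a);
+ *		mtx_lock(&b);
+ *		mtx_lock(&a);
+ *	}
+ *	... both locks are now held ...
+ *	mtx_unlock(&b);
+ *	mtx_unlock(&a);
+ */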
+
+/*
+ * __mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock.
+ *
+ * We call this if the lock is either contested (i.e. we need to go to
+ * sleep waiting for it), or if we need to recurse on it.
+ */
+void
+__mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts,
+ const char *file, int line)
+{
+ struct mtx *m;
+ struct turnstile *ts;
+ uintptr_t v;
+#ifdef ADAPTIVE_MUTEXES
+ volatile struct thread *owner;
+#endif
+#ifdef KTR
+ int cont_logged = 0;
+#endif
+#ifdef LOCK_PROFILING
+ int contested = 0;
+ uint64_t waittime = 0;
+#endif
+#ifdef KDTRACE_HOOKS
+ uint64_t spin_cnt = 0;
+ uint64_t sleep_cnt = 0;
+ int64_t sleep_time = 0;
+#endif
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ m = mtxlock2mtx(c);
+
+ if (mtx_owned(m)) {
+ KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 ||
+ (opts & MTX_RECURSE) != 0,
+ ("_mtx_lock_sleep: recursed on non-recursive mutex %s @ %s:%d\n",
+ m->lock_object.lo_name, file, line));
+ opts &= ~MTX_RECURSE;
+ m->mtx_recurse++;
+ atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
+ CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m);
+ return;
+ }
+ opts &= ~MTX_RECURSE;
+
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&m->lock_object,
+ &contested, &waittime);
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
+ CTR4(KTR_LOCK,
+ "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d",
+ m->lock_object.lo_name, (void *)m->mtx_lock, file, line);
+
+ while (!_mtx_obtain_lock(m, tid)) {
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+#ifdef ADAPTIVE_MUTEXES
+ /*
+ * If the owner is running on another CPU, spin until the
+ * owner stops running or the state of the lock changes.
+ */
+ v = m->mtx_lock;
+ if (v != MTX_UNOWNED) {
+ owner = (struct thread *)(v & ~MTX_FLAGMASK);
+ if (TD_IS_RUNNING(owner)) {
+ if (LOCK_LOG_TEST(&m->lock_object, 0))
+ CTR3(KTR_LOCK,
+ "%s: spinning on %p held by %p",
+ __func__, m, owner);
+ while (mtx_owner(m) == owner &&
+ TD_IS_RUNNING(owner)) {
+ cpu_spinwait();
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ }
+ continue;
+ }
+ }
+#endif
+
+ ts = turnstile_trywait(&m->lock_object);
+ v = m->mtx_lock;
+
+ /*
+ * Check if the lock has been released while spinning for
+ * the turnstile chain lock.
+ */
+ if (v == MTX_UNOWNED) {
+ turnstile_cancel(ts);
+ continue;
+ }
+
+#ifdef ADAPTIVE_MUTEXES
+ /*
+ * The current lock owner might have started executing
+ * on another CPU (or the lock could have changed
+ * owners) while we were waiting on the turnstile
+ * chain lock. If so, drop the turnstile lock and try
+ * again.
+ */
+ owner = (struct thread *)(v & ~MTX_FLAGMASK);
+ if (TD_IS_RUNNING(owner)) {
+ turnstile_cancel(ts);
+ continue;
+ }
+#endif
+
+ /*
+ * If the mutex isn't already contested and a failure occurs
+ * setting the contested bit, the mutex was either released
+ * or the state of the MTX_RECURSED bit changed.
+ */
+ if ((v & MTX_CONTESTED) == 0 &&
+ !atomic_cmpset_ptr(&m->mtx_lock, v, v | MTX_CONTESTED)) {
+ turnstile_cancel(ts);
+ continue;
+ }
+
+ /*
+ * We definitely must sleep for this lock.
+ */
+ mtx_assert(m, MA_NOTOWNED);
+
+#ifdef KTR
+ if (!cont_logged) {
+ CTR6(KTR_CONTENTION,
+ "contention: %p at %s:%d wants %s, taken by %s:%d",
+ (void *)tid, file, line, m->lock_object.lo_name,
+ WITNESS_FILE(&m->lock_object),
+ WITNESS_LINE(&m->lock_object));
+ cont_logged = 1;
+ }
+#endif
+
+ /*
+ * Block on the turnstile.
+ */
+#ifdef KDTRACE_HOOKS
+ sleep_time -= lockstat_nsecs();
+#endif
+ turnstile_wait(ts, mtx_owner(m), TS_EXCLUSIVE_QUEUE);
+#ifdef KDTRACE_HOOKS
+ sleep_time += lockstat_nsecs();
+ sleep_cnt++;
+#endif
+ }
+#ifdef KTR
+ if (cont_logged) {
+ CTR4(KTR_CONTENTION,
+ "contention end: %s acquired by %p at %s:%d",
+ m->lock_object.lo_name, (void *)tid, file, line);
+ }
+#endif
+ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_LOCK_ACQUIRE, m, contested,
+ waittime, file, line);
+#ifdef KDTRACE_HOOKS
+ if (sleep_time)
+ LOCKSTAT_RECORD1(LS_MTX_LOCK_BLOCK, m, sleep_time);
+
+ /*
+ * Only record the loops spinning and not sleeping.
+ */
+ if (spin_cnt > sleep_cnt)
+ LOCKSTAT_RECORD1(LS_MTX_LOCK_SPIN, m, (spin_cnt - sleep_cnt));
+#endif
+}
+
+static void
+_mtx_lock_spin_failed(struct mtx *m)
+{
+ struct thread *td;
+
+ td = mtx_owner(m);
+
+ /* If the mutex is unlocked, try again. */
+ if (td == NULL)
+ return;
+
+ printf( "spin lock %p (%s) held by %p (tid %d) too long\n",
+ m, m->lock_object.lo_name, td, td->td_tid);
+#ifdef WITNESS
+ witness_display_spinlock(&m->lock_object, td, printf);
+#endif
+ panic("spin lock held too long");
+}
+
+#ifdef SMP
+/*
+ * _mtx_lock_spin_cookie: the tougher part of acquiring an MTX_SPIN lock.
+ *
+ * This is only called if we need to actually spin for the lock. Recursion
+ * is handled inline.
+ */
+void
+_mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t tid, int opts,
+ const char *file, int line)
+{
+ struct mtx *m;
+ int i = 0;
+#ifdef LOCK_PROFILING
+ int contested = 0;
+ uint64_t waittime = 0;
+#endif
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ m = mtxlock2mtx(c);
+
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
+ CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m);
+
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime);
+ while (!_mtx_obtain_lock(m, tid)) {
+
+ /* Give interrupts a chance while we spin. */
+ spinlock_exit();
+ while (m->mtx_lock != MTX_UNOWNED) {
+ if (i++ < 10000000) {
+ cpu_spinwait();
+ continue;
+ }
+ if (i < 60000000 || kdb_active || panicstr != NULL)
+ DELAY(1);
+ else
+ _mtx_lock_spin_failed(m);
+ cpu_spinwait();
+ }
+ spinlock_enter();
+ }
+
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
+ CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m);
+
+ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_SPIN_LOCK_ACQUIRE, m,
+ contested, waittime, (file), (line));
+ LOCKSTAT_RECORD1(LS_MTX_SPIN_LOCK_SPIN, m, i);
+}
+#endif /* SMP */
+
+void
+thread_lock_flags_(struct thread *td, int opts, const char *file, int line)
+{
+ struct mtx *m;
+ uintptr_t tid;
+ int i;
+#ifdef LOCK_PROFILING
+ int contested = 0;
+ uint64_t waittime = 0;
+#endif
+#ifdef KDTRACE_HOOKS
+ uint64_t spin_cnt = 0;
+#endif
+
+ i = 0;
+ tid = (uintptr_t)curthread;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ for (;;) {
+retry:
+ spinlock_enter();
+ m = td->td_lock;
+ KASSERT(m->mtx_lock != MTX_DESTROYED,
+ ("thread_lock() of destroyed mutex @ %s:%d", file, line));
+ KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
+ ("thread_lock() of sleep mutex %s @ %s:%d",
+ m->lock_object.lo_name, file, line));
+ if (mtx_owned(m))
+ KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0,
+ ("thread_lock: recursed on non-recursive mutex %s @ %s:%d\n",
+ m->lock_object.lo_name, file, line));
+ WITNESS_CHECKORDER(&m->lock_object,
+ opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL);
+ while (!_mtx_obtain_lock(m, tid)) {
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ if (m->mtx_lock == tid) {
+ m->mtx_recurse++;
+ break;
+ }
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&m->lock_object,
+ &contested, &waittime);
+ /* Give interrupts a chance while we spin. */
+ spinlock_exit();
+ while (m->mtx_lock != MTX_UNOWNED) {
+ if (i++ < 10000000)
+ cpu_spinwait();
+ else if (i < 60000000 ||
+ kdb_active || panicstr != NULL)
+ DELAY(1);
+ else
+ _mtx_lock_spin_failed(m);
+ cpu_spinwait();
+ if (m != td->td_lock)
+ goto retry;
+ }
+ spinlock_enter();
+ }
+ if (m == td->td_lock)
+ break;
+ __mtx_unlock_spin(m); /* does spinlock_exit() */
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ }
+ if (m->mtx_recurse == 0)
+ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_SPIN_LOCK_ACQUIRE,
+ m, contested, waittime, (file), (line));
+ LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
+ line);
+ WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
+ LOCKSTAT_RECORD1(LS_THREAD_LOCK_SPIN, m, spin_cnt);
+}
+
+struct mtx *
+thread_lock_block(struct thread *td)
+{
+ struct mtx *lock;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ lock = td->td_lock;
+ td->td_lock = &blocked_lock;
+ mtx_unlock_spin(lock);
+
+ return (lock);
+}
+
+void
+thread_lock_unblock(struct thread *td, struct mtx *new)
+{
+ mtx_assert(new, MA_OWNED);
+ MPASS(td->td_lock == &blocked_lock);
+ atomic_store_rel_ptr((volatile void *)&td->td_lock, (uintptr_t)new);
+}
+
+void
+thread_lock_set(struct thread *td, struct mtx *new)
+{
+ struct mtx *lock;
+
+ mtx_assert(new, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ lock = td->td_lock;
+ td->td_lock = new;
+ mtx_unlock_spin(lock);
+}
+
+/*
+ * __mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock.
+ *
+ * We are only called here if the lock is recursed or contested (i.e. we
+ * need to wake up a blocked thread).
+ */
+void
+__mtx_unlock_sleep(volatile uintptr_t *c, int opts, const char *file, int line)
+{
+ struct mtx *m;
+ struct turnstile *ts;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ m = mtxlock2mtx(c);
+
+ if (mtx_recursed(m)) {
+ if (--(m->mtx_recurse) == 0)
+ atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED);
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
+ CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m);
+ return;
+ }
+
+ /*
+ * We have to lock the chain before the turnstile so this turnstile
+ * can be removed from the hash list if it is empty.
+ */
+ turnstile_chain_lock(&m->lock_object);
+ ts = turnstile_lookup(&m->lock_object);
+ if (LOCK_LOG_TEST(&m->lock_object, opts))
+ CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m);
+ MPASS(ts != NULL);
+ turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
+ _mtx_release_lock_quick(m);
+
+ /*
+ * This turnstile is now no longer associated with the mutex. We can
+	 * unlock the chain lock so a new turnstile may take its place.
+ */
+ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
+ turnstile_chain_unlock(&m->lock_object);
+}
+
+/*
+ * All the unlocking of MTX_SPIN locks is done inline.
+ * See the __mtx_unlock_spin() macro for the details.
+ */
+
+/*
+ * The backing function for the INVARIANTS-enabled mtx_assert()
+ */
+#ifdef INVARIANT_SUPPORT
+void
+__mtx_assert(const volatile uintptr_t *c, int what, const char *file, int line)
+{
+ const struct mtx *m;
+
+ if (panicstr != NULL || dumping)
+ return;
+
+ m = mtxlock2mtx(c);
+
+ switch (what) {
+ case MA_OWNED:
+ case MA_OWNED | MA_RECURSED:
+ case MA_OWNED | MA_NOTRECURSED:
+ if (!mtx_owned(m))
+ panic("mutex %s not owned at %s:%d",
+ m->lock_object.lo_name, file, line);
+ if (mtx_recursed(m)) {
+ if ((what & MA_NOTRECURSED) != 0)
+ panic("mutex %s recursed at %s:%d",
+ m->lock_object.lo_name, file, line);
+ } else if ((what & MA_RECURSED) != 0) {
+ panic("mutex %s unrecursed at %s:%d",
+ m->lock_object.lo_name, file, line);
+ }
+ break;
+ case MA_NOTOWNED:
+ if (mtx_owned(m))
+ panic("mutex %s owned at %s:%d",
+ m->lock_object.lo_name, file, line);
+ break;
+ default:
+ panic("unknown mtx_assert at %s:%d", file, line);
+ }
+}
+#endif
+
+/*
+ * The MUTEX_DEBUG-enabled mtx_validate()
+ *
+ * Most of these checks have been moved off into the LO_INITIALIZED flag
+ * maintained by the witness code.
+ */
+#ifdef MUTEX_DEBUG
+
+void mtx_validate(struct mtx *);
+
+void
+mtx_validate(struct mtx *m)
+{
+
+/*
+ * XXX: When kernacc() does not require Giant we can reenable this check
+ */
+#ifdef notyet
+ /*
+ * Can't call kernacc() from early init386(), especially when
+ * initializing Giant mutex, because some stuff in kernacc()
+ * requires Giant itself.
+ */
+ if (!cold)
+ if (!kernacc((caddr_t)m, sizeof(m),
+ VM_PROT_READ | VM_PROT_WRITE))
+ panic("Can't read and write to mutex %p", m);
+#endif
+}
+#endif
+
+/*
+ * General init routine used by the MTX_SYSINIT() macro.
+ */
+void
+mtx_sysinit(void *arg)
+{
+ struct mtx_args *margs = arg;
+
+ mtx_init((struct mtx *)margs->ma_mtx, margs->ma_desc, NULL,
+ margs->ma_opts);
+}
+
+/*
+ * Mutex initialization routine; initialize lock `m' of type contained in
+ * `opts' with options contained in `opts' and name `name.' The optional
+ * lock type `type' is used as a general lock category name for use with
+ * witness.
+ */
+void
+_mtx_init(volatile uintptr_t *c, const char *name, const char *type, int opts)
+{
+ struct mtx *m;
+ struct lock_class *class;
+ int flags;
+
+ m = mtxlock2mtx(c);
+
+ MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE |
+ MTX_NOWITNESS | MTX_DUPOK | MTX_NOPROFILE)) == 0);
+ ASSERT_ATOMIC_LOAD_PTR(m->mtx_lock,
+ ("%s: mtx_lock not aligned for %s: %p", __func__, name,
+ &m->mtx_lock));
+
+#ifdef MUTEX_DEBUG
+ /* Diagnostic and error correction */
+ mtx_validate(m);
+#endif
+
+ /* Determine lock class and lock flags. */
+ if (opts & MTX_SPIN)
+ class = &lock_class_mtx_spin;
+ else
+ class = &lock_class_mtx_sleep;
+ flags = 0;
+ if (opts & MTX_QUIET)
+ flags |= LO_QUIET;
+ if (opts & MTX_RECURSE)
+ flags |= LO_RECURSABLE;
+ if ((opts & MTX_NOWITNESS) == 0)
+ flags |= LO_WITNESS;
+ if (opts & MTX_DUPOK)
+ flags |= LO_DUPOK;
+ if (opts & MTX_NOPROFILE)
+ flags |= LO_NOPROFILE;
+
+ /* Initialize mutex. */
+ lock_init(&m->lock_object, class, name, type, flags);
+
+ m->mtx_lock = MTX_UNOWNED;
+ m->mtx_recurse = 0;
+}
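+
+/*
+ * Usage sketch (illustrative): typical lifecycle of a sleep mutex
+ * protecting a hypothetical softc.  Statically allocated mutexes can use
+ * the MTX_SYSINIT() macro instead, which ends up in mtx_sysinit() above.
+ *
+ *	struct example_softc {
+ *		struct mtx	sc_mtx;
+ *		int		sc_count;
+ *	};
+ *
+ *	mtx_init(&sc->sc_mtx, "example softc", NULL, MTX_DEF);
+ *	...
+ *	mtx_lock(&sc->sc_mtx);
+ *	mtx_assert(&sc->sc_mtx, MA_OWNED);
+ *	sc->sc_count++;
+ *	mtx_unlock(&sc->sc_mtx);
+ *	...
+ *	mtx_destroy(&sc->sc_mtx);
+ */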
+
+/*
+ * Remove lock `m' from all_mtx queue. We don't allow MTX_QUIET to be
+ * passed in as a flag here because if the corresponding mtx_init() was
+ * called with MTX_QUIET set, then it will already be set in the mutex's
+ * flags.
+ */
+void
+_mtx_destroy(volatile uintptr_t *c)
+{
+ struct mtx *m;
+
+ m = mtxlock2mtx(c);
+
+ if (!mtx_owned(m))
+ MPASS(mtx_unowned(m));
+ else {
+ MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0);
+
+ /* Perform the non-mtx related part of mtx_unlock_spin(). */
+ if (LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin)
+ spinlock_exit();
+ else
+ curthread->td_locks--;
+
+ lock_profile_release_lock(&m->lock_object);
+ /* Tell witness this isn't locked to make it happy. */
+ WITNESS_UNLOCK(&m->lock_object, LOP_EXCLUSIVE, __FILE__,
+ __LINE__);
+ }
+
+ m->mtx_lock = MTX_DESTROYED;
+ lock_destroy(&m->lock_object);
+}
+
+/*
+ * Initialize the mutex code and system mutexes.  This is called from the MD
+ * startup code prior to mi_startup(). The per-CPU data space needs to be
+ * setup before this is called.
+ */
+void
+mutex_init(void)
+{
+
+ /* Setup turnstiles so that sleep mutexes work. */
+ init_turnstiles();
+
+ /*
+ * Initialize mutexes.
+ */
+ mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE);
+ mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN);
+ blocked_lock.mtx_lock = 0xdeadc0de; /* Always blocked. */
+ mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
+ mtx_init(&proc0.p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE);
+ mtx_init(&devmtx, "cdev", NULL, MTX_DEF);
+ mtx_lock(&Giant);
+}
+
+#ifdef DDB
+void
+db_show_mtx(const struct lock_object *lock)
+{
+ struct thread *td;
+ const struct mtx *m;
+
+ m = (const struct mtx *)lock;
+
+ db_printf(" flags: {");
+ if (LOCK_CLASS(lock) == &lock_class_mtx_spin)
+ db_printf("SPIN");
+ else
+ db_printf("DEF");
+ if (m->lock_object.lo_flags & LO_RECURSABLE)
+ db_printf(", RECURSE");
+ if (m->lock_object.lo_flags & LO_DUPOK)
+ db_printf(", DUPOK");
+ db_printf("}\n");
+ db_printf(" state: {");
+ if (mtx_unowned(m))
+ db_printf("UNOWNED");
+ else if (mtx_destroyed(m))
+ db_printf("DESTROYED");
+ else {
+ db_printf("OWNED");
+ if (m->mtx_lock & MTX_CONTESTED)
+ db_printf(", CONTESTED");
+ if (m->mtx_lock & MTX_RECURSED)
+ db_printf(", RECURSED");
+ }
+ db_printf("}\n");
+ if (!mtx_unowned(m) && !mtx_destroyed(m)) {
+ td = mtx_owner(m);
+ db_printf(" owner: %p (tid %d, pid %d, \"%s\")\n", td,
+ td->td_tid, td->td_proc->p_pid, td->td_name);
+ if (mtx_recursed(m))
+ db_printf(" recursed: %d\n", m->mtx_recurse);
+ }
+}
+#endif
diff --git a/sys/kern/kern_ntptime.c b/sys/kern/kern_ntptime.c
new file mode 100644
index 0000000..7c95575
--- /dev/null
+++ b/sys/kern/kern_ntptime.c
@@ -0,0 +1,1055 @@
+/*-
+ ***********************************************************************
+ * *
+ * Copyright (c) David L. Mills 1993-2001 *
+ * *
+ * Permission to use, copy, modify, and distribute this software and *
+ * its documentation for any purpose and without fee is hereby *
+ * granted, provided that the above copyright notice appears in all *
+ * copies and that both the copyright notice and this permission *
+ * notice appear in supporting documentation, and that the name *
+ * University of Delaware not be used in advertising or publicity *
+ * pertaining to distribution of the software without specific, *
+ * written prior permission. The University of Delaware makes no *
+ * representations about the suitability this software for any *
+ * purpose. It is provided "as is" without express or implied *
+ * warranty. *
+ * *
+ **********************************************************************/
+
+/*
+ * Adapted from the original sources for FreeBSD and timecounters by:
+ * Poul-Henning Kamp <phk@FreeBSD.org>.
+ *
+ * The 32bit version of the "LP" macros seems a bit past its "sell by"
+ * date so I have retained only the 64bit version and included it directly
+ * in this file.
+ *
+ * Only minor changes done to interface with the timecounters over in
+ * sys/kern/kern_clock.c. Some of the comments below may be (even more)
+ * confusing and/or plain wrong in that context.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ntp.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <sys/timetc.h>
+#include <sys/timepps.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+
+#ifdef PPS_SYNC
+FEATURE(pps_sync, "Support usage of external PPS signal by kernel PLL");
+#endif
+
+/*
+ * Single-precision macros for 64-bit machines
+ */
+typedef int64_t l_fp;
+#define L_ADD(v, u) ((v) += (u))
+#define L_SUB(v, u) ((v) -= (u))
+#define L_ADDHI(v, a) ((v) += (int64_t)(a) << 32)
+#define L_NEG(v) ((v) = -(v))
+#define L_RSHIFT(v, n) \
+ do { \
+ if ((v) < 0) \
+ (v) = -(-(v) >> (n)); \
+ else \
+ (v) = (v) >> (n); \
+ } while (0)
+#define L_MPY(v, a) ((v) *= (a))
+#define L_CLR(v) ((v) = 0)
+#define L_ISNEG(v) ((v) < 0)
+#define L_LINT(v, a) ((v) = (int64_t)(a) << 32)
+#define L_GINT(v) ((v) < 0 ? -(-(v) >> 32) : (v) >> 32)
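+
+/*
+ * Worked example (illustrative) of the fixed-point format above: loading
+ * a 500 us offset with L_LINT(v, 500000) stores 500000 ns scaled by 2^32.
+ * L_RSHIFT(v, SHIFT_PLL), with SHIFT_PLL == 4 as defined below, divides
+ * it by 16, and L_GINT(v) recovers the integer nanosecond part.
+ *
+ *	l_fp v;
+ *	long ns;
+ *
+ *	L_LINT(v, 500000);		500000 ns stored as 500000 << 32
+ *	L_RSHIFT(v, SHIFT_PLL);		500000 >> 4 == 31250 ns
+ *	ns = L_GINT(v);			ns is now 31250
+ */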
+
+/*
+ * Generic NTP kernel interface
+ *
+ * These routines constitute the Network Time Protocol (NTP) interfaces
+ * for user and daemon application programs. The ntp_gettime() routine
+ * provides the time, maximum error (synch distance) and estimated error
+ * (dispersion) to client user application programs. The ntp_adjtime()
+ * routine is used by the NTP daemon to adjust the system clock to an
+ * externally derived time. The time offset and related variables set by
+ * this routine are used by other routines in this module to adjust the
+ * phase and frequency of the clock discipline loop which controls the
+ * system clock.
+ *
+ * When the kernel time is reckoned directly in nanoseconds (NTP_NANO
+ * defined), the time at each tick interrupt is derived directly from
+ * the kernel time variable. When the kernel time is reckoned in
+ * microseconds, (NTP_NANO undefined), the time is derived from the
+ * kernel time variable together with a variable representing the
+ * leftover nanoseconds at the last tick interrupt. In either case, the
+ * current nanosecond time is reckoned from these values plus an
+ * interpolated value derived by the clock routines in another
+ * architecture-specific module. The interpolation can use either a
+ * dedicated counter or a processor cycle counter (PCC) implemented in
+ * some architectures.
+ *
+ * Note that all routines must run at priority splclock or higher.
+ */
+/*
+ * Phase/frequency-lock loop (PLL/FLL) definitions
+ *
+ * The nanosecond clock discipline uses two variable types, time
+ * variables and frequency variables. Both types are represented as 64-
+ * bit fixed-point quantities with the decimal point between two 32-bit
+ * halves. On a 32-bit machine, each half is represented as a single
+ * word and mathematical operations are done using multiple-precision
+ * arithmetic. On a 64-bit machine, ordinary computer arithmetic is
+ * used.
+ *
+ * A time variable is a signed 64-bit fixed-point number in ns and
+ * fraction. It represents the remaining time offset to be amortized
+ * over succeeding tick interrupts. The maximum time offset is about
+ * 0.5 s and the resolution is about 2.3e-10 ns.
+ *
+ * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |s s s| ns |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | fraction |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * A frequency variable is a signed 64-bit fixed-point number in ns/s
+ * and fraction. It represents the ns and fraction to be added to the
+ * kernel time variable at each second. The maximum frequency offset is
+ * about +-500000 ns/s and the resolution is about 2.3e-10 ns/s.
+ *
+ * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |s s s s s s s s s s s s s| ns/s |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | fraction |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+/*
+ * The following variables establish the state of the PLL/FLL and the
+ * residual time and frequency offset of the local clock.
+ */
+#define SHIFT_PLL 4 /* PLL loop gain (shift) */
+#define SHIFT_FLL 2 /* FLL loop gain (shift) */
+
+static int time_state = TIME_OK; /* clock state */
+int time_status = STA_UNSYNC; /* clock status bits */
+static long time_tai; /* TAI offset (s) */
+static long time_monitor; /* last time offset scaled (ns) */
+static long time_constant; /* poll interval (shift) (s) */
+static long time_precision = 1; /* clock precision (ns) */
+static long time_maxerror = MAXPHASE / 1000; /* maximum error (us) */
+long time_esterror = MAXPHASE / 1000; /* estimated error (us) */
+static long time_reftime; /* time at last adjustment (s) */
+static l_fp time_offset; /* time offset (ns) */
+static l_fp time_freq; /* frequency offset (ns/s) */
+static l_fp time_adj; /* tick adjust (ns/s) */
+
+static int64_t time_adjtime; /* correction from adjtime(2) (usec) */
+
+#ifdef PPS_SYNC
+/*
+ * The following variables are used when a pulse-per-second (PPS) signal
+ * is available and connected via a modem control lead. They establish
+ * the engineering parameters of the clock discipline loop when
+ * controlled by the PPS signal.
+ */
+#define PPS_FAVG 2 /* min freq avg interval (s) (shift) */
+#define PPS_FAVGDEF 8 /* default freq avg int (s) (shift) */
+#define PPS_FAVGMAX 15 /* max freq avg interval (s) (shift) */
+#define PPS_PAVG 4 /* phase avg interval (s) (shift) */
+#define PPS_VALID 120 /* PPS signal watchdog max (s) */
+#define PPS_MAXWANDER 100000 /* max PPS wander (ns/s) */
+#define PPS_POPCORN 2 /* popcorn spike threshold (shift) */
+
+static struct timespec pps_tf[3]; /* phase median filter */
+static l_fp pps_freq; /* scaled frequency offset (ns/s) */
+static long pps_fcount; /* frequency accumulator */
+static long pps_jitter; /* nominal jitter (ns) */
+static long pps_stabil; /* nominal stability (scaled ns/s) */
+static long pps_lastsec; /* time at last calibration (s) */
+static int pps_valid; /* signal watchdog counter */
+static int pps_shift = PPS_FAVG; /* interval duration (s) (shift) */
+static int pps_shiftmax = PPS_FAVGDEF; /* max interval duration (s) (shift) */
+static int pps_intcnt; /* wander counter */
+
+/*
+ * PPS signal quality monitors
+ */
+static long pps_calcnt; /* calibration intervals */
+static long pps_jitcnt; /* jitter limit exceeded */
+static long pps_stbcnt; /* stability limit exceeded */
+static long pps_errcnt; /* calibration errors */
+#endif /* PPS_SYNC */
+/*
+ * End of phase/frequency-lock loop (PLL/FLL) definitions
+ */
+
+static void ntp_init(void);
+static void hardupdate(long offset);
+static void ntp_gettime1(struct ntptimeval *ntvp);
+static int ntp_is_time_error(void);
+
+static int
+ntp_is_time_error(void)
+{
+ /*
+ * Status word error decode. If any of these conditions occur,
+ * an error is returned, instead of the status word. Most
+ * applications will care only about the fact the system clock
+ * may not be trusted, not about the details.
+ *
+ * Hardware or software error
+ */
+ if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) ||
+
+ /*
+ * PPS signal lost when either time or frequency synchronization
+ * requested
+ */
+ (time_status & (STA_PPSFREQ | STA_PPSTIME) &&
+ !(time_status & STA_PPSSIGNAL)) ||
+
+ /*
+ * PPS jitter exceeded when time synchronization requested
+ */
+ (time_status & STA_PPSTIME &&
+ time_status & STA_PPSJITTER) ||
+
+ /*
+ * PPS wander exceeded or calibration error when frequency
+ * synchronization requested
+ */
+ (time_status & STA_PPSFREQ &&
+ time_status & (STA_PPSWANDER | STA_PPSERROR)))
+ return (1);
+
+ return (0);
+}
+
+static void
+ntp_gettime1(struct ntptimeval *ntvp)
+{
+ struct timespec atv; /* nanosecond time */
+
+ GIANT_REQUIRED;
+
+ nanotime(&atv);
+ ntvp->time.tv_sec = atv.tv_sec;
+ ntvp->time.tv_nsec = atv.tv_nsec;
+ ntvp->maxerror = time_maxerror;
+ ntvp->esterror = time_esterror;
+ ntvp->tai = time_tai;
+ ntvp->time_state = time_state;
+
+ if (ntp_is_time_error())
+ ntvp->time_state = TIME_ERROR;
+}
+
+/*
+ * ntp_gettime() - NTP user application interface
+ *
+ * See the timex.h header file for synopsis and API description. Note that
+ * the TAI offset is returned in the ntvtimeval.tai structure member.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ntp_gettime_args {
+ struct ntptimeval *ntvp;
+};
+#endif
+/* ARGSUSED */
+int
+sys_ntp_gettime(struct thread *td, struct ntp_gettime_args *uap)
+{
+ struct ntptimeval ntv;
+
+ mtx_lock(&Giant);
+ ntp_gettime1(&ntv);
+ mtx_unlock(&Giant);
+
+ td->td_retval[0] = ntv.time_state;
+ return (copyout(&ntv, uap->ntvp, sizeof(ntv)));
+}
+
+static int
+ntp_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct ntptimeval ntv; /* temporary structure */
+
+ ntp_gettime1(&ntv);
+
+ return (sysctl_handle_opaque(oidp, &ntv, sizeof(ntv), req));
+}
+
+SYSCTL_NODE(_kern, OID_AUTO, ntp_pll, CTLFLAG_RW, 0, "");
+SYSCTL_PROC(_kern_ntp_pll, OID_AUTO, gettime, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, sizeof(struct ntptimeval) , ntp_sysctl, "S,ntptimeval", "");
+
+#ifdef PPS_SYNC
+SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shiftmax, CTLFLAG_RW,
+ &pps_shiftmax, 0, "Max interval duration (sec) (shift)");
+SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shift, CTLFLAG_RW,
+ &pps_shift, 0, "Interval duration (sec) (shift)");
+SYSCTL_LONG(_kern_ntp_pll, OID_AUTO, time_monitor, CTLFLAG_RD,
+ &time_monitor, 0, "Last time offset scaled (ns)");
+
+SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, pps_freq, CTLFLAG_RD,
+ &pps_freq, sizeof(pps_freq), "I", "Scaled frequency offset (ns/sec)");
+SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, time_freq, CTLFLAG_RD,
+ &time_freq, sizeof(time_freq), "I", "Frequency offset (ns/sec)");
+#endif
+
+/*
+ * ntp_adjtime() - NTP daemon application interface
+ *
+ * See the timex.h header file for synopsis and API description. Note that
+ * the timex.constant structure member has a dual purpose to set the time
+ * constant and to set the TAI offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ntp_adjtime_args {
+ struct timex *tp;
+};
+#endif
+
+int
+sys_ntp_adjtime(struct thread *td, struct ntp_adjtime_args *uap)
+{
+ struct timex ntv; /* temporary structure */
+	long freq;		/* frequency (ns/s) */
+ int modes; /* mode bits from structure */
+ int s; /* caller priority */
+ int error;
+
+ error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv));
+ if (error)
+ return(error);
+
+ /*
+ * Update selected clock variables - only the superuser can
+ * change anything. Note that there is no error checking here on
+ * the assumption the superuser should know what it is doing.
+ * Note that either the time constant or TAI offset are loaded
+ * from the ntv.constant member, depending on the mode bits. If
+ * the STA_PLL bit in the status word is cleared, the state and
+ * status words are reset to the initial values at boot.
+ */
+ mtx_lock(&Giant);
+ modes = ntv.modes;
+ if (modes)
+ error = priv_check(td, PRIV_NTP_ADJTIME);
+ if (error)
+ goto done2;
+ s = splclock();
+ if (modes & MOD_MAXERROR)
+ time_maxerror = ntv.maxerror;
+ if (modes & MOD_ESTERROR)
+ time_esterror = ntv.esterror;
+ if (modes & MOD_STATUS) {
+ if (time_status & STA_PLL && !(ntv.status & STA_PLL)) {
+ time_state = TIME_OK;
+ time_status = STA_UNSYNC;
+#ifdef PPS_SYNC
+ pps_shift = PPS_FAVG;
+#endif /* PPS_SYNC */
+ }
+ time_status &= STA_RONLY;
+ time_status |= ntv.status & ~STA_RONLY;
+ }
+ if (modes & MOD_TIMECONST) {
+ if (ntv.constant < 0)
+ time_constant = 0;
+ else if (ntv.constant > MAXTC)
+ time_constant = MAXTC;
+ else
+ time_constant = ntv.constant;
+ }
+ if (modes & MOD_TAI) {
+ if (ntv.constant > 0) /* XXX zero & negative numbers ? */
+ time_tai = ntv.constant;
+ }
+#ifdef PPS_SYNC
+ if (modes & MOD_PPSMAX) {
+ if (ntv.shift < PPS_FAVG)
+ pps_shiftmax = PPS_FAVG;
+ else if (ntv.shift > PPS_FAVGMAX)
+ pps_shiftmax = PPS_FAVGMAX;
+ else
+ pps_shiftmax = ntv.shift;
+ }
+#endif /* PPS_SYNC */
+ if (modes & MOD_NANO)
+ time_status |= STA_NANO;
+ if (modes & MOD_MICRO)
+ time_status &= ~STA_NANO;
+ if (modes & MOD_CLKB)
+ time_status |= STA_CLK;
+ if (modes & MOD_CLKA)
+ time_status &= ~STA_CLK;
+ if (modes & MOD_FREQUENCY) {
+ freq = (ntv.freq * 1000LL) >> 16;
+ if (freq > MAXFREQ)
+ L_LINT(time_freq, MAXFREQ);
+ else if (freq < -MAXFREQ)
+ L_LINT(time_freq, -MAXFREQ);
+ else {
+ /*
+ * ntv.freq is [PPM * 2^16] = [us/s * 2^16]
+ * time_freq is [ns/s * 2^32]
+ */
+ time_freq = ntv.freq * 1000LL * 65536LL;
+ }
+#ifdef PPS_SYNC
+ pps_freq = time_freq;
+#endif /* PPS_SYNC */
+ }
+ if (modes & MOD_OFFSET) {
+ if (time_status & STA_NANO)
+ hardupdate(ntv.offset);
+ else
+ hardupdate(ntv.offset * 1000);
+ }
+
+ /*
+ * Retrieve all clock variables. Note that the TAI offset is
+	 * returned only by ntp_gettime().
+ */
+ if (time_status & STA_NANO)
+ ntv.offset = L_GINT(time_offset);
+ else
+ ntv.offset = L_GINT(time_offset) / 1000; /* XXX rounding ? */
+ ntv.freq = L_GINT((time_freq / 1000LL) << 16);
+ ntv.maxerror = time_maxerror;
+ ntv.esterror = time_esterror;
+ ntv.status = time_status;
+ ntv.constant = time_constant;
+ if (time_status & STA_NANO)
+ ntv.precision = time_precision;
+ else
+ ntv.precision = time_precision / 1000;
+ ntv.tolerance = MAXFREQ * SCALE_PPM;
+#ifdef PPS_SYNC
+ ntv.shift = pps_shift;
+ ntv.ppsfreq = L_GINT((pps_freq / 1000LL) << 16);
+ if (time_status & STA_NANO)
+ ntv.jitter = pps_jitter;
+ else
+ ntv.jitter = pps_jitter / 1000;
+ ntv.stabil = pps_stabil;
+ ntv.calcnt = pps_calcnt;
+ ntv.errcnt = pps_errcnt;
+ ntv.jitcnt = pps_jitcnt;
+ ntv.stbcnt = pps_stbcnt;
+#endif /* PPS_SYNC */
+ splx(s);
+
+ error = copyout((caddr_t)&ntv, (caddr_t)uap->tp, sizeof(ntv));
+ if (error)
+ goto done2;
+
+ if (ntp_is_time_error())
+ td->td_retval[0] = TIME_ERROR;
+ else
+ td->td_retval[0] = time_state;
+
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
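+
+/*
+ * Worked example (illustrative) of the MOD_FREQUENCY scaling above: a
+ * daemon requesting +100 PPM passes ntv.freq = 100 << 16 = 6553600
+ * (us/s scaled by 2^16).  The range check computes
+ * freq = (6553600 * 1000) >> 16 = 100000 ns/s, well inside MAXFREQ, and
+ * the stored value is time_freq = 6553600 * 1000 * 65536, i.e.
+ * 100000 ns/s scaled by 2^32.  Reading it back,
+ * L_GINT((time_freq / 1000) << 16) returns the original 6553600.
+ */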
+
+/*
+ * ntp_update_second() - process the start of a new second
+ *
+ * This routine corresponds to second_overflow() in the original sources
+ * this file was adapted from, where it is called immediately after
+ * ntp_tick_adjust().  The two routines are normally combined; they are
+ * kept separate there only for the purposes of simulation.
+ */
+void
+ntp_update_second(int64_t *adjustment, time_t *newsec)
+{
+ int tickrate;
+ l_fp ftemp; /* 32/64-bit temporary */
+
+ /*
+ * On rollover of the second both the nanosecond and microsecond
+ * clocks are updated and the state machine cranked as
+ * necessary. The phase adjustment to be used for the next
+ * second is calculated and the maximum error is increased by
+ * the tolerance.
+ */
+ time_maxerror += MAXFREQ / 1000;
+
+ /*
+ * Leap second processing. If in leap-insert state at
+ * the end of the day, the system clock is set back one
+ * second; if in leap-delete state, the system clock is
+ * set ahead one second. The nano_time() routine or
+	 * external clock driver will ensure that reported time
+ * is always monotonic.
+ */
+ switch (time_state) {
+
+ /*
+ * No warning.
+ */
+ case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
+ break;
+
+ /*
+ * Insert second 23:59:60 following second
+ * 23:59:59.
+ */
+ case TIME_INS:
+ if (!(time_status & STA_INS))
+ time_state = TIME_OK;
+ else if ((*newsec) % 86400 == 0) {
+ (*newsec)--;
+ time_state = TIME_OOP;
+ time_tai++;
+ }
+ break;
+
+ /*
+ * Delete second 23:59:59.
+ */
+ case TIME_DEL:
+ if (!(time_status & STA_DEL))
+ time_state = TIME_OK;
+ else if (((*newsec) + 1) % 86400 == 0) {
+ (*newsec)++;
+ time_tai--;
+ time_state = TIME_WAIT;
+ }
+ break;
+
+ /*
+ * Insert second in progress.
+ */
+ case TIME_OOP:
+ time_state = TIME_WAIT;
+ break;
+
+ /*
+ * Wait for status bits to clear.
+ */
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ }
+
+ /*
+ * Compute the total time adjustment for the next second
+ * in ns. The offset is reduced by a factor depending on
+ * whether the PPS signal is operating. Note that the
+ * value is in effect scaled by the clock frequency,
+ * since the adjustment is added at each tick interrupt.
+ */
+ ftemp = time_offset;
+#ifdef PPS_SYNC
+ /* XXX even if PPS signal dies we should finish adjustment ? */
+ if (time_status & STA_PPSTIME && time_status &
+ STA_PPSSIGNAL)
+ L_RSHIFT(ftemp, pps_shift);
+ else
+ L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
+#else
+ L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
+#endif /* PPS_SYNC */
+ time_adj = ftemp;
+ L_SUB(time_offset, ftemp);
+ L_ADD(time_adj, time_freq);
+
+ /*
+	 * Apply any correction from adjtime(2).  If the offset is more than
+	 * one second we slew at a rate of 5 ms/s (5000 PPM), otherwise at
+	 * 500 us/s (500 PPM), until 500 us or less remains, which is then
+	 * applied in a single step.
+ */
+ if (time_adjtime != 0) {
+ if (time_adjtime > 1000000)
+ tickrate = 5000;
+ else if (time_adjtime < -1000000)
+ tickrate = -5000;
+ else if (time_adjtime > 500)
+ tickrate = 500;
+ else if (time_adjtime < -500)
+ tickrate = -500;
+ else
+ tickrate = time_adjtime;
+ time_adjtime -= tickrate;
+ L_LINT(ftemp, tickrate * 1000);
+ L_ADD(time_adj, ftemp);
+ }
+ *adjustment = time_adj;
+
+#ifdef PPS_SYNC
+ if (pps_valid > 0)
+ pps_valid--;
+ else
+ time_status &= ~STA_PPSSIGNAL;
+#endif /* PPS_SYNC */
+}
+
+/*
+ * ntp_init() - initialize variables and structures
+ *
+ * This routine must be called after the kernel variables hz and tick
+ * are set or changed and before the next tick interrupt. In this
+ * particular implementation, these values are assumed set elsewhere in
+ * the kernel. The design allows the clock frequency and tick interval
+ * to be changed while the system is running. So, this routine should
+ * probably be integrated with the code that does that.
+ */
+static void
+ntp_init()
+{
+
+ /*
+ * The following variables are initialized only at startup. Only
+ * those structures not cleared by the compiler need to be
+ * initialized, and these only in the simulator. In the actual
+ * kernel, any nonzero values here will quickly evaporate.
+ */
+ L_CLR(time_offset);
+ L_CLR(time_freq);
+#ifdef PPS_SYNC
+ pps_tf[0].tv_sec = pps_tf[0].tv_nsec = 0;
+ pps_tf[1].tv_sec = pps_tf[1].tv_nsec = 0;
+ pps_tf[2].tv_sec = pps_tf[2].tv_nsec = 0;
+ pps_fcount = 0;
+ L_CLR(pps_freq);
+#endif /* PPS_SYNC */
+}
+
+SYSINIT(ntpclocks, SI_SUB_CLOCKS, SI_ORDER_MIDDLE, ntp_init, NULL);
+
+/*
+ * hardupdate() - local clock update
+ *
+ * This routine is called by ntp_adjtime() to update the local clock
+ * phase and frequency. The implementation is of an adaptive-parameter,
+ * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new
+ * time and frequency offset estimates for each call. If the kernel PPS
+ * discipline code is configured (PPS_SYNC), the PPS signal itself
+ * determines the new time offset, instead of the calling argument.
+ * Presumably, calls to ntp_adjtime() occur only when the caller
+ * believes the local clock is valid within some bound (+-128 ms with
+ * NTP). If the caller's time is far different than the PPS time, an
+ * argument will ensue, and it's not clear who will lose.
+ *
+ * For uncompensated quartz crystal oscillators and nominal update
+ * intervals less than 256 s, operation should be in phase-lock mode,
+ * where the loop is disciplined to phase. For update intervals greater
+ * than 1024 s, operation should be in frequency-lock mode, where the
+ * loop is disciplined to frequency. Between 256 s and 1024 s, the mode
+ * is selected by the STA_MODE status bit.
+ */
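+/*
+ * Illustrative only, assuming the standard <sys/timex.h> interface (the
+ * variable measured_offset_us is hypothetical): a time daemon feeds
+ * offsets into this loop roughly as follows:
+ *
+ *	struct timex tx = { .modes = MOD_STATUS | MOD_OFFSET };
+ *	tx.status = STA_PLL;
+ *	tx.offset = measured_offset_us;
+ *	if (ntp_adjtime(&tx) == -1)
+ *		err(1, "ntp_adjtime");
+ */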
+static void
+hardupdate(offset)
+ long offset; /* clock offset (ns) */
+{
+ long mtemp;
+ l_fp ftemp;
+
+ /*
+ * Select how the phase is to be controlled and from which
+ * source. If the PPS signal is present and enabled to
+ * discipline the time, the PPS offset is used; otherwise, the
+ * argument offset is used.
+ */
+ if (!(time_status & STA_PLL))
+ return;
+ if (!(time_status & STA_PPSTIME && time_status &
+ STA_PPSSIGNAL)) {
+ if (offset > MAXPHASE)
+ time_monitor = MAXPHASE;
+ else if (offset < -MAXPHASE)
+ time_monitor = -MAXPHASE;
+ else
+ time_monitor = offset;
+ L_LINT(time_offset, time_monitor);
+ }
+
+ /*
+ * Select how the frequency is to be controlled and in which
+ * mode (PLL or FLL). If the PPS signal is present and enabled
+ * to discipline the frequency, the PPS frequency is used;
+ * otherwise, the argument offset is used to compute it.
+ */
+ if (time_status & STA_PPSFREQ && time_status & STA_PPSSIGNAL) {
+ time_reftime = time_second;
+ return;
+ }
+ if (time_status & STA_FREQHOLD || time_reftime == 0)
+ time_reftime = time_second;
+ mtemp = time_second - time_reftime;
+ L_LINT(ftemp, time_monitor);
+ L_RSHIFT(ftemp, (SHIFT_PLL + 2 + time_constant) << 1);
+ L_MPY(ftemp, mtemp);
+ L_ADD(time_freq, ftemp);
+ time_status &= ~STA_MODE;
+ if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp >
+ MAXSEC)) {
+ L_LINT(ftemp, (time_monitor << 4) / mtemp);
+ L_RSHIFT(ftemp, SHIFT_FLL + 4);
+ L_ADD(time_freq, ftemp);
+ time_status |= STA_MODE;
+ }
+ time_reftime = time_second;
+ if (L_GINT(time_freq) > MAXFREQ)
+ L_LINT(time_freq, MAXFREQ);
+ else if (L_GINT(time_freq) < -MAXFREQ)
+ L_LINT(time_freq, -MAXFREQ);
+}
+
+#ifdef PPS_SYNC
+/*
+ * hardpps() - discipline CPU clock oscillator to external PPS signal
+ *
+ * This routine is called at each PPS interrupt in order to discipline
+ * the CPU clock oscillator to the PPS signal. There are two independent
+ * first-order feedback loops, one for the phase, the other for the
+ * frequency. The phase loop measures and grooms the PPS phase offset
+ * and leaves it in a handy spot for the seconds overflow routine. The
+ * frequency loop averages successive PPS phase differences and
+ * calculates the PPS frequency offset, which is also processed by the
+ * seconds overflow routine. The code requires the caller to capture the
+ * time and architecture-dependent hardware counter values in
+ * nanoseconds at the on-time PPS signal transition.
+ *
+ * Note that, on some Unix systems, this routine runs at an interrupt
+ * priority level higher than the timer interrupt routine hardclock().
+ * Therefore, the variables used are distinct from the hardclock()
+ * variables, except for the actual time and frequency variables, which
+ * are determined by this routine and updated atomically.
+ */
+void
+hardpps(tsp, nsec)
+ struct timespec *tsp; /* time at PPS */
+ long nsec; /* hardware counter at PPS */
+{
+ long u_sec, u_nsec, v_nsec; /* temps */
+ l_fp ftemp;
+
+ /*
+ * The signal is first processed by a range gate and frequency
+ * discriminator. The range gate rejects noise spikes outside
+ * the range +-500 us. The frequency discriminator rejects input
+ * signals with apparent frequency outside the range 1 +-500
+ * PPM. If two hits occur in the same second, we ignore the
+ * later hit; if not and a hit occurs outside the range gate,
+ * keep the later hit for later comparison, but do not process
+ * it.
+ */
+ time_status |= STA_PPSSIGNAL | STA_PPSJITTER;
+ time_status &= ~(STA_PPSWANDER | STA_PPSERROR);
+ pps_valid = PPS_VALID;
+ u_sec = tsp->tv_sec;
+ u_nsec = tsp->tv_nsec;
+ if (u_nsec >= (NANOSECOND >> 1)) {
+ u_nsec -= NANOSECOND;
+ u_sec++;
+ }
+ v_nsec = u_nsec - pps_tf[0].tv_nsec;
+ if (u_sec == pps_tf[0].tv_sec && v_nsec < NANOSECOND -
+ MAXFREQ)
+ return;
+ pps_tf[2] = pps_tf[1];
+ pps_tf[1] = pps_tf[0];
+ pps_tf[0].tv_sec = u_sec;
+ pps_tf[0].tv_nsec = u_nsec;
+
+ /*
+ * Compute the difference between the current and previous
+ * counter values. If the difference exceeds 0.5 s, assume it
+ * has wrapped around, so correct 1.0 s. If the result exceeds
+ * the tick interval, the sample point has crossed a tick
+ * boundary during the last second, so correct the tick. Very
+ * intricate.
+ */
+ u_nsec = nsec;
+ if (u_nsec > (NANOSECOND >> 1))
+ u_nsec -= NANOSECOND;
+ else if (u_nsec < -(NANOSECOND >> 1))
+ u_nsec += NANOSECOND;
+ pps_fcount += u_nsec;
+ if (v_nsec > MAXFREQ || v_nsec < -MAXFREQ)
+ return;
+ time_status &= ~STA_PPSJITTER;
+
+ /*
+ * A three-stage median filter is used to help denoise the PPS
+ * time. The median sample becomes the time offset estimate; the
+ * difference between the other two samples becomes the time
+ * dispersion (jitter) estimate.
+ */
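+ /*
+ * Example (illustrative): if the three most recent samples are 120,
+ * 30 and 10 ns, the median 30 becomes the offset estimate (negated
+ * into time_monitor below) and 120 - 10 = 110 ns becomes the jitter
+ * estimate.
+ */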
+ if (pps_tf[0].tv_nsec > pps_tf[1].tv_nsec) {
+ if (pps_tf[1].tv_nsec > pps_tf[2].tv_nsec) {
+ v_nsec = pps_tf[1].tv_nsec; /* 0 1 2 */
+ u_nsec = pps_tf[0].tv_nsec - pps_tf[2].tv_nsec;
+ } else if (pps_tf[2].tv_nsec > pps_tf[0].tv_nsec) {
+ v_nsec = pps_tf[0].tv_nsec; /* 2 0 1 */
+ u_nsec = pps_tf[2].tv_nsec - pps_tf[1].tv_nsec;
+ } else {
+ v_nsec = pps_tf[2].tv_nsec; /* 0 2 1 */
+ u_nsec = pps_tf[0].tv_nsec - pps_tf[1].tv_nsec;
+ }
+ } else {
+ if (pps_tf[1].tv_nsec < pps_tf[2].tv_nsec) {
+ v_nsec = pps_tf[1].tv_nsec; /* 2 1 0 */
+ u_nsec = pps_tf[2].tv_nsec - pps_tf[0].tv_nsec;
+ } else if (pps_tf[2].tv_nsec < pps_tf[0].tv_nsec) {
+ v_nsec = pps_tf[0].tv_nsec; /* 1 0 2 */
+ u_nsec = pps_tf[1].tv_nsec - pps_tf[2].tv_nsec;
+ } else {
+ v_nsec = pps_tf[2].tv_nsec; /* 1 2 0 */
+ u_nsec = pps_tf[1].tv_nsec - pps_tf[0].tv_nsec;
+ }
+ }
+
+ /*
+ * Nominal jitter is due to PPS signal noise and interrupt
+ * latency. If it exceeds the popcorn threshold, the sample is
+ * discarded; otherwise, if so enabled, the time offset is
+ * updated. We can tolerate a modest loss of data here without
+ * much degrading time accuracy.
+ *
+ * The measurements being checked here were made with the system
+ * timecounter, so the popcorn threshold is not allowed to fall below
+ * the number of nanoseconds in two ticks of the timecounter. For a
+ * timecounter running faster than 1 GHz the lower bound is 2ns, just
+ * to avoid a nonsensical threshold of zero.
+ */
+ if (u_nsec > lmax(pps_jitter << PPS_POPCORN,
+ 2 * (NANOSECOND / (long)qmin(NANOSECOND, tc_getfrequency())))) {
+ time_status |= STA_PPSJITTER;
+ pps_jitcnt++;
+ } else if (time_status & STA_PPSTIME) {
+ time_monitor = -v_nsec;
+ L_LINT(time_offset, time_monitor);
+ }
+ pps_jitter += (u_nsec - pps_jitter) >> PPS_FAVG;
+ u_sec = pps_tf[0].tv_sec - pps_lastsec;
+ if (u_sec < (1 << pps_shift))
+ return;
+
+ /*
+ * At the end of the calibration interval the difference between
+ * the first and last counter values becomes the scaled
+ * frequency. It will later be divided by the length of the
+ * interval to determine the frequency update. If the frequency
+ * exceeds a sanity threshold, or if the actual calibration
+ * interval is not equal to the expected length, the data are
+ * discarded. We can tolerate a modest loss of data here without
+ * much degrading frequency accuracy.
+ */
+ pps_calcnt++;
+ v_nsec = -pps_fcount;
+ pps_lastsec = pps_tf[0].tv_sec;
+ pps_fcount = 0;
+ u_nsec = MAXFREQ << pps_shift;
+ if (v_nsec > u_nsec || v_nsec < -u_nsec || u_sec != (1 <<
+ pps_shift)) {
+ time_status |= STA_PPSERROR;
+ pps_errcnt++;
+ return;
+ }
+
+ /*
+ * Here the raw frequency offset and wander (stability) is
+ * calculated. If the wander is less than the wander threshold
+ * for four consecutive averaging intervals, the interval is
+ * doubled; if it is greater than the threshold for four
+ * consecutive intervals, the interval is halved. The scaled
+ * frequency offset is converted to frequency offset. The
+ * stability metric is calculated as the average of recent
+ * frequency changes, but is used only for performance
+ * monitoring.
+ */
+ L_LINT(ftemp, v_nsec);
+ L_RSHIFT(ftemp, pps_shift);
+ L_SUB(ftemp, pps_freq);
+ u_nsec = L_GINT(ftemp);
+ if (u_nsec > PPS_MAXWANDER) {
+ L_LINT(ftemp, PPS_MAXWANDER);
+ pps_intcnt--;
+ time_status |= STA_PPSWANDER;
+ pps_stbcnt++;
+ } else if (u_nsec < -PPS_MAXWANDER) {
+ L_LINT(ftemp, -PPS_MAXWANDER);
+ pps_intcnt--;
+ time_status |= STA_PPSWANDER;
+ pps_stbcnt++;
+ } else {
+ pps_intcnt++;
+ }
+ if (pps_intcnt >= 4) {
+ pps_intcnt = 4;
+ if (pps_shift < pps_shiftmax) {
+ pps_shift++;
+ pps_intcnt = 0;
+ }
+ } else if (pps_intcnt <= -4 || pps_shift > pps_shiftmax) {
+ pps_intcnt = -4;
+ if (pps_shift > PPS_FAVG) {
+ pps_shift--;
+ pps_intcnt = 0;
+ }
+ }
+ if (u_nsec < 0)
+ u_nsec = -u_nsec;
+ pps_stabil += (u_nsec * SCALE_PPM - pps_stabil) >> PPS_FAVG;
+
+ /*
+ * The PPS frequency is recalculated and clamped to the maximum
+ * MAXFREQ. If enabled, the system clock frequency is updated as
+ * well.
+ */
+ L_ADD(pps_freq, ftemp);
+ u_nsec = L_GINT(pps_freq);
+ if (u_nsec > MAXFREQ)
+ L_LINT(pps_freq, MAXFREQ);
+ else if (u_nsec < -MAXFREQ)
+ L_LINT(pps_freq, -MAXFREQ);
+ if (time_status & STA_PPSFREQ)
+ time_freq = pps_freq;
+}
+#endif /* PPS_SYNC */
+
+#ifndef _SYS_SYSPROTO_H_
+struct adjtime_args {
+ struct timeval *delta;
+ struct timeval *olddelta;
+};
+#endif
+/* ARGSUSED */
+int
+sys_adjtime(struct thread *td, struct adjtime_args *uap)
+{
+ struct timeval delta, olddelta, *deltap;
+ int error;
+
+ if (uap->delta) {
+ error = copyin(uap->delta, &delta, sizeof(delta));
+ if (error)
+ return (error);
+ deltap = &delta;
+ } else
+ deltap = NULL;
+ error = kern_adjtime(td, deltap, &olddelta);
+ if (uap->olddelta && error == 0)
+ error = copyout(&olddelta, uap->olddelta, sizeof(olddelta));
+ return (error);
+}
+
+int
+kern_adjtime(struct thread *td, struct timeval *delta, struct timeval *olddelta)
+{
+ struct timeval atv;
+ int error;
+
+ mtx_lock(&Giant);
+ if (olddelta) {
+ atv.tv_sec = time_adjtime / 1000000;
+ atv.tv_usec = time_adjtime % 1000000;
+ if (atv.tv_usec < 0) {
+ atv.tv_usec += 1000000;
+ atv.tv_sec--;
+ }
+ *olddelta = atv;
+ }
+ if (delta) {
+ if ((error = priv_check(td, PRIV_ADJTIME))) {
+ mtx_unlock(&Giant);
+ return (error);
+ }
+ time_adjtime = (int64_t)delta->tv_sec * 1000000 +
+ delta->tv_usec;
+ }
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+static struct callout resettodr_callout;
+static int resettodr_period = 1800;
+
+static void
+periodic_resettodr(void *arg __unused)
+{
+
+ if (!ntp_is_time_error()) {
+ mtx_lock(&Giant);
+ resettodr();
+ mtx_unlock(&Giant);
+ }
+ if (resettodr_period > 0)
+ callout_schedule(&resettodr_callout, resettodr_period * hz);
+}
+
+static void
+shutdown_resettodr(void *arg __unused, int howto __unused)
+{
+
+ callout_drain(&resettodr_callout);
+ if (resettodr_period > 0 && !ntp_is_time_error()) {
+ mtx_lock(&Giant);
+ resettodr();
+ mtx_unlock(&Giant);
+ }
+}
+
+static int
+sysctl_resettodr_period(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+
+ error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
+ if (error || !req->newptr)
+ return (error);
+ if (resettodr_period == 0)
+ callout_stop(&resettodr_callout);
+ else
+ callout_reset(&resettodr_callout, resettodr_period * hz,
+ periodic_resettodr, NULL);
+ return (0);
+}
+
+SYSCTL_PROC(_machdep, OID_AUTO, rtc_save_period, CTLTYPE_INT|CTLFLAG_RW,
+ &resettodr_period, 1800, sysctl_resettodr_period, "I",
+ "Save system time to RTC with this period (in seconds)");
+TUNABLE_INT("machdep.rtc_save_period", &resettodr_period);
+
+static void
+start_periodic_resettodr(void *arg __unused)
+{
+
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_resettodr, NULL,
+ SHUTDOWN_PRI_FIRST);
+ callout_init(&resettodr_callout, 1);
+ if (resettodr_period == 0)
+ return;
+ callout_reset(&resettodr_callout, resettodr_period * hz,
+ periodic_resettodr, NULL);
+}
+
+SYSINIT(periodic_resettodr, SI_SUB_LAST, SI_ORDER_MIDDLE,
+ start_periodic_resettodr, NULL);
diff --git a/sys/kern/kern_osd.c b/sys/kern/kern_osd.c
new file mode 100644
index 0000000..184c4f0
--- /dev/null
+++ b/sys/kern/kern_osd.c
@@ -0,0 +1,403 @@
+/*-
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/errno.h>
+#include <sys/jail.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/rmlock.h>
+#include <sys/sx.h>
+#include <sys/queue.h>
+#include <sys/proc.h>
+#include <sys/osd.h>
+
+/* OSD (Object Specific Data) */
+
+static MALLOC_DEFINE(M_OSD, "osd", "Object Specific Data");
+
+static int osd_debug = 0;
+TUNABLE_INT("debug.osd", &osd_debug);
+SYSCTL_INT(_debug, OID_AUTO, osd, CTLFLAG_RW, &osd_debug, 0, "OSD debug level");
+
+#define OSD_DEBUG(...) do { \
+ if (osd_debug) { \
+ printf("OSD (%s:%u): ", __func__, __LINE__); \
+ printf(__VA_ARGS__); \
+ printf("\n"); \
+ } \
+} while (0)
+
+static void do_osd_del(u_int type, struct osd *osd, u_int slot,
+ int list_locked);
+
+/*
+ * Lists of objects with OSD.
+ *
+ * Lock key:
+ * (m) osd_module_lock
+ * (o) osd_object_lock
+ * (l) osd_list_lock
+ */
+static LIST_HEAD(, osd) osd_list[OSD_LAST + 1]; /* (m) */
+static osd_method_t *osd_methods[OSD_LAST + 1]; /* (m) */
+static u_int osd_nslots[OSD_LAST + 1]; /* (m) */
+static osd_destructor_t *osd_destructors[OSD_LAST + 1]; /* (o) */
+static const u_int osd_nmethods[OSD_LAST + 1] = {
+ [OSD_JAIL] = PR_MAXMETHOD,
+};
+
+static struct sx osd_module_lock[OSD_LAST + 1];
+static struct rmlock osd_object_lock[OSD_LAST + 1];
+static struct mtx osd_list_lock[OSD_LAST + 1];
+
+static void
+osd_default_destructor(void *value __unused)
+{
+ /* Do nothing. */
+}
+
+int
+osd_register(u_int type, osd_destructor_t destructor, osd_method_t *methods)
+{
+ void *newptr;
+ u_int i, m;
+
+ KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
+
+ /*
+ * If no destructor is given, use the default one. We need some
+ * destructor, because a NULL destructor marks an unused slot.
+ */
+ if (destructor == NULL)
+ destructor = osd_default_destructor;
+
+ sx_xlock(&osd_module_lock[type]);
+ /*
+ * First, try to find an unused slot.
+ */
+ for (i = 0; i < osd_nslots[type]; i++) {
+ if (osd_destructors[type][i] == NULL) {
+ OSD_DEBUG("Unused slot found (type=%u, slot=%u).",
+ type, i);
+ break;
+ }
+ }
+ /*
+ * If no unused slot was found, allocate one.
+ */
+ if (i == osd_nslots[type]) {
+ osd_nslots[type]++;
+ if (osd_nmethods[type] != 0)
+ osd_methods[type] = realloc(osd_methods[type],
+ sizeof(osd_method_t) * osd_nslots[type] *
+ osd_nmethods[type], M_OSD, M_WAITOK);
+ newptr = malloc(sizeof(osd_destructor_t) * osd_nslots[type],
+ M_OSD, M_WAITOK);
+ rm_wlock(&osd_object_lock[type]);
+ bcopy(osd_destructors[type], newptr,
+ sizeof(osd_destructor_t) * i);
+ free(osd_destructors[type], M_OSD);
+ osd_destructors[type] = newptr;
+ rm_wunlock(&osd_object_lock[type]);
+ OSD_DEBUG("New slot allocated (type=%u, slot=%u).",
+ type, i + 1);
+ }
+
+ osd_destructors[type][i] = destructor;
+ if (osd_nmethods[type] != 0) {
+ for (m = 0; m < osd_nmethods[type]; m++)
+ osd_methods[type][i * osd_nmethods[type] + m] =
+ methods != NULL ? methods[m] : NULL;
+ }
+ sx_xunlock(&osd_module_lock[type]);
+ return (i + 1);
+}
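+
+/*
+ * Minimal usage sketch (illustrative only; OSD_THREAD and the thread's
+ * td_osd member are assumptions, as is the mymod_* consumer):
+ *
+ *	static u_int mymod_slot;
+ *
+ *	mymod_slot = osd_register(OSD_THREAD, mymod_dtor, NULL);
+ *	error = osd_set(OSD_THREAD, &td->td_osd, mymod_slot, data);
+ *	data = osd_get(OSD_THREAD, &td->td_osd, mymod_slot);
+ *	osd_deregister(OSD_THREAD, mymod_slot);
+ */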
+
+void
+osd_deregister(u_int type, u_int slot)
+{
+ struct osd *osd, *tosd;
+
+ KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
+ KASSERT(slot > 0, ("Invalid slot."));
+ KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot."));
+
+ sx_xlock(&osd_module_lock[type]);
+ rm_wlock(&osd_object_lock[type]);
+ /*
+ * Free all OSD for the given slot.
+ */
+ mtx_lock(&osd_list_lock[type]);
+ LIST_FOREACH_SAFE(osd, &osd_list[type], osd_next, tosd)
+ do_osd_del(type, osd, slot, 1);
+ mtx_unlock(&osd_list_lock[type]);
+ /*
+ * Set destructor to NULL to free the slot.
+ */
+ osd_destructors[type][slot - 1] = NULL;
+ if (slot == osd_nslots[type]) {
+ osd_nslots[type]--;
+ osd_destructors[type] = realloc(osd_destructors[type],
+ sizeof(osd_destructor_t) * osd_nslots[type], M_OSD,
+ M_NOWAIT | M_ZERO);
+ if (osd_nmethods[type] != 0)
+ osd_methods[type] = realloc(osd_methods[type],
+ sizeof(osd_method_t) * osd_nslots[type] *
+ osd_nmethods[type], M_OSD, M_NOWAIT | M_ZERO);
+ /*
+ * We always reallocate to a smaller size, so we assume it will
+ * always succeed.
+ */
+ KASSERT(osd_destructors[type] != NULL &&
+ (osd_nmethods[type] == 0 || osd_methods[type] != NULL),
+ ("realloc() failed"));
+ OSD_DEBUG("Deregistration of the last slot (type=%u, slot=%u).",
+ type, slot);
+ } else {
+ OSD_DEBUG("Slot deregistration (type=%u, slot=%u).",
+ type, slot);
+ }
+ rm_wunlock(&osd_object_lock[type]);
+ sx_xunlock(&osd_module_lock[type]);
+}
+
+int
+osd_set(u_int type, struct osd *osd, u_int slot, void *value)
+{
+ struct rm_priotracker tracker;
+
+ KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
+ KASSERT(slot > 0, ("Invalid slot."));
+ KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot."));
+
+ rm_rlock(&osd_object_lock[type], &tracker);
+ if (slot > osd->osd_nslots) {
+ if (value == NULL) {
+ OSD_DEBUG(
+ "Not allocating null slot (type=%u, slot=%u).",
+ type, slot);
+ rm_runlock(&osd_object_lock[type], &tracker);
+ return (0);
+ } else if (osd->osd_nslots == 0) {
+ /*
+ * First OSD for this object, so we need to allocate
+ * space and put it onto the list.
+ */
+ osd->osd_slots = malloc(sizeof(void *) * slot, M_OSD,
+ M_NOWAIT | M_ZERO);
+ if (osd->osd_slots == NULL) {
+ rm_runlock(&osd_object_lock[type], &tracker);
+ return (ENOMEM);
+ }
+ osd->osd_nslots = slot;
+ mtx_lock(&osd_list_lock[type]);
+ LIST_INSERT_HEAD(&osd_list[type], osd, osd_next);
+ mtx_unlock(&osd_list_lock[type]);
+ OSD_DEBUG("Setting first slot (type=%u).", type);
+ } else {
+ void *newptr;
+
+ /*
+ * Too few slots are allocated here, so the array
+ * needs to be extended.
+ */
+ newptr = realloc(osd->osd_slots, sizeof(void *) * slot,
+ M_OSD, M_NOWAIT | M_ZERO);
+ if (newptr == NULL) {
+ rm_runlock(&osd_object_lock[type], &tracker);
+ return (ENOMEM);
+ }
+ osd->osd_slots = newptr;
+ osd->osd_nslots = slot;
+ OSD_DEBUG("Growing slots array (type=%u).", type);
+ }
+ }
+ OSD_DEBUG("Setting slot value (type=%u, slot=%u, value=%p).", type,
+ slot, value);
+ osd->osd_slots[slot - 1] = value;
+ rm_runlock(&osd_object_lock[type], &tracker);
+ return (0);
+}
+
+void *
+osd_get(u_int type, struct osd *osd, u_int slot)
+{
+ struct rm_priotracker tracker;
+ void *value;
+
+ KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
+ KASSERT(slot > 0, ("Invalid slot."));
+ KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot."));
+
+ rm_rlock(&osd_object_lock[type], &tracker);
+ if (slot > osd->osd_nslots) {
+ value = NULL;
+ OSD_DEBUG("Slot doesn't exist (type=%u, slot=%u).", type, slot);
+ } else {
+ value = osd->osd_slots[slot - 1];
+ OSD_DEBUG("Returning slot value (type=%u, slot=%u, value=%p).",
+ type, slot, value);
+ }
+ rm_runlock(&osd_object_lock[type], &tracker);
+ return (value);
+}
+
+void
+osd_del(u_int type, struct osd *osd, u_int slot)
+{
+ struct rm_priotracker tracker;
+
+ rm_rlock(&osd_object_lock[type], &tracker);
+ do_osd_del(type, osd, slot, 0);
+ rm_runlock(&osd_object_lock[type], &tracker);
+}
+
+static void
+do_osd_del(u_int type, struct osd *osd, u_int slot, int list_locked)
+{
+ int i;
+
+ KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
+ KASSERT(slot > 0, ("Invalid slot."));
+ KASSERT(osd_destructors[type][slot - 1] != NULL, ("Unused slot."));
+
+ OSD_DEBUG("Deleting slot (type=%u, slot=%u).", type, slot);
+
+ if (slot > osd->osd_nslots) {
+ OSD_DEBUG("Slot doesn't exist (type=%u, slot=%u).", type, slot);
+ return;
+ }
+ if (osd->osd_slots[slot - 1] != NULL) {
+ osd_destructors[type][slot - 1](osd->osd_slots[slot - 1]);
+ osd->osd_slots[slot - 1] = NULL;
+ }
+ for (i = osd->osd_nslots - 1; i >= 0; i--) {
+ if (osd->osd_slots[i] != NULL) {
+ OSD_DEBUG("Slot still has a value (type=%u, slot=%u).",
+ type, i + 1);
+ break;
+ }
+ }
+ if (i == -1) {
+ /* No values left for this object. */
+ OSD_DEBUG("No more slots left (type=%u).", type);
+ if (!list_locked)
+ mtx_lock(&osd_list_lock[type]);
+ LIST_REMOVE(osd, osd_next);
+ if (!list_locked)
+ mtx_unlock(&osd_list_lock[type]);
+ free(osd->osd_slots, M_OSD);
+ osd->osd_slots = NULL;
+ osd->osd_nslots = 0;
+ } else if (slot == osd->osd_nslots) {
+ /* This was the last slot. */
+ osd->osd_slots = realloc(osd->osd_slots,
+ sizeof(void *) * (i + 1), M_OSD, M_NOWAIT | M_ZERO);
+ /*
+ * We always reallocate to a smaller size, so we assume it will
+ * always succeed.
+ */
+ KASSERT(osd->osd_slots != NULL, ("realloc() failed"));
+ osd->osd_nslots = i + 1;
+ OSD_DEBUG("Reducing slots array to %u (type=%u).",
+ osd->osd_nslots, type);
+ }
+}
+
+int
+osd_call(u_int type, u_int method, void *obj, void *data)
+{
+ osd_method_t methodfun;
+ int error, i;
+
+ KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
+ KASSERT(method < osd_nmethods[type], ("Invalid method."));
+
+ /*
+ * Call this method for every slot that defines it, stopping if an
+ * error is encountered.
+ */
+ error = 0;
+ sx_slock(&osd_module_lock[type]);
+ for (i = 0; i < osd_nslots[type]; i++) {
+ methodfun =
+ osd_methods[type][i * osd_nmethods[type] + method];
+ if (methodfun != NULL && (error = methodfun(obj, data)) != 0)
+ break;
+ }
+ sx_sunlock(&osd_module_lock[type]);
+ return (error);
+}
+
+void
+osd_exit(u_int type, struct osd *osd)
+{
+ struct rm_priotracker tracker;
+ u_int i;
+
+ KASSERT(type >= OSD_FIRST && type <= OSD_LAST, ("Invalid type."));
+
+ if (osd->osd_nslots == 0) {
+ KASSERT(osd->osd_slots == NULL, ("Non-null osd_slots."));
+ /* No OSD attached, just leave. */
+ return;
+ }
+
+ rm_rlock(&osd_object_lock[type], &tracker);
+ for (i = 1; i <= osd->osd_nslots; i++) {
+ if (osd_destructors[type][i - 1] != NULL)
+ do_osd_del(type, osd, i, 0);
+ else
+ OSD_DEBUG("Unused slot (type=%u, slot=%u).", type, i);
+ }
+ rm_runlock(&osd_object_lock[type], &tracker);
+ OSD_DEBUG("Object exit (type=%u).", type);
+}
+
+static void
+osd_init(void *arg __unused)
+{
+ u_int i;
+
+ for (i = OSD_FIRST; i <= OSD_LAST; i++) {
+ osd_nslots[i] = 0;
+ LIST_INIT(&osd_list[i]);
+ sx_init(&osd_module_lock[i], "osd_module");
+ rm_init(&osd_object_lock[i], "osd_object");
+ mtx_init(&osd_list_lock[i], "osd_list", NULL, MTX_DEF);
+ osd_destructors[i] = NULL;
+ osd_methods[i] = NULL;
+ }
+}
+SYSINIT(osd, SI_SUB_LOCK, SI_ORDER_ANY, osd_init, NULL);
diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c
new file mode 100644
index 0000000..b37b9f3
--- /dev/null
+++ b/sys/kern/kern_physio.c
@@ -0,0 +1,170 @@
+/*-
+ * Copyright (c) 1994 John S. Dyson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice immediately at the beginning of the file, without modification,
+ * this list of conditions, and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Absolutely no warranty of function or purpose is made by the author
+ * John S. Dyson.
+ * 4. Modifications may be freely made to this file if the above conditions
+ * are met.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+int
+physio(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ struct buf *bp;
+ struct cdevsw *csw;
+ caddr_t sa;
+ u_int iolen;
+ int error, i, mapped;
+
+ /* Keep the process UPAGES from being swapped. XXX: why ? */
+ PHOLD(curproc);
+
+ bp = getpbuf(NULL);
+ sa = bp->b_data;
+ error = 0;
+
+ /* XXX: sanity check */
+ if (dev->si_iosize_max < PAGE_SIZE) {
+ printf("WARNING: %s si_iosize_max=%d, using DFLTPHYS.\n",
+ devtoname(dev), dev->si_iosize_max);
+ dev->si_iosize_max = DFLTPHYS;
+ }
+
+ /*
+ * If the driver does not want I/O to be split, we need to reject
+ * any request that will not fit into one buffer.
+ */
+ if (dev->si_flags & SI_NOSPLIT &&
+ (uio->uio_resid > dev->si_iosize_max || uio->uio_resid > MAXPHYS ||
+ uio->uio_iovcnt > 1)) {
+ /*
+ * Tell the user why his I/O was rejected.
+ */
+ if (uio->uio_resid > dev->si_iosize_max)
+ uprintf("%s: request size=%zd > si_iosize_max=%d; "
+ "cannot split request\n", devtoname(dev),
+ uio->uio_resid, dev->si_iosize_max);
+ if (uio->uio_resid > MAXPHYS)
+ uprintf("%s: request size=%zd > MAXPHYS=%d; "
+ "cannot split request\n", devtoname(dev),
+ uio->uio_resid, MAXPHYS);
+ if (uio->uio_iovcnt > 1)
+ uprintf("%s: request vectors=%d > 1; "
+ "cannot split request\n", devtoname(dev),
+ uio->uio_iovcnt);
+
+ error = EFBIG;
+ goto doerror;
+ }
+
+ for (i = 0; i < uio->uio_iovcnt; i++) {
+ while (uio->uio_iov[i].iov_len) {
+ bp->b_flags = 0;
+ if (uio->uio_rw == UIO_READ) {
+ bp->b_iocmd = BIO_READ;
+ curthread->td_ru.ru_inblock++;
+ } else {
+ bp->b_iocmd = BIO_WRITE;
+ curthread->td_ru.ru_oublock++;
+ }
+ bp->b_iodone = bdone;
+ bp->b_data = uio->uio_iov[i].iov_base;
+ bp->b_bcount = uio->uio_iov[i].iov_len;
+ bp->b_offset = uio->uio_offset;
+ bp->b_iooffset = uio->uio_offset;
+ bp->b_saveaddr = sa;
+
+ /* Don't exceed the driver's iosize limit. */
+ if (bp->b_bcount > dev->si_iosize_max)
+ bp->b_bcount = dev->si_iosize_max;
+
+ /*
+ * Make sure the pbuf can map the request
+ * XXX: The pbuf has kvasize = MAXPHYS so a request
+ * XXX: larger than MAXPHYS - PAGE_SIZE must be
+ * XXX: page aligned or it will be fragmented.
+ */
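+ /*
+ * Example (illustrative, assuming MAXPHYS = 128 KiB and
+ * PAGE_SIZE = 4 KiB): a large transfer whose buffer starts
+ * 512 bytes into a page is clamped to 128 KiB - 4 KiB =
+ * 124 KiB per pass.
+ */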
+ iolen = ((vm_offset_t) bp->b_data) & PAGE_MASK;
+ if ((bp->b_bcount + iolen) > bp->b_kvasize) {
+ /*
+ * This device does not want I/O to be split.
+ */
+ if (dev->si_flags & SI_NOSPLIT) {
+ uprintf("%s: request ptr %p is not "
+ "on a page boundary; cannot split "
+ "request\n", devtoname(dev),
+ bp->b_data);
+ error = EFBIG;
+ goto doerror;
+ }
+ bp->b_bcount = bp->b_kvasize;
+ if (iolen != 0)
+ bp->b_bcount -= PAGE_SIZE;
+ }
+ bp->b_bufsize = bp->b_bcount;
+
+ bp->b_blkno = btodb(bp->b_offset);
+
+ csw = dev->si_devsw;
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ if (dev->si_flags & SI_UNMAPPED)
+ mapped = 0;
+ else
+ mapped = 1;
+ if (vmapbuf(bp, mapped) < 0) {
+ error = EFAULT;
+ goto doerror;
+ }
+ }
+
+ dev_strategy_csw(dev, csw, bp);
+ if (uio->uio_rw == UIO_READ)
+ bwait(bp, PRIBIO, "physrd");
+ else
+ bwait(bp, PRIBIO, "physwr");
+
+ if (uio->uio_segflg == UIO_USERSPACE)
+ vunmapbuf(bp);
+ iolen = bp->b_bcount - bp->b_resid;
+ if (iolen == 0 && !(bp->b_ioflags & BIO_ERROR))
+ goto doerror; /* EOF */
+ uio->uio_iov[i].iov_len -= iolen;
+ uio->uio_iov[i].iov_base =
+ (char *)uio->uio_iov[i].iov_base + iolen;
+ uio->uio_resid -= iolen;
+ uio->uio_offset += iolen;
+ if (bp->b_ioflags & BIO_ERROR) {
+ error = bp->b_error;
+ goto doerror;
+ }
+ }
+ }
+doerror:
+ relpbuf(bp, NULL);
+ PRELE(curproc);
+ return (error);
+}
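+
+/*
+ * Illustrative only (not part of this file): a character device driver
+ * typically routes raw reads and writes through physio() from its
+ * cdevsw methods, e.g. with a hypothetical mydev:
+ *
+ *	static int
+ *	mydev_rdwr(struct cdev *dev, struct uio *uio, int ioflag)
+ *	{
+ *		return (physio(dev, uio, ioflag));
+ *	}
+ */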
diff --git a/sys/kern/kern_pmc.c b/sys/kern/kern_pmc.c
new file mode 100644
index 0000000..2b50be0
--- /dev/null
+++ b/sys/kern/kern_pmc.c
@@ -0,0 +1,345 @@
+/*-
+ * Copyright (c) 2003-2008 Joseph Koshy
+ * Copyright (c) 2007 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by A. Joseph Koshy under
+ * sponsorship from the FreeBSD Foundation and Google, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_hwpmc_hooks.h"
+
+#include <sys/types.h>
+#include <sys/ctype.h>
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/pmc.h>
+#include <sys/pmckern.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#ifdef HWPMC_HOOKS
+FEATURE(hwpmc_hooks, "Kernel support for HW PMC");
+#define PMC_KERNEL_VERSION PMC_VERSION
+#else
+#define PMC_KERNEL_VERSION 0
+#endif
+
+MALLOC_DECLARE(M_PMCHOOKS);
+MALLOC_DEFINE(M_PMCHOOKS, "pmchooks", "Memory space for PMC hooks");
+
+const int pmc_kernel_version = PMC_KERNEL_VERSION;
+
+/* Hook variable. */
+int (*pmc_hook)(struct thread *td, int function, void *arg) = NULL;
+
+/* Interrupt handler */
+int (*pmc_intr)(int cpu, struct trapframe *tf) = NULL;
+
+/* Bitmask of CPUs requiring servicing at hardclock time */
+volatile cpuset_t pmc_cpumask;
+
+/*
+ * A global count of SS mode PMCs. When non-zero, this means that
+ * we have processes that are sampling the system as a whole.
+ */
+volatile int pmc_ss_count;
+
+/*
+ * Since PMC(4) may not be loaded in the current kernel, the
+ * convention followed is that a non-NULL value of 'pmc_hook' implies
+ * the presence of this kernel module.
+ *
+ * This requires us to protect 'pmc_hook' with a
+ * shared (sx) lock -- thus making the process of calling into PMC(4)
+ * somewhat more expensive than a simple 'if' check and indirect call.
+ */
+struct sx pmc_sx;
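+
+/*
+ * Illustrative only: a call site would typically enter the hook along
+ * these lines, where "function" and "arg" stand for the caller's event
+ * and payload:
+ *
+ *	sx_slock(&pmc_sx);
+ *	if (pmc_hook != NULL)
+ *		(*pmc_hook)(curthread, function, arg);
+ *	sx_sunlock(&pmc_sx);
+ */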
+
+/*
+ * PMC Soft per cpu trapframe.
+ */
+struct trapframe pmc_tf[MAXCPU];
+
+/*
+ * Soft PMCs use a global table to store registered events.
+ */
+
+SYSCTL_NODE(_kern, OID_AUTO, hwpmc, CTLFLAG_RW, 0, "HWPMC parameters");
+
+static int pmc_softevents = 16;
+TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "softevents", &pmc_softevents);
+SYSCTL_INT(_kern_hwpmc, OID_AUTO, softevents, CTLFLAG_TUN|CTLFLAG_RD,
+ &pmc_softevents, 0, "maximum number of soft events");
+
+struct mtx pmc_softs_mtx;
+int pmc_softs_count;
+struct pmc_soft **pmc_softs;
+
+MTX_SYSINIT(pmc_soft_mtx, &pmc_softs_mtx, "pmc-softs", MTX_SPIN);
+
+static void
+pmc_init_sx(void)
+{
+ sx_init_flags(&pmc_sx, "pmc-sx", SX_NOWITNESS);
+}
+
+SYSINIT(pmcsx, SI_SUB_LOCK, SI_ORDER_MIDDLE, pmc_init_sx, NULL);
+
+/*
+ * Helper functions.
+ */
+
+/*
+ * A note on the CPU numbering scheme used by the hwpmc(4) driver.
+ *
+ * CPUs are denoted using numbers in the range 0..[pmc_cpu_max()-1].
+ * CPUs could be numbered "sparsely" in this range; the predicate
+ * `pmc_cpu_is_present()' is used to test whether a given CPU is
+ * physically present.
+ *
+ * Further, a CPU that is physically present may be administratively
+ * disabled or otherwise unavailable for use by hwpmc(4). The
+ * `pmc_cpu_is_active()' predicate tests for CPU usability. An
+ * "active" CPU participates in thread scheduling and can field
+ * interrupts raised by PMC hardware.
+ *
+ * On systems with hyperthreaded CPUs, multiple logical CPUs may share
+ * PMC hardware resources. For such processors one logical CPU is
+ * denoted as the primary owner of the in-CPU PMC resources. The
+ * pmc_cpu_is_primary() predicate is used to distinguish this primary
+ * CPU from the others.
+ */
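+
+/*
+ * Sketch of how a consumer would walk CPUs with these predicates
+ * (illustrative only):
+ *
+ *	for (cpu = 0; cpu < pmc_cpu_max(); cpu++) {
+ *		if (!pmc_cpu_is_active(cpu))
+ *			continue;
+ *		... per-CPU work ...
+ *	}
+ */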
+
+int
+pmc_cpu_is_active(int cpu)
+{
+#ifdef SMP
+ return (pmc_cpu_is_present(cpu) &&
+ !CPU_ISSET(cpu, &hlt_cpus_mask));
+#else
+ return (1);
+#endif
+}
+
+/* Deprecated. */
+int
+pmc_cpu_is_disabled(int cpu)
+{
+ return (!pmc_cpu_is_active(cpu));
+}
+
+int
+pmc_cpu_is_present(int cpu)
+{
+#ifdef SMP
+ return (!CPU_ABSENT(cpu));
+#else
+ return (1);
+#endif
+}
+
+int
+pmc_cpu_is_primary(int cpu)
+{
+#ifdef SMP
+ return (!CPU_ISSET(cpu, &logical_cpus_mask));
+#else
+ return (1);
+#endif
+}
+
+
+/*
+ * Return the maximum CPU number supported by the system. The return
+ * value is used for scaling internal data structures and for runtime
+ * checks.
+ */
+unsigned int
+pmc_cpu_max(void)
+{
+#ifdef SMP
+ return (mp_maxid+1);
+#else
+ return (1);
+#endif
+}
+
+#ifdef INVARIANTS
+
+/*
+ * Return the count of CPUs in the `active' state in the system.
+ */
+int
+pmc_cpu_max_active(void)
+{
+#ifdef SMP
+ /*
+ * When support for CPU hot-plugging is added to the kernel,
+ * this function would change to return the current number
+ * of "active" CPUs.
+ */
+ return (mp_ncpus);
+#else
+ return (1);
+#endif
+}
+
+#endif
+
+/*
+ * Cleanup event name:
+ * - remove duplicate '_'
+ * - all uppercase
+ */
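+/*
+ * For example (derived from the code below), "__soft__page_fault_"
+ * becomes "SOFT_PAGE_FAULT".
+ */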
+static void
+pmc_soft_namecleanup(char *name)
+{
+ char *p, *q;
+
+ p = q = name;
+
+ for ( ; *p == '_' ; p++)
+ ;
+ for ( ; *p ; p++) {
+ if (*p == '_' && (*(p + 1) == '_' || *(p + 1) == '\0'))
+ continue;
+ else
+ *q++ = toupper(*p);
+ }
+ *q = '\0';
+}
+
+void
+pmc_soft_ev_register(struct pmc_soft *ps)
+{
+ static int warned = 0;
+ int n;
+
+ ps->ps_running = 0;
+ ps->ps_ev.pm_ev_code = 0; /* invalid */
+ pmc_soft_namecleanup(ps->ps_ev.pm_ev_name);
+
+ mtx_lock_spin(&pmc_softs_mtx);
+
+ if (pmc_softs_count >= pmc_softevents) {
+ /*
+ * XXX Reusing events can lead to a race condition where a
+ * newly allocated event is treated as an old one.
+ */
+ for (n = 0; n < pmc_softevents; n++)
+ if (pmc_softs[n] == NULL)
+ break;
+ if (n == pmc_softevents) {
+ mtx_unlock_spin(&pmc_softs_mtx);
+ if (!warned) {
+ printf("hwpmc: too many soft events, "
+ "increase kern.hwpmc.softevents tunable\n");
+ warned = 1;
+ }
+ return;
+ }
+
+ ps->ps_ev.pm_ev_code = PMC_EV_SOFT_FIRST + n;
+ pmc_softs[n] = ps;
+ } else {
+ ps->ps_ev.pm_ev_code = PMC_EV_SOFT_FIRST + pmc_softs_count;
+ pmc_softs[pmc_softs_count++] = ps;
+ }
+
+ mtx_unlock_spin(&pmc_softs_mtx);
+}
+
+void
+pmc_soft_ev_deregister(struct pmc_soft *ps)
+{
+
+ KASSERT(ps != NULL, ("pmc_soft_deregister: called with NULL"));
+
+ mtx_lock_spin(&pmc_softs_mtx);
+
+ if (ps->ps_ev.pm_ev_code != 0 &&
+ (ps->ps_ev.pm_ev_code - PMC_EV_SOFT_FIRST) < pmc_softevents) {
+ KASSERT(ps->ps_ev.pm_ev_code >= PMC_EV_SOFT_FIRST &&
+ ps->ps_ev.pm_ev_code <= PMC_EV_SOFT_LAST,
+ ("pmc_soft_deregister: invalid event value"));
+ pmc_softs[ps->ps_ev.pm_ev_code - PMC_EV_SOFT_FIRST] = NULL;
+ }
+
+ mtx_unlock_spin(&pmc_softs_mtx);
+}
+
+struct pmc_soft *
+pmc_soft_ev_acquire(enum pmc_event ev)
+{
+ struct pmc_soft *ps;
+
+ if (ev == 0 || (ev - PMC_EV_SOFT_FIRST) >= pmc_softevents)
+ return (NULL);
+
+ KASSERT(ev >= PMC_EV_SOFT_FIRST &&
+ ev <= PMC_EV_SOFT_LAST,
+ ("event out of range"));
+
+ mtx_lock_spin(&pmc_softs_mtx);
+
+ ps = pmc_softs[ev - PMC_EV_SOFT_FIRST];
+ if (ps == NULL)
+ mtx_unlock_spin(&pmc_softs_mtx);
+
+ return (ps);
+}
+
+void
+pmc_soft_ev_release(struct pmc_soft *ps)
+{
+
+ mtx_unlock_spin(&pmc_softs_mtx);
+}
+
+/*
+ * Initialise hwpmc.
+ */
+static void
+init_hwpmc(void *dummy __unused)
+{
+ if (pmc_softevents <= 0 ||
+ pmc_softevents > PMC_EV_DYN_COUNT) {
+ (void) printf("hwpmc: tunable \"softevents\"=%d out of "
+ "range.\n", pmc_softevents);
+ pmc_softevents = PMC_EV_DYN_COUNT;
+ }
+ pmc_softs = malloc(pmc_softevents * sizeof(struct pmc_soft *), M_PMCHOOKS, M_NOWAIT|M_ZERO);
+ KASSERT(pmc_softs != NULL, ("cannot allocate soft events table"));
+}
+
+SYSINIT(hwpmc, SI_SUB_KDTRACE, SI_ORDER_FIRST, init_hwpmc, NULL);
+
diff --git a/sys/kern/kern_poll.c b/sys/kern/kern_poll.c
new file mode 100644
index 0000000..349f338
--- /dev/null
+++ b/sys/kern/kern_poll.c
@@ -0,0 +1,567 @@
+/*-
+ * Copyright (c) 2001-2002 Luigi Rizzo
+ *
+ * Supported by: the Xorp Project (www.xorp.org)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_device_polling.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/proc.h>
+#include <sys/eventhandler.h>
+#include <sys/resourcevar.h>
+#include <sys/socket.h> /* needed by net/if.h */
+#include <sys/sockio.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+
+#include <net/if.h> /* for IFF_* flags */
+#include <net/netisr.h> /* for NETISR_POLL */
+#include <net/vnet.h>
+
+void hardclock_device_poll(void); /* hook from hardclock */
+
+static struct mtx poll_mtx;
+
+/*
+ * Polling support for [network] device drivers.
+ *
+ * Drivers which support this feature can register with the
+ * polling code.
+ *
+ * If registration is successful, the driver must disable interrupts,
+ * and further I/O is performed through the handler, which is invoked
+ * (at least once per clock tick) with 3 arguments: the "arg" passed at
+ * register time (a struct ifnet pointer), a command, and a "count" limit.
+ *
+ * The command can be one of the following:
+ * POLL_ONLY: quick move of "count" packets from input/output queues.
+ * POLL_AND_CHECK_STATUS: as above, plus check status registers or do
+ * other more expensive operations. This command is issued periodically
+ * but less frequently than POLL_ONLY.
+ *
+ * The count limit specifies how much work the handler can do during the
+ * call -- typically this is the number of packets to be received, or
+ * transmitted, etc. (drivers are free to interpret this number, as long
+ * as the max time spent in the function grows roughly linearly with the
+ * count).
+ *
+ * Polling is enabled and disabled by setting the IFCAP_POLLING flag
+ * on the interface. The driver's ioctl handler should register the
+ * interface with polling and disable interrupts if registration was
+ * successful.
+ *
+ * A second variable controls the sharing of CPU between polling/kernel
+ * network processing, and other activities (typically userlevel tasks):
+ * kern.polling.user_frac (between 0 and 100, default 50) sets the share
+ * of CPU allocated to user tasks. CPU is allocated proportionally to the
+ * shares, by dynamically adjusting the "count" (poll_burst).
+ *
+ * Other parameters should be left at their default values.
+ * The following constraints hold:
+ *
+ * 1 <= poll_each_burst <= poll_burst <= poll_burst_max
+ * MIN_POLL_BURST_MAX <= poll_burst_max <= MAX_POLL_BURST_MAX
+ */
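+
+/*
+ * Minimal driver-side sketch (illustrative only; mydrv_poll,
+ * mydrv_intr_disable and sc are hypothetical):
+ *
+ *	case SIOCSIFCAP:
+ *		if ((ifr->ifr_reqcap & IFCAP_POLLING) != 0 &&
+ *		    (ifp->if_capenable & IFCAP_POLLING) == 0) {
+ *			error = ether_poll_register(mydrv_poll, ifp);
+ *			if (error == 0) {
+ *				mydrv_intr_disable(sc);
+ *				ifp->if_capenable |= IFCAP_POLLING;
+ *			}
+ *		}
+ *		break;
+ */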
+
+#define MIN_POLL_BURST_MAX 10
+#define MAX_POLL_BURST_MAX 20000
+
+static uint32_t poll_burst = 5;
+static uint32_t poll_burst_max = 150; /* good for 100Mbit net and HZ=1000 */
+static uint32_t poll_each_burst = 5;
+
+static SYSCTL_NODE(_kern, OID_AUTO, polling, CTLFLAG_RW, 0,
+ "Device polling parameters");
+
+SYSCTL_UINT(_kern_polling, OID_AUTO, burst, CTLFLAG_RD,
+ &poll_burst, 0, "Current polling burst size");
+
+static int netisr_poll_scheduled;
+static int netisr_pollmore_scheduled;
+static int poll_shutting_down;
+
+static int poll_burst_max_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ uint32_t val = poll_burst_max;
+ int error;
+
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (val < MIN_POLL_BURST_MAX || val > MAX_POLL_BURST_MAX)
+ return (EINVAL);
+
+ mtx_lock(&poll_mtx);
+ poll_burst_max = val;
+ if (poll_burst > poll_burst_max)
+ poll_burst = poll_burst_max;
+ if (poll_each_burst > poll_burst_max)
+ poll_each_burst = MIN_POLL_BURST_MAX;
+ mtx_unlock(&poll_mtx);
+
+ return (0);
+}
+SYSCTL_PROC(_kern_polling, OID_AUTO, burst_max, CTLTYPE_UINT | CTLFLAG_RW,
+ 0, sizeof(uint32_t), poll_burst_max_sysctl, "I", "Max Polling burst size");
+
+static int poll_each_burst_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ uint32_t val = poll_each_burst;
+ int error;
+
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (val < 1)
+ return (EINVAL);
+
+ mtx_lock(&poll_mtx);
+ if (val > poll_burst_max) {
+ mtx_unlock(&poll_mtx);
+ return (EINVAL);
+ }
+ poll_each_burst = val;
+ mtx_unlock(&poll_mtx);
+
+ return (0);
+}
+SYSCTL_PROC(_kern_polling, OID_AUTO, each_burst, CTLTYPE_UINT | CTLFLAG_RW,
+ 0, sizeof(uint32_t), poll_each_burst_sysctl, "I",
+ "Max size of each burst");
+
+static uint32_t poll_in_idle_loop = 0; /* do we poll in idle loop? */
+SYSCTL_UINT(_kern_polling, OID_AUTO, idle_poll, CTLFLAG_RW,
+ &poll_in_idle_loop, 0, "Enable device polling in idle loop");
+
+static uint32_t user_frac = 50;
+static int user_frac_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ uint32_t val = user_frac;
+ int error;
+
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (val > 99)
+ return (EINVAL);
+
+ mtx_lock(&poll_mtx);
+ user_frac = val;
+ mtx_unlock(&poll_mtx);
+
+ return (0);
+}
+SYSCTL_PROC(_kern_polling, OID_AUTO, user_frac, CTLTYPE_UINT | CTLFLAG_RW,
+ 0, sizeof(uint32_t), user_frac_sysctl, "I",
+ "Desired user fraction of cpu time");
+
+static uint32_t reg_frac_count = 0;
+static uint32_t reg_frac = 20;
+static int reg_frac_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ uint32_t val = reg_frac;
+ int error;
+
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (val < 1 || val > hz)
+ return (EINVAL);
+
+ mtx_lock(&poll_mtx);
+ reg_frac = val;
+ if (reg_frac_count >= reg_frac)
+ reg_frac_count = 0;
+ mtx_unlock(&poll_mtx);
+
+ return (0);
+}
+SYSCTL_PROC(_kern_polling, OID_AUTO, reg_frac, CTLTYPE_UINT | CTLFLAG_RW,
+ 0, sizeof(uint32_t), reg_frac_sysctl, "I",
+ "Every this many cycles check registers");
+
+static uint32_t short_ticks;
+SYSCTL_UINT(_kern_polling, OID_AUTO, short_ticks, CTLFLAG_RD,
+ &short_ticks, 0, "Hardclock ticks shorter than they should be");
+
+static uint32_t lost_polls;
+SYSCTL_UINT(_kern_polling, OID_AUTO, lost_polls, CTLFLAG_RD,
+ &lost_polls, 0, "How many times we would have lost a poll tick");
+
+static uint32_t pending_polls;
+SYSCTL_UINT(_kern_polling, OID_AUTO, pending_polls, CTLFLAG_RD,
+ &pending_polls, 0, "Do we need to poll again");
+
+static int residual_burst = 0;
+SYSCTL_INT(_kern_polling, OID_AUTO, residual_burst, CTLFLAG_RD,
+ &residual_burst, 0, "# of residual cycles in burst");
+
+static uint32_t poll_handlers; /* next free entry in pr[]. */
+SYSCTL_UINT(_kern_polling, OID_AUTO, handlers, CTLFLAG_RD,
+ &poll_handlers, 0, "Number of registered poll handlers");
+
+static uint32_t phase;
+SYSCTL_UINT(_kern_polling, OID_AUTO, phase, CTLFLAG_RD,
+ &phase, 0, "Polling phase");
+
+static uint32_t suspect;
+SYSCTL_UINT(_kern_polling, OID_AUTO, suspect, CTLFLAG_RD,
+ &suspect, 0, "suspect event");
+
+static uint32_t stalled;
+SYSCTL_UINT(_kern_polling, OID_AUTO, stalled, CTLFLAG_RD,
+ &stalled, 0, "potential stalls");
+
+static uint32_t idlepoll_sleeping; /* idlepoll is sleeping */
+SYSCTL_UINT(_kern_polling, OID_AUTO, idlepoll_sleeping, CTLFLAG_RD,
+ &idlepoll_sleeping, 0, "idlepoll is sleeping");
+
+
+#define POLL_LIST_LEN 128
+struct pollrec {
+ poll_handler_t *handler;
+ struct ifnet *ifp;
+};
+
+static struct pollrec pr[POLL_LIST_LEN];
+
+static void
+poll_shutdown(void *arg, int howto)
+{
+
+ poll_shutting_down = 1;
+}
+
+static void
+init_device_poll(void)
+{
+
+ mtx_init(&poll_mtx, "polling", NULL, MTX_DEF);
+ EVENTHANDLER_REGISTER(shutdown_post_sync, poll_shutdown, NULL,
+ SHUTDOWN_PRI_LAST);
+}
+SYSINIT(device_poll, SI_SUB_CLOCKS, SI_ORDER_MIDDLE, init_device_poll, NULL);
+
+
+/*
+ * Hook from hardclock. Tries to schedule a netisr, but keeps track
+ * of lost ticks due to the previous handler taking too long.
+ * Normally, this should not happen, because the polling handler should
+ * run for a short time. However, in some cases (e.g. when there are
+ * changes in link status etc.) the drivers take a very long time
+ * (even in the order of milliseconds) to reset and reconfigure the
+ * device, causing apparent lost polls.
+ *
+ * The first part of the code is just for debugging purposes, and tries
+ * to count how often hardclock ticks are shorter than they should be,
+ * meaning either stray interrupts or delayed events.
+ */
+void
+hardclock_device_poll(void)
+{
+ static struct timeval prev_t, t;
+ int delta;
+
+ if (poll_handlers == 0 || poll_shutting_down)
+ return;
+
+ microuptime(&t);
+ delta = (t.tv_usec - prev_t.tv_usec) +
+ (t.tv_sec - prev_t.tv_sec)*1000000;
+ if (delta * hz < 500000)
+ short_ticks++;
+ else
+ prev_t = t;
+
+ if (pending_polls > 100) {
+ /*
+ * Too many, assume polling has stalled (not always true;
+ * see the comment above).
+ */
+ stalled++;
+ pending_polls = 0;
+ phase = 0;
+ }
+
+ if (phase <= 2) {
+ if (phase != 0)
+ suspect++;
+ phase = 1;
+ netisr_poll_scheduled = 1;
+ netisr_pollmore_scheduled = 1;
+ netisr_sched_poll();
+ phase = 2;
+ }
+ if (pending_polls++ > 0)
+ lost_polls++;
+}
+
+/*
+ * ether_poll is called from the idle loop.
+ */
+static void
+ether_poll(int count)
+{
+ int i;
+
+ mtx_lock(&poll_mtx);
+
+ if (count > poll_each_burst)
+ count = poll_each_burst;
+
+ for (i = 0 ; i < poll_handlers ; i++)
+ pr[i].handler(pr[i].ifp, POLL_ONLY, count);
+
+ mtx_unlock(&poll_mtx);
+}
+
+/*
+ * netisr_pollmore is called after other netisr's, possibly scheduling
+ * another NETISR_POLL call, or adapting the burst size for the next cycle.
+ *
+ * It is very bad to fetch large bursts of packets from a single card at once,
+ * because the burst could take a long time to be completely processed, or
+ * could saturate the intermediate queue (ipintrq or similar) leading to
+ * losses or unfairness. To reduce the problem, and also to account better for
+ * time spent in network-related processing, we split the burst in smaller
+ * chunks of fixed size, giving control to the other netisr's between chunks.
+ * This helps in improving the fairness, reducing livelock (because we
+ * emulate more closely the "process to completion" that we have with
+ * fastforwarding) and accounting for the work performed in low level
+ * handling and forwarding.
+ */
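+
+/*
+ * For example (illustrative): if poll_burst has grown to 150 and
+ * poll_each_burst is 5, netisr_poll() runs 30 times during the tick,
+ * handing each registered handler 5 packets per pass and letting the
+ * other netisrs run between passes.
+ */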
+
+static struct timeval poll_start_t;
+
+void
+netisr_pollmore()
+{
+ struct timeval t;
+ int kern_load;
+
+ mtx_lock(&poll_mtx);
+ if (!netisr_pollmore_scheduled) {
+ mtx_unlock(&poll_mtx);
+ return;
+ }
+ netisr_pollmore_scheduled = 0;
+ phase = 5;
+ if (residual_burst > 0) {
+ netisr_poll_scheduled = 1;
+ netisr_pollmore_scheduled = 1;
+ netisr_sched_poll();
+ mtx_unlock(&poll_mtx);
+ /* will run immediately on return, followed by netisrs */
+ return;
+ }
+ /* here we can account time spent in netisr's in this tick */
+ microuptime(&t);
+ kern_load = (t.tv_usec - poll_start_t.tv_usec) +
+ (t.tv_sec - poll_start_t.tv_sec)*1000000; /* us */
+ kern_load = (kern_load * hz) / 10000; /* 0..100 */
+ if (kern_load > (100 - user_frac)) { /* try decrease ticks */
+ if (poll_burst > 1)
+ poll_burst--;
+ } else {
+ if (poll_burst < poll_burst_max)
+ poll_burst++;
+ }
+
+ pending_polls--;
+ if (pending_polls == 0) /* we are done */
+ phase = 0;
+ else {
+ /*
+ * Last cycle was long and caused us to miss one or more
+ * hardclock ticks. Restart processing, but slightly reduce the
+ * burst size to prevent this from happening again.
+ */
+ poll_burst -= (poll_burst / 8);
+ if (poll_burst < 1)
+ poll_burst = 1;
+ netisr_poll_scheduled = 1;
+ netisr_pollmore_scheduled = 1;
+ netisr_sched_poll();
+ phase = 6;
+ }
+ mtx_unlock(&poll_mtx);
+}
+
+/*
+ * netisr_poll is typically scheduled once per tick.
+ */
+void
+netisr_poll(void)
+{
+ int i, cycles;
+ enum poll_cmd arg = POLL_ONLY;
+
+ mtx_lock(&poll_mtx);
+ if (!netisr_poll_scheduled) {
+ mtx_unlock(&poll_mtx);
+ return;
+ }
+ netisr_poll_scheduled = 0;
+ phase = 3;
+ if (residual_burst == 0) { /* first call in this tick */
+ microuptime(&poll_start_t);
+ if (++reg_frac_count == reg_frac) {
+ arg = POLL_AND_CHECK_STATUS;
+ reg_frac_count = 0;
+ }
+
+ residual_burst = poll_burst;
+ }
+ cycles = (residual_burst < poll_each_burst) ?
+ residual_burst : poll_each_burst;
+ residual_burst -= cycles;
+
+ for (i = 0 ; i < poll_handlers ; i++)
+ pr[i].handler(pr[i].ifp, arg, cycles);
+
+ phase = 4;
+ mtx_unlock(&poll_mtx);
+}
+
+/*
+ * Try to register routine for polling. Returns 0 if successful
+ * (and polling should be enabled), error code otherwise.
+ * A device is not supposed to register itself multiple times.
+ *
+ * This is called from within the *_ioctl() functions.
+ */
+int
+ether_poll_register(poll_handler_t *h, struct ifnet *ifp)
+{
+ int i;
+
+ KASSERT(h != NULL, ("%s: handler is NULL", __func__));
+ KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));
+
+ mtx_lock(&poll_mtx);
+ if (poll_handlers >= POLL_LIST_LEN) {
+ /*
+ * List full, cannot register more entries.
+ * This should never happen; if it does, it is probably a
+ * broken driver trying to register multiple times. Checking
+ * this at runtime is expensive, and won't solve the problem
+ * anyway, so just report a few times and then give up.
+ */
+ static int verbose = 10;
+ if (verbose > 0) {
+ log(LOG_ERR, "poll handlers list full, "
+ "maybe a broken driver?\n");
+ verbose--;
+ }
+ mtx_unlock(&poll_mtx);
+ return (ENOMEM); /* no polling for you */
+ }
+
+ for (i = 0 ; i < poll_handlers ; i++)
+ if (pr[i].ifp == ifp && pr[i].handler != NULL) {
+ mtx_unlock(&poll_mtx);
+ log(LOG_DEBUG, "ether_poll_register: %s: handler"
+ " already registered\n", ifp->if_xname);
+ return (EEXIST);
+ }
+
+ pr[poll_handlers].handler = h;
+ pr[poll_handlers].ifp = ifp;
+ poll_handlers++;
+ mtx_unlock(&poll_mtx);
+ if (idlepoll_sleeping)
+ wakeup(&idlepoll_sleeping);
+ return (0);
+}
+
+/*
+ * Remove interface from the polling list. Called from *_ioctl(), too.
+ */
+int
+ether_poll_deregister(struct ifnet *ifp)
+{
+ int i;
+
+ KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));
+
+ mtx_lock(&poll_mtx);
+
+ for (i = 0 ; i < poll_handlers ; i++)
+ if (pr[i].ifp == ifp) /* found it */
+ break;
+ if (i == poll_handlers) {
+ log(LOG_DEBUG, "ether_poll_deregister: %s: not found!\n",
+ ifp->if_xname);
+ mtx_unlock(&poll_mtx);
+ return (ENOENT);
+ }
+ poll_handlers--;
+ if (i < poll_handlers) { /* Last entry replaces this one. */
+ pr[i].handler = pr[poll_handlers].handler;
+ pr[i].ifp = pr[poll_handlers].ifp;
+ }
+ mtx_unlock(&poll_mtx);
+ return (0);
+}
+
+static void
+poll_idle(void)
+{
+ struct thread *td = curthread;
+ struct rtprio rtp;
+
+ rtp.prio = RTP_PRIO_MAX; /* lowest priority */
+ rtp.type = RTP_PRIO_IDLE;
+ PROC_SLOCK(td->td_proc);
+ rtp_to_pri(&rtp, td);
+ PROC_SUNLOCK(td->td_proc);
+
+ for (;;) {
+ if (poll_in_idle_loop && poll_handlers > 0) {
+ idlepoll_sleeping = 0;
+ ether_poll(poll_each_burst);
+ thread_lock(td);
+ mi_switch(SW_VOL, NULL);
+ thread_unlock(td);
+ } else {
+ idlepoll_sleeping = 1;
+ tsleep(&idlepoll_sleeping, 0, "pollid", hz * 3);
+ }
+ }
+}
+
+static struct proc *idlepoll;
+static struct kproc_desc idlepoll_kp = {
+ "idlepoll",
+ poll_idle,
+ &idlepoll
+};
+SYSINIT(idlepoll, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, kproc_start,
+ &idlepoll_kp);
diff --git a/sys/kern/kern_priv.c b/sys/kern/kern_priv.c
new file mode 100644
index 0000000..4d266ab
--- /dev/null
+++ b/sys/kern/kern_priv.c
@@ -0,0 +1,185 @@
+/*-
+ * Copyright (c) 2006 nCircle Network Security, Inc.
+ * Copyright (c) 2009 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson for the TrustedBSD
+ * Project under contract to nCircle Network Security, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY,
+ * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "opt_kdtrace.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/sdt.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <security/mac/mac_framework.h>
+
+/*
+ * `suser_enabled' (which can be set by the security.bsd.suser_enabled
+ * sysctl) determines whether the system 'super-user' policy is in effect. If
+ * it is nonzero, an effective uid of 0 connotes special privilege,
+ * overriding many mandatory and discretionary protections. If it is zero,
+ * uid 0 is offered no special privilege in the kernel security policy.
+ * Setting it to zero may seriously impact the functionality of many existing
+ * userland programs, and should not be done without careful consideration of
+ * the consequences.
+ */
+static int suser_enabled = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, suser_enabled, CTLFLAG_RW,
+ &suser_enabled, 0, "processes with uid 0 have privilege");
+TUNABLE_INT("security.bsd.suser_enabled", &suser_enabled);
+
+static int unprivileged_mlock = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_mlock, CTLFLAG_RW|CTLFLAG_TUN,
+ &unprivileged_mlock, 0, "Allow non-root users to call mlock(2)");
+TUNABLE_INT("security.bsd.unprivileged_mlock", &unprivileged_mlock);
+
+SDT_PROVIDER_DEFINE(priv);
+SDT_PROBE_DEFINE1(priv, kernel, priv_check, priv_ok, priv-ok, "int");
+SDT_PROBE_DEFINE1(priv, kernel, priv_check, priv_err, priv-err, "int");
+
+/*
+ * Check a credential for privilege. Lots of good reasons to deny privilege;
+ * only a few to grant it.
+ */
+int
+priv_check_cred(struct ucred *cred, int priv, int flags)
+{
+ int error;
+
+ KASSERT(PRIV_VALID(priv), ("priv_check_cred: invalid privilege %d",
+ priv));
+
+ /*
+ * We first evaluate policies that may deny the granting of
+ * privilege unilaterally.
+ */
+#ifdef MAC
+ error = mac_priv_check(cred, priv);
+ if (error)
+ goto out;
+#endif
+
+ /*
+ * Jail policy will restrict certain privileges that may otherwise
+ * be granted.
+ */
+ error = prison_priv_check(cred, priv);
+ if (error)
+ goto out;
+
+ if (unprivileged_mlock) {
+ /*
+ * Allow unprivileged users to call mlock(2)/munlock(2) and
+ * mlockall(2)/munlockall(2).
+ */
+ switch (priv) {
+ case PRIV_VM_MLOCK:
+ case PRIV_VM_MUNLOCK:
+ error = 0;
+ goto out;
+ }
+ }
+
+ /*
+ * Having determined if privilege is restricted by various policies,
+ * now determine if privilege is granted. At this point, any policy
+ * may grant privilege. For now, we allow short-circuit boolean
+ * evaluation, so may not call all policies. Perhaps we should.
+ *
+ * Superuser policy grants privilege based on the effective (or in
+ * the case of specific privileges, real) uid being 0. We allow the
+ * superuser policy to be globally disabled, although this is
+ * currently of limited utility.
+ */
+ if (suser_enabled) {
+ switch (priv) {
+ case PRIV_MAXFILES:
+ case PRIV_MAXPROC:
+ case PRIV_PROC_LIMIT:
+ if (cred->cr_ruid == 0) {
+ error = 0;
+ goto out;
+ }
+ break;
+ default:
+ if (cred->cr_uid == 0) {
+ error = 0;
+ goto out;
+ }
+ break;
+ }
+ }
+
+ /*
+ * Writes to kernel/physical memory are a typical root-only operation,
+ * but non-root users are expected to be able to read it (provided they
+ * have permission to access /dev/[k]mem).
+ */
+ if (priv == PRIV_KMEM_READ) {
+ error = 0;
+ goto out;
+ }
+
+ /*
+ * Now check with MAC, if enabled, to see if a policy module grants
+ * privilege.
+ */
+#ifdef MAC
+ if (mac_priv_grant(cred, priv) == 0) {
+ error = 0;
+ goto out;
+ }
+#endif
+
+ /*
+ * The default is deny, so if no policies have granted it, reject
+ * with a privilege error here.
+ */
+ error = EPERM;
+out:
+ if (error)
+ SDT_PROBE1(priv, kernel, priv_check, priv_err, priv);
+ else
+ SDT_PROBE1(priv, kernel, priv_check, priv_ok, priv);
+ return (error);
+}
+
+int
+priv_check(struct thread *td, int priv)
+{
+
+ KASSERT(td == curthread, ("priv_check: td != curthread"));
+
+ return (priv_check_cred(td->td_ucred, priv, 0));
+}
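[Editorial sketch, not part of this change: a minimal view of the consumer side of this API. The foo_set_debug_knob() wrapper is hypothetical; priv_check() is the function defined above and PRIV_DRIVER is the generic low-level driver privilege from sys/priv.h.]

	/* Hypothetical consumer: ask for one specific privilege, bail on error. */
	static int
	foo_set_debug_knob(struct thread *td, int value)
	{
		int error;

		/* EPERM unless some policy (superuser, MAC, ...) grants it. */
		error = priv_check(td, PRIV_DRIVER);
		if (error != 0)
			return (error);
		/* ...apply 'value' to the privileged knob here... */
		return (0);
	}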
diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
new file mode 100644
index 0000000..3fa7a7f
--- /dev/null
+++ b/sys/kern/kern_proc.c
@@ -0,0 +1,2740 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_ddb.h"
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+#include "opt_kstack_pages.h"
+#include "opt_stack.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/elf.h>
+#include <sys/exec.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/loginclass.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/ptrace.h>
+#include <sys/refcount.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/sysent.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/stack.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/filedesc.h>
+#include <sys/tty.h>
+#include <sys/signalvar.h>
+#include <sys/sdt.h>
+#include <sys/sx.h>
+#include <sys/user.h>
+#include <sys/jail.h>
+#include <sys/vnode.h>
+#include <sys/eventhandler.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_util.h>
+#endif
+
+SDT_PROVIDER_DEFINE(proc);
+SDT_PROBE_DEFINE4(proc, kernel, ctor, entry, entry, "struct proc *", "int",
+ "void *", "int");
+SDT_PROBE_DEFINE4(proc, kernel, ctor, return, return, "struct proc *", "int",
+ "void *", "int");
+SDT_PROBE_DEFINE4(proc, kernel, dtor, entry, entry, "struct proc *", "int",
+ "void *", "struct thread *");
+SDT_PROBE_DEFINE3(proc, kernel, dtor, return, return, "struct proc *", "int",
+ "void *");
+SDT_PROBE_DEFINE3(proc, kernel, init, entry, entry, "struct proc *", "int",
+ "int");
+SDT_PROBE_DEFINE3(proc, kernel, init, return, return, "struct proc *", "int",
+ "int");
+
+MALLOC_DEFINE(M_PGRP, "pgrp", "process group header");
+MALLOC_DEFINE(M_SESSION, "session", "session header");
+static MALLOC_DEFINE(M_PROC, "proc", "Proc structures");
+MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures");
+
+static void doenterpgrp(struct proc *, struct pgrp *);
+static void orphanpg(struct pgrp *pg);
+static void fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp);
+static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp);
+static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp,
+ int preferthread);
+static void pgadjustjobc(struct pgrp *pgrp, int entering);
+static void pgdelete(struct pgrp *);
+static int proc_ctor(void *mem, int size, void *arg, int flags);
+static void proc_dtor(void *mem, int size, void *arg);
+static int proc_init(void *mem, int size, int flags);
+static void proc_fini(void *mem, int size);
+static void pargs_free(struct pargs *pa);
+static struct proc *zpfind_locked(pid_t pid);
+
+/*
+ * Other process lists
+ */
+struct pidhashhead *pidhashtbl;
+u_long pidhash;
+struct pgrphashhead *pgrphashtbl;
+u_long pgrphash;
+struct proclist allproc;
+struct proclist zombproc;
+struct sx allproc_lock;
+struct sx proctree_lock;
+struct mtx ppeers_lock;
+uma_zone_t proc_zone;
+
+int kstack_pages = KSTACK_PAGES;
+SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0,
+ "Kernel stack size in pages");
+
+CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
+#ifdef COMPAT_FREEBSD32
+CTASSERT(sizeof(struct kinfo_proc32) == KINFO_PROC32_SIZE);
+#endif
+
+/*
+ * Initialize global process hashing structures.
+ */
+void
+procinit()
+{
+
+ sx_init(&allproc_lock, "allproc");
+ sx_init(&proctree_lock, "proctree");
+ mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF);
+ LIST_INIT(&allproc);
+ LIST_INIT(&zombproc);
+ pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash);
+ pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash);
+ proc_zone = uma_zcreate("PROC", sched_sizeof_proc(),
+ proc_ctor, proc_dtor, proc_init, proc_fini,
+ UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uihashinit();
+}
+
+/*
+ * Prepare a proc for use.
+ */
+static int
+proc_ctor(void *mem, int size, void *arg, int flags)
+{
+ struct proc *p;
+
+ p = (struct proc *)mem;
+ SDT_PROBE(proc, kernel, ctor, entry, p, size, arg, flags, 0);
+ EVENTHANDLER_INVOKE(process_ctor, p);
+ SDT_PROBE(proc, kernel, ctor, return, p, size, arg, flags, 0);
+ return (0);
+}
+
+/*
+ * Reclaim a proc after use.
+ */
+static void
+proc_dtor(void *mem, int size, void *arg)
+{
+ struct proc *p;
+ struct thread *td;
+
+ /* INVARIANTS checks go here */
+ p = (struct proc *)mem;
+ td = FIRST_THREAD_IN_PROC(p);
+ SDT_PROBE(proc, kernel, dtor, entry, p, size, arg, td, 0);
+ if (td != NULL) {
+#ifdef INVARIANTS
+ KASSERT((p->p_numthreads == 1),
+ ("bad number of threads in exiting process"));
+ KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr"));
+#endif
+ /* Free all OSD associated to this thread. */
+ osd_thread_exit(td);
+ }
+ EVENTHANDLER_INVOKE(process_dtor, p);
+ if (p->p_ksi != NULL)
+ KASSERT(! KSI_ONQ(p->p_ksi), ("SIGCHLD queue"));
+ SDT_PROBE(proc, kernel, dtor, return, p, size, arg, 0, 0);
+}
+
+/*
+ * Initialize type-stable parts of a proc (when newly created).
+ */
+static int
+proc_init(void *mem, int size, int flags)
+{
+ struct proc *p;
+
+ p = (struct proc *)mem;
+ SDT_PROBE(proc, kernel, init, entry, p, size, flags, 0, 0);
+ p->p_sched = (struct p_sched *)&p[1];
+ bzero(&p->p_mtx, sizeof(struct mtx));
+ mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
+ mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE);
+ cv_init(&p->p_pwait, "ppwait");
+ cv_init(&p->p_dbgwait, "dbgwait");
+ TAILQ_INIT(&p->p_threads); /* all threads in proc */
+ EVENTHANDLER_INVOKE(process_init, p);
+ p->p_stats = pstats_alloc();
+ SDT_PROBE(proc, kernel, init, return, p, size, flags, 0, 0);
+ return (0);
+}
+
+/*
+ * UMA should ensure that this function is never called.
+ * Freeing a proc structure would violate type stability.
+ */
+static void
+proc_fini(void *mem, int size)
+{
+#ifdef notnow
+ struct proc *p;
+
+ p = (struct proc *)mem;
+ EVENTHANDLER_INVOKE(process_fini, p);
+ pstats_free(p->p_stats);
+ thread_free(FIRST_THREAD_IN_PROC(p));
+ mtx_destroy(&p->p_mtx);
+ if (p->p_ksi != NULL)
+ ksiginfo_free(p->p_ksi);
+#else
+ panic("proc reclaimed");
+#endif
+}
+
+/*
+ * Is p an inferior of the current process?
+ */
+int
+inferior(p)
+ register struct proc *p;
+{
+
+ sx_assert(&proctree_lock, SX_LOCKED);
+ for (; p != curproc; p = p->p_pptr)
+ if (p->p_pid == 0)
+ return (0);
+ return (1);
+}
+
+struct proc *
+pfind_locked(pid_t pid)
+{
+ struct proc *p;
+
+ sx_assert(&allproc_lock, SX_LOCKED);
+ LIST_FOREACH(p, PIDHASH(pid), p_hash) {
+ if (p->p_pid == pid) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ p = NULL;
+ }
+ break;
+ }
+ }
+ return (p);
+}
+
+/*
+ * Locate a process by number; return only "live" processes -- i.e., neither
+ * zombies nor newly born but incompletely initialized processes. By not
+ * returning processes in the PRS_NEW state, we allow callers to avoid
+ * testing for that condition to avoid dereferencing p_ucred, et al.
+ */
+struct proc *
+pfind(pid_t pid)
+{
+ struct proc *p;
+
+ sx_slock(&allproc_lock);
+ p = pfind_locked(pid);
+ sx_sunlock(&allproc_lock);
+ return (p);
+}
+
+static struct proc *
+pfind_tid_locked(pid_t tid)
+{
+ struct proc *p;
+ struct thread *td;
+
+ sx_assert(&allproc_lock, SX_LOCKED);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td->td_tid == tid)
+ goto found;
+ }
+ PROC_UNLOCK(p);
+ }
+found:
+ return (p);
+}
+
+/*
+ * Locate a process group by number.
+ * The caller must hold proctree_lock.
+ */
+struct pgrp *
+pgfind(pgid)
+ register pid_t pgid;
+{
+ register struct pgrp *pgrp;
+
+ sx_assert(&proctree_lock, SX_LOCKED);
+
+ LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) {
+ if (pgrp->pg_id == pgid) {
+ PGRP_LOCK(pgrp);
+ return (pgrp);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Locate process and do additional manipulations, depending on flags.
+ */
+int
+pget(pid_t pid, int flags, struct proc **pp)
+{
+ struct proc *p;
+ int error;
+
+ sx_slock(&allproc_lock);
+ if (pid <= PID_MAX) {
+ p = pfind_locked(pid);
+ if (p == NULL && (flags & PGET_NOTWEXIT) == 0)
+ p = zpfind_locked(pid);
+ } else if ((flags & PGET_NOTID) == 0) {
+ p = pfind_tid_locked(pid);
+ } else {
+ p = NULL;
+ }
+ sx_sunlock(&allproc_lock);
+ if (p == NULL)
+ return (ESRCH);
+ if ((flags & PGET_CANSEE) != 0) {
+ error = p_cansee(curthread, p);
+ if (error != 0)
+ goto errout;
+ }
+ if ((flags & PGET_CANDEBUG) != 0) {
+ error = p_candebug(curthread, p);
+ if (error != 0)
+ goto errout;
+ }
+ if ((flags & PGET_ISCURRENT) != 0 && curproc != p) {
+ error = EPERM;
+ goto errout;
+ }
+ if ((flags & PGET_NOTWEXIT) != 0 && (p->p_flag & P_WEXIT) != 0) {
+ error = ESRCH;
+ goto errout;
+ }
+ if ((flags & PGET_NOTINEXEC) != 0 && (p->p_flag & P_INEXEC) != 0) {
+ /*
+ * XXXRW: Not clear ESRCH is the right error during proc
+ * execve().
+ */
+ error = ESRCH;
+ goto errout;
+ }
+ if ((flags & PGET_HOLD) != 0) {
+ _PHOLD(p);
+ PROC_UNLOCK(p);
+ }
+ *pp = p;
+ return (0);
+errout:
+ PROC_UNLOCK(p);
+ return (error);
+}
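[Editorial sketch, not part of this change: a hedged usage example for pget(). The foo_inspect() caller is hypothetical; the PGET_* flags and PRELE() are the real interfaces used in the function above. Callers name the checks they want and receive the process locked, or held and unlocked when PGET_HOLD is passed.]

	/* Hypothetical caller illustrating the PGET_* flags. */
	static int
	foo_inspect(pid_t pid)
	{
		struct proc *p;
		int error;

		error = pget(pid, PGET_CANDEBUG | PGET_NOTWEXIT | PGET_HOLD, &p);
		if (error != 0)
			return (error);
		/* With PGET_HOLD the process is held and unlocked here. */
		/* ...inspect *p... */
		PRELE(p);		/* Drop the hold taken by PGET_HOLD. */
		return (0);
	}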
+
+/*
+ * Create a new process group.
+ * pgid must be equal to the pid of p.
+ * Begin a new session if required.
+ */
+int
+enterpgrp(p, pgid, pgrp, sess)
+ register struct proc *p;
+ pid_t pgid;
+ struct pgrp *pgrp;
+ struct session *sess;
+{
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+
+ KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL"));
+ KASSERT(p->p_pid == pgid,
+ ("enterpgrp: new pgrp and pid != pgid"));
+ KASSERT(pgfind(pgid) == NULL,
+ ("enterpgrp: pgrp with pgid exists"));
+ KASSERT(!SESS_LEADER(p),
+ ("enterpgrp: session leader attempted setpgrp"));
+
+ mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
+
+ if (sess != NULL) {
+ /*
+ * new session
+ */
+ mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF);
+ PROC_LOCK(p);
+ p->p_flag &= ~P_CONTROLT;
+ PROC_UNLOCK(p);
+ PGRP_LOCK(pgrp);
+ sess->s_leader = p;
+ sess->s_sid = p->p_pid;
+ refcount_init(&sess->s_count, 1);
+ sess->s_ttyvp = NULL;
+ sess->s_ttydp = NULL;
+ sess->s_ttyp = NULL;
+ bcopy(p->p_session->s_login, sess->s_login,
+ sizeof(sess->s_login));
+ pgrp->pg_session = sess;
+ KASSERT(p == curproc,
+ ("enterpgrp: mksession and p != curproc"));
+ } else {
+ pgrp->pg_session = p->p_session;
+ sess_hold(pgrp->pg_session);
+ PGRP_LOCK(pgrp);
+ }
+ pgrp->pg_id = pgid;
+ LIST_INIT(&pgrp->pg_members);
+
+ /*
+ * As we have an exclusive lock of proctree_lock,
+ * this should not deadlock.
+ */
+ LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash);
+ pgrp->pg_jobc = 0;
+ SLIST_INIT(&pgrp->pg_sigiolst);
+ PGRP_UNLOCK(pgrp);
+
+ doenterpgrp(p, pgrp);
+
+ return (0);
+}
+
+/*
+ * Move p to an existing process group
+ */
+int
+enterthispgrp(p, pgrp)
+ register struct proc *p;
+ struct pgrp *pgrp;
+{
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+ PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
+ PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
+ SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
+ KASSERT(pgrp->pg_session == p->p_session,
+ ("%s: pgrp's session %p, p->p_session %p.\n",
+ __func__,
+ pgrp->pg_session,
+ p->p_session));
+ KASSERT(pgrp != p->p_pgrp,
+ ("%s: p belongs to pgrp.", __func__));
+
+ doenterpgrp(p, pgrp);
+
+ return (0);
+}
+
+/*
+ * Move p to a process group
+ */
+static void
+doenterpgrp(p, pgrp)
+ struct proc *p;
+ struct pgrp *pgrp;
+{
+ struct pgrp *savepgrp;
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+ PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
+ PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
+ SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
+
+ savepgrp = p->p_pgrp;
+
+ /*
+ * Adjust eligibility of affected pgrps to participate in job control.
+ * Increment eligibility counts before decrementing, otherwise we
+ * could reach 0 spuriously during the first call.
+ */
+ fixjobc(p, pgrp, 1);
+ fixjobc(p, p->p_pgrp, 0);
+
+ PGRP_LOCK(pgrp);
+ PGRP_LOCK(savepgrp);
+ PROC_LOCK(p);
+ LIST_REMOVE(p, p_pglist);
+ p->p_pgrp = pgrp;
+ PROC_UNLOCK(p);
+ LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
+ PGRP_UNLOCK(savepgrp);
+ PGRP_UNLOCK(pgrp);
+ if (LIST_EMPTY(&savepgrp->pg_members))
+ pgdelete(savepgrp);
+}
+
+/*
+ * remove process from process group
+ */
+int
+leavepgrp(p)
+ register struct proc *p;
+{
+ struct pgrp *savepgrp;
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ savepgrp = p->p_pgrp;
+ PGRP_LOCK(savepgrp);
+ PROC_LOCK(p);
+ LIST_REMOVE(p, p_pglist);
+ p->p_pgrp = NULL;
+ PROC_UNLOCK(p);
+ PGRP_UNLOCK(savepgrp);
+ if (LIST_EMPTY(&savepgrp->pg_members))
+ pgdelete(savepgrp);
+ return (0);
+}
+
+/*
+ * delete a process group
+ */
+static void
+pgdelete(pgrp)
+ register struct pgrp *pgrp;
+{
+ struct session *savesess;
+ struct tty *tp;
+
+ sx_assert(&proctree_lock, SX_XLOCKED);
+ PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
+ SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
+
+ /*
+ * Reset any sigio structures pointing to us as a result of
+ * F_SETOWN with our pgid.
+ */
+ funsetownlst(&pgrp->pg_sigiolst);
+
+ PGRP_LOCK(pgrp);
+ tp = pgrp->pg_session->s_ttyp;
+ LIST_REMOVE(pgrp, pg_hash);
+ savesess = pgrp->pg_session;
+ PGRP_UNLOCK(pgrp);
+
+ /* Remove the reference to the pgrp before deallocating it. */
+ if (tp != NULL) {
+ tty_lock(tp);
+ tty_rel_pgrp(tp, pgrp);
+ }
+
+ mtx_destroy(&pgrp->pg_mtx);
+ free(pgrp, M_PGRP);
+ sess_release(savesess);
+}
+
+static void
+pgadjustjobc(pgrp, entering)
+ struct pgrp *pgrp;
+ int entering;
+{
+
+ PGRP_LOCK(pgrp);
+ if (entering)
+ pgrp->pg_jobc++;
+ else {
+ --pgrp->pg_jobc;
+ if (pgrp->pg_jobc == 0)
+ orphanpg(pgrp);
+ }
+ PGRP_UNLOCK(pgrp);
+}
+
+/*
+ * Adjust pgrp jobc counters when specified process changes process group.
+ * We count the number of processes in each process group that "qualify"
+ * the group for terminal job control (those with a parent in a different
+ * process group of the same session). If that count reaches zero, the
+ * process group becomes orphaned. Check both the specified process'
+ * process group and that of its children.
+ * entering == 0 => p is leaving specified group.
+ * entering == 1 => p is entering specified group.
+ */
+void
+fixjobc(p, pgrp, entering)
+ register struct proc *p;
+ register struct pgrp *pgrp;
+ int entering;
+{
+ register struct pgrp *hispgrp;
+ register struct session *mysession;
+
+ sx_assert(&proctree_lock, SX_LOCKED);
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+ PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
+ SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
+
+ /*
+ * Check p's parent to see whether p qualifies its own process
+ * group; if so, adjust count for p's process group.
+ */
+ mysession = pgrp->pg_session;
+ if ((hispgrp = p->p_pptr->p_pgrp) != pgrp &&
+ hispgrp->pg_session == mysession)
+ pgadjustjobc(pgrp, entering);
+
+ /*
+ * Check this process' children to see whether they qualify
+ * their process groups; if so, adjust counts for children's
+ * process groups.
+ */
+ LIST_FOREACH(p, &p->p_children, p_sibling) {
+ hispgrp = p->p_pgrp;
+ if (hispgrp == pgrp ||
+ hispgrp->pg_session != mysession)
+ continue;
+ PROC_LOCK(p);
+ if (p->p_state == PRS_ZOMBIE) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ PROC_UNLOCK(p);
+ pgadjustjobc(hispgrp, entering);
+ }
+}
+
+/*
+ * A process group has become orphaned;
+ * if there are any stopped processes in the group,
+ * hang up all processes in that group.
+ */
+static void
+orphanpg(pg)
+ struct pgrp *pg;
+{
+ register struct proc *p;
+
+ PGRP_LOCK_ASSERT(pg, MA_OWNED);
+
+ LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (P_SHOULDSTOP(p)) {
+ PROC_UNLOCK(p);
+ LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ kern_psignal(p, SIGHUP);
+ kern_psignal(p, SIGCONT);
+ PROC_UNLOCK(p);
+ }
+ return;
+ }
+ PROC_UNLOCK(p);
+ }
+}
+
+void
+sess_hold(struct session *s)
+{
+
+ refcount_acquire(&s->s_count);
+}
+
+void
+sess_release(struct session *s)
+{
+
+ if (refcount_release(&s->s_count)) {
+ if (s->s_ttyp != NULL) {
+ tty_lock(s->s_ttyp);
+ tty_rel_sess(s->s_ttyp, s);
+ }
+ mtx_destroy(&s->s_mtx);
+ free(s, M_SESSION);
+ }
+}
+
+#ifdef DDB
+
+DB_SHOW_COMMAND(pgrpdump, pgrpdump)
+{
+ register struct pgrp *pgrp;
+ register struct proc *p;
+ register int i;
+
+ for (i = 0; i <= pgrphash; i++) {
+ if (!LIST_EMPTY(&pgrphashtbl[i])) {
+ printf("\tindx %d\n", i);
+ LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) {
+ printf(
+ "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n",
+ (void *)pgrp, (long)pgrp->pg_id,
+ (void *)pgrp->pg_session,
+ pgrp->pg_session->s_count,
+ (void *)LIST_FIRST(&pgrp->pg_members));
+ LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
+ printf("\t\tpid %ld addr %p pgrp %p\n",
+ (long)p->p_pid, (void *)p,
+ (void *)p->p_pgrp);
+ }
+ }
+ }
+ }
+}
+#endif /* DDB */
+
+/*
+ * Calculate the kinfo_proc members which contain process-wide
+ * information.
+ * Must be called with the target process locked.
+ */
+static void
+fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp)
+{
+ struct thread *td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ kp->ki_estcpu = 0;
+ kp->ki_pctcpu = 0;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ kp->ki_pctcpu += sched_pctcpu(td);
+ kp->ki_estcpu += td->td_estcpu;
+ thread_unlock(td);
+ }
+}
+
+/*
+ * Clear kinfo_proc and fill in any information that is common
+ * to all threads in the process.
+ * Must be called with the target process locked.
+ */
+static void
+fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp)
+{
+ struct thread *td0;
+ struct tty *tp;
+ struct session *sp;
+ struct ucred *cred;
+ struct sigacts *ps;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ bzero(kp, sizeof(*kp));
+
+ kp->ki_structsize = sizeof(*kp);
+ kp->ki_paddr = p;
+ kp->ki_addr =/* p->p_addr; */0; /* XXX */
+ kp->ki_args = p->p_args;
+ kp->ki_textvp = p->p_textvp;
+#ifdef KTRACE
+ kp->ki_tracep = p->p_tracevp;
+ kp->ki_traceflag = p->p_traceflag;
+#endif
+ kp->ki_fd = p->p_fd;
+ kp->ki_vmspace = p->p_vmspace;
+ kp->ki_flag = p->p_flag;
+ cred = p->p_ucred;
+ if (cred) {
+ kp->ki_uid = cred->cr_uid;
+ kp->ki_ruid = cred->cr_ruid;
+ kp->ki_svuid = cred->cr_svuid;
+ kp->ki_cr_flags = 0;
+ if (cred->cr_flags & CRED_FLAG_CAPMODE)
+ kp->ki_cr_flags |= KI_CRF_CAPABILITY_MODE;
+ /* XXX bde doesn't like KI_NGROUPS */
+ if (cred->cr_ngroups > KI_NGROUPS) {
+ kp->ki_ngroups = KI_NGROUPS;
+ kp->ki_cr_flags |= KI_CRF_GRP_OVERFLOW;
+ } else
+ kp->ki_ngroups = cred->cr_ngroups;
+ bcopy(cred->cr_groups, kp->ki_groups,
+ kp->ki_ngroups * sizeof(gid_t));
+ kp->ki_rgid = cred->cr_rgid;
+ kp->ki_svgid = cred->cr_svgid;
+ /* If jailed(cred), emulate the old P_JAILED flag. */
+ if (jailed(cred)) {
+ kp->ki_flag |= P_JAILED;
+ /* If inside the jail, use 0 as a jail ID. */
+ if (cred->cr_prison != curthread->td_ucred->cr_prison)
+ kp->ki_jid = cred->cr_prison->pr_id;
+ }
+ strlcpy(kp->ki_loginclass, cred->cr_loginclass->lc_name,
+ sizeof(kp->ki_loginclass));
+ }
+ ps = p->p_sigacts;
+ if (ps) {
+ mtx_lock(&ps->ps_mtx);
+ kp->ki_sigignore = ps->ps_sigignore;
+ kp->ki_sigcatch = ps->ps_sigcatch;
+ mtx_unlock(&ps->ps_mtx);
+ }
+ if (p->p_state != PRS_NEW &&
+ p->p_state != PRS_ZOMBIE &&
+ p->p_vmspace != NULL) {
+ struct vmspace *vm = p->p_vmspace;
+
+ kp->ki_size = vm->vm_map.size;
+ kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/
+ FOREACH_THREAD_IN_PROC(p, td0) {
+ if (!TD_IS_SWAPPED(td0))
+ kp->ki_rssize += td0->td_kstack_pages;
+ }
+ kp->ki_swrss = vm->vm_swrss;
+ kp->ki_tsize = vm->vm_tsize;
+ kp->ki_dsize = vm->vm_dsize;
+ kp->ki_ssize = vm->vm_ssize;
+ } else if (p->p_state == PRS_ZOMBIE)
+ kp->ki_stat = SZOMB;
+ if (kp->ki_flag & P_INMEM)
+ kp->ki_sflag = PS_INMEM;
+ else
+ kp->ki_sflag = 0;
+ /* Calculate legacy swtime as seconds since 'swtick'. */
+ kp->ki_swtime = (ticks - p->p_swtick) / hz;
+ kp->ki_pid = p->p_pid;
+ kp->ki_nice = p->p_nice;
+ kp->ki_fibnum = p->p_fibnum;
+ kp->ki_start = p->p_stats->p_start;
+ timevaladd(&kp->ki_start, &boottime);
+ PROC_SLOCK(p);
+ rufetch(p, &kp->ki_rusage);
+ kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime);
+ calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime);
+ PROC_SUNLOCK(p);
+ calccru(p, &kp->ki_childutime, &kp->ki_childstime);
+ /* Some callers want child times in a single value. */
+ kp->ki_childtime = kp->ki_childstime;
+ timevaladd(&kp->ki_childtime, &kp->ki_childutime);
+
+ FOREACH_THREAD_IN_PROC(p, td0)
+ kp->ki_cow += td0->td_cow;
+
+ tp = NULL;
+ if (p->p_pgrp) {
+ kp->ki_pgid = p->p_pgrp->pg_id;
+ kp->ki_jobc = p->p_pgrp->pg_jobc;
+ sp = p->p_pgrp->pg_session;
+
+ if (sp != NULL) {
+ kp->ki_sid = sp->s_sid;
+ SESS_LOCK(sp);
+ strlcpy(kp->ki_login, sp->s_login,
+ sizeof(kp->ki_login));
+ if (sp->s_ttyvp)
+ kp->ki_kiflag |= KI_CTTY;
+ if (SESS_LEADER(p))
+ kp->ki_kiflag |= KI_SLEADER;
+ /* XXX proctree_lock */
+ tp = sp->s_ttyp;
+ SESS_UNLOCK(sp);
+ }
+ }
+ if ((p->p_flag & P_CONTROLT) && tp != NULL) {
+ kp->ki_tdev = tty_udev(tp);
+ kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID;
+ if (tp->t_session)
+ kp->ki_tsid = tp->t_session->s_sid;
+ } else
+ kp->ki_tdev = NODEV;
+ if (p->p_comm[0] != '\0')
+ strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm));
+ if (p->p_sysent && p->p_sysent->sv_name != NULL &&
+ p->p_sysent->sv_name[0] != '\0')
+ strlcpy(kp->ki_emul, p->p_sysent->sv_name, sizeof(kp->ki_emul));
+ kp->ki_siglist = p->p_siglist;
+ kp->ki_xstat = p->p_xstat;
+ kp->ki_acflag = p->p_acflag;
+ kp->ki_lock = p->p_lock;
+ if (p->p_pptr)
+ kp->ki_ppid = p->p_pptr->p_pid;
+}
+
+/*
+ * Fill in information that is thread specific. Must be called with
+ * target process locked. If 'preferthread' is set, overwrite certain
+ * process-related fields that are maintained for both threads and
+ * processes.
+ */
+static void
+fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp, int preferthread)
+{
+ struct proc *p;
+
+ p = td->td_proc;
+ kp->ki_tdaddr = td;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ if (preferthread)
+ PROC_SLOCK(p);
+ thread_lock(td);
+ if (td->td_wmesg != NULL)
+ strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg));
+ else
+ bzero(kp->ki_wmesg, sizeof(kp->ki_wmesg));
+ strlcpy(kp->ki_tdname, td->td_name, sizeof(kp->ki_tdname));
+ if (TD_ON_LOCK(td)) {
+ kp->ki_kiflag |= KI_LOCKBLOCK;
+ strlcpy(kp->ki_lockname, td->td_lockname,
+ sizeof(kp->ki_lockname));
+ } else {
+ kp->ki_kiflag &= ~KI_LOCKBLOCK;
+ bzero(kp->ki_lockname, sizeof(kp->ki_lockname));
+ }
+
+ if (p->p_state == PRS_NORMAL) { /* approximate. */
+ if (TD_ON_RUNQ(td) ||
+ TD_CAN_RUN(td) ||
+ TD_IS_RUNNING(td)) {
+ kp->ki_stat = SRUN;
+ } else if (P_SHOULDSTOP(p)) {
+ kp->ki_stat = SSTOP;
+ } else if (TD_IS_SLEEPING(td)) {
+ kp->ki_stat = SSLEEP;
+ } else if (TD_ON_LOCK(td)) {
+ kp->ki_stat = SLOCK;
+ } else {
+ kp->ki_stat = SWAIT;
+ }
+ } else if (p->p_state == PRS_ZOMBIE) {
+ kp->ki_stat = SZOMB;
+ } else {
+ kp->ki_stat = SIDL;
+ }
+
+ /* Things in the thread */
+ kp->ki_wchan = td->td_wchan;
+ kp->ki_pri.pri_level = td->td_priority;
+ kp->ki_pri.pri_native = td->td_base_pri;
+ kp->ki_lastcpu = td->td_lastcpu;
+ kp->ki_oncpu = td->td_oncpu;
+ kp->ki_tdflags = td->td_flags;
+ kp->ki_tid = td->td_tid;
+ kp->ki_numthreads = p->p_numthreads;
+ kp->ki_pcb = td->td_pcb;
+ kp->ki_kstack = (void *)td->td_kstack;
+ kp->ki_slptime = (ticks - td->td_slptick) / hz;
+ kp->ki_pri.pri_class = td->td_pri_class;
+ kp->ki_pri.pri_user = td->td_user_pri;
+
+ if (preferthread) {
+ rufetchtd(td, &kp->ki_rusage);
+ kp->ki_runtime = cputick2usec(td->td_rux.rux_runtime);
+ kp->ki_pctcpu = sched_pctcpu(td);
+ kp->ki_estcpu = td->td_estcpu;
+ kp->ki_cow = td->td_cow;
+ }
+
+ /* We can't get this anymore, but ps etc. never used it anyway. */
+ kp->ki_rqindex = 0;
+
+ if (preferthread)
+ kp->ki_siglist = td->td_siglist;
+ kp->ki_sigmask = td->td_sigmask;
+ thread_unlock(td);
+ if (preferthread)
+ PROC_SUNLOCK(p);
+}
+
+/*
+ * Fill in a kinfo_proc structure for the specified process.
+ * Must be called with the target process locked.
+ */
+void
+fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp)
+{
+
+ MPASS(FIRST_THREAD_IN_PROC(p) != NULL);
+
+ fill_kinfo_proc_only(p, kp);
+ fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp, 0);
+ fill_kinfo_aggregate(p, kp);
+}
+
+struct pstats *
+pstats_alloc(void)
+{
+
+ return (malloc(sizeof(struct pstats), M_SUBPROC, M_ZERO|M_WAITOK));
+}
+
+/*
+ * Copy parts of p_stats; zero the rest of p_stats (statistics).
+ */
+void
+pstats_fork(struct pstats *src, struct pstats *dst)
+{
+
+ bzero(&dst->pstat_startzero,
+ __rangeof(struct pstats, pstat_startzero, pstat_endzero));
+ bcopy(&src->pstat_startcopy, &dst->pstat_startcopy,
+ __rangeof(struct pstats, pstat_startcopy, pstat_endcopy));
+}
+
+void
+pstats_free(struct pstats *ps)
+{
+
+ free(ps, M_SUBPROC);
+}
+
+static struct proc *
+zpfind_locked(pid_t pid)
+{
+ struct proc *p;
+
+ sx_assert(&allproc_lock, SX_LOCKED);
+ LIST_FOREACH(p, &zombproc, p_list) {
+ if (p->p_pid == pid) {
+ PROC_LOCK(p);
+ break;
+ }
+ }
+ return (p);
+}
+
+/*
+ * Locate a zombie process by number
+ */
+struct proc *
+zpfind(pid_t pid)
+{
+ struct proc *p;
+
+ sx_slock(&allproc_lock);
+ p = zpfind_locked(pid);
+ sx_sunlock(&allproc_lock);
+ return (p);
+}
+
+#ifdef COMPAT_FREEBSD32
+
+/*
+ * This function is typically used to copy out the kernel address, so
+ * it can be replaced by assignment of zero.
+ */
+static inline uint32_t
+ptr32_trim(void *ptr)
+{
+ uintptr_t uptr;
+
+ uptr = (uintptr_t)ptr;
+ return ((uptr > UINT_MAX) ? 0 : uptr);
+}
+
+#define PTRTRIM_CP(src,dst,fld) \
+ do { (dst).fld = ptr32_trim((src).fld); } while (0)
+
+static void
+freebsd32_kinfo_proc_out(const struct kinfo_proc *ki, struct kinfo_proc32 *ki32)
+{
+ int i;
+
+ bzero(ki32, sizeof(struct kinfo_proc32));
+ ki32->ki_structsize = sizeof(struct kinfo_proc32);
+ CP(*ki, *ki32, ki_layout);
+ PTRTRIM_CP(*ki, *ki32, ki_args);
+ PTRTRIM_CP(*ki, *ki32, ki_paddr);
+ PTRTRIM_CP(*ki, *ki32, ki_addr);
+ PTRTRIM_CP(*ki, *ki32, ki_tracep);
+ PTRTRIM_CP(*ki, *ki32, ki_textvp);
+ PTRTRIM_CP(*ki, *ki32, ki_fd);
+ PTRTRIM_CP(*ki, *ki32, ki_vmspace);
+ PTRTRIM_CP(*ki, *ki32, ki_wchan);
+ CP(*ki, *ki32, ki_pid);
+ CP(*ki, *ki32, ki_ppid);
+ CP(*ki, *ki32, ki_pgid);
+ CP(*ki, *ki32, ki_tpgid);
+ CP(*ki, *ki32, ki_sid);
+ CP(*ki, *ki32, ki_tsid);
+ CP(*ki, *ki32, ki_jobc);
+ CP(*ki, *ki32, ki_tdev);
+ CP(*ki, *ki32, ki_siglist);
+ CP(*ki, *ki32, ki_sigmask);
+ CP(*ki, *ki32, ki_sigignore);
+ CP(*ki, *ki32, ki_sigcatch);
+ CP(*ki, *ki32, ki_uid);
+ CP(*ki, *ki32, ki_ruid);
+ CP(*ki, *ki32, ki_svuid);
+ CP(*ki, *ki32, ki_rgid);
+ CP(*ki, *ki32, ki_svgid);
+ CP(*ki, *ki32, ki_ngroups);
+ for (i = 0; i < KI_NGROUPS; i++)
+ CP(*ki, *ki32, ki_groups[i]);
+ CP(*ki, *ki32, ki_size);
+ CP(*ki, *ki32, ki_rssize);
+ CP(*ki, *ki32, ki_swrss);
+ CP(*ki, *ki32, ki_tsize);
+ CP(*ki, *ki32, ki_dsize);
+ CP(*ki, *ki32, ki_ssize);
+ CP(*ki, *ki32, ki_xstat);
+ CP(*ki, *ki32, ki_acflag);
+ CP(*ki, *ki32, ki_pctcpu);
+ CP(*ki, *ki32, ki_estcpu);
+ CP(*ki, *ki32, ki_slptime);
+ CP(*ki, *ki32, ki_swtime);
+ CP(*ki, *ki32, ki_cow);
+ CP(*ki, *ki32, ki_runtime);
+ TV_CP(*ki, *ki32, ki_start);
+ TV_CP(*ki, *ki32, ki_childtime);
+ CP(*ki, *ki32, ki_flag);
+ CP(*ki, *ki32, ki_kiflag);
+ CP(*ki, *ki32, ki_traceflag);
+ CP(*ki, *ki32, ki_stat);
+ CP(*ki, *ki32, ki_nice);
+ CP(*ki, *ki32, ki_lock);
+ CP(*ki, *ki32, ki_rqindex);
+ CP(*ki, *ki32, ki_oncpu);
+ CP(*ki, *ki32, ki_lastcpu);
+ bcopy(ki->ki_tdname, ki32->ki_tdname, TDNAMLEN + 1);
+ bcopy(ki->ki_wmesg, ki32->ki_wmesg, WMESGLEN + 1);
+ bcopy(ki->ki_login, ki32->ki_login, LOGNAMELEN + 1);
+ bcopy(ki->ki_lockname, ki32->ki_lockname, LOCKNAMELEN + 1);
+ bcopy(ki->ki_comm, ki32->ki_comm, COMMLEN + 1);
+ bcopy(ki->ki_emul, ki32->ki_emul, KI_EMULNAMELEN + 1);
+ bcopy(ki->ki_loginclass, ki32->ki_loginclass, LOGINCLASSLEN + 1);
+ CP(*ki, *ki32, ki_fibnum);
+ CP(*ki, *ki32, ki_cr_flags);
+ CP(*ki, *ki32, ki_jid);
+ CP(*ki, *ki32, ki_numthreads);
+ CP(*ki, *ki32, ki_tid);
+ CP(*ki, *ki32, ki_pri);
+ freebsd32_rusage_out(&ki->ki_rusage, &ki32->ki_rusage);
+ freebsd32_rusage_out(&ki->ki_rusage_ch, &ki32->ki_rusage_ch);
+ PTRTRIM_CP(*ki, *ki32, ki_pcb);
+ PTRTRIM_CP(*ki, *ki32, ki_kstack);
+ PTRTRIM_CP(*ki, *ki32, ki_udata);
+ CP(*ki, *ki32, ki_sflag);
+ CP(*ki, *ki32, ki_tdflags);
+}
+#endif
+
+int
+kern_proc_out(struct proc *p, struct sbuf *sb, int flags)
+{
+ struct thread *td;
+ struct kinfo_proc ki;
+#ifdef COMPAT_FREEBSD32
+ struct kinfo_proc32 ki32;
+#endif
+ int error;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ MPASS(FIRST_THREAD_IN_PROC(p) != NULL);
+
+ error = 0;
+ fill_kinfo_proc(p, &ki);
+ if ((flags & KERN_PROC_NOTHREADS) != 0) {
+#ifdef COMPAT_FREEBSD32
+ if ((flags & KERN_PROC_MASK32) != 0) {
+ freebsd32_kinfo_proc_out(&ki, &ki32);
+ error = sbuf_bcat(sb, &ki32, sizeof(ki32));
+ } else
+#endif
+ error = sbuf_bcat(sb, &ki, sizeof(ki));
+ } else {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ fill_kinfo_thread(td, &ki, 1);
+#ifdef COMPAT_FREEBSD32
+ if ((flags & KERN_PROC_MASK32) != 0) {
+ freebsd32_kinfo_proc_out(&ki, &ki32);
+ error = sbuf_bcat(sb, &ki32, sizeof(ki32));
+ } else
+#endif
+ error = sbuf_bcat(sb, &ki, sizeof(ki));
+ if (error)
+ break;
+ }
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+static int
+sysctl_out_proc(struct proc *p, struct sysctl_req *req, int flags,
+ int doingzomb)
+{
+ struct sbuf sb;
+ struct kinfo_proc ki;
+ struct proc *np;
+ int error, error2;
+ pid_t pid;
+
+ pid = p->p_pid;
+ sbuf_new_for_sysctl(&sb, (char *)&ki, sizeof(ki), req);
+ error = kern_proc_out(p, &sb, flags);
+ error2 = sbuf_finish(&sb);
+ sbuf_delete(&sb);
+ if (error != 0)
+ return (error);
+ else if (error2 != 0)
+ return (error2);
+ if (doingzomb)
+ np = zpfind(pid);
+ else {
+ if (pid == 0)
+ return (0);
+ np = pfind(pid);
+ }
+ if (np == NULL)
+ return (ESRCH);
+ if (np != p) {
+ PROC_UNLOCK(np);
+ return (ESRCH);
+ }
+ PROC_UNLOCK(np);
+ return (0);
+}
+
+static int
+sysctl_kern_proc(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ struct proc *p;
+ int flags, doingzomb, oid_number;
+ int error = 0;
+
+ oid_number = oidp->oid_number;
+ if (oid_number != KERN_PROC_ALL &&
+ (oid_number & KERN_PROC_INC_THREAD) == 0)
+ flags = KERN_PROC_NOTHREADS;
+ else {
+ flags = 0;
+ oid_number &= ~KERN_PROC_INC_THREAD;
+ }
+#ifdef COMPAT_FREEBSD32
+ if (req->flags & SCTL_MASK32)
+ flags |= KERN_PROC_MASK32;
+#endif
+ if (oid_number == KERN_PROC_PID) {
+ if (namelen != 1)
+ return (EINVAL);
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error)
+ return (error);
+ error = pget((pid_t)name[0], PGET_CANSEE, &p);
+ if (error != 0)
+ return (error);
+ error = sysctl_out_proc(p, req, flags, 0);
+ return (error);
+ }
+
+ switch (oid_number) {
+ case KERN_PROC_ALL:
+ if (namelen != 0)
+ return (EINVAL);
+ break;
+ case KERN_PROC_PROC:
+ if (namelen != 0 && namelen != 1)
+ return (EINVAL);
+ break;
+ default:
+ if (namelen != 1)
+ return (EINVAL);
+ break;
+ }
+
+ if (!req->oldptr) {
+ /* overestimate by 5 procs */
+ error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5);
+ if (error)
+ return (error);
+ }
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sx_slock(&allproc_lock);
+ for (doingzomb = 0; doingzomb < 2; doingzomb++) {
+ if (!doingzomb)
+ p = LIST_FIRST(&allproc);
+ else
+ p = LIST_FIRST(&zombproc);
+ for (; p != 0; p = LIST_NEXT(p, p_list)) {
+ /*
+ * Skip embryonic processes.
+ */
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ KASSERT(p->p_ucred != NULL,
+ ("process credential is NULL for non-NEW proc"));
+ /*
+ * Show a user only appropriate processes.
+ */
+ if (p_cansee(curthread, p)) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /*
+ * TODO - make more efficient (see notes below).
+ * do by session.
+ */
+ switch (oid_number) {
+
+ case KERN_PROC_GID:
+ if (p->p_ucred->cr_gid != (gid_t)name[0]) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ break;
+
+ case KERN_PROC_PGRP:
+ /* could do this by traversing pgrp */
+ if (p->p_pgrp == NULL ||
+ p->p_pgrp->pg_id != (pid_t)name[0]) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ break;
+
+ case KERN_PROC_RGID:
+ if (p->p_ucred->cr_rgid != (gid_t)name[0]) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ break;
+
+ case KERN_PROC_SESSION:
+ if (p->p_session == NULL ||
+ p->p_session->s_sid != (pid_t)name[0]) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ break;
+
+ case KERN_PROC_TTY:
+ if ((p->p_flag & P_CONTROLT) == 0 ||
+ p->p_session == NULL) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ /* XXX proctree_lock */
+ SESS_LOCK(p->p_session);
+ if (p->p_session->s_ttyp == NULL ||
+ tty_udev(p->p_session->s_ttyp) !=
+ (dev_t)name[0]) {
+ SESS_UNLOCK(p->p_session);
+ PROC_UNLOCK(p);
+ continue;
+ }
+ SESS_UNLOCK(p->p_session);
+ break;
+
+ case KERN_PROC_UID:
+ if (p->p_ucred->cr_uid != (uid_t)name[0]) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ break;
+
+ case KERN_PROC_RUID:
+ if (p->p_ucred->cr_ruid != (uid_t)name[0]) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ break;
+
+ case KERN_PROC_PROC:
+ break;
+
+ default:
+ break;
+
+ }
+
+ error = sysctl_out_proc(p, req, flags, doingzomb);
+ if (error) {
+ sx_sunlock(&allproc_lock);
+ return (error);
+ }
+ }
+ }
+ sx_sunlock(&allproc_lock);
+ return (0);
+}
+
+struct pargs *
+pargs_alloc(int len)
+{
+ struct pargs *pa;
+
+ pa = malloc(sizeof(struct pargs) + len, M_PARGS,
+ M_WAITOK);
+ refcount_init(&pa->ar_ref, 1);
+ pa->ar_length = len;
+ return (pa);
+}
+
+static void
+pargs_free(struct pargs *pa)
+{
+
+ free(pa, M_PARGS);
+}
+
+void
+pargs_hold(struct pargs *pa)
+{
+
+ if (pa == NULL)
+ return;
+ refcount_acquire(&pa->ar_ref);
+}
+
+void
+pargs_drop(struct pargs *pa)
+{
+
+ if (pa == NULL)
+ return;
+ if (refcount_release(&pa->ar_ref))
+ pargs_free(pa);
+}
+
+static int
+proc_read_mem(struct thread *td, struct proc *p, vm_offset_t offset, void* buf,
+ size_t len)
+{
+ struct iovec iov;
+ struct uio uio;
+
+ iov.iov_base = (caddr_t)buf;
+ iov.iov_len = len;
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = offset;
+ uio.uio_resid = (ssize_t)len;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_rw = UIO_READ;
+ uio.uio_td = td;
+
+ return (proc_rwmem(p, &uio));
+}
+
+static int
+proc_read_string(struct thread *td, struct proc *p, const char *sptr, char *buf,
+ size_t len)
+{
+ size_t i;
+ int error;
+
+ error = proc_read_mem(td, p, (vm_offset_t)sptr, buf, len);
+ /*
+ * Reading the chunk may validly return EFAULT if the string is shorter
+ * than the chunk and is aligned at the end of the page, assuming the
+ * next page is not mapped. So if EFAULT is returned, fall back to a
+ * one-byte read loop.
+ */
+ if (error == EFAULT) {
+ for (i = 0; i < len; i++, buf++, sptr++) {
+ error = proc_read_mem(td, p, (vm_offset_t)sptr, buf, 1);
+ if (error != 0)
+ return (error);
+ if (*buf == '\0')
+ break;
+ }
+ error = 0;
+ }
+ return (error);
+}
+
+#define PROC_AUXV_MAX 256 /* Safety limit on auxv size. */
+
+enum proc_vector_type {
+ PROC_ARG,
+ PROC_ENV,
+ PROC_AUX,
+};
+
+#ifdef COMPAT_FREEBSD32
+static int
+get_proc_vector32(struct thread *td, struct proc *p, char ***proc_vectorp,
+ size_t *vsizep, enum proc_vector_type type)
+{
+ struct freebsd32_ps_strings pss;
+ Elf32_Auxinfo aux;
+ vm_offset_t vptr, ptr;
+ uint32_t *proc_vector32;
+ char **proc_vector;
+ size_t vsize, size;
+ int i, error;
+
+ error = proc_read_mem(td, p, (vm_offset_t)(p->p_sysent->sv_psstrings),
+ &pss, sizeof(pss));
+ if (error != 0)
+ return (error);
+ switch (type) {
+ case PROC_ARG:
+ vptr = (vm_offset_t)PTRIN(pss.ps_argvstr);
+ vsize = pss.ps_nargvstr;
+ if (vsize > ARG_MAX)
+ return (ENOEXEC);
+ size = vsize * sizeof(int32_t);
+ break;
+ case PROC_ENV:
+ vptr = (vm_offset_t)PTRIN(pss.ps_envstr);
+ vsize = pss.ps_nenvstr;
+ if (vsize > ARG_MAX)
+ return (ENOEXEC);
+ size = vsize * sizeof(int32_t);
+ break;
+ case PROC_AUX:
+ vptr = (vm_offset_t)PTRIN(pss.ps_envstr) +
+ (pss.ps_nenvstr + 1) * sizeof(int32_t);
+ if (vptr % 4 != 0)
+ return (ENOEXEC);
+ for (ptr = vptr, i = 0; i < PROC_AUXV_MAX; i++) {
+ error = proc_read_mem(td, p, ptr, &aux, sizeof(aux));
+ if (error != 0)
+ return (error);
+ if (aux.a_type == AT_NULL)
+ break;
+ ptr += sizeof(aux);
+ }
+ if (aux.a_type != AT_NULL)
+ return (ENOEXEC);
+ vsize = i + 1;
+ size = vsize * sizeof(aux);
+ break;
+ default:
+ KASSERT(0, ("Wrong proc vector type: %d", type));
+ return (EINVAL);
+ }
+ proc_vector32 = malloc(size, M_TEMP, M_WAITOK);
+ error = proc_read_mem(td, p, vptr, proc_vector32, size);
+ if (error != 0)
+ goto done;
+ if (type == PROC_AUX) {
+ *proc_vectorp = (char **)proc_vector32;
+ *vsizep = vsize;
+ return (0);
+ }
+ proc_vector = malloc(vsize * sizeof(char *), M_TEMP, M_WAITOK);
+ for (i = 0; i < (int)vsize; i++)
+ proc_vector[i] = PTRIN(proc_vector32[i]);
+ *proc_vectorp = proc_vector;
+ *vsizep = vsize;
+done:
+ free(proc_vector32, M_TEMP);
+ return (error);
+}
+#endif
+
+static int
+get_proc_vector(struct thread *td, struct proc *p, char ***proc_vectorp,
+ size_t *vsizep, enum proc_vector_type type)
+{
+ struct ps_strings pss;
+ Elf_Auxinfo aux;
+ vm_offset_t vptr, ptr;
+ char **proc_vector;
+ size_t vsize, size;
+ int error, i;
+
+#ifdef COMPAT_FREEBSD32
+ if (SV_PROC_FLAG(p, SV_ILP32) != 0)
+ return (get_proc_vector32(td, p, proc_vectorp, vsizep, type));
+#endif
+ error = proc_read_mem(td, p, (vm_offset_t)(p->p_sysent->sv_psstrings),
+ &pss, sizeof(pss));
+ if (error != 0)
+ return (error);
+ switch (type) {
+ case PROC_ARG:
+ vptr = (vm_offset_t)pss.ps_argvstr;
+ vsize = pss.ps_nargvstr;
+ if (vsize > ARG_MAX)
+ return (ENOEXEC);
+ size = vsize * sizeof(char *);
+ break;
+ case PROC_ENV:
+ vptr = (vm_offset_t)pss.ps_envstr;
+ vsize = pss.ps_nenvstr;
+ if (vsize > ARG_MAX)
+ return (ENOEXEC);
+ size = vsize * sizeof(char *);
+ break;
+ case PROC_AUX:
+ /*
+ * The aux array is just above env array on the stack. Check
+ * that the address is naturally aligned.
+ */
+ vptr = (vm_offset_t)pss.ps_envstr + (pss.ps_nenvstr + 1)
+ * sizeof(char *);
+#if __ELF_WORD_SIZE == 64
+ if (vptr % sizeof(uint64_t) != 0)
+#else
+ if (vptr % sizeof(uint32_t) != 0)
+#endif
+ return (ENOEXEC);
+ /*
+ * We count the array size by reading the aux vectors from the
+ * stack until the AT_NULL vector is returned. So (to keep the code
+ * simple) we read the process stack twice: the first time here
+ * to find the size and the second time when copying the vectors
+ * to the allocated proc_vector.
+ */
+ for (ptr = vptr, i = 0; i < PROC_AUXV_MAX; i++) {
+ error = proc_read_mem(td, p, ptr, &aux, sizeof(aux));
+ if (error != 0)
+ return (error);
+ if (aux.a_type == AT_NULL)
+ break;
+ ptr += sizeof(aux);
+ }
+ /*
+ * If the PROC_AUXV_MAX entries are iterated over, and we have
+ * not reached AT_NULL, it is most likely we are reading wrong
+ * data: either the process doesn't have auxv array or data has
+ * been modified. Return the error in this case.
+ */
+ if (aux.a_type != AT_NULL)
+ return (ENOEXEC);
+ vsize = i + 1;
+ size = vsize * sizeof(aux);
+ break;
+ default:
+ KASSERT(0, ("Wrong proc vector type: %d", type));
+ return (EINVAL); /* In case we are built without INVARIANTS. */
+ }
+ proc_vector = malloc(size, M_TEMP, M_WAITOK);
+ if (proc_vector == NULL)
+ return (ENOMEM);
+ error = proc_read_mem(td, p, vptr, proc_vector, size);
+ if (error != 0) {
+ free(proc_vector, M_TEMP);
+ return (error);
+ }
+ *proc_vectorp = proc_vector;
+ *vsizep = vsize;
+
+ return (0);
+}
+
+#define GET_PS_STRINGS_CHUNK_SZ 256 /* Chunk size (bytes) for ps_strings operations. */
+
+static int
+get_ps_strings(struct thread *td, struct proc *p, struct sbuf *sb,
+ enum proc_vector_type type)
+{
+ size_t done, len, nchr, vsize;
+ int error, i;
+ char **proc_vector, *sptr;
+ char pss_string[GET_PS_STRINGS_CHUNK_SZ];
+
+ PROC_ASSERT_HELD(p);
+
+ /*
+ * We are not going to read more than 2 * (PATH_MAX + ARG_MAX) bytes.
+ */
+ nchr = 2 * (PATH_MAX + ARG_MAX);
+
+ error = get_proc_vector(td, p, &proc_vector, &vsize, type);
+ if (error != 0)
+ return (error);
+ for (done = 0, i = 0; i < (int)vsize && done < nchr; i++) {
+ /*
+ * The program may have scribbled into its argv array, e.g. to
+ * remove some arguments. If that has happened, break out
+ * before trying to read from NULL.
+ */
+ if (proc_vector[i] == NULL)
+ break;
+ for (sptr = proc_vector[i]; ; sptr += GET_PS_STRINGS_CHUNK_SZ) {
+ error = proc_read_string(td, p, sptr, pss_string,
+ sizeof(pss_string));
+ if (error != 0)
+ goto done;
+ len = strnlen(pss_string, GET_PS_STRINGS_CHUNK_SZ);
+ if (done + len >= nchr)
+ len = nchr - done - 1;
+ sbuf_bcat(sb, pss_string, len);
+ if (len != GET_PS_STRINGS_CHUNK_SZ)
+ break;
+ done += GET_PS_STRINGS_CHUNK_SZ;
+ }
+ sbuf_bcat(sb, "", 1);
+ done += len + 1;
+ }
+done:
+ free(proc_vector, M_TEMP);
+ return (error);
+}
+
+int
+proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb)
+{
+
+ return (get_ps_strings(curthread, p, sb, PROC_ARG));
+}
+
+int
+proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb)
+{
+
+ return (get_ps_strings(curthread, p, sb, PROC_ENV));
+}
+
+int
+proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb)
+{
+ size_t vsize, size;
+ char **auxv;
+ int error;
+
+ error = get_proc_vector(td, p, &auxv, &vsize, PROC_AUX);
+ if (error == 0) {
+#ifdef COMPAT_FREEBSD32
+ if (SV_PROC_FLAG(p, SV_ILP32) != 0)
+ size = vsize * sizeof(Elf32_Auxinfo);
+ else
+#endif
+ size = vsize * sizeof(Elf_Auxinfo);
+ error = sbuf_bcat(sb, auxv, size);
+ free(auxv, M_TEMP);
+ }
+ return (error);
+}
+
+/*
+ * This sysctl allows a process to retrieve the argument list or process
+ * title for another process without groping around in the address space
+ * of the other process. It also allows a process to set its own process
+ * title to a string of its own choice.
+ */
+static int
+sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ struct pargs *newpa, *pa;
+ struct proc *p;
+ struct sbuf sb;
+ int flags, error = 0, error2;
+
+ if (namelen != 1)
+ return (EINVAL);
+
+ flags = PGET_CANSEE;
+ if (req->newptr != NULL)
+ flags |= PGET_ISCURRENT;
+ error = pget((pid_t)name[0], flags, &p);
+ if (error)
+ return (error);
+
+ pa = p->p_args;
+ if (pa != NULL) {
+ pargs_hold(pa);
+ PROC_UNLOCK(p);
+ error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length);
+ pargs_drop(pa);
+ } else if ((p->p_flag & (P_WEXIT | P_SYSTEM)) == 0) {
+ _PHOLD(p);
+ PROC_UNLOCK(p);
+ sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req);
+ error = proc_getargv(curthread, p, &sb);
+ error2 = sbuf_finish(&sb);
+ PRELE(p);
+ sbuf_delete(&sb);
+ if (error == 0 && error2 != 0)
+ error = error2;
+ } else {
+ PROC_UNLOCK(p);
+ }
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit)
+ return (ENOMEM);
+ newpa = pargs_alloc(req->newlen);
+ error = SYSCTL_IN(req, newpa->ar_args, req->newlen);
+ if (error != 0) {
+ pargs_free(newpa);
+ return (error);
+ }
+ PROC_LOCK(p);
+ pa = p->p_args;
+ p->p_args = newpa;
+ PROC_UNLOCK(p);
+ pargs_drop(pa);
+ return (0);
+}
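[Editorial sketch, not part of this change: the read side of this handler is reachable from userland with the MIB { CTL_KERN, KERN_PROC, KERN_PROC_ARGS, pid }. A self-contained, hedged example follows; the 4096-byte buffer is an arbitrary choice and error handling is kept minimal.]

	#include <sys/param.h>
	#include <sys/sysctl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int
	main(void)
	{
		int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_ARGS, (int)getpid() };
		char buf[4096];
		size_t len = sizeof(buf);

		if (sysctl(mib, 4, buf, &len, NULL, 0) == -1)
			return (1);
		/* Arguments come back as consecutive NUL-terminated strings. */
		for (size_t off = 0; off < len; off += strlen(buf + off) + 1)
			printf("%s\n", buf + off);
		return (0);
	}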
+
+/*
+ * This sysctl allows a process to retrieve the environment of another process.
+ */
+static int
+sysctl_kern_proc_env(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ struct proc *p;
+ struct sbuf sb;
+ int error, error2;
+
+ if (namelen != 1)
+ return (EINVAL);
+
+ error = pget((pid_t)name[0], PGET_WANTREAD, &p);
+ if (error != 0)
+ return (error);
+ if ((p->p_flag & P_SYSTEM) != 0) {
+ PRELE(p);
+ return (0);
+ }
+
+ sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req);
+ error = proc_getenvv(curthread, p, &sb);
+ error2 = sbuf_finish(&sb);
+ PRELE(p);
+ sbuf_delete(&sb);
+ return (error != 0 ? error : error2);
+}
+
+/*
+ * This sysctl allows a process to retrieve the ELF auxiliary vector of
+ * another process.
+ */
+static int
+sysctl_kern_proc_auxv(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ struct proc *p;
+ struct sbuf sb;
+ int error, error2;
+
+ if (namelen != 1)
+ return (EINVAL);
+
+ error = pget((pid_t)name[0], PGET_WANTREAD, &p);
+ if (error != 0)
+ return (error);
+ if ((p->p_flag & P_SYSTEM) != 0) {
+ PRELE(p);
+ return (0);
+ }
+ sbuf_new_for_sysctl(&sb, NULL, GET_PS_STRINGS_CHUNK_SZ, req);
+ error = proc_getauxv(curthread, p, &sb);
+ error2 = sbuf_finish(&sb);
+ PRELE(p);
+ sbuf_delete(&sb);
+ return (error != 0 ? error : error2);
+}
+
+/*
+ * This sysctl allows a process to retrieve the path of the executable for
+ * itself or another process.
+ */
+static int
+sysctl_kern_proc_pathname(SYSCTL_HANDLER_ARGS)
+{
+ pid_t *pidp = (pid_t *)arg1;
+ unsigned int arglen = arg2;
+ struct proc *p;
+ struct vnode *vp;
+ char *retbuf, *freebuf;
+ int error;
+
+ if (arglen != 1)
+ return (EINVAL);
+ if (*pidp == -1) { /* -1 means this process */
+ p = req->td->td_proc;
+ } else {
+ error = pget(*pidp, PGET_CANSEE, &p);
+ if (error != 0)
+ return (error);
+ }
+
+ vp = p->p_textvp;
+ if (vp == NULL) {
+ if (*pidp != -1)
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ vref(vp);
+ if (*pidp != -1)
+ PROC_UNLOCK(p);
+ error = vn_fullpath(req->td, vp, &retbuf, &freebuf);
+ vrele(vp);
+ if (error)
+ return (error);
+ error = SYSCTL_OUT(req, retbuf, strlen(retbuf) + 1);
+ free(freebuf, M_TEMP);
+ return (error);
+}
+
+static int
+sysctl_kern_proc_sv_name(SYSCTL_HANDLER_ARGS)
+{
+ struct proc *p;
+ char *sv_name;
+ int *name;
+ int namelen;
+ int error;
+
+ namelen = arg2;
+ if (namelen != 1)
+ return (EINVAL);
+
+ name = (int *)arg1;
+ error = pget((pid_t)name[0], PGET_CANSEE, &p);
+ if (error != 0)
+ return (error);
+ sv_name = p->p_sysent->sv_name;
+ PROC_UNLOCK(p);
+ return (sysctl_handle_string(oidp, sv_name, 0, req));
+}
+
+#ifdef KINFO_OVMENTRY_SIZE
+CTASSERT(sizeof(struct kinfo_ovmentry) == KINFO_OVMENTRY_SIZE);
+#endif
+
+#ifdef COMPAT_FREEBSD7
+static int
+sysctl_kern_proc_ovmmap(SYSCTL_HANDLER_ARGS)
+{
+ vm_map_entry_t entry, tmp_entry;
+ unsigned int last_timestamp;
+ char *fullpath, *freepath;
+ struct kinfo_ovmentry *kve;
+ struct vattr va;
+ struct ucred *cred;
+ int error, *name;
+ struct vnode *vp;
+ struct proc *p;
+ vm_map_t map;
+ struct vmspace *vm;
+
+ name = (int *)arg1;
+ error = pget((pid_t)name[0], PGET_WANTREAD, &p);
+ if (error != 0)
+ return (error);
+ vm = vmspace_acquire_ref(p);
+ if (vm == NULL) {
+ PRELE(p);
+ return (ESRCH);
+ }
+ kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK);
+
+ map = &vm->vm_map;
+ vm_map_lock_read(map);
+ for (entry = map->header.next; entry != &map->header;
+ entry = entry->next) {
+ vm_object_t obj, tobj, lobj;
+ vm_offset_t addr;
+
+ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
+ continue;
+
+ bzero(kve, sizeof(*kve));
+ kve->kve_structsize = sizeof(*kve);
+
+ kve->kve_private_resident = 0;
+ obj = entry->object.vm_object;
+ if (obj != NULL) {
+ VM_OBJECT_RLOCK(obj);
+ if (obj->shadow_count == 1)
+ kve->kve_private_resident =
+ obj->resident_page_count;
+ }
+ kve->kve_resident = 0;
+ addr = entry->start;
+ while (addr < entry->end) {
+ if (pmap_extract(map->pmap, addr))
+ kve->kve_resident++;
+ addr += PAGE_SIZE;
+ }
+
+ for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) {
+ if (tobj != obj)
+ VM_OBJECT_RLOCK(tobj);
+ if (lobj != obj)
+ VM_OBJECT_RUNLOCK(lobj);
+ lobj = tobj;
+ }
+
+ kve->kve_start = (void*)entry->start;
+ kve->kve_end = (void*)entry->end;
+ kve->kve_offset = (off_t)entry->offset;
+
+ if (entry->protection & VM_PROT_READ)
+ kve->kve_protection |= KVME_PROT_READ;
+ if (entry->protection & VM_PROT_WRITE)
+ kve->kve_protection |= KVME_PROT_WRITE;
+ if (entry->protection & VM_PROT_EXECUTE)
+ kve->kve_protection |= KVME_PROT_EXEC;
+
+ if (entry->eflags & MAP_ENTRY_COW)
+ kve->kve_flags |= KVME_FLAG_COW;
+ if (entry->eflags & MAP_ENTRY_NEEDS_COPY)
+ kve->kve_flags |= KVME_FLAG_NEEDS_COPY;
+ if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
+ kve->kve_flags |= KVME_FLAG_NOCOREDUMP;
+
+ last_timestamp = map->timestamp;
+ vm_map_unlock_read(map);
+
+ kve->kve_fileid = 0;
+ kve->kve_fsid = 0;
+ freepath = NULL;
+ fullpath = "";
+ if (lobj) {
+ vp = NULL;
+ switch (lobj->type) {
+ case OBJT_DEFAULT:
+ kve->kve_type = KVME_TYPE_DEFAULT;
+ break;
+ case OBJT_VNODE:
+ kve->kve_type = KVME_TYPE_VNODE;
+ vp = lobj->handle;
+ vref(vp);
+ break;
+ case OBJT_SWAP:
+ kve->kve_type = KVME_TYPE_SWAP;
+ break;
+ case OBJT_DEVICE:
+ kve->kve_type = KVME_TYPE_DEVICE;
+ break;
+ case OBJT_PHYS:
+ kve->kve_type = KVME_TYPE_PHYS;
+ break;
+ case OBJT_DEAD:
+ kve->kve_type = KVME_TYPE_DEAD;
+ break;
+ case OBJT_SG:
+ kve->kve_type = KVME_TYPE_SG;
+ break;
+ default:
+ kve->kve_type = KVME_TYPE_UNKNOWN;
+ break;
+ }
+ if (lobj != obj)
+ VM_OBJECT_RUNLOCK(lobj);
+
+ kve->kve_ref_count = obj->ref_count;
+ kve->kve_shadow_count = obj->shadow_count;
+ VM_OBJECT_RUNLOCK(obj);
+ if (vp != NULL) {
+ vn_fullpath(curthread, vp, &fullpath,
+ &freepath);
+ cred = curthread->td_ucred;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ if (VOP_GETATTR(vp, &va, cred) == 0) {
+ kve->kve_fileid = va.va_fileid;
+ kve->kve_fsid = va.va_fsid;
+ }
+ vput(vp);
+ }
+ } else {
+ kve->kve_type = KVME_TYPE_NONE;
+ kve->kve_ref_count = 0;
+ kve->kve_shadow_count = 0;
+ }
+
+ strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path));
+ if (freepath != NULL)
+ free(freepath, M_TEMP);
+
+ error = SYSCTL_OUT(req, kve, sizeof(*kve));
+ vm_map_lock_read(map);
+ if (error)
+ break;
+ if (last_timestamp != map->timestamp) {
+ vm_map_lookup_entry(map, addr - 1, &tmp_entry);
+ entry = tmp_entry;
+ }
+ }
+ vm_map_unlock_read(map);
+ vmspace_free(vm);
+ PRELE(p);
+ free(kve, M_TEMP);
+ return (error);
+}
+#endif /* COMPAT_FREEBSD7 */
+
+#ifdef KINFO_VMENTRY_SIZE
+CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE);
+#endif
+
+/*
+ * Must be called with the process locked and will return unlocked.
+ */
+int
+kern_proc_vmmap_out(struct proc *p, struct sbuf *sb)
+{
+ vm_map_entry_t entry, tmp_entry;
+ unsigned int last_timestamp;
+ char *fullpath, *freepath;
+ struct kinfo_vmentry *kve;
+ struct vattr va;
+ struct ucred *cred;
+ int error;
+ struct vnode *vp;
+ struct vmspace *vm;
+ vm_map_t map;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ _PHOLD(p);
+ PROC_UNLOCK(p);
+ vm = vmspace_acquire_ref(p);
+ if (vm == NULL) {
+ PRELE(p);
+ return (ESRCH);
+ }
+ kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK);
+
+ error = 0;
+ map = &vm->vm_map;
+ vm_map_lock_read(map);
+ for (entry = map->header.next; entry != &map->header;
+ entry = entry->next) {
+ vm_object_t obj, tobj, lobj;
+ vm_offset_t addr;
+ vm_paddr_t locked_pa;
+ int mincoreinfo;
+
+ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
+ continue;
+
+ bzero(kve, sizeof(*kve));
+
+ kve->kve_private_resident = 0;
+ obj = entry->object.vm_object;
+ if (obj != NULL) {
+ VM_OBJECT_RLOCK(obj);
+ if (obj->shadow_count == 1)
+ kve->kve_private_resident =
+ obj->resident_page_count;
+ }
+ kve->kve_resident = 0;
+ addr = entry->start;
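+		/*
+		 * Count resident pages and detect superpage mappings by
+		 * querying the pmap one page at a time.
+		 */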
+ while (addr < entry->end) {
+ locked_pa = 0;
+ mincoreinfo = pmap_mincore(map->pmap, addr, &locked_pa);
+ if (locked_pa != 0)
+ vm_page_unlock(PHYS_TO_VM_PAGE(locked_pa));
+ if (mincoreinfo & MINCORE_INCORE)
+ kve->kve_resident++;
+ if (mincoreinfo & MINCORE_SUPER)
+ kve->kve_flags |= KVME_FLAG_SUPER;
+ addr += PAGE_SIZE;
+ }
+
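+		/*
+		 * Descend the backing object chain to find the bottom-most
+		 * object, keeping only the object currently being examined
+		 * locked (plus the top-level object).
+		 */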
+ for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) {
+ if (tobj != obj)
+ VM_OBJECT_RLOCK(tobj);
+ if (lobj != obj)
+ VM_OBJECT_RUNLOCK(lobj);
+ lobj = tobj;
+ }
+
+ kve->kve_start = entry->start;
+ kve->kve_end = entry->end;
+ kve->kve_offset = entry->offset;
+
+ if (entry->protection & VM_PROT_READ)
+ kve->kve_protection |= KVME_PROT_READ;
+ if (entry->protection & VM_PROT_WRITE)
+ kve->kve_protection |= KVME_PROT_WRITE;
+ if (entry->protection & VM_PROT_EXECUTE)
+ kve->kve_protection |= KVME_PROT_EXEC;
+
+ if (entry->eflags & MAP_ENTRY_COW)
+ kve->kve_flags |= KVME_FLAG_COW;
+ if (entry->eflags & MAP_ENTRY_NEEDS_COPY)
+ kve->kve_flags |= KVME_FLAG_NEEDS_COPY;
+ if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
+ kve->kve_flags |= KVME_FLAG_NOCOREDUMP;
+ if (entry->eflags & MAP_ENTRY_GROWS_UP)
+ kve->kve_flags |= KVME_FLAG_GROWS_UP;
+ if (entry->eflags & MAP_ENTRY_GROWS_DOWN)
+ kve->kve_flags |= KVME_FLAG_GROWS_DOWN;
+
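+		/*
+		 * Record the map timestamp and release the map lock around
+		 * the vnode operations and sbuf copyout below, which may
+		 * sleep.
+		 */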
+ last_timestamp = map->timestamp;
+ vm_map_unlock_read(map);
+
+ freepath = NULL;
+ fullpath = "";
+ if (lobj) {
+ vp = NULL;
+ switch (lobj->type) {
+ case OBJT_DEFAULT:
+ kve->kve_type = KVME_TYPE_DEFAULT;
+ break;
+ case OBJT_VNODE:
+ kve->kve_type = KVME_TYPE_VNODE;
+ vp = lobj->handle;
+ vref(vp);
+ break;
+ case OBJT_SWAP:
+ kve->kve_type = KVME_TYPE_SWAP;
+ break;
+ case OBJT_DEVICE:
+ kve->kve_type = KVME_TYPE_DEVICE;
+ break;
+ case OBJT_PHYS:
+ kve->kve_type = KVME_TYPE_PHYS;
+ break;
+ case OBJT_DEAD:
+ kve->kve_type = KVME_TYPE_DEAD;
+ break;
+ case OBJT_SG:
+ kve->kve_type = KVME_TYPE_SG;
+ break;
+ default:
+ kve->kve_type = KVME_TYPE_UNKNOWN;
+ break;
+ }
+ if (lobj != obj)
+ VM_OBJECT_RUNLOCK(lobj);
+
+ kve->kve_ref_count = obj->ref_count;
+ kve->kve_shadow_count = obj->shadow_count;
+ VM_OBJECT_RUNLOCK(obj);
+ if (vp != NULL) {
+ vn_fullpath(curthread, vp, &fullpath,
+ &freepath);
+ kve->kve_vn_type = vntype_to_kinfo(vp->v_type);
+ cred = curthread->td_ucred;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ if (VOP_GETATTR(vp, &va, cred) == 0) {
+ kve->kve_vn_fileid = va.va_fileid;
+ kve->kve_vn_fsid = va.va_fsid;
+ kve->kve_vn_mode =
+ MAKEIMODE(va.va_type, va.va_mode);
+ kve->kve_vn_size = va.va_size;
+ kve->kve_vn_rdev = va.va_rdev;
+ kve->kve_status = KF_ATTR_VALID;
+ }
+ vput(vp);
+ }
+ } else {
+ kve->kve_type = KVME_TYPE_NONE;
+ kve->kve_ref_count = 0;
+ kve->kve_shadow_count = 0;
+ }
+
+ strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path));
+ if (freepath != NULL)
+ free(freepath, M_TEMP);
+
+ /* Pack record size down */
+ kve->kve_structsize = offsetof(struct kinfo_vmentry, kve_path) +
+ strlen(kve->kve_path) + 1;
+ kve->kve_structsize = roundup(kve->kve_structsize,
+ sizeof(uint64_t));
+ error = sbuf_bcat(sb, kve, kve->kve_structsize);
+ vm_map_lock_read(map);
+ if (error)
+ break;
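+		/*
+		 * The map may have changed while it was unlocked; if so,
+		 * re-find the entry containing the last processed address
+		 * and resume.
+		 */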
+ if (last_timestamp != map->timestamp) {
+ vm_map_lookup_entry(map, addr - 1, &tmp_entry);
+ entry = tmp_entry;
+ }
+ }
+ vm_map_unlock_read(map);
+ vmspace_free(vm);
+ PRELE(p);
+ free(kve, M_TEMP);
+ return (error);
+}
+
+static int
+sysctl_kern_proc_vmmap(SYSCTL_HANDLER_ARGS)
+{
+ struct proc *p;
+ struct sbuf sb;
+ int error, error2, *name;
+
+ name = (int *)arg1;
+ sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_vmentry), req);
+ error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
+ if (error != 0) {
+ sbuf_delete(&sb);
+ return (error);
+ }
+ error = kern_proc_vmmap_out(p, &sb);
+ error2 = sbuf_finish(&sb);
+ sbuf_delete(&sb);
+ return (error != 0 ? error : error2);
+}
+
+#if defined(STACK) || defined(DDB)
+static int
+sysctl_kern_proc_kstack(SYSCTL_HANDLER_ARGS)
+{
+ struct kinfo_kstack *kkstp;
+ int error, i, *name, numthreads;
+ lwpid_t *lwpidarray;
+ struct thread *td;
+ struct stack *st;
+ struct sbuf sb;
+ struct proc *p;
+
+ name = (int *)arg1;
+ error = pget((pid_t)name[0], PGET_NOTINEXEC | PGET_WANTREAD, &p);
+ if (error != 0)
+ return (error);
+
+ kkstp = malloc(sizeof(*kkstp), M_TEMP, M_WAITOK);
+ st = stack_create();
+
+ lwpidarray = NULL;
+ numthreads = 0;
+ PROC_LOCK(p);
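+	/*
+	 * The number of threads may change while the proc lock is dropped
+	 * for the allocation below, so re-check and reallocate until the
+	 * lwpid array is large enough.
+	 */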
+repeat:
+ if (numthreads < p->p_numthreads) {
+ if (lwpidarray != NULL) {
+ free(lwpidarray, M_TEMP);
+ lwpidarray = NULL;
+ }
+ numthreads = p->p_numthreads;
+ PROC_UNLOCK(p);
+ lwpidarray = malloc(sizeof(*lwpidarray) * numthreads, M_TEMP,
+ M_WAITOK | M_ZERO);
+ PROC_LOCK(p);
+ goto repeat;
+ }
+ i = 0;
+
+ /*
+ * XXXRW: During the below loop, execve(2) and countless other sorts
+ * of changes could have taken place. Should we check to see if the
+ * vmspace has been replaced, or the like, in order to prevent
+ * giving a snapshot that spans, say, execve(2), with some threads
+ * before and some after? Among other things, the credentials could
+ * have changed, in which case the right to extract debug info might
+ * no longer be assured.
+ */
+ FOREACH_THREAD_IN_PROC(p, td) {
+ KASSERT(i < numthreads,
+ ("sysctl_kern_proc_kstack: numthreads"));
+ lwpidarray[i] = td->td_tid;
+ i++;
+ }
+ numthreads = i;
+ for (i = 0; i < numthreads; i++) {
+ td = thread_find(p, lwpidarray[i]);
+ if (td == NULL) {
+ continue;
+ }
+ bzero(kkstp, sizeof(*kkstp));
+ (void)sbuf_new(&sb, kkstp->kkst_trace,
+ sizeof(kkstp->kkst_trace), SBUF_FIXEDLEN);
+ thread_lock(td);
+ kkstp->kkst_tid = td->td_tid;
+ if (TD_IS_SWAPPED(td))
+ kkstp->kkst_state = KKST_STATE_SWAPPED;
+ else if (TD_IS_RUNNING(td))
+ kkstp->kkst_state = KKST_STATE_RUNNING;
+ else {
+ kkstp->kkst_state = KKST_STATE_STACKOK;
+ stack_save_td(st, td);
+ }
+ thread_unlock(td);
+ PROC_UNLOCK(p);
+ stack_sbuf_print(&sb, st);
+ sbuf_finish(&sb);
+ sbuf_delete(&sb);
+ error = SYSCTL_OUT(req, kkstp, sizeof(*kkstp));
+ PROC_LOCK(p);
+ if (error)
+ break;
+ }
+ _PRELE(p);
+ PROC_UNLOCK(p);
+ if (lwpidarray != NULL)
+ free(lwpidarray, M_TEMP);
+ stack_destroy(st);
+ free(kkstp, M_TEMP);
+ return (error);
+}
+#endif
+
+/*
+ * This sysctl allows a process to retrieve the full list of groups from
+ * itself or another process.
+ */
+static int
+sysctl_kern_proc_groups(SYSCTL_HANDLER_ARGS)
+{
+ pid_t *pidp = (pid_t *)arg1;
+ unsigned int arglen = arg2;
+ struct proc *p;
+ struct ucred *cred;
+ int error;
+
+ if (arglen != 1)
+ return (EINVAL);
+ if (*pidp == -1) { /* -1 means this process */
+ p = req->td->td_proc;
+ } else {
+ error = pget(*pidp, PGET_CANSEE, &p);
+ if (error != 0)
+ return (error);
+ }
+
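+	/*
+	 * Hold a reference on the credentials so they remain valid while
+	 * being copied out.
+	 */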
+ cred = crhold(p->p_ucred);
+ if (*pidp != -1)
+ PROC_UNLOCK(p);
+
+ error = SYSCTL_OUT(req, cred->cr_groups,
+ cred->cr_ngroups * sizeof(gid_t));
+ crfree(cred);
+ return (error);
+}
+
+/*
+ * This sysctl allows a process to retrieve and/or set the resource limit for
+ * another process.
+ */
+static int
+sysctl_kern_proc_rlimit(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ struct rlimit rlim;
+ struct proc *p;
+ u_int which;
+ int flags, error;
+
+ if (namelen != 2)
+ return (EINVAL);
+
+ which = (u_int)name[1];
+ if (which >= RLIM_NLIMITS)
+ return (EINVAL);
+
+ if (req->newptr != NULL && req->newlen != sizeof(rlim))
+ return (EINVAL);
+
+ flags = PGET_HOLD | PGET_NOTWEXIT;
+ if (req->newptr != NULL)
+ flags |= PGET_CANDEBUG;
+ else
+ flags |= PGET_CANSEE;
+ error = pget((pid_t)name[0], flags, &p);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve limit.
+ */
+ if (req->oldptr != NULL) {
+ PROC_LOCK(p);
+ lim_rlimit(p, which, &rlim);
+ PROC_UNLOCK(p);
+ }
+ error = SYSCTL_OUT(req, &rlim, sizeof(rlim));
+ if (error != 0)
+ goto errout;
+
+ /*
+ * Set limit.
+ */
+ if (req->newptr != NULL) {
+ error = SYSCTL_IN(req, &rlim, sizeof(rlim));
+ if (error == 0)
+ error = kern_proc_setrlimit(curthread, p, which, &rlim);
+ }
+
+errout:
+ PRELE(p);
+ return (error);
+}
+
+/*
+ * This sysctl allows a process to retrieve the ps_strings structure location of
+ * another process.
+ */
+static int
+sysctl_kern_proc_ps_strings(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ struct proc *p;
+ vm_offset_t ps_strings;
+ int error;
+#ifdef COMPAT_FREEBSD32
+ uint32_t ps_strings32;
+#endif
+
+ if (namelen != 1)
+ return (EINVAL);
+
+ error = pget((pid_t)name[0], PGET_CANDEBUG, &p);
+ if (error != 0)
+ return (error);
+#ifdef COMPAT_FREEBSD32
+ if ((req->flags & SCTL_MASK32) != 0) {
+ /*
+ * We return 0 if the 32 bit emulation request is for a 64 bit
+ * process.
+ */
+ ps_strings32 = SV_PROC_FLAG(p, SV_ILP32) != 0 ?
+ PTROUT(p->p_sysent->sv_psstrings) : 0;
+ PROC_UNLOCK(p);
+ error = SYSCTL_OUT(req, &ps_strings32, sizeof(ps_strings32));
+ return (error);
+ }
+#endif
+ ps_strings = p->p_sysent->sv_psstrings;
+ PROC_UNLOCK(p);
+ error = SYSCTL_OUT(req, &ps_strings, sizeof(ps_strings));
+ return (error);
+}
+
+/*
+ * This sysctl allows a process to retrieve the umask of another process.
+ */
+static int
+sysctl_kern_proc_umask(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ struct proc *p;
+ int error;
+ u_short fd_cmask;
+
+ if (namelen != 1)
+ return (EINVAL);
+
+ error = pget((pid_t)name[0], PGET_WANTREAD, &p);
+ if (error != 0)
+ return (error);
+
+ FILEDESC_SLOCK(p->p_fd);
+ fd_cmask = p->p_fd->fd_cmask;
+ FILEDESC_SUNLOCK(p->p_fd);
+ PRELE(p);
+ error = SYSCTL_OUT(req, &fd_cmask, sizeof(fd_cmask));
+ return (error);
+}
+
+/*
+ * This sysctl allows a process to set and retrieve the binary osreldate of
+ * another process.
+ */
+static int
+sysctl_kern_proc_osrel(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ struct proc *p;
+ int flags, error, osrel;
+
+ if (namelen != 1)
+ return (EINVAL);
+
+ if (req->newptr != NULL && req->newlen != sizeof(osrel))
+ return (EINVAL);
+
+ flags = PGET_HOLD | PGET_NOTWEXIT;
+ if (req->newptr != NULL)
+ flags |= PGET_CANDEBUG;
+ else
+ flags |= PGET_CANSEE;
+ error = pget((pid_t)name[0], flags, &p);
+ if (error != 0)
+ return (error);
+
+ error = SYSCTL_OUT(req, &p->p_osrel, sizeof(p->p_osrel));
+ if (error != 0)
+ goto errout;
+
+ if (req->newptr != NULL) {
+ error = SYSCTL_IN(req, &osrel, sizeof(osrel));
+ if (error != 0)
+ goto errout;
+ if (osrel < 0) {
+ error = EINVAL;
+ goto errout;
+ }
+ p->p_osrel = osrel;
+ }
+errout:
+ PRELE(p);
+ return (error);
+}
+
+SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table");
+
+SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT|
+ CTLFLAG_MPSAFE, 0, 0, sysctl_kern_proc, "S,proc",
+ "Return entire process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_GID, gid, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_RGID, rgid, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_SESSION, sid, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc, "Return process table, no threads");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args,
+ CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE,
+ sysctl_kern_proc_args, "Process argument list");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_ENV, env, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ sysctl_kern_proc_env, "Process environment");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_AUXV, auxv, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_auxv, "Process ELF auxiliary vector");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_PATHNAME, pathname, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_pathname, "Process executable path");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_SV_NAME, sv_name, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_sv_name,
+ "Process syscall vector name (ABI type)");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_GID | KERN_PROC_INC_THREAD), gid_td,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_PGRP | KERN_PROC_INC_THREAD), pgrp_td,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_RGID | KERN_PROC_INC_THREAD), rgid_td,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_SESSION | KERN_PROC_INC_THREAD),
+ sid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_TTY | KERN_PROC_INC_THREAD), tty_td,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_UID | KERN_PROC_INC_THREAD), uid_td,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_RUID | KERN_PROC_INC_THREAD), ruid_td,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_PID | KERN_PROC_INC_THREAD), pid_td,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
+
+static SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td,
+ CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc,
+ "Return process table, no threads");
+
+#ifdef COMPAT_FREEBSD7
+static SYSCTL_NODE(_kern_proc, KERN_PROC_OVMMAP, ovmmap, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_ovmmap, "Old Process vm map entries");
+#endif
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_VMMAP, vmmap, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_vmmap, "Process vm map entries");
+
+#if defined(STACK) || defined(DDB)
+static SYSCTL_NODE(_kern_proc, KERN_PROC_KSTACK, kstack, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_kstack, "Process kernel stacks");
+#endif
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_GROUPS, groups, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_groups, "Process groups");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_RLIMIT, rlimit, CTLFLAG_RW |
+ CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_rlimit,
+ "Process resource limits");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_PS_STRINGS, ps_strings, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_ps_strings,
+ "Process ps_strings location");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_UMASK, umask, CTLFLAG_RD |
+ CTLFLAG_MPSAFE, sysctl_kern_proc_umask, "Process umask");
+
+static SYSCTL_NODE(_kern_proc, KERN_PROC_OSREL, osrel, CTLFLAG_RW |
+ CTLFLAG_ANYBODY | CTLFLAG_MPSAFE, sysctl_kern_proc_osrel,
+ "Process binary osreldate");
diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c
new file mode 100644
index 0000000..f99e053
--- /dev/null
+++ b/sys/kern/kern_prot.c
@@ -0,0 +1,2222 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993
+ * The Regents of the University of California.
+ * (c) UNIX System Laboratories, Inc.
+ * Copyright (c) 2000-2001 Robert N. M. Watson.
+ * All rights reserved.
+ *
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94
+ */
+
+/*
+ * System calls related to processes and protection
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/acct.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/loginclass.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/refcount.h>
+#include <sys/sx.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/sysproto.h>
+#include <sys/jail.h>
+#include <sys/pioctl.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+
+#ifdef REGRESSION
+FEATURE(regression,
+ "Kernel support for interfaces necessary for regression testing (SECURITY RISK!)");
+#endif
+
+#if defined(INET) || defined(INET6)
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#endif
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+static MALLOC_DEFINE(M_CRED, "cred", "credentials");
+
+SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW, 0, "BSD security policy");
+
+static void crextend(struct ucred *cr, int n);
+static void crsetgroups_locked(struct ucred *cr, int ngrp,
+ gid_t *groups);
+
+#ifndef _SYS_SYSPROTO_H_
+struct getpid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getpid(struct thread *td, struct getpid_args *uap)
+{
+ struct proc *p = td->td_proc;
+
+ td->td_retval[0] = p->p_pid;
+#if defined(COMPAT_43)
+ PROC_LOCK(p);
+ td->td_retval[1] = p->p_pptr->p_pid;
+ PROC_UNLOCK(p);
+#endif
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getppid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getppid(struct thread *td, struct getppid_args *uap)
+{
+ struct proc *p = td->td_proc;
+
+ PROC_LOCK(p);
+ td->td_retval[0] = p->p_pptr->p_pid;
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+/*
+ * Get process group ID; note that POSIX getpgrp takes no parameter.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getpgrp_args {
+ int dummy;
+};
+#endif
+int
+sys_getpgrp(struct thread *td, struct getpgrp_args *uap)
+{
+ struct proc *p = td->td_proc;
+
+ PROC_LOCK(p);
+ td->td_retval[0] = p->p_pgrp->pg_id;
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+/* Get an arbitrary pid's process group id */
+#ifndef _SYS_SYSPROTO_H_
+struct getpgid_args {
+ pid_t pid;
+};
+#endif
+int
+sys_getpgid(struct thread *td, struct getpgid_args *uap)
+{
+ struct proc *p;
+ int error;
+
+ if (uap->pid == 0) {
+ p = td->td_proc;
+ PROC_LOCK(p);
+ } else {
+ p = pfind(uap->pid);
+ if (p == NULL)
+ return (ESRCH);
+ error = p_cansee(td, p);
+ if (error) {
+ PROC_UNLOCK(p);
+ return (error);
+ }
+ }
+ td->td_retval[0] = p->p_pgrp->pg_id;
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+/*
+ * Get an arbitrary pid's session id.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getsid_args {
+ pid_t pid;
+};
+#endif
+int
+sys_getsid(struct thread *td, struct getsid_args *uap)
+{
+ struct proc *p;
+ int error;
+
+ if (uap->pid == 0) {
+ p = td->td_proc;
+ PROC_LOCK(p);
+ } else {
+ p = pfind(uap->pid);
+ if (p == NULL)
+ return (ESRCH);
+ error = p_cansee(td, p);
+ if (error) {
+ PROC_UNLOCK(p);
+ return (error);
+ }
+ }
+ td->td_retval[0] = p->p_session->s_sid;
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getuid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getuid(struct thread *td, struct getuid_args *uap)
+{
+
+ td->td_retval[0] = td->td_ucred->cr_ruid;
+#if defined(COMPAT_43)
+ td->td_retval[1] = td->td_ucred->cr_uid;
+#endif
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct geteuid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_geteuid(struct thread *td, struct geteuid_args *uap)
+{
+
+ td->td_retval[0] = td->td_ucred->cr_uid;
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getgid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getgid(struct thread *td, struct getgid_args *uap)
+{
+
+ td->td_retval[0] = td->td_ucred->cr_rgid;
+#if defined(COMPAT_43)
+ td->td_retval[1] = td->td_ucred->cr_groups[0];
+#endif
+ return (0);
+}
+
+/*
+ * Get effective group ID. The "egid" is groups[0], and could be obtained
+ * via getgroups. This syscall exists because it is somewhat painful to do
+ * correctly in a library function.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getegid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getegid(struct thread *td, struct getegid_args *uap)
+{
+
+ td->td_retval[0] = td->td_ucred->cr_groups[0];
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getgroups_args {
+ u_int gidsetsize;
+ gid_t *gidset;
+};
+#endif
+int
+sys_getgroups(struct thread *td, register struct getgroups_args *uap)
+{
+ gid_t *groups;
+ u_int ngrp;
+ int error;
+
+ if (uap->gidsetsize < td->td_ucred->cr_ngroups) {
+ if (uap->gidsetsize == 0)
+ ngrp = 0;
+ else
+ return (EINVAL);
+ } else
+ ngrp = td->td_ucred->cr_ngroups;
+ groups = malloc(ngrp * sizeof(*groups), M_TEMP, M_WAITOK);
+ error = kern_getgroups(td, &ngrp, groups);
+ if (error)
+ goto out;
+ if (uap->gidsetsize > 0)
+ error = copyout(groups, uap->gidset, ngrp * sizeof(gid_t));
+ if (error == 0)
+ td->td_retval[0] = ngrp;
+out:
+ free(groups, M_TEMP);
+ return (error);
+}
+
+int
+kern_getgroups(struct thread *td, u_int *ngrp, gid_t *groups)
+{
+ struct ucred *cred;
+
+ cred = td->td_ucred;
+ if (*ngrp == 0) {
+ *ngrp = cred->cr_ngroups;
+ return (0);
+ }
+ if (*ngrp < cred->cr_ngroups)
+ return (EINVAL);
+ *ngrp = cred->cr_ngroups;
+ bcopy(cred->cr_groups, groups, *ngrp * sizeof(gid_t));
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setsid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setsid(register struct thread *td, struct setsid_args *uap)
+{
+ struct pgrp *pgrp;
+ int error;
+ struct proc *p = td->td_proc;
+ struct pgrp *newpgrp;
+ struct session *newsess;
+
+ error = 0;
+ pgrp = NULL;
+
+ newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
+ newsess = malloc(sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO);
+
+ sx_xlock(&proctree_lock);
+
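+	/*
+	 * Fail if the process is already a process group leader or if a
+	 * process group with its pid already exists; otherwise create a
+	 * new session and process group led by this process.
+	 */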
+ if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) {
+ if (pgrp != NULL)
+ PGRP_UNLOCK(pgrp);
+ error = EPERM;
+ } else {
+ (void)enterpgrp(p, p->p_pid, newpgrp, newsess);
+ td->td_retval[0] = p->p_pid;
+ newpgrp = NULL;
+ newsess = NULL;
+ }
+
+ sx_xunlock(&proctree_lock);
+
+ if (newpgrp != NULL)
+ free(newpgrp, M_PGRP);
+ if (newsess != NULL)
+ free(newsess, M_SESSION);
+
+ return (error);
+}
+
+/*
+ * set process group (setpgid/old setpgrp)
+ *
+ * caller does setpgid(targpid, targpgid)
+ *
+ * pid must be caller or child of caller (ESRCH)
+ * if a child
+ * pid must be in same session (EPERM)
+ * pid can't have done an exec (EACCES)
+ * if pgid != pid
+ * there must exist some pid in same session having pgid (EPERM)
+ * pid must not be session leader (EPERM)
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct setpgid_args {
+ int pid; /* target process id */
+ int pgid; /* target pgrp id */
+};
+#endif
+/* ARGSUSED */
+int
+sys_setpgid(struct thread *td, register struct setpgid_args *uap)
+{
+ struct proc *curp = td->td_proc;
+ register struct proc *targp; /* target process */
+ register struct pgrp *pgrp; /* target pgrp */
+ int error;
+ struct pgrp *newpgrp;
+
+ if (uap->pgid < 0)
+ return (EINVAL);
+
+ error = 0;
+
+ newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
+
+ sx_xlock(&proctree_lock);
+ if (uap->pid != 0 && uap->pid != curp->p_pid) {
+ if ((targp = pfind(uap->pid)) == NULL) {
+ error = ESRCH;
+ goto done;
+ }
+ if (!inferior(targp)) {
+ PROC_UNLOCK(targp);
+ error = ESRCH;
+ goto done;
+ }
+ if ((error = p_cansee(td, targp))) {
+ PROC_UNLOCK(targp);
+ goto done;
+ }
+ if (targp->p_pgrp == NULL ||
+ targp->p_session != curp->p_session) {
+ PROC_UNLOCK(targp);
+ error = EPERM;
+ goto done;
+ }
+ if (targp->p_flag & P_EXEC) {
+ PROC_UNLOCK(targp);
+ error = EACCES;
+ goto done;
+ }
+ PROC_UNLOCK(targp);
+ } else
+ targp = curp;
+ if (SESS_LEADER(targp)) {
+ error = EPERM;
+ goto done;
+ }
+ if (uap->pgid == 0)
+ uap->pgid = targp->p_pid;
+ if ((pgrp = pgfind(uap->pgid)) == NULL) {
+ if (uap->pgid == targp->p_pid) {
+ error = enterpgrp(targp, uap->pgid, newpgrp,
+ NULL);
+ if (error == 0)
+ newpgrp = NULL;
+ } else
+ error = EPERM;
+ } else {
+ if (pgrp == targp->p_pgrp) {
+ PGRP_UNLOCK(pgrp);
+ goto done;
+ }
+ if (pgrp->pg_id != targp->p_pid &&
+ pgrp->pg_session != curp->p_session) {
+ PGRP_UNLOCK(pgrp);
+ error = EPERM;
+ goto done;
+ }
+ PGRP_UNLOCK(pgrp);
+ error = enterthispgrp(targp, pgrp);
+ }
+done:
+ sx_xunlock(&proctree_lock);
+ KASSERT((error == 0) || (newpgrp != NULL),
+ ("setpgid failed and newpgrp is NULL"));
+ if (newpgrp != NULL)
+ free(newpgrp, M_PGRP);
+ return (error);
+}
+
+/*
+ * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD
+ * compatible. It says that setting the uid/gid to euid/egid is a special
+ * case of "appropriate privilege". Once the rules are expanded out, this
+ * basically means that setuid(nnn) sets all three id's, in all permitted
+ * cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid())
+ * does not set the saved id - this is dangerous for traditional BSD
+ * programs. For this reason, we *really* do not want to set
+ * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2.
+ */
+#define POSIX_APPENDIX_B_4_2_2
+
+#ifndef _SYS_SYSPROTO_H_
+struct setuid_args {
+ uid_t uid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setuid(struct thread *td, struct setuid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ uid_t uid;
+ struct uidinfo *uip;
+ int error;
+
+ uid = uap->uid;
+ AUDIT_ARG_UID(uid);
+ newcred = crget();
+ uip = uifind(uid);
+ PROC_LOCK(p);
+ /*
+ * Copy credentials so other references do not see our changes.
+ */
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_setuid(oldcred, uid);
+ if (error)
+ goto fail;
+#endif
+
+ /*
+ * See if we have "permission" by POSIX 1003.1 rules.
+ *
+ * Note that setuid(geteuid()) is a special case of
+ * "appropriate privileges" in appendix B.4.2.2. We need
+ * to use this clause to be compatible with traditional BSD
+ * semantics. Basically, it means that "setuid(xx)" sets all
+ * three id's (assuming you have privs).
+ *
+ * Notes on the logic. We do things in three steps.
+ * 1: We determine if the euid is going to change, and do EPERM
+ * right away. We unconditionally change the euid later if this
+ * test is satisfied, simplifying that part of the logic.
+ * 2: We determine if the real and/or saved uids are going to
+ * change. Determined by compile options.
+ * 3: Change euid last. (after tests in #2 for "appropriate privs")
+ */
+ if (uid != oldcred->cr_ruid && /* allow setuid(getuid()) */
+#ifdef _POSIX_SAVED_IDS
+ uid != oldcred->cr_svuid && /* allow setuid(saved gid) */
+#endif
+#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */
+ uid != oldcred->cr_uid && /* allow setuid(geteuid()) */
+#endif
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETUID, 0)) != 0)
+ goto fail;
+
+#ifdef _POSIX_SAVED_IDS
+ /*
+ * Do we have "appropriate privileges" (are we root or uid == euid)
+ * If so, we are changing the real uid and/or saved uid.
+ */
+ if (
+#ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */
+ uid == oldcred->cr_uid ||
+#endif
+ /* We are using privs. */
+ priv_check_cred(oldcred, PRIV_CRED_SETUID, 0) == 0)
+#endif
+ {
+ /*
+ * Set the real uid and transfer proc count to new user.
+ */
+ if (uid != oldcred->cr_ruid) {
+ change_ruid(newcred, uip);
+ setsugid(p);
+ }
+ /*
+ * Set saved uid
+ *
+ * XXX always set saved uid even if not _POSIX_SAVED_IDS, as
+ * the security of seteuid() depends on it. B.4.2.2 says it
+ * is important that we should do this.
+ */
+ if (uid != oldcred->cr_svuid) {
+ change_svuid(newcred, uid);
+ setsugid(p);
+ }
+ }
+
+ /*
+ * In all permitted cases, we are changing the euid.
+ */
+ if (uid != oldcred->cr_uid) {
+ change_euid(newcred, uip);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+#ifdef RACCT
+ racct_proc_ucred_changed(p, oldcred, newcred);
+#endif
+ uifree(uip);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ uifree(uip);
+ crfree(newcred);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct seteuid_args {
+ uid_t euid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_seteuid(struct thread *td, struct seteuid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ uid_t euid;
+ struct uidinfo *euip;
+ int error;
+
+ euid = uap->euid;
+ AUDIT_ARG_EUID(euid);
+ newcred = crget();
+ euip = uifind(euid);
+ PROC_LOCK(p);
+ /*
+ * Copy credentials so other references do not see our changes.
+ */
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_seteuid(oldcred, euid);
+ if (error)
+ goto fail;
+#endif
+
+ if (euid != oldcred->cr_ruid && /* allow seteuid(getuid()) */
+ euid != oldcred->cr_svuid && /* allow seteuid(saved uid) */
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETEUID, 0)) != 0)
+ goto fail;
+
+ /*
+ * Everything's okay, do it.
+ */
+ if (oldcred->cr_uid != euid) {
+ change_euid(newcred, euip);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ uifree(euip);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ uifree(euip);
+ crfree(newcred);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setgid_args {
+ gid_t gid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setgid(struct thread *td, struct setgid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ gid_t gid;
+ int error;
+
+ gid = uap->gid;
+ AUDIT_ARG_GID(gid);
+ newcred = crget();
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_setgid(oldcred, gid);
+ if (error)
+ goto fail;
+#endif
+
+ /*
+ * See if we have "permission" by POSIX 1003.1 rules.
+ *
+ * Note that setgid(getegid()) is a special case of
+ * "appropriate privileges" in appendix B.4.2.2. We need
+ * to use this clause to be compatible with traditional BSD
+ * semantics. Basically, it means that "setgid(xx)" sets all
+ * three id's (assuming you have privs).
+ *
+ * For notes on the logic here, see setuid() above.
+ */
+ if (gid != oldcred->cr_rgid && /* allow setgid(getgid()) */
+#ifdef _POSIX_SAVED_IDS
+ gid != oldcred->cr_svgid && /* allow setgid(saved gid) */
+#endif
+#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */
+ gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */
+#endif
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETGID, 0)) != 0)
+ goto fail;
+
+#ifdef _POSIX_SAVED_IDS
+ /*
+ * Do we have "appropriate privileges" (are we root or gid == egid)
+ * If so, we are changing the real uid and saved gid.
+ */
+ if (
+#ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */
+ gid == oldcred->cr_groups[0] ||
+#endif
+ /* We are using privs. */
+ priv_check_cred(oldcred, PRIV_CRED_SETGID, 0) == 0)
+#endif
+ {
+ /*
+ * Set real gid
+ */
+ if (oldcred->cr_rgid != gid) {
+ change_rgid(newcred, gid);
+ setsugid(p);
+ }
+ /*
+ * Set saved gid
+ *
+ * XXX always set saved gid even if not _POSIX_SAVED_IDS, as
+ * the security of setegid() depends on it. B.4.2.2 says it
+ * is important that we should do this.
+ */
+ if (oldcred->cr_svgid != gid) {
+ change_svgid(newcred, gid);
+ setsugid(p);
+ }
+ }
+ /*
+	 * In all permitted cases, we are changing the egid.
+ * Copy credentials so other references do not see our changes.
+ */
+ if (oldcred->cr_groups[0] != gid) {
+ change_egid(newcred, gid);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ crfree(newcred);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setegid_args {
+ gid_t egid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setegid(struct thread *td, struct setegid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ gid_t egid;
+ int error;
+
+ egid = uap->egid;
+ AUDIT_ARG_EGID(egid);
+ newcred = crget();
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_setegid(oldcred, egid);
+ if (error)
+ goto fail;
+#endif
+
+ if (egid != oldcred->cr_rgid && /* allow setegid(getgid()) */
+ egid != oldcred->cr_svgid && /* allow setegid(saved gid) */
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETEGID, 0)) != 0)
+ goto fail;
+
+ if (oldcred->cr_groups[0] != egid) {
+ change_egid(newcred, egid);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ crfree(newcred);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setgroups_args {
+ u_int gidsetsize;
+ gid_t *gidset;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setgroups(struct thread *td, struct setgroups_args *uap)
+{
+ gid_t *groups = NULL;
+ int error;
+
+ if (uap->gidsetsize > ngroups_max + 1)
+ return (EINVAL);
+ groups = malloc(uap->gidsetsize * sizeof(gid_t), M_TEMP, M_WAITOK);
+ error = copyin(uap->gidset, groups, uap->gidsetsize * sizeof(gid_t));
+ if (error)
+ goto out;
+ error = kern_setgroups(td, uap->gidsetsize, groups);
+out:
+ free(groups, M_TEMP);
+ return (error);
+}
+
+int
+kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ int error;
+
+ if (ngrp > ngroups_max + 1)
+ return (EINVAL);
+ AUDIT_ARG_GROUPSET(groups, ngrp);
+ newcred = crget();
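+	/* Make sure the new credential's group array can hold ngrp entries. */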
+ crextend(newcred, ngrp);
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_setgroups(oldcred, ngrp, groups);
+ if (error)
+ goto fail;
+#endif
+
+ error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0);
+ if (error)
+ goto fail;
+
+ if (ngrp < 1) {
+ /*
+ * setgroups(0, NULL) is a legitimate way of clearing the
+ * groups vector on non-BSD systems (which generally do not
+ * have the egid in the groups[0]). We risk security holes
+ * when running non-BSD software if we do not do the same.
+ */
+ newcred->cr_ngroups = 1;
+ } else {
+ crsetgroups_locked(newcred, ngrp, groups);
+ }
+ setsugid(p);
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ crfree(newcred);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setreuid_args {
+ uid_t ruid;
+ uid_t euid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setreuid(register struct thread *td, struct setreuid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ uid_t euid, ruid;
+ struct uidinfo *euip, *ruip;
+ int error;
+
+ euid = uap->euid;
+ ruid = uap->ruid;
+ AUDIT_ARG_EUID(euid);
+ AUDIT_ARG_RUID(ruid);
+ newcred = crget();
+ euip = uifind(euid);
+ ruip = uifind(ruid);
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_setreuid(oldcred, ruid, euid);
+ if (error)
+ goto fail;
+#endif
+
+ if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid &&
+ ruid != oldcred->cr_svuid) ||
+ (euid != (uid_t)-1 && euid != oldcred->cr_uid &&
+ euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) &&
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETREUID, 0)) != 0)
+ goto fail;
+
+ if (euid != (uid_t)-1 && oldcred->cr_uid != euid) {
+ change_euid(newcred, euip);
+ setsugid(p);
+ }
+ if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) {
+ change_ruid(newcred, ruip);
+ setsugid(p);
+ }
+ if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) &&
+ newcred->cr_svuid != newcred->cr_uid) {
+ change_svuid(newcred, newcred->cr_uid);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+#ifdef RACCT
+ racct_proc_ucred_changed(p, oldcred, newcred);
+#endif
+ uifree(ruip);
+ uifree(euip);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ uifree(ruip);
+ uifree(euip);
+ crfree(newcred);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setregid_args {
+ gid_t rgid;
+ gid_t egid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setregid(register struct thread *td, struct setregid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ gid_t egid, rgid;
+ int error;
+
+ egid = uap->egid;
+ rgid = uap->rgid;
+ AUDIT_ARG_EGID(egid);
+ AUDIT_ARG_RGID(rgid);
+ newcred = crget();
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_setregid(oldcred, rgid, egid);
+ if (error)
+ goto fail;
+#endif
+
+ if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
+ rgid != oldcred->cr_svgid) ||
+ (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] &&
+ egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) &&
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETREGID, 0)) != 0)
+ goto fail;
+
+ if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
+ change_egid(newcred, egid);
+ setsugid(p);
+ }
+ if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) {
+ change_rgid(newcred, rgid);
+ setsugid(p);
+ }
+ if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) &&
+ newcred->cr_svgid != newcred->cr_groups[0]) {
+ change_svgid(newcred, newcred->cr_groups[0]);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ crfree(newcred);
+ return (error);
+}
+
+/*
+ * setresuid(ruid, euid, suid) is like setreuid except control over the saved
+ * uid is explicit.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct setresuid_args {
+ uid_t ruid;
+ uid_t euid;
+ uid_t suid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setresuid(register struct thread *td, struct setresuid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ uid_t euid, ruid, suid;
+ struct uidinfo *euip, *ruip;
+ int error;
+
+ euid = uap->euid;
+ ruid = uap->ruid;
+ suid = uap->suid;
+ AUDIT_ARG_EUID(euid);
+ AUDIT_ARG_RUID(ruid);
+ AUDIT_ARG_SUID(suid);
+ newcred = crget();
+ euip = uifind(euid);
+ ruip = uifind(ruid);
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_setresuid(oldcred, ruid, euid, suid);
+ if (error)
+ goto fail;
+#endif
+
+ if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid &&
+ ruid != oldcred->cr_svuid &&
+ ruid != oldcred->cr_uid) ||
+ (euid != (uid_t)-1 && euid != oldcred->cr_ruid &&
+ euid != oldcred->cr_svuid &&
+ euid != oldcred->cr_uid) ||
+ (suid != (uid_t)-1 && suid != oldcred->cr_ruid &&
+ suid != oldcred->cr_svuid &&
+ suid != oldcred->cr_uid)) &&
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETRESUID, 0)) != 0)
+ goto fail;
+
+ if (euid != (uid_t)-1 && oldcred->cr_uid != euid) {
+ change_euid(newcred, euip);
+ setsugid(p);
+ }
+ if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) {
+ change_ruid(newcred, ruip);
+ setsugid(p);
+ }
+ if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) {
+ change_svuid(newcred, suid);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+#ifdef RACCT
+ racct_proc_ucred_changed(p, oldcred, newcred);
+#endif
+ uifree(ruip);
+ uifree(euip);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ uifree(ruip);
+ uifree(euip);
+ crfree(newcred);
+ return (error);
+}
+
+/*
+ * setresgid(rgid, egid, sgid) is like setregid except control over the saved
+ * gid is explicit.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct setresgid_args {
+ gid_t rgid;
+ gid_t egid;
+ gid_t sgid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setresgid(register struct thread *td, struct setresgid_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *newcred, *oldcred;
+ gid_t egid, rgid, sgid;
+ int error;
+
+ egid = uap->egid;
+ rgid = uap->rgid;
+ sgid = uap->sgid;
+ AUDIT_ARG_EGID(egid);
+ AUDIT_ARG_RGID(rgid);
+ AUDIT_ARG_SGID(sgid);
+ newcred = crget();
+ PROC_LOCK(p);
+ oldcred = crcopysafe(p, newcred);
+
+#ifdef MAC
+ error = mac_cred_check_setresgid(oldcred, rgid, egid, sgid);
+ if (error)
+ goto fail;
+#endif
+
+ if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
+ rgid != oldcred->cr_svgid &&
+ rgid != oldcred->cr_groups[0]) ||
+ (egid != (gid_t)-1 && egid != oldcred->cr_rgid &&
+ egid != oldcred->cr_svgid &&
+ egid != oldcred->cr_groups[0]) ||
+ (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid &&
+ sgid != oldcred->cr_svgid &&
+ sgid != oldcred->cr_groups[0])) &&
+ (error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID, 0)) != 0)
+ goto fail;
+
+ if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
+ change_egid(newcred, egid);
+ setsugid(p);
+ }
+ if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) {
+ change_rgid(newcred, rgid);
+ setsugid(p);
+ }
+ if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) {
+ change_svgid(newcred, sgid);
+ setsugid(p);
+ }
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ return (0);
+
+fail:
+ PROC_UNLOCK(p);
+ crfree(newcred);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getresuid_args {
+ uid_t *ruid;
+ uid_t *euid;
+ uid_t *suid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getresuid(register struct thread *td, struct getresuid_args *uap)
+{
+ struct ucred *cred;
+ int error1 = 0, error2 = 0, error3 = 0;
+
+ cred = td->td_ucred;
+ if (uap->ruid)
+ error1 = copyout(&cred->cr_ruid,
+ uap->ruid, sizeof(cred->cr_ruid));
+ if (uap->euid)
+ error2 = copyout(&cred->cr_uid,
+ uap->euid, sizeof(cred->cr_uid));
+ if (uap->suid)
+ error3 = copyout(&cred->cr_svuid,
+ uap->suid, sizeof(cred->cr_svuid));
+ return (error1 ? error1 : error2 ? error2 : error3);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getresgid_args {
+ gid_t *rgid;
+ gid_t *egid;
+ gid_t *sgid;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getresgid(register struct thread *td, struct getresgid_args *uap)
+{
+ struct ucred *cred;
+ int error1 = 0, error2 = 0, error3 = 0;
+
+ cred = td->td_ucred;
+ if (uap->rgid)
+ error1 = copyout(&cred->cr_rgid,
+ uap->rgid, sizeof(cred->cr_rgid));
+ if (uap->egid)
+ error2 = copyout(&cred->cr_groups[0],
+ uap->egid, sizeof(cred->cr_groups[0]));
+ if (uap->sgid)
+ error3 = copyout(&cred->cr_svgid,
+ uap->sgid, sizeof(cred->cr_svgid));
+ return (error1 ? error1 : error2 ? error2 : error3);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct issetugid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_issetugid(register struct thread *td, struct issetugid_args *uap)
+{
+ struct proc *p = td->td_proc;
+
+ /*
+	 * Note: OpenBSD sets a P_SUGIDEXEC flag at execve() time; we use
+	 * P_SUGID because we consider a change of owners to be "tainting"
+	 * as well.
+ * This is significant for procs that start as root and "become"
+ * a user without an exec - programs cannot know *everything*
+ * that libc *might* have put in their data segment.
+ */
+ PROC_LOCK(p);
+ td->td_retval[0] = (p->p_flag & P_SUGID) ? 1 : 0;
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+int
+sys___setugid(struct thread *td, struct __setugid_args *uap)
+{
+#ifdef REGRESSION
+ struct proc *p;
+
+ p = td->td_proc;
+ switch (uap->flag) {
+ case 0:
+ PROC_LOCK(p);
+ p->p_flag &= ~P_SUGID;
+ PROC_UNLOCK(p);
+ return (0);
+ case 1:
+ PROC_LOCK(p);
+ p->p_flag |= P_SUGID;
+ PROC_UNLOCK(p);
+ return (0);
+ default:
+ return (EINVAL);
+ }
+#else /* !REGRESSION */
+
+ return (ENOSYS);
+#endif /* REGRESSION */
+}
+
+/*
+ * Check if gid is a member of the group set.
+ */
+int
+groupmember(gid_t gid, struct ucred *cred)
+{
+ int l;
+ int h;
+ int m;
+
+ if (cred->cr_groups[0] == gid)
+ return(1);
+
+ /*
+ * If gid was not our primary group, perform a binary search
+ * of the supplemental groups. This is possible because we
+ * sort the groups in crsetgroups().
+ */
+ l = 1;
+ h = cred->cr_ngroups;
+ while (l < h) {
+ m = l + ((h - l) / 2);
+ if (cred->cr_groups[m] < gid)
+ l = m + 1;
+ else
+ h = m;
+ }
+ if ((l < cred->cr_ngroups) && (cred->cr_groups[l] == gid))
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Test the active securelevel against a given level. securelevel_gt()
+ * implements (securelevel > level). securelevel_ge() implements
+ * (securelevel >= level). Note that the logic is inverted -- these
+ * functions return EPERM on "success" and 0 on "failure".
+ *
+ * Due to care taken when setting the securelevel, we know that no jail will
+ * be less secure than its parent (or the physical system), so it is sufficient
+ * to test the current jail only.
+ *
+ * XXXRW: Possibly since this has to do with privilege, it should move to
+ * kern_priv.c.
+ */
+int
+securelevel_gt(struct ucred *cr, int level)
+{
+
+ return (cr->cr_prison->pr_securelevel > level ? EPERM : 0);
+}
+
+int
+securelevel_ge(struct ucred *cr, int level)
+{
+
+ return (cr->cr_prison->pr_securelevel >= level ? EPERM : 0);
+}
+
+/*
+ * 'see_other_uids' determines whether or not visibility of processes
+ * and sockets with credentials holding different real uids is possible
+ * using a variety of system MIBs.
+ * XXX: data declarations should be together near the beginning of the file.
+ */
+static int see_other_uids = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW,
+ &see_other_uids, 0,
+ "Unprivileged processes may see subjects/objects with different real uid");
+
+/*-
+ * Determine if u1 "can see" the subject specified by u2, according to the
+ * 'see_other_uids' policy.
+ * Returns: 0 for permitted, ESRCH otherwise
+ * Locks: none
+ * References: *u1 and *u2 must not change during the call
+ * u1 may equal u2, in which case only one reference is required
+ */
+static int
+cr_seeotheruids(struct ucred *u1, struct ucred *u2)
+{
+
+ if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) {
+ if (priv_check_cred(u1, PRIV_SEEOTHERUIDS, 0) != 0)
+ return (ESRCH);
+ }
+ return (0);
+}
+
+/*
+ * 'see_other_gids' determines whether or not visibility of processes
+ * and sockets with credentials holding different real gids is possible
+ * using a variety of system MIBs.
+ * XXX: data declarations should be together near the beginning of the file.
+ */
+static int see_other_gids = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, see_other_gids, CTLFLAG_RW,
+ &see_other_gids, 0,
+ "Unprivileged processes may see subjects/objects with different real gid");
+
+/*
+ * Determine if u1 can "see" the subject specified by u2, according to the
+ * 'see_other_gids' policy.
+ * Returns: 0 for permitted, ESRCH otherwise
+ * Locks: none
+ * References: *u1 and *u2 must not change during the call
+ * u1 may equal u2, in which case only one reference is required
+ */
+static int
+cr_seeothergids(struct ucred *u1, struct ucred *u2)
+{
+ int i, match;
+
+ if (!see_other_gids) {
+ match = 0;
+ for (i = 0; i < u1->cr_ngroups; i++) {
+ if (groupmember(u1->cr_groups[i], u2))
+ match = 1;
+ if (match)
+ break;
+ }
+ if (!match) {
+ if (priv_check_cred(u1, PRIV_SEEOTHERGIDS, 0) != 0)
+ return (ESRCH);
+ }
+ }
+ return (0);
+}
+
+/*-
+ * Determine if u1 "can see" the subject specified by u2.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: none
+ * References: *u1 and *u2 must not change during the call
+ * u1 may equal u2, in which case only one reference is required
+ */
+int
+cr_cansee(struct ucred *u1, struct ucred *u2)
+{
+ int error;
+
+ if ((error = prison_check(u1, u2)))
+ return (error);
+#ifdef MAC
+ if ((error = mac_cred_check_visible(u1, u2)))
+ return (error);
+#endif
+ if ((error = cr_seeotheruids(u1, u2)))
+ return (error);
+ if ((error = cr_seeothergids(u1, u2)))
+ return (error);
+ return (0);
+}
+
+/*-
+ * Determine if td "can see" the subject specified by p.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: Sufficient locks to protect p->p_ucred must be held. td really
+ * should be curthread.
+ * References: td and p must be valid for the lifetime of the call
+ */
+int
+p_cansee(struct thread *td, struct proc *p)
+{
+
+ /* Wrap cr_cansee() for all functionality. */
+ KASSERT(td == curthread, ("%s: td not curthread", __func__));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ return (cr_cansee(td->td_ucred, p->p_ucred));
+}
+
+/*
+ * 'conservative_signals' prevents the delivery of a broad class of
+ * signals by unprivileged processes to processes that have changed their
+ * credentials since the last invocation of execve(). This can prevent
+ * the leakage of cached information or retained privileges as a result
+ * of a common class of signal-related vulnerabilities. However, this
+ * may interfere with some applications that expect to be able to
+ * deliver these signals to peer processes after having given up
+ * privilege.
+ */
+static int conservative_signals = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, conservative_signals, CTLFLAG_RW,
+ &conservative_signals, 0, "Unprivileged processes prevented from "
+ "sending certain signals to processes whose credentials have changed");
+/*-
+ * Determine whether cred may deliver the specified signal to proc.
+ * Returns: 0 for permitted, an errno value otherwise.
+ * Locks: A lock must be held for proc.
+ * References: cred and proc must be valid for the lifetime of the call.
+ */
+int
+cr_cansignal(struct ucred *cred, struct proc *proc, int signum)
+{
+ int error;
+
+ PROC_LOCK_ASSERT(proc, MA_OWNED);
+ /*
+ * Jail semantics limit the scope of signalling to proc in the
+ * same jail as cred, if cred is in jail.
+ */
+ error = prison_check(cred, proc->p_ucred);
+ if (error)
+ return (error);
+#ifdef MAC
+ if ((error = mac_proc_check_signal(cred, proc, signum)))
+ return (error);
+#endif
+ if ((error = cr_seeotheruids(cred, proc->p_ucred)))
+ return (error);
+ if ((error = cr_seeothergids(cred, proc->p_ucred)))
+ return (error);
+
+ /*
+ * UNIX signal semantics depend on the status of the P_SUGID
+ * bit on the target process. If the bit is set, then additional
+ * restrictions are placed on the set of available signals.
+ */
+ if (conservative_signals && (proc->p_flag & P_SUGID)) {
+ switch (signum) {
+ case 0:
+ case SIGKILL:
+ case SIGINT:
+ case SIGTERM:
+ case SIGALRM:
+ case SIGSTOP:
+ case SIGTTIN:
+ case SIGTTOU:
+ case SIGTSTP:
+ case SIGHUP:
+ case SIGUSR1:
+ case SIGUSR2:
+ /*
+ * Generally, permit job and terminal control
+ * signals.
+ */
+ break;
+ default:
+ /* Not permitted without privilege. */
+ error = priv_check_cred(cred, PRIV_SIGNAL_SUGID, 0);
+ if (error)
+ return (error);
+ }
+ }
+
+ /*
+ * Generally, the target credential's ruid or svuid must match the
+ * subject credential's ruid or euid.
+ */
+ if (cred->cr_ruid != proc->p_ucred->cr_ruid &&
+ cred->cr_ruid != proc->p_ucred->cr_svuid &&
+ cred->cr_uid != proc->p_ucred->cr_ruid &&
+ cred->cr_uid != proc->p_ucred->cr_svuid) {
+ error = priv_check_cred(cred, PRIV_SIGNAL_DIFFCRED, 0);
+ if (error)
+ return (error);
+ }
+
+ return (0);
+}
+
+/*-
+ * Determine whether td may deliver the specified signal to p.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: Sufficient locks to protect various components of td and p
+ * must be held. td must be curthread, and a lock must be
+ * held for p.
+ * References: td and p must be valid for the lifetime of the call
+ */
+int
+p_cansignal(struct thread *td, struct proc *p, int signum)
+{
+
+ KASSERT(td == curthread, ("%s: td not curthread", __func__));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (td->td_proc == p)
+ return (0);
+
+ /*
+ * UNIX signalling semantics require that processes in the same
+ * session always be able to deliver SIGCONT to one another,
+ * overriding the remaining protections.
+ */
+ /* XXX: This will require an additional lock of some sort. */
+ if (signum == SIGCONT && td->td_proc->p_session == p->p_session)
+ return (0);
+ /*
+ * Some compat layers use SIGTHR and higher signals for
+ * communication between different kernel threads of the same
+	 * process, and expect that it is always possible to deliver
+	 * them, even for suid applications where cr_cansignal() can deny
+	 * such ability for security reasons.  It should be pretty safe
+	 * to do so, since the only way to create two processes
+ * with the same p_leader is via rfork(2).
+ */
+ if (td->td_proc->p_leader != NULL && signum >= SIGTHR &&
+ signum < SIGTHR + 4 && td->td_proc->p_leader == p->p_leader)
+ return (0);
+
+ return (cr_cansignal(td->td_ucred, p, signum));
+}
+
+/*-
+ * Determine whether td may reschedule p.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: Sufficient locks to protect various components of td and p
+ * must be held. td must be curthread, and a lock must
+ * be held for p.
+ * References: td and p must be valid for the lifetime of the call
+ */
+int
+p_cansched(struct thread *td, struct proc *p)
+{
+ int error;
+
+ KASSERT(td == curthread, ("%s: td not curthread", __func__));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (td->td_proc == p)
+ return (0);
+ if ((error = prison_check(td->td_ucred, p->p_ucred)))
+ return (error);
+#ifdef MAC
+ if ((error = mac_proc_check_sched(td->td_ucred, p)))
+ return (error);
+#endif
+ if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
+ return (error);
+ if ((error = cr_seeothergids(td->td_ucred, p->p_ucred)))
+ return (error);
+ if (td->td_ucred->cr_ruid != p->p_ucred->cr_ruid &&
+ td->td_ucred->cr_uid != p->p_ucred->cr_ruid) {
+ error = priv_check(td, PRIV_SCHED_DIFFCRED);
+ if (error)
+ return (error);
+ }
+ return (0);
+}
+
+/*
+ * The 'unprivileged_proc_debug' flag may be used to disable a variety of
+ * unprivileged inter-process debugging services, including some procfs
+ * functionality, ptrace(), and ktrace(). In the past, inter-process
+ * debugging has been involved in a variety of security problems, and sites
+ * not requiring the service might choose to disable it when hardening
+ * systems.
+ *
+ * XXX: Should modifying and reading this variable require locking?
+ * XXX: data declarations should be together near the beginning of the file.
+ */
+static int unprivileged_proc_debug = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_proc_debug, CTLFLAG_RW,
+ &unprivileged_proc_debug, 0,
+ "Unprivileged processes may use process debugging facilities");
+
+/*-
+ * Determine whether td may debug p.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: Sufficient locks to protect various components of td and p
+ * must be held. td must be curthread, and a lock must
+ * be held for p.
+ * References: td and p must be valid for the lifetime of the call
+ */
+int
+p_candebug(struct thread *td, struct proc *p)
+{
+ int credentialchanged, error, grpsubset, i, uidsubset;
+
+ KASSERT(td == curthread, ("%s: td not curthread", __func__));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (!unprivileged_proc_debug) {
+ error = priv_check(td, PRIV_DEBUG_UNPRIV);
+ if (error)
+ return (error);
+ }
+ if (td->td_proc == p)
+ return (0);
+ if ((error = prison_check(td->td_ucred, p->p_ucred)))
+ return (error);
+#ifdef MAC
+ if ((error = mac_proc_check_debug(td->td_ucred, p)))
+ return (error);
+#endif
+ if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
+ return (error);
+ if ((error = cr_seeothergids(td->td_ucred, p->p_ucred)))
+ return (error);
+
+ /*
+ * Is p's group set a subset of td's effective group set? This
+ * includes p's egid, group access list, rgid, and svgid.
+ */
+ grpsubset = 1;
+ for (i = 0; i < p->p_ucred->cr_ngroups; i++) {
+ if (!groupmember(p->p_ucred->cr_groups[i], td->td_ucred)) {
+ grpsubset = 0;
+ break;
+ }
+ }
+ grpsubset = grpsubset &&
+ groupmember(p->p_ucred->cr_rgid, td->td_ucred) &&
+ groupmember(p->p_ucred->cr_svgid, td->td_ucred);
+
+ /*
+ * Are the uids present in p's credential equal to td's
+ * effective uid? This includes p's euid, svuid, and ruid.
+ */
+ uidsubset = (td->td_ucred->cr_uid == p->p_ucred->cr_uid &&
+ td->td_ucred->cr_uid == p->p_ucred->cr_svuid &&
+ td->td_ucred->cr_uid == p->p_ucred->cr_ruid);
+
+ /*
+ * Has the credential of the process changed since the last exec()?
+ */
+ credentialchanged = (p->p_flag & P_SUGID);
+
+ /*
+ * If p's gids aren't a subset, or the uids aren't a subset,
+ * or the credential has changed, require appropriate privilege
+ * for td to debug p.
+ */
+ if (!grpsubset || !uidsubset) {
+ error = priv_check(td, PRIV_DEBUG_DIFFCRED);
+ if (error)
+ return (error);
+ }
+
+ if (credentialchanged) {
+ error = priv_check(td, PRIV_DEBUG_SUGID);
+ if (error)
+ return (error);
+ }
+
+ /* Can't trace init when securelevel > 0. */
+ if (p == initproc) {
+ error = securelevel_gt(td->td_ucred, 0);
+ if (error)
+ return (error);
+ }
+
+ /*
+ * Can't trace a process that's currently exec'ing.
+ *
+ * XXX: Note, this is not a security policy decision, it's a
+ * basic correctness/functionality decision. Therefore, this check
+ * should be moved to the callers of p_candebug().
+ */
+ if ((p->p_flag & P_INEXEC) != 0)
+ return (EBUSY);
+
+ return (0);
+}
+
+/*-
+ * Determine whether the subject represented by cred can "see" a socket.
+ * Returns: 0 for permitted, ENOENT otherwise.
+ */
+int
+cr_canseesocket(struct ucred *cred, struct socket *so)
+{
+ int error;
+
+ error = prison_check(cred, so->so_cred);
+ if (error)
+ return (ENOENT);
+#ifdef MAC
+ error = mac_socket_check_visible(cred, so);
+ if (error)
+ return (error);
+#endif
+ if (cr_seeotheruids(cred, so->so_cred))
+ return (ENOENT);
+ if (cr_seeothergids(cred, so->so_cred))
+ return (ENOENT);
+
+ return (0);
+}
+
+#if defined(INET) || defined(INET6)
+/*-
+ * Determine whether the subject represented by cred can "see" an inpcb.
+ * Returns: 0 for permitted, ENOENT otherwise.
+ */
+int
+cr_canseeinpcb(struct ucred *cred, struct inpcb *inp)
+{
+ int error;
+
+ error = prison_check(cred, inp->inp_cred);
+ if (error)
+ return (ENOENT);
+#ifdef MAC
+ INP_LOCK_ASSERT(inp);
+ error = mac_inpcb_check_visible(cred, inp);
+ if (error)
+ return (error);
+#endif
+ if (cr_seeotheruids(cred, inp->inp_cred))
+ return (ENOENT);
+ if (cr_seeothergids(cred, inp->inp_cred))
+ return (ENOENT);
+
+ return (0);
+}
+#endif
+
+/*-
+ * Determine whether td can wait for the exit of p.
+ * Returns: 0 for permitted, an errno value otherwise
+ * Locks: Sufficient locks to protect various components of td and p
+ * must be held. td must be curthread, and a lock must
+ * be held for p.
+ * References: td and p must be valid for the lifetime of the call
+ */
+int
+p_canwait(struct thread *td, struct proc *p)
+{
+ int error;
+
+ KASSERT(td == curthread, ("%s: td not curthread", __func__));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if ((error = prison_check(td->td_ucred, p->p_ucred)))
+ return (error);
+#ifdef MAC
+ if ((error = mac_proc_check_wait(td->td_ucred, p)))
+ return (error);
+#endif
+#if 0
+ /* XXXMAC: This could have odd effects on some shells. */
+ if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
+ return (error);
+#endif
+
+ return (0);
+}
+
+/*
+ * Allocate a zeroed cred structure.
+ */
+struct ucred *
+crget(void)
+{
+ struct ucred *cr;
+
+ cr = malloc(sizeof(*cr), M_CRED, M_WAITOK | M_ZERO);
+ refcount_init(&cr->cr_ref, 1);
+#ifdef AUDIT
+ audit_cred_init(cr);
+#endif
+#ifdef MAC
+ mac_cred_init(cr);
+#endif
+ crextend(cr, XU_NGROUPS);
+ return (cr);
+}
+
+/*
+ * Claim another reference to a ucred structure.
+ */
+struct ucred *
+crhold(struct ucred *cr)
+{
+
+ refcount_acquire(&cr->cr_ref);
+ return (cr);
+}
+
+/*
+ * Free a cred structure. The space is released once the reference count reaches 0.
+ */
+void
+crfree(struct ucred *cr)
+{
+
+ KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref));
+ KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred"));
+ if (refcount_release(&cr->cr_ref)) {
+ /*
+ * Some callers of crget(), such as nfs_statfs(),
+ * allocate a temporary credential, but don't
+ * allocate a uidinfo structure.
+ */
+ if (cr->cr_uidinfo != NULL)
+ uifree(cr->cr_uidinfo);
+ if (cr->cr_ruidinfo != NULL)
+ uifree(cr->cr_ruidinfo);
+ /*
+ * Free a prison, if any.
+ */
+ if (cr->cr_prison != NULL)
+ prison_free(cr->cr_prison);
+ if (cr->cr_loginclass != NULL)
+ loginclass_free(cr->cr_loginclass);
+#ifdef AUDIT
+ audit_cred_destroy(cr);
+#endif
+#ifdef MAC
+ mac_cred_destroy(cr);
+#endif
+ free(cr->cr_groups, M_CRED);
+ free(cr, M_CRED);
+ }
+}
+
+/*
+ * Check to see if this ucred is shared.
+ */
+int
+crshared(struct ucred *cr)
+{
+
+ return (cr->cr_ref > 1);
+}
+
+/*
+ * Copy a ucred's contents from a template. Does not block.
+ */
+void
+crcopy(struct ucred *dest, struct ucred *src)
+{
+
+ KASSERT(crshared(dest) == 0, ("crcopy of shared ucred"));
+ bcopy(&src->cr_startcopy, &dest->cr_startcopy,
+ (unsigned)((caddr_t)&src->cr_endcopy -
+ (caddr_t)&src->cr_startcopy));
+ crsetgroups(dest, src->cr_ngroups, src->cr_groups);
+ uihold(dest->cr_uidinfo);
+ uihold(dest->cr_ruidinfo);
+ prison_hold(dest->cr_prison);
+ loginclass_hold(dest->cr_loginclass);
+#ifdef AUDIT
+ audit_cred_copy(src, dest);
+#endif
+#ifdef MAC
+ mac_cred_copy(src, dest);
+#endif
+}
+
+/*
+ * Dup cred struct to a new held one.
+ */
+struct ucred *
+crdup(struct ucred *cr)
+{
+ struct ucred *newcr;
+
+ newcr = crget();
+ crcopy(newcr, cr);
+ return (newcr);
+}
+
+/*
+ * Fill in a struct xucred based on a struct ucred.
+ */
+void
+cru2x(struct ucred *cr, struct xucred *xcr)
+{
+ int ngroups;
+
+ bzero(xcr, sizeof(*xcr));
+ xcr->cr_version = XUCRED_VERSION;
+ xcr->cr_uid = cr->cr_uid;
+
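+ /*
+ * Only the first XU_NGROUPS groups fit in a struct xucred;
+ * any additional supplemental groups are not copied.
+ */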
+ ngroups = MIN(cr->cr_ngroups, XU_NGROUPS);
+ xcr->cr_ngroups = ngroups;
+ bcopy(cr->cr_groups, xcr->cr_groups,
+ ngroups * sizeof(*cr->cr_groups));
+}
+
+/*
+ * Small routine to swap a thread's current ucred for the correct one taken
+ * from the process.
+ */
+void
+cred_update_thread(struct thread *td)
+{
+ struct proc *p;
+ struct ucred *cred;
+
+ p = td->td_proc;
+ cred = td->td_ucred;
+ PROC_LOCK(p);
+ td->td_ucred = crhold(p->p_ucred);
+ PROC_UNLOCK(p);
+ if (cred != NULL)
+ crfree(cred);
+}
+
+struct ucred *
+crcopysafe(struct proc *p, struct ucred *cr)
+{
+ struct ucred *oldcred;
+ int groups;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ oldcred = p->p_ucred;
+ while (cr->cr_agroups < oldcred->cr_agroups) {
+ groups = oldcred->cr_agroups;
+ PROC_UNLOCK(p);
+ crextend(cr, groups);
+ PROC_LOCK(p);
+ oldcred = p->p_ucred;
+ }
+ crcopy(cr, oldcred);
+
+ return (oldcred);
+}
+
+/*
+ * Extend the passed-in credential to hold n groups.
+ */
+static void
+crextend(struct ucred *cr, int n)
+{
+ int cnt;
+
+ /* Nothing to do if there is already enough space. */
+ if (n <= cr->cr_agroups)
+ return;
+
+ /*
+ * We double the size each time, since we're using a power-of-two
+ * allocator, until we need enough groups to fill a page.
+ * Once we're allocating multiple pages, only allocate as many
+ * as we actually need. The case of processes needing a
+ * non-power-of-two number of pages seems more likely than
+ * a real-world process that adds thousands of groups one at a
+ * time.
+ */
+ if (n < PAGE_SIZE / sizeof(gid_t)) {
+ if (cr->cr_agroups == 0)
+ cnt = MINALLOCSIZE / sizeof(gid_t);
+ else
+ cnt = cr->cr_agroups * 2;
+
+ while (cnt < n)
+ cnt *= 2;
+ } else
+ cnt = roundup2(n, PAGE_SIZE / sizeof(gid_t));
+
+ /* Free the old array. */
+ if (cr->cr_groups)
+ free(cr->cr_groups, M_CRED);
+
+ cr->cr_groups = malloc(cnt * sizeof(gid_t), M_CRED, M_WAITOK | M_ZERO);
+ cr->cr_agroups = cnt;
+}
+
+/*
+ * Copy groups into a credential, preserving any necessary invariants.
+ * Currently this includes the sorting of all supplemental gids.
+ * crextend() must have been called beforehand to ensure sufficient
+ * space is available.
+ */
+static void
+crsetgroups_locked(struct ucred *cr, int ngrp, gid_t *groups)
+{
+ int i;
+ int j;
+ gid_t g;
+
+ KASSERT(cr->cr_agroups >= ngrp, ("cr_agroups is too small"));
+
+ bcopy(groups, cr->cr_groups, ngrp * sizeof(gid_t));
+ cr->cr_ngroups = ngrp;
+
+ /*
+ * Sort all groups except cr_groups[0] to allow groupmember to
+ * perform a binary search.
+ *
+ * XXX: If large numbers of groups become common, this should be
+ * replaced with shell sort (as Linux uses) or possibly heap sort.
+ */
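+ /*
+ * For example, groups {5, 20, 7, 3} with egid 5 in cr_groups[0]
+ * end up as {5, 3, 7, 20}.
+ */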
+ for (i = 2; i < ngrp; i++) {
+ g = cr->cr_groups[i];
+ for (j = i-1; j >= 1 && g < cr->cr_groups[j]; j--)
+ cr->cr_groups[j + 1] = cr->cr_groups[j];
+ cr->cr_groups[j + 1] = g;
+ }
+}
+
+/*
+ * Copy groups into a credential after expanding it if required.
+ * Truncate the list to (ngroups_max + 1) if it is too large.
+ */
+void
+crsetgroups(struct ucred *cr, int ngrp, gid_t *groups)
+{
+
+ if (ngrp > ngroups_max + 1)
+ ngrp = ngroups_max + 1;
+
+ crextend(cr, ngrp);
+ crsetgroups_locked(cr, ngrp, groups);
+}
+
+/*
+ * Get login name, if available.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getlogin_args {
+ char *namebuf;
+ u_int namelen;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getlogin(struct thread *td, struct getlogin_args *uap)
+{
+ int error;
+ char login[MAXLOGNAME];
+ struct proc *p = td->td_proc;
+
+ if (uap->namelen > MAXLOGNAME)
+ uap->namelen = MAXLOGNAME;
+ PROC_LOCK(p);
+ SESS_LOCK(p->p_session);
+ bcopy(p->p_session->s_login, login, uap->namelen);
+ SESS_UNLOCK(p->p_session);
+ PROC_UNLOCK(p);
+ if (strlen(login) + 1 > uap->namelen)
+ return (ERANGE);
+ error = copyout(login, uap->namebuf, uap->namelen);
+ return (error);
+}
+
+/*
+ * Set login name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct setlogin_args {
+ char *namebuf;
+};
+#endif
+/* ARGSUSED */
+int
+sys_setlogin(struct thread *td, struct setlogin_args *uap)
+{
+ struct proc *p = td->td_proc;
+ int error;
+ char logintmp[MAXLOGNAME];
+
+ error = priv_check(td, PRIV_PROC_SETLOGIN);
+ if (error)
+ return (error);
+ error = copyinstr(uap->namebuf, logintmp, sizeof(logintmp), NULL);
+ if (error == ENAMETOOLONG)
+ error = EINVAL;
+ else if (!error) {
+ PROC_LOCK(p);
+ SESS_LOCK(p->p_session);
+ (void) memcpy(p->p_session->s_login, logintmp,
+ sizeof(logintmp));
+ SESS_UNLOCK(p->p_session);
+ PROC_UNLOCK(p);
+ }
+ return (error);
+}
+
+void
+setsugid(struct proc *p)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ p->p_flag |= P_SUGID;
+ if (!(p->p_pfsflags & PF_ISUGID))
+ p->p_stops = 0;
+}
+
+/*-
+ * Change a process's effective uid.
+ * Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_euid(struct ucred *newcred, struct uidinfo *euip)
+{
+
+ newcred->cr_uid = euip->ui_uid;
+ uihold(euip);
+ uifree(newcred->cr_uidinfo);
+ newcred->cr_uidinfo = euip;
+}
+
+/*-
+ * Change a process's effective gid.
+ * Side effects: newcred->cr_gid will be modified.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_egid(struct ucred *newcred, gid_t egid)
+{
+
+ newcred->cr_groups[0] = egid;
+}
+
+/*-
+ * Change a process's real uid.
+ * Side effects: newcred->cr_ruid will be updated, newcred->cr_ruidinfo
+ * will be updated, and the old and new cr_ruidinfo proc
+ * counts will be updated.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_ruid(struct ucred *newcred, struct uidinfo *ruip)
+{
+
+ (void)chgproccnt(newcred->cr_ruidinfo, -1, 0);
+ newcred->cr_ruid = ruip->ui_uid;
+ uihold(ruip);
+ uifree(newcred->cr_ruidinfo);
+ newcred->cr_ruidinfo = ruip;
+ (void)chgproccnt(newcred->cr_ruidinfo, 1, 0);
+}
+
+/*-
+ * Change a process's real gid.
+ * Side effects: newcred->cr_rgid will be updated.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_rgid(struct ucred *newcred, gid_t rgid)
+{
+
+ newcred->cr_rgid = rgid;
+}
+
+/*-
+ * Change a process's saved uid.
+ * Side effects: newcred->cr_svuid will be updated.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_svuid(struct ucred *newcred, uid_t svuid)
+{
+
+ newcred->cr_svuid = svuid;
+}
+
+/*-
+ * Change a process's saved gid.
+ * Side effects: newcred->cr_svgid will be updated.
+ * References: newcred must be an exclusive credential reference for the
+ * duration of the call.
+ */
+void
+change_svgid(struct ucred *newcred, gid_t svgid)
+{
+
+ newcred->cr_svgid = svgid;
+}
diff --git a/sys/kern/kern_racct.c b/sys/kern/kern_racct.c
new file mode 100644
index 0000000..d31c832
--- /dev/null
+++ b/sys/kern/kern_racct.c
@@ -0,0 +1,1291 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Edward Tomasz Napierala under sponsorship
+ * from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kdtrace.h"
+#include "opt_sched.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/loginclass.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sbuf.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/umtx.h>
+#include <machine/smp.h>
+
+#ifdef RCTL
+#include <sys/rctl.h>
+#endif
+
+#ifdef RACCT
+
+FEATURE(racct, "Resource Accounting");
+
+/*
+ * Do not block processes that have their %cpu usage <= pcpu_threshold.
+ */
+static int pcpu_threshold = 1;
+
+SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW, 0, "Resource Accounting");
+SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
+ 0, "Processes with higher %cpu usage than this value can be throttled.");
+
+/*
+ * How many seconds must pass before we switch to the scheduler's %cpu
+ * calculations. When a process starts, we compute its %cpu usage by
+ * dividing its runtime by the process wall clock time. After
+ * RACCT_PCPU_SECS have passed, we use the value provided by the scheduler.
+ */
+#define RACCT_PCPU_SECS 3
+
+static struct mtx racct_lock;
+MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);
+
+static uma_zone_t racct_zone;
+
+static void racct_sub_racct(struct racct *dest, const struct racct *src);
+static void racct_sub_cred_locked(struct ucred *cred, int resource,
+ uint64_t amount);
+static void racct_add_cred_locked(struct ucred *cred, int resource,
+ uint64_t amount);
+
+SDT_PROVIDER_DEFINE(racct);
+SDT_PROBE_DEFINE3(racct, kernel, rusage, add, add, "struct proc *", "int",
+ "uint64_t");
+SDT_PROBE_DEFINE3(racct, kernel, rusage, add_failure, add-failure,
+ "struct proc *", "int", "uint64_t");
+SDT_PROBE_DEFINE3(racct, kernel, rusage, add_cred, add-cred, "struct ucred *",
+ "int", "uint64_t");
+SDT_PROBE_DEFINE3(racct, kernel, rusage, add_force, add-force, "struct proc *",
+ "int", "uint64_t");
+SDT_PROBE_DEFINE3(racct, kernel, rusage, set, set, "struct proc *", "int",
+ "uint64_t");
+SDT_PROBE_DEFINE3(racct, kernel, rusage, set_failure, set-failure,
+ "struct proc *", "int", "uint64_t");
+SDT_PROBE_DEFINE3(racct, kernel, rusage, sub, sub, "struct proc *", "int",
+ "uint64_t");
+SDT_PROBE_DEFINE3(racct, kernel, rusage, sub_cred, sub-cred, "struct ucred *",
+ "int", "uint64_t");
+SDT_PROBE_DEFINE1(racct, kernel, racct, create, create, "struct racct *");
+SDT_PROBE_DEFINE1(racct, kernel, racct, destroy, destroy, "struct racct *");
+SDT_PROBE_DEFINE2(racct, kernel, racct, join, join, "struct racct *",
+ "struct racct *");
+SDT_PROBE_DEFINE2(racct, kernel, racct, join_failure, join-failure,
+ "struct racct *", "struct racct *");
+SDT_PROBE_DEFINE2(racct, kernel, racct, leave, leave, "struct racct *",
+ "struct racct *");
+
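+/*
+ * A rough summary of the flags used in the table below, as they are used
+ * in this file: RACCT_RECLAIMABLE usage is released when no longer
+ * consumed and must be zero when a racct is destroyed; RACCT_INHERITABLE
+ * usage is copied from parent to child on fork; RACCT_DENIABLE
+ * allocations may be refused (e.g. by RCTL rules); RACCT_SLOPPY
+ * accounting is allowed to be imprecise; RACCT_DECAYING values decay
+ * over time in the per-credential containers; RACCT_IN_MILLIONS values
+ * are stored scaled by 1000000.
+ */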
+int racct_types[] = {
+ [RACCT_CPU] =
+ RACCT_IN_MILLIONS,
+ [RACCT_DATA] =
+ RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
+ [RACCT_STACK] =
+ RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
+ [RACCT_CORE] =
+ RACCT_DENIABLE,
+ [RACCT_RSS] =
+ RACCT_RECLAIMABLE,
+ [RACCT_MEMLOCK] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE,
+ [RACCT_NPROC] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE,
+ [RACCT_NOFILE] =
+ RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
+ [RACCT_VMEM] =
+ RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
+ [RACCT_NPTS] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
+ [RACCT_SWAP] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
+ [RACCT_NTHR] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE,
+ [RACCT_MSGQQUEUED] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
+ [RACCT_MSGQSIZE] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
+ [RACCT_NMSGQ] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
+ [RACCT_NSEM] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
+ [RACCT_NSEMOP] =
+ RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
+ [RACCT_NSHM] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
+ [RACCT_SHMSIZE] =
+ RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
+ [RACCT_WALLCLOCK] =
+ RACCT_IN_MILLIONS,
+ [RACCT_PCTCPU] =
+ RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS };
+
+static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
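+/*
+ * Multiplying by RACCT_DECAY_FACTOR and dividing by FSCALE keeps roughly
+ * 30% of the previous value (the factor is stored as FSCALE-scaled fixed
+ * point); racct_decay() below applies this once per racctd pass, roughly
+ * once a second.
+ */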
+
+#ifdef SCHED_4BSD
+/*
+ * Contains intermediate values for %cpu calculations to avoid using floating
+ * point in the kernel.
+ * ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20)
+ * It is needed only for the 4BSD scheduler, because in ULE ccpu is zero,
+ * so the calculations are more straightforward.
+ */
+fixpt_t ccpu_exp[] = {
+ [0] = FSCALE * 1,
+ [1] = FSCALE * 0.95122942450071400909,
+ [2] = FSCALE * 0.90483741803595957316,
+ [3] = FSCALE * 0.86070797642505780722,
+ [4] = FSCALE * 0.81873075307798185866,
+ [5] = FSCALE * 0.77880078307140486824,
+ [6] = FSCALE * 0.74081822068171786606,
+ [7] = FSCALE * 0.70468808971871343435,
+ [8] = FSCALE * 0.67032004603563930074,
+ [9] = FSCALE * 0.63762815162177329314,
+ [10] = FSCALE * 0.60653065971263342360,
+ [11] = FSCALE * 0.57694981038048669531,
+ [12] = FSCALE * 0.54881163609402643262,
+ [13] = FSCALE * 0.52204577676101604789,
+ [14] = FSCALE * 0.49658530379140951470,
+ [15] = FSCALE * 0.47236655274101470713,
+ [16] = FSCALE * 0.44932896411722159143,
+ [17] = FSCALE * 0.42741493194872666992,
+ [18] = FSCALE * 0.40656965974059911188,
+ [19] = FSCALE * 0.38674102345450120691,
+ [20] = FSCALE * 0.36787944117144232159,
+ [21] = FSCALE * 0.34993774911115535467,
+ [22] = FSCALE * 0.33287108369807955328,
+ [23] = FSCALE * 0.31663676937905321821,
+ [24] = FSCALE * 0.30119421191220209664,
+ [25] = FSCALE * 0.28650479686019010032,
+ [26] = FSCALE * 0.27253179303401260312,
+ [27] = FSCALE * 0.25924026064589150757,
+ [28] = FSCALE * 0.24659696394160647693,
+ [29] = FSCALE * 0.23457028809379765313,
+ [30] = FSCALE * 0.22313016014842982893,
+ [31] = FSCALE * 0.21224797382674305771,
+ [32] = FSCALE * 0.20189651799465540848,
+ [33] = FSCALE * 0.19204990862075411423,
+ [34] = FSCALE * 0.18268352405273465022,
+ [35] = FSCALE * 0.17377394345044512668,
+ [36] = FSCALE * 0.16529888822158653829,
+ [37] = FSCALE * 0.15723716631362761621,
+ [38] = FSCALE * 0.14956861922263505264,
+ [39] = FSCALE * 0.14227407158651357185,
+ [40] = FSCALE * 0.13533528323661269189,
+ [41] = FSCALE * 0.12873490358780421886,
+ [42] = FSCALE * 0.12245642825298191021,
+ [43] = FSCALE * 0.11648415777349695786,
+ [44] = FSCALE * 0.11080315836233388333,
+ [45] = FSCALE * 0.10539922456186433678,
+ [46] = FSCALE * 0.10025884372280373372,
+ [47] = FSCALE * 0.09536916221554961888,
+ [48] = FSCALE * 0.09071795328941250337,
+ [49] = FSCALE * 0.08629358649937051097,
+ [50] = FSCALE * 0.08208499862389879516,
+ [51] = FSCALE * 0.07808166600115315231,
+ [52] = FSCALE * 0.07427357821433388042,
+ [53] = FSCALE * 0.07065121306042958674,
+ [54] = FSCALE * 0.06720551273974976512,
+ [55] = FSCALE * 0.06392786120670757270,
+ [56] = FSCALE * 0.06081006262521796499,
+ [57] = FSCALE * 0.05784432087483846296,
+ [58] = FSCALE * 0.05502322005640722902,
+ [59] = FSCALE * 0.05233970594843239308,
+ [60] = FSCALE * 0.04978706836786394297,
+ [61] = FSCALE * 0.04735892439114092119,
+ [62] = FSCALE * 0.04504920239355780606,
+ [63] = FSCALE * 0.04285212686704017991,
+ [64] = FSCALE * 0.04076220397836621516,
+ [65] = FSCALE * 0.03877420783172200988,
+ [66] = FSCALE * 0.03688316740124000544,
+ [67] = FSCALE * 0.03508435410084502588,
+ [68] = FSCALE * 0.03337326996032607948,
+ [69] = FSCALE * 0.03174563637806794323,
+ [70] = FSCALE * 0.03019738342231850073,
+ [71] = FSCALE * 0.02872463965423942912,
+ [72] = FSCALE * 0.02732372244729256080,
+ [73] = FSCALE * 0.02599112877875534358,
+ [74] = FSCALE * 0.02472352647033939120,
+ [75] = FSCALE * 0.02351774585600910823,
+ [76] = FSCALE * 0.02237077185616559577,
+ [77] = FSCALE * 0.02127973643837716938,
+ [78] = FSCALE * 0.02024191144580438847,
+ [79] = FSCALE * 0.01925470177538692429,
+ [80] = FSCALE * 0.01831563888873418029,
+ [81] = FSCALE * 0.01742237463949351138,
+ [82] = FSCALE * 0.01657267540176124754,
+ [83] = FSCALE * 0.01576441648485449082,
+ [84] = FSCALE * 0.01499557682047770621,
+ [85] = FSCALE * 0.01426423390899925527,
+ [86] = FSCALE * 0.01356855901220093175,
+ [87] = FSCALE * 0.01290681258047986886,
+ [88] = FSCALE * 0.01227733990306844117,
+ [89] = FSCALE * 0.01167856697039544521,
+ [90] = FSCALE * 0.01110899653824230649,
+ [91] = FSCALE * 0.01056720438385265337,
+ [92] = FSCALE * 0.01005183574463358164,
+ [93] = FSCALE * 0.00956160193054350793,
+ [94] = FSCALE * 0.00909527710169581709,
+ [95] = FSCALE * 0.00865169520312063417,
+ [96] = FSCALE * 0.00822974704902002884,
+ [97] = FSCALE * 0.00782837754922577143,
+ [98] = FSCALE * 0.00744658307092434051,
+ [99] = FSCALE * 0.00708340892905212004,
+ [100] = FSCALE * 0.00673794699908546709,
+ [101] = FSCALE * 0.00640933344625638184,
+ [102] = FSCALE * 0.00609674656551563610,
+ [103] = FSCALE * 0.00579940472684214321,
+ [104] = FSCALE * 0.00551656442076077241,
+ [105] = FSCALE * 0.00524751839918138427,
+ [106] = FSCALE * 0.00499159390691021621,
+ [107] = FSCALE * 0.00474815099941147558,
+ [108] = FSCALE * 0.00451658094261266798,
+ [109] = FSCALE * 0.00429630469075234057,
+ [110] = FSCALE * 0.00408677143846406699,
+};
+#endif
+
+#define CCPU_EXP_MAX 110
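+/*
+ * As a sanity check on the table above: ccpu_exp[20] == FSCALE * exp(-1),
+ * i.e. roughly 0.368 * FSCALE, one e-folding of decay per 20 seconds of
+ * swtime under SCHED_4BSD.
+ */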
+
+/*
+ * This function is analogous to the getpcpu() function in the ps(1) command.
+ * They should both calculate in the same way so that the racct %cpu
+ * calculations are consistent with the values shown by the ps(1) tool.
+ * The calculations are more complex in the 4BSD scheduler because of the value
+ * of the ccpu variable. In ULE it is defined to be zero which saves us some
+ * work.
+ */
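+/*
+ * The value returned below is a percentage scaled by 1,000,000 (so
+ * 1500000 means 1.5% cpu), matching the RACCT_IN_MILLIONS convention
+ * used for RACCT_PCTCPU.
+ */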
+static uint64_t
+racct_getpcpu(struct proc *p, u_int pcpu)
+{
+ u_int swtime;
+#ifdef SCHED_4BSD
+ fixpt_t pctcpu, pctcpu_next;
+#endif
+#ifdef SMP
+ struct pcpu *pc;
+ int found;
+#endif
+ fixpt_t p_pctcpu;
+ struct thread *td;
+
+ /*
+ * If the process is swapped out, we count its %cpu usage as zero.
+ * This behaviour is consistent with the userland ps(1) tool.
+ */
+ if ((p->p_flag & P_INMEM) == 0)
+ return (0);
+ swtime = (ticks - p->p_swtick) / hz;
+
+ /*
+ * For short-lived processes, sched_pctcpu() returns small
+ * values even for CPU-intensive processes. Therefore we use
+ * our own estimate in this case.
+ */
+ if (swtime < RACCT_PCPU_SECS)
+ return (pcpu);
+
+ p_pctcpu = 0;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td == PCPU_GET(idlethread))
+ continue;
+#ifdef SMP
+ found = 0;
+ STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
+ if (td == pc->pc_idlethread) {
+ found = 1;
+ break;
+ }
+ }
+ if (found)
+ continue;
+#endif
+ thread_lock(td);
+#ifdef SCHED_4BSD
+ pctcpu = sched_pctcpu(td);
+ /* Count also the yet unfinished second. */
+ pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
+ pctcpu_next += sched_pctcpu_delta(td);
+ p_pctcpu += max(pctcpu, pctcpu_next);
+#else
+ /*
+ * In ULE the %cpu statistics are updated on every
+ * sched_pctcpu() call. So special calculations to
+ * account for the latest (unfinished) second are
+ * not needed.
+ */
+ p_pctcpu += sched_pctcpu(td);
+#endif
+ thread_unlock(td);
+ }
+
+#ifdef SCHED_4BSD
+ if (swtime <= CCPU_EXP_MAX)
+ return ((100 * (uint64_t)p_pctcpu * 1000000) /
+ (FSCALE - ccpu_exp[swtime]));
+#endif
+
+ return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
+}
+
+static void
+racct_add_racct(struct racct *dest, const struct racct *src)
+{
+ int i;
+
+ mtx_assert(&racct_lock, MA_OWNED);
+
+ /*
+ * Update resource usage in dest.
+ */
+ for (i = 0; i <= RACCT_MAX; i++) {
+ KASSERT(dest->r_resources[i] >= 0,
+ ("%s: resource %d propagation meltdown: dest < 0",
+ __func__, i));
+ KASSERT(src->r_resources[i] >= 0,
+ ("%s: resource %d propagation meltdown: src < 0",
+ __func__, i));
+ dest->r_resources[i] += src->r_resources[i];
+ }
+}
+
+static void
+racct_sub_racct(struct racct *dest, const struct racct *src)
+{
+ int i;
+
+ mtx_assert(&racct_lock, MA_OWNED);
+
+ /*
+ * Update resource usage in dest.
+ */
+ for (i = 0; i <= RACCT_MAX; i++) {
+ if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) {
+ KASSERT(dest->r_resources[i] >= 0,
+ ("%s: resource %d propagation meltdown: dest < 0",
+ __func__, i));
+ KASSERT(src->r_resources[i] >= 0,
+ ("%s: resource %d propagation meltdown: src < 0",
+ __func__, i));
+ KASSERT(src->r_resources[i] <= dest->r_resources[i],
+ ("%s: resource %d propagation meltdown: src > dest",
+ __func__, i));
+ }
+ if (RACCT_CAN_DROP(i)) {
+ dest->r_resources[i] -= src->r_resources[i];
+ if (dest->r_resources[i] < 0) {
+ KASSERT(RACCT_IS_SLOPPY(i) ||
+ RACCT_IS_DECAYING(i),
+ ("%s: resource %d usage < 0", __func__, i));
+ dest->r_resources[i] = 0;
+ }
+ }
+ }
+}
+
+void
+racct_create(struct racct **racctp)
+{
+
+ SDT_PROBE(racct, kernel, racct, create, racctp, 0, 0, 0, 0);
+
+ KASSERT(*racctp == NULL, ("racct already allocated"));
+
+ *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO);
+}
+
+static void
+racct_destroy_locked(struct racct **racctp)
+{
+ int i;
+ struct racct *racct;
+
+ SDT_PROBE(racct, kernel, racct, destroy, racctp, 0, 0, 0, 0);
+
+ mtx_assert(&racct_lock, MA_OWNED);
+ KASSERT(racctp != NULL, ("NULL racctp"));
+ KASSERT(*racctp != NULL, ("NULL racct"));
+
+ racct = *racctp;
+
+ for (i = 0; i <= RACCT_MAX; i++) {
+ if (RACCT_IS_SLOPPY(i))
+ continue;
+ if (!RACCT_IS_RECLAIMABLE(i))
+ continue;
+ KASSERT(racct->r_resources[i] == 0,
+ ("destroying non-empty racct: "
+ "%ju allocated for resource %d\n",
+ racct->r_resources[i], i));
+ }
+ uma_zfree(racct_zone, racct);
+ *racctp = NULL;
+}
+
+void
+racct_destroy(struct racct **racct)
+{
+
+ mtx_lock(&racct_lock);
+ racct_destroy_locked(racct);
+ mtx_unlock(&racct_lock);
+}
+
+/*
+ * Increase consumption of 'resource' by 'amount' for 'racct'. Propagation
+ * to the per-credential containers is handled by the callers. Unlike in
+ * other cases, 'amount' here may be less than zero.
+ */
+static void
+racct_alloc_resource(struct racct *racct, int resource,
+ uint64_t amount)
+{
+
+ mtx_assert(&racct_lock, MA_OWNED);
+ KASSERT(racct != NULL, ("NULL racct"));
+
+ racct->r_resources[resource] += amount;
+ if (racct->r_resources[resource] < 0) {
+ KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource),
+ ("%s: resource %d usage < 0", __func__, resource));
+ racct->r_resources[resource] = 0;
+ }
+
+ /*
+ * There are some cases where the racct %cpu resource would grow
+ * beyond 100%. For example, in racct_proc_exit() we add the process
+ * %cpu usage to the ucred racct containers; if too many processes
+ * terminate in a short time span, the ucred %cpu resource could grow
+ * too much. Also, the 4BSD scheduler sometimes returns more than
+ * 100% cpu usage for a thread. So we cap the value at 100% here.
+ */
+ if ((resource == RACCT_PCTCPU) &&
+ (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000))
+ racct->r_resources[RACCT_PCTCPU] = 100 * 1000000;
+}
+
+static int
+racct_add_locked(struct proc *p, int resource, uint64_t amount)
+{
+#ifdef RCTL
+ int error;
+#endif
+
+ SDT_PROBE(racct, kernel, rusage, add, p, resource, amount, 0, 0);
+
+ /*
+ * We need proc lock to dereference p->p_ucred.
+ */
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+#ifdef RCTL
+ error = rctl_enforce(p, resource, amount);
+ if (error && RACCT_IS_DENIABLE(resource)) {
+ SDT_PROBE(racct, kernel, rusage, add_failure, p, resource,
+ amount, 0, 0);
+ return (error);
+ }
+#endif
+ racct_alloc_resource(p->p_racct, resource, amount);
+ racct_add_cred_locked(p->p_ucred, resource, amount);
+
+ return (0);
+}
+
+/*
+ * Increase allocation of 'resource' by 'amount' for process 'p'.
+ * Return 0 if it is below the limits, or an errno value if it is not.
+ */
+int
+racct_add(struct proc *p, int resource, uint64_t amount)
+{
+ int error;
+
+ mtx_lock(&racct_lock);
+ error = racct_add_locked(p, resource, amount);
+ mtx_unlock(&racct_lock);
+ return (error);
+}
+
+static void
+racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount)
+{
+ struct prison *pr;
+
+ SDT_PROBE(racct, kernel, rusage, add_cred, cred, resource, amount,
+ 0, 0);
+
+ racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, amount);
+ for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
+ racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource,
+ amount);
+ racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, amount);
+}
+
+/*
+ * Increase allocation of 'resource' by 'amount' for credential 'cred'.
+ * Doesn't check for limits and never fails.
+ *
+ * XXX: Shouldn't this ever return an error?
+ */
+void
+racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
+{
+
+ mtx_lock(&racct_lock);
+ racct_add_cred_locked(cred, resource, amount);
+ mtx_unlock(&racct_lock);
+}
+
+/*
+ * Increase allocation of 'resource' by 'amount' for process 'p'.
+ * Doesn't check for limits and never fails.
+ */
+void
+racct_add_force(struct proc *p, int resource, uint64_t amount)
+{
+
+ SDT_PROBE(racct, kernel, rusage, add_force, p, resource, amount, 0, 0);
+
+ /*
+ * We need proc lock to dereference p->p_ucred.
+ */
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ mtx_lock(&racct_lock);
+ racct_alloc_resource(p->p_racct, resource, amount);
+ mtx_unlock(&racct_lock);
+ racct_add_cred(p->p_ucred, resource, amount);
+}
+
+static int
+racct_set_locked(struct proc *p, int resource, uint64_t amount)
+{
+ int64_t old_amount, decayed_amount;
+ int64_t diff_proc, diff_cred;
+#ifdef RCTL
+ int error;
+#endif
+
+ SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);
+
+ /*
+ * We need proc lock to dereference p->p_ucred.
+ */
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ old_amount = p->p_racct->r_resources[resource];
+ /*
+ * The diffs may be negative.
+ */
+ diff_proc = amount - old_amount;
+ if (RACCT_IS_DECAYING(resource)) {
+ /*
+ * Resources in per-credential racct containers may decay.
+ * If this is the case, we need to calculate the difference
+ * between the new amount and the proportional value of the
+ * old amount that has decayed in the ucred racct containers.
+ */
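+ /*
+ * Illustrative example (with the 0.3 decay factor above): if the
+ * old per-process amount was 100 and the new amount is 90, the
+ * credential containers retain about 30 of the old amount, so
+ * diff_cred = 90 - 30 = 60 while diff_proc = 90 - 100 = -10.
+ */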
+ decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
+ diff_cred = amount - decayed_amount;
+ } else
+ diff_cred = diff_proc;
+#ifdef notyet
+ KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
+ ("%s: usage of non-droppable resource %d dropping", __func__,
+ resource));
+#endif
+#ifdef RCTL
+ if (diff_proc > 0) {
+ error = rctl_enforce(p, resource, diff_proc);
+ if (error && RACCT_IS_DENIABLE(resource)) {
+ SDT_PROBE(racct, kernel, rusage, set_failure, p,
+ resource, amount, 0, 0);
+ return (error);
+ }
+ }
+#endif
+ racct_alloc_resource(p->p_racct, resource, diff_proc);
+ if (diff_cred > 0)
+ racct_add_cred_locked(p->p_ucred, resource, diff_cred);
+ else if (diff_cred < 0)
+ racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
+
+ return (0);
+}
+
+/*
+ * Set allocation of 'resource' to 'amount' for process 'p'.
+ * Return 0 if it is below the limits, or an errno value if it is not.
+ *
+ * Note that decreasing the allocation always returns 0,
+ * even if it's above the limit.
+ */
+int
+racct_set(struct proc *p, int resource, uint64_t amount)
+{
+ int error;
+
+ mtx_lock(&racct_lock);
+ error = racct_set_locked(p, resource, amount);
+ mtx_unlock(&racct_lock);
+ return (error);
+}
+
+static void
+racct_set_force_locked(struct proc *p, int resource, uint64_t amount)
+{
+ int64_t old_amount, decayed_amount;
+ int64_t diff_proc, diff_cred;
+
+ SDT_PROBE(racct, kernel, rusage, set, p, resource, amount, 0, 0);
+
+ /*
+ * We need proc lock to dereference p->p_ucred.
+ */
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ old_amount = p->p_racct->r_resources[resource];
+ /*
+ * The diffs may be negative.
+ */
+ diff_proc = amount - old_amount;
+ if (RACCT_IS_DECAYING(resource)) {
+ /*
+ * Resources in per-credential racct containers may decay.
+ * If this is the case, we need to calculate the difference
+ * between the new amount and the proportional value of the
+ * old amount that has decayed in the ucred racct containers.
+ */
+ decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
+ diff_cred = amount - decayed_amount;
+ } else
+ diff_cred = diff_proc;
+
+ racct_alloc_resource(p->p_racct, resource, diff_proc);
+ if (diff_cred > 0)
+ racct_add_cred_locked(p->p_ucred, resource, diff_cred);
+ else if (diff_cred < 0)
+ racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
+}
+
+void
+racct_set_force(struct proc *p, int resource, uint64_t amount)
+{
+ mtx_lock(&racct_lock);
+ racct_set_force_locked(p, resource, amount);
+ mtx_unlock(&racct_lock);
+}
+
+/*
+ * Returns the amount of 'resource' that process 'p' can keep allocated.
+ * Allocating more than that would be denied, unless the resource
+ * is marked undeniable. The amount already allocated does not
+ * matter.
+ */
+uint64_t
+racct_get_limit(struct proc *p, int resource)
+{
+
+#ifdef RCTL
+ return (rctl_get_limit(p, resource));
+#else
+ return (UINT64_MAX);
+#endif
+}
+
+/*
+ * Returns the amount of 'resource' that process 'p' can keep allocated.
+ * Allocating more than that would be denied, unless the resource
+ * is marked undeniable. The amount already allocated does
+ * matter.
+ */
+uint64_t
+racct_get_available(struct proc *p, int resource)
+{
+
+#ifdef RCTL
+ return (rctl_get_available(p, resource));
+#else
+ return (UINT64_MAX);
+#endif
+}
+
+/*
+ * Returns amount of the %cpu resource that process 'p' can add to its %cpu
+ * utilization. Adding more than that would lead to the process being
+ * throttled.
+ */
+static int64_t
+racct_pcpu_available(struct proc *p)
+{
+
+#ifdef RCTL
+ return (rctl_pcpu_available(p));
+#else
+ return (INT64_MAX);
+#endif
+}
+
+/*
+ * Decrease allocation of 'resource' by 'amount' for process 'p'.
+ */
+void
+racct_sub(struct proc *p, int resource, uint64_t amount)
+{
+
+ SDT_PROBE(racct, kernel, rusage, sub, p, resource, amount, 0, 0);
+
+ /*
+ * We need proc lock to dereference p->p_ucred.
+ */
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ KASSERT(RACCT_CAN_DROP(resource),
+ ("%s: called for non-droppable resource %d", __func__, resource));
+
+ mtx_lock(&racct_lock);
+ KASSERT(amount <= p->p_racct->r_resources[resource],
+ ("%s: freeing %ju of resource %d, which is more "
+ "than allocated %jd for %s (pid %d)", __func__, amount, resource,
+ (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));
+
+ racct_alloc_resource(p->p_racct, resource, -amount);
+ racct_sub_cred_locked(p->p_ucred, resource, amount);
+ mtx_unlock(&racct_lock);
+}
+
+static void
+racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount)
+{
+ struct prison *pr;
+
+ SDT_PROBE(racct, kernel, rusage, sub_cred, cred, resource, amount,
+ 0, 0);
+
+#ifdef notyet
+ KASSERT(RACCT_CAN_DROP(resource),
+ ("%s: called for resource %d which can not drop", __func__,
+ resource));
+#endif
+
+ racct_alloc_resource(cred->cr_ruidinfo->ui_racct, resource, -amount);
+ for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
+ racct_alloc_resource(pr->pr_prison_racct->prr_racct, resource,
+ -amount);
+ racct_alloc_resource(cred->cr_loginclass->lc_racct, resource, -amount);
+}
+
+/*
+ * Decrease allocation of 'resource' by 'amount' for credential 'cred'.
+ */
+void
+racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
+{
+
+ mtx_lock(&racct_lock);
+ racct_sub_cred_locked(cred, resource, amount);
+ mtx_unlock(&racct_lock);
+}
+
+/*
+ * Inherit resource usage information from the parent process.
+ */
+int
+racct_proc_fork(struct proc *parent, struct proc *child)
+{
+ int i, error = 0;
+
+ /*
+ * Create racct for the child process.
+ */
+ racct_create(&child->p_racct);
+
+ PROC_LOCK(parent);
+ PROC_LOCK(child);
+ mtx_lock(&racct_lock);
+
+#ifdef RCTL
+ error = rctl_proc_fork(parent, child);
+ if (error != 0)
+ goto out;
+#endif
+
+ /* Init process cpu time. */
+ child->p_prev_runtime = 0;
+ child->p_throttled = 0;
+
+ /*
+ * Inherit resource usage.
+ */
+ for (i = 0; i <= RACCT_MAX; i++) {
+ if (parent->p_racct->r_resources[i] == 0 ||
+ !RACCT_IS_INHERITABLE(i))
+ continue;
+
+ error = racct_set_locked(child, i,
+ parent->p_racct->r_resources[i]);
+ if (error != 0)
+ goto out;
+ }
+
+ error = racct_add_locked(child, RACCT_NPROC, 1);
+ error += racct_add_locked(child, RACCT_NTHR, 1);
+
+out:
+ mtx_unlock(&racct_lock);
+ PROC_UNLOCK(child);
+ PROC_UNLOCK(parent);
+
+ if (error != 0)
+ racct_proc_exit(child);
+
+ return (error);
+}
+
+/*
+ * Called at the end of fork1(), to handle rules that require the process
+ * to be fully initialized.
+ */
+void
+racct_proc_fork_done(struct proc *child)
+{
+
+#ifdef RCTL
+ PROC_LOCK(child);
+ mtx_lock(&racct_lock);
+ rctl_enforce(child, RACCT_NPROC, 0);
+ rctl_enforce(child, RACCT_NTHR, 0);
+ mtx_unlock(&racct_lock);
+ PROC_UNLOCK(child);
+#endif
+}
+
+void
+racct_proc_exit(struct proc *p)
+{
+ int i;
+ uint64_t runtime;
+ struct timeval wallclock;
+ uint64_t pct_estimate, pct;
+
+ PROC_LOCK(p);
+ /*
+ * We don't need to calculate rux; proc_reap() has already done this.
+ */
+ runtime = cputick2usec(p->p_rux.rux_runtime);
+#ifdef notyet
+ KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime"));
+#else
+ if (runtime < p->p_prev_runtime)
+ runtime = p->p_prev_runtime;
+#endif
+ microuptime(&wallclock);
+ timevalsub(&wallclock, &p->p_stats->p_start);
+ if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
+ pct_estimate = (1000000 * runtime * 100) /
+ ((uint64_t)wallclock.tv_sec * 1000000 +
+ wallclock.tv_usec);
+ } else
+ pct_estimate = 0;
+ pct = racct_getpcpu(p, pct_estimate);
+
+ mtx_lock(&racct_lock);
+ racct_set_locked(p, RACCT_CPU, runtime);
+ racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);
+
+ for (i = 0; i <= RACCT_MAX; i++) {
+ if (p->p_racct->r_resources[i] == 0)
+ continue;
+ if (!RACCT_IS_RECLAIMABLE(i))
+ continue;
+ racct_set_locked(p, i, 0);
+ }
+
+ mtx_unlock(&racct_lock);
+ PROC_UNLOCK(p);
+
+#ifdef RCTL
+ rctl_racct_release(p->p_racct);
+#endif
+ racct_destroy(&p->p_racct);
+}
+
+/*
+ * Called after credentials change, to move resource utilisation
+ * between raccts.
+ */
+void
+racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
+ struct ucred *newcred)
+{
+ struct uidinfo *olduip, *newuip;
+ struct loginclass *oldlc, *newlc;
+ struct prison *oldpr, *newpr, *pr;
+
+ PROC_LOCK_ASSERT(p, MA_NOTOWNED);
+
+ newuip = newcred->cr_ruidinfo;
+ olduip = oldcred->cr_ruidinfo;
+ newlc = newcred->cr_loginclass;
+ oldlc = oldcred->cr_loginclass;
+ newpr = newcred->cr_prison;
+ oldpr = oldcred->cr_prison;
+
+ mtx_lock(&racct_lock);
+ if (newuip != olduip) {
+ racct_sub_racct(olduip->ui_racct, p->p_racct);
+ racct_add_racct(newuip->ui_racct, p->p_racct);
+ }
+ if (newlc != oldlc) {
+ racct_sub_racct(oldlc->lc_racct, p->p_racct);
+ racct_add_racct(newlc->lc_racct, p->p_racct);
+ }
+ if (newpr != oldpr) {
+ for (pr = oldpr; pr != NULL; pr = pr->pr_parent)
+ racct_sub_racct(pr->pr_prison_racct->prr_racct,
+ p->p_racct);
+ for (pr = newpr; pr != NULL; pr = pr->pr_parent)
+ racct_add_racct(pr->pr_prison_racct->prr_racct,
+ p->p_racct);
+ }
+ mtx_unlock(&racct_lock);
+
+#ifdef RCTL
+ rctl_proc_ucred_changed(p, newcred);
+#endif
+}
+
+void
+racct_move(struct racct *dest, struct racct *src)
+{
+
+ mtx_lock(&racct_lock);
+
+ racct_add_racct(dest, src);
+ racct_sub_racct(src, src);
+
+ mtx_unlock(&racct_lock);
+}
+
+static void
+racct_proc_throttle(struct proc *p)
+{
+ struct thread *td;
+#ifdef SMP
+ int cpuid;
+#endif
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ /*
+ * Do not block kernel processes. Also do not block processes with
+ * low %cpu utilization to improve interactivity.
+ */
+ if (((p->p_flag & (P_SYSTEM | P_KTHREAD)) != 0) ||
+ (p->p_racct->r_resources[RACCT_PCTCPU] <= pcpu_threshold))
+ return;
+ p->p_throttled = 1;
+
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ switch (td->td_state) {
+ case TDS_RUNQ:
+ /*
+ * If the thread is on the scheduler run-queue, we
+ * cannot just remove it from there. So we set the flag
+ * TDF_NEEDRESCHED for the thread, so that once it is
+ * running, it is taken off the cpu as soon as possible.
+ */
+ td->td_flags |= TDF_NEEDRESCHED;
+ break;
+ case TDS_RUNNING:
+ /*
+ * If the thread is running, we request a context
+ * switch for it by setting the TDF_NEEDRESCHED flag.
+ */
+ td->td_flags |= TDF_NEEDRESCHED;
+#ifdef SMP
+ cpuid = td->td_oncpu;
+ if ((cpuid != NOCPU) && (td != curthread))
+ ipi_cpu(cpuid, IPI_AST);
+#endif
+ break;
+ default:
+ break;
+ }
+ thread_unlock(td);
+ }
+}
+
+static void
+racct_proc_wakeup(struct proc *p)
+{
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ if (p->p_throttled) {
+ p->p_throttled = 0;
+ wakeup(p->p_racct);
+ }
+}
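+/*
+ * The matching sleep on the p->p_racct wait channel is performed outside
+ * this file; a throttled thread blocks there until racct_proc_wakeup()
+ * clears p_throttled and wakes it up.
+ */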
+
+static void
+racct_decay_resource(struct racct *racct, void *res, void *dummy)
+{
+ int resource;
+ int64_t r_old, r_new;
+
+ resource = *(int *)res;
+ r_old = racct->r_resources[resource];
+
+ /* If there is nothing to decay, just exit. */
+ if (r_old <= 0)
+ return;
+
+ mtx_lock(&racct_lock);
+ r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
+ racct->r_resources[resource] = r_new;
+ mtx_unlock(&racct_lock);
+}
+
+static void
+racct_decay(int resource)
+{
+ ui_racct_foreach(racct_decay_resource, &resource, NULL);
+ loginclass_racct_foreach(racct_decay_resource, &resource, NULL);
+ prison_racct_foreach(racct_decay_resource, &resource, NULL);
+}
+
+static void
+racctd(void)
+{
+ struct thread *td;
+ struct proc *p;
+ struct timeval wallclock;
+ uint64_t runtime;
+ uint64_t pct, pct_estimate;
+
+ for (;;) {
+ racct_decay(RACCT_PCTCPU);
+
+ sx_slock(&allproc_lock);
+
+ LIST_FOREACH(p, &zombproc, p_list) {
+ PROC_LOCK(p);
+ racct_set(p, RACCT_PCTCPU, 0);
+ PROC_UNLOCK(p);
+ }
+
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NORMAL) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+
+ microuptime(&wallclock);
+ timevalsub(&wallclock, &p->p_stats->p_start);
+ PROC_SLOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td)
+ ruxagg(p, td);
+ runtime = cputick2usec(p->p_rux.rux_runtime);
+ PROC_SUNLOCK(p);
+#ifdef notyet
+ KASSERT(runtime >= p->p_prev_runtime,
+ ("runtime < p_prev_runtime"));
+#else
+ if (runtime < p->p_prev_runtime)
+ runtime = p->p_prev_runtime;
+#endif
+ p->p_prev_runtime = runtime;
+ if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
+ pct_estimate = (1000000 * runtime * 100) /
+ ((uint64_t)wallclock.tv_sec * 1000000 +
+ wallclock.tv_usec);
+ } else
+ pct_estimate = 0;
+ pct = racct_getpcpu(p, pct_estimate);
+ mtx_lock(&racct_lock);
+ racct_set_force_locked(p, RACCT_PCTCPU, pct);
+ racct_set_locked(p, RACCT_CPU, runtime);
+ racct_set_locked(p, RACCT_WALLCLOCK,
+ (uint64_t)wallclock.tv_sec * 1000000 +
+ wallclock.tv_usec);
+ mtx_unlock(&racct_lock);
+ PROC_UNLOCK(p);
+ }
+
+ /*
+ * To ensure that processes are throttled in a fair way, we
+ * iterate over all processes again and check the %cpu resource
+ * limits only after the ucred racct containers have been
+ * properly filled.
+ */
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state != PRS_NORMAL) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+
+ if (racct_pcpu_available(p) <= 0)
+ racct_proc_throttle(p);
+ else if (p->p_throttled)
+ racct_proc_wakeup(p);
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ pause("-", hz);
+ }
+}
+
+static struct kproc_desc racctd_kp = {
+ "racctd",
+ racctd,
+ NULL
+};
+SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, kproc_start, &racctd_kp);
+
+static void
+racct_init(void)
+{
+
+ racct_zone = uma_zcreate("racct", sizeof(struct racct),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ /*
+ * XXX: Move this somewhere.
+ */
+ prison0.pr_prison_racct = prison_racct_find("0");
+}
+SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL);
+
+#else /* !RACCT */
+
+int
+racct_add(struct proc *p, int resource, uint64_t amount)
+{
+
+ return (0);
+}
+
+void
+racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
+{
+}
+
+void
+racct_add_force(struct proc *p, int resource, uint64_t amount)
+{
+
+ return;
+}
+
+int
+racct_set(struct proc *p, int resource, uint64_t amount)
+{
+
+ return (0);
+}
+
+void
+racct_set_force(struct proc *p, int resource, uint64_t amount)
+{
+}
+
+void
+racct_sub(struct proc *p, int resource, uint64_t amount)
+{
+}
+
+void
+racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
+{
+}
+
+uint64_t
+racct_get_limit(struct proc *p, int resource)
+{
+
+ return (UINT64_MAX);
+}
+
+uint64_t
+racct_get_available(struct proc *p, int resource)
+{
+
+ return (UINT64_MAX);
+}
+
+void
+racct_create(struct racct **racctp)
+{
+}
+
+void
+racct_destroy(struct racct **racctp)
+{
+}
+
+int
+racct_proc_fork(struct proc *parent, struct proc *child)
+{
+
+ return (0);
+}
+
+void
+racct_proc_fork_done(struct proc *child)
+{
+}
+
+void
+racct_proc_exit(struct proc *p)
+{
+}
+
+#endif /* !RACCT */
diff --git a/sys/kern/kern_rangelock.c b/sys/kern/kern_rangelock.c
new file mode 100644
index 0000000..1c0faa3
--- /dev/null
+++ b/sys/kern/kern_rangelock.c
@@ -0,0 +1,248 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov <kib@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rangelock.h>
+#include <sys/systm.h>
+
+#include <vm/uma.h>
+
+struct rl_q_entry {
+ TAILQ_ENTRY(rl_q_entry) rl_q_link;
+ off_t rl_q_start, rl_q_end;
+ int rl_q_flags;
+};
+
+static uma_zone_t rl_entry_zone;
+
+static void
+rangelock_sys_init(void)
+{
+
+ rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL);
+
+static struct rl_q_entry *
+rlqentry_alloc(void)
+{
+
+ return (uma_zalloc(rl_entry_zone, M_WAITOK));
+}
+
+void
+rlqentry_free(struct rl_q_entry *rleq)
+{
+
+ uma_zfree(rl_entry_zone, rleq);
+}
+
+void
+rangelock_init(struct rangelock *lock)
+{
+
+ TAILQ_INIT(&lock->rl_waiters);
+ lock->rl_currdep = NULL;
+}
+
+void
+rangelock_destroy(struct rangelock *lock)
+{
+
+ KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters"));
+}
+
+/*
+ * Two entries are compatible if their ranges do not overlap, or both
+ * entries are for read.
+ */
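+/*
+ * For example, ranges [0, 10) and [10, 20) do not overlap and are always
+ * compatible; [0, 10) and [5, 15) overlap and are compatible only if both
+ * requests are for read.
+ */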
+static int
+ranges_overlap(const struct rl_q_entry *e1,
+ const struct rl_q_entry *e2)
+{
+
+ if (e1->rl_q_start < e2->rl_q_end && e1->rl_q_end > e2->rl_q_start)
+ return (1);
+ return (0);
+}
+
+/*
+ * Recalculate the lock->rl_currdep after an unlock.
+ */
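+/*
+ * Entries that precede rl_currdep on the rl_waiters queue have already
+ * been granted; the loop below grants any newly compatible requests and
+ * leaves rl_currdep pointing at the first waiter that remains blocked
+ * (or NULL if there is none).
+ */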
+static void
+rangelock_calc_block(struct rangelock *lock)
+{
+ struct rl_q_entry *entry, *nextentry, *entry1;
+
+ for (entry = lock->rl_currdep; entry != NULL; entry = nextentry) {
+ nextentry = TAILQ_NEXT(entry, rl_q_link);
+ if (entry->rl_q_flags & RL_LOCK_READ) {
+ /* Reads must not overlap with granted writes. */
+ for (entry1 = TAILQ_FIRST(&lock->rl_waiters);
+ !(entry1->rl_q_flags & RL_LOCK_READ);
+ entry1 = TAILQ_NEXT(entry1, rl_q_link)) {
+ if (ranges_overlap(entry, entry1))
+ goto out;
+ }
+ } else {
+ /* Write must not overlap with any granted locks. */
+ for (entry1 = TAILQ_FIRST(&lock->rl_waiters);
+ entry1 != entry;
+ entry1 = TAILQ_NEXT(entry1, rl_q_link)) {
+ if (ranges_overlap(entry, entry1))
+ goto out;
+ }
+
+ /* Move grantable write locks to the front. */
+ TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
+ TAILQ_INSERT_HEAD(&lock->rl_waiters, entry, rl_q_link);
+ }
+
+ /* Grant this lock. */
+ entry->rl_q_flags |= RL_LOCK_GRANTED;
+ wakeup(entry);
+ }
+out:
+ lock->rl_currdep = entry;
+}
+
+static void
+rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry,
+ struct mtx *ilk)
+{
+
+ MPASS(lock != NULL && entry != NULL && ilk != NULL);
+ mtx_assert(ilk, MA_OWNED);
+ KASSERT(entry != lock->rl_currdep, ("stuck currdep"));
+
+ TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
+ rangelock_calc_block(lock);
+ mtx_unlock(ilk);
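+ /*
+ * Cache the entry in the thread for reuse by a later
+ * rangelock_enqueue() call, avoiding a zone allocation.
+ */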
+ if (curthread->td_rlqe == NULL)
+ curthread->td_rlqe = entry;
+ else
+ rlqentry_free(entry);
+}
+
+void
+rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk)
+{
+
+ MPASS(lock != NULL && cookie != NULL && ilk != NULL);
+
+ mtx_lock(ilk);
+ rangelock_unlock_locked(lock, cookie, ilk);
+}
+
+/*
+ * Unlock a sub-range of a granted lock.
+ */
+void *
+rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t start,
+ off_t end, struct mtx *ilk)
+{
+ struct rl_q_entry *entry;
+
+ MPASS(lock != NULL && cookie != NULL && ilk != NULL);
+ entry = cookie;
+ KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED,
+ ("Unlocking non-granted lock"));
+ KASSERT(entry->rl_q_start == start, ("wrong start"));
+ KASSERT(entry->rl_q_end >= end, ("wrong end"));
+
+ mtx_lock(ilk);
+ if (entry->rl_q_end == end) {
+ rangelock_unlock_locked(lock, cookie, ilk);
+ return (NULL);
+ }
+ entry->rl_q_end = end;
+ rangelock_calc_block(lock);
+ mtx_unlock(ilk);
+ return (cookie);
+}
+
+/*
+ * Add the lock request to the queue of pending requests for the
+ * rangelock. Sleep until the request can be granted.
+ */
+static void *
+rangelock_enqueue(struct rangelock *lock, off_t start, off_t end, int mode,
+ struct mtx *ilk)
+{
+ struct rl_q_entry *entry;
+ struct thread *td;
+
+ MPASS(lock != NULL && ilk != NULL);
+
+ td = curthread;
+ if (td->td_rlqe != NULL) {
+ entry = td->td_rlqe;
+ td->td_rlqe = NULL;
+ } else
+ entry = rlqentry_alloc();
+ MPASS(entry != NULL);
+ entry->rl_q_flags = mode;
+ entry->rl_q_start = start;
+ entry->rl_q_end = end;
+
+ mtx_lock(ilk);
+ /*
+ * XXXKIB TODO. Check that a thread does not try to enqueue a
+ * lock that is incompatible with another request from the same
+ * thread.
+ */
+
+ TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link);
+ if (lock->rl_currdep == NULL)
+ lock->rl_currdep = entry;
+ rangelock_calc_block(lock);
+ while (!(entry->rl_q_flags & RL_LOCK_GRANTED))
+ msleep(entry, ilk, 0, "range", 0);
+ mtx_unlock(ilk);
+ return (entry);
+}
+
+void *
+rangelock_rlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
+{
+
+ return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk));
+}
+
+void *
+rangelock_wlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
+{
+
+ return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk));
+}
diff --git a/sys/kern/kern_rctl.c b/sys/kern/kern_rctl.c
new file mode 100644
index 0000000..934327a
--- /dev/null
+++ b/sys/kern/kern_rctl.c
@@ -0,0 +1,1870 @@
+/*-
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Edward Tomasz Napierala under sponsorship
+ * from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+#include <sys/queue.h>
+#include <sys/refcount.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/loginclass.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/racct.h>
+#include <sys/rctl.h>
+#include <sys/resourcevar.h>
+#include <sys/sx.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/eventhandler.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/taskqueue.h>
+#include <sys/tree.h>
+#include <vm/uma.h>
+
+#ifdef RCTL
+#ifndef RACCT
+#error "The RCTL option requires the RACCT option"
+#endif
+
+FEATURE(rctl, "Resource Limits");
+
+#define HRF_DEFAULT 0
+#define HRF_DONT_INHERIT 1
+#define HRF_DONT_ACCUMULATE 2
+
+/* Default buffer size for rctl_get_rules(2). */
+#define RCTL_DEFAULT_BUFSIZE 4096
+#define RCTL_MAX_INBUFLEN 4096
+#define RCTL_LOG_BUFSIZE 128
+
+#define RCTL_PCPU_SHIFT (10 * 1000000)
+
+/*
+ * 'rctl_rule_link' connects a rule with every racct it's related to.
+ * For example, the rule 'user:X:openfiles:deny=N/process' is linked
+ * to the uidinfo for user X and to each process of that user.
+ */
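+/*
+ * More generally, rules take the form
+ * "subject:subject-id:resource:action=amount/per"; the subjectnames,
+ * resourcenames and actionnames tables below list the recognized tokens.
+ */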
+struct rctl_rule_link {
+ LIST_ENTRY(rctl_rule_link) rrl_next;
+ struct rctl_rule *rrl_rule;
+ int rrl_exceeded;
+};
+
+struct dict {
+ const char *d_name;
+ int d_value;
+};
+
+static struct dict subjectnames[] = {
+ { "process", RCTL_SUBJECT_TYPE_PROCESS },
+ { "user", RCTL_SUBJECT_TYPE_USER },
+ { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
+ { "jail", RCTL_SUBJECT_TYPE_JAIL },
+ { NULL, -1 }};
+
+static struct dict resourcenames[] = {
+ { "cputime", RACCT_CPU },
+ { "datasize", RACCT_DATA },
+ { "stacksize", RACCT_STACK },
+ { "coredumpsize", RACCT_CORE },
+ { "memoryuse", RACCT_RSS },
+ { "memorylocked", RACCT_MEMLOCK },
+ { "maxproc", RACCT_NPROC },
+ { "openfiles", RACCT_NOFILE },
+ { "vmemoryuse", RACCT_VMEM },
+ { "pseudoterminals", RACCT_NPTS },
+ { "swapuse", RACCT_SWAP },
+ { "nthr", RACCT_NTHR },
+ { "msgqqueued", RACCT_MSGQQUEUED },
+ { "msgqsize", RACCT_MSGQSIZE },
+ { "nmsgq", RACCT_NMSGQ },
+ { "nsem", RACCT_NSEM },
+ { "nsemop", RACCT_NSEMOP },
+ { "nshm", RACCT_NSHM },
+ { "shmsize", RACCT_SHMSIZE },
+ { "wallclock", RACCT_WALLCLOCK },
+ { "pcpu", RACCT_PCTCPU },
+ { NULL, -1 }};
+
+static struct dict actionnames[] = {
+ { "sighup", RCTL_ACTION_SIGHUP },
+ { "sigint", RCTL_ACTION_SIGINT },
+ { "sigquit", RCTL_ACTION_SIGQUIT },
+ { "sigill", RCTL_ACTION_SIGILL },
+ { "sigtrap", RCTL_ACTION_SIGTRAP },
+ { "sigabrt", RCTL_ACTION_SIGABRT },
+ { "sigemt", RCTL_ACTION_SIGEMT },
+ { "sigfpe", RCTL_ACTION_SIGFPE },
+ { "sigkill", RCTL_ACTION_SIGKILL },
+ { "sigbus", RCTL_ACTION_SIGBUS },
+ { "sigsegv", RCTL_ACTION_SIGSEGV },
+ { "sigsys", RCTL_ACTION_SIGSYS },
+ { "sigpipe", RCTL_ACTION_SIGPIPE },
+ { "sigalrm", RCTL_ACTION_SIGALRM },
+ { "sigterm", RCTL_ACTION_SIGTERM },
+ { "sigurg", RCTL_ACTION_SIGURG },
+ { "sigstop", RCTL_ACTION_SIGSTOP },
+ { "sigtstp", RCTL_ACTION_SIGTSTP },
+ { "sigchld", RCTL_ACTION_SIGCHLD },
+ { "sigttin", RCTL_ACTION_SIGTTIN },
+ { "sigttou", RCTL_ACTION_SIGTTOU },
+ { "sigio", RCTL_ACTION_SIGIO },
+ { "sigxcpu", RCTL_ACTION_SIGXCPU },
+ { "sigxfsz", RCTL_ACTION_SIGXFSZ },
+ { "sigvtalrm", RCTL_ACTION_SIGVTALRM },
+ { "sigprof", RCTL_ACTION_SIGPROF },
+ { "sigwinch", RCTL_ACTION_SIGWINCH },
+ { "siginfo", RCTL_ACTION_SIGINFO },
+ { "sigusr1", RCTL_ACTION_SIGUSR1 },
+ { "sigusr2", RCTL_ACTION_SIGUSR2 },
+ { "sigthr", RCTL_ACTION_SIGTHR },
+ { "deny", RCTL_ACTION_DENY },
+ { "log", RCTL_ACTION_LOG },
+ { "devctl", RCTL_ACTION_DEVCTL },
+ { NULL, -1 }};
+
+static void rctl_init(void);
+SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);
+
+static uma_zone_t rctl_rule_link_zone;
+static uma_zone_t rctl_rule_zone;
+static struct rwlock rctl_lock;
+RW_SYSINIT(rctl_lock, &rctl_lock, "RCTL lock");
+
+static int rctl_rule_fully_specified(const struct rctl_rule *rule);
+static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);
+
+static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
+
+static const char *
+rctl_subject_type_name(int subject)
+{
+ int i;
+
+ for (i = 0; subjectnames[i].d_name != NULL; i++) {
+ if (subjectnames[i].d_value == subject)
+ return (subjectnames[i].d_name);
+ }
+
+ panic("rctl_subject_type_name: unknown subject type %d", subject);
+}
+
+static const char *
+rctl_action_name(int action)
+{
+ int i;
+
+ for (i = 0; actionnames[i].d_name != NULL; i++) {
+ if (actionnames[i].d_value == action)
+ return (actionnames[i].d_name);
+ }
+
+ panic("rctl_action_name: unknown action %d", action);
+}
+
+const char *
+rctl_resource_name(int resource)
+{
+ int i;
+
+ for (i = 0; resourcenames[i].d_name != NULL; i++) {
+ if (resourcenames[i].d_value == resource)
+ return (resourcenames[i].d_name);
+ }
+
+ panic("rctl_resource_name: unknown resource %d", resource);
+}
+
+/*
+ * Return the amount of resource that can be allocated by 'p' before
+ * hitting 'rule'.
+ */
+static int64_t
+rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
+{
+ int resource;
+ int64_t available = INT64_MAX;
+ struct ucred *cred = p->p_ucred;
+
+ rw_assert(&rctl_lock, RA_LOCKED);
+
+ resource = rule->rr_resource;
+ switch (rule->rr_per) {
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ available = rule->rr_amount -
+ p->p_racct->r_resources[resource];
+ break;
+ case RCTL_SUBJECT_TYPE_USER:
+ available = rule->rr_amount -
+ cred->cr_ruidinfo->ui_racct->r_resources[resource];
+ break;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ available = rule->rr_amount -
+ cred->cr_loginclass->lc_racct->r_resources[resource];
+ break;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ available = rule->rr_amount -
+ cred->cr_prison->pr_prison_racct->prr_racct->
+ r_resources[resource];
+ break;
+ default:
+ panic("rctl_compute_available: unknown per %d",
+ rule->rr_per);
+ }
+
+ return (available);
+}
+
+/*
+ * Return non-zero if allocating 'amount' by proc 'p' would exceed
+ * resource limit specified by 'rule'.
+ */
+static int
+rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule,
+ int64_t amount)
+{
+ int64_t available;
+
+ rw_assert(&rctl_lock, RA_LOCKED);
+
+ available = rctl_available_resource(p, rule);
+ if (available >= amount)
+ return (0);
+
+ return (1);
+}
+
+/*
+ * Special version of the rctl_get_available() function for the %cpu
+ * resource.  We slightly cheat here and return less than we normally would.
+ */
+int64_t
+rctl_pcpu_available(const struct proc *p)
+{
+ struct rctl_rule *rule;
+ struct rctl_rule_link *link;
+ int64_t available, minavailable, limit;
+
+ minavailable = INT64_MAX;
+ limit = 0;
+
+ rw_rlock(&rctl_lock);
+
+ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
+ rule = link->rrl_rule;
+ if (rule->rr_resource != RACCT_PCTCPU)
+ continue;
+ if (rule->rr_action != RCTL_ACTION_DENY)
+ continue;
+ available = rctl_available_resource(p, rule);
+ if (available < minavailable) {
+ minavailable = available;
+ limit = rule->rr_amount;
+ }
+ }
+
+ rw_runlock(&rctl_lock);
+
+ /*
+ * Return slightly less than the actual value of the available
+ * %cpu resource.  This makes %cpu throttling more aggressive
+ * and lets us act before the limit is actually exceeded.
+ */
+ if (limit != 0) {
+ if (limit > 2 * RCTL_PCPU_SHIFT)
+ minavailable -= RCTL_PCPU_SHIFT;
+ else
+ minavailable -= (limit / 2);
+ }
+
+ return (minavailable);
+}
+
+/*
+ * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
+ * to what it already has allocated.  Returns non-zero if the allocation should
+ * be denied, 0 otherwise.
+ */
+int
+rctl_enforce(struct proc *p, int resource, uint64_t amount)
+{
+ struct rctl_rule *rule;
+ struct rctl_rule_link *link;
+ struct sbuf sb;
+ int should_deny = 0;
+ char *buf;
+ static int curtime = 0;
+ static struct timeval lasttime;
+
+ rw_rlock(&rctl_lock);
+
+ /*
+ * There may be more than one matching rule; go through all of them.
+ * Denial should be done last, after logging and sending signals.
+ */
+ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
+ rule = link->rrl_rule;
+ if (rule->rr_resource != resource)
+ continue;
+ if (!rctl_would_exceed(p, rule, amount)) {
+ link->rrl_exceeded = 0;
+ continue;
+ }
+
+ switch (rule->rr_action) {
+ case RCTL_ACTION_DENY:
+ should_deny = 1;
+ continue;
+ case RCTL_ACTION_LOG:
+ /*
+ * If rrl_exceeded != 0, it means we've already
+ * logged a warning for this process.
+ */
+ if (link->rrl_exceeded != 0)
+ continue;
+
+ /*
+ * If the process state is not fully initialized yet,
+ * we can't access most of the required fields, e.g.
+ * p->p_comm. This happens when called from fork1().
+ * Ignore this rule for now; it will be processed just
+ * after fork, when called from racct_proc_fork_done().
+ */
+ if (p->p_state != PRS_NORMAL)
+ continue;
+
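+ /* Rate-limit the log messages to at most 10 per second. */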
+ if (!ppsratecheck(&lasttime, &curtime, 10))
+ continue;
+
+ buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
+ if (buf == NULL) {
+ printf("rctl_enforce: out of memory\n");
+ continue;
+ }
+ sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
+ rctl_rule_to_sbuf(&sb, rule);
+ sbuf_finish(&sb);
+ printf("rctl: rule \"%s\" matched by pid %d "
+ "(%s), uid %d, jail %s\n", sbuf_data(&sb),
+ p->p_pid, p->p_comm, p->p_ucred->cr_uid,
+ p->p_ucred->cr_prison->pr_prison_racct->prr_name);
+ sbuf_delete(&sb);
+ free(buf, M_RCTL);
+ link->rrl_exceeded = 1;
+ continue;
+ case RCTL_ACTION_DEVCTL:
+ if (link->rrl_exceeded != 0)
+ continue;
+
+ if (p->p_state != PRS_NORMAL)
+ continue;
+
+ buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
+ if (buf == NULL) {
+ printf("rctl_enforce: out of memory\n");
+ continue;
+ }
+ sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
+ sbuf_printf(&sb, "rule=");
+ rctl_rule_to_sbuf(&sb, rule);
+ sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
+ p->p_pid, p->p_ucred->cr_ruid,
+ p->p_ucred->cr_prison->pr_prison_racct->prr_name);
+ sbuf_finish(&sb);
+ devctl_notify_f("RCTL", "rule", "matched",
+ sbuf_data(&sb), M_NOWAIT);
+ sbuf_delete(&sb);
+ free(buf, M_RCTL);
+ link->rrl_exceeded = 1;
+ continue;
+ default:
+ if (link->rrl_exceeded != 0)
+ continue;
+
+ if (p->p_state != PRS_NORMAL)
+ continue;
+
+ KASSERT(rule->rr_action > 0 &&
+ rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
+ ("rctl_enforce: unknown action %d",
+ rule->rr_action));
+
+ /*
+ * We're using the fact that RCTL_ACTION_SIG* values
+ * are equal to their counterparts from sys/signal.h.
+ */
+ kern_psignal(p, rule->rr_action);
+ link->rrl_exceeded = 1;
+ continue;
+ }
+ }
+
+ rw_runlock(&rctl_lock);
+
+ if (should_deny) {
+ /*
+ * Return a fake error code; the caller should change it
+ * into one appropriate for the situation, e.g. EFBIG or ENOMEM.
+ */
+ return (EDOOFUS);
+ }
+
+ return (0);
+}
+
+uint64_t
+rctl_get_limit(struct proc *p, int resource)
+{
+ struct rctl_rule *rule;
+ struct rctl_rule_link *link;
+ uint64_t amount = UINT64_MAX;
+
+ rw_rlock(&rctl_lock);
+
+ /*
+ * There may be more than one matching rule; go through all of them
+ * and pick the lowest (most restrictive) limit.
+ */
+ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
+ rule = link->rrl_rule;
+ if (rule->rr_resource != resource)
+ continue;
+ if (rule->rr_action != RCTL_ACTION_DENY)
+ continue;
+ if (rule->rr_amount < amount)
+ amount = rule->rr_amount;
+ }
+
+ rw_runlock(&rctl_lock);
+
+ return (amount);
+}
+
+uint64_t
+rctl_get_available(struct proc *p, int resource)
+{
+ struct rctl_rule *rule;
+ struct rctl_rule_link *link;
+ int64_t available, minavailable, allocated;
+
+ minavailable = INT64_MAX;
+
+ rw_rlock(&rctl_lock);
+
+ /*
+ * There may be more than one matching rule; go through all of them
+ * and pick the smallest amount of the resource still available.
+ */
+ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
+ rule = link->rrl_rule;
+ if (rule->rr_resource != resource)
+ continue;
+ if (rule->rr_action != RCTL_ACTION_DENY)
+ continue;
+ available = rctl_available_resource(p, rule);
+ if (available < minavailable)
+ minavailable = available;
+ }
+
+ rw_runlock(&rctl_lock);
+
+ /*
+ * XXX: Think about this _hard_.
+ */
+ allocated = p->p_racct->r_resources[resource];
+ if (minavailable < INT64_MAX - allocated)
+ minavailable += allocated;
+ if (minavailable < 0)
+ minavailable = 0;
+ return (minavailable);
+}
+
+static int
+rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
+{
+
+ if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
+ if (rule->rr_subject_type != filter->rr_subject_type)
+ return (0);
+
+ switch (filter->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ if (filter->rr_subject.rs_proc != NULL &&
+ rule->rr_subject.rs_proc !=
+ filter->rr_subject.rs_proc)
+ return (0);
+ break;
+ case RCTL_SUBJECT_TYPE_USER:
+ if (filter->rr_subject.rs_uip != NULL &&
+ rule->rr_subject.rs_uip !=
+ filter->rr_subject.rs_uip)
+ return (0);
+ break;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ if (filter->rr_subject.rs_loginclass != NULL &&
+ rule->rr_subject.rs_loginclass !=
+ filter->rr_subject.rs_loginclass)
+ return (0);
+ break;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ if (filter->rr_subject.rs_prison_racct != NULL &&
+ rule->rr_subject.rs_prison_racct !=
+ filter->rr_subject.rs_prison_racct)
+ return (0);
+ break;
+ default:
+ panic("rctl_rule_matches: unknown subject type %d",
+ filter->rr_subject_type);
+ }
+ }
+
+ if (filter->rr_resource != RACCT_UNDEFINED) {
+ if (rule->rr_resource != filter->rr_resource)
+ return (0);
+ }
+
+ if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
+ if (rule->rr_action != filter->rr_action)
+ return (0);
+ }
+
+ if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
+ if (rule->rr_amount != filter->rr_amount)
+ return (0);
+ }
+
+ if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
+ if (rule->rr_per != filter->rr_per)
+ return (0);
+ }
+
+ return (1);
+}
+
+static int
+str2value(const char *str, int *value, struct dict *table)
+{
+ int i;
+
+ if (value == NULL)
+ return (EINVAL);
+
+ for (i = 0; table[i].d_name != NULL; i++) {
+ if (strcasecmp(table[i].d_name, str) == 0) {
+ *value = table[i].d_value;
+ return (0);
+ }
+ }
+
+ return (EINVAL);
+}
+
+static int
+str2id(const char *str, id_t *value)
+{
+ char *end;
+
+ if (str == NULL)
+ return (EINVAL);
+
+ *value = strtoul(str, &end, 10);
+ if ((size_t)(end - str) != strlen(str))
+ return (EINVAL);
+
+ return (0);
+}
+
+static int
+str2int64(const char *str, int64_t *value)
+{
+ char *end;
+
+ if (str == NULL)
+ return (EINVAL);
+
+ *value = strtoul(str, &end, 10);
+ if ((size_t)(end - str) != strlen(str))
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * Connect the rule to the racct, increasing refcount for the rule.
+ */
+static void
+rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
+{
+ struct rctl_rule_link *link;
+
+ KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
+
+ rctl_rule_acquire(rule);
+ link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
+ link->rrl_rule = rule;
+ link->rrl_exceeded = 0;
+
+ rw_wlock(&rctl_lock);
+ LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
+ rw_wunlock(&rctl_lock);
+}
+
+static int
+rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
+{
+ struct rctl_rule_link *link;
+
+ KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
+ rw_assert(&rctl_lock, RA_WLOCKED);
+
+ link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
+ if (link == NULL)
+ return (ENOMEM);
+ rctl_rule_acquire(rule);
+ link->rrl_rule = rule;
+ link->rrl_exceeded = 0;
+
+ LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
+ return (0);
+}
+
+/*
+ * Remove limits for rules matching the filter and release
+ * the refcounts for the rules, possibly freeing them. Returns
+ * the number of limit structures removed.
+ */
+static int
+rctl_racct_remove_rules(struct racct *racct,
+ const struct rctl_rule *filter)
+{
+ int removed = 0;
+ struct rctl_rule_link *link, *linktmp;
+
+ rw_assert(&rctl_lock, RA_WLOCKED);
+
+ LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
+ if (!rctl_rule_matches(link->rrl_rule, filter))
+ continue;
+
+ LIST_REMOVE(link, rrl_next);
+ rctl_rule_release(link->rrl_rule);
+ uma_zfree(rctl_rule_link_zone, link);
+ removed++;
+ }
+ return (removed);
+}
+
+static void
+rctl_rule_acquire_subject(struct rctl_rule *rule)
+{
+
+ switch (rule->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_UNDEFINED:
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ break;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ if (rule->rr_subject.rs_prison_racct != NULL)
+ prison_racct_hold(rule->rr_subject.rs_prison_racct);
+ break;
+ case RCTL_SUBJECT_TYPE_USER:
+ if (rule->rr_subject.rs_uip != NULL)
+ uihold(rule->rr_subject.rs_uip);
+ break;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ if (rule->rr_subject.rs_loginclass != NULL)
+ loginclass_hold(rule->rr_subject.rs_loginclass);
+ break;
+ default:
+ panic("rctl_rule_acquire_subject: unknown subject type %d",
+ rule->rr_subject_type);
+ }
+}
+
+static void
+rctl_rule_release_subject(struct rctl_rule *rule)
+{
+
+ switch (rule->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_UNDEFINED:
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ break;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ if (rule->rr_subject.rs_prison_racct != NULL)
+ prison_racct_free(rule->rr_subject.rs_prison_racct);
+ break;
+ case RCTL_SUBJECT_TYPE_USER:
+ if (rule->rr_subject.rs_uip != NULL)
+ uifree(rule->rr_subject.rs_uip);
+ break;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ if (rule->rr_subject.rs_loginclass != NULL)
+ loginclass_free(rule->rr_subject.rs_loginclass);
+ break;
+ default:
+ panic("rctl_rule_release_subject: unknown subject type %d",
+ rule->rr_subject_type);
+ }
+}
+
+struct rctl_rule *
+rctl_rule_alloc(int flags)
+{
+ struct rctl_rule *rule;
+
+ rule = uma_zalloc(rctl_rule_zone, flags);
+ if (rule == NULL)
+ return (NULL);
+ rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
+ rule->rr_subject.rs_proc = NULL;
+ rule->rr_subject.rs_uip = NULL;
+ rule->rr_subject.rs_loginclass = NULL;
+ rule->rr_subject.rs_prison_racct = NULL;
+ rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
+ rule->rr_resource = RACCT_UNDEFINED;
+ rule->rr_action = RCTL_ACTION_UNDEFINED;
+ rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
+ refcount_init(&rule->rr_refcount, 1);
+
+ return (rule);
+}
+
+struct rctl_rule *
+rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
+{
+ struct rctl_rule *copy;
+
+ copy = uma_zalloc(rctl_rule_zone, flags);
+ if (copy == NULL)
+ return (NULL);
+ copy->rr_subject_type = rule->rr_subject_type;
+ copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
+ copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
+ copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
+ copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
+ copy->rr_per = rule->rr_per;
+ copy->rr_resource = rule->rr_resource;
+ copy->rr_action = rule->rr_action;
+ copy->rr_amount = rule->rr_amount;
+ refcount_init(&copy->rr_refcount, 1);
+ rctl_rule_acquire_subject(copy);
+
+ return (copy);
+}
+
+void
+rctl_rule_acquire(struct rctl_rule *rule)
+{
+
+ KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
+
+ refcount_acquire(&rule->rr_refcount);
+}
+
+static void
+rctl_rule_free(void *context, int pending)
+{
+ struct rctl_rule *rule;
+
+ rule = (struct rctl_rule *)context;
+
+ KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
+
+ /*
+ * We don't need locking here; rule is guaranteed to be inaccessible.
+ */
+
+ rctl_rule_release_subject(rule);
+ uma_zfree(rctl_rule_zone, rule);
+}
+
+void
+rctl_rule_release(struct rctl_rule *rule)
+{
+
+ KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
+
+ if (refcount_release(&rule->rr_refcount)) {
+ /*
+ * rctl_rule_release() is often called when iterating
+ * over all the uidinfo structures in the system,
+ * holding uihashtbl_lock. Since rctl_rule_free()
+ * might end up calling uifree(), this would lead
+ * to lock recursion. Use taskqueue to avoid this.
+ */
+ TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
+ taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
+ }
+}
+
+static int
+rctl_rule_fully_specified(const struct rctl_rule *rule)
+{
+
+ switch (rule->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_UNDEFINED:
+ return (0);
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ if (rule->rr_subject.rs_proc == NULL)
+ return (0);
+ break;
+ case RCTL_SUBJECT_TYPE_USER:
+ if (rule->rr_subject.rs_uip == NULL)
+ return (0);
+ break;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ if (rule->rr_subject.rs_loginclass == NULL)
+ return (0);
+ break;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ if (rule->rr_subject.rs_prison_racct == NULL)
+ return (0);
+ break;
+ default:
+ panic("rctl_rule_fully_specified: unknown subject type %d",
+ rule->rr_subject_type);
+ }
+ if (rule->rr_resource == RACCT_UNDEFINED)
+ return (0);
+ if (rule->rr_action == RCTL_ACTION_UNDEFINED)
+ return (0);
+ if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
+ return (0);
+ if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
+ return (0);
+
+ return (1);
+}
+
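+/*
+ * Parse a rule in the "subject:subject-id:resource:action=amount/per"
+ * format; any field may be left empty, leaving the corresponding part
+ * of the rule undefined.  An illustrative (hypothetical) example:
+ * "user:1001:openfiles:deny=128/process".
+ */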
+static int
+rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
+{
+ int error = 0;
+ char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
+ *amountstr, *perstr;
+ struct rctl_rule *rule;
+ id_t id;
+
+ rule = rctl_rule_alloc(M_WAITOK);
+
+ subjectstr = strsep(&rulestr, ":");
+ subject_idstr = strsep(&rulestr, ":");
+ resourcestr = strsep(&rulestr, ":");
+ actionstr = strsep(&rulestr, "=/");
+ amountstr = strsep(&rulestr, "/");
+ perstr = rulestr;
+
+ if (subjectstr == NULL || subjectstr[0] == '\0')
+ rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
+ else {
+ error = str2value(subjectstr, &rule->rr_subject_type,
+ subjectnames);
+ if (error != 0)
+ goto out;
+ }
+
+ if (subject_idstr == NULL || subject_idstr[0] == '\0') {
+ rule->rr_subject.rs_proc = NULL;
+ rule->rr_subject.rs_uip = NULL;
+ rule->rr_subject.rs_loginclass = NULL;
+ rule->rr_subject.rs_prison_racct = NULL;
+ } else {
+ switch (rule->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_UNDEFINED:
+ error = EINVAL;
+ goto out;
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ error = str2id(subject_idstr, &id);
+ if (error != 0)
+ goto out;
+ sx_assert(&allproc_lock, SA_LOCKED);
+ rule->rr_subject.rs_proc = pfind(id);
+ if (rule->rr_subject.rs_proc == NULL) {
+ error = ESRCH;
+ goto out;
+ }
+ PROC_UNLOCK(rule->rr_subject.rs_proc);
+ break;
+ case RCTL_SUBJECT_TYPE_USER:
+ error = str2id(subject_idstr, &id);
+ if (error != 0)
+ goto out;
+ rule->rr_subject.rs_uip = uifind(id);
+ break;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ rule->rr_subject.rs_loginclass =
+ loginclass_find(subject_idstr);
+ if (rule->rr_subject.rs_loginclass == NULL) {
+ error = ENAMETOOLONG;
+ goto out;
+ }
+ break;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ rule->rr_subject.rs_prison_racct =
+ prison_racct_find(subject_idstr);
+ if (rule->rr_subject.rs_prison_racct == NULL) {
+ error = ENAMETOOLONG;
+ goto out;
+ }
+ break;
+ default:
+ panic("rctl_string_to_rule: unknown subject type %d",
+ rule->rr_subject_type);
+ }
+ }
+
+ if (resourcestr == NULL || resourcestr[0] == '\0')
+ rule->rr_resource = RACCT_UNDEFINED;
+ else {
+ error = str2value(resourcestr, &rule->rr_resource,
+ resourcenames);
+ if (error != 0)
+ goto out;
+ }
+
+ if (actionstr == NULL || actionstr[0] == '\0')
+ rule->rr_action = RCTL_ACTION_UNDEFINED;
+ else {
+ error = str2value(actionstr, &rule->rr_action, actionnames);
+ if (error != 0)
+ goto out;
+ }
+
+ if (amountstr == NULL || amountstr[0] == '\0')
+ rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
+ else {
+ error = str2int64(amountstr, &rule->rr_amount);
+ if (error != 0)
+ goto out;
+ if (RACCT_IS_IN_MILLIONS(rule->rr_resource))
+ rule->rr_amount *= 1000000;
+ }
+
+ if (perstr == NULL || perstr[0] == '\0')
+ rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
+ else {
+ error = str2value(perstr, &rule->rr_per, subjectnames);
+ if (error != 0)
+ goto out;
+ }
+
+out:
+ if (error == 0)
+ *rulep = rule;
+ else
+ rctl_rule_release(rule);
+
+ return (error);
+}
+
+/*
+ * Link a rule with all the subjects it applies to.
+ */
+int
+rctl_rule_add(struct rctl_rule *rule)
+{
+ struct proc *p;
+ struct ucred *cred;
+ struct uidinfo *uip;
+ struct prison *pr;
+ struct prison_racct *prr;
+ struct loginclass *lc;
+ struct rctl_rule *rule2;
+ int match;
+
+ KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
+
+ /*
+ * Some rules just don't make sense. Note that the one below
+ * cannot be rewritten using RACCT_IS_DENIABLE(); RACCT_PCTCPU,
+ * for example, is not deniable in the racct sense, but its limit
+ * is enforced in a different way, so "deny" rules for %CPU
+ * do make sense.
+ */
+ if (rule->rr_action == RCTL_ACTION_DENY &&
+ (rule->rr_resource == RACCT_CPU ||
+ rule->rr_resource == RACCT_WALLCLOCK))
+ return (EOPNOTSUPP);
+
+ if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
+ RACCT_IS_SLOPPY(rule->rr_resource))
+ return (EOPNOTSUPP);
+
+ /*
+ * Make sure there are no duplicated rules. Also, for the "deny"
+ * rules, remove ones differing only by "amount".
+ */
+ if (rule->rr_action == RCTL_ACTION_DENY) {
+ rule2 = rctl_rule_duplicate(rule, M_WAITOK);
+ rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
+ rctl_rule_remove(rule2);
+ rctl_rule_release(rule2);
+ } else
+ rctl_rule_remove(rule);
+
+ switch (rule->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ p = rule->rr_subject.rs_proc;
+ KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
+
+ rctl_racct_add_rule(p->p_racct, rule);
+ /*
+ * In case of per-process rule, we don't have anything more
+ * to do.
+ */
+ return (0);
+
+ case RCTL_SUBJECT_TYPE_USER:
+ uip = rule->rr_subject.rs_uip;
+ KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
+ rctl_racct_add_rule(uip->ui_racct, rule);
+ break;
+
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ lc = rule->rr_subject.rs_loginclass;
+ KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
+ rctl_racct_add_rule(lc->lc_racct, rule);
+ break;
+
+ case RCTL_SUBJECT_TYPE_JAIL:
+ prr = rule->rr_subject.rs_prison_racct;
+ KASSERT(prr != NULL, ("rctl_rule_add: NULL prison_racct"));
+ rctl_racct_add_rule(prr->prr_racct, rule);
+ break;
+
+ default:
+ panic("rctl_rule_add: unknown subject type %d",
+ rule->rr_subject_type);
+ }
+
+ /*
+ * Now go through all the processes and add the new rule to the ones
+ * it applies to.
+ */
+ sx_assert(&allproc_lock, SA_LOCKED);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ cred = p->p_ucred;
+ switch (rule->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_USER:
+ if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
+ cred->cr_ruidinfo == rule->rr_subject.rs_uip)
+ break;
+ continue;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
+ break;
+ continue;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ match = 0;
+ for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
+ if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
+ match = 1;
+ break;
+ }
+ }
+ if (match)
+ break;
+ continue;
+ default:
+ panic("rctl_rule_add: unknown subject type %d",
+ rule->rr_subject_type);
+ }
+
+ rctl_racct_add_rule(p->p_racct, rule);
+ }
+
+ return (0);
+}
+
+static void
+rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
+{
+ struct rctl_rule *filter = (struct rctl_rule *)arg2;
+ int found = 0;
+
+ rw_wlock(&rctl_lock);
+ found += rctl_racct_remove_rules(racct, filter);
+ rw_wunlock(&rctl_lock);
+
+ *((int *)arg3) += found;
+}
+
+/*
+ * Remove all rules that match the filter.
+ */
+int
+rctl_rule_remove(struct rctl_rule *filter)
+{
+ int found = 0;
+ struct proc *p;
+
+ if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
+ filter->rr_subject.rs_proc != NULL) {
+ p = filter->rr_subject.rs_proc;
+ rw_wlock(&rctl_lock);
+ found = rctl_racct_remove_rules(p->p_racct, filter);
+ rw_wunlock(&rctl_lock);
+ if (found)
+ return (0);
+ return (ESRCH);
+ }
+
+ loginclass_racct_foreach(rctl_rule_remove_callback, filter,
+ (void *)&found);
+ ui_racct_foreach(rctl_rule_remove_callback, filter,
+ (void *)&found);
+ prison_racct_foreach(rctl_rule_remove_callback, filter,
+ (void *)&found);
+
+ sx_assert(&allproc_lock, SA_LOCKED);
+ rw_wlock(&rctl_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ found += rctl_racct_remove_rules(p->p_racct, filter);
+ }
+ rw_wunlock(&rctl_lock);
+
+ if (found)
+ return (0);
+ return (ESRCH);
+}
+
+/*
+ * Appends a rule to the sbuf.
+ */
+static void
+rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
+{
+ int64_t amount;
+
+ sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
+
+ switch (rule->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ if (rule->rr_subject.rs_proc == NULL)
+ sbuf_printf(sb, ":");
+ else
+ sbuf_printf(sb, "%d:",
+ rule->rr_subject.rs_proc->p_pid);
+ break;
+ case RCTL_SUBJECT_TYPE_USER:
+ if (rule->rr_subject.rs_uip == NULL)
+ sbuf_printf(sb, ":");
+ else
+ sbuf_printf(sb, "%d:",
+ rule->rr_subject.rs_uip->ui_uid);
+ break;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ if (rule->rr_subject.rs_loginclass == NULL)
+ sbuf_printf(sb, ":");
+ else
+ sbuf_printf(sb, "%s:",
+ rule->rr_subject.rs_loginclass->lc_name);
+ break;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ if (rule->rr_subject.rs_prison_racct == NULL)
+ sbuf_printf(sb, ":");
+ else
+ sbuf_printf(sb, "%s:",
+ rule->rr_subject.rs_prison_racct->prr_name);
+ break;
+ default:
+ panic("rctl_rule_to_sbuf: unknown subject type %d",
+ rule->rr_subject_type);
+ }
+
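+ /* Print per-million resources, such as %cpu, in user-visible units. */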
+ amount = rule->rr_amount;
+ if (amount != RCTL_AMOUNT_UNDEFINED &&
+ RACCT_IS_IN_MILLIONS(rule->rr_resource))
+ amount /= 1000000;
+
+ sbuf_printf(sb, "%s:%s=%jd",
+ rctl_resource_name(rule->rr_resource),
+ rctl_action_name(rule->rr_action),
+ amount);
+
+ if (rule->rr_per != rule->rr_subject_type)
+ sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
+}
+
+/*
+ * Routine used by RCTL syscalls to read in the input string.
+ */
+static int
+rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
+{
+ int error;
+ char *str;
+
+ if (inbuflen <= 0)
+ return (EINVAL);
+ if (inbuflen > RCTL_MAX_INBUFLEN)
+ return (E2BIG);
+
+ str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
+ error = copyinstr(inbufp, str, inbuflen, NULL);
+ if (error != 0) {
+ free(str, M_RCTL);
+ return (error);
+ }
+
+ *inputstr = str;
+
+ return (0);
+}
+
+/*
+ * Routine used by RCTL syscalls to write out the output string.
+ */
+static int
+rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
+{
+ int error;
+
+ if (outputsbuf == NULL)
+ return (0);
+
+ sbuf_finish(outputsbuf);
+ if (outbuflen < sbuf_len(outputsbuf) + 1) {
+ sbuf_delete(outputsbuf);
+ return (ERANGE);
+ }
+ error = copyout(sbuf_data(outputsbuf), outbufp,
+ sbuf_len(outputsbuf) + 1);
+ sbuf_delete(outputsbuf);
+ return (error);
+}
+
+static struct sbuf *
+rctl_racct_to_sbuf(struct racct *racct, int sloppy)
+{
+ int i;
+ int64_t amount;
+ struct sbuf *sb;
+
+ sb = sbuf_new_auto();
+ for (i = 0; i <= RACCT_MAX; i++) {
+ if (sloppy == 0 && RACCT_IS_SLOPPY(i))
+ continue;
+ amount = racct->r_resources[i];
+ if (RACCT_IS_IN_MILLIONS(i))
+ amount /= 1000000;
+ sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
+ }
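+ /* Strip the trailing comma. */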
+ sbuf_setpos(sb, sbuf_len(sb) - 1);
+ return (sb);
+}
+
+int
+sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
+{
+ int error;
+ char *inputstr;
+ struct rctl_rule *filter;
+ struct sbuf *outputsbuf = NULL;
+ struct proc *p;
+ struct uidinfo *uip;
+ struct loginclass *lc;
+ struct prison_racct *prr;
+
+ error = priv_check(td, PRIV_RCTL_GET_RACCT);
+ if (error != 0)
+ return (error);
+
+ error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
+ if (error != 0)
+ return (error);
+
+ sx_slock(&allproc_lock);
+ error = rctl_string_to_rule(inputstr, &filter);
+ free(inputstr, M_RCTL);
+ if (error != 0) {
+ sx_sunlock(&allproc_lock);
+ return (error);
+ }
+
+ switch (filter->rr_subject_type) {
+ case RCTL_SUBJECT_TYPE_PROCESS:
+ p = filter->rr_subject.rs_proc;
+ if (p == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+ outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
+ break;
+ case RCTL_SUBJECT_TYPE_USER:
+ uip = filter->rr_subject.rs_uip;
+ if (uip == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+ outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
+ break;
+ case RCTL_SUBJECT_TYPE_LOGINCLASS:
+ lc = filter->rr_subject.rs_loginclass;
+ if (lc == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+ outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
+ break;
+ case RCTL_SUBJECT_TYPE_JAIL:
+ prr = filter->rr_subject.rs_prison_racct;
+ if (prr == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+ outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
+ break;
+ default:
+ error = EINVAL;
+ }
+out:
+ rctl_rule_release(filter);
+ sx_sunlock(&allproc_lock);
+ if (error != 0)
+ return (error);
+
+ error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
+
+ return (error);
+}
+
+static void
+rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
+{
+ struct rctl_rule *filter = (struct rctl_rule *)arg2;
+ struct rctl_rule_link *link;
+ struct sbuf *sb = (struct sbuf *)arg3;
+
+ rw_rlock(&rctl_lock);
+ LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
+ if (!rctl_rule_matches(link->rrl_rule, filter))
+ continue;
+ rctl_rule_to_sbuf(sb, link->rrl_rule);
+ sbuf_printf(sb, ",");
+ }
+ rw_runlock(&rctl_lock);
+}
+
+int
+sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
+{
+ int error;
+ size_t bufsize = RCTL_DEFAULT_BUFSIZE;
+ char *inputstr, *buf;
+ struct sbuf *sb;
+ struct rctl_rule *filter;
+ struct rctl_rule_link *link;
+ struct proc *p;
+
+ error = priv_check(td, PRIV_RCTL_GET_RULES);
+ if (error != 0)
+ return (error);
+
+ error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
+ if (error != 0)
+ return (error);
+
+ sx_slock(&allproc_lock);
+ error = rctl_string_to_rule(inputstr, &filter);
+ free(inputstr, M_RCTL);
+ if (error != 0) {
+ sx_sunlock(&allproc_lock);
+ return (error);
+ }
+
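+ /*
+ * If the rules do not fit into the fixed-size sbuf, quadruple
+ * the buffer and start over.
+ */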
+again:
+ buf = malloc(bufsize, M_RCTL, M_WAITOK);
+ sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
+ KASSERT(sb != NULL, ("sbuf_new failed"));
+
+ sx_assert(&allproc_lock, SA_LOCKED);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ rw_rlock(&rctl_lock);
+ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
+ /*
+ * Non-process rules will be added to the buffer later.
+ * Adding them here would result in duplicated output.
+ */
+ if (link->rrl_rule->rr_subject_type !=
+ RCTL_SUBJECT_TYPE_PROCESS)
+ continue;
+ if (!rctl_rule_matches(link->rrl_rule, filter))
+ continue;
+ rctl_rule_to_sbuf(sb, link->rrl_rule);
+ sbuf_printf(sb, ",");
+ }
+ rw_runlock(&rctl_lock);
+ }
+
+ loginclass_racct_foreach(rctl_get_rules_callback, filter, sb);
+ ui_racct_foreach(rctl_get_rules_callback, filter, sb);
+ prison_racct_foreach(rctl_get_rules_callback, filter, sb);
+ if (sbuf_error(sb) == ENOMEM) {
+ sbuf_delete(sb);
+ free(buf, M_RCTL);
+ bufsize *= 4;
+ goto again;
+ }
+
+ /*
+ * Remove trailing ",".
+ */
+ if (sbuf_len(sb) > 0)
+ sbuf_setpos(sb, sbuf_len(sb) - 1);
+
+ error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
+
+ rctl_rule_release(filter);
+ sx_sunlock(&allproc_lock);
+ free(buf, M_RCTL);
+ return (error);
+}
+
+int
+sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
+{
+ int error;
+ size_t bufsize = RCTL_DEFAULT_BUFSIZE;
+ char *inputstr, *buf;
+ struct sbuf *sb;
+ struct rctl_rule *filter;
+ struct rctl_rule_link *link;
+
+ error = priv_check(td, PRIV_RCTL_GET_LIMITS);
+ if (error != 0)
+ return (error);
+
+ error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
+ if (error != 0)
+ return (error);
+
+ sx_slock(&allproc_lock);
+ error = rctl_string_to_rule(inputstr, &filter);
+ free(inputstr, M_RCTL);
+ if (error != 0) {
+ sx_sunlock(&allproc_lock);
+ return (error);
+ }
+
+ if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
+ rctl_rule_release(filter);
+ sx_sunlock(&allproc_lock);
+ return (EINVAL);
+ }
+ if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
+ rctl_rule_release(filter);
+ sx_sunlock(&allproc_lock);
+ return (EOPNOTSUPP);
+ }
+ if (filter->rr_subject.rs_proc == NULL) {
+ rctl_rule_release(filter);
+ sx_sunlock(&allproc_lock);
+ return (EINVAL);
+ }
+
+again:
+ buf = malloc(bufsize, M_RCTL, M_WAITOK);
+ sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
+ KASSERT(sb != NULL, ("sbuf_new failed"));
+
+ rw_rlock(&rctl_lock);
+ LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
+ rrl_next) {
+ rctl_rule_to_sbuf(sb, link->rrl_rule);
+ sbuf_printf(sb, ",");
+ }
+ rw_runlock(&rctl_lock);
+ if (sbuf_error(sb) == ENOMEM) {
+ sbuf_delete(sb);
+ free(buf, M_RCTL);
+ bufsize *= 4;
+ goto again;
+ }
+
+ /*
+ * Remove trailing ",".
+ */
+ if (sbuf_len(sb) > 0)
+ sbuf_setpos(sb, sbuf_len(sb) - 1);
+
+ error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
+ rctl_rule_release(filter);
+ sx_sunlock(&allproc_lock);
+ free(buf, M_RCTL);
+ return (error);
+}
+
+int
+sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
+{
+ int error;
+ struct rctl_rule *rule;
+ char *inputstr;
+
+ error = priv_check(td, PRIV_RCTL_ADD_RULE);
+ if (error != 0)
+ return (error);
+
+ error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
+ if (error != 0)
+ return (error);
+
+ sx_slock(&allproc_lock);
+ error = rctl_string_to_rule(inputstr, &rule);
+ free(inputstr, M_RCTL);
+ if (error != 0) {
+ sx_sunlock(&allproc_lock);
+ return (error);
+ }
+ /*
+ * The 'per' part of a rule is optional.
+ */
+ if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
+ rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
+ rule->rr_per = rule->rr_subject_type;
+
+ if (!rctl_rule_fully_specified(rule)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ error = rctl_rule_add(rule);
+
+out:
+ rctl_rule_release(rule);
+ sx_sunlock(&allproc_lock);
+ return (error);
+}
+
+int
+sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
+{
+ int error;
+ struct rctl_rule *filter;
+ char *inputstr;
+
+ error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
+ if (error != 0)
+ return (error);
+
+ error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
+ if (error != 0)
+ return (error);
+
+ sx_slock(&allproc_lock);
+ error = rctl_string_to_rule(inputstr, &filter);
+ free(inputstr, M_RCTL);
+ if (error != 0) {
+ sx_sunlock(&allproc_lock);
+ return (error);
+ }
+
+ error = rctl_rule_remove(filter);
+ rctl_rule_release(filter);
+ sx_sunlock(&allproc_lock);
+
+ return (error);
+}
+
+/*
+ * Update RCTL rule list after credential change.
+ */
+void
+rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
+{
+ int rulecnt, i;
+ struct rctl_rule_link *link, *newlink;
+ struct uidinfo *newuip;
+ struct loginclass *newlc;
+ struct prison_racct *newprr;
+ LIST_HEAD(, rctl_rule_link) newrules;
+
+ newuip = newcred->cr_ruidinfo;
+ newlc = newcred->cr_loginclass;
+ newprr = newcred->cr_prison->pr_prison_racct;
+
+ LIST_INIT(&newrules);
+
+again:
+ /*
+ * First, count the rules that apply to the process with new
+ * credentials.
+ */
+ rulecnt = 0;
+ rw_rlock(&rctl_lock);
+ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
+ if (link->rrl_rule->rr_subject_type ==
+ RCTL_SUBJECT_TYPE_PROCESS)
+ rulecnt++;
+ }
+ LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
+ rulecnt++;
+ LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
+ rulecnt++;
+ LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
+ rulecnt++;
+ rw_runlock(&rctl_lock);
+
+ /*
+ * Create temporary list. We've dropped the rctl_lock in order
+ * to use M_WAITOK.
+ */
+ for (i = 0; i < rulecnt; i++) {
+ newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
+ newlink->rrl_rule = NULL;
+ LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
+ }
+
+ newlink = LIST_FIRST(&newrules);
+
+ /*
+ * Assign rules to the newly allocated list entries.
+ */
+ rw_wlock(&rctl_lock);
+ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
+ if (link->rrl_rule->rr_subject_type ==
+ RCTL_SUBJECT_TYPE_PROCESS) {
+ if (newlink == NULL)
+ goto goaround;
+ rctl_rule_acquire(link->rrl_rule);
+ newlink->rrl_rule = link->rrl_rule;
+ newlink = LIST_NEXT(newlink, rrl_next);
+ rulecnt--;
+ }
+ }
+
+ LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
+ if (newlink == NULL)
+ goto goaround;
+ rctl_rule_acquire(link->rrl_rule);
+ newlink->rrl_rule = link->rrl_rule;
+ newlink = LIST_NEXT(newlink, rrl_next);
+ rulecnt--;
+ }
+
+ LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
+ if (newlink == NULL)
+ goto goaround;
+ rctl_rule_acquire(link->rrl_rule);
+ newlink->rrl_rule = link->rrl_rule;
+ newlink = LIST_NEXT(newlink, rrl_next);
+ rulecnt--;
+ }
+
+ LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
+ if (newlink == NULL)
+ goto goaround;
+ rctl_rule_acquire(link->rrl_rule);
+ newlink->rrl_rule = link->rrl_rule;
+ newlink = LIST_NEXT(newlink, rrl_next);
+ rulecnt--;
+ }
+
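+ /*
+ * We found exactly as many rules as we counted earlier, so the
+ * lists have not changed under us; commit the new rule list.
+ */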
+ if (rulecnt == 0) {
+ /*
+ * Free the old rule list.
+ */
+ while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
+ link = LIST_FIRST(&p->p_racct->r_rule_links);
+ LIST_REMOVE(link, rrl_next);
+ rctl_rule_release(link->rrl_rule);
+ uma_zfree(rctl_rule_link_zone, link);
+ }
+
+ /*
+ * Replace lists and we're done.
+ *
+ * XXX: Is there any way to switch list heads instead
+ * of iterating here?
+ */
+ while (!LIST_EMPTY(&newrules)) {
+ newlink = LIST_FIRST(&newrules);
+ LIST_REMOVE(newlink, rrl_next);
+ LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
+ newlink, rrl_next);
+ }
+
+ rw_wunlock(&rctl_lock);
+
+ return;
+ }
+
+goaround:
+ rw_wunlock(&rctl_lock);
+
+ /*
+ * Rule list changed while we were not holding the rctl_lock.
+ * Free the new list and try again.
+ */
+ while (!LIST_EMPTY(&newrules)) {
+ newlink = LIST_FIRST(&newrules);
+ LIST_REMOVE(newlink, rrl_next);
+ if (newlink->rrl_rule != NULL)
+ rctl_rule_release(newlink->rrl_rule);
+ uma_zfree(rctl_rule_link_zone, newlink);
+ }
+
+ goto again;
+}
+
+/*
+ * Assign RCTL rules to the newly created process.
+ */
+int
+rctl_proc_fork(struct proc *parent, struct proc *child)
+{
+ int error;
+ struct rctl_rule_link *link;
+ struct rctl_rule *rule;
+
+ LIST_INIT(&child->p_racct->r_rule_links);
+
+ KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
+
+ rw_wlock(&rctl_lock);
+
+ /*
+ * Go through limits applicable to the parent and assign them
+ * to the child. Rules with 'process' subject have to be duplicated
+ * in order to make their rr_subject point to the new process.
+ */
+ LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
+ if (link->rrl_rule->rr_subject_type ==
+ RCTL_SUBJECT_TYPE_PROCESS) {
+ rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
+ if (rule == NULL)
+ goto fail;
+ KASSERT(rule->rr_subject.rs_proc == parent,
+ ("rule->rr_subject.rs_proc != parent"));
+ rule->rr_subject.rs_proc = child;
+ error = rctl_racct_add_rule_locked(child->p_racct,
+ rule);
+ rctl_rule_release(rule);
+ if (error != 0)
+ goto fail;
+ } else {
+ error = rctl_racct_add_rule_locked(child->p_racct,
+ link->rrl_rule);
+ if (error != 0)
+ goto fail;
+ }
+ }
+
+ rw_wunlock(&rctl_lock);
+ return (0);
+
+fail:
+ while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
+ link = LIST_FIRST(&child->p_racct->r_rule_links);
+ LIST_REMOVE(link, rrl_next);
+ rctl_rule_release(link->rrl_rule);
+ uma_zfree(rctl_rule_link_zone, link);
+ }
+ rw_wunlock(&rctl_lock);
+ return (EAGAIN);
+}
+
+/*
+ * Release rules attached to the racct.
+ */
+void
+rctl_racct_release(struct racct *racct)
+{
+ struct rctl_rule_link *link;
+
+ rw_wlock(&rctl_lock);
+ while (!LIST_EMPTY(&racct->r_rule_links)) {
+ link = LIST_FIRST(&racct->r_rule_links);
+ LIST_REMOVE(link, rrl_next);
+ rctl_rule_release(link->rrl_rule);
+ uma_zfree(rctl_rule_link_zone, link);
+ }
+ rw_wunlock(&rctl_lock);
+}
+
+static void
+rctl_init(void)
+{
+
+ rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
+ sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+}
+
+#else /* !RCTL */
+
+int
+sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+#endif /* !RCTL */
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
new file mode 100644
index 0000000..57ee671
--- /dev/null
+++ b/sys/kern/kern_resource.c
@@ -0,0 +1,1434 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/file.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/refcount.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/sx.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/time.h>
+#include <sys/umtx.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures");
+static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
+#define UIHASH(uid) (&uihashtbl[(uid) & uihash])
+static struct rwlock uihashtbl_lock;
+static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
+static u_long uihash; /* size of hash table - 1 */
+
+static void calcru1(struct proc *p, struct rusage_ext *ruxp,
+ struct timeval *up, struct timeval *sp);
+static int donice(struct thread *td, struct proc *chgp, int n);
+static struct uidinfo *uilookup(uid_t uid);
+static void ruxagg_locked(struct rusage_ext *rux, struct thread *td);
+
+/*
+ * Resource controls and accounting.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getpriority_args {
+ int which;
+ int who;
+};
+#endif
+int
+sys_getpriority(td, uap)
+ struct thread *td;
+ register struct getpriority_args *uap;
+{
+ struct proc *p;
+ struct pgrp *pg;
+ int error, low;
+
+ error = 0;
+ low = PRIO_MAX + 1;
+ switch (uap->which) {
+
+ case PRIO_PROCESS:
+ if (uap->who == 0)
+ low = td->td_proc->p_nice;
+ else {
+ p = pfind(uap->who);
+ if (p == NULL)
+ break;
+ if (p_cansee(td, p) == 0)
+ low = p->p_nice;
+ PROC_UNLOCK(p);
+ }
+ break;
+
+ case PRIO_PGRP:
+ sx_slock(&proctree_lock);
+ if (uap->who == 0) {
+ pg = td->td_proc->p_pgrp;
+ PGRP_LOCK(pg);
+ } else {
+ pg = pgfind(uap->who);
+ if (pg == NULL) {
+ sx_sunlock(&proctree_lock);
+ break;
+ }
+ }
+ sx_sunlock(&proctree_lock);
+ LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL &&
+ p_cansee(td, p) == 0) {
+ if (p->p_nice < low)
+ low = p->p_nice;
+ }
+ PROC_UNLOCK(p);
+ }
+ PGRP_UNLOCK(pg);
+ break;
+
+ case PRIO_USER:
+ if (uap->who == 0)
+ uap->who = td->td_ucred->cr_uid;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL &&
+ p_cansee(td, p) == 0 &&
+ p->p_ucred->cr_uid == uap->who) {
+ if (p->p_nice < low)
+ low = p->p_nice;
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+ if (low == PRIO_MAX + 1 && error == 0)
+ error = ESRCH;
+ td->td_retval[0] = low;
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setpriority_args {
+ int which;
+ int who;
+ int prio;
+};
+#endif
+int
+sys_setpriority(td, uap)
+ struct thread *td;
+ struct setpriority_args *uap;
+{
+ struct proc *curp, *p;
+ struct pgrp *pg;
+ int found = 0, error = 0;
+
+ curp = td->td_proc;
+ switch (uap->which) {
+ case PRIO_PROCESS:
+ if (uap->who == 0) {
+ PROC_LOCK(curp);
+ error = donice(td, curp, uap->prio);
+ PROC_UNLOCK(curp);
+ } else {
+ p = pfind(uap->who);
+ if (p == NULL)
+ break;
+ error = p_cansee(td, p);
+ if (error == 0)
+ error = donice(td, p, uap->prio);
+ PROC_UNLOCK(p);
+ }
+ found++;
+ break;
+
+ case PRIO_PGRP:
+ sx_slock(&proctree_lock);
+ if (uap->who == 0) {
+ pg = curp->p_pgrp;
+ PGRP_LOCK(pg);
+ } else {
+ pg = pgfind(uap->who);
+ if (pg == NULL) {
+ sx_sunlock(&proctree_lock);
+ break;
+ }
+ }
+ sx_sunlock(&proctree_lock);
+ LIST_FOREACH(p, &pg->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL &&
+ p_cansee(td, p) == 0) {
+ error = donice(td, p, uap->prio);
+ found++;
+ }
+ PROC_UNLOCK(p);
+ }
+ PGRP_UNLOCK(pg);
+ break;
+
+ case PRIO_USER:
+ if (uap->who == 0)
+ uap->who = td->td_ucred->cr_uid;
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL &&
+ p->p_ucred->cr_uid == uap->who &&
+ p_cansee(td, p) == 0) {
+ error = donice(td, p, uap->prio);
+ found++;
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+ if (found == 0 && error == 0)
+ error = ESRCH;
+ return (error);
+}
+
+/*
+ * Set "nice" for a (whole) process.
+ */
+static int
+donice(struct thread *td, struct proc *p, int n)
+{
+ int error;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if ((error = p_cansched(td, p)))
+ return (error);
+ if (n > PRIO_MAX)
+ n = PRIO_MAX;
+ if (n < PRIO_MIN)
+ n = PRIO_MIN;
+ if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0)
+ return (EACCES);
+ sched_nice(p, n);
+ return (0);
+}
+
+static int unprivileged_idprio;
+SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_idprio, CTLFLAG_RW,
+ &unprivileged_idprio, 0, "Allow non-root users to set an idle priority");
+
+/*
+ * Set realtime priority for LWP.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rtprio_thread_args {
+ int function;
+ lwpid_t lwpid;
+ struct rtprio *rtp;
+};
+#endif
+int
+sys_rtprio_thread(struct thread *td, struct rtprio_thread_args *uap)
+{
+ struct proc *p;
+ struct rtprio rtp;
+ struct thread *td1;
+ int cierror, error;
+
+ /* Perform copyin before acquiring locks if needed. */
+ if (uap->function == RTP_SET)
+ cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
+ else
+ cierror = 0;
+
+ if (uap->lwpid == 0 || uap->lwpid == td->td_tid) {
+ p = td->td_proc;
+ td1 = td;
+ PROC_LOCK(p);
+ } else {
+ /* Only look up thread in current process */
+ td1 = tdfind(uap->lwpid, curproc->p_pid);
+ if (td1 == NULL)
+ return (ESRCH);
+ p = td1->td_proc;
+ }
+
+ switch (uap->function) {
+ case RTP_LOOKUP:
+ if ((error = p_cansee(td, p)))
+ break;
+ pri_to_rtp(td1, &rtp);
+ PROC_UNLOCK(p);
+ return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
+ case RTP_SET:
+ if ((error = p_cansched(td, p)) || (error = cierror))
+ break;
+
+ /* Disallow setting rtprio in most cases if not superuser. */
+
+ /*
+ * Realtime priority has to be restricted for reasons which
+ * should be obvious. However, for idleprio processes, there is
+ * a potential for system deadlock if an idleprio process gains
+ * a lock on a resource that other processes need (and the
+ * idleprio process can't run due to a CPU-bound normal
+ * process). Fix me! XXX
+ *
+ * This problem is not only related to idleprio processes.
+ * A user level program can obtain a file lock and hold it
+ * indefinitely. Additionally, without idleprio processes it is
+ * still conceivable that a program with low priority will never
+ * get to run. In short, allowing this feature might make it
+ * easier to lock a resource indefinitely, but it is not the
+ * only thing that makes it possible.
+ */
+ if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME ||
+ (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_IDLE &&
+ unprivileged_idprio == 0)) {
+ error = priv_check(td, PRIV_SCHED_RTPRIO);
+ if (error)
+ break;
+ }
+ error = rtp_to_pri(&rtp, td1);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+/*
+ * Set realtime priority.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rtprio_args {
+ int function;
+ pid_t pid;
+ struct rtprio *rtp;
+};
+#endif
+int
+sys_rtprio(td, uap)
+ struct thread *td; /* curthread */
+ register struct rtprio_args *uap;
+{
+ struct proc *p;
+ struct thread *tdp;
+ struct rtprio rtp;
+ int cierror, error;
+
+ /* Perform copyin before acquiring locks if needed. */
+ if (uap->function == RTP_SET)
+ cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
+ else
+ cierror = 0;
+
+ if (uap->pid == 0) {
+ p = td->td_proc;
+ PROC_LOCK(p);
+ } else {
+ p = pfind(uap->pid);
+ if (p == NULL)
+ return (ESRCH);
+ }
+
+ switch (uap->function) {
+ case RTP_LOOKUP:
+ if ((error = p_cansee(td, p)))
+ break;
+ /*
+ * Return OUR priority if no pid specified,
+ * or if one is, report the highest priority
+ * in the process. There isn't much more you can do as
+ * there is only room to return a single priority.
+ * Note: specifying our own pid is not the same
+ * as leaving it zero.
+ */
+ if (uap->pid == 0) {
+ pri_to_rtp(td, &rtp);
+ } else {
+ struct rtprio rtp2;
+
+ rtp.type = RTP_PRIO_IDLE;
+ rtp.prio = RTP_PRIO_MAX;
+ FOREACH_THREAD_IN_PROC(p, tdp) {
+ pri_to_rtp(tdp, &rtp2);
+ if (rtp2.type < rtp.type ||
+ (rtp2.type == rtp.type &&
+ rtp2.prio < rtp.prio)) {
+ rtp.type = rtp2.type;
+ rtp.prio = rtp2.prio;
+ }
+ }
+ }
+ PROC_UNLOCK(p);
+ return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
+ case RTP_SET:
+ if ((error = p_cansched(td, p)) || (error = cierror))
+ break;
+
+ /*
+ * Disallow setting rtprio in most cases if not superuser.
+ * See the comment in sys_rtprio_thread about idprio
+ * threads holding a lock.
+ */
+ if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME ||
+ (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_IDLE &&
+ !unprivileged_idprio)) {
+ error = priv_check(td, PRIV_SCHED_RTPRIO);
+ if (error)
+ break;
+ }
+
+ /*
+ * If we are setting our own priority, set just our
+ * thread but if we are doing another process,
+ * do all the threads on that process. If we
+ * specify our own pid we do the latter.
+ */
+ if (uap->pid == 0) {
+ error = rtp_to_pri(&rtp, td);
+ } else {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if ((error = rtp_to_pri(&rtp, td)) != 0)
+ break;
+ }
+ }
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+int
+rtp_to_pri(struct rtprio *rtp, struct thread *td)
+{
+ u_char newpri, oldclass, oldpri;
+
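+ /* Translate the class-relative rtprio value into an absolute kernel priority. */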
+ switch (RTP_PRIO_BASE(rtp->type)) {
+ case RTP_PRIO_REALTIME:
+ if (rtp->prio > RTP_PRIO_MAX)
+ return (EINVAL);
+ newpri = PRI_MIN_REALTIME + rtp->prio;
+ break;
+ case RTP_PRIO_NORMAL:
+ if (rtp->prio > (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE))
+ return (EINVAL);
+ newpri = PRI_MIN_TIMESHARE + rtp->prio;
+ break;
+ case RTP_PRIO_IDLE:
+ if (rtp->prio > RTP_PRIO_MAX)
+ return (EINVAL);
+ newpri = PRI_MIN_IDLE + rtp->prio;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ thread_lock(td);
+ oldclass = td->td_pri_class;
+ sched_class(td, rtp->type); /* XXX fix */
+ oldpri = td->td_user_pri;
+ sched_user_prio(td, newpri);
+ if (td->td_user_pri != oldpri && (oldclass != RTP_PRIO_NORMAL ||
+ td->td_pri_class != RTP_PRIO_NORMAL))
+ sched_prio(td, td->td_user_pri);
+ if (TD_ON_UPILOCK(td) && oldpri != newpri) {
+ critical_enter();
+ thread_unlock(td);
+ umtx_pi_adjust(td, oldpri);
+ critical_exit();
+ } else
+ thread_unlock(td);
+ return (0);
+}
+
+void
+pri_to_rtp(struct thread *td, struct rtprio *rtp)
+{
+
+ thread_lock(td);
+ switch (PRI_BASE(td->td_pri_class)) {
+ case PRI_REALTIME:
+ rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME;
+ break;
+ case PRI_TIMESHARE:
+ rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE;
+ break;
+ case PRI_IDLE:
+ rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE;
+ break;
+ default:
+ break;
+ }
+ rtp->type = td->td_pri_class;
+ thread_unlock(td);
+}
+
+#if defined(COMPAT_43)
+#ifndef _SYS_SYSPROTO_H_
+struct osetrlimit_args {
+ u_int which;
+ struct orlimit *rlp;
+};
+#endif
+int
+osetrlimit(td, uap)
+ struct thread *td;
+ register struct osetrlimit_args *uap;
+{
+ struct orlimit olim;
+ struct rlimit lim;
+ int error;
+
+ if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit))))
+ return (error);
+ lim.rlim_cur = olim.rlim_cur;
+ lim.rlim_max = olim.rlim_max;
+ error = kern_setrlimit(td, uap->which, &lim);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ogetrlimit_args {
+ u_int which;
+ struct orlimit *rlp;
+};
+#endif
+int
+ogetrlimit(td, uap)
+ struct thread *td;
+ register struct ogetrlimit_args *uap;
+{
+ struct orlimit olim;
+ struct rlimit rl;
+ struct proc *p;
+ int error;
+
+ if (uap->which >= RLIM_NLIMITS)
+ return (EINVAL);
+ p = td->td_proc;
+ PROC_LOCK(p);
+ lim_rlimit(p, uap->which, &rl);
+ PROC_UNLOCK(p);
+
+ /*
+	 * XXX It would be more correct to convert only RLIM_INFINITY to
+	 * the old RLIM_INFINITY and fail with EOVERFLOW for other larger
+	 * values. Most 64->32 and 32->16 conversions, including the not
+	 * unimportant ones of uids, are even more broken than what we
+ * do here (they blindly truncate). We don't do this correctly
+ * here since we have little experience with EOVERFLOW yet.
+ * Elsewhere, getuid() can't fail...
+ */
+ olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur;
+ olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max;
+ error = copyout(&olim, uap->rlp, sizeof(olim));
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+#ifndef _SYS_SYSPROTO_H_
+struct __setrlimit_args {
+ u_int which;
+ struct rlimit *rlp;
+};
+#endif
+int
+sys_setrlimit(td, uap)
+ struct thread *td;
+ register struct __setrlimit_args *uap;
+{
+ struct rlimit alim;
+ int error;
+
+ if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit))))
+ return (error);
+ error = kern_setrlimit(td, uap->which, &alim);
+ return (error);
+}
+
+static void
+lim_cb(void *arg)
+{
+ struct rlimit rlim;
+ struct thread *td;
+ struct proc *p;
+
+ p = arg;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ /*
+ * Check if the process exceeds its cpu resource allocation. If
+ * it reaches the max, arrange to kill the process in ast().
+ */
+ if (p->p_cpulimit == RLIM_INFINITY)
+ return;
+ PROC_SLOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td) {
+ ruxagg(p, td);
+ }
+ PROC_SUNLOCK(p);
+ if (p->p_rux.rux_runtime > p->p_cpulimit * cpu_tickrate()) {
+ lim_rlimit(p, RLIMIT_CPU, &rlim);
+ if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) {
+ killproc(p, "exceeded maximum CPU limit");
+ } else {
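+			/*
+			 * Soft limit exceeded: push the checkpoint out by
+			 * five seconds and notify the process with SIGXCPU.
+			 */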
+ if (p->p_cpulimit < rlim.rlim_max)
+ p->p_cpulimit += 5;
+ kern_psignal(p, SIGXCPU);
+ }
+ }
+ if ((p->p_flag & P_WEXIT) == 0)
+ callout_reset_sbt(&p->p_limco, SBT_1S, 0,
+ lim_cb, p, C_PREL(1));
+}
+
+int
+kern_setrlimit(struct thread *td, u_int which, struct rlimit *limp)
+{
+
+ return (kern_proc_setrlimit(td, td->td_proc, which, limp));
+}
+
+int
+kern_proc_setrlimit(struct thread *td, struct proc *p, u_int which,
+ struct rlimit *limp)
+{
+ struct plimit *newlim, *oldlim;
+ register struct rlimit *alimp;
+ struct rlimit oldssiz;
+ int error;
+
+ if (which >= RLIM_NLIMITS)
+ return (EINVAL);
+
+ /*
+ * Preserve historical bugs by treating negative limits as unsigned.
+ */
+ if (limp->rlim_cur < 0)
+ limp->rlim_cur = RLIM_INFINITY;
+ if (limp->rlim_max < 0)
+ limp->rlim_max = RLIM_INFINITY;
+
+ oldssiz.rlim_cur = 0;
+ newlim = lim_alloc();
+ PROC_LOCK(p);
+ oldlim = p->p_limit;
+ alimp = &oldlim->pl_rlimit[which];
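+	/*
+	 * Raising either the soft or the hard limit above the current
+	 * hard limit requires privilege.
+	 */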
+ if (limp->rlim_cur > alimp->rlim_max ||
+ limp->rlim_max > alimp->rlim_max)
+ if ((error = priv_check(td, PRIV_PROC_SETRLIMIT))) {
+ PROC_UNLOCK(p);
+ lim_free(newlim);
+ return (error);
+ }
+ if (limp->rlim_cur > limp->rlim_max)
+ limp->rlim_cur = limp->rlim_max;
+ lim_copy(newlim, oldlim);
+ alimp = &newlim->pl_rlimit[which];
+
+ switch (which) {
+
+ case RLIMIT_CPU:
+ if (limp->rlim_cur != RLIM_INFINITY &&
+ p->p_cpulimit == RLIM_INFINITY)
+ callout_reset_sbt(&p->p_limco, SBT_1S, 0,
+ lim_cb, p, C_PREL(1));
+ p->p_cpulimit = limp->rlim_cur;
+ break;
+ case RLIMIT_DATA:
+ if (limp->rlim_cur > maxdsiz)
+ limp->rlim_cur = maxdsiz;
+ if (limp->rlim_max > maxdsiz)
+ limp->rlim_max = maxdsiz;
+ break;
+
+ case RLIMIT_STACK:
+ if (limp->rlim_cur > maxssiz)
+ limp->rlim_cur = maxssiz;
+ if (limp->rlim_max > maxssiz)
+ limp->rlim_max = maxssiz;
+ oldssiz = *alimp;
+ if (p->p_sysent->sv_fixlimit != NULL)
+ p->p_sysent->sv_fixlimit(&oldssiz,
+ RLIMIT_STACK);
+ break;
+
+ case RLIMIT_NOFILE:
+ if (limp->rlim_cur > maxfilesperproc)
+ limp->rlim_cur = maxfilesperproc;
+ if (limp->rlim_max > maxfilesperproc)
+ limp->rlim_max = maxfilesperproc;
+ break;
+
+ case RLIMIT_NPROC:
+ if (limp->rlim_cur > maxprocperuid)
+ limp->rlim_cur = maxprocperuid;
+ if (limp->rlim_max > maxprocperuid)
+ limp->rlim_max = maxprocperuid;
+ if (limp->rlim_cur < 1)
+ limp->rlim_cur = 1;
+ if (limp->rlim_max < 1)
+ limp->rlim_max = 1;
+ break;
+ }
+ if (p->p_sysent->sv_fixlimit != NULL)
+ p->p_sysent->sv_fixlimit(limp, which);
+ *alimp = *limp;
+ p->p_limit = newlim;
+ PROC_UNLOCK(p);
+ lim_free(oldlim);
+
+ if (which == RLIMIT_STACK) {
+ /*
+		 * The stack is allocated to the max at exec time, with only
+		 * "rlim_cur" bytes accessible. If the limit is going up,
+		 * make more of it accessible; if it is going down, make the
+		 * excess inaccessible.
+ */
+ if (limp->rlim_cur != oldssiz.rlim_cur) {
+ vm_offset_t addr;
+ vm_size_t size;
+ vm_prot_t prot;
+
+ if (limp->rlim_cur > oldssiz.rlim_cur) {
+ prot = p->p_sysent->sv_stackprot;
+ size = limp->rlim_cur - oldssiz.rlim_cur;
+ addr = p->p_sysent->sv_usrstack -
+ limp->rlim_cur;
+ } else {
+ prot = VM_PROT_NONE;
+ size = oldssiz.rlim_cur - limp->rlim_cur;
+ addr = p->p_sysent->sv_usrstack -
+ oldssiz.rlim_cur;
+ }
+ addr = trunc_page(addr);
+ size = round_page(size);
+ (void)vm_map_protect(&p->p_vmspace->vm_map,
+ addr, addr + size, prot, FALSE);
+ }
+ }
+
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct __getrlimit_args {
+ u_int which;
+ struct rlimit *rlp;
+};
+#endif
+/* ARGSUSED */
+int
+sys_getrlimit(td, uap)
+ struct thread *td;
+ register struct __getrlimit_args *uap;
+{
+ struct rlimit rlim;
+ struct proc *p;
+ int error;
+
+ if (uap->which >= RLIM_NLIMITS)
+ return (EINVAL);
+ p = td->td_proc;
+ PROC_LOCK(p);
+ lim_rlimit(p, uap->which, &rlim);
+ PROC_UNLOCK(p);
+ error = copyout(&rlim, uap->rlp, sizeof(struct rlimit));
+ return (error);
+}
+
+/*
+ * Transform the running time and tick information for children of proc p
+ * into user and system time usage.
+ */
+void
+calccru(p, up, sp)
+ struct proc *p;
+ struct timeval *up;
+ struct timeval *sp;
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ calcru1(p, &p->p_crux, up, sp);
+}
+
+/*
+ * Transform the running time and tick information in proc p into user
+ * and system time usage. If appropriate, include the current time slice
+ * on this CPU.
+ */
+void
+calcru(struct proc *p, struct timeval *up, struct timeval *sp)
+{
+ struct thread *td;
+ uint64_t runtime, u;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ /*
+ * If we are getting stats for the current process, then add in the
+ * stats that this thread has accumulated in its current time slice.
+ * We reset the thread and CPU state as if we had performed a context
+ * switch right here.
+ */
+ td = curthread;
+ if (td->td_proc == p) {
+ u = cpu_ticks();
+ runtime = u - PCPU_GET(switchtime);
+ td->td_runtime += runtime;
+ td->td_incruntime += runtime;
+ PCPU_SET(switchtime, u);
+ }
+ /* Make sure the per-thread stats are current. */
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td->td_incruntime == 0)
+ continue;
+ ruxagg(p, td);
+ }
+ calcru1(p, &p->p_rux, up, sp);
+}
+
+/* Collect resource usage for a single thread. */
+void
+rufetchtd(struct thread *td, struct rusage *ru)
+{
+ struct proc *p;
+ uint64_t runtime, u;
+
+ p = td->td_proc;
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ /*
+ * If we are getting stats for the current thread, then add in the
+ * stats that this thread has accumulated in its current time slice.
+ * We reset the thread and CPU state as if we had performed a context
+ * switch right here.
+ */
+ if (td == curthread) {
+ u = cpu_ticks();
+ runtime = u - PCPU_GET(switchtime);
+ td->td_runtime += runtime;
+ td->td_incruntime += runtime;
+ PCPU_SET(switchtime, u);
+ }
+ ruxagg(p, td);
+ *ru = td->td_ru;
+ calcru1(p, &td->td_rux, &ru->ru_utime, &ru->ru_stime);
+}
+
+static void
+calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
+ struct timeval *sp)
+{
+ /* {user, system, interrupt, total} {ticks, usec}: */
+ uint64_t ut, uu, st, su, it, tt, tu;
+
+ ut = ruxp->rux_uticks;
+ st = ruxp->rux_sticks;
+ it = ruxp->rux_iticks;
+ tt = ut + st + it;
+ if (tt == 0) {
+ /* Avoid divide by zero */
+ st = 1;
+ tt = 1;
+ }
+ tu = cputick2usec(ruxp->rux_runtime);
+ if ((int64_t)tu < 0) {
+ /* XXX: this should be an assert /phk */
+ printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
+ (intmax_t)tu, p->p_pid, p->p_comm);
+ tu = ruxp->rux_tu;
+ }
+
+ if (tu >= ruxp->rux_tu) {
+ /*
+ * The normal case, time increased.
+ * Enforce monotonicity of bucketed numbers.
+ */
+ uu = (tu * ut) / tt;
+ if (uu < ruxp->rux_uu)
+ uu = ruxp->rux_uu;
+ su = (tu * st) / tt;
+ if (su < ruxp->rux_su)
+ su = ruxp->rux_su;
+ } else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) {
+ /*
+ * When we calibrate the cputicker, it is not uncommon to
+ * see the presumably fixed frequency increase slightly over
+ * time as a result of thermal stabilization and NTP
+ * discipline (of the reference clock). We therefore ignore
+ * a bit of backwards slop because we expect to catch up
+ * shortly. We use a 3 microsecond limit to catch low
+ * counts and a 1% limit for high counts.
+ */
+ uu = ruxp->rux_uu;
+ su = ruxp->rux_su;
+ tu = ruxp->rux_tu;
+ } else { /* tu < ruxp->rux_tu */
+ /*
+ * What happened here was likely that a laptop, which ran at
+ * a reduced clock frequency at boot, kicked into high gear.
+ * The wisdom of spamming this message in that case is
+ * dubious, but it might also be indicative of something
+		 * serious, so let's keep it and hope laptops can be made
+ * more truthful about their CPU speed via ACPI.
+ */
+ printf("calcru: runtime went backwards from %ju usec "
+ "to %ju usec for pid %d (%s)\n",
+ (uintmax_t)ruxp->rux_tu, (uintmax_t)tu,
+ p->p_pid, p->p_comm);
+ uu = (tu * ut) / tt;
+ su = (tu * st) / tt;
+ }
+
+ ruxp->rux_uu = uu;
+ ruxp->rux_su = su;
+ ruxp->rux_tu = tu;
+
+ up->tv_sec = uu / 1000000;
+ up->tv_usec = uu % 1000000;
+ sp->tv_sec = su / 1000000;
+ sp->tv_usec = su % 1000000;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getrusage_args {
+ int who;
+ struct rusage *rusage;
+};
+#endif
+int
+sys_getrusage(td, uap)
+ register struct thread *td;
+ register struct getrusage_args *uap;
+{
+ struct rusage ru;
+ int error;
+
+ error = kern_getrusage(td, uap->who, &ru);
+ if (error == 0)
+ error = copyout(&ru, uap->rusage, sizeof(struct rusage));
+ return (error);
+}
+
+int
+kern_getrusage(struct thread *td, int who, struct rusage *rup)
+{
+ struct proc *p;
+ int error;
+
+ error = 0;
+ p = td->td_proc;
+ PROC_LOCK(p);
+ switch (who) {
+ case RUSAGE_SELF:
+ rufetchcalc(p, rup, &rup->ru_utime,
+ &rup->ru_stime);
+ break;
+
+ case RUSAGE_CHILDREN:
+ *rup = p->p_stats->p_cru;
+ calccru(p, &rup->ru_utime, &rup->ru_stime);
+ break;
+
+ case RUSAGE_THREAD:
+ PROC_SLOCK(p);
+ thread_lock(td);
+ rufetchtd(td, rup);
+ thread_unlock(td);
+ PROC_SUNLOCK(p);
+ break;
+
+ default:
+ error = EINVAL;
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+void
+rucollect(struct rusage *ru, struct rusage *ru2)
+{
+ long *ip, *ip2;
+ int i;
+
+ if (ru->ru_maxrss < ru2->ru_maxrss)
+ ru->ru_maxrss = ru2->ru_maxrss;
+ ip = &ru->ru_first;
+ ip2 = &ru2->ru_first;
+ for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
+ *ip++ += *ip2++;
+}
+
+void
+ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2,
+ struct rusage_ext *rux2)
+{
+
+ rux->rux_runtime += rux2->rux_runtime;
+ rux->rux_uticks += rux2->rux_uticks;
+ rux->rux_sticks += rux2->rux_sticks;
+ rux->rux_iticks += rux2->rux_iticks;
+ rux->rux_uu += rux2->rux_uu;
+ rux->rux_su += rux2->rux_su;
+ rux->rux_tu += rux2->rux_tu;
+ rucollect(ru, ru2);
+}
+
+/*
+ * Aggregate tick counts into the proc's rusage_ext.
+ */
+static void
+ruxagg_locked(struct rusage_ext *rux, struct thread *td)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED);
+ rux->rux_runtime += td->td_incruntime;
+ rux->rux_uticks += td->td_uticks;
+ rux->rux_sticks += td->td_sticks;
+ rux->rux_iticks += td->td_iticks;
+}
+
+void
+ruxagg(struct proc *p, struct thread *td)
+{
+
+ thread_lock(td);
+ ruxagg_locked(&p->p_rux, td);
+ ruxagg_locked(&td->td_rux, td);
+ td->td_incruntime = 0;
+ td->td_uticks = 0;
+ td->td_iticks = 0;
+ td->td_sticks = 0;
+ thread_unlock(td);
+}
+
+/*
+ * Update the rusage_ext structure and fetch a valid aggregate rusage
+ * for proc p.
+ */
+void
+rufetch(struct proc *p, struct rusage *ru)
+{
+ struct thread *td;
+
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+
+ *ru = p->p_ru;
+ if (p->p_numthreads > 0) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ ruxagg(p, td);
+ rucollect(ru, &td->td_ru);
+ }
+ }
+}
+
+/*
+ * Atomically perform a rufetch and a calcru together.
+ * Consumers can safely assume that calcru is executed only once
+ * rufetch has completed.
+ */
+void
+rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up,
+ struct timeval *sp)
+{
+
+ PROC_SLOCK(p);
+ rufetch(p, ru);
+ calcru(p, up, sp);
+ PROC_SUNLOCK(p);
+}
+
+/*
+ * Allocate a new resource limits structure and initialize its
+ * reference count and mutex pointer.
+ */
+struct plimit *
+lim_alloc()
+{
+ struct plimit *limp;
+
+ limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK);
+ refcount_init(&limp->pl_refcnt, 1);
+ return (limp);
+}
+
+struct plimit *
+lim_hold(limp)
+ struct plimit *limp;
+{
+
+ refcount_acquire(&limp->pl_refcnt);
+ return (limp);
+}
+
+void
+lim_fork(struct proc *p1, struct proc *p2)
+{
+
+ PROC_LOCK_ASSERT(p1, MA_OWNED);
+ PROC_LOCK_ASSERT(p2, MA_OWNED);
+
+ p2->p_limit = lim_hold(p1->p_limit);
+ callout_init_mtx(&p2->p_limco, &p2->p_mtx, 0);
+ if (p1->p_cpulimit != RLIM_INFINITY)
+ callout_reset_sbt(&p2->p_limco, SBT_1S, 0,
+ lim_cb, p2, C_PREL(1));
+}
+
+void
+lim_free(limp)
+ struct plimit *limp;
+{
+
+ KASSERT(limp->pl_refcnt > 0, ("plimit refcnt underflow"));
+ if (refcount_release(&limp->pl_refcnt))
+ free((void *)limp, M_PLIMIT);
+}
+
+/*
+ * Make a copy of the plimit structure.
+ * We share these structures copy-on-write after fork.
+ */
+void
+lim_copy(dst, src)
+ struct plimit *dst, *src;
+{
+
+ KASSERT(dst->pl_refcnt == 1, ("lim_copy to shared limit"));
+ bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit));
+}
+
+/*
+ * Return the hard limit for a particular system resource. The
+ * which parameter specifies the index into the rlimit array.
+ */
+rlim_t
+lim_max(struct proc *p, int which)
+{
+ struct rlimit rl;
+
+ lim_rlimit(p, which, &rl);
+ return (rl.rlim_max);
+}
+
+/*
+ * Return the current (soft) limit for a particular system resource.
+ * The which parameter specifies the index into the rlimit array.
+ */
+rlim_t
+lim_cur(struct proc *p, int which)
+{
+ struct rlimit rl;
+
+ lim_rlimit(p, which, &rl);
+ return (rl.rlim_cur);
+}
+
+/*
+ * Return a copy of the entire rlimit structure for the system limit
+ * specified by 'which' in the rlimit structure pointed to by 'rlp'.
+ */
+void
+lim_rlimit(struct proc *p, int which, struct rlimit *rlp)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ KASSERT(which >= 0 && which < RLIM_NLIMITS,
+ ("request for invalid resource limit"));
+ *rlp = p->p_limit->pl_rlimit[which];
+ if (p->p_sysent->sv_fixlimit != NULL)
+ p->p_sysent->sv_fixlimit(rlp, which);
+}
+
+void
+uihashinit()
+{
+
+ uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
+ rw_init(&uihashtbl_lock, "uidinfo hash");
+}
+
+/*
+ * Look up a uidinfo struct for the parameter uid.
+ * uihashtbl_lock must be locked.
+ */
+static struct uidinfo *
+uilookup(uid)
+ uid_t uid;
+{
+ struct uihashhead *uipp;
+ struct uidinfo *uip;
+
+ rw_assert(&uihashtbl_lock, RA_LOCKED);
+ uipp = UIHASH(uid);
+ LIST_FOREACH(uip, uipp, ui_hash)
+ if (uip->ui_uid == uid)
+ break;
+
+ return (uip);
+}
+
+/*
+ * Find or allocate a struct uidinfo for a particular uid.
+ * Increase refcount on uidinfo struct returned.
+ * uifree() should be called on a struct uidinfo when released.
+ */
+struct uidinfo *
+uifind(uid)
+ uid_t uid;
+{
+ struct uidinfo *old_uip, *uip;
+
+ rw_rlock(&uihashtbl_lock);
+ uip = uilookup(uid);
+ if (uip == NULL) {
+ rw_runlock(&uihashtbl_lock);
+ uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO);
+ racct_create(&uip->ui_racct);
+ rw_wlock(&uihashtbl_lock);
+ /*
+ * There's a chance someone created our uidinfo while we
+ * were in malloc and not holding the lock, so we have to
+ * make sure we don't insert a duplicate uidinfo.
+ */
+ if ((old_uip = uilookup(uid)) != NULL) {
+ /* Someone else beat us to it. */
+ racct_destroy(&uip->ui_racct);
+ free(uip, M_UIDINFO);
+ uip = old_uip;
+ } else {
+ refcount_init(&uip->ui_ref, 0);
+ uip->ui_uid = uid;
+ mtx_init(&uip->ui_vmsize_mtx, "ui_vmsize", NULL,
+ MTX_DEF);
+ LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash);
+ }
+ }
+ uihold(uip);
+ rw_unlock(&uihashtbl_lock);
+ return (uip);
+}
+
+/*
+ * Place another refcount on a uidinfo struct.
+ */
+void
+uihold(uip)
+ struct uidinfo *uip;
+{
+
+ refcount_acquire(&uip->ui_ref);
+}
+
+/*-
+ * Since uidinfo structs have a long lifetime, we use an
+ * opportunistic refcounting scheme to avoid locking the lookup hash
+ * for each release.
+ *
+ * If the refcount hits 0, we need to free the structure,
+ * which means we need to lock the hash.
+ * Optimal case:
+ *	The count is greater than 1 and an atomic decrement succeeds,
+ *	so the lookup hash is never locked.
+ * Suboptimal case:
+ *	The count may drop to zero (or the atomic update raced), so we
+ *	take the hash lock, release the reference under it and free the
+ *	structure if ours was the last reference.
+ */
+void
+uifree(uip)
+ struct uidinfo *uip;
+{
+ int old;
+
+ /* Prepare for optimal case. */
+ old = uip->ui_ref;
+ if (old > 1 && atomic_cmpset_int(&uip->ui_ref, old, old - 1))
+ return;
+
+ /* Prepare for suboptimal case. */
+ rw_wlock(&uihashtbl_lock);
+ if (refcount_release(&uip->ui_ref)) {
+ racct_destroy(&uip->ui_racct);
+ LIST_REMOVE(uip, ui_hash);
+ rw_wunlock(&uihashtbl_lock);
+ if (uip->ui_sbsize != 0)
+ printf("freeing uidinfo: uid = %d, sbsize = %ld\n",
+ uip->ui_uid, uip->ui_sbsize);
+ if (uip->ui_proccnt != 0)
+ printf("freeing uidinfo: uid = %d, proccnt = %ld\n",
+ uip->ui_uid, uip->ui_proccnt);
+ if (uip->ui_vmsize != 0)
+ printf("freeing uidinfo: uid = %d, swapuse = %lld\n",
+ uip->ui_uid, (unsigned long long)uip->ui_vmsize);
+ mtx_destroy(&uip->ui_vmsize_mtx);
+ free(uip, M_UIDINFO);
+ return;
+ }
+ /*
+ * Someone added a reference between atomic_cmpset_int() and
+ * rw_wlock(&uihashtbl_lock).
+ */
+ rw_wunlock(&uihashtbl_lock);
+}
+
+void
+ui_racct_foreach(void (*callback)(struct racct *racct,
+ void *arg2, void *arg3), void *arg2, void *arg3)
+{
+ struct uidinfo *uip;
+ struct uihashhead *uih;
+
+ rw_rlock(&uihashtbl_lock);
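+	/*
+	 * Walk every hash chain, from the last bucket down to the first,
+	 * and hand each uidinfo's racct to the callback.
+	 */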
+ for (uih = &uihashtbl[uihash]; uih >= uihashtbl; uih--) {
+ LIST_FOREACH(uip, uih, ui_hash) {
+ (callback)(uip->ui_racct, arg2, arg3);
+ }
+ }
+ rw_runlock(&uihashtbl_lock);
+}
+
+/*
+ * Change the count associated with the number of processes
+ * a given user is using. When 'max' is 0, don't enforce a limit.
+ */
+int
+chgproccnt(uip, diff, max)
+ struct uidinfo *uip;
+ int diff;
+ rlim_t max;
+{
+
+ /* Don't allow them to exceed max, but allow subtraction. */
+ if (diff > 0 && max != 0) {
+ if (atomic_fetchadd_long(&uip->ui_proccnt, (long)diff) + diff > max) {
+ atomic_subtract_long(&uip->ui_proccnt, (long)diff);
+ return (0);
+ }
+ } else {
+ atomic_add_long(&uip->ui_proccnt, (long)diff);
+ if (uip->ui_proccnt < 0)
+ printf("negative proccnt for uid = %d\n", uip->ui_uid);
+ }
+ return (1);
+}
+
+/*
+ * Change the total socket buffer size a user has used.
+ */
+int
+chgsbsize(uip, hiwat, to, max)
+ struct uidinfo *uip;
+ u_int *hiwat;
+ u_int to;
+ rlim_t max;
+{
+ int diff;
+
+ diff = to - *hiwat;
+ if (diff > 0) {
+ if (atomic_fetchadd_long(&uip->ui_sbsize, (long)diff) + diff > max) {
+ atomic_subtract_long(&uip->ui_sbsize, (long)diff);
+ return (0);
+ }
+ } else {
+ atomic_add_long(&uip->ui_sbsize, (long)diff);
+ if (uip->ui_sbsize < 0)
+ printf("negative sbsize for uid = %d\n", uip->ui_uid);
+ }
+ *hiwat = to;
+ return (1);
+}
+
+/*
+ * Change the count associated with the number of pseudo-terminals
+ * a given user is using. When 'max' is 0, don't enforce a limit.
+ */
+int
+chgptscnt(uip, diff, max)
+ struct uidinfo *uip;
+ int diff;
+ rlim_t max;
+{
+
+ /* Don't allow them to exceed max, but allow subtraction. */
+ if (diff > 0 && max != 0) {
+ if (atomic_fetchadd_long(&uip->ui_ptscnt, (long)diff) + diff > max) {
+ atomic_subtract_long(&uip->ui_ptscnt, (long)diff);
+ return (0);
+ }
+ } else {
+ atomic_add_long(&uip->ui_ptscnt, (long)diff);
+ if (uip->ui_ptscnt < 0)
+ printf("negative ptscnt for uid = %d\n", uip->ui_uid);
+ }
+ return (1);
+}
diff --git a/sys/kern/kern_rmlock.c b/sys/kern/kern_rmlock.c
new file mode 100644
index 0000000..ff397eb
--- /dev/null
+++ b/sys/kern/kern_rmlock.c
@@ -0,0 +1,831 @@
+/*-
+ * Copyright (c) 2007 Stephan Uphoff <ups@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Machine independent bits of reader/writer lock implementation.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_kdtrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <sys/kernel.h>
+#include <sys/kdb.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rmlock.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/turnstile.h>
+#include <sys/lock_profile.h>
+#include <machine/cpu.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/*
+ * A cookie to mark destroyed rmlocks. This is stored in the head of
+ * rm_activeReaders.
+ */
+#define RM_DESTROYED ((void *)0xdead)
+
+#define rm_destroyed(rm) \
+ (LIST_FIRST(&(rm)->rm_activeReaders) == RM_DESTROYED)
+
+#define RMPF_ONQUEUE 1
+#define RMPF_SIGNAL 2
+
+#ifndef INVARIANTS
+#define _rm_assert(c, what, file, line)
+#endif
+
+static void assert_rm(const struct lock_object *lock, int what);
+#ifdef DDB
+static void db_show_rm(const struct lock_object *lock);
+#endif
+static void lock_rm(struct lock_object *lock, int how);
+#ifdef KDTRACE_HOOKS
+static int owner_rm(const struct lock_object *lock, struct thread **owner);
+#endif
+static int unlock_rm(struct lock_object *lock);
+
+struct lock_class lock_class_rm = {
+ .lc_name = "rm",
+ .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE,
+ .lc_assert = assert_rm,
+#ifdef DDB
+ .lc_ddb_show = db_show_rm,
+#endif
+ .lc_lock = lock_rm,
+ .lc_unlock = unlock_rm,
+#ifdef KDTRACE_HOOKS
+ .lc_owner = owner_rm,
+#endif
+};
+
+struct lock_class lock_class_rm_sleepable = {
+ .lc_name = "sleepable rm",
+ .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE,
+ .lc_assert = assert_rm,
+#ifdef DDB
+ .lc_ddb_show = db_show_rm,
+#endif
+ .lc_lock = lock_rm,
+ .lc_unlock = unlock_rm,
+#ifdef KDTRACE_HOOKS
+ .lc_owner = owner_rm,
+#endif
+};
+
+static void
+assert_rm(const struct lock_object *lock, int what)
+{
+
+ rm_assert((const struct rmlock *)lock, what);
+}
+
+/*
+ * These do not support read locks because it would be hard to make
+ * the tracker work correctly with the current lock_class API, as you
+ * would need to have the tracker pointer available when calling
+ * rm_rlock() in lock_rm().
+ */
+static void
+lock_rm(struct lock_object *lock, int how)
+{
+ struct rmlock *rm;
+
+ rm = (struct rmlock *)lock;
+ if (how)
+ rm_wlock(rm);
+#ifdef INVARIANTS
+ else
+ panic("lock_rm called in read mode");
+#endif
+}
+
+static int
+unlock_rm(struct lock_object *lock)
+{
+ struct rmlock *rm;
+
+ rm = (struct rmlock *)lock;
+ rm_wunlock(rm);
+ return (1);
+}
+
+#ifdef KDTRACE_HOOKS
+static int
+owner_rm(const struct lock_object *lock, struct thread **owner)
+{
+ const struct rmlock *rm;
+ struct lock_class *lc;
+
+ rm = (const struct rmlock *)lock;
+ lc = LOCK_CLASS(&rm->rm_wlock_object);
+ return (lc->lc_owner(&rm->rm_wlock_object, owner));
+}
+#endif
+
+static struct mtx rm_spinlock;
+
+MTX_SYSINIT(rm_spinlock, &rm_spinlock, "rm_spinlock", MTX_SPIN);
+
+/*
+ * Add or remove tracker from per-cpu list.
+ *
+ * The per-cpu list can be traversed at any time in forward direction from an
+ * interrupt on the *local* cpu.
+ */
+static void inline
+rm_tracker_add(struct pcpu *pc, struct rm_priotracker *tracker)
+{
+ struct rm_queue *next;
+
+ /* Initialize all tracker pointers */
+ tracker->rmp_cpuQueue.rmq_prev = &pc->pc_rm_queue;
+ next = pc->pc_rm_queue.rmq_next;
+ tracker->rmp_cpuQueue.rmq_next = next;
+
+	/* rmq_prev is not used during forward traversal. */
+ next->rmq_prev = &tracker->rmp_cpuQueue;
+
+ /* Update pointer to first element. */
+ pc->pc_rm_queue.rmq_next = &tracker->rmp_cpuQueue;
+}
+
+/*
+ * Return a count of the number of trackers the thread 'td' already
+ * has on this CPU for the lock 'rm'.
+ */
+static int
+rm_trackers_present(const struct pcpu *pc, const struct rmlock *rm,
+ const struct thread *td)
+{
+ struct rm_queue *queue;
+ struct rm_priotracker *tracker;
+ int count;
+
+ count = 0;
+ for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue;
+ queue = queue->rmq_next) {
+ tracker = (struct rm_priotracker *)queue;
+ if ((tracker->rmp_rmlock == rm) && (tracker->rmp_thread == td))
+ count++;
+ }
+ return (count);
+}
+
+static void inline
+rm_tracker_remove(struct pcpu *pc, struct rm_priotracker *tracker)
+{
+ struct rm_queue *next, *prev;
+
+ next = tracker->rmp_cpuQueue.rmq_next;
+ prev = tracker->rmp_cpuQueue.rmq_prev;
+
+ /* Not used during forward traversal. */
+ next->rmq_prev = prev;
+
+ /* Remove from list. */
+ prev->rmq_next = next;
+}
+
+static void
+rm_cleanIPI(void *arg)
+{
+ struct pcpu *pc;
+ struct rmlock *rm = arg;
+ struct rm_priotracker *tracker;
+ struct rm_queue *queue;
+
+	pc = pcpu_find(curcpu);
+
+ for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue;
+ queue = queue->rmq_next) {
+ tracker = (struct rm_priotracker *)queue;
+ if (tracker->rmp_rmlock == rm && tracker->rmp_flags == 0) {
+ tracker->rmp_flags = RMPF_ONQUEUE;
+ mtx_lock_spin(&rm_spinlock);
+ LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker,
+ rmp_qentry);
+ mtx_unlock_spin(&rm_spinlock);
+ }
+ }
+}
+
+void
+rm_init_flags(struct rmlock *rm, const char *name, int opts)
+{
+ struct lock_class *lc;
+ int liflags;
+
+ liflags = 0;
+ if (!(opts & RM_NOWITNESS))
+ liflags |= LO_WITNESS;
+ if (opts & RM_RECURSE)
+ liflags |= LO_RECURSABLE;
+ rm->rm_writecpus = all_cpus;
+ LIST_INIT(&rm->rm_activeReaders);
+ if (opts & RM_SLEEPABLE) {
+ liflags |= LO_SLEEPABLE;
+ lc = &lock_class_rm_sleepable;
+ sx_init_flags(&rm->rm_lock_sx, "rmlock_sx", SX_NOWITNESS);
+ } else {
+ lc = &lock_class_rm;
+ mtx_init(&rm->rm_lock_mtx, name, "rmlock_mtx", MTX_NOWITNESS);
+ }
+ lock_init(&rm->lock_object, lc, name, NULL, liflags);
+}
+
+void
+rm_init(struct rmlock *rm, const char *name)
+{
+
+ rm_init_flags(rm, name, 0);
+}
+
+void
+rm_destroy(struct rmlock *rm)
+{
+
+ rm_assert(rm, RA_UNLOCKED);
+ LIST_FIRST(&rm->rm_activeReaders) = RM_DESTROYED;
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE)
+ sx_destroy(&rm->rm_lock_sx);
+ else
+ mtx_destroy(&rm->rm_lock_mtx);
+ lock_destroy(&rm->lock_object);
+}
+
+int
+rm_wowned(const struct rmlock *rm)
+{
+
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE)
+ return (sx_xlocked(&rm->rm_lock_sx));
+ else
+ return (mtx_owned(&rm->rm_lock_mtx));
+}
+
+void
+rm_sysinit(void *arg)
+{
+ struct rm_args *args = arg;
+
+ rm_init(args->ra_rm, args->ra_desc);
+}
+
+void
+rm_sysinit_flags(void *arg)
+{
+ struct rm_args_flags *args = arg;
+
+ rm_init_flags(args->ra_rm, args->ra_desc, args->ra_opts);
+}
+
+static int
+_rm_rlock_hard(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
+{
+ struct pcpu *pc;
+
+ critical_enter();
+ pc = pcpu_find(curcpu);
+
+ /* Check if we just need to do a proper critical_exit. */
+ if (!CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)) {
+ critical_exit();
+ return (1);
+ }
+
+ /* Remove our tracker from the per-cpu list. */
+ rm_tracker_remove(pc, tracker);
+
+ /* Check to see if the IPI granted us the lock after all. */
+ if (tracker->rmp_flags) {
+ /* Just add back tracker - we hold the lock. */
+ rm_tracker_add(pc, tracker);
+ critical_exit();
+ return (1);
+ }
+
+ /*
+	 * We allow readers to acquire a lock even if a writer is blocked,
+	 * provided the lock is recursive and the reader already holds it.
+ */
+ if ((rm->lock_object.lo_flags & LO_RECURSABLE) != 0) {
+ /*
+ * Just grant the lock if this thread already has a tracker
+ * for this lock on the per-cpu queue.
+ */
+ if (rm_trackers_present(pc, rm, curthread) != 0) {
+ mtx_lock_spin(&rm_spinlock);
+ LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker,
+ rmp_qentry);
+ tracker->rmp_flags = RMPF_ONQUEUE;
+ mtx_unlock_spin(&rm_spinlock);
+ rm_tracker_add(pc, tracker);
+ critical_exit();
+ return (1);
+ }
+ }
+
+ sched_unpin();
+ critical_exit();
+
+ if (trylock) {
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE) {
+ if (!sx_try_xlock(&rm->rm_lock_sx))
+ return (0);
+ } else {
+ if (!mtx_trylock(&rm->rm_lock_mtx))
+ return (0);
+ }
+ } else {
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE)
+ sx_xlock(&rm->rm_lock_sx);
+ else
+ mtx_lock(&rm->rm_lock_mtx);
+ }
+
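+	/*
+	 * Holding the backing write lock guarantees that no writer is
+	 * active; take the read token for this CPU, register our
+	 * tracker and then drop the write lock again.
+	 */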
+ critical_enter();
+ pc = pcpu_find(curcpu);
+ CPU_CLR(pc->pc_cpuid, &rm->rm_writecpus);
+ rm_tracker_add(pc, tracker);
+ sched_pin();
+ critical_exit();
+
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE)
+ sx_xunlock(&rm->rm_lock_sx);
+ else
+ mtx_unlock(&rm->rm_lock_mtx);
+
+ return (1);
+}
+
+int
+_rm_rlock(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
+{
+ struct thread *td = curthread;
+ struct pcpu *pc;
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+ tracker->rmp_flags = 0;
+ tracker->rmp_thread = td;
+ tracker->rmp_rmlock = rm;
+
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE)
+ THREAD_NO_SLEEPING();
+
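+	/*
+	 * Enter an open-coded critical section (see the critical_enter()
+	 * annotations) so the tracker is published on the per-cpu list
+	 * without any risk of migrating to another CPU in between.
+	 */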
+ td->td_critnest++; /* critical_enter(); */
+
+ __compiler_membar();
+
+ pc = cpuid_to_pcpu[td->td_oncpu]; /* pcpu_find(td->td_oncpu); */
+
+ rm_tracker_add(pc, tracker);
+
+ sched_pin();
+
+ __compiler_membar();
+
+ td->td_critnest--;
+
+ /*
+ * Fast path to combine two common conditions into a single
+ * conditional jump.
+ */
+ if (0 == (td->td_owepreempt |
+ CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)))
+ return (1);
+
+ /* We do not have a read token and need to acquire one. */
+	return (_rm_rlock_hard(rm, tracker, trylock));
+}
+
+static void
+_rm_unlock_hard(struct thread *td, struct rm_priotracker *tracker)
+{
+
+ if (td->td_owepreempt) {
+ td->td_critnest++;
+ critical_exit();
+ }
+
+ if (!tracker->rmp_flags)
+ return;
+
+ mtx_lock_spin(&rm_spinlock);
+ LIST_REMOVE(tracker, rmp_qentry);
+
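+	/*
+	 * If a writer is waiting on us, wake it up through the
+	 * turnstile.
+	 */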
+ if (tracker->rmp_flags & RMPF_SIGNAL) {
+ struct rmlock *rm;
+ struct turnstile *ts;
+
+ rm = tracker->rmp_rmlock;
+
+ turnstile_chain_lock(&rm->lock_object);
+ mtx_unlock_spin(&rm_spinlock);
+
+ ts = turnstile_lookup(&rm->lock_object);
+
+ turnstile_signal(ts, TS_EXCLUSIVE_QUEUE);
+ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
+ turnstile_chain_unlock(&rm->lock_object);
+ } else
+ mtx_unlock_spin(&rm_spinlock);
+}
+
+void
+_rm_runlock(struct rmlock *rm, struct rm_priotracker *tracker)
+{
+ struct pcpu *pc;
+ struct thread *td = tracker->rmp_thread;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ td->td_critnest++; /* critical_enter(); */
+ pc = cpuid_to_pcpu[td->td_oncpu]; /* pcpu_find(td->td_oncpu); */
+ rm_tracker_remove(pc, tracker);
+ td->td_critnest--;
+ sched_unpin();
+
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE)
+ THREAD_SLEEPING_OK();
+
+ if (0 == (td->td_owepreempt | tracker->rmp_flags))
+ return;
+
+ _rm_unlock_hard(td, tracker);
+}
+
+void
+_rm_wlock(struct rmlock *rm)
+{
+ struct rm_priotracker *prio;
+ struct turnstile *ts;
+ cpuset_t readcpus;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE)
+ sx_xlock(&rm->rm_lock_sx);
+ else
+ mtx_lock(&rm->rm_lock_mtx);
+
+ if (CPU_CMP(&rm->rm_writecpus, &all_cpus)) {
+ /* Get all read tokens back */
+ readcpus = all_cpus;
+ CPU_NAND(&readcpus, &rm->rm_writecpus);
+ rm->rm_writecpus = all_cpus;
+
+ /*
+ * Assumes rm->rm_writecpus update is visible on other CPUs
+ * before rm_cleanIPI is called.
+ */
+#ifdef SMP
+ smp_rendezvous_cpus(readcpus,
+ smp_no_rendevous_barrier,
+ rm_cleanIPI,
+ smp_no_rendevous_barrier,
+ rm);
+
+#else
+ rm_cleanIPI(rm);
+#endif
+
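+		/*
+		 * Block until every reader flagged by rm_cleanIPI has
+		 * released its read lock and signalled us through the
+		 * turnstile.
+		 */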
+ mtx_lock_spin(&rm_spinlock);
+ while ((prio = LIST_FIRST(&rm->rm_activeReaders)) != NULL) {
+ ts = turnstile_trywait(&rm->lock_object);
+ prio->rmp_flags = RMPF_ONQUEUE | RMPF_SIGNAL;
+ mtx_unlock_spin(&rm_spinlock);
+ turnstile_wait(ts, prio->rmp_thread,
+ TS_EXCLUSIVE_QUEUE);
+ mtx_lock_spin(&rm_spinlock);
+ }
+ mtx_unlock_spin(&rm_spinlock);
+ }
+}
+
+void
+_rm_wunlock(struct rmlock *rm)
+{
+
+ if (rm->lock_object.lo_flags & LO_SLEEPABLE)
+ sx_xunlock(&rm->rm_lock_sx);
+ else
+ mtx_unlock(&rm->rm_lock_mtx);
+}
+
+#ifdef LOCK_DEBUG
+
+void
+_rm_wlock_debug(struct rmlock *rm, const char *file, int line)
+{
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("rm_wlock() by idle thread %p on rmlock %s @ %s:%d",
+ curthread, rm->lock_object.lo_name, file, line));
+ KASSERT(!rm_destroyed(rm),
+ ("rm_wlock() of destroyed rmlock @ %s:%d", file, line));
+ _rm_assert(rm, RA_UNLOCKED, file, line);
+
+ WITNESS_CHECKORDER(&rm->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE,
+ file, line, NULL);
+
+ _rm_wlock(rm);
+
+ LOCK_LOG_LOCK("RMWLOCK", &rm->lock_object, 0, 0, file, line);
+
+ WITNESS_LOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line);
+
+ curthread->td_locks++;
+}
+
+void
+_rm_wunlock_debug(struct rmlock *rm, const char *file, int line)
+{
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ KASSERT(!rm_destroyed(rm),
+ ("rm_wunlock() of destroyed rmlock @ %s:%d", file, line));
+ _rm_assert(rm, RA_WLOCKED, file, line);
+ WITNESS_UNLOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("RMWUNLOCK", &rm->lock_object, 0, 0, file, line);
+ _rm_wunlock(rm);
+ curthread->td_locks--;
+}
+
+int
+_rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
+ int trylock, const char *file, int line)
+{
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+#ifdef INVARIANTS
+ if (!(rm->lock_object.lo_flags & LO_RECURSABLE) && !trylock) {
+ critical_enter();
+ KASSERT(rm_trackers_present(pcpu_find(curcpu), rm,
+ curthread) == 0,
+ ("rm_rlock: recursed on non-recursive rmlock %s @ %s:%d\n",
+ rm->lock_object.lo_name, file, line));
+ critical_exit();
+ }
+#endif
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("rm_rlock() by idle thread %p on rmlock %s @ %s:%d",
+ curthread, rm->lock_object.lo_name, file, line));
+ KASSERT(!rm_destroyed(rm),
+ ("rm_rlock() of destroyed rmlock @ %s:%d", file, line));
+ if (!trylock) {
+ KASSERT(!rm_wowned(rm),
+ ("rm_rlock: wlock already held for %s @ %s:%d",
+ rm->lock_object.lo_name, file, line));
+ WITNESS_CHECKORDER(&rm->lock_object, LOP_NEWORDER, file, line,
+ NULL);
+ }
+
+ if (_rm_rlock(rm, tracker, trylock)) {
+ if (trylock)
+ LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 1, file,
+ line);
+ else
+ LOCK_LOG_LOCK("RMRLOCK", &rm->lock_object, 0, 0, file,
+ line);
+ WITNESS_LOCK(&rm->lock_object, 0, file, line);
+
+ curthread->td_locks++;
+
+ return (1);
+ } else if (trylock)
+ LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 0, file, line);
+
+ return (0);
+}
+
+void
+_rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
+ const char *file, int line)
+{
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ KASSERT(!rm_destroyed(rm),
+ ("rm_runlock() of destroyed rmlock @ %s:%d", file, line));
+ _rm_assert(rm, RA_RLOCKED, file, line);
+ WITNESS_UNLOCK(&rm->lock_object, 0, file, line);
+ LOCK_LOG_LOCK("RMRUNLOCK", &rm->lock_object, 0, 0, file, line);
+ _rm_runlock(rm, tracker);
+ curthread->td_locks--;
+}
+
+#else
+
+/*
+ * Just strip out file and line arguments if no lock debugging is enabled in
+ * the kernel - we are called from a kernel module.
+ */
+void
+_rm_wlock_debug(struct rmlock *rm, const char *file, int line)
+{
+
+ _rm_wlock(rm);
+}
+
+void
+_rm_wunlock_debug(struct rmlock *rm, const char *file, int line)
+{
+
+ _rm_wunlock(rm);
+}
+
+int
+_rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
+ int trylock, const char *file, int line)
+{
+
+	return (_rm_rlock(rm, tracker, trylock));
+}
+
+void
+_rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
+ const char *file, int line)
+{
+
+ _rm_runlock(rm, tracker);
+}
+
+#endif
+
+#ifdef INVARIANT_SUPPORT
+#ifndef INVARIANTS
+#undef _rm_assert
+#endif
+
+/*
+ * Note that this does not need to use witness_assert() for read lock
+ * assertions since an exact count of read locks held by this thread
+ * is computable.
+ */
+void
+_rm_assert(const struct rmlock *rm, int what, const char *file, int line)
+{
+ int count;
+
+ if (panicstr != NULL)
+ return;
+ switch (what) {
+ case RA_LOCKED:
+ case RA_LOCKED | RA_RECURSED:
+ case RA_LOCKED | RA_NOTRECURSED:
+ case RA_RLOCKED:
+ case RA_RLOCKED | RA_RECURSED:
+ case RA_RLOCKED | RA_NOTRECURSED:
+ /*
+ * Handle the write-locked case. Unlike other
+ * primitives, writers can never recurse.
+ */
+ if (rm_wowned(rm)) {
+ if (what & RA_RLOCKED)
+ panic("Lock %s exclusively locked @ %s:%d\n",
+ rm->lock_object.lo_name, file, line);
+ if (what & RA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ rm->lock_object.lo_name, file, line);
+ break;
+ }
+
+ critical_enter();
+ count = rm_trackers_present(pcpu_find(curcpu), rm, curthread);
+ critical_exit();
+
+ if (count == 0)
+ panic("Lock %s not %slocked @ %s:%d\n",
+ rm->lock_object.lo_name, (what & RA_RLOCKED) ?
+ "read " : "", file, line);
+ if (count > 1) {
+ if (what & RA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ rm->lock_object.lo_name, file, line);
+ } else if (what & RA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ rm->lock_object.lo_name, file, line);
+ break;
+ case RA_WLOCKED:
+ if (!rm_wowned(rm))
+ panic("Lock %s not exclusively locked @ %s:%d\n",
+ rm->lock_object.lo_name, file, line);
+ break;
+ case RA_UNLOCKED:
+ if (rm_wowned(rm))
+ panic("Lock %s exclusively locked @ %s:%d\n",
+ rm->lock_object.lo_name, file, line);
+
+ critical_enter();
+ count = rm_trackers_present(pcpu_find(curcpu), rm, curthread);
+ critical_exit();
+
+ if (count != 0)
+ panic("Lock %s read locked @ %s:%d\n",
+ rm->lock_object.lo_name, file, line);
+ break;
+ default:
+ panic("Unknown rm lock assertion: %d @ %s:%d", what, file,
+ line);
+ }
+}
+#endif /* INVARIANT_SUPPORT */
+
+#ifdef DDB
+static void
+print_tracker(struct rm_priotracker *tr)
+{
+ struct thread *td;
+
+ td = tr->rmp_thread;
+ db_printf(" thread %p (tid %d, pid %d, \"%s\") {", td, td->td_tid,
+ td->td_proc->p_pid, td->td_name);
+ if (tr->rmp_flags & RMPF_ONQUEUE) {
+ db_printf("ONQUEUE");
+ if (tr->rmp_flags & RMPF_SIGNAL)
+ db_printf(",SIGNAL");
+ } else
+ db_printf("0");
+ db_printf("}\n");
+}
+
+static void
+db_show_rm(const struct lock_object *lock)
+{
+ struct rm_priotracker *tr;
+ struct rm_queue *queue;
+ const struct rmlock *rm;
+ struct lock_class *lc;
+ struct pcpu *pc;
+
+ rm = (const struct rmlock *)lock;
+ db_printf(" writecpus: ");
+ ddb_display_cpuset(__DEQUALIFY(const cpuset_t *, &rm->rm_writecpus));
+ db_printf("\n");
+ db_printf(" per-CPU readers:\n");
+ STAILQ_FOREACH(pc, &cpuhead, pc_allcpu)
+ for (queue = pc->pc_rm_queue.rmq_next;
+ queue != &pc->pc_rm_queue; queue = queue->rmq_next) {
+ tr = (struct rm_priotracker *)queue;
+ if (tr->rmp_rmlock == rm)
+ print_tracker(tr);
+ }
+ db_printf(" active readers:\n");
+ LIST_FOREACH(tr, &rm->rm_activeReaders, rmp_qentry)
+ print_tracker(tr);
+ lc = LOCK_CLASS(&rm->rm_wlock_object);
+ db_printf("Backing write-lock (%s):\n", lc->lc_name);
+ lc->lc_ddb_show(&rm->rm_wlock_object);
+}
+#endif
diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c
new file mode 100644
index 0000000..bd40704
--- /dev/null
+++ b/sys/kern/kern_rwlock.c
@@ -0,0 +1,1232 @@
+/*-
+ * Copyright (c) 2006 John Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Machine independent bits of reader/writer lock implementation.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
+#include "opt_no_adaptive_rwlocks.h"
+
+#include <sys/param.h>
+#include <sys/kdb.h>
+#include <sys/ktr.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/turnstile.h>
+
+#include <machine/cpu.h>
+
+#if defined(SMP) && !defined(NO_ADAPTIVE_RWLOCKS)
+#define ADAPTIVE_RWLOCKS
+#endif
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+PMC_SOFT_DECLARE( , , lock, failed);
+#endif
+
+/*
+ * Return the rwlock address when the lock cookie address is provided.
+ * This functionality assumes that struct rwlock has a member named rw_lock.
+ */
+#define rwlock2rw(c) (__containerof(c, struct rwlock, rw_lock))
+
+#ifdef ADAPTIVE_RWLOCKS
+static int rowner_retries = 10;
+static int rowner_loops = 10000;
+static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL,
+ "rwlock debugging");
+SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
+SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, "");
+#endif
+
+#ifdef DDB
+#include <ddb/ddb.h>
+
+static void db_show_rwlock(const struct lock_object *lock);
+#endif
+static void assert_rw(const struct lock_object *lock, int what);
+static void lock_rw(struct lock_object *lock, int how);
+#ifdef KDTRACE_HOOKS
+static int owner_rw(const struct lock_object *lock, struct thread **owner);
+#endif
+static int unlock_rw(struct lock_object *lock);
+
+struct lock_class lock_class_rw = {
+ .lc_name = "rw",
+ .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_UPGRADABLE,
+ .lc_assert = assert_rw,
+#ifdef DDB
+ .lc_ddb_show = db_show_rwlock,
+#endif
+ .lc_lock = lock_rw,
+ .lc_unlock = unlock_rw,
+#ifdef KDTRACE_HOOKS
+ .lc_owner = owner_rw,
+#endif
+};
+
+/*
+ * Return a pointer to the owning thread if the lock is write-locked or
+ * NULL if the lock is unlocked or read-locked.
+ */
+#define rw_wowner(rw) \
+ ((rw)->rw_lock & RW_LOCK_READ ? NULL : \
+ (struct thread *)RW_OWNER((rw)->rw_lock))
+
+/*
+ * Returns whether the write owner is recursed. Write ownership is not
+ * assured here and should be checked beforehand.
+ */
+#define rw_recursed(rw) ((rw)->rw_recurse != 0)
+
+/*
+ * Return true if curthread holds the lock.
+ */
+#define rw_wlocked(rw) (rw_wowner((rw)) == curthread)
+
+/*
+ * Return a pointer to the owning thread for this lock who should receive
+ * any priority lent by threads that block on this lock. Currently this
+ * is identical to rw_wowner().
+ */
+#define rw_owner(rw) rw_wowner(rw)
+
+#ifndef INVARIANTS
+#define __rw_assert(c, what, file, line)
+#endif
+
+void
+assert_rw(const struct lock_object *lock, int what)
+{
+
+ rw_assert((const struct rwlock *)lock, what);
+}
+
+void
+lock_rw(struct lock_object *lock, int how)
+{
+ struct rwlock *rw;
+
+ rw = (struct rwlock *)lock;
+ if (how)
+ rw_wlock(rw);
+ else
+ rw_rlock(rw);
+}
+
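+/*
+ * Release the lock on behalf of a sleep primitive and report how it
+ * was held so that lock_rw() can later reacquire it the same way.
+ */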
+int
+unlock_rw(struct lock_object *lock)
+{
+ struct rwlock *rw;
+
+ rw = (struct rwlock *)lock;
+ rw_assert(rw, RA_LOCKED | LA_NOTRECURSED);
+ if (rw->rw_lock & RW_LOCK_READ) {
+ rw_runlock(rw);
+ return (0);
+ } else {
+ rw_wunlock(rw);
+ return (1);
+ }
+}
+
+#ifdef KDTRACE_HOOKS
+int
+owner_rw(const struct lock_object *lock, struct thread **owner)
+{
+ const struct rwlock *rw = (const struct rwlock *)lock;
+ uintptr_t x = rw->rw_lock;
+
+ *owner = rw_wowner(rw);
+ return ((x & RW_LOCK_READ) != 0 ? (RW_READERS(x) != 0) :
+ (*owner != NULL));
+}
+#endif
+
+void
+_rw_init_flags(volatile uintptr_t *c, const char *name, int opts)
+{
+ struct rwlock *rw;
+ int flags;
+
+ rw = rwlock2rw(c);
+
+ MPASS((opts & ~(RW_DUPOK | RW_NOPROFILE | RW_NOWITNESS | RW_QUIET |
+ RW_RECURSE)) == 0);
+ ASSERT_ATOMIC_LOAD_PTR(rw->rw_lock,
+ ("%s: rw_lock not aligned for %s: %p", __func__, name,
+ &rw->rw_lock));
+
+ flags = LO_UPGRADABLE;
+ if (opts & RW_DUPOK)
+ flags |= LO_DUPOK;
+ if (opts & RW_NOPROFILE)
+ flags |= LO_NOPROFILE;
+ if (!(opts & RW_NOWITNESS))
+ flags |= LO_WITNESS;
+ if (opts & RW_RECURSE)
+ flags |= LO_RECURSABLE;
+ if (opts & RW_QUIET)
+ flags |= LO_QUIET;
+
+ lock_init(&rw->lock_object, &lock_class_rw, name, NULL, flags);
+ rw->rw_lock = RW_UNLOCKED;
+ rw->rw_recurse = 0;
+}
+
+void
+_rw_destroy(volatile uintptr_t *c)
+{
+ struct rwlock *rw;
+
+ rw = rwlock2rw(c);
+
+ KASSERT(rw->rw_lock == RW_UNLOCKED, ("rw lock %p not unlocked", rw));
+ KASSERT(rw->rw_recurse == 0, ("rw lock %p still recursed", rw));
+ rw->rw_lock = RW_DESTROYED;
+ lock_destroy(&rw->lock_object);
+}
+
+void
+rw_sysinit(void *arg)
+{
+ struct rw_args *args = arg;
+
+ rw_init((struct rwlock *)args->ra_rw, args->ra_desc);
+}
+
+void
+rw_sysinit_flags(void *arg)
+{
+ struct rw_args_flags *args = arg;
+
+ rw_init_flags((struct rwlock *)args->ra_rw, args->ra_desc,
+ args->ra_flags);
+}
+
+int
+_rw_wowned(const volatile uintptr_t *c)
+{
+
+ return (rw_wowner(rwlock2rw(c)) == curthread);
+}
+
+void
+_rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line)
+{
+ struct rwlock *rw;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ rw = rwlock2rw(c);
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("rw_wlock() by idle thread %p on rwlock %s @ %s:%d",
+ curthread, rw->lock_object.lo_name, file, line));
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_wlock() of destroyed rwlock @ %s:%d", file, line));
+ WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
+ line, NULL);
+ __rw_wlock(rw, curthread, file, line);
+ LOCK_LOG_LOCK("WLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line);
+ WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
+ curthread->td_locks++;
+}
+
+int
+__rw_try_wlock(volatile uintptr_t *c, const char *file, int line)
+{
+ struct rwlock *rw;
+ int rval;
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+ rw = rwlock2rw(c);
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("rw_try_wlock() by idle thread %p on rwlock %s @ %s:%d",
+ curthread, rw->lock_object.lo_name, file, line));
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_try_wlock() of destroyed rwlock @ %s:%d", file, line));
+
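+	/*
+	 * A recursive acquisition only bumps the recursion count;
+	 * otherwise attempt a single atomic swap from unlocked to
+	 * owned by curthread.
+	 */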
+ if (rw_wlocked(rw) &&
+ (rw->lock_object.lo_flags & LO_RECURSABLE) != 0) {
+ rw->rw_recurse++;
+ rval = 1;
+ } else
+ rval = atomic_cmpset_acq_ptr(&rw->rw_lock, RW_UNLOCKED,
+ (uintptr_t)curthread);
+
+ LOCK_LOG_TRY("WLOCK", &rw->lock_object, 0, rval, file, line);
+ if (rval) {
+ WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
+ file, line);
+ curthread->td_locks++;
+ }
+ return (rval);
+}
+
+void
+_rw_wunlock_cookie(volatile uintptr_t *c, const char *file, int line)
+{
+ struct rwlock *rw;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ rw = rwlock2rw(c);
+
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_wunlock() of destroyed rwlock @ %s:%d", file, line));
+ __rw_assert(c, RA_WLOCKED, file, line);
+ WITNESS_UNLOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("WUNLOCK", &rw->lock_object, 0, rw->rw_recurse, file,
+ line);
+ if (!rw_recursed(rw))
+ LOCKSTAT_PROFILE_RELEASE_LOCK(LS_RW_WUNLOCK_RELEASE, rw);
+ __rw_wunlock(rw, curthread, file, line);
+ curthread->td_locks--;
+}
+
+/*
+ * Determines whether a new reader can acquire a lock. Succeeds if the
+ * reader already owns a read lock and the lock is locked for read to
+ * prevent deadlock from reader recursion. Also succeeds if the lock
+ * is unlocked and has no writer waiters or spinners. Failing otherwise
+ * prioritizes writers before readers.
+ */
+#define RW_CAN_READ(_rw) \
+ ((curthread->td_rw_rlocks && (_rw) & RW_LOCK_READ) || ((_rw) & \
+ (RW_LOCK_READ | RW_LOCK_WRITE_WAITERS | RW_LOCK_WRITE_SPINNER)) == \
+ RW_LOCK_READ)
+
+void
+__rw_rlock(volatile uintptr_t *c, const char *file, int line)
+{
+ struct rwlock *rw;
+ struct turnstile *ts;
+#ifdef ADAPTIVE_RWLOCKS
+ volatile struct thread *owner;
+ int spintries = 0;
+ int i;
+#endif
+#ifdef LOCK_PROFILING
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+ uintptr_t v;
+#ifdef KDTRACE_HOOKS
+ uint64_t spin_cnt = 0;
+ uint64_t sleep_cnt = 0;
+ int64_t sleep_time = 0;
+#endif
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ rw = rwlock2rw(c);
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("rw_rlock() by idle thread %p on rwlock %s @ %s:%d",
+ curthread, rw->lock_object.lo_name, file, line));
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_rlock() of destroyed rwlock @ %s:%d", file, line));
+ KASSERT(rw_wowner(rw) != curthread,
+ ("rw_rlock: wlock already held for %s @ %s:%d",
+ rw->lock_object.lo_name, file, line));
+ WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER, file, line, NULL);
+
+ for (;;) {
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ /*
+ * Handle the easy case. If no other thread has a write
+ * lock, then try to bump up the count of read locks. Note
+ * that we have to preserve the current state of the
+ * RW_LOCK_WRITE_WAITERS flag. If we fail to acquire a
+ * read lock, then rw_lock must have changed, so restart
+ * the loop. Note that this handles the case of a
+ * completely unlocked rwlock since such a lock is encoded
+ * as a read lock with no waiters.
+ */
+ v = rw->rw_lock;
+ if (RW_CAN_READ(v)) {
+ /*
+ * The RW_LOCK_READ_WAITERS flag should only be set
+ * if the lock has been unlocked and write waiters
+ * were present.
+ */
+ if (atomic_cmpset_acq_ptr(&rw->rw_lock, v,
+ v + RW_ONE_READER)) {
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: %p succeed %p -> %p", __func__,
+ rw, (void *)v,
+ (void *)(v + RW_ONE_READER));
+ break;
+ }
+ continue;
+ }
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&rw->lock_object,
+ &contested, &waittime);
+
+#ifdef ADAPTIVE_RWLOCKS
+ /*
+ * If the owner is running on another CPU, spin until
+ * the owner stops running or the state of the lock
+ * changes.
+ */
+ if ((v & RW_LOCK_READ) == 0) {
+ owner = (struct thread *)RW_OWNER(v);
+ if (TD_IS_RUNNING(owner)) {
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR3(KTR_LOCK,
+ "%s: spinning on %p held by %p",
+ __func__, rw, owner);
+ while ((struct thread*)RW_OWNER(rw->rw_lock) ==
+ owner && TD_IS_RUNNING(owner)) {
+ cpu_spinwait();
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ }
+ continue;
+ }
+ } else if (spintries < rowner_retries) {
+ spintries++;
+ for (i = 0; i < rowner_loops; i++) {
+ v = rw->rw_lock;
+ if ((v & RW_LOCK_READ) == 0 || RW_CAN_READ(v))
+ break;
+ cpu_spinwait();
+ }
+ if (i != rowner_loops)
+ continue;
+ }
+#endif
+
+ /*
+ * Okay, now it's the hard case. Some other thread already
+ * has a write lock or there are write waiters present,
+ * acquire the turnstile lock so we can begin the process
+ * of blocking.
+ */
+ ts = turnstile_trywait(&rw->lock_object);
+
+ /*
+ * The lock might have been released while we spun, so
+ * recheck its state and restart the loop if needed.
+ */
+ v = rw->rw_lock;
+ if (RW_CAN_READ(v)) {
+ turnstile_cancel(ts);
+ continue;
+ }
+
+#ifdef ADAPTIVE_RWLOCKS
+ /*
+ * The current lock owner might have started executing
+ * on another CPU (or the lock could have changed
+ * owners) while we were waiting on the turnstile
+ * chain lock. If so, drop the turnstile lock and try
+ * again.
+ */
+ if ((v & RW_LOCK_READ) == 0) {
+ owner = (struct thread *)RW_OWNER(v);
+ if (TD_IS_RUNNING(owner)) {
+ turnstile_cancel(ts);
+ continue;
+ }
+ }
+#endif
+
+ /*
+ * The lock is held in write mode or it already has waiters.
+ */
+ MPASS(!RW_CAN_READ(v));
+
+ /*
+ * If the RW_LOCK_READ_WAITERS flag is already set, then
+ * we can go ahead and block. If it is not set then try
+ * to set it. If we fail to set it drop the turnstile
+ * lock and restart the loop.
+ */
+ if (!(v & RW_LOCK_READ_WAITERS)) {
+ if (!atomic_cmpset_ptr(&rw->rw_lock, v,
+ v | RW_LOCK_READ_WAITERS)) {
+ turnstile_cancel(ts);
+ continue;
+ }
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p set read waiters flag",
+ __func__, rw);
+ }
+
+ /*
+ * We were unable to acquire the lock and the read waiters
+ * flag is set, so we must block on the turnstile.
+ */
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
+ rw);
+#ifdef KDTRACE_HOOKS
+ sleep_time -= lockstat_nsecs();
+#endif
+ turnstile_wait(ts, rw_owner(rw), TS_SHARED_QUEUE);
+#ifdef KDTRACE_HOOKS
+ sleep_time += lockstat_nsecs();
+ sleep_cnt++;
+#endif
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
+ __func__, rw);
+ }
+
+ /*
+ * TODO: acquire "owner of record" here. Here be turnstile dragons
+ * however. turnstiles don't like owners changing between calls to
+ * turnstile_wait() currently.
+ */
+ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_RW_RLOCK_ACQUIRE, rw, contested,
+ waittime, file, line);
+ LOCK_LOG_LOCK("RLOCK", &rw->lock_object, 0, 0, file, line);
+ WITNESS_LOCK(&rw->lock_object, 0, file, line);
+ curthread->td_locks++;
+ curthread->td_rw_rlocks++;
+#ifdef KDTRACE_HOOKS
+ if (sleep_time)
+ LOCKSTAT_RECORD1(LS_RW_RLOCK_BLOCK, rw, sleep_time);
+
+ /*
+ * Record only the loops spinning and not sleeping.
+ */
+ if (spin_cnt > sleep_cnt)
+ LOCKSTAT_RECORD1(LS_RW_RLOCK_SPIN, rw, (spin_cnt - sleep_cnt));
+#endif
+}
+
+int
+__rw_try_rlock(volatile uintptr_t *c, const char *file, int line)
+{
+ struct rwlock *rw;
+ uintptr_t x;
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+ rw = rwlock2rw(c);
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("rw_try_rlock() by idle thread %p on rwlock %s @ %s:%d",
+ curthread, rw->lock_object.lo_name, file, line));
+
+ for (;;) {
+ x = rw->rw_lock;
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_try_rlock() of destroyed rwlock @ %s:%d", file, line));
+ if (!(x & RW_LOCK_READ))
+ break;
+ if (atomic_cmpset_acq_ptr(&rw->rw_lock, x, x + RW_ONE_READER)) {
+ LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 1, file,
+ line);
+ WITNESS_LOCK(&rw->lock_object, LOP_TRYLOCK, file, line);
+ curthread->td_locks++;
+ curthread->td_rw_rlocks++;
+ return (1);
+ }
+ }
+
+ LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 0, file, line);
+ return (0);
+}
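+
+/*
+ * Editor's illustrative sketch (kept out of compilation): a caller in a
+ * context that must not sleep can use the try-rlock path above and fall
+ * back on failure, dropping the lock with rw_runlock() when done.  The
+ * example_lock/example_stat names are hypothetical; only the rw_*()
+ * calls reflect the interface implemented in this file.
+ */
+#if 0
+static struct rwlock example_lock;
+static int example_stat;
+
+static int
+example_peek_stat(int *out)
+{
+
+	if (!rw_try_rlock(&example_lock))
+		return (EBUSY);		/* would otherwise have to block */
+	*out = example_stat;
+	rw_runlock(&example_lock);
+	return (0);
+}
+#endif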
+
+void
+_rw_runlock_cookie(volatile uintptr_t *c, const char *file, int line)
+{
+ struct rwlock *rw;
+ struct turnstile *ts;
+ uintptr_t x, v, queue;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ rw = rwlock2rw(c);
+
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_runlock() of destroyed rwlock @ %s:%d", file, line));
+ __rw_assert(c, RA_RLOCKED, file, line);
+ WITNESS_UNLOCK(&rw->lock_object, 0, file, line);
+ LOCK_LOG_LOCK("RUNLOCK", &rw->lock_object, 0, 0, file, line);
+
+ /* TODO: drop "owner of record" here. */
+
+ for (;;) {
+ /*
+ * See if there is more than one read lock held. If so,
+ * just drop one and return.
+ */
+ x = rw->rw_lock;
+ if (RW_READERS(x) > 1) {
+ if (atomic_cmpset_rel_ptr(&rw->rw_lock, x,
+ x - RW_ONE_READER)) {
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: %p succeeded %p -> %p",
+ __func__, rw, (void *)x,
+ (void *)(x - RW_ONE_READER));
+ break;
+ }
+ continue;
+ }
+ /*
+ * If there aren't any waiters for a write lock, then try
+ * to drop it quickly.
+ */
+ if (!(x & RW_LOCK_WAITERS)) {
+ MPASS((x & ~RW_LOCK_WRITE_SPINNER) ==
+ RW_READERS_LOCK(1));
+ if (atomic_cmpset_rel_ptr(&rw->rw_lock, x,
+ RW_UNLOCKED)) {
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p last succeeded",
+ __func__, rw);
+ break;
+ }
+ continue;
+ }
+ /*
+ * Ok, we know we have waiters and we think we are the
+ * last reader, so grab the turnstile lock.
+ */
+ turnstile_chain_lock(&rw->lock_object);
+ v = rw->rw_lock & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER);
+ MPASS(v & RW_LOCK_WAITERS);
+
+ /*
+ * Try to drop our lock, leaving the lock in an unlocked
+ * state.
+ *
+ * If you wanted to do explicit lock handoff you'd have to
+ * do it here.  You'd also want to use turnstile_signal()
+ * and you'd have to handle the race where a higher
+ * priority thread blocks on the write lock before the
+ * thread you wake up actually runs, and have the new
+ * thread "steal" the lock.  For now it's a lot simpler to
+ * just wake up all of the waiters.
+ *
+ * As above, if we fail, then another thread might have
+ * acquired a read lock, so drop the turnstile lock and
+ * restart.
+ */
+ x = RW_UNLOCKED;
+ if (v & RW_LOCK_WRITE_WAITERS) {
+ queue = TS_EXCLUSIVE_QUEUE;
+ x |= (v & RW_LOCK_READ_WAITERS);
+ } else
+ queue = TS_SHARED_QUEUE;
+ if (!atomic_cmpset_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v,
+ x)) {
+ turnstile_chain_unlock(&rw->lock_object);
+ continue;
+ }
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p last succeeded with waiters",
+ __func__, rw);
+
+ /*
+ * Ok. The lock is released and all that's left is to
+ * wake up the waiters. Note that the lock might not be
+ * free anymore, but in that case the writers will just
+ * block again if they run before the new lock holder(s)
+ * release the lock.
+ */
+ ts = turnstile_lookup(&rw->lock_object);
+ MPASS(ts != NULL);
+ turnstile_broadcast(ts, queue);
+ turnstile_unpend(ts, TS_SHARED_LOCK);
+ turnstile_chain_unlock(&rw->lock_object);
+ break;
+ }
+ LOCKSTAT_PROFILE_RELEASE_LOCK(LS_RW_RUNLOCK_RELEASE, rw);
+ curthread->td_locks--;
+ curthread->td_rw_rlocks--;
+}
+
+/*
+ * This function is called when we are unable to obtain a write lock on the
+ * first try. This means that at least one other thread holds either a
+ * read or write lock.
+ */
+void
+__rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
+ int line)
+{
+ struct rwlock *rw;
+ struct turnstile *ts;
+#ifdef ADAPTIVE_RWLOCKS
+ volatile struct thread *owner;
+ int spintries = 0;
+ int i;
+#endif
+ uintptr_t v, x;
+#ifdef LOCK_PROFILING
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+#ifdef KDTRACE_HOOKS
+ uint64_t spin_cnt = 0;
+ uint64_t sleep_cnt = 0;
+ int64_t sleep_time = 0;
+#endif
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ rw = rwlock2rw(c);
+
+ if (rw_wlocked(rw)) {
+ KASSERT(rw->lock_object.lo_flags & LO_RECURSABLE,
+ ("%s: recursing but non-recursive rw %s @ %s:%d\n",
+ __func__, rw->lock_object.lo_name, file, line));
+ rw->rw_recurse++;
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p recursing", __func__, rw);
+ return;
+ }
+
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__,
+ rw->lock_object.lo_name, (void *)rw->rw_lock, file, line);
+
+ while (!_rw_write_lock(rw, tid)) {
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&rw->lock_object,
+ &contested, &waittime);
+#ifdef ADAPTIVE_RWLOCKS
+ /*
+ * If the lock is write locked and the owner is
+ * running on another CPU, spin until the owner stops
+ * running or the state of the lock changes.
+ */
+ v = rw->rw_lock;
+ owner = (struct thread *)RW_OWNER(v);
+ if (!(v & RW_LOCK_READ) && TD_IS_RUNNING(owner)) {
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR3(KTR_LOCK, "%s: spinning on %p held by %p",
+ __func__, rw, owner);
+ while ((struct thread*)RW_OWNER(rw->rw_lock) == owner &&
+ TD_IS_RUNNING(owner)) {
+ cpu_spinwait();
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ }
+ continue;
+ }
+ if ((v & RW_LOCK_READ) && RW_READERS(v) &&
+ spintries < rowner_retries) {
+ if (!(v & RW_LOCK_WRITE_SPINNER)) {
+ if (!atomic_cmpset_ptr(&rw->rw_lock, v,
+ v | RW_LOCK_WRITE_SPINNER)) {
+ continue;
+ }
+ }
+ spintries++;
+ for (i = 0; i < rowner_loops; i++) {
+ if ((rw->rw_lock & RW_LOCK_WRITE_SPINNER) == 0)
+ break;
+ cpu_spinwait();
+ }
+#ifdef KDTRACE_HOOKS
+ spin_cnt += rowner_loops - i;
+#endif
+ if (i != rowner_loops)
+ continue;
+ }
+#endif
+ ts = turnstile_trywait(&rw->lock_object);
+ v = rw->rw_lock;
+
+#ifdef ADAPTIVE_RWLOCKS
+ /*
+ * The current lock owner might have started executing
+ * on another CPU (or the lock could have changed
+ * owners) while we were waiting on the turnstile
+ * chain lock. If so, drop the turnstile lock and try
+ * again.
+ */
+ if (!(v & RW_LOCK_READ)) {
+ owner = (struct thread *)RW_OWNER(v);
+ if (TD_IS_RUNNING(owner)) {
+ turnstile_cancel(ts);
+ continue;
+ }
+ }
+#endif
+ /*
+ * Check the waiters flags on this rwlock.  If the lock was
+ * released without leaving any waiters queued, simply try to
+ * acquire it.  If a pending waiters queue is present, claim
+ * lock ownership and preserve the queued waiters.
+ */
+ x = v & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER);
+ if ((v & ~x) == RW_UNLOCKED) {
+ x &= ~RW_LOCK_WRITE_SPINNER;
+ if (atomic_cmpset_acq_ptr(&rw->rw_lock, v, tid | x)) {
+ if (x)
+ turnstile_claim(ts);
+ else
+ turnstile_cancel(ts);
+ break;
+ }
+ turnstile_cancel(ts);
+ continue;
+ }
+ /*
+ * If the RW_LOCK_WRITE_WAITERS flag isn't set, then try to
+ * set it. If we fail to set it, then loop back and try
+ * again.
+ */
+ if (!(v & RW_LOCK_WRITE_WAITERS)) {
+ if (!atomic_cmpset_ptr(&rw->rw_lock, v,
+ v | RW_LOCK_WRITE_WAITERS)) {
+ turnstile_cancel(ts);
+ continue;
+ }
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p set write waiters flag",
+ __func__, rw);
+ }
+ /*
+ * We were unable to acquire the lock and the write waiters
+ * flag is set, so we must block on the turnstile.
+ */
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
+ rw);
+#ifdef KDTRACE_HOOKS
+ sleep_time -= lockstat_nsecs();
+#endif
+ turnstile_wait(ts, rw_owner(rw), TS_EXCLUSIVE_QUEUE);
+#ifdef KDTRACE_HOOKS
+ sleep_time += lockstat_nsecs();
+ sleep_cnt++;
+#endif
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
+ __func__, rw);
+#ifdef ADAPTIVE_RWLOCKS
+ spintries = 0;
+#endif
+ }
+ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_RW_WLOCK_ACQUIRE, rw, contested,
+ waittime, file, line);
+#ifdef KDTRACE_HOOKS
+ if (sleep_time)
+ LOCKSTAT_RECORD1(LS_RW_WLOCK_BLOCK, rw, sleep_time);
+
+ /*
+ * Record only the loops spinning and not sleeping.
+ */
+ if (spin_cnt > sleep_cnt)
+ LOCKSTAT_RECORD1(LS_RW_WLOCK_SPIN, rw, (spin_cnt - sleep_cnt));
+#endif
+}
+
+/*
+ * This function is called if the first try at releasing a write lock failed.
+ * This means that one of the 2 waiter bits must be set indicating that at
+ * least one thread is waiting on this lock.
+ */
+void
+__rw_wunlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
+ int line)
+{
+ struct rwlock *rw;
+ struct turnstile *ts;
+ uintptr_t v;
+ int queue;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ rw = rwlock2rw(c);
+
+ if (rw_wlocked(rw) && rw_recursed(rw)) {
+ rw->rw_recurse--;
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, rw);
+ return;
+ }
+
+ KASSERT(rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS),
+ ("%s: neither of the waiter flags are set", __func__));
+
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p contested", __func__, rw);
+
+ turnstile_chain_lock(&rw->lock_object);
+ ts = turnstile_lookup(&rw->lock_object);
+ MPASS(ts != NULL);
+
+ /*
+ * Use the same algorithm as sx locks for now.  Prefer waking up
+ * shared waiters over writers if we have any.  This is probably not ideal.
+ *
+ * 'v' is the value we are going to write back to rw_lock. If we
+ * have waiters on both queues, we need to preserve the state of
+ * the waiter flag for the queue we don't wake up. For now this is
+ * hardcoded for the algorithm mentioned above.
+ *
+ * In the case of both readers and writers waiting we wakeup the
+ * readers but leave the RW_LOCK_WRITE_WAITERS flag set. If a
+ * new writer comes in before a reader it will claim the lock up
+ * above. There is probably a potential priority inversion in
+ * there that could be worked around either by waking both queues
+ * of waiters or doing some complicated lock handoff gymnastics.
+ */
+ v = RW_UNLOCKED;
+ if (rw->rw_lock & RW_LOCK_WRITE_WAITERS) {
+ queue = TS_EXCLUSIVE_QUEUE;
+ v |= (rw->rw_lock & RW_LOCK_READ_WAITERS);
+ } else
+ queue = TS_SHARED_QUEUE;
+
+ /* Wake up all waiters for the specific queue. */
+ if (LOCK_LOG_TEST(&rw->lock_object, 0))
+ CTR3(KTR_LOCK, "%s: %p waking up %s waiters", __func__, rw,
+ queue == TS_SHARED_QUEUE ? "read" : "write");
+ turnstile_broadcast(ts, queue);
+ atomic_store_rel_ptr(&rw->rw_lock, v);
+ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
+ turnstile_chain_unlock(&rw->lock_object);
+}
+
+/*
+ * Attempt to do a non-blocking upgrade from a read lock to a write
+ * lock. This will only succeed if this thread holds a single read
+ * lock. Returns true if the upgrade succeeded and false otherwise.
+ */
+int
+__rw_try_upgrade(volatile uintptr_t *c, const char *file, int line)
+{
+ struct rwlock *rw;
+ uintptr_t v, x, tid;
+ struct turnstile *ts;
+ int success;
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+ rw = rwlock2rw(c);
+
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_try_upgrade() of destroyed rwlock @ %s:%d", file, line));
+ __rw_assert(c, RA_RLOCKED, file, line);
+
+ /*
+ * Attempt to switch from one reader to a writer. If there
+ * are any write waiters, then we will have to lock the
+ * turnstile first to prevent races with another writer
+ * calling turnstile_wait() before we have claimed this
+ * turnstile. So, do the simple case of no waiters first.
+ */
+ tid = (uintptr_t)curthread;
+ success = 0;
+ for (;;) {
+ v = rw->rw_lock;
+ if (RW_READERS(v) > 1)
+ break;
+ if (!(v & RW_LOCK_WAITERS)) {
+ success = atomic_cmpset_ptr(&rw->rw_lock, v, tid);
+ if (!success)
+ continue;
+ break;
+ }
+
+ /*
+ * Ok, we think we have waiters, so lock the turnstile.
+ */
+ ts = turnstile_trywait(&rw->lock_object);
+ v = rw->rw_lock;
+ if (RW_READERS(v) > 1) {
+ turnstile_cancel(ts);
+ break;
+ }
+ /*
+ * Try to switch from one reader to a writer again. This time
+ * we honor the current state of the waiters flags.
+ * If we obtain the lock with the flags set, then claim
+ * ownership of the turnstile.
+ */
+ x = rw->rw_lock & RW_LOCK_WAITERS;
+ success = atomic_cmpset_ptr(&rw->rw_lock, v, tid | x);
+ if (success) {
+ if (x)
+ turnstile_claim(ts);
+ else
+ turnstile_cancel(ts);
+ break;
+ }
+ turnstile_cancel(ts);
+ }
+ LOCK_LOG_TRY("WUPGRADE", &rw->lock_object, 0, success, file, line);
+ if (success) {
+ curthread->td_rw_rlocks--;
+ WITNESS_UPGRADE(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
+ file, line);
+ LOCKSTAT_RECORD0(LS_RW_TRYUPGRADE_UPGRADE, rw);
+ }
+ return (success);
+}
+
+/*
+ * Downgrade a write lock into a single read lock.
+ */
+void
+__rw_downgrade(volatile uintptr_t *c, const char *file, int line)
+{
+ struct rwlock *rw;
+ struct turnstile *ts;
+ uintptr_t tid, v;
+ int rwait, wwait;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ rw = rwlock2rw(c);
+
+ KASSERT(rw->rw_lock != RW_DESTROYED,
+ ("rw_downgrade() of destroyed rwlock @ %s:%d", file, line));
+ __rw_assert(c, RA_WLOCKED | RA_NOTRECURSED, file, line);
+#ifndef INVARIANTS
+ if (rw_recursed(rw))
+ panic("downgrade of a recursed lock");
+#endif
+
+ WITNESS_DOWNGRADE(&rw->lock_object, 0, file, line);
+
+ /*
+ * Convert from a writer to a single reader. First we handle
+ * the easy case with no waiters. If there are any waiters, we
+ * lock the turnstile and "disown" the lock.
+ */
+ tid = (uintptr_t)curthread;
+ if (atomic_cmpset_rel_ptr(&rw->rw_lock, tid, RW_READERS_LOCK(1)))
+ goto out;
+
+ /*
+ * Ok, we think we have waiters, so lock the turnstile so we can
+ * read the waiter flags without any races.
+ */
+ turnstile_chain_lock(&rw->lock_object);
+ v = rw->rw_lock & RW_LOCK_WAITERS;
+ rwait = v & RW_LOCK_READ_WAITERS;
+ wwait = v & RW_LOCK_WRITE_WAITERS;
+ MPASS(rwait | wwait);
+
+ /*
+ * Downgrade from a write lock while preserving waiters flag
+ * and give up ownership of the turnstile.
+ */
+ ts = turnstile_lookup(&rw->lock_object);
+ MPASS(ts != NULL);
+ if (!wwait)
+ v &= ~RW_LOCK_READ_WAITERS;
+ atomic_store_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v);
+ /*
+ * Wake other readers if there are no writers pending. Otherwise they
+ * won't be able to acquire the lock anyway.
+ */
+ if (rwait && !wwait) {
+ turnstile_broadcast(ts, TS_SHARED_QUEUE);
+ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
+ } else
+ turnstile_disown(ts);
+ turnstile_chain_unlock(&rw->lock_object);
+out:
+ curthread->td_rw_rlocks++;
+ LOCK_LOG_LOCK("WDOWNGRADE", &rw->lock_object, 0, 0, file, line);
+ LOCKSTAT_RECORD0(LS_RW_DOWNGRADE_DOWNGRADE, rw);
+}
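+
+/*
+ * Editor's illustrative sketch (kept out of compilation): the common
+ * read-mostly update pattern built on the upgrade/downgrade primitives
+ * above.  The example names are hypothetical; on a failed upgrade the
+ * read lock is still held, so it is dropped and the write lock is
+ * taken the slow way.
+ */
+#if 0
+static struct rwlock example_cfg_lock;
+static int example_cfg_value;
+
+static void
+example_cfg_set(int v)
+{
+
+	rw_rlock(&example_cfg_lock);
+	if (example_cfg_value != v) {
+		if (!rw_try_upgrade(&example_cfg_lock)) {
+			rw_runlock(&example_cfg_lock);
+			rw_wlock(&example_cfg_lock);
+		}
+		example_cfg_value = v;
+		rw_downgrade(&example_cfg_lock);
+	}
+	rw_runlock(&example_cfg_lock);
+}
+#endif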
+
+#ifdef INVARIANT_SUPPORT
+#ifndef INVARIANTS
+#undef __rw_assert
+#endif
+
+/*
+ * In the non-WITNESS case, rw_assert() can only detect that at least
+ * *some* thread owns an rlock, but it cannot guarantee that *this*
+ * thread owns an rlock.
+ */
+void
+__rw_assert(const volatile uintptr_t *c, int what, const char *file, int line)
+{
+ const struct rwlock *rw;
+
+ if (panicstr != NULL)
+ return;
+
+ rw = rwlock2rw(c);
+
+ switch (what) {
+ case RA_LOCKED:
+ case RA_LOCKED | RA_RECURSED:
+ case RA_LOCKED | RA_NOTRECURSED:
+ case RA_RLOCKED:
+ case RA_RLOCKED | RA_RECURSED:
+ case RA_RLOCKED | RA_NOTRECURSED:
+#ifdef WITNESS
+ witness_assert(&rw->lock_object, what, file, line);
+#else
+ /*
+ * If some other thread has a write lock or we have one
+ * and are asserting a read lock, fail. Also, if no one
+ * has a lock at all, fail.
+ */
+ if (rw->rw_lock == RW_UNLOCKED ||
+ (!(rw->rw_lock & RW_LOCK_READ) && (what & RA_RLOCKED ||
+ rw_wowner(rw) != curthread)))
+ panic("Lock %s not %slocked @ %s:%d\n",
+ rw->lock_object.lo_name, (what & RA_RLOCKED) ?
+ "read " : "", file, line);
+
+ if (!(rw->rw_lock & RW_LOCK_READ) && !(what & RA_RLOCKED)) {
+ if (rw_recursed(rw)) {
+ if (what & RA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ rw->lock_object.lo_name, file,
+ line);
+ } else if (what & RA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ rw->lock_object.lo_name, file, line);
+ }
+#endif
+ break;
+ case RA_WLOCKED:
+ case RA_WLOCKED | RA_RECURSED:
+ case RA_WLOCKED | RA_NOTRECURSED:
+ if (rw_wowner(rw) != curthread)
+ panic("Lock %s not exclusively locked @ %s:%d\n",
+ rw->lock_object.lo_name, file, line);
+ if (rw_recursed(rw)) {
+ if (what & RA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ rw->lock_object.lo_name, file, line);
+ } else if (what & RA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ rw->lock_object.lo_name, file, line);
+ break;
+ case RA_UNLOCKED:
+#ifdef WITNESS
+ witness_assert(&rw->lock_object, what, file, line);
+#else
+ /*
+ * If we hold a write lock, fail.  We can't reliably check
+ * to see if we hold a read lock or not.
+ */
+ if (rw_wowner(rw) == curthread)
+ panic("Lock %s exclusively locked @ %s:%d\n",
+ rw->lock_object.lo_name, file, line);
+#endif
+ break;
+ default:
+ panic("Unknown rw lock assertion: %d @ %s:%d", what, file,
+ line);
+ }
+}
+#endif /* INVARIANT_SUPPORT */
+
+#ifdef DDB
+void
+db_show_rwlock(const struct lock_object *lock)
+{
+ const struct rwlock *rw;
+ struct thread *td;
+
+ rw = (const struct rwlock *)lock;
+
+ db_printf(" state: ");
+ if (rw->rw_lock == RW_UNLOCKED)
+ db_printf("UNLOCKED\n");
+ else if (rw->rw_lock == RW_DESTROYED) {
+ db_printf("DESTROYED\n");
+ return;
+ } else if (rw->rw_lock & RW_LOCK_READ)
+ db_printf("RLOCK: %ju locks\n",
+ (uintmax_t)(RW_READERS(rw->rw_lock)));
+ else {
+ td = rw_wowner(rw);
+ db_printf("WLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
+ td->td_tid, td->td_proc->p_pid, td->td_name);
+ if (rw_recursed(rw))
+ db_printf(" recursed: %u\n", rw->rw_recurse);
+ }
+ db_printf(" waiters: ");
+ switch (rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)) {
+ case RW_LOCK_READ_WAITERS:
+ db_printf("readers\n");
+ break;
+ case RW_LOCK_WRITE_WAITERS:
+ db_printf("writers\n");
+ break;
+ case RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS:
+ db_printf("readers and writers\n");
+ break;
+ default:
+ db_printf("none\n");
+ break;
+ }
+}
+
+#endif
diff --git a/sys/kern/kern_sdt.c b/sys/kern/kern_sdt.c
new file mode 100644
index 0000000..c8e1940
--- /dev/null
+++ b/sys/kern/kern_sdt.c
@@ -0,0 +1,51 @@
+/*-
+ * Copyright 2006-2008 John Birrell <jb@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "opt_kdtrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sdt.h>
+
+/*
+ * Hook for the DTrace probe function. The SDT provider will set this to
+ * dtrace_probe() when it loads.
+ */
+sdt_probe_func_t sdt_probe_func = sdt_probe_stub;
+
+/*
+ * This is a stub for probe calls in case kernel DTrace support isn't
+ * enabled.  It should never get called, because probes are only enabled
+ * once the DTrace SDT provider has loaded and replaced this hook.
+ */
+void
+sdt_probe_stub(uint32_t id, uintptr_t arg0, uintptr_t arg1,
+ uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
+{
+
+ printf("sdt_probe_stub: Why did this get called?\n");
+}
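+
+/*
+ * Editor's illustrative sketch (kept out of compilation): probe sites
+ * fire by calling indirectly through the hook above, so they reach
+ * either dtrace_probe() or the stub depending on whether the SDT
+ * provider has loaded.  The probe id and argument below are
+ * hypothetical.
+ */
+#if 0
+static void
+example_fire_probe(uint32_t id, uintptr_t arg0)
+{
+
+	(*sdt_probe_func)(id, arg0, 0, 0, 0, 0);
+}
+#endif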
diff --git a/sys/kern/kern_sema.c b/sys/kern/kern_sema.c
new file mode 100644
index 0000000..f09099e
--- /dev/null
+++ b/sys/kern/kern_sema.c
@@ -0,0 +1,176 @@
+/*-
+ * Copyright (C) 2001 Jason Evans <jasone@freebsd.org>. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice(s), this list of conditions and the following disclaimer as
+ * the first lines of this file unmodified other than the possible
+ * addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice(s), this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/*
+ * Counting semaphores.
+ *
+ * Priority propagation will not generally raise the priority of semaphore
+ * "owners" (a misnomer in the context of semaphores), so should not be relied
+ * upon in combination with semaphores.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ktr.h>
+#include <sys/condvar.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sema.h>
+
+void
+sema_init(struct sema *sema, int value, const char *description)
+{
+
+ KASSERT((value >= 0), ("%s(): negative value\n", __func__));
+
+ bzero(sema, sizeof(*sema));
+ mtx_init(&sema->sema_mtx, description, "sema backing lock",
+ MTX_DEF | MTX_NOWITNESS | MTX_QUIET);
+ cv_init(&sema->sema_cv, description);
+ sema->sema_value = value;
+
+ CTR4(KTR_LOCK, "%s(%p, %d, \"%s\")", __func__, sema, value, description);
+}
+
+void
+sema_destroy(struct sema *sema)
+{
+
+ CTR3(KTR_LOCK, "%s(%p) \"%s\"", __func__, sema,
+ cv_wmesg(&sema->sema_cv));
+
+ KASSERT((sema->sema_waiters == 0), ("%s(): waiters\n", __func__));
+
+ mtx_destroy(&sema->sema_mtx);
+ cv_destroy(&sema->sema_cv);
+}
+
+void
+_sema_post(struct sema *sema, const char *file, int line)
+{
+
+ mtx_lock(&sema->sema_mtx);
+ sema->sema_value++;
+ if (sema->sema_waiters && sema->sema_value > 0)
+ cv_signal(&sema->sema_cv);
+
+ CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), sema->sema_value, file, line);
+
+ mtx_unlock(&sema->sema_mtx);
+}
+
+void
+_sema_wait(struct sema *sema, const char *file, int line)
+{
+
+ mtx_lock(&sema->sema_mtx);
+ while (sema->sema_value == 0) {
+ sema->sema_waiters++;
+ cv_wait(&sema->sema_cv, &sema->sema_mtx);
+ sema->sema_waiters--;
+ }
+ sema->sema_value--;
+
+ CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), sema->sema_value, file, line);
+
+ mtx_unlock(&sema->sema_mtx);
+}
+
+int
+_sema_timedwait(struct sema *sema, int timo, const char *file, int line)
+{
+ int error;
+
+ mtx_lock(&sema->sema_mtx);
+
+ /*
+ * A spurious wakeup will cause the timeout interval to start over.
+ * This isn't a big deal as long as spurious wakeups don't occur
+ * continuously, since the timeout period is merely a lower bound on how
+ * long to wait.
+ */
+ for (error = 0; sema->sema_value == 0 && error == 0;) {
+ sema->sema_waiters++;
+ error = cv_timedwait(&sema->sema_cv, &sema->sema_mtx, timo);
+ sema->sema_waiters--;
+ }
+ if (sema->sema_value > 0) {
+ /* Success. */
+ sema->sema_value--;
+ error = 0;
+
+ CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), sema->sema_value, file, line);
+ } else {
+ CTR5(KTR_LOCK, "%s(%p) \"%s\" fail at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), file, line);
+ }
+
+ mtx_unlock(&sema->sema_mtx);
+ return (error);
+}
+
+int
+_sema_trywait(struct sema *sema, const char *file, int line)
+{
+ int ret;
+
+ mtx_lock(&sema->sema_mtx);
+
+ if (sema->sema_value > 0) {
+ /* Success. */
+ sema->sema_value--;
+ ret = 1;
+
+ CTR6(KTR_LOCK, "%s(%p) \"%s\" v = %d at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), sema->sema_value, file, line);
+ } else {
+ ret = 0;
+
+ CTR5(KTR_LOCK, "%s(%p) \"%s\" fail at %s:%d", __func__, sema,
+ cv_wmesg(&sema->sema_cv), file, line);
+ }
+
+ mtx_unlock(&sema->sema_mtx);
+ return (ret);
+}
+
+int
+sema_value(struct sema *sema)
+{
+ int ret;
+
+ mtx_lock(&sema->sema_mtx);
+ ret = sema->sema_value;
+ mtx_unlock(&sema->sema_mtx);
+ return (ret);
+}
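+
+/*
+ * Editor's illustrative sketch (kept out of compilation): a counting
+ * semaphore used to bound the number of threads inside a resource at
+ * once.  The names are hypothetical; only the sema_*() calls reflect
+ * the interface implemented in this file.
+ */
+#if 0
+static struct sema example_slots;
+
+static void
+example_slots_setup(void)
+{
+
+	/* Allow up to four consumers into the resource at once. */
+	sema_init(&example_slots, 4, "example slots");
+}
+
+static void
+example_use_slot(void)
+{
+
+	sema_wait(&example_slots);	/* sleeps until a slot is free */
+	/* ... use the shared resource ... */
+	sema_post(&example_slots);	/* release the slot */
+}
+#endif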
diff --git a/sys/kern/kern_sharedpage.c b/sys/kern/kern_sharedpage.c
new file mode 100644
index 0000000..fd619cd
--- /dev/null
+++ b/sys/kern/kern_sharedpage.c
@@ -0,0 +1,239 @@
+/*-
+ * Copyright (c) 2010, 2012 Konstantin Belousov <kib@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_vm.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/vdso.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+
+static struct sx shared_page_alloc_sx;
+static vm_object_t shared_page_obj;
+static int shared_page_free;
+char *shared_page_mapping;
+
+void
+shared_page_write(int base, int size, const void *data)
+{
+
+ bcopy(data, shared_page_mapping + base, size);
+}
+
+static int
+shared_page_alloc_locked(int size, int align)
+{
+ int res;
+
+ res = roundup(shared_page_free, align);
+ if (res + size >= IDX_TO_OFF(shared_page_obj->size))
+ res = -1;
+ else
+ shared_page_free = res + size;
+ return (res);
+}
+
+int
+shared_page_alloc(int size, int align)
+{
+ int res;
+
+ sx_xlock(&shared_page_alloc_sx);
+ res = shared_page_alloc_locked(size, align);
+ sx_xunlock(&shared_page_alloc_sx);
+ return (res);
+}
+
+int
+shared_page_fill(int size, int align, const void *data)
+{
+ int res;
+
+ sx_xlock(&shared_page_alloc_sx);
+ res = shared_page_alloc_locked(size, align);
+ if (res != -1)
+ shared_page_write(res, size, data);
+ sx_xunlock(&shared_page_alloc_sx);
+ return (res);
+}
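+
+/*
+ * Editor's illustrative sketch (kept out of compilation): a subsystem
+ * that wants to expose a small read-only blob to userspace can reserve
+ * and populate a chunk of the shared page much like the signal
+ * trampoline and timekeep structures below.  The blob and offset names
+ * are hypothetical.
+ */
+#if 0
+static const char example_blob[64];
+static int example_blob_off;
+
+static void
+example_publish_blob(void)
+{
+
+	/* 16-byte aligned; -1 means the shared page is exhausted. */
+	example_blob_off = shared_page_fill(sizeof(example_blob), 16,
+	    example_blob);
+	KASSERT(example_blob_off != -1, ("shared page exhausted"));
+}
+#endif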
+
+static void
+shared_page_init(void *dummy __unused)
+{
+ vm_page_t m;
+ vm_offset_t addr;
+
+ sx_init(&shared_page_alloc_sx, "shpsx");
+ shared_page_obj = vm_pager_allocate(OBJT_PHYS, 0, PAGE_SIZE,
+ VM_PROT_DEFAULT, 0, NULL);
+ VM_OBJECT_WLOCK(shared_page_obj);
+ m = vm_page_grab(shared_page_obj, 0, VM_ALLOC_NOBUSY | VM_ALLOC_ZERO);
+ m->valid = VM_PAGE_BITS_ALL;
+ VM_OBJECT_WUNLOCK(shared_page_obj);
+ addr = kva_alloc(PAGE_SIZE);
+ pmap_qenter(addr, &m, 1);
+ shared_page_mapping = (char *)addr;
+}
+
+SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init,
+ NULL);
+
+static void
+timehands_update(struct sysentvec *sv)
+{
+ struct vdso_timehands th;
+ struct vdso_timekeep *tk;
+ uint32_t enabled, idx;
+
+ enabled = tc_fill_vdso_timehands(&th);
+ tk = (struct vdso_timekeep *)(shared_page_mapping +
+ sv->sv_timekeep_off);
+ idx = sv->sv_timekeep_curr;
+ atomic_store_rel_32(&tk->tk_th[idx].th_gen, 0);
+ if (++idx >= VDSO_TH_NUM)
+ idx = 0;
+ sv->sv_timekeep_curr = idx;
+ if (++sv->sv_timekeep_gen == 0)
+ sv->sv_timekeep_gen = 1;
+ th.th_gen = 0;
+ if (enabled)
+ tk->tk_th[idx] = th;
+ tk->tk_enabled = enabled;
+ atomic_store_rel_32(&tk->tk_th[idx].th_gen, sv->sv_timekeep_gen);
+ tk->tk_current = idx;
+}
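+
+/*
+ * Editor's illustrative sketch (kept out of compilation): the update
+ * above follows a seqlock-style protocol, zeroing th_gen before the
+ * copy and publishing a nonzero generation afterwards.  A userspace
+ * reader (the real one lives in libc) is expected to retry until it
+ * observes a stable, nonzero generation; the memory barriers a real
+ * reader needs are omitted here for brevity.
+ */
+#if 0
+static int
+example_read_timehands(struct vdso_timekeep *tk, struct vdso_timehands *dst)
+{
+	struct vdso_timehands *th;
+	uint32_t curr, gen;
+
+	do {
+		if (!tk->tk_enabled)
+			return (ENOSYS);
+		curr = tk->tk_current;
+		th = &tk->tk_th[curr];
+		gen = th->th_gen;
+		*dst = *th;
+	} while (gen == 0 || gen != th->th_gen);
+	return (0);
+}
+#endif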
+
+#ifdef COMPAT_FREEBSD32
+static void
+timehands_update32(struct sysentvec *sv)
+{
+ struct vdso_timekeep32 *tk;
+ struct vdso_timehands32 th;
+ uint32_t enabled, idx;
+
+ enabled = tc_fill_vdso_timehands32(&th);
+ tk = (struct vdso_timekeep32 *)(shared_page_mapping +
+ sv->sv_timekeep_off);
+ idx = sv->sv_timekeep_curr;
+ atomic_store_rel_32(&tk->tk_th[idx].th_gen, 0);
+ if (++idx >= VDSO_TH_NUM)
+ idx = 0;
+ sv->sv_timekeep_curr = idx;
+ if (++sv->sv_timekeep_gen == 0)
+ sv->sv_timekeep_gen = 1;
+ th.th_gen = 0;
+ if (enabled)
+ tk->tk_th[idx] = th;
+ tk->tk_enabled = enabled;
+ atomic_store_rel_32(&tk->tk_th[idx].th_gen, sv->sv_timekeep_gen);
+ tk->tk_current = idx;
+}
+#endif
+
+/*
+ * This is hackish, but it is the easiest way to avoid creating list
+ * structures that need to be iterated over from the hardclock interrupt
+ * context.
+ */
+static struct sysentvec *host_sysentvec;
+#ifdef COMPAT_FREEBSD32
+static struct sysentvec *compat32_sysentvec;
+#endif
+
+void
+timekeep_push_vdso(void)
+{
+
+ if (host_sysentvec != NULL && host_sysentvec->sv_timekeep_base != 0)
+ timehands_update(host_sysentvec);
+#ifdef COMPAT_FREEBSD32
+ if (compat32_sysentvec != NULL &&
+ compat32_sysentvec->sv_timekeep_base != 0)
+ timehands_update32(compat32_sysentvec);
+#endif
+}
+
+void
+exec_sysvec_init(void *param)
+{
+ struct sysentvec *sv;
+ int tk_base;
+ uint32_t tk_ver;
+
+ sv = (struct sysentvec *)param;
+
+ if ((sv->sv_flags & SV_SHP) == 0)
+ return;
+ sv->sv_shared_page_obj = shared_page_obj;
+ sv->sv_sigcode_base = sv->sv_shared_page_base +
+ shared_page_fill(*(sv->sv_szsigcode), 16, sv->sv_sigcode);
+ if ((sv->sv_flags & SV_ABI_MASK) != SV_ABI_FREEBSD)
+ return;
+ tk_ver = VDSO_TK_VER_CURR;
+#ifdef COMPAT_FREEBSD32
+ if ((sv->sv_flags & SV_ILP32) != 0) {
+ tk_base = shared_page_alloc(sizeof(struct vdso_timekeep32) +
+ sizeof(struct vdso_timehands32) * VDSO_TH_NUM, 16);
+ KASSERT(tk_base != -1, ("tk_base -1 for 32bit"));
+ shared_page_write(tk_base + offsetof(struct vdso_timekeep32,
+ tk_ver), sizeof(uint32_t), &tk_ver);
+ KASSERT(compat32_sysentvec == 0,
+ ("Native compat32 already registered"));
+ compat32_sysentvec = sv;
+ } else {
+#endif
+ tk_base = shared_page_alloc(sizeof(struct vdso_timekeep) +
+ sizeof(struct vdso_timehands) * VDSO_TH_NUM, 16);
+ KASSERT(tk_base != -1, ("tk_base -1 for native"));
+ shared_page_write(tk_base + offsetof(struct vdso_timekeep,
+ tk_ver), sizeof(uint32_t), &tk_ver);
+ KASSERT(host_sysentvec == 0, ("Native already registered"));
+ host_sysentvec = sv;
+#ifdef COMPAT_FREEBSD32
+ }
+#endif
+ sv->sv_timekeep_base = sv->sv_shared_page_base + tk_base;
+ sv->sv_timekeep_off = tk_base;
+ timekeep_push_vdso();
+}
diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c
new file mode 100644
index 0000000..b120263
--- /dev/null
+++ b/sys/kern/kern_shutdown.c
@@ -0,0 +1,893 @@
+/*-
+ * Copyright (c) 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_kdb.h"
+#include "opt_panic.h"
+#include "opt_sched.h"
+#include "opt_watchdog.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/cons.h>
+#include <sys/eventhandler.h>
+#include <sys/jail.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/kerneldump.h>
+#include <sys/kthread.h>
+#include <sys/ktr.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/reboot.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/vnode.h>
+#include <sys/watchdog.h>
+
+#include <ddb/ddb.h>
+
+#include <machine/cpu.h>
+#include <machine/pcb.h>
+#include <machine/smp.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/swap_pager.h>
+
+#include <sys/signalvar.h>
+
+#ifndef PANIC_REBOOT_WAIT_TIME
+#define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */
+#endif
+
+/*
+ * Note that stdarg.h and the ANSI style va_start macro are used for both
+ * ANSI and traditional C compilers.
+ */
+#include <machine/stdarg.h>
+
+#ifdef KDB
+#ifdef KDB_UNATTENDED
+int debugger_on_panic = 0;
+#else
+int debugger_on_panic = 1;
+#endif
+SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic,
+ CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_TUN,
+ &debugger_on_panic, 0, "Run debugger on kernel panic");
+TUNABLE_INT("debug.debugger_on_panic", &debugger_on_panic);
+
+#ifdef KDB_TRACE
+static int trace_on_panic = 1;
+#else
+static int trace_on_panic = 0;
+#endif
+SYSCTL_INT(_debug, OID_AUTO, trace_on_panic,
+ CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_TUN,
+ &trace_on_panic, 0, "Print stack trace on kernel panic");
+TUNABLE_INT("debug.trace_on_panic", &trace_on_panic);
+#endif /* KDB */
+
+static int sync_on_panic = 0;
+SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RW | CTLFLAG_TUN,
+ &sync_on_panic, 0, "Do a sync before rebooting from a panic");
+TUNABLE_INT("kern.sync_on_panic", &sync_on_panic);
+
+static SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0,
+ "Shutdown environment");
+
+#ifndef DIAGNOSTIC
+static int show_busybufs;
+#else
+static int show_busybufs = 1;
+#endif
+SYSCTL_INT(_kern_shutdown, OID_AUTO, show_busybufs, CTLFLAG_RW,
+ &show_busybufs, 0, "");
+
+/*
+ * The panicstr variable contains the argument to the first call to panic;
+ * it is used as a flag to indicate that the kernel has already called panic.
+ */
+const char *panicstr;
+
+int dumping; /* system is dumping */
+int rebooting; /* system is rebooting */
+static struct dumperinfo dumper; /* our selected dumper */
+
+/* Context information for dump-debuggers. */
+static struct pcb dumppcb; /* Registers. */
+lwpid_t dumptid; /* Thread ID. */
+
+static void poweroff_wait(void *, int);
+static void shutdown_halt(void *junk, int howto);
+static void shutdown_panic(void *junk, int howto);
+static void shutdown_reset(void *junk, int howto);
+static void vpanic(const char *fmt, va_list ap) __dead2;
+
+/* register various local shutdown events */
+static void
+shutdown_conf(void *unused)
+{
+
+ EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL,
+ SHUTDOWN_PRI_FIRST);
+ EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL,
+ SHUTDOWN_PRI_LAST + 100);
+ EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL,
+ SHUTDOWN_PRI_LAST + 100);
+ EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL,
+ SHUTDOWN_PRI_LAST + 200);
+}
+
+SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL);
+
+/*
+ * The system call that results in a reboot.
+ */
+/* ARGSUSED */
+int
+sys_reboot(struct thread *td, struct reboot_args *uap)
+{
+ int error;
+
+ error = 0;
+#ifdef MAC
+ error = mac_system_check_reboot(td->td_ucred, uap->opt);
+#endif
+ if (error == 0)
+ error = priv_check(td, PRIV_REBOOT);
+ if (error == 0) {
+ mtx_lock(&Giant);
+ kern_reboot(uap->opt);
+ mtx_unlock(&Giant);
+ }
+ return (error);
+}
+
+/*
+ * Called by events that want to shut down, e.g. <CTL><ALT><DEL> on a PC.
+ */
+static int shutdown_howto = 0;
+
+void
+shutdown_nice(int howto)
+{
+
+ shutdown_howto = howto;
+
+ /* Send a signal to init(8) and have it shut down the world. */
+ if (initproc != NULL) {
+ PROC_LOCK(initproc);
+ kern_psignal(initproc, SIGINT);
+ PROC_UNLOCK(initproc);
+ } else {
+ /* No init(8) running, so simply reboot */
+ kern_reboot(RB_NOSYNC);
+ }
+ return;
+}
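+
+/*
+ * Editor's illustrative sketch (kept out of compilation): a power-button
+ * or similar event handler would typically request an orderly shutdown
+ * through shutdown_nice() rather than calling kern_reboot() directly,
+ * so that init(8) gets a chance to bring the system down cleanly.  The
+ * handler name is hypothetical.
+ */
+#if 0
+static void
+example_power_button_event(void)
+{
+
+	shutdown_nice(RB_POWEROFF);
+}
+#endif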
+static int waittime = -1;
+
+static void
+print_uptime(void)
+{
+ int f;
+ struct timespec ts;
+
+ getnanouptime(&ts);
+ printf("Uptime: ");
+ f = 0;
+ if (ts.tv_sec >= 86400) {
+ printf("%ldd", (long)ts.tv_sec / 86400);
+ ts.tv_sec %= 86400;
+ f = 1;
+ }
+ if (f || ts.tv_sec >= 3600) {
+ printf("%ldh", (long)ts.tv_sec / 3600);
+ ts.tv_sec %= 3600;
+ f = 1;
+ }
+ if (f || ts.tv_sec >= 60) {
+ printf("%ldm", (long)ts.tv_sec / 60);
+ ts.tv_sec %= 60;
+ f = 1;
+ }
+ printf("%lds\n", (long)ts.tv_sec);
+}
+
+int
+doadump(boolean_t textdump)
+{
+ boolean_t coredump;
+
+ if (dumping)
+ return (EBUSY);
+ if (dumper.dumper == NULL)
+ return (ENXIO);
+
+ savectx(&dumppcb);
+ dumptid = curthread->td_tid;
+ dumping++;
+
+ coredump = TRUE;
+#ifdef DDB
+ if (textdump && textdump_pending) {
+ coredump = FALSE;
+ textdump_dumpsys(&dumper);
+ }
+#endif
+ if (coredump)
+ dumpsys(&dumper);
+
+ dumping--;
+ return (0);
+}
+
+static int
+isbufbusy(struct buf *bp)
+{
+ if (((bp->b_flags & (B_INVAL | B_PERSISTENT)) == 0 &&
+ BUF_ISLOCKED(bp)) ||
+ ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI))
+ return (1);
+ return (0);
+}
+
+/*
+ * Shut down the system cleanly to prepare for reboot, halt, or power off.
+ */
+void
+kern_reboot(int howto)
+{
+ static int first_buf_printf = 1;
+
+#if defined(SMP)
+ /*
+ * Bind us to CPU 0 so that all shutdown code runs there. Some
+ * systems don't shutdown properly (i.e., ACPI power off) if we
+ * run on another processor.
+ */
+ if (!SCHEDULER_STOPPED()) {
+ thread_lock(curthread);
+ sched_bind(curthread, 0);
+ thread_unlock(curthread);
+ KASSERT(PCPU_GET(cpuid) == 0, ("boot: not running on cpu 0"));
+ }
+#endif
+ /* We're in the process of rebooting. */
+ rebooting = 1;
+
+ /* collect extra flags that shutdown_nice might have set */
+ howto |= shutdown_howto;
+
+ /* We are out of the debugger now. */
+ kdb_active = 0;
+
+ /*
+ * Do any callouts that should be done BEFORE syncing the filesystems.
+ */
+ EVENTHANDLER_INVOKE(shutdown_pre_sync, howto);
+
+ /*
+ * Now sync filesystems
+ */
+ if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) {
+ register struct buf *bp;
+ int iter, nbusy, pbusy;
+#ifndef PREEMPTION
+ int subiter;
+#endif
+
+ waittime = 0;
+
+ wdog_kern_pat(WD_LASTVAL);
+ sys_sync(curthread, NULL);
+
+ /*
+ * With soft updates, some buffers that are
+ * written will be remarked as dirty until other
+ * buffers are written.
+ */
+ for (iter = pbusy = 0; iter < 20; iter++) {
+ nbusy = 0;
+ for (bp = &buf[nbuf]; --bp >= buf; )
+ if (isbufbusy(bp))
+ nbusy++;
+ if (nbusy == 0) {
+ if (first_buf_printf)
+ printf("All buffers synced.");
+ break;
+ }
+ if (first_buf_printf) {
+ printf("Syncing disks, buffers remaining... ");
+ first_buf_printf = 0;
+ }
+ printf("%d ", nbusy);
+ if (nbusy < pbusy)
+ iter = 0;
+ pbusy = nbusy;
+
+ wdog_kern_pat(WD_LASTVAL);
+ sys_sync(curthread, NULL);
+
+#ifdef PREEMPTION
+ /*
+ * Drop Giant and spin for a while to allow
+ * interrupt threads to run.
+ */
+ DROP_GIANT();
+ DELAY(50000 * iter);
+ PICKUP_GIANT();
+#else
+ /*
+ * Drop Giant and context switch several times to
+ * allow interrupt threads to run.
+ */
+ DROP_GIANT();
+ for (subiter = 0; subiter < 50 * iter; subiter++) {
+ thread_lock(curthread);
+ mi_switch(SW_VOL, NULL);
+ thread_unlock(curthread);
+ DELAY(1000);
+ }
+ PICKUP_GIANT();
+#endif
+ }
+ printf("\n");
+ /*
+ * Count only busy local buffers to prevent forcing
+ * an fsck if we're just a client of a wedged NFS server.
+ */
+ nbusy = 0;
+ for (bp = &buf[nbuf]; --bp >= buf; ) {
+ if (isbufbusy(bp)) {
+#if 0
+/* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */
+ if (bp->b_dev == NULL) {
+ TAILQ_REMOVE(&mountlist,
+ bp->b_vp->v_mount, mnt_list);
+ continue;
+ }
+#endif
+ nbusy++;
+ if (show_busybufs > 0) {
+ printf(
+ "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:",
+ nbusy, bp, bp->b_vp, bp->b_flags,
+ (intmax_t)bp->b_blkno,
+ (intmax_t)bp->b_lblkno);
+ BUF_LOCKPRINTINFO(bp);
+ if (show_busybufs > 1)
+ vn_printf(bp->b_vp,
+ "vnode content: ");
+ }
+ }
+ }
+ if (nbusy) {
+ /*
+ * Failed to sync all blocks. Indicate this and don't
+ * unmount filesystems (thus forcing an fsck on reboot).
+ */
+ printf("Giving up on %d buffers\n", nbusy);
+ DELAY(5000000); /* 5 seconds */
+ } else {
+ if (!first_buf_printf)
+ printf("Final sync complete\n");
+ /*
+ * Unmount filesystems
+ */
+ if (panicstr == 0)
+ vfs_unmountall();
+ }
+ swapoff_all();
+ DELAY(100000); /* wait for console output to finish */
+ }
+
+ print_uptime();
+
+ cngrab();
+
+ /*
+ * Ok, now do things that assume all filesystem activity has
+ * been completed.
+ */
+ EVENTHANDLER_INVOKE(shutdown_post_sync, howto);
+
+ if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping)
+ doadump(TRUE);
+
+ /* Now that we're going to really halt the system... */
+ EVENTHANDLER_INVOKE(shutdown_final, howto);
+
+ for(;;) ; /* safety against shutdown_reset not working */
+ /* NOTREACHED */
+}
+
+/*
+ * If the shutdown was a clean halt, behave accordingly.
+ */
+static void
+shutdown_halt(void *junk, int howto)
+{
+
+ if (howto & RB_HALT) {
+ printf("\n");
+ printf("The operating system has halted.\n");
+ printf("Please press any key to reboot.\n\n");
+ switch (cngetc()) {
+ case -1: /* No console, just die */
+ cpu_halt();
+ /* NOTREACHED */
+ default:
+ howto &= ~RB_HALT;
+ break;
+ }
+ }
+}
+
+/*
+ * Check to see if the system panicked; if so, pause and then reboot
+ * according to the specified delay.
+ */
+static void
+shutdown_panic(void *junk, int howto)
+{
+ int loop;
+
+ if (howto & RB_DUMP) {
+ if (PANIC_REBOOT_WAIT_TIME != 0) {
+ if (PANIC_REBOOT_WAIT_TIME != -1) {
+ printf("Automatic reboot in %d seconds - "
+ "press a key on the console to abort\n",
+ PANIC_REBOOT_WAIT_TIME);
+ for (loop = PANIC_REBOOT_WAIT_TIME * 10;
+ loop > 0; --loop) {
+ DELAY(1000 * 100); /* 1/10th second */
+ /* Did user type a key? */
+ if (cncheckc() != -1)
+ break;
+ }
+ if (!loop)
+ return;
+ }
+ } else { /* zero time specified - reboot NOW */
+ return;
+ }
+ printf("--> Press a key on the console to reboot,\n");
+ printf("--> or switch off the system now.\n");
+ cngetc();
+ }
+}
+
+/*
+ * Everything done, now reset
+ */
+static void
+shutdown_reset(void *junk, int howto)
+{
+
+ printf("Rebooting...\n");
+ DELAY(1000000); /* wait 1 sec for printf's to complete and be read */
+
+ /*
+ * Acquiring smp_ipi_mtx here has a double effect:
+ * - it disables interrupts, preventing CPU0 from being preempted
+ *   by fast interrupt handlers (which could otherwise deadlock
+ *   against other CPUs)
+ * - it avoids deadlocks against smp_rendezvous() or, more
+ *   generally, against threads busy-waiting with this spinlock
+ *   held while waiting for responses from threads on other CPUs
+ *   (e.g., smp_tlb_shootdown()).
+ *
+ * For the !SMP case it just needs to handle the former problem.
+ */
+#ifdef SMP
+ mtx_lock_spin(&smp_ipi_mtx);
+#else
+ spinlock_enter();
+#endif
+
+ /* cpu_boot(howto); */ /* doesn't do anything at the moment */
+ cpu_reset();
+ /* NOTREACHED */ /* assuming reset worked */
+}
+
+#if defined(WITNESS) || defined(INVARIANTS)
+static int kassert_warn_only = 0;
+#ifdef KDB
+static int kassert_do_kdb = 0;
+#endif
+#ifdef KTR
+static int kassert_do_ktr = 0;
+#endif
+static int kassert_do_log = 1;
+static int kassert_log_pps_limit = 4;
+static int kassert_log_mute_at = 0;
+static int kassert_log_panic_at = 0;
+static int kassert_warnings = 0;
+
+SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW, NULL, "kassert options");
+
+SYSCTL_INT(_debug_kassert, OID_AUTO, warn_only, CTLFLAG_RW | CTLFLAG_TUN,
+ &kassert_warn_only, 0,
+ "KASSERT triggers a panic (1) or just a warning (0)");
+TUNABLE_INT("debug.kassert.warn_only", &kassert_warn_only);
+
+#ifdef KDB
+SYSCTL_INT(_debug_kassert, OID_AUTO, do_kdb, CTLFLAG_RW | CTLFLAG_TUN,
+ &kassert_do_kdb, 0, "KASSERT will enter the debugger");
+TUNABLE_INT("debug.kassert.do_kdb", &kassert_do_kdb);
+#endif
+
+#ifdef KTR
+SYSCTL_UINT(_debug_kassert, OID_AUTO, do_ktr, CTLFLAG_RW | CTLFLAG_TUN,
+ &kassert_do_ktr, 0,
+ "KASSERT does a KTR, set this to the KTRMASK you want");
+TUNABLE_INT("debug.kassert.do_ktr", &kassert_do_ktr);
+#endif
+
+SYSCTL_INT(_debug_kassert, OID_AUTO, do_log, CTLFLAG_RW | CTLFLAG_TUN,
+ &kassert_do_log, 0, "Log KASSERT failures via printf (1) or not (0)");
+TUNABLE_INT("debug.kassert.do_log", &kassert_do_log);
+
+SYSCTL_INT(_debug_kassert, OID_AUTO, warnings, CTLFLAG_RW | CTLFLAG_TUN,
+ &kassert_warnings, 0, "number of KASSERTs that have been triggered");
+TUNABLE_INT("debug.kassert.warnings", &kassert_warnings);
+
+SYSCTL_INT(_debug_kassert, OID_AUTO, log_panic_at, CTLFLAG_RW | CTLFLAG_TUN,
+ &kassert_log_panic_at, 0, "max number of KASSERTS before we will panic");
+TUNABLE_INT("debug.kassert.log_panic_at", &kassert_log_panic_at);
+
+SYSCTL_INT(_debug_kassert, OID_AUTO, log_pps_limit, CTLFLAG_RW | CTLFLAG_TUN,
+ &kassert_log_pps_limit, 0, "limit number of log messages per second");
+TUNABLE_INT("debug.kassert.log_pps_limit", &kassert_log_pps_limit);
+
+SYSCTL_INT(_debug_kassert, OID_AUTO, log_mute_at, CTLFLAG_RW | CTLFLAG_TUN,
+ &kassert_log_mute_at, 0, "max number of KASSERTS to log");
+TUNABLE_INT("debug.kassert.log_mute_at", &kassert_log_mute_at);
+
+static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS);
+
+SYSCTL_PROC(_debug_kassert, OID_AUTO, kassert,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
+ kassert_sysctl_kassert, "I", "set to trigger a test kassert");
+
+static int
+kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS)
+{
+ int error, i;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error == 0) {
+ i = 0;
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ }
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ KASSERT(0, ("kassert_sysctl_kassert triggered kassert %d", i));
+ return (0);
+}
+
+/*
+ * Called by KASSERT, this decides whether we will panic
+ * or whether we will log via printf and/or ktr.
+ */
+void
+kassert_panic(const char *fmt, ...)
+{
+ static char buf[256];
+ va_list ap;
+
+ va_start(ap, fmt);
+ (void)vsnprintf(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+
+ /*
+ * panic if we're not just warning, or if we've exceeded
+ * kassert_log_panic_at warnings.
+ */
+ if (!kassert_warn_only ||
+ (kassert_log_panic_at > 0 &&
+ kassert_warnings >= kassert_log_panic_at)) {
+ va_start(ap, fmt);
+ vpanic(fmt, ap);
+ /* NORETURN */
+ }
+#ifdef KTR
+ if (kassert_do_ktr)
+ CTR0(ktr_mask, buf);
+#endif /* KTR */
+ /*
+ * log if we've not yet met the mute limit.
+ */
+ if (kassert_do_log &&
+ (kassert_log_mute_at == 0 ||
+ kassert_warnings < kassert_log_mute_at)) {
+ static struct timeval lasterr;
+ static int curerr;
+
+ if (ppsratecheck(&lasterr, &curerr, kassert_log_pps_limit)) {
+ printf("KASSERT failed: %s\n", buf);
+ kdb_backtrace();
+ }
+ }
+#ifdef KDB
+ if (kassert_do_kdb) {
+ kdb_enter(KDB_WHY_KASSERT, buf);
+ }
+#endif
+ atomic_add_int(&kassert_warnings, 1);
+}
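+
+/*
+ * Editor's illustrative sketch (kept out of compilation): under
+ * INVARIANTS a failed KASSERT() expands into a call like the one below,
+ * so with debug.kassert.warn_only=1 the violation is logged and counted
+ * instead of panicking.  The refcount check is hypothetical.
+ */
+#if 0
+static void
+example_check_refs(int refs)
+{
+
+	if (refs < 0)
+		kassert_panic("example: negative refcount %d", refs);
+}
+#endif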
+#endif
+
+/*
+ * Panic is called on unresolvable fatal errors. It prints "panic: mesg",
+ * and then reboots. If we are called twice, then we avoid trying to sync
+ * the disks as this often leads to recursive panics.
+ */
+void
+panic(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vpanic(fmt, ap);
+}
+
+static void
+vpanic(const char *fmt, va_list ap)
+{
+#ifdef SMP
+ cpuset_t other_cpus;
+#endif
+ struct thread *td = curthread;
+ int bootopt, newpanic;
+ static char buf[256];
+
+ spinlock_enter();
+
+#ifdef SMP
+ /*
+ * stop_cpus_hard(other_cpus) should prevent multiple CPUs from
+ * concurrently entering panic. Only the winner will proceed
+ * further.
+ */
+ if (panicstr == NULL && !kdb_active) {
+ other_cpus = all_cpus;
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+ stop_cpus_hard(other_cpus);
+ }
+
+ /*
+ * We set td_stopsched here and not in the block above, because
+ * we want to ensure that it is always set once panic has been
+ * called, even if panic has been entered from kdb.
+ */
+ td->td_stopsched = 1;
+#endif
+
+ bootopt = RB_AUTOBOOT;
+ newpanic = 0;
+ if (panicstr)
+ bootopt |= RB_NOSYNC;
+ else {
+ bootopt |= RB_DUMP;
+ panicstr = fmt;
+ newpanic = 1;
+ }
+
+ if (newpanic) {
+ (void)vsnprintf(buf, sizeof(buf), fmt, ap);
+ panicstr = buf;
+ cngrab();
+ printf("panic: %s\n", buf);
+ } else {
+ printf("panic: ");
+ vprintf(fmt, ap);
+ printf("\n");
+ }
+#ifdef SMP
+ printf("cpuid = %d\n", PCPU_GET(cpuid));
+#endif
+
+#ifdef KDB
+ if (newpanic && trace_on_panic)
+ kdb_backtrace();
+ if (debugger_on_panic)
+ kdb_enter(KDB_WHY_PANIC, "panic");
+#endif
+ /*thread_lock(td); */
+ td->td_flags |= TDF_INPANIC;
+ /* thread_unlock(td); */
+ if (!sync_on_panic)
+ bootopt |= RB_NOSYNC;
+ kern_reboot(bootopt);
+}
+
+/*
+ * Support for poweroff delay.
+ *
+ * Please note that setting this delay too short might power off your machine
+ * before the write cache on your hard disk has been flushed, leading to
+ * soft-updates inconsistencies.
+ */
+#ifndef POWEROFF_DELAY
+# define POWEROFF_DELAY 5000
+#endif
+static int poweroff_delay = POWEROFF_DELAY;
+
+SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW,
+ &poweroff_delay, 0, "Delay before poweroff to write disk caches (msec)");
+
+static void
+poweroff_wait(void *junk, int howto)
+{
+
+ if (!(howto & RB_POWEROFF) || poweroff_delay <= 0)
+ return;
+ DELAY(poweroff_delay * 1000);
+}
+
+/*
+ * Some system processes (e.g. syncer) need to be stopped at appropriate
+ * points in their main loops prior to a system shutdown, so that they
+ * won't interfere with the shutdown process (e.g. by holding a disk buf
+ * to cause sync to fail). For each of these system processes, register
+ * kproc_shutdown() as a handler for one of the shutdown events.
+ */
+static int kproc_shutdown_wait = 60;
+SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW,
+ &kproc_shutdown_wait, 0, "Max wait time (sec) to stop for each process");
+
+void
+kproc_shutdown(void *arg, int howto)
+{
+ struct proc *p;
+ int error;
+
+ if (panicstr)
+ return;
+
+ p = (struct proc *)arg;
+ printf("Waiting (max %d seconds) for system process `%s' to stop...",
+ kproc_shutdown_wait, p->p_comm);
+ error = kproc_suspend(p, kproc_shutdown_wait * hz);
+
+ if (error == EWOULDBLOCK)
+ printf("timed out\n");
+ else
+ printf("done\n");
+}
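+
+/*
+ * Editor's illustrative sketch (kept out of compilation): a system
+ * process registers the handler above at an appropriate shutdown event,
+ * typically when its main kproc starts.  The process pointer name is
+ * hypothetical.
+ */
+#if 0
+static struct proc *example_kproc;
+
+static void
+example_register_shutdown(void)
+{
+
+	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown,
+	    example_kproc, SHUTDOWN_PRI_FIRST);
+}
+#endif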
+
+void
+kthread_shutdown(void *arg, int howto)
+{
+ struct thread *td;
+ int error;
+
+ if (panicstr)
+ return;
+
+ td = (struct thread *)arg;
+ printf("Waiting (max %d seconds) for system thread `%s' to stop...",
+ kproc_shutdown_wait, td->td_name);
+ error = kthread_suspend(td, kproc_shutdown_wait * hz);
+
+ if (error == EWOULDBLOCK)
+ printf("timed out\n");
+ else
+ printf("done\n");
+}
+
+static char dumpdevname[sizeof(((struct cdev*)NULL)->si_name)];
+SYSCTL_STRING(_kern_shutdown, OID_AUTO, dumpdevname, CTLFLAG_RD,
+ dumpdevname, 0, "Device for kernel dumps");
+
+/* Registration of dumpers */
+int
+set_dumper(struct dumperinfo *di, const char *devname)
+{
+ size_t wantcopy;
+
+ if (di == NULL) {
+ bzero(&dumper, sizeof dumper);
+ dumpdevname[0] = '\0';
+ return (0);
+ }
+ if (dumper.dumper != NULL)
+ return (EBUSY);
+ dumper = *di;
+ wantcopy = strlcpy(dumpdevname, devname, sizeof(dumpdevname));
+ if (wantcopy >= sizeof(dumpdevname)) {
+ printf("set_dumper: device name truncated from '%s' -> '%s'\n",
+ devname, dumpdevname);
+ }
+ return (0);
+}
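+
+/*
+ * Editor's illustrative sketch (kept out of compilation): a storage
+ * driver registers itself as the dump target by filling in a
+ * dumperinfo and calling set_dumper().  Only the fields referenced in
+ * this file are shown; the callback, device name and media size are
+ * hypothetical.
+ */
+#if 0
+static int
+example_dumper(void *priv, void *virtual, vm_offset_t physical,
+    off_t offset, size_t length)
+{
+
+	/* Write 'length' bytes from 'virtual' at media byte 'offset'. */
+	return (0);
+}
+
+static void
+example_register_dumper(void)
+{
+	struct dumperinfo di;
+
+	bzero(&di, sizeof(di));
+	di.dumper = example_dumper;
+	di.priv = NULL;
+	di.mediaoffset = 0;
+	di.mediasize = 1024 * 1024 * 1024;	/* 1GB of dump media */
+	(void)set_dumper(&di, "example0");
+}
+#endif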
+
+/* Call dumper with bounds checking. */
+int
+dump_write(struct dumperinfo *di, void *virtual, vm_offset_t physical,
+ off_t offset, size_t length)
+{
+
+ if (length != 0 && (offset < di->mediaoffset ||
+ offset - di->mediaoffset + length > di->mediasize)) {
+ printf("Attempt to write outside dump device boundaries.\n"
+ "offset(%jd), mediaoffset(%jd), length(%ju), mediasize(%jd).\n",
+ (intmax_t)offset, (intmax_t)di->mediaoffset,
+ (uintmax_t)length, (intmax_t)di->mediasize);
+ return (ENOSPC);
+ }
+ return (di->dumper(di->priv, virtual, physical, offset, length));
+}
+
+void
+mkdumpheader(struct kerneldumpheader *kdh, char *magic, uint32_t archver,
+ uint64_t dumplen, uint32_t blksz)
+{
+
+ bzero(kdh, sizeof(*kdh));
+ strncpy(kdh->magic, magic, sizeof(kdh->magic));
+ strncpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture));
+ kdh->version = htod32(KERNELDUMPVERSION);
+ kdh->architectureversion = htod32(archver);
+ kdh->dumplength = htod64(dumplen);
+ kdh->dumptime = htod64(time_second);
+ kdh->blocksize = htod32(blksz);
+ strncpy(kdh->hostname, prison0.pr_hostname, sizeof(kdh->hostname));
+ strncpy(kdh->versionstring, version, sizeof(kdh->versionstring));
+ if (panicstr != NULL)
+ strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring));
+ kdh->parity = kerneldump_parity(kdh);
+}
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
new file mode 100644
index 0000000..1797ebc
--- /dev/null
+++ b/sys/kern/kern_sig.c
@@ -0,0 +1,3469 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+#include "opt_core.h"
+#include "opt_procdesc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/signalvar.h>
+#include <sys/vnode.h>
+#include <sys/acct.h>
+#include <sys/capability.h>
+#include <sys/condvar.h>
+#include <sys/event.h>
+#include <sys/fcntl.h>
+#include <sys/imgact.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/ktrace.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/procdesc.h>
+#include <sys/posix4.h>
+#include <sys/pioctl.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sdt.h>
+#include <sys/sbuf.h>
+#include <sys/sleepqueue.h>
+#include <sys/smp.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+#include <sys/timers.h>
+#include <sys/unistd.h>
+#include <sys/wait.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/uma.h>
+
+#include <sys/jail.h>
+
+#include <machine/cpu.h>
+
+#include <security/audit/audit.h>
+
+#define ONSIG 32 /* NSIG for osig* syscalls. XXX. */
+
+SDT_PROVIDER_DECLARE(proc);
+SDT_PROBE_DEFINE3(proc, kernel, , signal_send, signal-send, "struct thread *",
+ "struct proc *", "int");
+SDT_PROBE_DEFINE2(proc, kernel, , signal_clear, signal-clear, "int",
+ "ksiginfo_t *");
+SDT_PROBE_DEFINE3(proc, kernel, , signal_discard, signal-discard,
+ "struct thread *", "struct proc *", "int");
+
+static int coredump(struct thread *);
+static int killpg1(struct thread *td, int sig, int pgid, int all,
+ ksiginfo_t *ksi);
+static int issignal(struct thread *td);
+static int sigprop(int sig);
+static void tdsigwakeup(struct thread *, int, sig_t, int);
+static void sig_suspend_threads(struct thread *, struct proc *, int);
+static int filt_sigattach(struct knote *kn);
+static void filt_sigdetach(struct knote *kn);
+static int filt_signal(struct knote *kn, long hint);
+static struct thread *sigtd(struct proc *p, int sig, int prop);
+static void sigqueue_start(void);
+
+static uma_zone_t ksiginfo_zone = NULL;
+struct filterops sig_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_sigattach,
+ .f_detach = filt_sigdetach,
+ .f_event = filt_signal,
+};
+
+static int kern_logsigexit = 1;
+SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW,
+ &kern_logsigexit, 0,
+ "Log processes quitting on abnormal signals to syslog(3)");
+
+static int kern_forcesigexit = 1;
+SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW,
+ &kern_forcesigexit, 0, "Force trap signal to be handled");
+
+static SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0,
+ "POSIX real time signal");
+
+static int max_pending_per_proc = 128;
+SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW,
+ &max_pending_per_proc, 0, "Max pending signals per proc");
+
+static int preallocate_siginfo = 1024;
+TUNABLE_INT("kern.sigqueue.preallocate", &preallocate_siginfo);
+SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RD,
+ &preallocate_siginfo, 0, "Preallocated signal memory size");
+
+static int signal_overflow = 0;
+SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD,
+    &signal_overflow, 0, "Number of signals that overflowed");
+
+static int signal_alloc_fail = 0;
+SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD,
+    &signal_alloc_fail, 0, "Number of failed signal allocations");
+
+SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL);
+
+/*
+ * Policy -- Can ucred cr1 send SIGIO to a process with ucred cr2?
+ * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG
+ * in the right situations.
+ */
+#define CANSIGIO(cr1, cr2) \
+ ((cr1)->cr_uid == 0 || \
+ (cr1)->cr_ruid == (cr2)->cr_ruid || \
+ (cr1)->cr_uid == (cr2)->cr_ruid || \
+ (cr1)->cr_ruid == (cr2)->cr_uid || \
+ (cr1)->cr_uid == (cr2)->cr_uid)
+
+static int sugid_coredump;
+TUNABLE_INT("kern.sugid_coredump", &sugid_coredump);
+SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW,
+ &sugid_coredump, 0, "Allow setuid and setgid processes to dump core");
+
+static int capmode_coredump;
+TUNABLE_INT("kern.capmode_coredump", &capmode_coredump);
+SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RW,
+ &capmode_coredump, 0, "Allow processes in capability mode to dump core");
+
+static int do_coredump = 1;
+SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
+ &do_coredump, 0, "Enable/Disable coredumps");
+
+static int set_core_nodump_flag = 0;
+SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag,
+ 0, "Enable setting the NODUMP flag on coredump files");
+
+/*
+ * Signal properties and actions.
+ * The array below categorizes the signals and their default actions
+ * according to the following properties:
+ */
+#define SA_KILL 0x01 /* terminates process by default */
+#define SA_CORE 0x02 /* ditto and coredumps */
+#define SA_STOP 0x04 /* suspend process */
+#define SA_TTYSTOP 0x08 /* ditto, from tty */
+#define SA_IGNORE 0x10 /* ignore by default */
+#define SA_CONT 0x20 /* continue if suspended */
+#define SA_CANTMASK 0x40 /* non-maskable, catchable */
+
+static int sigproptbl[NSIG] = {
+ SA_KILL, /* SIGHUP */
+ SA_KILL, /* SIGINT */
+ SA_KILL|SA_CORE, /* SIGQUIT */
+ SA_KILL|SA_CORE, /* SIGILL */
+ SA_KILL|SA_CORE, /* SIGTRAP */
+ SA_KILL|SA_CORE, /* SIGABRT */
+ SA_KILL|SA_CORE, /* SIGEMT */
+ SA_KILL|SA_CORE, /* SIGFPE */
+ SA_KILL, /* SIGKILL */
+ SA_KILL|SA_CORE, /* SIGBUS */
+ SA_KILL|SA_CORE, /* SIGSEGV */
+ SA_KILL|SA_CORE, /* SIGSYS */
+ SA_KILL, /* SIGPIPE */
+ SA_KILL, /* SIGALRM */
+ SA_KILL, /* SIGTERM */
+ SA_IGNORE, /* SIGURG */
+ SA_STOP, /* SIGSTOP */
+ SA_STOP|SA_TTYSTOP, /* SIGTSTP */
+ SA_IGNORE|SA_CONT, /* SIGCONT */
+ SA_IGNORE, /* SIGCHLD */
+ SA_STOP|SA_TTYSTOP, /* SIGTTIN */
+ SA_STOP|SA_TTYSTOP, /* SIGTTOU */
+ SA_IGNORE, /* SIGIO */
+ SA_KILL, /* SIGXCPU */
+ SA_KILL, /* SIGXFSZ */
+ SA_KILL, /* SIGVTALRM */
+ SA_KILL, /* SIGPROF */
+ SA_IGNORE, /* SIGWINCH */
+ SA_IGNORE, /* SIGINFO */
+ SA_KILL, /* SIGUSR1 */
+ SA_KILL, /* SIGUSR2 */
+};
+
+static void reschedule_signals(struct proc *p, sigset_t block, int flags);
+
+static void
+sigqueue_start(void)
+{
+ ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ uma_prealloc(ksiginfo_zone, preallocate_siginfo);
+ p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS);
+ p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1);
+ p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc);
+}
+
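+/*
+ * Allocate a zeroed ksiginfo from the UMA zone.  A non-zero wait argument
+ * allows the allocation to sleep; otherwise M_NOWAIT is used and NULL may
+ * be returned.
+ */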
+ksiginfo_t *
+ksiginfo_alloc(int wait)
+{
+ int flags;
+
+ flags = M_ZERO;
+ if (! wait)
+ flags |= M_NOWAIT;
+ if (ksiginfo_zone != NULL)
+ return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags));
+ return (NULL);
+}
+
+void
+ksiginfo_free(ksiginfo_t *ksi)
+{
+ uma_zfree(ksiginfo_zone, ksi);
+}
+
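+/*
+ * Free a ksiginfo unless it is externally managed (KSI_EXT); return 1 if
+ * it was freed, 0 otherwise.
+ */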
+static __inline int
+ksiginfo_tryfree(ksiginfo_t *ksi)
+{
+ if (!(ksi->ksi_flags & KSI_EXT)) {
+ uma_zfree(ksiginfo_zone, ksi);
+ return (1);
+ }
+ return (0);
+}
+
+void
+sigqueue_init(sigqueue_t *list, struct proc *p)
+{
+ SIGEMPTYSET(list->sq_signals);
+ SIGEMPTYSET(list->sq_kill);
+ TAILQ_INIT(&list->sq_list);
+ list->sq_proc = p;
+ list->sq_flags = SQ_INIT;
+}
+
+/*
+ * Get a signal's ksiginfo.
+ * Return:
+ * 0 - signal not found
+ * others - signal number
+ */
+static int
+sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si)
+{
+ struct proc *p = sq->sq_proc;
+ struct ksiginfo *ksi, *next;
+ int count = 0;
+
+ KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
+
+ if (!SIGISMEMBER(sq->sq_signals, signo))
+ return (0);
+
+ if (SIGISMEMBER(sq->sq_kill, signo)) {
+ count++;
+ SIGDELSET(sq->sq_kill, signo);
+ }
+
+ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
+ if (ksi->ksi_signo == signo) {
+ if (count == 0) {
+ TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = NULL;
+ ksiginfo_copy(ksi, si);
+ if (ksiginfo_tryfree(ksi) && p != NULL)
+ p->p_pendingcnt--;
+ }
+ if (++count > 1)
+ break;
+ }
+ }
+
+ if (count <= 1)
+ SIGDELSET(sq->sq_signals, signo);
+ si->ksi_signo = signo;
+ return (signo);
+}
+
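+/*
+ * Remove a queued ksiginfo from the sigqueue it sits on, clearing the
+ * pending bit for its signal when no other instance of that signal
+ * remains queued.
+ */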
+void
+sigqueue_take(ksiginfo_t *ksi)
+{
+ struct ksiginfo *kp;
+ struct proc *p;
+ sigqueue_t *sq;
+
+ if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL)
+ return;
+
+ p = sq->sq_proc;
+ TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = NULL;
+ if (!(ksi->ksi_flags & KSI_EXT) && p != NULL)
+ p->p_pendingcnt--;
+
+ for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL;
+ kp = TAILQ_NEXT(kp, ksi_link)) {
+ if (kp->ksi_signo == ksi->ksi_signo)
+ break;
+ }
+ if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo))
+ SIGDELSET(sq->sq_signals, ksi->ksi_signo);
+}
+
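+/*
+ * Queue a signal on the given sigqueue.  SIGKILL, SIGSTOP and signals
+ * without siginfo are recorded only in the sq_kill bitmap; other signals
+ * have their ksiginfo inserted (or copied) onto sq_list, falling back to
+ * the bitmap (or returning EAGAIN for sigqueue(2) signals) when the
+ * per-process limit is reached or allocation fails.
+ */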
+static int
+sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si)
+{
+ struct proc *p = sq->sq_proc;
+ struct ksiginfo *ksi;
+ int ret = 0;
+
+ KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
+
+ if (signo == SIGKILL || signo == SIGSTOP || si == NULL) {
+ SIGADDSET(sq->sq_kill, signo);
+ goto out_set_bit;
+ }
+
+ /* directly insert the ksi, don't copy it */
+ if (si->ksi_flags & KSI_INS) {
+ if (si->ksi_flags & KSI_HEAD)
+ TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link);
+ else
+ TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link);
+ si->ksi_sigq = sq;
+ goto out_set_bit;
+ }
+
+ if (__predict_false(ksiginfo_zone == NULL)) {
+ SIGADDSET(sq->sq_kill, signo);
+ goto out_set_bit;
+ }
+
+ if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) {
+ signal_overflow++;
+ ret = EAGAIN;
+ } else if ((ksi = ksiginfo_alloc(0)) == NULL) {
+ signal_alloc_fail++;
+ ret = EAGAIN;
+ } else {
+ if (p != NULL)
+ p->p_pendingcnt++;
+ ksiginfo_copy(si, ksi);
+ ksi->ksi_signo = signo;
+ if (si->ksi_flags & KSI_HEAD)
+ TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link);
+ else
+ TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = sq;
+ }
+
+ if ((si->ksi_flags & KSI_TRAP) != 0 ||
+ (si->ksi_flags & KSI_SIGQ) == 0) {
+ if (ret != 0)
+ SIGADDSET(sq->sq_kill, signo);
+ ret = 0;
+ goto out_set_bit;
+ }
+
+ if (ret != 0)
+ return (ret);
+
+out_set_bit:
+ SIGADDSET(sq->sq_signals, signo);
+ return (ret);
+}
+
+void
+sigqueue_flush(sigqueue_t *sq)
+{
+ struct proc *p = sq->sq_proc;
+ ksiginfo_t *ksi;
+
+ KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
+
+ if (p != NULL)
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) {
+ TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = NULL;
+ if (ksiginfo_tryfree(ksi) && p != NULL)
+ p->p_pendingcnt--;
+ }
+
+ SIGEMPTYSET(sq->sq_signals);
+ SIGEMPTYSET(sq->sq_kill);
+}
+
+static void
+sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set)
+{
+ sigset_t tmp;
+ struct proc *p1, *p2;
+ ksiginfo_t *ksi, *next;
+
+ KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited"));
+ KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited"));
+ p1 = src->sq_proc;
+ p2 = dst->sq_proc;
+ /* Move siginfo to target list */
+ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) {
+ if (SIGISMEMBER(*set, ksi->ksi_signo)) {
+ TAILQ_REMOVE(&src->sq_list, ksi, ksi_link);
+ if (p1 != NULL)
+ p1->p_pendingcnt--;
+ TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = dst;
+ if (p2 != NULL)
+ p2->p_pendingcnt++;
+ }
+ }
+
+ /* Move pending bits to target list */
+ tmp = src->sq_kill;
+ SIGSETAND(tmp, *set);
+ SIGSETOR(dst->sq_kill, tmp);
+ SIGSETNAND(src->sq_kill, tmp);
+
+ tmp = src->sq_signals;
+ SIGSETAND(tmp, *set);
+ SIGSETOR(dst->sq_signals, tmp);
+ SIGSETNAND(src->sq_signals, tmp);
+}
+
+#if 0
+static void
+sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo)
+{
+ sigset_t set;
+
+ SIGEMPTYSET(set);
+ SIGADDSET(set, signo);
+ sigqueue_move_set(src, dst, &set);
+}
+#endif
+
+static void
+sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set)
+{
+ struct proc *p = sq->sq_proc;
+ ksiginfo_t *ksi, *next;
+
+ KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
+
+ /* Remove matching siginfo entries from the queue. */
+ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
+ if (SIGISMEMBER(*set, ksi->ksi_signo)) {
+ TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
+ ksi->ksi_sigq = NULL;
+ if (ksiginfo_tryfree(ksi) && p != NULL)
+ p->p_pendingcnt--;
+ }
+ }
+ SIGSETNAND(sq->sq_kill, *set);
+ SIGSETNAND(sq->sq_signals, *set);
+}
+
+void
+sigqueue_delete(sigqueue_t *sq, int signo)
+{
+ sigset_t set;
+
+ SIGEMPTYSET(set);
+ SIGADDSET(set, signo);
+ sigqueue_delete_set(sq, &set);
+}
+
+/* Remove a set of signals for a process */
+static void
+sigqueue_delete_set_proc(struct proc *p, const sigset_t *set)
+{
+ sigqueue_t worklist;
+ struct thread *td0;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ sigqueue_init(&worklist, NULL);
+ sigqueue_move_set(&p->p_sigqueue, &worklist, set);
+
+ FOREACH_THREAD_IN_PROC(p, td0)
+ sigqueue_move_set(&td0->td_sigqueue, &worklist, set);
+
+ sigqueue_flush(&worklist);
+}
+
+void
+sigqueue_delete_proc(struct proc *p, int signo)
+{
+ sigset_t set;
+
+ SIGEMPTYSET(set);
+ SIGADDSET(set, signo);
+ sigqueue_delete_set_proc(p, &set);
+}
+
+static void
+sigqueue_delete_stopmask_proc(struct proc *p)
+{
+ sigset_t set;
+
+ SIGEMPTYSET(set);
+ SIGADDSET(set, SIGSTOP);
+ SIGADDSET(set, SIGTSTP);
+ SIGADDSET(set, SIGTTIN);
+ SIGADDSET(set, SIGTTOU);
+ sigqueue_delete_set_proc(p, &set);
+}
+
+/*
+ * Determine the signal that should be delivered to thread td, the current
+ * thread; return 0 if there is none. If there is a pending stop signal
+ * with a default action, the process stops in issignal().
+ */
+int
+cursig(struct thread *td)
+{
+ PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
+ mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
+ return (SIGPENDING(td) ? issignal(td) : 0);
+}
+
+/*
+ * Arrange for ast() to handle unmasked pending signals on return to user
+ * mode. This must be called whenever a signal is added to td_sigqueue or
+ * unmasked in td_sigmask.
+ */
+void
+signotify(struct thread *td)
+{
+ struct proc *p;
+
+ p = td->td_proc;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ if (SIGPENDING(td)) {
+ thread_lock(td);
+ td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING;
+ thread_unlock(td);
+ }
+}
+
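+/*
+ * Return non-zero if the given stack pointer lies on the current thread's
+ * alternate signal stack.
+ */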
+int
+sigonstack(size_t sp)
+{
+ struct thread *td = curthread;
+
+ return ((td->td_pflags & TDP_ALTSTACK) ?
+#if defined(COMPAT_43)
+ ((td->td_sigstk.ss_size == 0) ?
+ (td->td_sigstk.ss_flags & SS_ONSTACK) :
+ ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size))
+#else
+ ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size)
+#endif
+ : 0);
+}
+
+static __inline int
+sigprop(int sig)
+{
+
+ if (sig > 0 && sig < NSIG)
+ return (sigproptbl[_SIG_IDX(sig)]);
+ return (0);
+}
+
+int
+sig_ffs(sigset_t *set)
+{
+ int i;
+
+ for (i = 0; i < _SIG_WORDS; i++)
+ if (set->__bits[i])
+ return (ffs(set->__bits[i]) + (i * 32));
+ return (0);
+}
+
+/*
+ * kern_sigaction
+ * sigaction
+ * freebsd4_sigaction
+ * osigaction
+ */
+int
+kern_sigaction(td, sig, act, oact, flags)
+ struct thread *td;
+ register int sig;
+ struct sigaction *act, *oact;
+ int flags;
+{
+ struct sigacts *ps;
+ struct proc *p = td->td_proc;
+
+ if (!_SIG_VALID(sig))
+ return (EINVAL);
+
+ PROC_LOCK(p);
+ ps = p->p_sigacts;
+ mtx_lock(&ps->ps_mtx);
+ if (oact) {
+ oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)];
+ oact->sa_flags = 0;
+ if (SIGISMEMBER(ps->ps_sigonstack, sig))
+ oact->sa_flags |= SA_ONSTACK;
+ if (!SIGISMEMBER(ps->ps_sigintr, sig))
+ oact->sa_flags |= SA_RESTART;
+ if (SIGISMEMBER(ps->ps_sigreset, sig))
+ oact->sa_flags |= SA_RESETHAND;
+ if (SIGISMEMBER(ps->ps_signodefer, sig))
+ oact->sa_flags |= SA_NODEFER;
+ if (SIGISMEMBER(ps->ps_siginfo, sig)) {
+ oact->sa_flags |= SA_SIGINFO;
+ oact->sa_sigaction =
+ (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)];
+ } else
+ oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)];
+ if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP)
+ oact->sa_flags |= SA_NOCLDSTOP;
+ if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT)
+ oact->sa_flags |= SA_NOCLDWAIT;
+ }
+ if (act) {
+ if ((sig == SIGKILL || sig == SIGSTOP) &&
+ act->sa_handler != SIG_DFL) {
+ mtx_unlock(&ps->ps_mtx);
+ PROC_UNLOCK(p);
+ return (EINVAL);
+ }
+
+ /*
+ * Change setting atomically.
+ */
+
+ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask;
+ SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]);
+ if (act->sa_flags & SA_SIGINFO) {
+ ps->ps_sigact[_SIG_IDX(sig)] =
+ (__sighandler_t *)act->sa_sigaction;
+ SIGADDSET(ps->ps_siginfo, sig);
+ } else {
+ ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler;
+ SIGDELSET(ps->ps_siginfo, sig);
+ }
+ if (!(act->sa_flags & SA_RESTART))
+ SIGADDSET(ps->ps_sigintr, sig);
+ else
+ SIGDELSET(ps->ps_sigintr, sig);
+ if (act->sa_flags & SA_ONSTACK)
+ SIGADDSET(ps->ps_sigonstack, sig);
+ else
+ SIGDELSET(ps->ps_sigonstack, sig);
+ if (act->sa_flags & SA_RESETHAND)
+ SIGADDSET(ps->ps_sigreset, sig);
+ else
+ SIGDELSET(ps->ps_sigreset, sig);
+ if (act->sa_flags & SA_NODEFER)
+ SIGADDSET(ps->ps_signodefer, sig);
+ else
+ SIGDELSET(ps->ps_signodefer, sig);
+ if (sig == SIGCHLD) {
+ if (act->sa_flags & SA_NOCLDSTOP)
+ ps->ps_flag |= PS_NOCLDSTOP;
+ else
+ ps->ps_flag &= ~PS_NOCLDSTOP;
+ if (act->sa_flags & SA_NOCLDWAIT) {
+ /*
+ * Paranoia: since SA_NOCLDWAIT is implemented
+ * by reparenting the dying child to PID 1 (and
+ * trusting it to reap the zombie), PID 1 itself
+ * is forbidden to set SA_NOCLDWAIT.
+ */
+ if (p->p_pid == 1)
+ ps->ps_flag &= ~PS_NOCLDWAIT;
+ else
+ ps->ps_flag |= PS_NOCLDWAIT;
+ } else
+ ps->ps_flag &= ~PS_NOCLDWAIT;
+ if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
+ ps->ps_flag |= PS_CLDSIGIGN;
+ else
+ ps->ps_flag &= ~PS_CLDSIGIGN;
+ }
+ /*
+ * Set bit in ps_sigignore for signals that are set to SIG_IGN,
+ * and for signals set to SIG_DFL where the default is to
+ * ignore. However, don't put SIGCONT in ps_sigignore, as we
+ * have to restart the process.
+ */
+ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
+ (sigprop(sig) & SA_IGNORE &&
+ ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) {
+ /* never to be seen again */
+ sigqueue_delete_proc(p, sig);
+ if (sig != SIGCONT)
+ /* easier in psignal */
+ SIGADDSET(ps->ps_sigignore, sig);
+ SIGDELSET(ps->ps_sigcatch, sig);
+ } else {
+ SIGDELSET(ps->ps_sigignore, sig);
+ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)
+ SIGDELSET(ps->ps_sigcatch, sig);
+ else
+ SIGADDSET(ps->ps_sigcatch, sig);
+ }
+#ifdef COMPAT_FREEBSD4
+ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
+ ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
+ (flags & KSA_FREEBSD4) == 0)
+ SIGDELSET(ps->ps_freebsd4, sig);
+ else
+ SIGADDSET(ps->ps_freebsd4, sig);
+#endif
+#ifdef COMPAT_43
+ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
+ ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
+ (flags & KSA_OSIGSET) == 0)
+ SIGDELSET(ps->ps_osigset, sig);
+ else
+ SIGADDSET(ps->ps_osigset, sig);
+#endif
+ }
+ mtx_unlock(&ps->ps_mtx);
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigaction_args {
+ int sig;
+ struct sigaction *act;
+ struct sigaction *oact;
+};
+#endif
+int
+sys_sigaction(td, uap)
+ struct thread *td;
+ register struct sigaction_args *uap;
+{
+ struct sigaction act, oact;
+ register struct sigaction *actp, *oactp;
+ int error;
+
+ actp = (uap->act != NULL) ? &act : NULL;
+ oactp = (uap->oact != NULL) ? &oact : NULL;
+ if (actp) {
+ error = copyin(uap->act, actp, sizeof(act));
+ if (error)
+ return (error);
+ }
+ error = kern_sigaction(td, uap->sig, actp, oactp, 0);
+ if (oactp && !error)
+ error = copyout(oactp, uap->oact, sizeof(oact));
+ return (error);
+}
+
+#ifdef COMPAT_FREEBSD4
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd4_sigaction_args {
+ int sig;
+ struct sigaction *act;
+ struct sigaction *oact;
+};
+#endif
+int
+freebsd4_sigaction(td, uap)
+ struct thread *td;
+ register struct freebsd4_sigaction_args *uap;
+{
+ struct sigaction act, oact;
+ register struct sigaction *actp, *oactp;
+ int error;
+
+ actp = (uap->act != NULL) ? &act : NULL;
+ oactp = (uap->oact != NULL) ? &oact : NULL;
+ if (actp) {
+ error = copyin(uap->act, actp, sizeof(act));
+ if (error)
+ return (error);
+ }
+ error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4);
+ if (oactp && !error)
+ error = copyout(oactp, uap->oact, sizeof(oact));
+ return (error);
+}
+#endif /* COMPAT_FREEBSD4 */
+
+#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
+#ifndef _SYS_SYSPROTO_H_
+struct osigaction_args {
+ int signum;
+ struct osigaction *nsa;
+ struct osigaction *osa;
+};
+#endif
+int
+osigaction(td, uap)
+ struct thread *td;
+ register struct osigaction_args *uap;
+{
+ struct osigaction sa;
+ struct sigaction nsa, osa;
+ register struct sigaction *nsap, *osap;
+ int error;
+
+ if (uap->signum <= 0 || uap->signum >= ONSIG)
+ return (EINVAL);
+
+ nsap = (uap->nsa != NULL) ? &nsa : NULL;
+ osap = (uap->osa != NULL) ? &osa : NULL;
+
+ if (nsap) {
+ error = copyin(uap->nsa, &sa, sizeof(sa));
+ if (error)
+ return (error);
+ nsap->sa_handler = sa.sa_handler;
+ nsap->sa_flags = sa.sa_flags;
+ OSIG2SIG(sa.sa_mask, nsap->sa_mask);
+ }
+ error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
+ if (osap && !error) {
+ sa.sa_handler = osap->sa_handler;
+ sa.sa_flags = osap->sa_flags;
+ SIG2OSIG(osap->sa_mask, sa.sa_mask);
+ error = copyout(&sa, uap->osa, sizeof(sa));
+ }
+ return (error);
+}
+
+#if !defined(__i386__)
+/* Avoid replicating the same stub everywhere */
+int
+osigreturn(td, uap)
+ struct thread *td;
+ struct osigreturn_args *uap;
+{
+
+ return (nosys(td, (struct nosys_args *)uap));
+}
+#endif
+#endif /* COMPAT_43 */
+
+/*
+ * Initialize signal state for process 0;
+ * set to ignore signals that are ignored by default.
+ */
+void
+siginit(p)
+ struct proc *p;
+{
+ register int i;
+ struct sigacts *ps;
+
+ PROC_LOCK(p);
+ ps = p->p_sigacts;
+ mtx_lock(&ps->ps_mtx);
+ for (i = 1; i <= NSIG; i++)
+ if (sigprop(i) & SA_IGNORE && i != SIGCONT)
+ SIGADDSET(ps->ps_sigignore, i);
+ mtx_unlock(&ps->ps_mtx);
+ PROC_UNLOCK(p);
+}
+
+/*
+ * Reset signals for an exec of the specified process.
+ */
+void
+execsigs(struct proc *p)
+{
+ struct sigacts *ps;
+ int sig;
+ struct thread *td;
+
+ /*
+ * Reset caught signals. Held signals remain held
+ * through td_sigmask (unless they were caught,
+ * and are now ignored by default).
+ */
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ td = FIRST_THREAD_IN_PROC(p);
+ ps = p->p_sigacts;
+ mtx_lock(&ps->ps_mtx);
+ while (SIGNOTEMPTY(ps->ps_sigcatch)) {
+ sig = sig_ffs(&ps->ps_sigcatch);
+ SIGDELSET(ps->ps_sigcatch, sig);
+ if (sigprop(sig) & SA_IGNORE) {
+ if (sig != SIGCONT)
+ SIGADDSET(ps->ps_sigignore, sig);
+ sigqueue_delete_proc(p, sig);
+ }
+ ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
+ }
+ /*
+ * Reset stack state to the user stack.
+ * Clear set of signals caught on the signal stack.
+ */
+ td->td_sigstk.ss_flags = SS_DISABLE;
+ td->td_sigstk.ss_size = 0;
+ td->td_sigstk.ss_sp = 0;
+ td->td_pflags &= ~TDP_ALTSTACK;
+ /*
+ * Reset the "no zombies if child dies" flag, as Solaris does.
+ */
+ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN);
+ if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
+ ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL;
+ mtx_unlock(&ps->ps_mtx);
+}
+
+/*
+ * kern_sigprocmask()
+ *
+ * Manipulate signal mask.
+ */
+int
+kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset,
+ int flags)
+{
+ sigset_t new_block, oset1;
+ struct proc *p;
+ int error;
+
+ p = td->td_proc;
+ if (!(flags & SIGPROCMASK_PROC_LOCKED))
+ PROC_LOCK(p);
+ if (oset != NULL)
+ *oset = td->td_sigmask;
+
+ error = 0;
+ if (set != NULL) {
+ switch (how) {
+ case SIG_BLOCK:
+ SIG_CANTMASK(*set);
+ oset1 = td->td_sigmask;
+ SIGSETOR(td->td_sigmask, *set);
+ new_block = td->td_sigmask;
+ SIGSETNAND(new_block, oset1);
+ break;
+ case SIG_UNBLOCK:
+ SIGSETNAND(td->td_sigmask, *set);
+ signotify(td);
+ goto out;
+ case SIG_SETMASK:
+ SIG_CANTMASK(*set);
+ oset1 = td->td_sigmask;
+ if (flags & SIGPROCMASK_OLD)
+ SIGSETLO(td->td_sigmask, *set);
+ else
+ td->td_sigmask = *set;
+ new_block = td->td_sigmask;
+ SIGSETNAND(new_block, oset1);
+ signotify(td);
+ break;
+ default:
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * The new_block set contains signals that were not previously
+ * blocked, but are blocked now.
+ *
+ * If we block any signal that was not previously blocked for td
+ * and the process has that signal pending, try to schedule signal
+ * delivery to some thread that does not block the signal, possibly
+ * waking it up.
+ */
+ if (p->p_numthreads != 1)
+ reschedule_signals(p, new_block, flags);
+ }
+
+out:
+ if (!(flags & SIGPROCMASK_PROC_LOCKED))
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigprocmask_args {
+ int how;
+ const sigset_t *set;
+ sigset_t *oset;
+};
+#endif
+int
+sys_sigprocmask(td, uap)
+ register struct thread *td;
+ struct sigprocmask_args *uap;
+{
+ sigset_t set, oset;
+ sigset_t *setp, *osetp;
+ int error;
+
+ setp = (uap->set != NULL) ? &set : NULL;
+ osetp = (uap->oset != NULL) ? &oset : NULL;
+ if (setp) {
+ error = copyin(uap->set, setp, sizeof(set));
+ if (error)
+ return (error);
+ }
+ error = kern_sigprocmask(td, uap->how, setp, osetp, 0);
+ if (osetp && !error) {
+ error = copyout(osetp, uap->oset, sizeof(oset));
+ }
+ return (error);
+}
+
+#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
+#ifndef _SYS_SYSPROTO_H_
+struct osigprocmask_args {
+ int how;
+ osigset_t mask;
+};
+#endif
+int
+osigprocmask(td, uap)
+ register struct thread *td;
+ struct osigprocmask_args *uap;
+{
+ sigset_t set, oset;
+ int error;
+
+ OSIG2SIG(uap->mask, set);
+ error = kern_sigprocmask(td, uap->how, &set, &oset, 1);
+ SIG2OSIG(oset, td->td_retval[0]);
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+int
+sys_sigwait(struct thread *td, struct sigwait_args *uap)
+{
+ ksiginfo_t ksi;
+ sigset_t set;
+ int error;
+
+ error = copyin(uap->set, &set, sizeof(set));
+ if (error) {
+ td->td_retval[0] = error;
+ return (0);
+ }
+
+ error = kern_sigtimedwait(td, set, &ksi, NULL);
+ if (error) {
+ if (error == EINTR && td->td_proc->p_osrel < P_OSREL_SIGWAIT)
+ error = ERESTART;
+ if (error == ERESTART)
+ return (error);
+ td->td_retval[0] = error;
+ return (0);
+ }
+
+ error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo));
+ td->td_retval[0] = error;
+ return (0);
+}
+
+int
+sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap)
+{
+ struct timespec ts;
+ struct timespec *timeout;
+ sigset_t set;
+ ksiginfo_t ksi;
+ int error;
+
+ if (uap->timeout) {
+ error = copyin(uap->timeout, &ts, sizeof(ts));
+ if (error)
+ return (error);
+
+ timeout = &ts;
+ } else
+ timeout = NULL;
+
+ error = copyin(uap->set, &set, sizeof(set));
+ if (error)
+ return (error);
+
+ error = kern_sigtimedwait(td, set, &ksi, timeout);
+ if (error)
+ return (error);
+
+ if (uap->info)
+ error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
+
+ if (error == 0)
+ td->td_retval[0] = ksi.ksi_signo;
+ return (error);
+}
+
+int
+sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap)
+{
+ ksiginfo_t ksi;
+ sigset_t set;
+ int error;
+
+ error = copyin(uap->set, &set, sizeof(set));
+ if (error)
+ return (error);
+
+ error = kern_sigtimedwait(td, set, &ksi, NULL);
+ if (error)
+ return (error);
+
+ if (uap->info)
+ error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
+
+ if (error == 0)
+ td->td_retval[0] = ksi.ksi_signo;
+ return (error);
+}
+
+int
+kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi,
+ struct timespec *timeout)
+{
+ struct sigacts *ps;
+ sigset_t saved_mask, new_block;
+ struct proc *p;
+ int error, sig, timo, timevalid = 0;
+ struct timespec rts, ets, ts;
+ struct timeval tv;
+
+ p = td->td_proc;
+ error = 0;
+ ets.tv_sec = 0;
+ ets.tv_nsec = 0;
+
+ if (timeout != NULL) {
+ if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) {
+ timevalid = 1;
+ getnanouptime(&rts);
+ ets = rts;
+ timespecadd(&ets, timeout);
+ }
+ }
+ ksiginfo_init(ksi);
+ /* Some signals can not be waited for. */
+ SIG_CANTMASK(waitset);
+ ps = p->p_sigacts;
+ PROC_LOCK(p);
+ saved_mask = td->td_sigmask;
+ SIGSETNAND(td->td_sigmask, waitset);
+ for (;;) {
+ mtx_lock(&ps->ps_mtx);
+ sig = cursig(td);
+ mtx_unlock(&ps->ps_mtx);
+ if (sig != 0 && SIGISMEMBER(waitset, sig)) {
+ if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 ||
+ sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) {
+ error = 0;
+ break;
+ }
+ }
+
+ if (error != 0)
+ break;
+
+ /*
+ * POSIX says this must be checked after looking for pending
+ * signals.
+ */
+ if (timeout != NULL) {
+ if (!timevalid) {
+ error = EINVAL;
+ break;
+ }
+ getnanouptime(&rts);
+ if (timespeccmp(&rts, &ets, >=)) {
+ error = EAGAIN;
+ break;
+ }
+ ts = ets;
+ timespecsub(&ts, &rts);
+ TIMESPEC_TO_TIMEVAL(&tv, &ts);
+ timo = tvtohz(&tv);
+ } else {
+ timo = 0;
+ }
+
+ error = msleep(ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", timo);
+
+ if (timeout != NULL) {
+ if (error == ERESTART) {
+ /* A timeout cannot be restarted. */
+ error = EINTR;
+ } else if (error == EAGAIN) {
+ /* We will recalculate the timeout ourselves. */
+ error = 0;
+ }
+ }
+ }
+
+ new_block = saved_mask;
+ SIGSETNAND(new_block, td->td_sigmask);
+ td->td_sigmask = saved_mask;
+ /*
+ * Fewer signals can be delivered to us, reschedule signal
+ * notification.
+ */
+ if (p->p_numthreads != 1)
+ reschedule_signals(p, new_block, 0);
+
+ if (error == 0) {
+ SDT_PROBE(proc, kernel, , signal_clear, sig, ksi, 0, 0, 0);
+
+ if (ksi->ksi_code == SI_TIMER)
+ itimer_accept(p, ksi->ksi_timerid, ksi);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_PSIG)) {
+ sig_t action;
+
+ mtx_lock(&ps->ps_mtx);
+ action = ps->ps_sigact[_SIG_IDX(sig)];
+ mtx_unlock(&ps->ps_mtx);
+ ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code);
+ }
+#endif
+ if (sig == SIGKILL)
+ sigexit(td, sig);
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigpending_args {
+ sigset_t *set;
+};
+#endif
+int
+sys_sigpending(td, uap)
+ struct thread *td;
+ struct sigpending_args *uap;
+{
+ struct proc *p = td->td_proc;
+ sigset_t pending;
+
+ PROC_LOCK(p);
+ pending = p->p_sigqueue.sq_signals;
+ SIGSETOR(pending, td->td_sigqueue.sq_signals);
+ PROC_UNLOCK(p);
+ return (copyout(&pending, uap->set, sizeof(sigset_t)));
+}
+
+#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
+#ifndef _SYS_SYSPROTO_H_
+struct osigpending_args {
+ int dummy;
+};
+#endif
+int
+osigpending(td, uap)
+ struct thread *td;
+ struct osigpending_args *uap;
+{
+ struct proc *p = td->td_proc;
+ sigset_t pending;
+
+ PROC_LOCK(p);
+ pending = p->p_sigqueue.sq_signals;
+ SIGSETOR(pending, td->td_sigqueue.sq_signals);
+ PROC_UNLOCK(p);
+ SIG2OSIG(pending, td->td_retval[0]);
+ return (0);
+}
+#endif /* COMPAT_43 */
+
+#if defined(COMPAT_43)
+/*
+ * Generalized interface signal handler, 4.3-compatible.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct osigvec_args {
+ int signum;
+ struct sigvec *nsv;
+ struct sigvec *osv;
+};
+#endif
+/* ARGSUSED */
+int
+osigvec(td, uap)
+ struct thread *td;
+ register struct osigvec_args *uap;
+{
+ struct sigvec vec;
+ struct sigaction nsa, osa;
+ register struct sigaction *nsap, *osap;
+ int error;
+
+ if (uap->signum <= 0 || uap->signum >= ONSIG)
+ return (EINVAL);
+ nsap = (uap->nsv != NULL) ? &nsa : NULL;
+ osap = (uap->osv != NULL) ? &osa : NULL;
+ if (nsap) {
+ error = copyin(uap->nsv, &vec, sizeof(vec));
+ if (error)
+ return (error);
+ nsap->sa_handler = vec.sv_handler;
+ OSIG2SIG(vec.sv_mask, nsap->sa_mask);
+ nsap->sa_flags = vec.sv_flags;
+ nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */
+ }
+ error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
+ if (osap && !error) {
+ vec.sv_handler = osap->sa_handler;
+ SIG2OSIG(osap->sa_mask, vec.sv_mask);
+ vec.sv_flags = osap->sa_flags;
+ vec.sv_flags &= ~SA_NOCLDWAIT;
+ vec.sv_flags ^= SA_RESTART;
+ error = copyout(&vec, uap->osv, sizeof(vec));
+ }
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct osigblock_args {
+ int mask;
+};
+#endif
+int
+osigblock(td, uap)
+ register struct thread *td;
+ struct osigblock_args *uap;
+{
+ sigset_t set, oset;
+
+ OSIG2SIG(uap->mask, set);
+ kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0);
+ SIG2OSIG(oset, td->td_retval[0]);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct osigsetmask_args {
+ int mask;
+};
+#endif
+int
+osigsetmask(td, uap)
+ struct thread *td;
+ struct osigsetmask_args *uap;
+{
+ sigset_t set, oset;
+
+ OSIG2SIG(uap->mask, set);
+ kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0);
+ SIG2OSIG(oset, td->td_retval[0]);
+ return (0);
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Suspend the calling thread until a signal arrives, providing a mask to
+ * be set in the meantime.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct sigsuspend_args {
+ const sigset_t *sigmask;
+};
+#endif
+/* ARGSUSED */
+int
+sys_sigsuspend(td, uap)
+ struct thread *td;
+ struct sigsuspend_args *uap;
+{
+ sigset_t mask;
+ int error;
+
+ error = copyin(uap->sigmask, &mask, sizeof(mask));
+ if (error)
+ return (error);
+ return (kern_sigsuspend(td, mask));
+}
+
+int
+kern_sigsuspend(struct thread *td, sigset_t mask)
+{
+ struct proc *p = td->td_proc;
+ int has_sig, sig;
+
+ /*
+ * When returning from sigsuspend, we want
+ * the old mask to be restored after the
+ * signal handler has finished. Thus, we
+ * save it here and mark the sigacts structure
+ * to indicate this.
+ */
+ PROC_LOCK(p);
+ kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask,
+ SIGPROCMASK_PROC_LOCKED);
+ td->td_pflags |= TDP_OLDMASK;
+
+ /*
+ * Process signals now. Otherwise, we can get a spurious wakeup
+ * when a signal enters the process queue but is delivered to
+ * another thread, while sigsuspend should return only on signal
+ * delivery.
+ */
+ (p->p_sysent->sv_set_syscall_retval)(td, EINTR);
+ for (has_sig = 0; !has_sig;) {
+ while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause",
+ 0) == 0)
+ /* void */;
+ thread_suspend_check(0);
+ mtx_lock(&p->p_sigacts->ps_mtx);
+ while ((sig = cursig(td)) != 0)
+ has_sig += postsig(sig);
+ mtx_unlock(&p->p_sigacts->ps_mtx);
+ }
+ PROC_UNLOCK(p);
+ td->td_errno = EINTR;
+ td->td_pflags |= TDP_NERRNO;
+ return (EJUSTRETURN);
+}
+
+#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
+/*
+ * Compatibility sigsuspend call for old binaries. Note nonstandard calling
+ * convention: libc stub passes mask, not pointer, to save a copyin.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct osigsuspend_args {
+ osigset_t mask;
+};
+#endif
+/* ARGSUSED */
+int
+osigsuspend(td, uap)
+ struct thread *td;
+ struct osigsuspend_args *uap;
+{
+ sigset_t mask;
+
+ OSIG2SIG(uap->mask, mask);
+ return (kern_sigsuspend(td, mask));
+}
+#endif /* COMPAT_43 */
+
+#if defined(COMPAT_43)
+#ifndef _SYS_SYSPROTO_H_
+struct osigstack_args {
+ struct sigstack *nss;
+ struct sigstack *oss;
+};
+#endif
+/* ARGSUSED */
+int
+osigstack(td, uap)
+ struct thread *td;
+ register struct osigstack_args *uap;
+{
+ struct sigstack nss, oss;
+ int error = 0;
+
+ if (uap->nss != NULL) {
+ error = copyin(uap->nss, &nss, sizeof(nss));
+ if (error)
+ return (error);
+ }
+ oss.ss_sp = td->td_sigstk.ss_sp;
+ oss.ss_onstack = sigonstack(cpu_getstack(td));
+ if (uap->nss != NULL) {
+ td->td_sigstk.ss_sp = nss.ss_sp;
+ td->td_sigstk.ss_size = 0;
+ td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK;
+ td->td_pflags |= TDP_ALTSTACK;
+ }
+ if (uap->oss != NULL)
+ error = copyout(&oss, uap->oss, sizeof(oss));
+
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigaltstack_args {
+ stack_t *ss;
+ stack_t *oss;
+};
+#endif
+/* ARGSUSED */
+int
+sys_sigaltstack(td, uap)
+ struct thread *td;
+ register struct sigaltstack_args *uap;
+{
+ stack_t ss, oss;
+ int error;
+
+ if (uap->ss != NULL) {
+ error = copyin(uap->ss, &ss, sizeof(ss));
+ if (error)
+ return (error);
+ }
+ error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL,
+ (uap->oss != NULL) ? &oss : NULL);
+ if (error)
+ return (error);
+ if (uap->oss != NULL)
+ error = copyout(&oss, uap->oss, sizeof(stack_t));
+ return (error);
+}
+
+int
+kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss)
+{
+ struct proc *p = td->td_proc;
+ int oonstack;
+
+ oonstack = sigonstack(cpu_getstack(td));
+
+ if (oss != NULL) {
+ *oss = td->td_sigstk;
+ oss->ss_flags = (td->td_pflags & TDP_ALTSTACK)
+ ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
+ }
+
+ if (ss != NULL) {
+ if (oonstack)
+ return (EPERM);
+ if ((ss->ss_flags & ~SS_DISABLE) != 0)
+ return (EINVAL);
+ if (!(ss->ss_flags & SS_DISABLE)) {
+ if (ss->ss_size < p->p_sysent->sv_minsigstksz)
+ return (ENOMEM);
+
+ td->td_sigstk = *ss;
+ td->td_pflags |= TDP_ALTSTACK;
+ } else {
+ td->td_pflags &= ~TDP_ALTSTACK;
+ }
+ }
+ return (0);
+}
+
+/*
+ * Common code for kill process group/broadcast kill.
+ * td identifies the calling thread.
+ */
+static int
+killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi)
+{
+ struct proc *p;
+ struct pgrp *pgrp;
+ int err;
+ int ret;
+
+ ret = ESRCH;
+ if (all) {
+ /*
+ * broadcast
+ */
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
+ p == td->td_proc || p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ err = p_cansignal(td, p, sig);
+ if (err == 0) {
+ if (sig)
+ pksignal(p, sig, ksi);
+ ret = err;
+ }
+ else if (ret == ESRCH)
+ ret = err;
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+ } else {
+ sx_slock(&proctree_lock);
+ if (pgid == 0) {
+ /*
+ * zero pgid means send to my process group.
+ */
+ pgrp = td->td_proc->p_pgrp;
+ PGRP_LOCK(pgrp);
+ } else {
+ pgrp = pgfind(pgid);
+ if (pgrp == NULL) {
+ sx_sunlock(&proctree_lock);
+ return (ESRCH);
+ }
+ }
+ sx_sunlock(&proctree_lock);
+ LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
+ p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ err = p_cansignal(td, p, sig);
+ if (err == 0) {
+ if (sig)
+ pksignal(p, sig, ksi);
+ ret = err;
+ }
+ else if (ret == ESRCH)
+ ret = err;
+ PROC_UNLOCK(p);
+ }
+ PGRP_UNLOCK(pgrp);
+ }
+ return (ret);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct kill_args {
+ int pid;
+ int signum;
+};
+#endif
+/* ARGSUSED */
+int
+sys_kill(struct thread *td, struct kill_args *uap)
+{
+ ksiginfo_t ksi;
+ struct proc *p;
+ int error;
+
+ /*
+ * A process in capability mode can send signals only to itself.
+ * The main rationale behind this is that abort(3) is implemented as
+ * kill(getpid(), SIGABRT).
+ */
+ if (IN_CAPABILITY_MODE(td) && uap->pid != td->td_proc->p_pid)
+ return (ECAPMODE);
+
+ AUDIT_ARG_SIGNUM(uap->signum);
+ AUDIT_ARG_PID(uap->pid);
+ if ((u_int)uap->signum > _SIG_MAXSIG)
+ return (EINVAL);
+
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = uap->signum;
+ ksi.ksi_code = SI_USER;
+ ksi.ksi_pid = td->td_proc->p_pid;
+ ksi.ksi_uid = td->td_ucred->cr_ruid;
+
+ if (uap->pid > 0) {
+ /* kill single process */
+ if ((p = pfind(uap->pid)) == NULL) {
+ if ((p = zpfind(uap->pid)) == NULL)
+ return (ESRCH);
+ }
+ AUDIT_ARG_PROCESS(p);
+ error = p_cansignal(td, p, uap->signum);
+ if (error == 0 && uap->signum)
+ pksignal(p, uap->signum, &ksi);
+ PROC_UNLOCK(p);
+ return (error);
+ }
+ switch (uap->pid) {
+ case -1: /* broadcast signal */
+ return (killpg1(td, uap->signum, 0, 1, &ksi));
+ case 0: /* signal own process group */
+ return (killpg1(td, uap->signum, 0, 0, &ksi));
+ default: /* negative explicit process group */
+ return (killpg1(td, uap->signum, -uap->pid, 0, &ksi));
+ }
+ /* NOTREACHED */
+}
+
+int
+sys_pdkill(td, uap)
+ struct thread *td;
+ struct pdkill_args *uap;
+{
+#ifdef PROCDESC
+ struct proc *p;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_SIGNUM(uap->signum);
+ AUDIT_ARG_FD(uap->fd);
+ if ((u_int)uap->signum > _SIG_MAXSIG)
+ return (EINVAL);
+
+ error = procdesc_find(td, uap->fd,
+ cap_rights_init(&rights, CAP_PDKILL), &p);
+ if (error)
+ return (error);
+ AUDIT_ARG_PROCESS(p);
+ error = p_cansignal(td, p, uap->signum);
+ if (error == 0 && uap->signum)
+ kern_psignal(p, uap->signum);
+ PROC_UNLOCK(p);
+ return (error);
+#else
+ return (ENOSYS);
+#endif
+}
+
+#if defined(COMPAT_43)
+#ifndef _SYS_SYSPROTO_H_
+struct okillpg_args {
+ int pgid;
+ int signum;
+};
+#endif
+/* ARGSUSED */
+int
+okillpg(struct thread *td, struct okillpg_args *uap)
+{
+ ksiginfo_t ksi;
+
+ AUDIT_ARG_SIGNUM(uap->signum);
+ AUDIT_ARG_PID(uap->pgid);
+ if ((u_int)uap->signum > _SIG_MAXSIG)
+ return (EINVAL);
+
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = uap->signum;
+ ksi.ksi_code = SI_USER;
+ ksi.ksi_pid = td->td_proc->p_pid;
+ ksi.ksi_uid = td->td_ucred->cr_ruid;
+ return (killpg1(td, uap->signum, uap->pgid, 0, &ksi));
+}
+#endif /* COMPAT_43 */
+
+#ifndef _SYS_SYSPROTO_H_
+struct sigqueue_args {
+ pid_t pid;
+ int signum;
+ /* union sigval */ void *value;
+};
+#endif
+int
+sys_sigqueue(struct thread *td, struct sigqueue_args *uap)
+{
+ ksiginfo_t ksi;
+ struct proc *p;
+ int error;
+
+ if ((u_int)uap->signum > _SIG_MAXSIG)
+ return (EINVAL);
+
+ /*
+ * The specification says sigqueue can send a signal only to a
+ * single process.
+ */
+ if (uap->pid <= 0)
+ return (EINVAL);
+
+ if ((p = pfind(uap->pid)) == NULL) {
+ if ((p = zpfind(uap->pid)) == NULL)
+ return (ESRCH);
+ }
+ error = p_cansignal(td, p, uap->signum);
+ if (error == 0 && uap->signum != 0) {
+ ksiginfo_init(&ksi);
+ ksi.ksi_flags = KSI_SIGQ;
+ ksi.ksi_signo = uap->signum;
+ ksi.ksi_code = SI_QUEUE;
+ ksi.ksi_pid = td->td_proc->p_pid;
+ ksi.ksi_uid = td->td_ucred->cr_ruid;
+ ksi.ksi_value.sival_ptr = uap->value;
+ error = pksignal(p, ksi.ksi_signo, &ksi);
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+/*
+ * Send a signal to a process group.
+ */
+void
+gsignal(int pgid, int sig, ksiginfo_t *ksi)
+{
+ struct pgrp *pgrp;
+
+ if (pgid != 0) {
+ sx_slock(&proctree_lock);
+ pgrp = pgfind(pgid);
+ sx_sunlock(&proctree_lock);
+ if (pgrp != NULL) {
+ pgsignal(pgrp, sig, 0, ksi);
+ PGRP_UNLOCK(pgrp);
+ }
+ }
+}
+
+/*
+ * Send a signal to a process group. If checkctty is 1,
+ * limit to members which have a controlling terminal.
+ */
+void
+pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi)
+{
+ struct proc *p;
+
+ if (pgrp) {
+ PGRP_LOCK_ASSERT(pgrp, MA_OWNED);
+ LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL &&
+ (checkctty == 0 || p->p_flag & P_CONTROLT))
+ pksignal(p, sig, ksi);
+ PROC_UNLOCK(p);
+ }
+ }
+}
+
+/*
+ * Send a signal caused by a trap to the current thread. If it will be
+ * caught immediately, deliver it with correct code. Otherwise, post it
+ * normally.
+ */
+void
+trapsignal(struct thread *td, ksiginfo_t *ksi)
+{
+ struct sigacts *ps;
+ sigset_t mask;
+ struct proc *p;
+ int sig;
+ int code;
+
+ p = td->td_proc;
+ sig = ksi->ksi_signo;
+ code = ksi->ksi_code;
+ KASSERT(_SIG_VALID(sig), ("invalid signal"));
+
+ PROC_LOCK(p);
+ ps = p->p_sigacts;
+ mtx_lock(&ps->ps_mtx);
+ if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) &&
+ !SIGISMEMBER(td->td_sigmask, sig)) {
+ td->td_ru.ru_nsignals++;
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_PSIG))
+ ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
+ &td->td_sigmask, code);
+#endif
+ (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)],
+ ksi, &td->td_sigmask);
+ mask = ps->ps_catchmask[_SIG_IDX(sig)];
+ if (!SIGISMEMBER(ps->ps_signodefer, sig))
+ SIGADDSET(mask, sig);
+ kern_sigprocmask(td, SIG_BLOCK, &mask, NULL,
+ SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED);
+ if (SIGISMEMBER(ps->ps_sigreset, sig)) {
+ /*
+ * See kern_sigaction() for origin of this code.
+ */
+ SIGDELSET(ps->ps_sigcatch, sig);
+ if (sig != SIGCONT &&
+ sigprop(sig) & SA_IGNORE)
+ SIGADDSET(ps->ps_sigignore, sig);
+ ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
+ }
+ mtx_unlock(&ps->ps_mtx);
+ } else {
+ /*
+ * Avoid a possible infinite loop if the thread is
+ * masking the signal or the process is ignoring the
+ * signal.
+ */
+ if (kern_forcesigexit &&
+ (SIGISMEMBER(td->td_sigmask, sig) ||
+ ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) {
+ SIGDELSET(td->td_sigmask, sig);
+ SIGDELSET(ps->ps_sigcatch, sig);
+ SIGDELSET(ps->ps_sigignore, sig);
+ ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
+ }
+ mtx_unlock(&ps->ps_mtx);
+ p->p_code = code; /* XXX for core dump/debugger */
+ p->p_sig = sig; /* XXX to verify code */
+ tdsendsignal(p, td, sig, ksi);
+ }
+ PROC_UNLOCK(p);
+}
+
+static struct thread *
+sigtd(struct proc *p, int sig, int prop)
+{
+ struct thread *td, *signal_td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ /*
+ * Check if current thread can handle the signal without
+ * switching context to another thread.
+ */
+ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig))
+ return (curthread);
+ signal_td = NULL;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (!SIGISMEMBER(td->td_sigmask, sig)) {
+ signal_td = td;
+ break;
+ }
+ }
+ if (signal_td == NULL)
+ signal_td = FIRST_THREAD_IN_PROC(p);
+ return (signal_td);
+}
+
+/*
+ * Send the signal to the process. If the signal has an action, the action
+ * is usually performed by the target process rather than the caller; we add
+ * the signal to the set of pending signals for the process.
+ *
+ * Exceptions:
+ * o When a stop signal is sent to a sleeping process that takes the
+ * default action, the process is stopped without awakening it.
+ * o SIGCONT restarts stopped processes (or puts them back to sleep)
+ * regardless of the signal action (eg, blocked or ignored).
+ *
+ * Other ignored signals are discarded immediately.
+ *
+ * NB: This function may be entered from the debugger via the "kill" DDB
+ * command. There is little that can be done to mitigate the possibly messy
+ * side effects of this unwise possibility.
+ */
+void
+kern_psignal(struct proc *p, int sig)
+{
+ ksiginfo_t ksi;
+
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = sig;
+ ksi.ksi_code = SI_KERNEL;
+ (void) tdsendsignal(p, NULL, sig, &ksi);
+}
+
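+/*
+ * Send a signal, with the supplied siginfo, to a process whose lock is
+ * already held; tdsendsignal() selects the target thread.
+ */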
+int
+pksignal(struct proc *p, int sig, ksiginfo_t *ksi)
+{
+
+ return (tdsendsignal(p, NULL, sig, ksi));
+}
+
+/* Utility function for finding a thread to send signal event to. */
+int
+sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **ttd)
+{
+ struct thread *td;
+
+ if (sigev->sigev_notify == SIGEV_THREAD_ID) {
+ td = tdfind(sigev->sigev_notify_thread_id, p->p_pid);
+ if (td == NULL)
+ return (ESRCH);
+ *ttd = td;
+ } else {
+ *ttd = NULL;
+ PROC_LOCK(p);
+ }
+ return (0);
+}
+
+void
+tdsignal(struct thread *td, int sig)
+{
+ ksiginfo_t ksi;
+
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = sig;
+ ksi.ksi_code = SI_KERNEL;
+ (void) tdsendsignal(td->td_proc, td, sig, &ksi);
+}
+
+void
+tdksignal(struct thread *td, int sig, ksiginfo_t *ksi)
+{
+
+ (void) tdsendsignal(td->td_proc, td, sig, ksi);
+}
+
+int
+tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
+{
+ sig_t action;
+ sigqueue_t *sigqueue;
+ int prop;
+ struct sigacts *ps;
+ int intrval;
+ int ret = 0;
+ int wakeup_swapper;
+
+ MPASS(td == NULL || p == td->td_proc);
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ if (!_SIG_VALID(sig))
+ panic("%s(): invalid signal %d", __func__, sig);
+
+ KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__));
+
+ /*
+ * IEEE Std 1003.1-2001: return success when killing a zombie.
+ */
+ if (p->p_state == PRS_ZOMBIE) {
+ if (ksi && (ksi->ksi_flags & KSI_INS))
+ ksiginfo_tryfree(ksi);
+ return (ret);
+ }
+
+ ps = p->p_sigacts;
+ KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig);
+ prop = sigprop(sig);
+
+ if (td == NULL) {
+ td = sigtd(p, sig, prop);
+ sigqueue = &p->p_sigqueue;
+ } else
+ sigqueue = &td->td_sigqueue;
+
+ SDT_PROBE(proc, kernel, , signal_send, td, p, sig, 0, 0 );
+
+ /*
+ * If the signal is being ignored,
+ * then we forget about it immediately.
+ * (Note: we don't set SIGCONT in ps_sigignore,
+ * and if it is set to SIG_IGN,
+ * action will be SIG_DFL here.)
+ */
+ mtx_lock(&ps->ps_mtx);
+ if (SIGISMEMBER(ps->ps_sigignore, sig)) {
+ SDT_PROBE(proc, kernel, , signal_discard, td, p, sig, 0, 0 );
+
+ mtx_unlock(&ps->ps_mtx);
+ if (ksi && (ksi->ksi_flags & KSI_INS))
+ ksiginfo_tryfree(ksi);
+ return (ret);
+ }
+ if (SIGISMEMBER(td->td_sigmask, sig))
+ action = SIG_HOLD;
+ else if (SIGISMEMBER(ps->ps_sigcatch, sig))
+ action = SIG_CATCH;
+ else
+ action = SIG_DFL;
+ if (SIGISMEMBER(ps->ps_sigintr, sig))
+ intrval = EINTR;
+ else
+ intrval = ERESTART;
+ mtx_unlock(&ps->ps_mtx);
+
+ if (prop & SA_CONT)
+ sigqueue_delete_stopmask_proc(p);
+ else if (prop & SA_STOP) {
+ /*
+ * If sending a tty stop signal to a member of an orphaned
+ * process group, discard the signal here if the action
+ * is default; don't stop the process below if sleeping,
+ * and don't clear any pending SIGCONT.
+ */
+ if ((prop & SA_TTYSTOP) &&
+ (p->p_pgrp->pg_jobc == 0) &&
+ (action == SIG_DFL)) {
+ if (ksi && (ksi->ksi_flags & KSI_INS))
+ ksiginfo_tryfree(ksi);
+ return (ret);
+ }
+ sigqueue_delete_proc(p, SIGCONT);
+ if (p->p_flag & P_CONTINUED) {
+ p->p_flag &= ~P_CONTINUED;
+ PROC_LOCK(p->p_pptr);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(p->p_pptr);
+ }
+ }
+
+ ret = sigqueue_add(sigqueue, sig, ksi);
+ if (ret != 0)
+ return (ret);
+ signotify(td);
+ /*
+ * Defer further processing for signals which are held,
+ * except that stopped processes must be continued by SIGCONT.
+ */
+ if (action == SIG_HOLD &&
+ !((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG)))
+ return (ret);
+ /*
+ * SIGKILL: Remove procfs STOPEVENTs.
+ */
+ if (sig == SIGKILL) {
+ /* from procfs_ioctl.c: PIOCBIC */
+ p->p_stops = 0;
+ /* from procfs_ioctl.c: PIOCCONT */
+ p->p_step = 0;
+ wakeup(&p->p_step);
+ }
+ /*
+ * Some signals have a process-wide effect and a per-thread
+ * component. Most processing occurs when the process next
+ * tries to cross the user boundary; however, there are times
+ * when processing needs to be done immediately, such as
+ * waking up threads so that they can cross the user boundary.
+ * We try to do the per-process part here.
+ */
+ if (P_SHOULDSTOP(p)) {
+ KASSERT(!(p->p_flag & P_WEXIT),
+ ("signal to stopped but exiting process"));
+ if (sig == SIGKILL) {
+ /*
+ * If traced process is already stopped,
+ * then no further action is necessary.
+ */
+ if (p->p_flag & P_TRACED)
+ goto out;
+ /*
+ * SIGKILL sets process running.
+ * It will die elsewhere.
+ * All threads must be restarted.
+ */
+ p->p_flag &= ~P_STOPPED_SIG;
+ goto runfast;
+ }
+
+ if (prop & SA_CONT) {
+ /*
+ * If traced process is already stopped,
+ * then no further action is necessary.
+ */
+ if (p->p_flag & P_TRACED)
+ goto out;
+ /*
+ * If SIGCONT is default (or ignored), we continue the
+ * process but don't leave the signal in sigqueue as
+ * it has no further action. If SIGCONT is held, we
+ * continue the process and leave the signal in
+ * sigqueue. If the process catches SIGCONT, let it
+ * handle the signal itself. If it isn't waiting on
+ * an event, it goes back to run state.
+ * Otherwise, process goes back to sleep state.
+ */
+ p->p_flag &= ~P_STOPPED_SIG;
+ PROC_SLOCK(p);
+ if (p->p_numthreads == p->p_suspcount) {
+ PROC_SUNLOCK(p);
+ p->p_flag |= P_CONTINUED;
+ p->p_xstat = SIGCONT;
+ PROC_LOCK(p->p_pptr);
+ childproc_continued(p);
+ PROC_UNLOCK(p->p_pptr);
+ PROC_SLOCK(p);
+ }
+ if (action == SIG_DFL) {
+ thread_unsuspend(p);
+ PROC_SUNLOCK(p);
+ sigqueue_delete(sigqueue, sig);
+ goto out;
+ }
+ if (action == SIG_CATCH) {
+ /*
+ * The process wants to catch it so it needs
+ * to run at least one thread, but which one?
+ */
+ PROC_SUNLOCK(p);
+ goto runfast;
+ }
+ /*
+ * The signal is not ignored or caught.
+ */
+ thread_unsuspend(p);
+ PROC_SUNLOCK(p);
+ goto out;
+ }
+
+ if (prop & SA_STOP) {
+ /*
+ * If traced process is already stopped,
+ * then no further action is necessary.
+ */
+ if (p->p_flag & P_TRACED)
+ goto out;
+ /*
+ * Already stopped; there is no need to stop again
+ * (if we did, the shell could get confused).
+ * Just make sure the signal's STOP bit is set.
+ */
+ p->p_flag |= P_STOPPED_SIG;
+ sigqueue_delete(sigqueue, sig);
+ goto out;
+ }
+
+ /*
+ * All other kinds of signals:
+ * If a thread is sleeping interruptibly, simulate a
+ * wakeup so that when it is continued it will be made
+ * runnable and can look at the signal. However, don't make
+ * the PROCESS runnable, leave it stopped.
+ * It may run a bit until it hits a thread_suspend_check().
+ */
+ wakeup_swapper = 0;
+ PROC_SLOCK(p);
+ thread_lock(td);
+ if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR))
+ wakeup_swapper = sleepq_abort(td, intrval);
+ thread_unlock(td);
+ PROC_SUNLOCK(p);
+ if (wakeup_swapper)
+ kick_proc0();
+ goto out;
+ /*
+ * Mutexes are short lived. Threads waiting on them will
+ * hit thread_suspend_check() soon.
+ */
+ } else if (p->p_state == PRS_NORMAL) {
+ if (p->p_flag & P_TRACED || action == SIG_CATCH) {
+ tdsigwakeup(td, sig, action, intrval);
+ goto out;
+ }
+
+ MPASS(action == SIG_DFL);
+
+ if (prop & SA_STOP) {
+ if (p->p_flag & (P_PPWAIT|P_WEXIT))
+ goto out;
+ p->p_flag |= P_STOPPED_SIG;
+ p->p_xstat = sig;
+ PROC_SLOCK(p);
+ sig_suspend_threads(td, p, 1);
+ if (p->p_numthreads == p->p_suspcount) {
+ /*
+ * Only a thread sending a signal to another process
+ * can reach here: if the thread were signalling its
+ * own process, p_numthreads could never equal
+ * p_suspcount, because the sending thread does not
+ * suspend itself here.
+ */
+ thread_stopped(p);
+ PROC_SUNLOCK(p);
+ sigqueue_delete_proc(p, p->p_xstat);
+ } else
+ PROC_SUNLOCK(p);
+ goto out;
+ }
+ } else {
+ /* Not in "NORMAL" state. discard the signal. */
+ sigqueue_delete(sigqueue, sig);
+ goto out;
+ }
+
+ /*
+ * The process is not stopped so we need to apply the signal to all the
+ * running threads.
+ */
+runfast:
+ tdsigwakeup(td, sig, action, intrval);
+ PROC_SLOCK(p);
+ thread_unsuspend(p);
+ PROC_SUNLOCK(p);
+out:
+ /* If we jump here, proc slock should not be owned. */
+ PROC_SLOCK_ASSERT(p, MA_NOTOWNED);
+ return (ret);
+}
+
+/*
+ * The force of a signal has been directed against a single
+ * thread. We need to see what we can do about knocking it
+ * out of any sleep it may be in etc.
+ */
+static void
+tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval)
+{
+ struct proc *p = td->td_proc;
+ register int prop;
+ int wakeup_swapper;
+
+ wakeup_swapper = 0;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ prop = sigprop(sig);
+
+ PROC_SLOCK(p);
+ thread_lock(td);
+ /*
+ * Bring the priority of a thread up if we want it to get
+ * killed in this lifetime.
+ */
+ if (action == SIG_DFL && (prop & SA_KILL) && td->td_priority > PUSER)
+ sched_prio(td, PUSER);
+ if (TD_ON_SLEEPQ(td)) {
+ /*
+ * If thread is sleeping uninterruptibly
+ * we can't interrupt the sleep... the signal will
+ * be noticed when the process returns through
+ * trap() or syscall().
+ */
+ if ((td->td_flags & TDF_SINTR) == 0)
+ goto out;
+ /*
+ * If SIGCONT is default (or ignored) and process is
+ * asleep, we are finished; the process should not
+ * be awakened.
+ */
+ if ((prop & SA_CONT) && action == SIG_DFL) {
+ thread_unlock(td);
+ PROC_SUNLOCK(p);
+ sigqueue_delete(&p->p_sigqueue, sig);
+ /*
+ * It may be on either list in this state.
+ * Remove from both for now.
+ */
+ sigqueue_delete(&td->td_sigqueue, sig);
+ return;
+ }
+
+ /*
+ * Don't awaken a sleeping thread for SIGSTOP if the
+ * STOP signal is deferred.
+ */
+ if ((prop & SA_STOP) && (td->td_flags & TDF_SBDRY))
+ goto out;
+
+ /*
+ * Give low priority threads a better chance to run.
+ */
+ if (td->td_priority > PUSER)
+ sched_prio(td, PUSER);
+
+ wakeup_swapper = sleepq_abort(td, intrval);
+ } else {
+ /*
+ * Other states do nothing with the signal immediately,
+ * other than kicking ourselves if we are running.
+ * It will either never be noticed, or noticed very soon.
+ */
+#ifdef SMP
+ if (TD_IS_RUNNING(td) && td != curthread)
+ forward_signal(td);
+#endif
+ }
+out:
+ PROC_SUNLOCK(p);
+ thread_unlock(td);
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
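+/*
+ * Ask all threads in the process to suspend: interruptibly sleeping
+ * threads are suspended directly (unless stops are deferred), while the
+ * rest are flagged so they stop in thread_suspend_check().
+ */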
+static void
+sig_suspend_threads(struct thread *td, struct proc *p, int sending)
+{
+ struct thread *td2;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+
+ FOREACH_THREAD_IN_PROC(p, td2) {
+ thread_lock(td2);
+ td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
+ if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) &&
+ (td2->td_flags & TDF_SINTR)) {
+ if (td2->td_flags & TDF_SBDRY) {
+ /*
+ * Once a thread is asleep with
+ * TDF_SBDRY set, it should never
+ * become suspended due to this check.
+ */
+ KASSERT(!TD_IS_SUSPENDED(td2),
+ ("thread with deferred stops suspended"));
+ } else if (!TD_IS_SUSPENDED(td2)) {
+ thread_suspend_one(td2);
+ }
+ } else if (!TD_IS_SUSPENDED(td2)) {
+ if (sending || td != td2)
+ td2->td_flags |= TDF_ASTPENDING;
+#ifdef SMP
+ if (TD_IS_RUNNING(td2) && td2 != td)
+ forward_signal(td2);
+#endif
+ }
+ thread_unlock(td2);
+ }
+}
+
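+/*
+ * Stop the current thread on behalf of the debugger when a traced
+ * process takes a signal, and return the (possibly different) signal
+ * number the debugger left in td_xsig once the thread resumes.
+ */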
+int
+ptracestop(struct thread *td, int sig)
+{
+ struct proc *p = td->td_proc;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ KASSERT(!(p->p_flag & P_WEXIT), ("Stopping exiting process"));
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
+ &p->p_mtx.lock_object, "Stopping for traced signal");
+
+ td->td_dbgflags |= TDB_XSIG;
+ td->td_xsig = sig;
+ PROC_SLOCK(p);
+ while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) {
+ if (p->p_flag & P_SINGLE_EXIT) {
+ td->td_dbgflags &= ~TDB_XSIG;
+ PROC_SUNLOCK(p);
+ return (sig);
+ }
+		/*
+		 * Just make wait() work; the last stopped thread
+		 * wins.
+		 */
+ p->p_xstat = sig;
+ p->p_xthread = td;
+ p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE);
+ sig_suspend_threads(td, p, 0);
+ if ((td->td_dbgflags & TDB_STOPATFORK) != 0) {
+ td->td_dbgflags &= ~TDB_STOPATFORK;
+ cv_broadcast(&p->p_dbgwait);
+ }
+stopme:
+ thread_suspend_switch(td);
+ if (p->p_xthread == td)
+ p->p_xthread = NULL;
+ if (!(p->p_flag & P_TRACED))
+ break;
+ if (td->td_dbgflags & TDB_SUSPEND) {
+ if (p->p_flag & P_SINGLE_EXIT)
+ break;
+ goto stopme;
+ }
+ }
+ PROC_SUNLOCK(p);
+ return (td->td_xsig);
+}
+
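+/*
+ * After a signal mask change, re-target any process-wide pending
+ * signals in 'block' to threads that can now take them, waking the
+ * chosen thread when the signal is traced or caught.
+ */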
+static void
+reschedule_signals(struct proc *p, sigset_t block, int flags)
+{
+ struct sigacts *ps;
+ struct thread *td;
+ int sig;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if (SIGISEMPTY(p->p_siglist))
+ return;
+ ps = p->p_sigacts;
+ SIGSETAND(block, p->p_siglist);
+ while ((sig = sig_ffs(&block)) != 0) {
+ SIGDELSET(block, sig);
+ td = sigtd(p, sig, 0);
+ signotify(td);
+ if (!(flags & SIGPROCMASK_PS_LOCKED))
+ mtx_lock(&ps->ps_mtx);
+ if (p->p_flag & P_TRACED || SIGISMEMBER(ps->ps_sigcatch, sig))
+ tdsigwakeup(td, sig, SIG_CATCH,
+ (SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR :
+ ERESTART));
+ if (!(flags & SIGPROCMASK_PS_LOCKED))
+ mtx_unlock(&ps->ps_mtx);
+ }
+}
+
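+/*
+ * Signal cleanup for an exiting thread: flush its private signal
+ * queue and, in a multithreaded process, block all signals in the
+ * thread and let the other threads pick up anything still pending.
+ */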
+void
+tdsigcleanup(struct thread *td)
+{
+ struct proc *p;
+ sigset_t unblocked;
+
+ p = td->td_proc;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ sigqueue_flush(&td->td_sigqueue);
+ if (p->p_numthreads == 1)
+ return;
+
+ /*
+ * Since we cannot handle signals, notify signal post code
+ * about this by filling the sigmask.
+ *
+ * Also, if needed, wake up thread(s) that do not block the
+ * same signals as the exiting thread, since the thread might
+ * have been selected for delivery and woken up.
+ */
+ SIGFILLSET(unblocked);
+ SIGSETNAND(unblocked, td->td_sigmask);
+ SIGFILLSET(td->td_sigmask);
+ reschedule_signals(p, unblocked, 0);
+}
+
+/*
+ * Defer the delivery of SIGSTOP for the current thread. Returns true
+ * if stops were deferred and false if they were already deferred.
+ */
+int
+sigdeferstop(void)
+{
+ struct thread *td;
+
+ td = curthread;
+ if (td->td_flags & TDF_SBDRY)
+ return (0);
+ thread_lock(td);
+ td->td_flags |= TDF_SBDRY;
+ thread_unlock(td);
+ return (1);
+}
+
+/*
+ * Permit the delivery of SIGSTOP for the current thread. This does
+ * not immediately suspend if a stop was posted. Instead, the thread
+ * will suspend either via ast() or a subsequent interruptible sleep.
+ */
+void
+sigallowstop()
+{
+ struct thread *td;
+
+ td = curthread;
+ thread_lock(td);
+ td->td_flags &= ~TDF_SBDRY;
+ thread_unlock(td);
+}
+
+/*
+ * If the current process has received a signal (should be caught or cause
+ * termination, should interrupt current syscall), return the signal number.
+ * Stop signals with default action are processed immediately, then cleared;
+ * they aren't returned. This is checked after each entry to the system for
+ * a syscall or trap (though this can usually be done without calling issignal
+ * by checking the pending signal masks in cursig.) The normal call
+ * sequence is
+ *
+ * while (sig = cursig(curthread))
+ * postsig(sig);
+ */
+static int
+issignal(struct thread *td)
+{
+ struct proc *p;
+ struct sigacts *ps;
+ struct sigqueue *queue;
+ sigset_t sigpending;
+ int sig, prop, newsig;
+
+ p = td->td_proc;
+ ps = p->p_sigacts;
+ mtx_assert(&ps->ps_mtx, MA_OWNED);
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ for (;;) {
+ int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);
+
+ sigpending = td->td_sigqueue.sq_signals;
+ SIGSETOR(sigpending, p->p_sigqueue.sq_signals);
+ SIGSETNAND(sigpending, td->td_sigmask);
+
+ if (p->p_flag & P_PPWAIT || td->td_flags & TDF_SBDRY)
+ SIG_STOPSIGMASK(sigpending);
+ if (SIGISEMPTY(sigpending)) /* no signal to send */
+ return (0);
+ sig = sig_ffs(&sigpending);
+
+ if (p->p_stops & S_SIG) {
+ mtx_unlock(&ps->ps_mtx);
+ stopevent(p, S_SIG, sig);
+ mtx_lock(&ps->ps_mtx);
+ }
+
+ /*
+ * We should see pending but ignored signals
+ * only if P_TRACED was on when they were posted.
+ */
+ if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) {
+ sigqueue_delete(&td->td_sigqueue, sig);
+ sigqueue_delete(&p->p_sigqueue, sig);
+ continue;
+ }
+ if (p->p_flag & P_TRACED && (p->p_flag & P_PPTRACE) == 0) {
+ /*
+ * If traced, always stop.
+ * Remove old signal from queue before the stop.
+ * XXX shrug off debugger, it causes siginfo to
+ * be thrown away.
+ */
+ queue = &td->td_sigqueue;
+ td->td_dbgksi.ksi_signo = 0;
+ if (sigqueue_get(queue, sig, &td->td_dbgksi) == 0) {
+ queue = &p->p_sigqueue;
+ sigqueue_get(queue, sig, &td->td_dbgksi);
+ }
+
+ mtx_unlock(&ps->ps_mtx);
+ newsig = ptracestop(td, sig);
+ mtx_lock(&ps->ps_mtx);
+
+ if (sig != newsig) {
+
+ /*
+ * If parent wants us to take the signal,
+ * then it will leave it in p->p_xstat;
+ * otherwise we just look for signals again.
+ */
+ if (newsig == 0)
+ continue;
+ sig = newsig;
+
+ /*
+ * Put the new signal into td_sigqueue. If the
+ * signal is being masked, look for other
+ * signals.
+ */
+ sigqueue_add(queue, sig, NULL);
+ if (SIGISMEMBER(td->td_sigmask, sig))
+ continue;
+ signotify(td);
+ } else {
+ if (td->td_dbgksi.ksi_signo != 0) {
+ td->td_dbgksi.ksi_flags |= KSI_HEAD;
+ if (sigqueue_add(&td->td_sigqueue, sig,
+ &td->td_dbgksi) != 0)
+ td->td_dbgksi.ksi_signo = 0;
+ }
+ if (td->td_dbgksi.ksi_signo == 0)
+ sigqueue_add(&td->td_sigqueue, sig,
+ NULL);
+ }
+
+ /*
+ * If the traced bit got turned off, go back up
+ * to the top to rescan signals. This ensures
+ * that p_sig* and p_sigact are consistent.
+ */
+ if ((p->p_flag & P_TRACED) == 0)
+ continue;
+ }
+
+ prop = sigprop(sig);
+
+ /*
+ * Decide whether the signal should be returned.
+ * Return the signal's number, or fall through
+ * to clear it from the pending mask.
+ */
+ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) {
+
+ case (intptr_t)SIG_DFL:
+ /*
+ * Don't take default actions on system processes.
+ */
+ if (p->p_pid <= 1) {
+#ifdef DIAGNOSTIC
+ /*
+ * Are you sure you want to ignore SIGSEGV
+ * in init? XXX
+ */
+ printf("Process (pid %lu) got signal %d\n",
+ (u_long)p->p_pid, sig);
+#endif
+ break; /* == ignore */
+ }
+ /*
+ * If there is a pending stop signal to process
+ * with default action, stop here,
+ * then clear the signal. However,
+ * if process is member of an orphaned
+ * process group, ignore tty stop signals.
+ */
+ if (prop & SA_STOP) {
+ if (p->p_flag & (P_TRACED|P_WEXIT) ||
+ (p->p_pgrp->pg_jobc == 0 &&
+ prop & SA_TTYSTOP))
+ break; /* == ignore */
+ mtx_unlock(&ps->ps_mtx);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
+ &p->p_mtx.lock_object, "Catching SIGSTOP");
+ p->p_flag |= P_STOPPED_SIG;
+ p->p_xstat = sig;
+ PROC_SLOCK(p);
+ sig_suspend_threads(td, p, 0);
+ thread_suspend_switch(td);
+ PROC_SUNLOCK(p);
+ mtx_lock(&ps->ps_mtx);
+ break;
+ } else if (prop & SA_IGNORE) {
+ /*
+ * Except for SIGCONT, shouldn't get here.
+ * Default action is to ignore; drop it.
+ */
+ break; /* == ignore */
+ } else
+ return (sig);
+ /*NOTREACHED*/
+
+ case (intptr_t)SIG_IGN:
+ /*
+ * Masking above should prevent us ever trying
+ * to take action on an ignored signal other
+ * than SIGCONT, unless process is traced.
+ */
+ if ((prop & SA_CONT) == 0 &&
+ (p->p_flag & P_TRACED) == 0)
+ printf("issignal\n");
+ break; /* == ignore */
+
+ default:
+ /*
+ * This signal has an action, let
+ * postsig() process it.
+ */
+ return (sig);
+ }
+ sigqueue_delete(&td->td_sigqueue, sig); /* take the signal! */
+ sigqueue_delete(&p->p_sigqueue, sig);
+ }
+ /* NOTREACHED */
+}
+
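+/*
+ * Notify the parent once the last thread of a process stopping on a
+ * signal has suspended; the process spin lock is dropped and
+ * reacquired around the notification.
+ */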
+void
+thread_stopped(struct proc *p)
+{
+ int n;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ n = p->p_suspcount;
+ if (p == curproc)
+ n++;
+ if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) {
+ PROC_SUNLOCK(p);
+ p->p_flag &= ~P_WAITED;
+ PROC_LOCK(p->p_pptr);
+ childproc_stopped(p, (p->p_flag & P_TRACED) ?
+ CLD_TRAPPED : CLD_STOPPED);
+ PROC_UNLOCK(p->p_pptr);
+ PROC_SLOCK(p);
+ }
+}
+
+/*
+ * Take the action for the specified signal
+ * from the current set of pending signals.
+ */
+int
+postsig(int sig)
+{
+ struct thread *td = curthread;
+	struct proc *p = td->td_proc;
+ struct sigacts *ps;
+ sig_t action;
+ ksiginfo_t ksi;
+ sigset_t returnmask, mask;
+
+ KASSERT(sig != 0, ("postsig"));
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ ps = p->p_sigacts;
+ mtx_assert(&ps->ps_mtx, MA_OWNED);
+ ksiginfo_init(&ksi);
+ if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 &&
+ sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0)
+ return (0);
+ ksi.ksi_signo = sig;
+ if (ksi.ksi_code == SI_TIMER)
+ itimer_accept(p, ksi.ksi_timerid, &ksi);
+ action = ps->ps_sigact[_SIG_IDX(sig)];
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_PSIG))
+ ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ?
+ &td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code);
+#endif
+ if (p->p_stops & S_SIG) {
+ mtx_unlock(&ps->ps_mtx);
+ stopevent(p, S_SIG, sig);
+ mtx_lock(&ps->ps_mtx);
+ }
+
+ if (action == SIG_DFL) {
+ /*
+ * Default action, where the default is to kill
+ * the process. (Other cases were ignored above.)
+ */
+ mtx_unlock(&ps->ps_mtx);
+ sigexit(td, sig);
+ /* NOTREACHED */
+ } else {
+ /*
+ * If we get here, the signal must be caught.
+ */
+ KASSERT(action != SIG_IGN && !SIGISMEMBER(td->td_sigmask, sig),
+ ("postsig action"));
+ /*
+ * Set the new mask value and also defer further
+ * occurrences of this signal.
+ *
+ * Special case: user has done a sigsuspend. Here the
+ * current mask is not of interest, but rather the
+ * mask from before the sigsuspend is what we want
+ * restored after the signal processing is completed.
+ */
+ if (td->td_pflags & TDP_OLDMASK) {
+ returnmask = td->td_oldsigmask;
+ td->td_pflags &= ~TDP_OLDMASK;
+ } else
+ returnmask = td->td_sigmask;
+
+ mask = ps->ps_catchmask[_SIG_IDX(sig)];
+ if (!SIGISMEMBER(ps->ps_signodefer, sig))
+ SIGADDSET(mask, sig);
+ kern_sigprocmask(td, SIG_BLOCK, &mask, NULL,
+ SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED);
+
+ if (SIGISMEMBER(ps->ps_sigreset, sig)) {
+ /*
+ * See kern_sigaction() for origin of this code.
+ */
+ SIGDELSET(ps->ps_sigcatch, sig);
+ if (sig != SIGCONT &&
+ sigprop(sig) & SA_IGNORE)
+ SIGADDSET(ps->ps_sigignore, sig);
+ ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
+ }
+ td->td_ru.ru_nsignals++;
+ if (p->p_sig == sig) {
+ p->p_code = 0;
+ p->p_sig = 0;
+ }
+ (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
+ }
+ return (1);
+}
+
+/*
+ * Kill the current process for stated reason.
+ */
+void
+killproc(struct proc *p, char *why)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid,
+ p->p_comm);
+ log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid,
+ p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, why);
+ p->p_flag |= P_WKILLED;
+ kern_psignal(p, SIGKILL);
+}
+
+/*
+ * Force the current process to exit with the specified signal, dumping core
+ * if appropriate. We bypass the normal tests for masked and caught signals,
+ * allowing unrecoverable failures to terminate the process without changing
+ * signal state. Mark the accounting record with the signal termination.
+ * If dumping core, save the signal number for the debugger. Calls exit and
+ * does not return.
+ */
+void
+sigexit(struct thread *td, int sig)
+{
+ struct proc *p = td->td_proc;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ p->p_acflag |= AXSIG;
+ /*
+ * We must be single-threading to generate a core dump. This
+ * ensures that the registers in the core file are up-to-date.
+ * Also, the ELF dump handler assumes that the thread list doesn't
+ * change out from under it.
+ *
+ * XXX If another thread attempts to single-thread before us
+ * (e.g. via fork()), we won't get a dump at all.
+ */
+ if ((sigprop(sig) & SA_CORE) && (thread_single(SINGLE_NO_EXIT) == 0)) {
+ p->p_sig = sig;
+ /*
+ * Log signals which would cause core dumps
+ * (Log as LOG_INFO to appease those who don't want
+ * these messages.)
+ * XXX : Todo, as well as euid, write out ruid too
+ * Note that coredump() drops proc lock.
+ */
+ if (coredump(td) == 0)
+ sig |= WCOREFLAG;
+ if (kern_logsigexit)
+ log(LOG_INFO,
+ "pid %d (%s), uid %d: exited on signal %d%s\n",
+ p->p_pid, p->p_comm,
+ td->td_ucred ? td->td_ucred->cr_uid : -1,
+ sig &~ WCOREFLAG,
+ sig & WCOREFLAG ? " (core dumped)" : "");
+ } else
+ PROC_UNLOCK(p);
+ exit1(td, W_EXITCODE(0, sig));
+ /* NOTREACHED */
+}
+
+/*
+ * Send queued SIGCHLD to parent when child process's state
+ * is changed.
+ */
+static void
+sigparent(struct proc *p, int reason, int status)
+{
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
+
+ if (p->p_ksi != NULL) {
+ p->p_ksi->ksi_signo = SIGCHLD;
+ p->p_ksi->ksi_code = reason;
+ p->p_ksi->ksi_status = status;
+ p->p_ksi->ksi_pid = p->p_pid;
+ p->p_ksi->ksi_uid = p->p_ucred->cr_ruid;
+ if (KSI_ONQ(p->p_ksi))
+ return;
+ }
+ pksignal(p->p_pptr, SIGCHLD, p->p_ksi);
+}
+
+static void
+childproc_jobstate(struct proc *p, int reason, int status)
+{
+ struct sigacts *ps;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
+
+	/*
+	 * Wake up the parent sleeping in kern_wait() and also send
+	 * SIGCHLD to it.  Note that SIGCHLD alone does not guarantee
+	 * that the parent will wake up, because the parent may have
+	 * masked the signal.
+	 */
+ p->p_pptr->p_flag |= P_STATCHILD;
+ wakeup(p->p_pptr);
+
+ ps = p->p_pptr->p_sigacts;
+ mtx_lock(&ps->ps_mtx);
+ if ((ps->ps_flag & PS_NOCLDSTOP) == 0) {
+ mtx_unlock(&ps->ps_mtx);
+ sigparent(p, reason, status);
+ } else
+ mtx_unlock(&ps->ps_mtx);
+}
+
+void
+childproc_stopped(struct proc *p, int reason)
+{
+ childproc_jobstate(p, reason, p->p_xstat);
+}
+
+void
+childproc_continued(struct proc *p)
+{
+ childproc_jobstate(p, CLD_CONTINUED, SIGCONT);
+}
+
+void
+childproc_exited(struct proc *p)
+{
+ int reason;
+ int status = p->p_xstat; /* convert to int */
+
+ reason = CLD_EXITED;
+ if (WCOREDUMP(status))
+ reason = CLD_DUMPED;
+ else if (WIFSIGNALED(status))
+ reason = CLD_KILLED;
+ /*
+ * XXX avoid calling wakeup(p->p_pptr), the work is
+ * done in exit1().
+ */
+ sigparent(p, reason, status);
+}
+
+/*
+ * We only have 1 character for the core count in the format
+ * string, so the range is 0-9.
+ */
+#define MAX_NUM_CORES 10
+static int num_cores = 5;
+
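+/* Sysctl handler for debug.ncores; clamps the value to 0..MAX_NUM_CORES. */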
+static int
+sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ int new_val;
+
+ new_val = num_cores;
+ error = sysctl_handle_int(oidp, &new_val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (new_val > MAX_NUM_CORES)
+ new_val = MAX_NUM_CORES;
+ if (new_val < 0)
+ new_val = 0;
+ num_cores = new_val;
+ return (0);
+}
+SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT|CTLFLAG_RW,
+ 0, sizeof(int), sysctl_debug_num_cores_check, "I", "");
+
+#if defined(COMPRESS_USER_CORES)
+int compress_user_cores = 1;
+SYSCTL_INT(_kern, OID_AUTO, compress_user_cores, CTLFLAG_RW,
+ &compress_user_cores, 0, "Compression of user corefiles");
+
+int compress_user_cores_gzlevel = -1; /* default level */
+SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_gzlevel, CTLFLAG_RW,
+ &compress_user_cores_gzlevel, -1, "Corefile gzip compression level");
+
+#define GZ_SUFFIX ".gz"
+#define GZ_SUFFIX_LEN 3
+#endif
+
+static char corefilename[MAXPATHLEN] = {"%N.core"};
+TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename));
+SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename,
+ sizeof(corefilename), "Process corefile name format string");
+
+/*
+ * corefile_open(comm, uid, pid, td, compress, vpp, namep)
+ * Expand the name described in corefilename, using name, uid, and pid,
+ * and open/create the core file.
+ * corefilename is a printf-like string, with these format specifiers:
+ *	%%	a literal %
+ *	%H	hostname
+ *	%I	autoincrementing index
+ *	%N	name of process ("name")
+ *	%P	process id (pid)
+ *	%U	user id (uid)
+ * For example, "%N.core" is the default; core dumps can be disabled
+ * completely by using "/dev/null", or all core files can be stored in
+ * "/cores/%U/%N-%P".
+ * This is controlled by the sysctl variable kern.corefile (see above).
+ */
+static int
+corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td,
+ int compress, struct vnode **vpp, char **namep)
+{
+ struct nameidata nd;
+ struct sbuf sb;
+ const char *format;
+ char *hostname, *name;
+ int indexpos, i, error, cmode, flags, oflags;
+
+ hostname = NULL;
+ format = corefilename;
+ name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO);
+ indexpos = -1;
+ (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN);
+ for (i = 0; format[i] != '\0'; i++) {
+ switch (format[i]) {
+ case '%': /* Format character */
+ i++;
+ switch (format[i]) {
+ case '%':
+ sbuf_putc(&sb, '%');
+ break;
+ case 'H': /* hostname */
+ if (hostname == NULL) {
+ hostname = malloc(MAXHOSTNAMELEN,
+ M_TEMP, M_WAITOK);
+ }
+ getcredhostname(td->td_ucred, hostname,
+ MAXHOSTNAMELEN);
+ sbuf_printf(&sb, "%s", hostname);
+ break;
+ case 'I': /* autoincrementing index */
+ sbuf_printf(&sb, "0");
+ indexpos = sbuf_len(&sb) - 1;
+ break;
+ case 'N': /* process name */
+ sbuf_printf(&sb, "%s", comm);
+ break;
+ case 'P': /* process id */
+ sbuf_printf(&sb, "%u", pid);
+ break;
+ case 'U': /* user id */
+ sbuf_printf(&sb, "%u", uid);
+ break;
+ default:
+ log(LOG_ERR,
+ "Unknown format character %c in "
+ "corename `%s'\n", format[i], format);
+ break;
+ }
+ break;
+ default:
+ sbuf_putc(&sb, format[i]);
+ break;
+ }
+ }
+ free(hostname, M_TEMP);
+#ifdef COMPRESS_USER_CORES
+ if (compress)
+ sbuf_printf(&sb, GZ_SUFFIX);
+#endif
+ if (sbuf_error(&sb) != 0) {
+ log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too "
+ "long\n", (long)pid, comm, (u_long)uid);
+ sbuf_delete(&sb);
+ free(name, M_TEMP);
+ return (ENOMEM);
+ }
+ sbuf_finish(&sb);
+ sbuf_delete(&sb);
+
+ cmode = S_IRUSR | S_IWUSR;
+ oflags = VN_OPEN_NOAUDIT | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
+
+ /*
+ * If the core format has a %I in it, then we need to check
+ * for existing corefiles before returning a name.
+ * To do this we iterate over 0..num_cores to find a
+ * non-existing core file name to use.
+ */
+ if (indexpos != -1) {
+ for (i = 0; i < num_cores; i++) {
+ flags = O_CREAT | O_EXCL | FWRITE | O_NOFOLLOW;
+ name[indexpos] = '0' + i;
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td);
+ error = vn_open_cred(&nd, &flags, cmode, oflags,
+ td->td_ucred, NULL);
+ if (error) {
+ if (error == EEXIST)
+ continue;
+ log(LOG_ERR,
+ "pid %d (%s), uid (%u): Path `%s' failed "
+ "on initial open test, error = %d\n",
+ pid, comm, uid, name, error);
+ }
+ goto out;
+ }
+ }
+
+ flags = O_CREAT | FWRITE | O_NOFOLLOW;
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td);
+ error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL);
+out:
+ if (error) {
+#ifdef AUDIT
+ audit_proc_coredump(td, name, error);
+#endif
+ free(name, M_TEMP);
+ return (error);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ *vpp = nd.ni_vp;
+ *namep = name;
+ return (0);
+}
+
+/*
+ * Dump a process' core. The main routine does some
+ * policy checking, and creates the name of the coredump;
+ * then it passes on a vnode and a size limit to the process-specific
+ * coredump routine if there is one; if there _is not_ one, it returns
+ * ENOSYS; otherwise it returns the error from the process-specific routine.
+ */
+
+static int
+coredump(struct thread *td)
+{
+ struct proc *p = td->td_proc;
+ struct ucred *cred = td->td_ucred;
+ struct vnode *vp;
+ struct flock lf;
+ struct vattr vattr;
+ int error, error1, locked;
+ struct mount *mp;
+ char *name; /* name of corefile */
+ off_t limit;
+ int compress;
+
+#ifdef COMPRESS_USER_CORES
+ compress = compress_user_cores;
+#else
+ compress = 0;
+#endif
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
+ _STOPEVENT(p, S_CORE, 0);
+
+ if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0)) {
+ PROC_UNLOCK(p);
+ return (EFAULT);
+ }
+
+ /*
+ * Note that the bulk of limit checking is done after
+ * the corefile is created. The exception is if the limit
+ * for corefiles is 0, in which case we don't bother
+ * creating the corefile at all. This layout means that
+ * a corefile is truncated instead of not being created,
+ * if it is larger than the limit.
+ */
+ limit = (off_t)lim_cur(p, RLIMIT_CORE);
+ if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) {
+ PROC_UNLOCK(p);
+ return (EFBIG);
+ }
+ PROC_UNLOCK(p);
+
+restart:
+ error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, compress,
+ &vp, &name);
+ if (error != 0)
+ return (error);
+
+ /* Don't dump to non-regular files or files with links. */
+ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 ||
+ vattr.va_nlink != 1) {
+ VOP_UNLOCK(vp, 0);
+ error = EFAULT;
+ goto close;
+ }
+
+ VOP_UNLOCK(vp, 0);
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_WRLCK;
+ locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
+
+ if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
+ lf.l_type = F_UNLCK;
+ if (locked)
+ VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
+ if ((error = vn_close(vp, FWRITE, cred, td)) != 0)
+ goto out;
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ goto out;
+ free(name, M_TEMP);
+ goto restart;
+ }
+
+ VATTR_NULL(&vattr);
+ vattr.va_size = 0;
+ if (set_core_nodump_flag)
+ vattr.va_flags = UF_NODUMP;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VOP_SETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ PROC_LOCK(p);
+ p->p_acflag |= ACORE;
+ PROC_UNLOCK(p);
+
+ if (p->p_sysent->sv_coredump != NULL) {
+ error = p->p_sysent->sv_coredump(td, vp, limit,
+ compress ? IMGACT_CORE_COMPRESS : 0);
+ } else {
+ error = ENOSYS;
+ }
+
+ if (locked) {
+ lf.l_type = F_UNLCK;
+ VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
+ }
+close:
+ error1 = vn_close(vp, FWRITE, cred, td);
+ if (error == 0)
+ error = error1;
+out:
+#ifdef AUDIT
+ audit_proc_coredump(td, name, error);
+#endif
+ free(name, M_TEMP);
+ return (error);
+}
+
+/*
+ * Nonexistent system call; signal the process (it may want to handle it).
+ * Flag an error in case the process won't see the signal immediately
+ * (blocked or ignored).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nosys_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+nosys(struct thread *td, struct nosys_args *args)
+{
+ struct proc *p = td->td_proc;
+
+ PROC_LOCK(p);
+ tdsignal(td, SIGSYS);
+ PROC_UNLOCK(p);
+ return (ENOSYS);
+}
+
+/*
+ * Send a SIGIO or SIGURG signal to a process or process group using stored
+ * credentials rather than those of the current process.
+ */
+void
+pgsigio(struct sigio **sigiop, int sig, int checkctty)
+{
+ ksiginfo_t ksi;
+ struct sigio *sigio;
+
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = sig;
+ ksi.ksi_code = SI_KERNEL;
+
+ SIGIO_LOCK();
+ sigio = *sigiop;
+ if (sigio == NULL) {
+ SIGIO_UNLOCK();
+ return;
+ }
+ if (sigio->sio_pgid > 0) {
+ PROC_LOCK(sigio->sio_proc);
+ if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred))
+ kern_psignal(sigio->sio_proc, sig);
+ PROC_UNLOCK(sigio->sio_proc);
+ } else if (sigio->sio_pgid < 0) {
+ struct proc *p;
+
+ PGRP_LOCK(sigio->sio_pgrp);
+ LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NORMAL &&
+ CANSIGIO(sigio->sio_ucred, p->p_ucred) &&
+ (checkctty == 0 || (p->p_flag & P_CONTROLT)))
+ kern_psignal(p, sig);
+ PROC_UNLOCK(p);
+ }
+ PGRP_UNLOCK(sigio->sio_pgrp);
+ }
+ SIGIO_UNLOCK();
+}
+
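+/*
+ * Attach a signal knote to the current process's knote list; EV_CLEAR
+ * is forced so the event state is reset after each retrieval.
+ */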
+static int
+filt_sigattach(struct knote *kn)
+{
+ struct proc *p = curproc;
+
+ kn->kn_ptr.p_proc = p;
+ kn->kn_flags |= EV_CLEAR; /* automatically set */
+
+ knlist_add(&p->p_klist, kn, 0);
+
+ return (0);
+}
+
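+/* Detach a signal knote from its process's knote list. */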
+static void
+filt_sigdetach(struct knote *kn)
+{
+ struct proc *p = kn->kn_ptr.p_proc;
+
+ knlist_remove(&p->p_klist, kn, 0);
+}
+
+/*
+ * Signal knotes are shared with proc knotes, so we apply a mask to
+ * the hint in order to differentiate them from process hints. This
+ * could be avoided by using a signal-specific knote list, but probably
+ * isn't worth the trouble.
+ */
+static int
+filt_signal(struct knote *kn, long hint)
+{
+
+ if (hint & NOTE_SIGNAL) {
+ hint &= ~NOTE_SIGNAL;
+
+ if (kn->kn_id == hint)
+ kn->kn_data++;
+ }
+ return (kn->kn_data != 0);
+}
+
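+/*
+ * Reference-counted allocation of the shared signal action state.
+ * sigacts_alloc() returns a zeroed structure holding one reference;
+ * sigacts_hold() and sigacts_free() adjust the count, destroying the
+ * structure when the last reference goes away.
+ */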
+struct sigacts *
+sigacts_alloc(void)
+{
+ struct sigacts *ps;
+
+ ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO);
+ ps->ps_refcnt = 1;
+ mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF);
+ return (ps);
+}
+
+void
+sigacts_free(struct sigacts *ps)
+{
+
+ mtx_lock(&ps->ps_mtx);
+ ps->ps_refcnt--;
+ if (ps->ps_refcnt == 0) {
+ mtx_destroy(&ps->ps_mtx);
+ free(ps, M_SUBPROC);
+ } else
+ mtx_unlock(&ps->ps_mtx);
+}
+
+struct sigacts *
+sigacts_hold(struct sigacts *ps)
+{
+ mtx_lock(&ps->ps_mtx);
+ ps->ps_refcnt++;
+ mtx_unlock(&ps->ps_mtx);
+ return (ps);
+}
+
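+/*
+ * Copy the signal action state from src to dest, excluding everything
+ * from the reference count onward.
+ */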
+void
+sigacts_copy(struct sigacts *dest, struct sigacts *src)
+{
+
+ KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest"));
+ mtx_lock(&src->ps_mtx);
+ bcopy(src, dest, offsetof(struct sigacts, ps_refcnt));
+ mtx_unlock(&src->ps_mtx);
+}
+
+int
+sigacts_shared(struct sigacts *ps)
+{
+ int shared;
+
+ mtx_lock(&ps->ps_mtx);
+ shared = ps->ps_refcnt > 1;
+ mtx_unlock(&ps->ps_mtx);
+ return (shared);
+}
diff --git a/sys/kern/kern_switch.c b/sys/kern/kern_switch.c
new file mode 100644
index 0000000..d0009b1
--- /dev/null
+++ b/sys/kern/kern_switch.c
@@ -0,0 +1,513 @@
+/*-
+ * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_sched.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <machine/cpu.h>
+
+/* Uncomment this to enable logging of critical_enter/exit. */
+#if 0
+#define KTR_CRITICAL KTR_SCHED
+#else
+#define KTR_CRITICAL 0
+#endif
+
+#ifdef FULL_PREEMPTION
+#ifndef PREEMPTION
+#error "The FULL_PREEMPTION option requires the PREEMPTION option"
+#endif
+#endif
+
+CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
+
+/*
+ * kern.sched.preemption allows user space to determine if preemption support
+ * is compiled in or not. It is not currently a boot or runtime flag that
+ * can be changed.
+ */
+#ifdef PREEMPTION
+static int kern_sched_preemption = 1;
+#else
+static int kern_sched_preemption = 0;
+#endif
+SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
+ &kern_sched_preemption, 0, "Kernel preemption enabled");
+
+/*
+ * Support for scheduler stats exported via kern.sched.stats. All stats may
+ * be reset with kern.sched.stats.reset = 1. Stats may be defined elsewhere
+ * with SCHED_STAT_DEFINE().
+ */
+#ifdef SCHED_STATS
+SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats");
+
+/* Switch reasons from mi_switch(). */
+DPCPU_DEFINE(long, sched_switch_stats[SWT_COUNT]);
+SCHED_STAT_DEFINE_VAR(uncategorized,
+ &DPCPU_NAME(sched_switch_stats[SWT_NONE]), "");
+SCHED_STAT_DEFINE_VAR(preempt,
+ &DPCPU_NAME(sched_switch_stats[SWT_PREEMPT]), "");
+SCHED_STAT_DEFINE_VAR(owepreempt,
+ &DPCPU_NAME(sched_switch_stats[SWT_OWEPREEMPT]), "");
+SCHED_STAT_DEFINE_VAR(turnstile,
+ &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), "");
+SCHED_STAT_DEFINE_VAR(sleepq,
+ &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), "");
+SCHED_STAT_DEFINE_VAR(sleepqtimo,
+ &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQTIMO]), "");
+SCHED_STAT_DEFINE_VAR(relinquish,
+ &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), "");
+SCHED_STAT_DEFINE_VAR(needresched,
+ &DPCPU_NAME(sched_switch_stats[SWT_NEEDRESCHED]), "");
+SCHED_STAT_DEFINE_VAR(idle,
+ &DPCPU_NAME(sched_switch_stats[SWT_IDLE]), "");
+SCHED_STAT_DEFINE_VAR(iwait,
+ &DPCPU_NAME(sched_switch_stats[SWT_IWAIT]), "");
+SCHED_STAT_DEFINE_VAR(suspend,
+ &DPCPU_NAME(sched_switch_stats[SWT_SUSPEND]), "");
+SCHED_STAT_DEFINE_VAR(remotepreempt,
+ &DPCPU_NAME(sched_switch_stats[SWT_REMOTEPREEMPT]), "");
+SCHED_STAT_DEFINE_VAR(remotewakeidle,
+ &DPCPU_NAME(sched_switch_stats[SWT_REMOTEWAKEIDLE]), "");
+
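+/*
+ * Sysctl handler for kern.sched.stats.reset; writing a non-zero value
+ * zeroes every per-CPU switch counter registered under the stats node.
+ */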
+static int
+sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
+{
+ struct sysctl_oid *p;
+ uintptr_t counter;
+ int error;
+ int val;
+ int i;
+
+ val = 0;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (val == 0)
+ return (0);
+ /*
+ * Traverse the list of children of _kern_sched_stats and reset each
+ * to 0. Skip the reset entry.
+ */
+ SLIST_FOREACH(p, oidp->oid_parent, oid_link) {
+ if (p == oidp || p->oid_arg1 == NULL)
+ continue;
+ counter = (uintptr_t)p->oid_arg1;
+ CPU_FOREACH(i) {
+ *(long *)(dpcpu_off[i] + counter) = 0;
+ }
+ }
+ return (0);
+}
+
+SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL,
+ 0, sysctl_stats_reset, "I", "Reset scheduler statistics");
+#endif
+
+/************************************************************************
+ * Functions that manipulate runnability from a thread perspective. *
+ ************************************************************************/
+/*
+ * Select the thread that will be run next.
+ */
+struct thread *
+choosethread(void)
+{
+ struct thread *td;
+
+retry:
+ td = sched_choose();
+
+ /*
+ * If we are in panic, only allow system threads,
+ * plus the one we are running in, to be run.
+ */
+ if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
+ (td->td_flags & TDF_INPANIC) == 0)) {
+ /* note that it is no longer on the run queue */
+ TD_SET_CAN_RUN(td);
+ goto retry;
+ }
+
+ TD_SET_RUNNING(td);
+ return (td);
+}
+
+/*
+ * Kernel thread preemption implementation. Critical sections mark
+ * regions of code in which preemptions are not allowed.
+ *
+ * It might seem a good idea to inline critical_enter() but, in order
+ * to prevent instructions reordering by the compiler, a __compiler_membar()
+ * would have to be used here (the same as sched_pin()). The performance
+ * penalty imposed by the membar could, then, produce slower code than
+ * the function call itself, for most cases.
+ */
+void
+critical_enter(void)
+{
+ struct thread *td;
+
+ td = curthread;
+ td->td_critnest++;
+ CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
+ (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
+}
+
+void
+critical_exit(void)
+{
+ struct thread *td;
+ int flags;
+
+ td = curthread;
+ KASSERT(td->td_critnest != 0,
+ ("critical_exit: td_critnest == 0"));
+
+ if (td->td_critnest == 1) {
+ td->td_critnest = 0;
+ if (td->td_owepreempt && !kdb_active) {
+ td->td_critnest = 1;
+ thread_lock(td);
+ td->td_critnest--;
+ flags = SW_INVOL | SW_PREEMPT;
+ if (TD_IS_IDLETHREAD(td))
+ flags |= SWT_IDLE;
+ else
+ flags |= SWT_OWEPREEMPT;
+ mi_switch(flags, NULL);
+ thread_unlock(td);
+ }
+ } else
+ td->td_critnest--;
+
+ CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
+ (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
+}
+
+/************************************************************************
+ * SYSTEM RUN QUEUE manipulations and tests *
+ ************************************************************************/
+/*
+ * Initialize a run structure.
+ */
+void
+runq_init(struct runq *rq)
+{
+ int i;
+
+ bzero(rq, sizeof *rq);
+ for (i = 0; i < RQ_NQS; i++)
+ TAILQ_INIT(&rq->rq_queues[i]);
+}
+
+/*
+ * Clear the status bit of the queue corresponding to priority level pri,
+ * indicating that it is empty.
+ */
+static __inline void
+runq_clrbit(struct runq *rq, int pri)
+{
+ struct rqbits *rqb;
+
+ rqb = &rq->rq_status;
+ CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
+ rqb->rqb_bits[RQB_WORD(pri)],
+ rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
+ RQB_BIT(pri), RQB_WORD(pri));
+ rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
+}
+
+/*
+ * Find the index of the first non-empty run queue. This is done by
+ * scanning the status bits, a set bit indicates a non-empty queue.
+ */
+static __inline int
+runq_findbit(struct runq *rq)
+{
+ struct rqbits *rqb;
+ int pri;
+ int i;
+
+ rqb = &rq->rq_status;
+ for (i = 0; i < RQB_LEN; i++)
+ if (rqb->rqb_bits[i]) {
+ pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
+ CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
+ rqb->rqb_bits[i], i, pri);
+ return (pri);
+ }
+
+ return (-1);
+}
+
+static __inline int
+runq_findbit_from(struct runq *rq, u_char pri)
+{
+ struct rqbits *rqb;
+ rqb_word_t mask;
+ int i;
+
+ /*
+ * Set the mask for the first word so we ignore priorities before 'pri'.
+ */
+ mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1));
+ rqb = &rq->rq_status;
+again:
+ for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) {
+ mask = rqb->rqb_bits[i] & mask;
+ if (mask == 0)
+ continue;
+ pri = RQB_FFS(mask) + (i << RQB_L2BPW);
+ CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
+ mask, i, pri);
+ return (pri);
+ }
+ if (pri == 0)
+ return (-1);
+ /*
+ * Wrap back around to the beginning of the list just once so we
+ * scan the whole thing.
+ */
+ pri = 0;
+ goto again;
+}
+
+/*
+ * Set the status bit of the queue corresponding to priority level pri,
+ * indicating that it is non-empty.
+ */
+static __inline void
+runq_setbit(struct runq *rq, int pri)
+{
+ struct rqbits *rqb;
+
+ rqb = &rq->rq_status;
+ CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
+ rqb->rqb_bits[RQB_WORD(pri)],
+ rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
+ RQB_BIT(pri), RQB_WORD(pri));
+ rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
+}
+
+/*
+ * Add the thread to the queue specified by its priority, and set the
+ * corresponding status bit.
+ */
+void
+runq_add(struct runq *rq, struct thread *td, int flags)
+{
+ struct rqhead *rqh;
+ int pri;
+
+ pri = td->td_priority / RQ_PPQ;
+ td->td_rqindex = pri;
+ runq_setbit(rq, pri);
+ rqh = &rq->rq_queues[pri];
+ CTR4(KTR_RUNQ, "runq_add: td=%p pri=%d %d rqh=%p",
+ td, td->td_priority, pri, rqh);
+ if (flags & SRQ_PREEMPTED) {
+ TAILQ_INSERT_HEAD(rqh, td, td_runq);
+ } else {
+ TAILQ_INSERT_TAIL(rqh, td, td_runq);
+ }
+}
+
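+/*
+ * As runq_add(), but place the thread on the queue at the caller
+ * supplied index instead of one derived from its priority.
+ */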
+void
+runq_add_pri(struct runq *rq, struct thread *td, u_char pri, int flags)
+{
+ struct rqhead *rqh;
+
+ KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
+ td->td_rqindex = pri;
+ runq_setbit(rq, pri);
+ rqh = &rq->rq_queues[pri];
+ CTR4(KTR_RUNQ, "runq_add_pri: td=%p pri=%d idx=%d rqh=%p",
+ td, td->td_priority, pri, rqh);
+ if (flags & SRQ_PREEMPTED) {
+ TAILQ_INSERT_HEAD(rqh, td, td_runq);
+ } else {
+ TAILQ_INSERT_TAIL(rqh, td, td_runq);
+ }
+}
+/*
+ * Return true if there are runnable processes of any priority on the run
+ * queue, false otherwise. Has no side effects, does not modify the run
+ * queue structure.
+ */
+int
+runq_check(struct runq *rq)
+{
+ struct rqbits *rqb;
+ int i;
+
+ rqb = &rq->rq_status;
+ for (i = 0; i < RQB_LEN; i++)
+ if (rqb->rqb_bits[i]) {
+ CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
+ rqb->rqb_bits[i], i);
+ return (1);
+ }
+ CTR0(KTR_RUNQ, "runq_check: empty");
+
+ return (0);
+}
+
+/*
+ * Find the highest priority process on the run queue.
+ */
+struct thread *
+runq_choose_fuzz(struct runq *rq, int fuzz)
+{
+ struct rqhead *rqh;
+ struct thread *td;
+ int pri;
+
+ while ((pri = runq_findbit(rq)) != -1) {
+ rqh = &rq->rq_queues[pri];
+		/* fuzz == 1 is normal; 0 or less are ignored */
+ if (fuzz > 1) {
+ /*
+ * In the first couple of entries, check if
+ * there is one for our CPU as a preference.
+ */
+ int count = fuzz;
+ int cpu = PCPU_GET(cpuid);
+ struct thread *td2;
+ td2 = td = TAILQ_FIRST(rqh);
+
+ while (count-- && td2) {
+ if (td2->td_lastcpu == cpu) {
+ td = td2;
+ break;
+ }
+ td2 = TAILQ_NEXT(td2, td_runq);
+ }
+ } else
+ td = TAILQ_FIRST(rqh);
+ KASSERT(td != NULL, ("runq_choose_fuzz: no proc on busy queue"));
+ CTR3(KTR_RUNQ,
+ "runq_choose_fuzz: pri=%d thread=%p rqh=%p", pri, td, rqh);
+ return (td);
+ }
+ CTR1(KTR_RUNQ, "runq_choose_fuzz: idleproc pri=%d", pri);
+
+ return (NULL);
+}
+
+/*
+ * Find the highest priority process on the run queue.
+ */
+struct thread *
+runq_choose(struct runq *rq)
+{
+ struct rqhead *rqh;
+ struct thread *td;
+ int pri;
+
+ while ((pri = runq_findbit(rq)) != -1) {
+ rqh = &rq->rq_queues[pri];
+ td = TAILQ_FIRST(rqh);
+ KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
+ CTR3(KTR_RUNQ,
+ "runq_choose: pri=%d thread=%p rqh=%p", pri, td, rqh);
+ return (td);
+ }
+ CTR1(KTR_RUNQ, "runq_choose: idlethread pri=%d", pri);
+
+ return (NULL);
+}
+
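+/*
+ * Find the highest priority thread at or after the given queue index,
+ * wrapping around once if needed.
+ */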
+struct thread *
+runq_choose_from(struct runq *rq, u_char idx)
+{
+ struct rqhead *rqh;
+ struct thread *td;
+ int pri;
+
+ if ((pri = runq_findbit_from(rq, idx)) != -1) {
+ rqh = &rq->rq_queues[pri];
+ td = TAILQ_FIRST(rqh);
+ KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
+ CTR4(KTR_RUNQ,
+ "runq_choose_from: pri=%d thread=%p idx=%d rqh=%p",
+ pri, td, td->td_rqindex, rqh);
+ return (td);
+ }
+ CTR1(KTR_RUNQ, "runq_choose_from: idlethread pri=%d", pri);
+
+ return (NULL);
+}
+/*
+ * Remove the thread from the queue specified by its priority, and clear the
+ * corresponding status bit if the queue becomes empty.
+ * Caller must set state afterwards.
+ */
+void
+runq_remove(struct runq *rq, struct thread *td)
+{
+
+ runq_remove_idx(rq, td, NULL);
+}
+
+void
+runq_remove_idx(struct runq *rq, struct thread *td, u_char *idx)
+{
+ struct rqhead *rqh;
+ u_char pri;
+
+ KASSERT(td->td_flags & TDF_INMEM,
+ ("runq_remove_idx: thread swapped out"));
+ pri = td->td_rqindex;
+ KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri));
+ rqh = &rq->rq_queues[pri];
+ CTR4(KTR_RUNQ, "runq_remove_idx: td=%p, pri=%d %d rqh=%p",
+ td, td->td_priority, pri, rqh);
+ TAILQ_REMOVE(rqh, td, td_runq);
+ if (TAILQ_EMPTY(rqh)) {
+ CTR0(KTR_RUNQ, "runq_remove_idx: empty");
+ runq_clrbit(rq, pri);
+ if (idx != NULL && *idx == pri)
+ *idx = (pri + 1) % RQ_NQS;
+ }
+}
diff --git a/sys/kern/kern_sx.c b/sys/kern/kern_sx.c
new file mode 100644
index 0000000..ff5d95d
--- /dev/null
+++ b/sys/kern/kern_sx.c
@@ -0,0 +1,1214 @@
+/*-
+ * Copyright (c) 2007 Attilio Rao <attilio@freebsd.org>
+ * Copyright (c) 2001 Jason Evans <jasone@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice(s), this list of conditions and the following disclaimer as
+ * the first lines of this file unmodified other than the possible
+ * addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice(s), this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+/*
+ * Shared/exclusive locks. This implementation attempts to ensure
+ * deterministic lock granting behavior, so that slocks and xlocks are
+ * interleaved.
+ *
+ * Priority propagation will not generally raise the priority of lock holders,
+ * so should not be relied upon in combination with sx locks.
+ */
+
+#include "opt_ddb.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
+#include "opt_no_adaptive_sx.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kdb.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sleepqueue.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+
+#if defined(SMP) && !defined(NO_ADAPTIVE_SX)
+#include <machine/cpu.h>
+#endif
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#if defined(SMP) && !defined(NO_ADAPTIVE_SX)
+#define ADAPTIVE_SX
+#endif
+
+CTASSERT((SX_NOADAPTIVE & LO_CLASSFLAGS) == SX_NOADAPTIVE);
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+PMC_SOFT_DECLARE( , , lock, failed);
+#endif
+
+/* Handy macros for sleep queues. */
+#define SQ_EXCLUSIVE_QUEUE 0
+#define SQ_SHARED_QUEUE 1
+
+/*
+ * Variations on DROP_GIANT()/PICKUP_GIANT() for use in this file. We
+ * drop Giant anytime we have to sleep or if we adaptively spin.
+ */
+#define GIANT_DECLARE \
+ int _giantcnt = 0; \
+ WITNESS_SAVE_DECL(Giant) \
+
+#define GIANT_SAVE() do { \
+ if (mtx_owned(&Giant)) { \
+ WITNESS_SAVE(&Giant.lock_object, Giant); \
+ while (mtx_owned(&Giant)) { \
+ _giantcnt++; \
+ mtx_unlock(&Giant); \
+ } \
+ } \
+} while (0)
+
+#define GIANT_RESTORE() do { \
+ if (_giantcnt > 0) { \
+ mtx_assert(&Giant, MA_NOTOWNED); \
+ while (_giantcnt--) \
+ mtx_lock(&Giant); \
+ WITNESS_RESTORE(&Giant.lock_object, Giant); \
+ } \
+} while (0)
+
+/*
+ * Returns true if an exclusive lock is recursed. It assumes
+ * curthread currently has an exclusive lock.
+ */
+#define sx_recurse lock_object.lo_data
+#define sx_recursed(sx) ((sx)->sx_recurse != 0)
+
+static void assert_sx(const struct lock_object *lock, int what);
+#ifdef DDB
+static void db_show_sx(const struct lock_object *lock);
+#endif
+static void lock_sx(struct lock_object *lock, int how);
+#ifdef KDTRACE_HOOKS
+static int owner_sx(const struct lock_object *lock, struct thread **owner);
+#endif
+static int unlock_sx(struct lock_object *lock);
+
+struct lock_class lock_class_sx = {
+ .lc_name = "sx",
+ .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE,
+ .lc_assert = assert_sx,
+#ifdef DDB
+ .lc_ddb_show = db_show_sx,
+#endif
+ .lc_lock = lock_sx,
+ .lc_unlock = unlock_sx,
+#ifdef KDTRACE_HOOKS
+ .lc_owner = owner_sx,
+#endif
+};
+
+#ifndef INVARIANTS
+#define _sx_assert(sx, what, file, line)
+#endif
+
+#ifdef ADAPTIVE_SX
+static u_int asx_retries = 10;
+static u_int asx_loops = 10000;
+static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD, NULL, "sxlock debugging");
+SYSCTL_UINT(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, "");
+SYSCTL_UINT(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, "");
+#endif
+
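+/*
+ * Implementations of the lock_class_sx methods declared above
+ * (lc_assert, lc_lock, lc_unlock and, with KDTRACE_HOOKS, lc_owner).
+ */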
+void
+assert_sx(const struct lock_object *lock, int what)
+{
+
+ sx_assert((const struct sx *)lock, what);
+}
+
+void
+lock_sx(struct lock_object *lock, int how)
+{
+ struct sx *sx;
+
+ sx = (struct sx *)lock;
+ if (how)
+ sx_xlock(sx);
+ else
+ sx_slock(sx);
+}
+
+int
+unlock_sx(struct lock_object *lock)
+{
+ struct sx *sx;
+
+ sx = (struct sx *)lock;
+ sx_assert(sx, SA_LOCKED | SA_NOTRECURSED);
+ if (sx_xlocked(sx)) {
+ sx_xunlock(sx);
+ return (1);
+ } else {
+ sx_sunlock(sx);
+ return (0);
+ }
+}
+
+#ifdef KDTRACE_HOOKS
+int
+owner_sx(const struct lock_object *lock, struct thread **owner)
+{
+ const struct sx *sx = (const struct sx *)lock;
+ uintptr_t x = sx->sx_lock;
+
+ *owner = (struct thread *)SX_OWNER(x);
+ return ((x & SX_LOCK_SHARED) != 0 ? (SX_SHARERS(x) != 0) :
+ (*owner != NULL));
+}
+#endif
+
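+/*
+ * Initialize an sx lock from a struct sx_args; intended for use as a
+ * SYSINIT callback.
+ */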
+void
+sx_sysinit(void *arg)
+{
+ struct sx_args *sargs = arg;
+
+ sx_init_flags(sargs->sa_sx, sargs->sa_desc, sargs->sa_flags);
+}
+
+void
+sx_init_flags(struct sx *sx, const char *description, int opts)
+{
+ int flags;
+
+ MPASS((opts & ~(SX_QUIET | SX_RECURSE | SX_NOWITNESS | SX_DUPOK |
+ SX_NOPROFILE | SX_NOADAPTIVE)) == 0);
+ ASSERT_ATOMIC_LOAD_PTR(sx->sx_lock,
+ ("%s: sx_lock not aligned for %s: %p", __func__, description,
+ &sx->sx_lock));
+
+ flags = LO_SLEEPABLE | LO_UPGRADABLE;
+ if (opts & SX_DUPOK)
+ flags |= LO_DUPOK;
+ if (opts & SX_NOPROFILE)
+ flags |= LO_NOPROFILE;
+ if (!(opts & SX_NOWITNESS))
+ flags |= LO_WITNESS;
+ if (opts & SX_RECURSE)
+ flags |= LO_RECURSABLE;
+ if (opts & SX_QUIET)
+ flags |= LO_QUIET;
+
+ flags |= opts & SX_NOADAPTIVE;
+ lock_init(&sx->lock_object, &lock_class_sx, description, NULL, flags);
+ sx->sx_lock = SX_LOCK_UNLOCKED;
+ sx->sx_recurse = 0;
+}
+
+void
+sx_destroy(struct sx *sx)
+{
+
+ KASSERT(sx->sx_lock == SX_LOCK_UNLOCKED, ("sx lock still held"));
+ KASSERT(sx->sx_recurse == 0, ("sx lock still recursed"));
+ sx->sx_lock = SX_LOCK_DESTROYED;
+ lock_destroy(&sx->lock_object);
+}
+
+int
+_sx_slock(struct sx *sx, int opts, const char *file, int line)
+{
+ int error = 0;
+
+ if (SCHEDULER_STOPPED())
+ return (0);
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("sx_slock() by idle thread %p on sx %s @ %s:%d",
+ curthread, sx->lock_object.lo_name, file, line));
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_slock() of destroyed sx @ %s:%d", file, line));
+ WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER, file, line, NULL);
+ error = __sx_slock(sx, opts, file, line);
+ if (!error) {
+ LOCK_LOG_LOCK("SLOCK", &sx->lock_object, 0, 0, file, line);
+ WITNESS_LOCK(&sx->lock_object, 0, file, line);
+ curthread->td_locks++;
+ }
+
+ return (error);
+}
+
+int
+sx_try_slock_(struct sx *sx, const char *file, int line)
+{
+ uintptr_t x;
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("sx_try_slock() by idle thread %p on sx %s @ %s:%d",
+ curthread, sx->lock_object.lo_name, file, line));
+
+ for (;;) {
+ x = sx->sx_lock;
+ KASSERT(x != SX_LOCK_DESTROYED,
+ ("sx_try_slock() of destroyed sx @ %s:%d", file, line));
+ if (!(x & SX_LOCK_SHARED))
+ break;
+ if (atomic_cmpset_acq_ptr(&sx->sx_lock, x, x + SX_ONE_SHARER)) {
+ LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 1, file, line);
+ WITNESS_LOCK(&sx->lock_object, LOP_TRYLOCK, file, line);
+ curthread->td_locks++;
+ return (1);
+ }
+ }
+
+ LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 0, file, line);
+ return (0);
+}
+
+int
+_sx_xlock(struct sx *sx, int opts, const char *file, int line)
+{
+ int error = 0;
+
+ if (SCHEDULER_STOPPED())
+ return (0);
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("sx_xlock() by idle thread %p on sx %s @ %s:%d",
+ curthread, sx->lock_object.lo_name, file, line));
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_xlock() of destroyed sx @ %s:%d", file, line));
+ WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
+ line, NULL);
+ error = __sx_xlock(sx, curthread, opts, file, line);
+ if (!error) {
+ LOCK_LOG_LOCK("XLOCK", &sx->lock_object, 0, sx->sx_recurse,
+ file, line);
+ WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line);
+ curthread->td_locks++;
+ }
+
+ return (error);
+}
+
+int
+sx_try_xlock_(struct sx *sx, const char *file, int line)
+{
+ int rval;
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+ KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
+ ("sx_try_xlock() by idle thread %p on sx %s @ %s:%d",
+ curthread, sx->lock_object.lo_name, file, line));
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_try_xlock() of destroyed sx @ %s:%d", file, line));
+
+ if (sx_xlocked(sx) &&
+ (sx->lock_object.lo_flags & LO_RECURSABLE) != 0) {
+ sx->sx_recurse++;
+ atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
+ rval = 1;
+ } else
+ rval = atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED,
+ (uintptr_t)curthread);
+ LOCK_LOG_TRY("XLOCK", &sx->lock_object, 0, rval, file, line);
+ if (rval) {
+ WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
+ file, line);
+ curthread->td_locks++;
+ }
+
+ return (rval);
+}
+
+void
+_sx_sunlock(struct sx *sx, const char *file, int line)
+{
+
+ if (SCHEDULER_STOPPED())
+ return;
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_sunlock() of destroyed sx @ %s:%d", file, line));
+ _sx_assert(sx, SA_SLOCKED, file, line);
+ WITNESS_UNLOCK(&sx->lock_object, 0, file, line);
+ LOCK_LOG_LOCK("SUNLOCK", &sx->lock_object, 0, 0, file, line);
+ __sx_sunlock(sx, file, line);
+ LOCKSTAT_PROFILE_RELEASE_LOCK(LS_SX_SUNLOCK_RELEASE, sx);
+ curthread->td_locks--;
+}
+
+void
+_sx_xunlock(struct sx *sx, const char *file, int line)
+{
+
+ if (SCHEDULER_STOPPED())
+ return;
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_xunlock() of destroyed sx @ %s:%d", file, line));
+ _sx_assert(sx, SA_XLOCKED, file, line);
+ WITNESS_UNLOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line);
+ LOCK_LOG_LOCK("XUNLOCK", &sx->lock_object, 0, sx->sx_recurse, file,
+ line);
+ if (!sx_recursed(sx))
+ LOCKSTAT_PROFILE_RELEASE_LOCK(LS_SX_XUNLOCK_RELEASE, sx);
+ __sx_xunlock(sx, curthread, file, line);
+ curthread->td_locks--;
+}
+
+/*
+ * Try to do a non-blocking upgrade from a shared lock to an exclusive lock.
+ * This will only succeed if this thread holds a single shared lock.
+ * Return 1 if the upgrade succeeds, 0 otherwise.
+ */
+int
+sx_try_upgrade_(struct sx *sx, const char *file, int line)
+{
+ uintptr_t x;
+ int success;
+
+ if (SCHEDULER_STOPPED())
+ return (1);
+
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_try_upgrade() of destroyed sx @ %s:%d", file, line));
+ _sx_assert(sx, SA_SLOCKED, file, line);
+
+ /*
+ * Try to switch from one shared lock to an exclusive lock. We need
+ * to maintain the SX_LOCK_EXCLUSIVE_WAITERS flag if set so that
+ * we will wake up the exclusive waiters when we drop the lock.
+ */
+ x = sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS;
+ success = atomic_cmpset_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | x,
+ (uintptr_t)curthread | x);
+ LOCK_LOG_TRY("XUPGRADE", &sx->lock_object, 0, success, file, line);
+ if (success) {
+ WITNESS_UPGRADE(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
+ file, line);
+ LOCKSTAT_RECORD0(LS_SX_TRYUPGRADE_UPGRADE, sx);
+ }
+ return (success);
+}
+
+/*
+ * Downgrade an unrecursed exclusive lock into a single shared lock.
+ */
+void
+sx_downgrade_(struct sx *sx, const char *file, int line)
+{
+ uintptr_t x;
+ int wakeup_swapper;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+ ("sx_downgrade() of destroyed sx @ %s:%d", file, line));
+ _sx_assert(sx, SA_XLOCKED | SA_NOTRECURSED, file, line);
+#ifndef INVARIANTS
+ if (sx_recursed(sx))
+ panic("downgrade of a recursed lock");
+#endif
+
+ WITNESS_DOWNGRADE(&sx->lock_object, 0, file, line);
+
+ /*
+ * Try to switch from an exclusive lock with no shared waiters
+ * to one sharer with no shared waiters. If there are
+ * exclusive waiters, we don't need to lock the sleep queue so
+ * long as we preserve the flag. We do one quick try and if
+ * that fails we grab the sleepq lock to keep the flags from
+ * changing and do it the slow way.
+ *
+ * We have to lock the sleep queue if there are shared waiters
+ * so we can wake them up.
+ */
+ x = sx->sx_lock;
+ if (!(x & SX_LOCK_SHARED_WAITERS) &&
+ atomic_cmpset_rel_ptr(&sx->sx_lock, x, SX_SHARERS_LOCK(1) |
+ (x & SX_LOCK_EXCLUSIVE_WAITERS))) {
+ LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line);
+ return;
+ }
+
+ /*
+ * Lock the sleep queue so we can read the waiters bits
+ * without any races and wakeup any shared waiters.
+ */
+ sleepq_lock(&sx->lock_object);
+
+ /*
+ * Preserve SX_LOCK_EXCLUSIVE_WAITERS while downgraded to a single
+ * shared lock. If there are any shared waiters, wake them up.
+ */
+ wakeup_swapper = 0;
+ x = sx->sx_lock;
+ atomic_store_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) |
+ (x & SX_LOCK_EXCLUSIVE_WAITERS));
+ if (x & SX_LOCK_SHARED_WAITERS)
+ wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX,
+ 0, SQ_SHARED_QUEUE);
+ sleepq_release(&sx->lock_object);
+
+ LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line);
+ LOCKSTAT_RECORD0(LS_SX_DOWNGRADE_DOWNGRADE, sx);
+
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+/*
+ * This function represents the so-called 'hard case' for sx_xlock
+ * operation. All 'easy case' failures are redirected to this. Note
+ * that ideally this would be a static function, but it needs to be
+ * accessible from at least sx.h.
+ */
+int
+_sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, const char *file,
+ int line)
+{
+ GIANT_DECLARE;
+#ifdef ADAPTIVE_SX
+ volatile struct thread *owner;
+ u_int i, spintries = 0;
+#endif
+ uintptr_t x;
+#ifdef LOCK_PROFILING
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+ int error = 0;
+#ifdef KDTRACE_HOOKS
+ uint64_t spin_cnt = 0;
+ uint64_t sleep_cnt = 0;
+ int64_t sleep_time = 0;
+#endif
+
+ if (SCHEDULER_STOPPED())
+ return (0);
+
+ /* If we already hold an exclusive lock, then recurse. */
+ if (sx_xlocked(sx)) {
+ KASSERT((sx->lock_object.lo_flags & LO_RECURSABLE) != 0,
+ ("_sx_xlock_hard: recursed on non-recursive sx %s @ %s:%d\n",
+ sx->lock_object.lo_name, file, line));
+ sx->sx_recurse++;
+ atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p recursing", __func__, sx);
+ return (0);
+ }
+
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__,
+ sx->lock_object.lo_name, (void *)sx->sx_lock, file, line);
+
+ while (!atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED, tid)) {
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&sx->lock_object, &contested,
+ &waittime);
+#ifdef ADAPTIVE_SX
+ /*
+ * If the lock is write locked and the owner is
+ * running on another CPU, spin until the owner stops
+ * running or the state of the lock changes.
+ */
+ x = sx->sx_lock;
+ if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) {
+ if ((x & SX_LOCK_SHARED) == 0) {
+ x = SX_OWNER(x);
+ owner = (struct thread *)x;
+ if (TD_IS_RUNNING(owner)) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR3(KTR_LOCK,
+ "%s: spinning on %p held by %p",
+ __func__, sx, owner);
+ GIANT_SAVE();
+ while (SX_OWNER(sx->sx_lock) == x &&
+ TD_IS_RUNNING(owner)) {
+ cpu_spinwait();
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ }
+ continue;
+ }
+ } else if (SX_SHARERS(x) && spintries < asx_retries) {
+ GIANT_SAVE();
+ spintries++;
+ for (i = 0; i < asx_loops; i++) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: shared spinning on %p with %u and %u",
+ __func__, sx, spintries, i);
+ x = sx->sx_lock;
+ if ((x & SX_LOCK_SHARED) == 0 ||
+ SX_SHARERS(x) == 0)
+ break;
+ cpu_spinwait();
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ }
+ if (i != asx_loops)
+ continue;
+ }
+ }
+#endif
+
+ sleepq_lock(&sx->lock_object);
+ x = sx->sx_lock;
+
+ /*
+ * If the lock was released while spinning on the
+ * sleep queue chain lock, try again.
+ */
+ if (x == SX_LOCK_UNLOCKED) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+
+#ifdef ADAPTIVE_SX
+ /*
+ * The current lock owner might have started executing
+ * on another CPU (or the lock could have changed
+ * owners) while we were waiting on the sleep queue
+ * chain lock. If so, drop the sleep queue lock and try
+ * again.
+ */
+ if (!(x & SX_LOCK_SHARED) &&
+ (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) {
+ owner = (struct thread *)SX_OWNER(x);
+ if (TD_IS_RUNNING(owner)) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+ }
+#endif
+
+ /*
+ * If an exclusive lock was released with both shared
+ * and exclusive waiters and a shared waiter hasn't
+ * woken up and acquired the lock yet, sx_lock will be
+ * set to SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS.
+ * If we see that value, try to acquire it once. Note
+ * that we have to preserve SX_LOCK_EXCLUSIVE_WAITERS
+ * as there are other exclusive waiters still. If we
+ * fail, restart the loop.
+ */
+ if (x == (SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS)) {
+ if (atomic_cmpset_acq_ptr(&sx->sx_lock,
+ SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS,
+ tid | SX_LOCK_EXCLUSIVE_WAITERS)) {
+ sleepq_release(&sx->lock_object);
+ CTR2(KTR_LOCK, "%s: %p claimed by new writer",
+ __func__, sx);
+ break;
+ }
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+
+ /*
+ * Try to set the SX_LOCK_EXCLUSIVE_WAITERS flag. If we fail,
+ * then loop back and retry.
+ */
+ if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) {
+ if (!atomic_cmpset_ptr(&sx->sx_lock, x,
+ x | SX_LOCK_EXCLUSIVE_WAITERS)) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p set excl waiters flag",
+ __func__, sx);
+ }
+
+ /*
+ * Since we have been unable to acquire the exclusive
+ * lock and the exclusive waiters flag is set, we have
+ * to sleep.
+ */
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p blocking on sleep queue",
+ __func__, sx);
+
+#ifdef KDTRACE_HOOKS
+ sleep_time -= lockstat_nsecs();
+#endif
+ GIANT_SAVE();
+ sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name,
+ SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
+ SLEEPQ_INTERRUPTIBLE : 0), SQ_EXCLUSIVE_QUEUE);
+ if (!(opts & SX_INTERRUPTIBLE))
+ sleepq_wait(&sx->lock_object, 0);
+ else
+ error = sleepq_wait_sig(&sx->lock_object, 0);
+#ifdef KDTRACE_HOOKS
+ sleep_time += lockstat_nsecs();
+ sleep_cnt++;
+#endif
+ if (error) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK,
+ "%s: interruptible sleep by %p suspended by signal",
+ __func__, sx);
+ break;
+ }
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p resuming from sleep queue",
+ __func__, sx);
+ }
+
+ GIANT_RESTORE();
+ if (!error)
+ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_SX_XLOCK_ACQUIRE, sx,
+ contested, waittime, file, line);
+#ifdef KDTRACE_HOOKS
+ if (sleep_time)
+ LOCKSTAT_RECORD1(LS_SX_XLOCK_BLOCK, sx, sleep_time);
+ if (spin_cnt > sleep_cnt)
+ LOCKSTAT_RECORD1(LS_SX_XLOCK_SPIN, sx, (spin_cnt - sleep_cnt));
+#endif
+ return (error);
+}
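/*
 * Illustrative sketch, not part of this change: how a consumer reaches the
 * exclusive-lock paths above through the public sx(9) wrappers.  The
 * "foo" lock, counter and lock name are hypothetical.
 */
static struct sx foo_lock;
static int foo_count;

static void
foo_init(void)
{

	sx_init(&foo_lock, "foo lock");	/* not recursable; SX_RECURSE not set */
}

static void
foo_bump(void)
{

	sx_xlock(&foo_lock);		/* contended case falls into _sx_xlock_hard() */
	foo_count++;
	sx_xunlock(&foo_lock);		/* waiters present: _sx_xunlock_hard() */
}

static int
foo_bump_sig(void)
{
	int error;

	/* Interruptible variant; maps to opts = SX_INTERRUPTIBLE above. */
	error = sx_xlock_sig(&foo_lock);
	if (error != 0)
		return (error);		/* EINTR/ERESTART from sleepq_wait_sig() */
	foo_count++;
	sx_xunlock(&foo_lock);
	return (0);
}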
+
+/*
+ * This function represents the so-called 'hard case' for sx_xunlock
+ * operation. All 'easy case' failures are redirected to this. Note
+ * that ideally this would be a static function, but it needs to be
+ * accessible from at least sx.h.
+ */
+void
+_sx_xunlock_hard(struct sx *sx, uintptr_t tid, const char *file, int line)
+{
+ uintptr_t x;
+ int queue, wakeup_swapper;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ MPASS(!(sx->sx_lock & SX_LOCK_SHARED));
+
+ /* If the lock is recursed, then unrecurse one level. */
+ if (sx_xlocked(sx) && sx_recursed(sx)) {
+ if ((--sx->sx_recurse) == 0)
+ atomic_clear_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, sx);
+ return;
+ }
+ MPASS(sx->sx_lock & (SX_LOCK_SHARED_WAITERS |
+ SX_LOCK_EXCLUSIVE_WAITERS));
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p contested", __func__, sx);
+
+ sleepq_lock(&sx->lock_object);
+ x = SX_LOCK_UNLOCKED;
+
+ /*
+ * The wake up algorithm here is quite simple and probably not
+ * ideal. It gives precedence to shared waiters if they are
+ * present. For this condition, we have to preserve the
+ * state of the exclusive waiters flag.
+ * If interruptible sleeps left the shared queue empty, avoid
+ * starving the threads sleeping on the exclusive queue by giving
+ * them precedence and clearing the shared waiters bit anyway.
+ */
+ if ((sx->sx_lock & SX_LOCK_SHARED_WAITERS) != 0 &&
+ sleepq_sleepcnt(&sx->lock_object, SQ_SHARED_QUEUE) != 0) {
+ queue = SQ_SHARED_QUEUE;
+ x |= (sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS);
+ } else
+ queue = SQ_EXCLUSIVE_QUEUE;
+
+ /* Wake up all the waiters for the specific queue. */
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR3(KTR_LOCK, "%s: %p waking up all threads on %s queue",
+ __func__, sx, queue == SQ_SHARED_QUEUE ? "shared" :
+ "exclusive");
+ atomic_store_rel_ptr(&sx->sx_lock, x);
+ wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0,
+ queue);
+ sleepq_release(&sx->lock_object);
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+/*
+ * This function represents the so-called 'hard case' for sx_slock
+ * operation. All 'easy case' failures are redirected to this. Note
+ * that ideally this would be a static function, but it needs to be
+ * accessible from at least sx.h.
+ */
+int
+_sx_slock_hard(struct sx *sx, int opts, const char *file, int line)
+{
+ GIANT_DECLARE;
+#ifdef ADAPTIVE_SX
+ volatile struct thread *owner;
+#endif
+#ifdef LOCK_PROFILING
+ uint64_t waittime = 0;
+ int contested = 0;
+#endif
+ uintptr_t x;
+ int error = 0;
+#ifdef KDTRACE_HOOKS
+ uint64_t spin_cnt = 0;
+ uint64_t sleep_cnt = 0;
+ int64_t sleep_time = 0;
+#endif
+
+ if (SCHEDULER_STOPPED())
+ return (0);
+
+ /*
+ * As with rwlocks, we don't make any attempt to try to block
+ * shared locks once there is an exclusive waiter.
+ */
+ for (;;) {
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ x = sx->sx_lock;
+
+ /*
+ * If no other thread has an exclusive lock then try to bump up
+ * the count of sharers. Since we have to preserve the state
+ * of SX_LOCK_EXCLUSIVE_WAITERS, if we fail to acquire the
+ * shared lock, loop back and retry.
+ */
+ if (x & SX_LOCK_SHARED) {
+ MPASS(!(x & SX_LOCK_SHARED_WAITERS));
+ if (atomic_cmpset_acq_ptr(&sx->sx_lock, x,
+ x + SX_ONE_SHARER)) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: %p succeed %p -> %p", __func__,
+ sx, (void *)x,
+ (void *)(x + SX_ONE_SHARER));
+ break;
+ }
+ continue;
+ }
+#ifdef HWPMC_HOOKS
+ PMC_SOFT_CALL( , , lock, failed);
+#endif
+ lock_profile_obtain_lock_failed(&sx->lock_object, &contested,
+ &waittime);
+
+#ifdef ADAPTIVE_SX
+ /*
+ * If the owner is running on another CPU, spin until
+ * the owner stops running or the state of the lock
+ * changes.
+ */
+ if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) {
+ x = SX_OWNER(x);
+ owner = (struct thread *)x;
+ if (TD_IS_RUNNING(owner)) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR3(KTR_LOCK,
+ "%s: spinning on %p held by %p",
+ __func__, sx, owner);
+ GIANT_SAVE();
+ while (SX_OWNER(sx->sx_lock) == x &&
+ TD_IS_RUNNING(owner)) {
+#ifdef KDTRACE_HOOKS
+ spin_cnt++;
+#endif
+ cpu_spinwait();
+ }
+ continue;
+ }
+ }
+#endif
+
+ /*
+ * Some other thread already has an exclusive lock, so
+ * start the process of blocking.
+ */
+ sleepq_lock(&sx->lock_object);
+ x = sx->sx_lock;
+
+ /*
+ * The lock could have been released while we spun.
+ * In this case loop back and retry.
+ */
+ if (x & SX_LOCK_SHARED) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+
+#ifdef ADAPTIVE_SX
+ /*
+ * If the owner is running on another CPU, spin until
+ * the owner stops running or the state of the lock
+ * changes.
+ */
+ if (!(x & SX_LOCK_SHARED) &&
+ (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) {
+ owner = (struct thread *)SX_OWNER(x);
+ if (TD_IS_RUNNING(owner)) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+ }
+#endif
+
+ /*
+ * Try to set the SX_LOCK_SHARED_WAITERS flag. If we
+ * fail to set it drop the sleep queue lock and loop
+ * back.
+ */
+ if (!(x & SX_LOCK_SHARED_WAITERS)) {
+ if (!atomic_cmpset_ptr(&sx->sx_lock, x,
+ x | SX_LOCK_SHARED_WAITERS)) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p set shared waiters flag",
+ __func__, sx);
+ }
+
+ /*
+ * Since we have been unable to acquire the shared lock,
+ * we have to sleep.
+ */
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p blocking on sleep queue",
+ __func__, sx);
+
+#ifdef KDTRACE_HOOKS
+ sleep_time -= lockstat_nsecs();
+#endif
+ GIANT_SAVE();
+ sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name,
+ SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
+ SLEEPQ_INTERRUPTIBLE : 0), SQ_SHARED_QUEUE);
+ if (!(opts & SX_INTERRUPTIBLE))
+ sleepq_wait(&sx->lock_object, 0);
+ else
+ error = sleepq_wait_sig(&sx->lock_object, 0);
+#ifdef KDTRACE_HOOKS
+ sleep_time += lockstat_nsecs();
+ sleep_cnt++;
+#endif
+ if (error) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK,
+ "%s: interruptible sleep by %p suspended by signal",
+ __func__, sx);
+ break;
+ }
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p resuming from sleep queue",
+ __func__, sx);
+ }
+ if (error == 0)
+ LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_SX_SLOCK_ACQUIRE, sx,
+ contested, waittime, file, line);
+#ifdef KDTRACE_HOOKS
+ if (sleep_time)
+ LOCKSTAT_RECORD1(LS_SX_SLOCK_BLOCK, sx, sleep_time);
+ if (spin_cnt > sleep_cnt)
+ LOCKSTAT_RECORD1(LS_SX_SLOCK_SPIN, sx, (spin_cnt - sleep_cnt));
+#endif
+ GIANT_RESTORE();
+ return (error);
+}
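/*
 * Illustrative sketch, not part of this change: the reader side of the same
 * hypothetical "foo" data, exercising the shared paths above.
 */
static int
foo_read(void)
{
	int v;

	sx_slock(&foo_lock);		/* contended case falls into _sx_slock_hard() */
	v = foo_count;
	sx_sunlock(&foo_lock);		/* last sharer with waiters: _sx_sunlock_hard() */
	return (v);
}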
+
+/*
+ * This function represents the so-called 'hard case' for sx_sunlock
+ * operation. All 'easy case' failures are redirected to this. Note
+ * that ideally this would be a static function, but it needs to be
+ * accessible from at least sx.h.
+ */
+void
+_sx_sunlock_hard(struct sx *sx, const char *file, int line)
+{
+ uintptr_t x;
+ int wakeup_swapper;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ for (;;) {
+ x = sx->sx_lock;
+
+ /*
+ * We should never have waiting sharers while at least one
+ * thread holds a shared lock, since new readers can always
+ * join an already share-locked lock.
+ */
+ KASSERT(!(x & SX_LOCK_SHARED_WAITERS),
+ ("%s: waiting sharers", __func__));
+
+ /*
+ * See if there is more than one shared lock held. If
+ * so, just drop one and return.
+ */
+ if (SX_SHARERS(x) > 1) {
+ if (atomic_cmpset_rel_ptr(&sx->sx_lock, x,
+ x - SX_ONE_SHARER)) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR4(KTR_LOCK,
+ "%s: %p succeeded %p -> %p",
+ __func__, sx, (void *)x,
+ (void *)(x - SX_ONE_SHARER));
+ break;
+ }
+ continue;
+ }
+
+ /*
+ * If there aren't any waiters for an exclusive lock,
+ * then try to drop it quickly.
+ */
+ if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) {
+ MPASS(x == SX_SHARERS_LOCK(1));
+ if (atomic_cmpset_rel_ptr(&sx->sx_lock,
+ SX_SHARERS_LOCK(1), SX_LOCK_UNLOCKED)) {
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p last succeeded",
+ __func__, sx);
+ break;
+ }
+ continue;
+ }
+
+ /*
+ * At this point, there should just be one sharer with
+ * exclusive waiters.
+ */
+ MPASS(x == (SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS));
+
+ sleepq_lock(&sx->lock_object);
+
+ /*
+ * The wakeup semantic here is quite simple:
+ * just wake up all the exclusive waiters.
+ * Note that the state of the lock could have changed,
+ * so if the cmpset fails, loop back and retry.
+ */
+ if (!atomic_cmpset_rel_ptr(&sx->sx_lock,
+ SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS,
+ SX_LOCK_UNLOCKED)) {
+ sleepq_release(&sx->lock_object);
+ continue;
+ }
+ if (LOCK_LOG_TEST(&sx->lock_object, 0))
+ CTR2(KTR_LOCK, "%s: %p waking up all thread on"
+ "exclusive queue", __func__, sx);
+ wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX,
+ 0, SQ_EXCLUSIVE_QUEUE);
+ sleepq_release(&sx->lock_object);
+ if (wakeup_swapper)
+ kick_proc0();
+ break;
+ }
+}
+
+#ifdef INVARIANT_SUPPORT
+#ifndef INVARIANTS
+#undef _sx_assert
+#endif
+
+/*
+ * In the non-WITNESS case, sx_assert() can only detect that at least
+ * *some* thread owns an slock, but it cannot guarantee that *this*
+ * thread owns an slock.
+ */
+void
+_sx_assert(const struct sx *sx, int what, const char *file, int line)
+{
+#ifndef WITNESS
+ int slocked = 0;
+#endif
+
+ if (panicstr != NULL)
+ return;
+ switch (what) {
+ case SA_SLOCKED:
+ case SA_SLOCKED | SA_NOTRECURSED:
+ case SA_SLOCKED | SA_RECURSED:
+#ifndef WITNESS
+ slocked = 1;
+ /* FALLTHROUGH */
+#endif
+ case SA_LOCKED:
+ case SA_LOCKED | SA_NOTRECURSED:
+ case SA_LOCKED | SA_RECURSED:
+#ifdef WITNESS
+ witness_assert(&sx->lock_object, what, file, line);
+#else
+ /*
+ * If some other thread has an exclusive lock or we
+ * have one and are asserting a shared lock, fail.
+ * Also, if no one has a lock at all, fail.
+ */
+ if (sx->sx_lock == SX_LOCK_UNLOCKED ||
+ (!(sx->sx_lock & SX_LOCK_SHARED) && (slocked ||
+ sx_xholder(sx) != curthread)))
+ panic("Lock %s not %slocked @ %s:%d\n",
+ sx->lock_object.lo_name, slocked ? "share " : "",
+ file, line);
+
+ if (!(sx->sx_lock & SX_LOCK_SHARED)) {
+ if (sx_recursed(sx)) {
+ if (what & SA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ sx->lock_object.lo_name, file,
+ line);
+ } else if (what & SA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ sx->lock_object.lo_name, file, line);
+ }
+#endif
+ break;
+ case SA_XLOCKED:
+ case SA_XLOCKED | SA_NOTRECURSED:
+ case SA_XLOCKED | SA_RECURSED:
+ if (sx_xholder(sx) != curthread)
+ panic("Lock %s not exclusively locked @ %s:%d\n",
+ sx->lock_object.lo_name, file, line);
+ if (sx_recursed(sx)) {
+ if (what & SA_NOTRECURSED)
+ panic("Lock %s recursed @ %s:%d\n",
+ sx->lock_object.lo_name, file, line);
+ } else if (what & SA_RECURSED)
+ panic("Lock %s not recursed @ %s:%d\n",
+ sx->lock_object.lo_name, file, line);
+ break;
+ case SA_UNLOCKED:
+#ifdef WITNESS
+ witness_assert(&sx->lock_object, what, file, line);
+#else
+ /*
+ * If we hold an exclusive lock, fail. We can't reliably
+ * check whether we hold a shared lock or not.
+ */
+ if (sx_xholder(sx) == curthread)
+ panic("Lock %s exclusively locked @ %s:%d\n",
+ sx->lock_object.lo_name, file, line);
+#endif
+ break;
+ default:
+ panic("Unknown sx lock assertion: %d @ %s:%d", what, file,
+ line);
+ }
+}
+#endif /* INVARIANT_SUPPORT */
+
+#ifdef DDB
+static void
+db_show_sx(const struct lock_object *lock)
+{
+ struct thread *td;
+ const struct sx *sx;
+
+ sx = (const struct sx *)lock;
+
+ db_printf(" state: ");
+ if (sx->sx_lock == SX_LOCK_UNLOCKED)
+ db_printf("UNLOCKED\n");
+ else if (sx->sx_lock == SX_LOCK_DESTROYED) {
+ db_printf("DESTROYED\n");
+ return;
+ } else if (sx->sx_lock & SX_LOCK_SHARED)
+ db_printf("SLOCK: %ju\n", (uintmax_t)SX_SHARERS(sx->sx_lock));
+ else {
+ td = sx_xholder(sx);
+ db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
+ td->td_tid, td->td_proc->p_pid, td->td_name);
+ if (sx_recursed(sx))
+ db_printf(" recursed: %d\n", sx->sx_recurse);
+ }
+
+ db_printf(" waiters: ");
+ switch(sx->sx_lock &
+ (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)) {
+ case SX_LOCK_SHARED_WAITERS:
+ db_printf("shared\n");
+ break;
+ case SX_LOCK_EXCLUSIVE_WAITERS:
+ db_printf("exclusive\n");
+ break;
+ case SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS:
+ db_printf("exclusive and shared\n");
+ break;
+ default:
+ db_printf("none\n");
+ }
+}
+
+/*
+ * Check to see if a thread that is blocked on a sleep queue is actually
+ * blocked on an sx lock. If so, output some details and return true.
+ * If the lock has an exclusive owner, return that in *ownerp.
+ */
+int
+sx_chain(struct thread *td, struct thread **ownerp)
+{
+ struct sx *sx;
+
+ /*
+ * Check to see if this thread is blocked on an sx lock.
+ * First, we check the lock class. If that is ok, then we
+ * compare the lock name against the wait message.
+ */
+ sx = td->td_wchan;
+ if (LOCK_CLASS(&sx->lock_object) != &lock_class_sx ||
+ sx->lock_object.lo_name != td->td_wmesg)
+ return (0);
+
+ /* We think we have an sx lock, so output some details. */
+ db_printf("blocked on sx \"%s\" ", td->td_wmesg);
+ *ownerp = sx_xholder(sx);
+ if (sx->sx_lock & SX_LOCK_SHARED)
+ db_printf("SLOCK (count %ju)\n",
+ (uintmax_t)SX_SHARERS(sx->sx_lock));
+ else
+ db_printf("XLOCK\n");
+ return (1);
+}
+#endif
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
new file mode 100644
index 0000000..b0e1908
--- /dev/null
+++ b/sys/kern/kern_synch.c
@@ -0,0 +1,632 @@
+/*-
+ * Copyright (c) 1982, 1986, 1990, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+#include "opt_sched.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/condvar.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/signalvar.h>
+#include <sys/sleepqueue.h>
+#include <sys/smp.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/vmmeter.h>
+#ifdef KTRACE
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+#endif
+
+#include <machine/cpu.h>
+
+#ifdef XEN
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#endif
+
+#define KTDSTATE(td) \
+ (((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" : \
+ ((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \
+ ((td)->td_inhibitors & TDI_SWAPPED) != 0 ? "swapped" : \
+ ((td)->td_inhibitors & TDI_LOCK) != 0 ? "blocked" : \
+ ((td)->td_inhibitors & TDI_IWAIT) != 0 ? "iwait" : "yielding")
+
+static void synch_setup(void *dummy);
+SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup,
+ NULL);
+
+int hogticks;
+static uint8_t pause_wchan[MAXCPU];
+
+static struct callout loadav_callout;
+
+struct loadavg averunnable =
+ { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */
+/*
+ * Constants for averages over 1, 5, and 15 minutes
+ * when sampling at 5 second intervals.
+ */
+static fixpt_t cexp[3] = {
+ 0.9200444146293232 * FSCALE, /* exp(-1/12) */
+ 0.9834714538216174 * FSCALE, /* exp(-1/60) */
+ 0.9944598480048967 * FSCALE, /* exp(-1/180) */
+};
+
+/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
+static int fscale __unused = FSCALE;
+SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
+
+static void loadav(void *arg);
+
+SDT_PROVIDER_DECLARE(sched);
+SDT_PROBE_DEFINE(sched, , , preempt, preempt);
+
+/*
+ * These probes reference Solaris features that are not implemented in FreeBSD.
+ * Create the probes anyway for compatibility with existing D scripts; they'll
+ * just never fire.
+ */
+SDT_PROBE_DEFINE(sched, , , cpucaps_sleep, cpucaps-sleep);
+SDT_PROBE_DEFINE(sched, , , cpucaps_wakeup, cpucaps-wakeup);
+SDT_PROBE_DEFINE(sched, , , schedctl_nopreempt, schedctl-nopreempt);
+SDT_PROBE_DEFINE(sched, , , schedctl_preempt, schedctl-preempt);
+SDT_PROBE_DEFINE(sched, , , schedctl_yield, schedctl-yield);
+
+static void
+sleepinit(void *unused)
+{
+
+ hogticks = (hz / 10) * 2; /* Default only. */
+ init_sleepqueues();
+}
+
+/*
+ * vmem tries to lock the sleepq mutexes when freeing kva, so make sure
+ * the sleep queues are available by then.
+ */
+SYSINIT(sleepinit, SI_SUB_KMEM, SI_ORDER_ANY, sleepinit, 0);
+
+/*
+ * General sleep call. Suspends the current thread until a wakeup is
+ * performed on the specified identifier. The thread will then be made
+ * runnable with the specified priority. Sleeps at most sbt units of time
+ * (0 means no timeout). If pri includes the PCATCH flag, let signals
+ * interrupt the sleep, otherwise ignore them while sleeping. Returns 0 if
+ * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
+ * signal becomes pending, ERESTART is returned if the current system
+ * call should be restarted if possible, and EINTR is returned if the
+ * system call should be interrupted by the signal.
+ *
+ * The lock argument is unlocked before the caller is suspended, and
+ * re-locked before _sleep() returns. If priority includes the PDROP
+ * flag the lock is not re-locked before returning.
+ */
+int
+_sleep(void *ident, struct lock_object *lock, int priority,
+ const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
+{
+ struct thread *td;
+ struct proc *p;
+ struct lock_class *class;
+ int catch, lock_state, pri, rval, sleepq_flags;
+ WITNESS_SAVE_DECL(lock_witness);
+
+ td = curthread;
+ p = td->td_proc;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 0, wmesg);
+#endif
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
+ "Sleeping on \"%s\"", wmesg);
+ KASSERT(sbt != 0 || mtx_owned(&Giant) || lock != NULL,
+ ("sleeping without a lock"));
+ KASSERT(p != NULL, ("msleep1"));
+ KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));
+ if (priority & PDROP)
+ KASSERT(lock != NULL && lock != &Giant.lock_object,
+ ("PDROP requires a non-Giant lock"));
+ if (lock != NULL)
+ class = LOCK_CLASS(lock);
+ else
+ class = NULL;
+
+ if (cold || SCHEDULER_STOPPED()) {
+ /*
+ * During autoconfiguration, just return;
+ * don't run any other threads or panic below,
+ * in case this is the idle thread and already asleep.
+ * XXX: this used to do "s = splhigh(); splx(safepri);
+ * splx(s);" to give interrupts a chance, but there is
+ * no way to give interrupts a chance now.
+ */
+ if (lock != NULL && priority & PDROP)
+ class->lc_unlock(lock);
+ return (0);
+ }
+ catch = priority & PCATCH;
+ pri = priority & PRIMASK;
+
+ /*
+ * If we are already on a sleep queue, then remove us from that
+ * sleep queue first. We have to do this to handle recursive
+ * sleeps.
+ */
+ if (TD_ON_SLEEPQ(td))
+ sleepq_remove(td, td->td_wchan);
+
+ if ((uint8_t *)ident >= &pause_wchan[0] &&
+ (uint8_t *)ident <= &pause_wchan[MAXCPU - 1])
+ sleepq_flags = SLEEPQ_PAUSE;
+ else
+ sleepq_flags = SLEEPQ_SLEEP;
+ if (catch)
+ sleepq_flags |= SLEEPQ_INTERRUPTIBLE;
+
+ sleepq_lock(ident);
+ CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)",
+ td->td_tid, p->p_pid, td->td_name, wmesg, ident);
+
+ if (lock == &Giant.lock_object)
+ mtx_assert(&Giant, MA_OWNED);
+ DROP_GIANT();
+ if (lock != NULL && lock != &Giant.lock_object &&
+ !(class->lc_flags & LC_SLEEPABLE)) {
+ WITNESS_SAVE(lock, lock_witness);
+ lock_state = class->lc_unlock(lock);
+ } else
+ /* GCC needs to follow the Yellow Brick Road */
+ lock_state = -1;
+
+ /*
+ * We put ourselves on the sleep queue and start our timeout
+ * before calling thread_suspend_check, as we could stop there,
+ * and a wakeup or a SIGCONT (or both) could occur while we were
+ * stopped without resuming us. Thus, we must be ready for sleep
+ * when cursig() is called. If the wakeup happens while we're
+ * stopped, then td will no longer be on a sleep queue upon
+ * return from cursig().
+ */
+ sleepq_add(ident, lock, wmesg, sleepq_flags, 0);
+ if (sbt != 0)
+ sleepq_set_timeout_sbt(ident, sbt, pr, flags);
+ if (lock != NULL && class->lc_flags & LC_SLEEPABLE) {
+ sleepq_release(ident);
+ WITNESS_SAVE(lock, lock_witness);
+ lock_state = class->lc_unlock(lock);
+ sleepq_lock(ident);
+ }
+ if (sbt != 0 && catch)
+ rval = sleepq_timedwait_sig(ident, pri);
+ else if (sbt != 0)
+ rval = sleepq_timedwait(ident, pri);
+ else if (catch)
+ rval = sleepq_wait_sig(ident, pri);
+ else {
+ sleepq_wait(ident, pri);
+ rval = 0;
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0, wmesg);
+#endif
+ PICKUP_GIANT();
+ if (lock != NULL && lock != &Giant.lock_object && !(priority & PDROP)) {
+ class->lc_lock(lock, lock_state);
+ WITNESS_RESTORE(lock, lock_witness);
+ }
+ return (rval);
+}
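/*
 * Illustrative sketch, not part of this change: the usual condition-wait
 * pattern built on _sleep() via the msleep() wrapper, paired with wakeup().
 * "bar_mtx", "bar_ready" and the wait message are hypothetical.
 */
static struct mtx bar_mtx;
static int bar_ready;

static int
bar_wait(void)
{
	int error;

	mtx_lock(&bar_mtx);
	while (!bar_ready) {
		/* The interlock is dropped while asleep and re-taken here. */
		error = msleep(&bar_ready, &bar_mtx, PCATCH, "barwt", 0);
		if (error != 0) {
			mtx_unlock(&bar_mtx);
			return (error);	/* EINTR/ERESTART from a signal */
		}
	}
	mtx_unlock(&bar_mtx);
	return (0);
}

static void
bar_post(void)
{

	mtx_lock(&bar_mtx);
	bar_ready = 1;
	wakeup(&bar_ready);
	mtx_unlock(&bar_mtx);
}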
+
+int
+msleep_spin_sbt(void *ident, struct mtx *mtx, const char *wmesg,
+ sbintime_t sbt, sbintime_t pr, int flags)
+{
+ struct thread *td;
+ struct proc *p;
+ int rval;
+ WITNESS_SAVE_DECL(mtx);
+
+ td = curthread;
+ p = td->td_proc;
+ KASSERT(mtx != NULL, ("sleeping without a mutex"));
+ KASSERT(p != NULL, ("msleep1"));
+ KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));
+
+ if (cold || SCHEDULER_STOPPED()) {
+ /*
+ * During autoconfiguration, just return;
+ * don't run any other threads or panic below,
+ * in case this is the idle thread and already asleep.
+ * XXX: this used to do "s = splhigh(); splx(safepri);
+ * splx(s);" to give interrupts a chance, but there is
+ * no way to give interrupts a chance now.
+ */
+ return (0);
+ }
+
+ sleepq_lock(ident);
+ CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)",
+ td->td_tid, p->p_pid, td->td_name, wmesg, ident);
+
+ DROP_GIANT();
+ mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
+ WITNESS_SAVE(&mtx->lock_object, mtx);
+ mtx_unlock_spin(mtx);
+
+ /*
+ * We put ourselves on the sleep queue and start our timeout.
+ */
+ sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0);
+ if (sbt != 0)
+ sleepq_set_timeout_sbt(ident, sbt, pr, flags);
+
+ /*
+ * Can't call ktrace with any spin locks held, since it may need to
+ * lock the ktrace_mtx lock, and WITNESS_WARN considers it an error
+ * to hold any spin lock. Thus, we have to drop the sleepq spin
+ * lock while we handle those requests. This is safe since we have
+ * placed our thread on the sleep queue already.
+ */
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW)) {
+ sleepq_release(ident);
+ ktrcsw(1, 0, wmesg);
+ sleepq_lock(ident);
+ }
+#endif
+#ifdef WITNESS
+ sleepq_release(ident);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"",
+ wmesg);
+ sleepq_lock(ident);
+#endif
+ if (sbt != 0)
+ rval = sleepq_timedwait(ident, 0);
+ else {
+ sleepq_wait(ident, 0);
+ rval = 0;
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 0, wmesg);
+#endif
+ PICKUP_GIANT();
+ mtx_lock_spin(mtx);
+ WITNESS_RESTORE(&mtx->lock_object, mtx);
+ return (rval);
+}
+
+/*
+ * pause() delays the calling thread by the given number of system ticks.
+ * During cold bootup, pause() uses the DELAY() function instead of
+ * the tsleep() function to do the waiting. The timeout must be
+ * greater than or equal to zero; for pause_sbt() it is given in
+ * sbintime_t units. A timeout of zero is treated as a delay of one tick.
+ */
+int
+pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
+{
+ KASSERT(sbt >= 0, ("pause: timeout must be >= 0"));
+
+ /* silently convert invalid timeouts */
+ if (sbt == 0)
+ sbt = tick_sbt;
+
+ if (cold) {
+ /*
+ * We delay one second at a time to avoid overflowing the
+ * system specific DELAY() function(s):
+ */
+ while (sbt >= SBT_1S) {
+ DELAY(1000000);
+ sbt -= SBT_1S;
+ }
+ /* Do the delay remainder, if any */
+ sbt = (sbt + SBT_1US - 1) / SBT_1US;
+ if (sbt > 0)
+ DELAY(sbt);
+ return (0);
+ }
+ return (_sleep(&pause_wchan[curcpu], NULL, 0, wmesg, sbt, pr, flags));
+}
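/*
 * Illustrative sketch, not part of this change: a driver-style settle delay
 * built on pause(); the wait message and duration are hypothetical.
 */
static void
baz_settle(void)
{

	/* Sleep roughly 100ms; during early boot this degrades to DELAY(). */
	pause("bzstl", hz / 10);
}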
+
+/*
+ * Make all threads sleeping on the specified identifier runnable.
+ */
+void
+wakeup(void *ident)
+{
+ int wakeup_swapper;
+
+ sleepq_lock(ident);
+ wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0);
+ sleepq_release(ident);
+ if (wakeup_swapper) {
+ KASSERT(ident != &proc0,
+ ("wakeup and wakeup_swapper and proc0"));
+ kick_proc0();
+ }
+}
+
+/*
+ * Make a thread sleeping on the specified identifier runnable.
+ * May wake more than one thread if a target thread is currently
+ * swapped out.
+ */
+void
+wakeup_one(void *ident)
+{
+ int wakeup_swapper;
+
+ sleepq_lock(ident);
+ wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP, 0, 0);
+ sleepq_release(ident);
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+static void
+kdb_switch(void)
+{
+ thread_unlock(curthread);
+ kdb_backtrace();
+ kdb_reenter();
+ panic("%s: did not reenter debugger", __func__);
+}
+
+/*
+ * The machine independent parts of context switching.
+ */
+void
+mi_switch(int flags, struct thread *newtd)
+{
+ uint64_t runtime, new_switchtime;
+ struct thread *td;
+ struct proc *p;
+
+ td = curthread; /* XXX */
+ THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
+ p = td->td_proc; /* XXX */
+ KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code"));
+#ifdef INVARIANTS
+ if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td))
+ mtx_assert(&Giant, MA_NOTOWNED);
+#endif
+ KASSERT(td->td_critnest == 1 || panicstr,
+ ("mi_switch: switch in a critical section"));
+ KASSERT((flags & (SW_INVOL | SW_VOL)) != 0,
+ ("mi_switch: switch must be voluntary or involuntary"));
+ KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself"));
+
+ /*
+ * Don't perform context switches from the debugger.
+ */
+ if (kdb_active)
+ kdb_switch();
+ if (SCHEDULER_STOPPED())
+ return;
+ if (flags & SW_VOL) {
+ td->td_ru.ru_nvcsw++;
+ td->td_swvoltick = ticks;
+ } else
+ td->td_ru.ru_nivcsw++;
+#ifdef SCHED_STATS
+ SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]);
+#endif
+ /*
+ * Compute the amount of time during which the current
+ * thread was running, and add that to its total so far.
+ */
+ new_switchtime = cpu_ticks();
+ runtime = new_switchtime - PCPU_GET(switchtime);
+ td->td_runtime += runtime;
+ td->td_incruntime += runtime;
+ PCPU_SET(switchtime, new_switchtime);
+ td->td_generation++; /* bump preempt-detect counter */
+ PCPU_INC(cnt.v_swtch);
+ PCPU_SET(switchticks, ticks);
+ CTR4(KTR_PROC, "mi_switch: old thread %ld (td_sched %p, pid %ld, %s)",
+ td->td_tid, td->td_sched, p->p_pid, td->td_name);
+#if (KTR_COMPILE & KTR_SCHED) != 0
+ if (TD_IS_IDLETHREAD(td))
+ KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
+ "prio:%d", td->td_priority);
+ else
+ KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
+ "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
+ "lockname:\"%s\"", td->td_lockname);
+#endif
+ SDT_PROBE0(sched, , , preempt);
+#ifdef XEN
+ PT_UPDATES_FLUSH();
+#endif
+ sched_switch(td, newtd, flags);
+ KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
+ "prio:%d", td->td_priority);
+
+ CTR4(KTR_PROC, "mi_switch: new thread %ld (td_sched %p, pid %ld, %s)",
+ td->td_tid, td->td_sched, p->p_pid, td->td_name);
+
+ /*
+ * If the last thread was exiting, finish cleaning it up.
+ */
+ if ((td = PCPU_GET(deadthread))) {
+ PCPU_SET(deadthread, NULL);
+ thread_stash(td);
+ }
+}
+
+/*
+ * Change thread state to be runnable, placing it on the run queue if
+ * it is in memory. If it is swapped out, return true so our caller
+ * will know to awaken the swapper.
+ */
+int
+setrunnable(struct thread *td)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT(td->td_proc->p_state != PRS_ZOMBIE,
+ ("setrunnable: pid %d is a zombie", td->td_proc->p_pid));
+ switch (td->td_state) {
+ case TDS_RUNNING:
+ case TDS_RUNQ:
+ return (0);
+ case TDS_INHIBITED:
+ /*
+ * If we are only inhibited because we are swapped out,
+ * then arrange to swap in this process. Otherwise just return.
+ */
+ if (td->td_inhibitors != TDI_SWAPPED)
+ return (0);
+ /* FALLTHROUGH */
+ case TDS_CAN_RUN:
+ break;
+ default:
+ printf("state is 0x%x", td->td_state);
+ panic("setrunnable(2)");
+ }
+ if ((td->td_flags & TDF_INMEM) == 0) {
+ if ((td->td_flags & TDF_SWAPINREQ) == 0) {
+ td->td_flags |= TDF_SWAPINREQ;
+ return (1);
+ }
+ } else
+ sched_wakeup(td);
+ return (0);
+}
+
+/*
+ * Compute a tenex style load average of a quantity on
+ * 1, 5 and 15 minute intervals.
+ */
+static void
+loadav(void *arg)
+{
+ int i, nrun;
+ struct loadavg *avg;
+
+ nrun = sched_load();
+ avg = &averunnable;
+
+ for (i = 0; i < 3; i++)
+ avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
+ nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
+
+ /*
+ * Schedule the next update to occur after 5 seconds, but add a
+ * random variation to avoid synchronisation with processes that
+ * run at regular intervals.
+ */
+ callout_reset_sbt(&loadav_callout,
+ tick_sbt * (hz * 4 + (int)(random() % (hz * 2 + 1))), 0,
+ loadav, NULL, C_DIRECT_EXEC | C_HARDCLOCK);
+}
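/*
 * Illustrative sketch, not part of this change: the same exponentially
 * weighted moving average in floating point, to show what the fixed-point
 * update above computes.  With a 5 second sample period and a 60 second
 * horizon the decay factor is exp(-5/60) = exp(-1/12), matching cexp[0].
 * Userland-only illustration; requires <math.h>.
 */
#include <math.h>

static double
loadavg_step(double avg, int nrun, double period_sec, double horizon_sec)
{
	double decay = exp(-period_sec / horizon_sec);	/* e.g. ~0.9200 */

	return (avg * decay + (double)nrun * (1.0 - decay));
}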
+
+/* ARGSUSED */
+static void
+synch_setup(void *dummy)
+{
+ callout_init(&loadav_callout, CALLOUT_MPSAFE);
+
+ /* Kick off timeout driven events by calling first time. */
+ loadav(NULL);
+}
+
+int
+should_yield(void)
+{
+
+ return ((unsigned int)(ticks - curthread->td_swvoltick) >= hogticks);
+}
+
+void
+maybe_yield(void)
+{
+
+ if (should_yield())
+ kern_yield(PRI_USER);
+}
+
+void
+kern_yield(int prio)
+{
+ struct thread *td;
+
+ td = curthread;
+ DROP_GIANT();
+ thread_lock(td);
+ if (prio == PRI_USER)
+ prio = td->td_user_pri;
+ if (prio >= 0)
+ sched_prio(td, prio);
+ mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
+ thread_unlock(td);
+ PICKUP_GIANT();
+}
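/*
 * Illustrative sketch, not part of this change: cooperative yielding from a
 * long-running kernel loop.  "quux_process_one" and the item count are
 * hypothetical.
 */
static void	quux_process_one(int item);

static void
quux_process_all(int nitems)
{
	int i;

	for (i = 0; i < nitems; i++) {
		quux_process_one(i);
		/*
		 * should_yield() becomes true once we have run for at
		 * least hogticks ticks since the last voluntary switch;
		 * maybe_yield() then enters kern_yield(PRI_USER).
		 */
		maybe_yield();
	}
}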
+
+/*
+ * General purpose yield system call.
+ */
+int
+sys_yield(struct thread *td, struct yield_args *uap)
+{
+
+ thread_lock(td);
+ if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
+ sched_prio(td, PRI_MAX_TIMESHARE);
+ mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
+ thread_unlock(td);
+ td->td_retval[0] = 0;
+ return (0);
+}
diff --git a/sys/kern/kern_syscalls.c b/sys/kern/kern_syscalls.c
new file mode 100644
index 0000000..03f6088
--- /dev/null
+++ b/sys/kern/kern_syscalls.c
@@ -0,0 +1,220 @@
+/*-
+ * Copyright (c) 1999 Assar Westerlund
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/sx.h>
+#include <sys/syscall.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <machine/atomic.h>
+
+/*
+ * Acts like "nosys" but can be identified in sysent, so that call
+ * numbers can be assigned dynamically for a limited number of calls.
+ *
+ * Placeholder for system call slots reserved for loadable modules.
+ */
+int
+lkmnosys(struct thread *td, struct nosys_args *args)
+{
+
+ return (nosys(td, args));
+}
+
+int
+lkmressys(struct thread *td, struct nosys_args *args)
+{
+
+ return (nosys(td, args));
+}
+
+static void
+syscall_thread_drain(struct sysent *se)
+{
+ u_int32_t cnt, oldcnt;
+
+ do {
+ oldcnt = se->sy_thrcnt;
+ KASSERT((oldcnt & SY_THR_STATIC) == 0,
+ ("drain on static syscall"));
+ cnt = oldcnt | SY_THR_DRAINING;
+ } while (atomic_cmpset_acq_32(&se->sy_thrcnt, oldcnt, cnt) == 0);
+ while (atomic_cmpset_32(&se->sy_thrcnt, SY_THR_DRAINING,
+ SY_THR_ABSENT) == 0)
+ pause("scdrn", hz/2);
+}
+
+int
+syscall_thread_enter(struct thread *td, struct sysent *se)
+{
+ u_int32_t cnt, oldcnt;
+
+ do {
+ oldcnt = se->sy_thrcnt;
+ if ((oldcnt & SY_THR_STATIC) != 0)
+ return (0);
+ if ((oldcnt & (SY_THR_DRAINING | SY_THR_ABSENT)) != 0)
+ return (ENOSYS);
+ cnt = oldcnt + SY_THR_INCR;
+ } while (atomic_cmpset_acq_32(&se->sy_thrcnt, oldcnt, cnt) == 0);
+ return (0);
+}
+
+void
+syscall_thread_exit(struct thread *td, struct sysent *se)
+{
+ u_int32_t cnt, oldcnt;
+
+ do {
+ oldcnt = se->sy_thrcnt;
+ if ((oldcnt & SY_THR_STATIC) != 0)
+ return;
+ cnt = oldcnt - SY_THR_INCR;
+ } while (atomic_cmpset_rel_32(&se->sy_thrcnt, oldcnt, cnt) == 0);
+}
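/*
 * Illustrative sketch, not part of this change: how a syscall dispatcher
 * could bracket handler execution with the per-entry thread count above, so
 * that syscall_thread_drain() can wait out in-flight calls.  Simplified;
 * "se" and "args" are assumed to be set up by the caller.
 */
static int
dispatch_one(struct thread *td, struct sysent *se, void *args)
{
	int error;

	error = syscall_thread_enter(td, se);
	if (error != 0)
		return (error);		/* entry absent or being drained */
	error = se->sy_call(td, args);
	syscall_thread_exit(td, se);
	return (error);
}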
+
+int
+syscall_register(int *offset, struct sysent *new_sysent,
+ struct sysent *old_sysent)
+{
+ int i;
+
+ if (*offset == NO_SYSCALL) {
+ for (i = 1; i < SYS_MAXSYSCALL; ++i)
+ if (sysent[i].sy_call == (sy_call_t *)lkmnosys)
+ break;
+ if (i == SYS_MAXSYSCALL)
+ return (ENFILE);
+ *offset = i;
+ } else if (*offset < 0 || *offset >= SYS_MAXSYSCALL)
+ return (EINVAL);
+ else if (sysent[*offset].sy_call != (sy_call_t *)lkmnosys &&
+ sysent[*offset].sy_call != (sy_call_t *)lkmressys)
+ return (EEXIST);
+
+ KASSERT(sysent[*offset].sy_thrcnt == SY_THR_ABSENT,
+ ("dynamic syscall is not protected"));
+ *old_sysent = sysent[*offset];
+ new_sysent->sy_thrcnt = SY_THR_ABSENT;
+ sysent[*offset] = *new_sysent;
+ atomic_store_rel_32(&sysent[*offset].sy_thrcnt, 0);
+ return (0);
+}
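/*
 * Illustrative sketch, not part of this change: direct use of
 * syscall_register()/syscall_deregister() from a module with a dynamically
 * assigned slot.  The handler and its sysent are hypothetical; most modules
 * would use the SYSCALL_MODULE() convenience macro instead.
 */
static int
mysc(struct thread *td, void *args)
{

	td->td_retval[0] = 0;
	return (0);
}

static struct sysent mysc_sysent = {
	.sy_narg = 0,
	.sy_call = (sy_call_t *)mysc,
};
static struct sysent mysc_old_sysent;
static int mysc_offset = NO_SYSCALL;

static int
mysc_load(void)
{

	/* Picks a free lkmnosys() slot and installs the handler there. */
	return (syscall_register(&mysc_offset, &mysc_sysent, &mysc_old_sysent));
}

static void
mysc_unload(void)
{

	syscall_deregister(&mysc_offset, &mysc_old_sysent);
}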
+
+int
+syscall_deregister(int *offset, struct sysent *old_sysent)
+{
+
+ if (*offset) {
+ syscall_thread_drain(&sysent[*offset]);
+ sysent[*offset] = *old_sysent;
+ }
+ return (0);
+}
+
+int
+syscall_module_handler(struct module *mod, int what, void *arg)
+{
+ struct syscall_module_data *data = arg;
+ modspecific_t ms;
+ int error;
+
+ switch (what) {
+ case MOD_LOAD:
+ error = syscall_register(data->offset, data->new_sysent,
+ &data->old_sysent);
+ if (error) {
+ /* Leave a mark so we know to safely unload below. */
+ data->offset = NULL;
+ return (error);
+ }
+ ms.intval = *data->offset;
+ MOD_XLOCK;
+ module_setspecific(mod, &ms);
+ MOD_XUNLOCK;
+ if (data->chainevh)
+ error = data->chainevh(mod, what, data->chainarg);
+ return (error);
+ case MOD_UNLOAD:
+ /*
+ * MOD_LOAD failed, so just return without calling the
+ * chained handler since we didn't pass along the MOD_LOAD
+ * event.
+ */
+ if (data->offset == NULL)
+ return (0);
+ if (data->chainevh) {
+ error = data->chainevh(mod, what, data->chainarg);
+ if (error)
+ return (error);
+ }
+ error = syscall_deregister(data->offset, &data->old_sysent);
+ return (error);
+ default:
+ if (data->chainevh)
+ return (data->chainevh(mod, what, data->chainarg));
+ return (EOPNOTSUPP);
+ }
+
+ /* NOTREACHED */
+}
+
+int
+syscall_helper_register(struct syscall_helper_data *sd)
+{
+ struct syscall_helper_data *sd1;
+ int error;
+
+ for (sd1 = sd; sd1->syscall_no != NO_SYSCALL; sd1++) {
+ error = syscall_register(&sd1->syscall_no, &sd1->new_sysent,
+ &sd1->old_sysent);
+ if (error != 0) {
+ syscall_helper_unregister(sd);
+ return (error);
+ }
+ sd1->registered = 1;
+ }
+ return (0);
+}
+
+int
+syscall_helper_unregister(struct syscall_helper_data *sd)
+{
+ struct syscall_helper_data *sd1;
+
+ for (sd1 = sd; sd1->registered != 0; sd1++) {
+ syscall_deregister(&sd1->syscall_no, &sd1->old_sysent);
+ sd1->registered = 0;
+ }
+ return (0);
+}
diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c
new file mode 100644
index 0000000..416f85f
--- /dev/null
+++ b/sys/kern/kern_sysctl.c
@@ -0,0 +1,1656 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Karels at Berkeley Software Design, Inc.
+ *
+ * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
+ * project, to make these variables more user-friendly.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_compat.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/fail.h>
+#include <sys/systm.h>
+#include <sys/capability.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/jail.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sbuf.h>
+#include <sys/sx.h>
+#include <sys/sysproto.h>
+#include <sys/uio.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <net/vnet.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic");
+static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids");
+static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer");
+
+/*
+ * The sysctllock protects the MIB tree. It also protects sysctl
+ * contexts used with dynamic sysctls. The sysctl_register_oid() and
+ * sysctl_unregister_oid() routines require the sysctllock to already
+ * be held, so the sysctl_lock() and sysctl_unlock() routines are
+ * provided for the few places in the kernel which need to use that
+ * API rather than using the dynamic API. Use of the dynamic API is
+ * strongly encouraged for most code.
+ *
+ * The sysctlmemlock is used to limit the amount of user memory wired for
+ * sysctl requests. This is implemented by serializing any userland
+ * sysctl requests larger than a single page via an exclusive lock.
+ */
+static struct sx sysctllock;
+static struct sx sysctlmemlock;
+
+#define SYSCTL_XLOCK() sx_xlock(&sysctllock)
+#define SYSCTL_XUNLOCK() sx_xunlock(&sysctllock)
+#define SYSCTL_ASSERT_XLOCKED() sx_assert(&sysctllock, SA_XLOCKED)
+#define SYSCTL_INIT() sx_init(&sysctllock, "sysctl lock")
+#define SYSCTL_SLEEP(ch, wmesg, timo) \
+ sx_sleep(ch, &sysctllock, 0, wmesg, timo)
+
+static int sysctl_root(SYSCTL_HANDLER_ARGS);
+
+struct sysctl_oid_list sysctl__children; /* root list */
+
+static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del,
+ int recurse);
+
+static struct sysctl_oid *
+sysctl_find_oidname(const char *name, struct sysctl_oid_list *list)
+{
+ struct sysctl_oid *oidp;
+
+ SYSCTL_ASSERT_XLOCKED();
+ SLIST_FOREACH(oidp, list, oid_link) {
+ if (strcmp(oidp->oid_name, name) == 0) {
+ return (oidp);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Initialization of the MIB tree.
+ *
+ * Order by number in each list.
+ */
+void
+sysctl_lock(void)
+{
+
+ SYSCTL_XLOCK();
+}
+
+void
+sysctl_unlock(void)
+{
+
+ SYSCTL_XUNLOCK();
+}
+
+void
+sysctl_register_oid(struct sysctl_oid *oidp)
+{
+ struct sysctl_oid_list *parent = oidp->oid_parent;
+ struct sysctl_oid *p;
+ struct sysctl_oid *q;
+
+ /*
+ * First check if another oid with the same name already
+ * exists in the parent's list.
+ */
+ SYSCTL_ASSERT_XLOCKED();
+ p = sysctl_find_oidname(oidp->oid_name, parent);
+ if (p != NULL) {
+ if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ p->oid_refcnt++;
+ return;
+ } else {
+ printf("can't re-use a leaf (%s)!\n", p->oid_name);
+ return;
+ }
+ }
+ /*
+ * If this oid has a number OID_AUTO, give it a number which
+ * is greater than any current oid.
+ * NOTE: DO NOT change the starting value here, change it in
+ * <sys/sysctl.h>, and make sure it is at least 256 to
+ * accommodate e.g. net.inet.raw as a static sysctl node.
+ */
+ if (oidp->oid_number == OID_AUTO) {
+ static int newoid = CTL_AUTO_START;
+
+ oidp->oid_number = newoid++;
+ if (newoid == 0x7fffffff)
+ panic("out of oids");
+ }
+#if 0
+ else if (oidp->oid_number >= CTL_AUTO_START) {
+ /* do not panic; this happens when unregistering sysctl sets */
+ printf("static sysctl oid too high: %d", oidp->oid_number);
+ }
+#endif
+
+ /*
+ * Insert the oid into the parent's list in order.
+ */
+ q = NULL;
+ SLIST_FOREACH(p, parent, oid_link) {
+ if (oidp->oid_number < p->oid_number)
+ break;
+ q = p;
+ }
+ if (q)
+ SLIST_INSERT_AFTER(q, oidp, oid_link);
+ else
+ SLIST_INSERT_HEAD(parent, oidp, oid_link);
+}
+
+void
+sysctl_unregister_oid(struct sysctl_oid *oidp)
+{
+ struct sysctl_oid *p;
+ int error;
+
+ SYSCTL_ASSERT_XLOCKED();
+ error = ENOENT;
+ if (oidp->oid_number == OID_AUTO) {
+ error = EINVAL;
+ } else {
+ SLIST_FOREACH(p, oidp->oid_parent, oid_link) {
+ if (p == oidp) {
+ SLIST_REMOVE(oidp->oid_parent, oidp,
+ sysctl_oid, oid_link);
+ error = 0;
+ break;
+ }
+ }
+ }
+
+ /*
+ * This can happen when a module fails to register and is
+ * being unloaded afterwards. It should not be a panic()
+ * for normal use.
+ */
+ if (error)
+ printf("%s: failed to unregister sysctl\n", __func__);
+}
+
+/* Initialize a new context to keep track of dynamically added sysctls. */
+int
+sysctl_ctx_init(struct sysctl_ctx_list *c)
+{
+
+ if (c == NULL) {
+ return (EINVAL);
+ }
+
+ /*
+ * No locking here, the caller is responsible for not adding
+ * new nodes to a context until after this function has
+ * returned.
+ */
+ TAILQ_INIT(c);
+ return (0);
+}
+
+/* Free the context, and destroy all dynamic oids registered in this context */
+int
+sysctl_ctx_free(struct sysctl_ctx_list *clist)
+{
+ struct sysctl_ctx_entry *e, *e1;
+ int error;
+
+ error = 0;
+ /*
+ * First perform a "dry run" to check if it's ok to remove oids.
+ * XXX FIXME
+ * XXX This algorithm is a hack. But I don't know any
+ * XXX better solution for now...
+ */
+ SYSCTL_XLOCK();
+ TAILQ_FOREACH(e, clist, link) {
+ error = sysctl_remove_oid_locked(e->entry, 0, 0);
+ if (error)
+ break;
+ }
+ /*
+ * Restore deregistered entries, either from the end,
+ * or from the place where the error occurred.
+ * e contains the entry that was not unregistered.
+ */
+ if (error)
+ e1 = TAILQ_PREV(e, sysctl_ctx_list, link);
+ else
+ e1 = TAILQ_LAST(clist, sysctl_ctx_list);
+ while (e1 != NULL) {
+ sysctl_register_oid(e1->entry);
+ e1 = TAILQ_PREV(e1, sysctl_ctx_list, link);
+ }
+ if (error) {
+ SYSCTL_XUNLOCK();
+ return(EBUSY);
+ }
+ /* Now really delete the entries */
+ e = TAILQ_FIRST(clist);
+ while (e != NULL) {
+ e1 = TAILQ_NEXT(e, link);
+ error = sysctl_remove_oid_locked(e->entry, 1, 0);
+ if (error)
+ panic("sysctl_remove_oid: corrupt tree, entry: %s",
+ e->entry->oid_name);
+ free(e, M_SYSCTLOID);
+ e = e1;
+ }
+ SYSCTL_XUNLOCK();
+ return (error);
+}
+
+/* Add an entry to the context */
+struct sysctl_ctx_entry *
+sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
+{
+ struct sysctl_ctx_entry *e;
+
+ SYSCTL_ASSERT_XLOCKED();
+ if (clist == NULL || oidp == NULL)
+ return(NULL);
+ e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK);
+ e->entry = oidp;
+ TAILQ_INSERT_HEAD(clist, e, link);
+ return (e);
+}
+
+/* Find an entry in the context */
+struct sysctl_ctx_entry *
+sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
+{
+ struct sysctl_ctx_entry *e;
+
+ SYSCTL_ASSERT_XLOCKED();
+ if (clist == NULL || oidp == NULL)
+ return(NULL);
+ TAILQ_FOREACH(e, clist, link) {
+ if(e->entry == oidp)
+ return(e);
+ }
+ return (e);
+}
+
+/*
+ * Delete an entry from the context.
+ * NOTE: this function doesn't free oidp! You have to remove it
+ * with sysctl_remove_oid().
+ */
+int
+sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
+{
+ struct sysctl_ctx_entry *e;
+
+ if (clist == NULL || oidp == NULL)
+ return (EINVAL);
+ SYSCTL_XLOCK();
+ e = sysctl_ctx_entry_find(clist, oidp);
+ if (e != NULL) {
+ TAILQ_REMOVE(clist, e, link);
+ SYSCTL_XUNLOCK();
+ free(e, M_SYSCTLOID);
+ return (0);
+ } else {
+ SYSCTL_XUNLOCK();
+ return (ENOENT);
+ }
+}
+
+/*
+ * Remove dynamically created sysctl trees.
+ * oidp - top of the tree to be removed
+ * del - if 0 - just deregister, otherwise free up entries as well
+ * recurse - if != 0 traverse the subtree to be deleted
+ */
+int
+sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse)
+{
+ int error;
+
+ SYSCTL_XLOCK();
+ error = sysctl_remove_oid_locked(oidp, del, recurse);
+ SYSCTL_XUNLOCK();
+ return (error);
+}
+
+int
+sysctl_remove_name(struct sysctl_oid *parent, const char *name,
+ int del, int recurse)
+{
+ struct sysctl_oid *p, *tmp;
+ int error;
+
+ error = ENOENT;
+ SYSCTL_XLOCK();
+ SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(parent), oid_link, tmp) {
+ if (strcmp(p->oid_name, name) == 0) {
+ error = sysctl_remove_oid_locked(p, del, recurse);
+ break;
+ }
+ }
+ SYSCTL_XUNLOCK();
+
+ return (error);
+}
+
+
+static int
+sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse)
+{
+ struct sysctl_oid *p, *tmp;
+ int error;
+
+ SYSCTL_ASSERT_XLOCKED();
+ if (oidp == NULL)
+ return(EINVAL);
+ if ((oidp->oid_kind & CTLFLAG_DYN) == 0) {
+ printf("can't remove non-dynamic nodes!\n");
+ return (EINVAL);
+ }
+ /*
+ * WARNING: normally this should be done through
+ * sysctl_ctx_free(). Use recursion only as a last-resort
+ * method to purge your sysctl tree of leftovers...
+ * However, if some other code still references these nodes,
+ * it will panic.
+ */
+ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ if (oidp->oid_refcnt == 1) {
+ SLIST_FOREACH_SAFE(p,
+ SYSCTL_CHILDREN(oidp), oid_link, tmp) {
+ if (!recurse) {
+ printf("Warning: failed attempt to "
+ "remove oid %s with child %s\n",
+ oidp->oid_name, p->oid_name);
+ return (ENOTEMPTY);
+ }
+ error = sysctl_remove_oid_locked(p, del,
+ recurse);
+ if (error)
+ return (error);
+ }
+ if (del)
+ free(SYSCTL_CHILDREN(oidp), M_SYSCTLOID);
+ }
+ }
+ if (oidp->oid_refcnt > 1) {
+ oidp->oid_refcnt--;
+ } else {
+ if (oidp->oid_refcnt == 0) {
+ printf("Warning: bad oid_refcnt=%u (%s)!\n",
+ oidp->oid_refcnt, oidp->oid_name);
+ return (EINVAL);
+ }
+ sysctl_unregister_oid(oidp);
+ if (del) {
+ /*
+ * Wait for all threads running the handler to drain.
+ * This preserves the previous behavior when the
+ * sysctl lock was held across a handler invocation,
+ * and is necessary for module unload correctness.
+ */
+ while (oidp->oid_running > 0) {
+ oidp->oid_kind |= CTLFLAG_DYING;
+ SYSCTL_SLEEP(&oidp->oid_running, "oidrm", 0);
+ }
+ if (oidp->oid_descr)
+ free(__DECONST(char *, oidp->oid_descr),
+ M_SYSCTLOID);
+ free(__DECONST(char *, oidp->oid_name), M_SYSCTLOID);
+ free(oidp, M_SYSCTLOID);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Create new sysctls at run time.
+ * clist may point to a valid context initialized with sysctl_ctx_init().
+ */
+struct sysctl_oid *
+sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent,
+ int number, const char *name, int kind, void *arg1, intptr_t arg2,
+ int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr)
+{
+ struct sysctl_oid *oidp;
+
+ /* You have to hook up somewhere.. */
+ if (parent == NULL)
+ return(NULL);
+ /* Check if the node already exists, otherwise create it */
+ SYSCTL_XLOCK();
+ oidp = sysctl_find_oidname(name, parent);
+ if (oidp != NULL) {
+ if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ oidp->oid_refcnt++;
+ /* Update the context */
+ if (clist != NULL)
+ sysctl_ctx_entry_add(clist, oidp);
+ SYSCTL_XUNLOCK();
+ return (oidp);
+ } else {
+ SYSCTL_XUNLOCK();
+ printf("can't re-use a leaf (%s)!\n", name);
+ return (NULL);
+ }
+ }
+ oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO);
+ oidp->oid_parent = parent;
+ SLIST_NEXT(oidp, oid_link) = NULL;
+ oidp->oid_number = number;
+ oidp->oid_refcnt = 1;
+ oidp->oid_name = strdup(name, M_SYSCTLOID);
+ oidp->oid_handler = handler;
+ oidp->oid_kind = CTLFLAG_DYN | kind;
+ if ((kind & CTLTYPE) == CTLTYPE_NODE) {
+ /* Allocate space for children */
+ SYSCTL_CHILDREN_SET(oidp, malloc(sizeof(struct sysctl_oid_list),
+ M_SYSCTLOID, M_WAITOK));
+ SLIST_INIT(SYSCTL_CHILDREN(oidp));
+ oidp->oid_arg2 = arg2;
+ } else {
+ oidp->oid_arg1 = arg1;
+ oidp->oid_arg2 = arg2;
+ }
+ oidp->oid_fmt = fmt;
+ if (descr)
+ oidp->oid_descr = strdup(descr, M_SYSCTLOID);
+ /* Update the context, if used */
+ if (clist != NULL)
+ sysctl_ctx_entry_add(clist, oidp);
+ /* Register this oid */
+ sysctl_register_oid(oidp);
+ SYSCTL_XUNLOCK();
+ return (oidp);
+}
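/*
 * Illustrative sketch, not part of this change: dynamic sysctl creation
 * through a context, as a driver might do at attach time.  The node names,
 * the variable and the parent (_debug) are hypothetical; SYSCTL_ADD_NODE()
 * and SYSCTL_ADD_INT() expand to sysctl_add_oid() above.
 */
static struct sysctl_ctx_list mydev_ctx;
static int mydev_verbose;

static int
mydev_sysctl_attach(void)
{
	struct sysctl_oid *node;

	sysctl_ctx_init(&mydev_ctx);
	node = SYSCTL_ADD_NODE(&mydev_ctx, SYSCTL_STATIC_CHILDREN(_debug),
	    OID_AUTO, "mydev", CTLFLAG_RD, NULL, "mydev knobs");
	if (node == NULL)
		return (ENOMEM);
	SYSCTL_ADD_INT(&mydev_ctx, SYSCTL_CHILDREN(node), OID_AUTO,
	    "verbose", CTLFLAG_RW, &mydev_verbose, 0, "verbosity level");
	return (0);
}

static void
mydev_sysctl_detach(void)
{

	/* Tears down everything that was added through the context. */
	sysctl_ctx_free(&mydev_ctx);
}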
+
+/*
+ * Rename an existing oid.
+ */
+void
+sysctl_rename_oid(struct sysctl_oid *oidp, const char *name)
+{
+ char *newname;
+ char *oldname;
+
+ newname = strdup(name, M_SYSCTLOID);
+ SYSCTL_XLOCK();
+ oldname = __DECONST(char *, oidp->oid_name);
+ oidp->oid_name = newname;
+ SYSCTL_XUNLOCK();
+ free(oldname, M_SYSCTLOID);
+}
+
+/*
+ * Reparent an existing oid.
+ */
+int
+sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent)
+{
+ struct sysctl_oid *oidp;
+
+ SYSCTL_XLOCK();
+ if (oid->oid_parent == parent) {
+ SYSCTL_XUNLOCK();
+ return (0);
+ }
+ oidp = sysctl_find_oidname(oid->oid_name, parent);
+ if (oidp != NULL) {
+ SYSCTL_XUNLOCK();
+ return (EEXIST);
+ }
+ sysctl_unregister_oid(oid);
+ oid->oid_parent = parent;
+ oid->oid_number = OID_AUTO;
+ sysctl_register_oid(oid);
+ SYSCTL_XUNLOCK();
+ return (0);
+}
+
+/*
+ * Register the kernel's oids on startup.
+ */
+SET_DECLARE(sysctl_set, struct sysctl_oid);
+
+static void
+sysctl_register_all(void *arg)
+{
+ struct sysctl_oid **oidp;
+
+ sx_init(&sysctlmemlock, "sysctl mem");
+ SYSCTL_INIT();
+ SYSCTL_XLOCK();
+ SET_FOREACH(oidp, sysctl_set)
+ sysctl_register_oid(*oidp);
+ SYSCTL_XUNLOCK();
+}
+SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_register_all, 0);
+
+/*
+ * "Staff-functions"
+ *
+ * These functions implement a presently undocumented interface
+ * used by the sysctl program to walk the tree, and get the type
+ * so it can print the value.
+ * This interface is under work and consideration, and should probably
+ * be killed with a big axe by the first person who can find the time.
+ * (Be aware, though, that the proper interface isn't as obvious as it
+ * may seem; there are various conflicting requirements.)
+ *
+ * {0,0} printf the entire MIB-tree.
+ * {0,1,...} return the name of the "..." OID.
+ * {0,2,...} return the next OID.
+ * {0,3} return the OID of the name in "new"
+ * {0,4,...} return the kind & format info for the "..." OID.
+ * {0,5,...} return the description of the "..." OID.
+ */
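/*
 * Illustrative sketch, not part of this change: how userland reaches the
 * interface described above via sysctl(3).  sysctlnametomib() resolves
 * "kern.ostype" through the {0,3} name-to-OID node; the value is then read
 * through the resulting MIB.  Userland-only code with abbreviated error
 * handling.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int mib[CTL_MAXNAME];
	size_t miblen = CTL_MAXNAME;
	char buf[64];
	size_t buflen = sizeof(buf);

	if (sysctlnametomib("kern.ostype", mib, &miblen) == -1)
		return (1);
	if (sysctl(mib, (u_int)miblen, buf, &buflen, NULL, 0) == -1)
		return (1);
	printf("%s\n", buf);
	return (0);
}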
+
+#ifdef SYSCTL_DEBUG
+static void
+sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
+{
+ int k;
+ struct sysctl_oid *oidp;
+
+ SYSCTL_ASSERT_XLOCKED();
+ SLIST_FOREACH(oidp, l, oid_link) {
+
+ for (k=0; k<i; k++)
+ printf(" ");
+
+ printf("%d %s ", oidp->oid_number, oidp->oid_name);
+
+ printf("%c%c",
+ oidp->oid_kind & CTLFLAG_RD ? 'R':' ',
+ oidp->oid_kind & CTLFLAG_WR ? 'W':' ');
+
+ if (oidp->oid_handler)
+ printf(" *Handler");
+
+ switch (oidp->oid_kind & CTLTYPE) {
+ case CTLTYPE_NODE:
+ printf(" Node\n");
+ if (!oidp->oid_handler) {
+ sysctl_sysctl_debug_dump_node(
+ oidp->oid_arg1, i+2);
+ }
+ break;
+ case CTLTYPE_INT: printf(" Int\n"); break;
+ case CTLTYPE_UINT: printf(" u_int\n"); break;
+ case CTLTYPE_LONG: printf(" Long\n"); break;
+ case CTLTYPE_ULONG: printf(" u_long\n"); break;
+ case CTLTYPE_STRING: printf(" String\n"); break;
+ case CTLTYPE_U64: printf(" uint64_t\n"); break;
+ case CTLTYPE_S64: printf(" int64_t\n"); break;
+ case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break;
+ default: printf("\n");
+ }
+
+ }
+}
+
+static int
+sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+
+ error = priv_check(req->td, PRIV_SYSCTL_DEBUG);
+ if (error)
+ return (error);
+ SYSCTL_XLOCK();
+ sysctl_sysctl_debug_dump_node(&sysctl__children, 0);
+ SYSCTL_XUNLOCK();
+ return (ENOENT);
+}
+
+SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD,
+ 0, 0, sysctl_sysctl_debug, "-", "");
+#endif
+
+static int
+sysctl_sysctl_name(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
+ int error = 0;
+ struct sysctl_oid *oid;
+ struct sysctl_oid_list *lsp = &sysctl__children, *lsp2;
+ char buf[10];
+
+ SYSCTL_XLOCK();
+ while (namelen) {
+ if (!lsp) {
+ snprintf(buf,sizeof(buf),"%d",*name);
+ if (req->oldidx)
+ error = SYSCTL_OUT(req, ".", 1);
+ if (!error)
+ error = SYSCTL_OUT(req, buf, strlen(buf));
+ if (error)
+ goto out;
+ namelen--;
+ name++;
+ continue;
+ }
+ lsp2 = 0;
+ SLIST_FOREACH(oid, lsp, oid_link) {
+ if (oid->oid_number != *name)
+ continue;
+
+ if (req->oldidx)
+ error = SYSCTL_OUT(req, ".", 1);
+ if (!error)
+ error = SYSCTL_OUT(req, oid->oid_name,
+ strlen(oid->oid_name));
+ if (error)
+ goto out;
+
+ namelen--;
+ name++;
+
+ if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ break;
+
+ if (oid->oid_handler)
+ break;
+
+ lsp2 = SYSCTL_CHILDREN(oid);
+ break;
+ }
+ lsp = lsp2;
+ }
+ error = SYSCTL_OUT(req, "", 1);
+ out:
+ SYSCTL_XUNLOCK();
+ return (error);
+}
+
+/*
+ * XXXRW/JA: Shouldn't return name data for nodes that we don't permit in
+ * capability mode.
+ */
+static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD | CTLFLAG_CAPRD,
+ sysctl_sysctl_name, "");
+
+static int
+sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen,
+ int *next, int *len, int level, struct sysctl_oid **oidpp)
+{
+ struct sysctl_oid *oidp;
+
+ SYSCTL_ASSERT_XLOCKED();
+ *len = level;
+ SLIST_FOREACH(oidp, lsp, oid_link) {
+ *next = oidp->oid_number;
+ *oidpp = oidp;
+
+ if (oidp->oid_kind & CTLFLAG_SKIP)
+ continue;
+
+ if (!namelen) {
+ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ return (0);
+ if (oidp->oid_handler)
+ /* We really should call the handler here...*/
+ return (0);
+ lsp = SYSCTL_CHILDREN(oidp);
+ if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1,
+ len, level+1, oidpp))
+ return (0);
+ goto emptynode;
+ }
+
+ if (oidp->oid_number < *name)
+ continue;
+
+ if (oidp->oid_number > *name) {
+ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ return (0);
+ if (oidp->oid_handler)
+ return (0);
+ lsp = SYSCTL_CHILDREN(oidp);
+ if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1,
+ next+1, len, level+1, oidpp))
+ return (0);
+ goto next;
+ }
+ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ continue;
+
+ if (oidp->oid_handler)
+ continue;
+
+ lsp = SYSCTL_CHILDREN(oidp);
+ if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1,
+ len, level+1, oidpp))
+ return (0);
+ next:
+ namelen = 1;
+ emptynode:
+ *len = level;
+ }
+ return (1);
+}
+
+static int
+sysctl_sysctl_next(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
+ int i, j, error;
+ struct sysctl_oid *oid;
+ struct sysctl_oid_list *lsp = &sysctl__children;
+ int newoid[CTL_MAXNAME];
+
+ SYSCTL_XLOCK();
+ i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid);
+ SYSCTL_XUNLOCK();
+ if (i)
+ return (ENOENT);
+ error = SYSCTL_OUT(req, newoid, j * sizeof (int));
+ return (error);
+}
+
+/*
+ * XXXRW/JA: Shouldn't return next data for nodes that we don't permit in
+ * capability mode.
+ */
+static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_CAPRD,
+ sysctl_sysctl_next, "");
+
+static int
+name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp)
+{
+ struct sysctl_oid *oidp;
+ struct sysctl_oid_list *lsp = &sysctl__children;
+ char *p;
+
+ SYSCTL_ASSERT_XLOCKED();
+
+ for (*len = 0; *len < CTL_MAXNAME;) {
+ p = strsep(&name, ".");
+
+ oidp = SLIST_FIRST(lsp);
+ for (;; oidp = SLIST_NEXT(oidp, oid_link)) {
+ if (oidp == NULL)
+ return (ENOENT);
+ if (strcmp(p, oidp->oid_name) == 0)
+ break;
+ }
+ *oid++ = oidp->oid_number;
+ (*len)++;
+
+ if (name == NULL || *name == '\0') {
+ if (oidpp)
+ *oidpp = oidp;
+ return (0);
+ }
+
+ if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ break;
+
+ if (oidp->oid_handler)
+ break;
+
+ lsp = SYSCTL_CHILDREN(oidp);
+ }
+ return (ENOENT);
+}
+
+static int
+sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS)
+{
+ char *p;
+ int error, oid[CTL_MAXNAME], len = 0;
+ struct sysctl_oid *op = 0;
+
+ if (!req->newlen)
+ return (ENOENT);
+ if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */
+ return (ENAMETOOLONG);
+
+ p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK);
+
+ error = SYSCTL_IN(req, p, req->newlen);
+ if (error) {
+ free(p, M_SYSCTL);
+ return (error);
+ }
+
+ p[req->newlen] = '\0';
+
+ SYSCTL_XLOCK();
+ error = name2oid(p, oid, &len, &op);
+ SYSCTL_XUNLOCK();
+
+ free(p, M_SYSCTL);
+
+ if (error)
+ return (error);
+
+ error = SYSCTL_OUT(req, oid, len * sizeof *oid);
+ return (error);
+}
+
+/*
+ * XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in
+ * capability mode.
+ */
+SYSCTL_PROC(_sysctl, 3, name2oid,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE
+ | CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", "");
+
+static int
+sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS)
+{
+ struct sysctl_oid *oid;
+ int error;
+
+ SYSCTL_XLOCK();
+ error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
+ if (error)
+ goto out;
+
+ if (oid->oid_fmt == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind));
+ if (error)
+ goto out;
+ error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1);
+ out:
+ SYSCTL_XUNLOCK();
+ return (error);
+}
+
+
+static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD,
+ sysctl_sysctl_oidfmt, "");
+
+static int
+sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS)
+{
+ struct sysctl_oid *oid;
+ int error;
+
+ SYSCTL_XLOCK();
+ error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
+ if (error)
+ goto out;
+
+ if (oid->oid_descr == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1);
+ out:
+ SYSCTL_XUNLOCK();
+ return (error);
+}
+
+static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD|CTLFLAG_CAPRD,
+ sysctl_sysctl_oiddescr, "");
+
+/*
+ * Default "handler" functions.
+ */
+
+/*
+ * Handle an int, signed or unsigned.
+ * Two cases:
+ * a variable: point arg1 at it.
+ * a constant: pass it in arg2.
+ */
+
+int
+sysctl_handle_int(SYSCTL_HANDLER_ARGS)
+{
+ int tmpout, error = 0;
+
+ /*
+ * Attempt to get a coherent snapshot by making a copy of the data.
+ */
+ if (arg1)
+ tmpout = *(int *)arg1;
+ else
+ tmpout = arg2;
+ error = SYSCTL_OUT(req, &tmpout, sizeof(int));
+
+ if (error || !req->newptr)
+ return (error);
+
+ if (!arg1)
+ error = EPERM;
+ else
+ error = SYSCTL_IN(req, arg1, sizeof(int));
+ return (error);
+}
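+
+/*
+ * Usage sketch (illustrative only; the variable, bounds and oid name are
+ * assumptions): a subsystem typically wraps sysctl_handle_int() in a private
+ * handler when the new value must be validated before it is accepted.
+ *
+ *   static int example_limit = 10;
+ *
+ *   static int
+ *   sysctl_example_limit(SYSCTL_HANDLER_ARGS)
+ *   {
+ *           int error, val;
+ *
+ *           val = example_limit;
+ *           error = sysctl_handle_int(oidp, &val, 0, req);
+ *           if (error != 0 || req->newptr == NULL)
+ *                   return (error);
+ *           if (val < 0 || val > 100)
+ *                   return (EINVAL);
+ *           example_limit = val;
+ *           return (0);
+ *   }
+ *
+ *   SYSCTL_PROC(_kern, OID_AUTO, example_limit,
+ *       CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
+ *       sysctl_example_limit, "I", "example limit, 0..100");
+ */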
+
+/*
+ * Based on sysctl_handle_int(), convert milliseconds into ticks.
+ * Note: this is used by TCP.
+ */
+
+int
+sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
+{
+ int error, s, tt;
+
+ tt = *(int *)arg1;
+ s = (int)((int64_t)tt * 1000 / hz);
+
+ error = sysctl_handle_int(oidp, &s, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ tt = (int)((int64_t)s * hz / 1000);
+ if (tt < 1)
+ return (EINVAL);
+
+ *(int *)arg1 = tt;
+ return (0);
+}
+
+
+/*
+ * Handle a long, signed or unsigned.
+ * Two cases:
+ * a variable: point arg1 at it.
+ * a constant: pass it in arg2.
+ */
+
+int
+sysctl_handle_long(SYSCTL_HANDLER_ARGS)
+{
+ int error = 0;
+ long tmplong;
+#ifdef SCTL_MASK32
+ int tmpint;
+#endif
+
+ /*
+ * Attempt to get a coherent snapshot by making a copy of the data.
+ */
+ if (arg1)
+ tmplong = *(long *)arg1;
+ else
+ tmplong = arg2;
+#ifdef SCTL_MASK32
+ if (req->flags & SCTL_MASK32) {
+ tmpint = tmplong;
+ error = SYSCTL_OUT(req, &tmpint, sizeof(int));
+ } else
+#endif
+ error = SYSCTL_OUT(req, &tmplong, sizeof(long));
+
+ if (error || !req->newptr)
+ return (error);
+
+ if (!arg1)
+ error = EPERM;
+#ifdef SCTL_MASK32
+ else if (req->flags & SCTL_MASK32) {
+ error = SYSCTL_IN(req, &tmpint, sizeof(int));
+ *(long *)arg1 = (long)tmpint;
+ }
+#endif
+ else
+ error = SYSCTL_IN(req, arg1, sizeof(long));
+ return (error);
+}
+
+/*
+ * Handle a 64 bit int, signed or unsigned.
+ * Two cases:
+ * a variable: point arg1 at it.
+ * a constant: pass it in arg2.
+ */
+int
+sysctl_handle_64(SYSCTL_HANDLER_ARGS)
+{
+ int error = 0;
+ uint64_t tmpout;
+
+ /*
+ * Attempt to get a coherent snapshot by making a copy of the data.
+ */
+ if (arg1)
+ tmpout = *(uint64_t *)arg1;
+ else
+ tmpout = arg2;
+ error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t));
+
+ if (error || !req->newptr)
+ return (error);
+
+ if (!arg1)
+ error = EPERM;
+ else
+ error = SYSCTL_IN(req, arg1, sizeof(uint64_t));
+ return (error);
+}
+
+/*
+ * Handle our generic '\0' terminated 'C' string.
+ * Two cases:
+ * a variable string: point arg1 at it, arg2 is max length.
+ * a constant string: point arg1 at it, arg2 is zero.
+ */
+
+int
+sysctl_handle_string(SYSCTL_HANDLER_ARGS)
+{
+ int error=0;
+ char *tmparg;
+ size_t outlen;
+
+ /*
+ * Attempt to get a coherent snapshot by copying to a
+ * temporary kernel buffer.
+ */
+retry:
+ outlen = strlen((char *)arg1)+1;
+ tmparg = malloc(outlen, M_SYSCTLTMP, M_WAITOK);
+
+ if (strlcpy(tmparg, (char *)arg1, outlen) >= outlen) {
+ free(tmparg, M_SYSCTLTMP);
+ goto retry;
+ }
+
+ error = SYSCTL_OUT(req, tmparg, outlen);
+ free(tmparg, M_SYSCTLTMP);
+
+ if (error || !req->newptr)
+ return (error);
+
+ if ((req->newlen - req->newidx) >= arg2) {
+ error = EINVAL;
+ } else {
+ arg2 = (req->newlen - req->newidx);
+ error = SYSCTL_IN(req, arg1, arg2);
+ ((char *)arg1)[arg2] = '\0';
+ }
+
+ return (error);
+}
+
+/*
+ * Handle any kind of opaque data.
+ * arg1 points to it, arg2 is the size.
+ */
+
+int
+sysctl_handle_opaque(SYSCTL_HANDLER_ARGS)
+{
+ int error, tries;
+ u_int generation;
+ struct sysctl_req req2;
+
+ /*
+ * Attempt to get a coherent snapshot, by using the thread
+ * pre-emption counter updated from within mi_switch() to
+ * determine if we were pre-empted during a bcopy() or
+ * copyout(). Make 3 attempts at doing this before giving up.
+ * If we encounter an error, stop immediately.
+ */
+ tries = 0;
+ req2 = *req;
+retry:
+ generation = curthread->td_generation;
+ error = SYSCTL_OUT(req, arg1, arg2);
+ if (error)
+ return (error);
+ tries++;
+ if (generation != curthread->td_generation && tries < 3) {
+ *req = req2;
+ goto retry;
+ }
+
+ error = SYSCTL_IN(req, arg1, arg2);
+
+ return (error);
+}
+
+/*
+ * Transfer functions to/from kernel space.
+ * XXX: rather untested at this point
+ */
+static int
+sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l)
+{
+ size_t i = 0;
+
+ if (req->oldptr) {
+ i = l;
+ if (req->oldlen <= req->oldidx)
+ i = 0;
+ else
+ if (i > req->oldlen - req->oldidx)
+ i = req->oldlen - req->oldidx;
+ if (i > 0)
+ bcopy(p, (char *)req->oldptr + req->oldidx, i);
+ }
+ req->oldidx += l;
+ if (req->oldptr && i != l)
+ return (ENOMEM);
+ return (0);
+}
+
+static int
+sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l)
+{
+ if (!req->newptr)
+ return (0);
+ if (req->newlen - req->newidx < l)
+ return (EINVAL);
+ bcopy((char *)req->newptr + req->newidx, p, l);
+ req->newidx += l;
+ return (0);
+}
+
+int
+kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old,
+ size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags)
+{
+ int error = 0;
+ struct sysctl_req req;
+
+ bzero(&req, sizeof req);
+
+ req.td = td;
+ req.flags = flags;
+
+ if (oldlenp) {
+ req.oldlen = *oldlenp;
+ }
+ req.validlen = req.oldlen;
+
+ if (old) {
+ req.oldptr= old;
+ }
+
+ if (new != NULL) {
+ req.newlen = newlen;
+ req.newptr = new;
+ }
+
+ req.oldfunc = sysctl_old_kernel;
+ req.newfunc = sysctl_new_kernel;
+ req.lock = REQ_UNWIRED;
+
+ SYSCTL_XLOCK();
+ error = sysctl_root(0, name, namelen, &req);
+ SYSCTL_XUNLOCK();
+
+ if (req.lock == REQ_WIRED && req.validlen > 0)
+ vsunlock(req.oldptr, req.validlen);
+
+ if (error && error != ENOMEM)
+ return (error);
+
+ if (retval) {
+ if (req.oldptr && req.oldidx > req.validlen)
+ *retval = req.validlen;
+ else
+ *retval = req.oldidx;
+ }
+ return (error);
+}
+
+int
+kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp,
+ void *new, size_t newlen, size_t *retval, int flags)
+{
+ int oid[CTL_MAXNAME];
+ size_t oidlen, plen;
+ int error;
+
+ oid[0] = 0; /* sysctl internal magic */
+ oid[1] = 3; /* name2oid */
+ oidlen = sizeof(oid);
+
+ error = kernel_sysctl(td, oid, 2, oid, &oidlen,
+ (void *)name, strlen(name), &plen, flags);
+ if (error)
+ return (error);
+
+ error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp,
+ new, newlen, retval, flags);
+ return (error);
+}
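+
+/*
+ * Usage sketch (illustrative only; the oid name and calling context are
+ * assumptions): reading an integer oid by name from kernel code.
+ *
+ *   char name[] = "kern.ipc.maxsockets";
+ *   int val, error;
+ *   size_t len;
+ *
+ *   len = sizeof(val);
+ *   error = kernel_sysctlbyname(curthread, name, &val, &len,
+ *       NULL, 0, NULL, 0);
+ *   if (error == 0 && len == sizeof(val))
+ *           printf("maxsockets: %d\n", val);
+ */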
+
+/*
+ * Transfer function to/from user space.
+ */
+static int
+sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
+{
+ size_t i, len, origidx;
+ int error;
+
+ origidx = req->oldidx;
+ req->oldidx += l;
+ if (req->oldptr == NULL)
+ return (0);
+ /*
+ * If we have not wired the user supplied buffer and we are currently
+ * holding locks, drop a witness warning, as it's possible that
+ * write operations to the user page can sleep.
+ */
+ if (req->lock != REQ_WIRED)
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "sysctl_old_user()");
+ i = l;
+ len = req->validlen;
+ if (len <= origidx)
+ i = 0;
+ else {
+ if (i > len - origidx)
+ i = len - origidx;
+ if (req->lock == REQ_WIRED) {
+ error = copyout_nofault(p, (char *)req->oldptr +
+ origidx, i);
+ } else
+ error = copyout(p, (char *)req->oldptr + origidx, i);
+ if (error != 0)
+ return (error);
+ }
+ if (i < l)
+ return (ENOMEM);
+ return (0);
+}
+
+static int
+sysctl_new_user(struct sysctl_req *req, void *p, size_t l)
+{
+ int error;
+
+ if (!req->newptr)
+ return (0);
+ if (req->newlen - req->newidx < l)
+ return (EINVAL);
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "sysctl_new_user()");
+ error = copyin((char *)req->newptr + req->newidx, p, l);
+ req->newidx += l;
+ return (error);
+}
+
+/*
+ * Wire the user space destination buffer. If set to a value greater than
+ * zero, the len parameter limits the maximum amount of wired memory.
+ */
+int
+sysctl_wire_old_buffer(struct sysctl_req *req, size_t len)
+{
+ int ret;
+ size_t wiredlen;
+
+ wiredlen = (len > 0 && len < req->oldlen) ? len : req->oldlen;
+ ret = 0;
+ if (req->lock != REQ_WIRED && req->oldptr &&
+ req->oldfunc == sysctl_old_user) {
+ if (wiredlen != 0) {
+ ret = vslock(req->oldptr, wiredlen);
+ if (ret != 0) {
+ if (ret != ENOMEM)
+ return (ret);
+ wiredlen = 0;
+ }
+ }
+ req->lock = REQ_WIRED;
+ req->validlen = wiredlen;
+ }
+ return (0);
+}
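+
+/*
+ * Usage sketch (illustrative only; the list, lock and element type are
+ * assumptions): a handler that emits data while holding a non-sleepable
+ * lock wires the destination first, so the copyout done by SYSCTL_OUT()
+ * cannot fault and sleep with the lock held.
+ *
+ *   static int
+ *   sysctl_example_list(SYSCTL_HANDLER_ARGS)
+ *   {
+ *           struct example_entry *p;
+ *           int error;
+ *
+ *           error = sysctl_wire_old_buffer(req, 0);
+ *           if (error != 0)
+ *                   return (error);
+ *           mtx_lock(&example_mtx);
+ *           LIST_FOREACH(p, &example_list, link) {
+ *                   error = SYSCTL_OUT(req, &p->stats, sizeof(p->stats));
+ *                   if (error != 0)
+ *                           break;
+ *           }
+ *           mtx_unlock(&example_mtx);
+ *           return (error);
+ *   }
+ */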
+
+int
+sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
+ int *nindx, struct sysctl_req *req)
+{
+ struct sysctl_oid_list *lsp;
+ struct sysctl_oid *oid;
+ int indx;
+
+ SYSCTL_ASSERT_XLOCKED();
+ lsp = &sysctl__children;
+ indx = 0;
+ while (indx < CTL_MAXNAME) {
+ SLIST_FOREACH(oid, lsp, oid_link) {
+ if (oid->oid_number == name[indx])
+ break;
+ }
+ if (oid == NULL)
+ return (ENOENT);
+
+ indx++;
+ if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ if (oid->oid_handler != NULL || indx == namelen) {
+ *noid = oid;
+ if (nindx != NULL)
+ *nindx = indx;
+ KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0,
+ ("%s found DYING node %p", __func__, oid));
+ return (0);
+ }
+ lsp = SYSCTL_CHILDREN(oid);
+ } else if (indx == namelen) {
+ *noid = oid;
+ if (nindx != NULL)
+ *nindx = indx;
+ KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0,
+ ("%s found DYING node %p", __func__, oid));
+ return (0);
+ } else {
+ return (ENOTDIR);
+ }
+ }
+ return (ENOENT);
+}
+
+/*
+ * Traverse our tree, and find the right node, execute whatever it points
+ * to, and return the resulting error code.
+ */
+
+static int
+sysctl_root(SYSCTL_HANDLER_ARGS)
+{
+ struct sysctl_oid *oid;
+ int error, indx, lvl;
+
+ SYSCTL_ASSERT_XLOCKED();
+
+ error = sysctl_find_oid(arg1, arg2, &oid, &indx, req);
+ if (error)
+ return (error);
+
+ if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ /*
+ * You can't call a sysctl when it's a node, but has
+ * no handler. Inform the user that it's a node.
+ * The indx may or may not be the same as namelen.
+ */
+ if (oid->oid_handler == NULL)
+ return (EISDIR);
+ }
+
+ /* Is this sysctl writable? */
+ if (req->newptr && !(oid->oid_kind & CTLFLAG_WR))
+ return (EPERM);
+
+ KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL"));
+
+#ifdef CAPABILITY_MODE
+ /*
+ * If the process is in capability mode, then don't permit reading or
+ * writing unless specifically granted for the node.
+ */
+ if (IN_CAPABILITY_MODE(req->td)) {
+ if (req->oldptr && !(oid->oid_kind & CTLFLAG_CAPRD))
+ return (EPERM);
+ if (req->newptr && !(oid->oid_kind & CTLFLAG_CAPWR))
+ return (EPERM);
+ }
+#endif
+
+ /* Is this sysctl sensitive to securelevels? */
+ if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) {
+ lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE;
+ error = securelevel_gt(req->td->td_ucred, lvl);
+ if (error)
+ return (error);
+ }
+
+ /* Is this sysctl writable by only privileged users? */
+ if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) {
+ int priv;
+
+ if (oid->oid_kind & CTLFLAG_PRISON)
+ priv = PRIV_SYSCTL_WRITEJAIL;
+#ifdef VIMAGE
+ else if ((oid->oid_kind & CTLFLAG_VNET) &&
+ prison_owns_vnet(req->td->td_ucred))
+ priv = PRIV_SYSCTL_WRITEJAIL;
+#endif
+ else
+ priv = PRIV_SYSCTL_WRITE;
+ error = priv_check(req->td, priv);
+ if (error)
+ return (error);
+ }
+
+ if (!oid->oid_handler)
+ return (EINVAL);
+
+ if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ arg1 = (int *)arg1 + indx;
+ arg2 -= indx;
+ } else {
+ arg1 = oid->oid_arg1;
+ arg2 = oid->oid_arg2;
+ }
+#ifdef MAC
+ error = mac_system_check_sysctl(req->td->td_ucred, oid, arg1, arg2,
+ req);
+ if (error != 0)
+ return (error);
+#endif
+ oid->oid_running++;
+ SYSCTL_XUNLOCK();
+
+ if (!(oid->oid_kind & CTLFLAG_MPSAFE))
+ mtx_lock(&Giant);
+ error = oid->oid_handler(oid, arg1, arg2, req);
+ if (!(oid->oid_kind & CTLFLAG_MPSAFE))
+ mtx_unlock(&Giant);
+
+ KFAIL_POINT_ERROR(_debug_fail_point, sysctl_running, error);
+
+ SYSCTL_XLOCK();
+ oid->oid_running--;
+ if (oid->oid_running == 0 && (oid->oid_kind & CTLFLAG_DYING) != 0)
+ wakeup(&oid->oid_running);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct sysctl_args {
+ int *name;
+ u_int namelen;
+ void *old;
+ size_t *oldlenp;
+ void *new;
+ size_t newlen;
+};
+#endif
+int
+sys___sysctl(struct thread *td, struct sysctl_args *uap)
+{
+ int error, i, name[CTL_MAXNAME];
+ size_t j;
+
+ if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
+ return (EINVAL);
+
+ error = copyin(uap->name, &name, uap->namelen * sizeof(int));
+ if (error)
+ return (error);
+
+ error = userland_sysctl(td, name, uap->namelen,
+ uap->old, uap->oldlenp, 0,
+ uap->new, uap->newlen, &j, 0);
+ if (error && error != ENOMEM)
+ return (error);
+ if (uap->oldlenp) {
+ i = copyout(&j, uap->oldlenp, sizeof(j));
+ if (i)
+ return (i);
+ }
+ return (error);
+}
+
+/*
+ * This is used from various compatibility syscalls too. That's why name
+ * must be in kernel space.
+ */
+int
+userland_sysctl(struct thread *td, int *name, u_int namelen, void *old,
+ size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval,
+ int flags)
+{
+ int error = 0, memlocked;
+ struct sysctl_req req;
+
+ bzero(&req, sizeof req);
+
+ req.td = td;
+ req.flags = flags;
+
+ if (oldlenp) {
+ if (inkernel) {
+ req.oldlen = *oldlenp;
+ } else {
+ error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp));
+ if (error)
+ return (error);
+ }
+ }
+ req.validlen = req.oldlen;
+
+ if (old) {
+ if (!useracc(old, req.oldlen, VM_PROT_WRITE))
+ return (EFAULT);
+ req.oldptr= old;
+ }
+
+ if (new != NULL) {
+ if (!useracc(new, newlen, VM_PROT_READ))
+ return (EFAULT);
+ req.newlen = newlen;
+ req.newptr = new;
+ }
+
+ req.oldfunc = sysctl_old_user;
+ req.newfunc = sysctl_new_user;
+ req.lock = REQ_UNWIRED;
+
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_SYSCTL))
+ ktrsysctl(name, namelen);
+#endif
+
+ if (req.oldlen > PAGE_SIZE) {
+ memlocked = 1;
+ sx_xlock(&sysctlmemlock);
+ } else
+ memlocked = 0;
+ CURVNET_SET(TD_TO_VNET(td));
+
+ for (;;) {
+ req.oldidx = 0;
+ req.newidx = 0;
+ SYSCTL_XLOCK();
+ error = sysctl_root(0, name, namelen, &req);
+ SYSCTL_XUNLOCK();
+ if (error != EAGAIN)
+ break;
+ kern_yield(PRI_USER);
+ }
+
+ CURVNET_RESTORE();
+
+ if (req.lock == REQ_WIRED && req.validlen > 0)
+ vsunlock(req.oldptr, req.validlen);
+ if (memlocked)
+ sx_xunlock(&sysctlmemlock);
+
+ if (error && error != ENOMEM)
+ return (error);
+
+ if (retval) {
+ if (req.oldptr && req.oldidx > req.validlen)
+ *retval = req.validlen;
+ else
+ *retval = req.oldidx;
+ }
+ return (error);
+}
+
+/*
+ * Drain into a sysctl struct. The user buffer should be wired if a page
+ * fault would cause issue.
+ */
+static int
+sbuf_sysctl_drain(void *arg, const char *data, int len)
+{
+ struct sysctl_req *req = arg;
+ int error;
+
+ error = SYSCTL_OUT(req, data, len);
+ KASSERT(error >= 0, ("Got unexpected negative value %d", error));
+ return (error == 0 ? len : -error);
+}
+
+struct sbuf *
+sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length,
+ struct sysctl_req *req)
+{
+
+ s = sbuf_new(s, buf, length, SBUF_FIXEDLEN);
+ sbuf_set_drain(s, sbuf_sysctl_drain, req);
+ return (s);
+}
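+
+/*
+ * Usage sketch (illustrative only; the buffer size and message are
+ * assumptions): the usual way a handler produces variable-length text
+ * through the drain above.
+ *
+ *   static int
+ *   sysctl_example_text(SYSCTL_HANDLER_ARGS)
+ *   {
+ *           struct sbuf sb;
+ *           int error;
+ *
+ *           error = sysctl_wire_old_buffer(req, 0);
+ *           if (error != 0)
+ *                   return (error);
+ *           sbuf_new_for_sysctl(&sb, NULL, 128, req);
+ *           sbuf_printf(&sb, "example: %d\n", 42);
+ *           error = sbuf_finish(&sb);
+ *           sbuf_delete(&sb);
+ *           return (error);
+ *   }
+ */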
diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c
new file mode 100644
index 0000000..9fe7ebe
--- /dev/null
+++ b/sys/kern/kern_tc.c
@@ -0,0 +1,2030 @@
+/*-
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ *
+ * Copyright (c) 2011 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by Julien Ridoux at the University
+ * of Melbourne under sponsorship from the FreeBSD Foundation.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_ntp.h"
+#include "opt_ffclock.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#ifdef FFCLOCK
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/timeffc.h>
+#include <sys/timepps.h>
+#include <sys/timetc.h>
+#include <sys/timex.h>
+#include <sys/vdso.h>
+
+/*
+ * A large step happens on boot. This constant detects such steps.
+ * It is relatively small so that ntp_update_second gets called enough
+ * in the typical 'missed a couple of seconds' case, but doesn't loop
+ * forever when the time step is large.
+ */
+#define LARGE_STEP 200
+
+/*
+ * Implement a dummy timecounter which we can use until we get a real one
+ * in the air. This allows the console and other early stuff to use
+ * time services.
+ */
+
+static u_int
+dummy_get_timecount(struct timecounter *tc)
+{
+ static u_int now;
+
+ return (++now);
+}
+
+static struct timecounter dummy_timecounter = {
+ dummy_get_timecount, 0, ~0u, 1000000, "dummy", -1000000
+};
+
+struct timehands {
+ /* These fields must be initialized by the driver. */
+ struct timecounter *th_counter;
+ int64_t th_adjustment;
+ uint64_t th_scale;
+ u_int th_offset_count;
+ struct bintime th_offset;
+ struct timeval th_microtime;
+ struct timespec th_nanotime;
+ /* Fields not to be copied in tc_windup start with th_generation. */
+ volatile u_int th_generation;
+ struct timehands *th_next;
+};
+
+static struct timehands th0;
+static struct timehands th9 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th0};
+static struct timehands th8 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th9};
+static struct timehands th7 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th8};
+static struct timehands th6 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th7};
+static struct timehands th5 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th6};
+static struct timehands th4 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th5};
+static struct timehands th3 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th4};
+static struct timehands th2 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th3};
+static struct timehands th1 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th2};
+static struct timehands th0 = {
+ &dummy_timecounter,
+ 0,
+ (uint64_t)-1 / 1000000,
+ 0,
+ {1, 0},
+ {0, 0},
+ {0, 0},
+ 1,
+ &th1
+};
+
+static struct timehands *volatile timehands = &th0;
+struct timecounter *timecounter = &dummy_timecounter;
+static struct timecounter *timecounters = &dummy_timecounter;
+
+int tc_min_ticktock_freq = 1;
+
+volatile time_t time_second = 1;
+volatile time_t time_uptime = 1;
+
+struct bintime boottimebin;
+struct timeval boottime;
+static int sysctl_kern_boottime(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_kern, KERN_BOOTTIME, boottime, CTLTYPE_STRUCT|CTLFLAG_RD,
+ NULL, 0, sysctl_kern_boottime, "S,timeval", "System boottime");
+
+SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");
+static SYSCTL_NODE(_kern_timecounter, OID_AUTO, tc, CTLFLAG_RW, 0, "");
+
+static int timestepwarnings;
+SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW,
+ &timestepwarnings, 0, "Log time steps");
+
+struct bintime bt_timethreshold;
+struct bintime bt_tickthreshold;
+sbintime_t sbt_timethreshold;
+sbintime_t sbt_tickthreshold;
+struct bintime tc_tick_bt;
+sbintime_t tc_tick_sbt;
+int tc_precexp;
+int tc_timepercentage = TC_DEFAULTPERC;
+TUNABLE_INT("kern.timecounter.alloweddeviation", &tc_timepercentage);
+static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
+ sysctl_kern_timecounter_adjprecision, "I",
+ "Allowed time interval deviation in percents");
+
+static void tc_windup(void);
+static void cpu_tick_calibrate(int);
+
+void dtrace_getnanotime(struct timespec *tsp);
+
+static int
+sysctl_kern_boottime(SYSCTL_HANDLER_ARGS)
+{
+#ifndef __mips__
+#ifdef SCTL_MASK32
+ int tv[2];
+
+ if (req->flags & SCTL_MASK32) {
+ tv[0] = boottime.tv_sec;
+ tv[1] = boottime.tv_usec;
+ return SYSCTL_OUT(req, tv, sizeof(tv));
+ } else
+#endif
+#endif
+ return SYSCTL_OUT(req, &boottime, sizeof(boottime));
+}
+
+static int
+sysctl_kern_timecounter_get(SYSCTL_HANDLER_ARGS)
+{
+ u_int ncount;
+ struct timecounter *tc = arg1;
+
+ ncount = tc->tc_get_timecount(tc);
+ return sysctl_handle_int(oidp, &ncount, 0, req);
+}
+
+static int
+sysctl_kern_timecounter_freq(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t freq;
+ struct timecounter *tc = arg1;
+
+ freq = tc->tc_frequency;
+ return sysctl_handle_64(oidp, &freq, 0, req);
+}
+
+/*
+ * Return the difference between the timehands' counter value now and what
+ * was when we copied it to the timehands' offset_count.
+ */
+static __inline u_int
+tc_delta(struct timehands *th)
+{
+ struct timecounter *tc;
+
+ tc = th->th_counter;
+ return ((tc->tc_get_timecount(tc) - th->th_offset_count) &
+ tc->tc_counter_mask);
+}
+
+/*
+ * Functions for reading the time. We have to loop until we are sure that
+ * the timehands that we operated on was not updated under our feet. See
+ * the comment in <sys/time.h> for a description of these 12 functions.
+ */
+
+#ifdef FFCLOCK
+void
+fbclock_binuptime(struct bintime *bt)
+{
+ struct timehands *th;
+ unsigned int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *bt = th->th_offset;
+ bintime_addx(bt, th->th_scale * tc_delta(th));
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+fbclock_nanouptime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ fbclock_binuptime(&bt);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+fbclock_microuptime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ fbclock_binuptime(&bt);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+fbclock_bintime(struct bintime *bt)
+{
+
+ fbclock_binuptime(bt);
+ bintime_add(bt, &boottimebin);
+}
+
+void
+fbclock_nanotime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ fbclock_bintime(&bt);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+fbclock_microtime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ fbclock_bintime(&bt);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+fbclock_getbinuptime(struct bintime *bt)
+{
+ struct timehands *th;
+ unsigned int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *bt = th->th_offset;
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+fbclock_getnanouptime(struct timespec *tsp)
+{
+ struct timehands *th;
+ unsigned int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ bintime2timespec(&th->th_offset, tsp);
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+fbclock_getmicrouptime(struct timeval *tvp)
+{
+ struct timehands *th;
+ unsigned int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ bintime2timeval(&th->th_offset, tvp);
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+fbclock_getbintime(struct bintime *bt)
+{
+ struct timehands *th;
+ unsigned int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *bt = th->th_offset;
+ } while (gen == 0 || gen != th->th_generation);
+ bintime_add(bt, &boottimebin);
+}
+
+void
+fbclock_getnanotime(struct timespec *tsp)
+{
+ struct timehands *th;
+ unsigned int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *tsp = th->th_nanotime;
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+fbclock_getmicrotime(struct timeval *tvp)
+{
+ struct timehands *th;
+ unsigned int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *tvp = th->th_microtime;
+ } while (gen == 0 || gen != th->th_generation);
+}
+#else /* !FFCLOCK */
+void
+binuptime(struct bintime *bt)
+{
+ struct timehands *th;
+ u_int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *bt = th->th_offset;
+ bintime_addx(bt, th->th_scale * tc_delta(th));
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+nanouptime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ binuptime(&bt);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+microuptime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ binuptime(&bt);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+bintime(struct bintime *bt)
+{
+
+ binuptime(bt);
+ bintime_add(bt, &boottimebin);
+}
+
+void
+nanotime(struct timespec *tsp)
+{
+ struct bintime bt;
+
+ bintime(&bt);
+ bintime2timespec(&bt, tsp);
+}
+
+void
+microtime(struct timeval *tvp)
+{
+ struct bintime bt;
+
+ bintime(&bt);
+ bintime2timeval(&bt, tvp);
+}
+
+void
+getbinuptime(struct bintime *bt)
+{
+ struct timehands *th;
+ u_int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *bt = th->th_offset;
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+getnanouptime(struct timespec *tsp)
+{
+ struct timehands *th;
+ u_int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ bintime2timespec(&th->th_offset, tsp);
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+getmicrouptime(struct timeval *tvp)
+{
+ struct timehands *th;
+ u_int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ bintime2timeval(&th->th_offset, tvp);
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+getbintime(struct bintime *bt)
+{
+ struct timehands *th;
+ u_int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *bt = th->th_offset;
+ } while (gen == 0 || gen != th->th_generation);
+ bintime_add(bt, &boottimebin);
+}
+
+void
+getnanotime(struct timespec *tsp)
+{
+ struct timehands *th;
+ u_int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *tsp = th->th_nanotime;
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+void
+getmicrotime(struct timeval *tvp)
+{
+ struct timehands *th;
+ u_int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *tvp = th->th_microtime;
+ } while (gen == 0 || gen != th->th_generation);
+}
+#endif /* FFCLOCK */
+
+#ifdef FFCLOCK
+/*
+ * Support for feed-forward synchronization algorithms. This is heavily inspired
+ * by the timehands mechanism but kept independent from it. *_windup() functions
+ * are loosely coupled (ffclock_windup() is driven from tc_windup()) so that
+ * the timecounter hardware is not accessed more than necessary.
+ */
+
+/* Feed-forward clock estimates kept updated by the synchronization daemon. */
+struct ffclock_estimate ffclock_estimate;
+struct bintime ffclock_boottime; /* Feed-forward boot time estimate. */
+uint32_t ffclock_status; /* Feed-forward clock status. */
+int8_t ffclock_updated; /* New estimates are available. */
+struct mtx ffclock_mtx; /* Mutex on ffclock_estimate. */
+
+struct fftimehands {
+ struct ffclock_estimate cest;
+ struct bintime tick_time;
+ struct bintime tick_time_lerp;
+ ffcounter tick_ffcount;
+ uint64_t period_lerp;
+ volatile uint8_t gen;
+ struct fftimehands *next;
+};
+
+#define NUM_ELEMENTS(x) (sizeof(x) / sizeof(*x))
+
+static struct fftimehands ffth[10];
+static struct fftimehands *volatile fftimehands = ffth;
+
+static void
+ffclock_init(void)
+{
+ struct fftimehands *cur;
+ struct fftimehands *last;
+
+ memset(ffth, 0, sizeof(ffth));
+
+ last = ffth + NUM_ELEMENTS(ffth) - 1;
+ for (cur = ffth; cur < last; cur++)
+ cur->next = cur + 1;
+ last->next = ffth;
+
+ ffclock_updated = 0;
+ ffclock_status = FFCLOCK_STA_UNSYNC;
+ mtx_init(&ffclock_mtx, "ffclock lock", NULL, MTX_DEF);
+}
+
+/*
+ * Reset the feed-forward clock estimates. Called from inittodr() to get things
+ * kick-started and uses the timecounter nominal frequency as a first period
+ * estimate. Note: this function may be called several times just after boot.
+ * Note: this is the only function that sets the value of boot time for the
+ * monotonic (i.e. uptime) version of the feed-forward clock.
+ */
+void
+ffclock_reset_clock(struct timespec *ts)
+{
+ struct timecounter *tc;
+ struct ffclock_estimate cest;
+
+ tc = timehands->th_counter;
+ memset(&cest, 0, sizeof(struct ffclock_estimate));
+
+ timespec2bintime(ts, &ffclock_boottime);
+ timespec2bintime(ts, &(cest.update_time));
+ ffclock_read_counter(&cest.update_ffcount);
+ cest.leapsec_next = 0;
+ cest.period = ((1ULL << 63) / tc->tc_frequency) << 1;
+ cest.errb_abs = 0;
+ cest.errb_rate = 0;
+ cest.status = FFCLOCK_STA_UNSYNC;
+ cest.leapsec_total = 0;
+ cest.leapsec = 0;
+
+ mtx_lock(&ffclock_mtx);
+ bcopy(&cest, &ffclock_estimate, sizeof(struct ffclock_estimate));
+ ffclock_updated = INT8_MAX;
+ mtx_unlock(&ffclock_mtx);
+
+ printf("ffclock reset: %s (%llu Hz), time = %ld.%09lu\n", tc->tc_name,
+ (unsigned long long)tc->tc_frequency, (long)ts->tv_sec,
+ (unsigned long)ts->tv_nsec);
+}
+
+/*
+ * Sub-routine to convert a time interval measured in RAW counter units to time
+ * in seconds stored in bintime format.
+ * NOTE: bintime_mul requires u_int, but the value of the ffcounter may be
+ * larger than the max value of u_int (on 32 bit architecture). Loop to consume
+ * extra cycles.
+ */
+static void
+ffclock_convert_delta(ffcounter ffdelta, uint64_t period, struct bintime *bt)
+{
+ struct bintime bt2;
+ ffcounter delta, delta_max;
+
+ delta_max = (1ULL << (8 * sizeof(unsigned int))) - 1;
+ bintime_clear(bt);
+ do {
+ if (ffdelta > delta_max)
+ delta = delta_max;
+ else
+ delta = ffdelta;
+ bt2.sec = 0;
+ bt2.frac = period;
+ bintime_mul(&bt2, (unsigned int)delta);
+ bintime_add(bt, &bt2);
+ ffdelta -= delta;
+ } while (ffdelta > 0);
+}
+
+/*
+ * Update the fftimehands.
+ * Push the tick ffcount and time(s) forward based on current clock estimate.
+ * The conversion from ffcounter to bintime relies on the difference clock
+ * principle, whose accuracy relies on computing small time intervals. If a new
+ * clock estimate has been passed by the synchronisation daemon, make it
+ * current, and compute the linear interpolation for monotonic time if needed.
+ */
+static void
+ffclock_windup(unsigned int delta)
+{
+ struct ffclock_estimate *cest;
+ struct fftimehands *ffth;
+ struct bintime bt, gap_lerp;
+ ffcounter ffdelta;
+ uint64_t frac;
+ unsigned int polling;
+ uint8_t forward_jump, ogen;
+
+ /*
+ * Pick the next timehand, copy current ffclock estimates and move tick
+ * times and counter forward.
+ */
+ forward_jump = 0;
+ ffth = fftimehands->next;
+ ogen = ffth->gen;
+ ffth->gen = 0;
+ cest = &ffth->cest;
+ bcopy(&fftimehands->cest, cest, sizeof(struct ffclock_estimate));
+ ffdelta = (ffcounter)delta;
+ ffth->period_lerp = fftimehands->period_lerp;
+
+ ffth->tick_time = fftimehands->tick_time;
+ ffclock_convert_delta(ffdelta, cest->period, &bt);
+ bintime_add(&ffth->tick_time, &bt);
+
+ ffth->tick_time_lerp = fftimehands->tick_time_lerp;
+ ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt);
+ bintime_add(&ffth->tick_time_lerp, &bt);
+
+ ffth->tick_ffcount = fftimehands->tick_ffcount + ffdelta;
+
+ /*
+ * Assess the status of the clock, if the last update is too old, it is
+ * likely the synchronisation daemon is dead and the clock is free
+ * running.
+ */
+ if (ffclock_updated == 0) {
+ ffdelta = ffth->tick_ffcount - cest->update_ffcount;
+ ffclock_convert_delta(ffdelta, cest->period, &bt);
+ if (bt.sec > 2 * FFCLOCK_SKM_SCALE)
+ ffclock_status |= FFCLOCK_STA_UNSYNC;
+ }
+
+ /*
+ * If available, grab updated clock estimates and make them current.
+ * Recompute time at this tick using the updated estimates. The clock
+ * estimates passed in by the feed-forward synchronisation daemon may result
+ * in a time conversion that is not monotonically increasing (just after
+ * the update). time_lerp is a particular linear interpolation over the
+ * synchronisation algo polling period that ensures monotonicity for the
+ * clock ids requesting it.
+ */
+ if (ffclock_updated > 0) {
+ bcopy(&ffclock_estimate, cest, sizeof(struct ffclock_estimate));
+ ffdelta = ffth->tick_ffcount - cest->update_ffcount;
+ ffth->tick_time = cest->update_time;
+ ffclock_convert_delta(ffdelta, cest->period, &bt);
+ bintime_add(&ffth->tick_time, &bt);
+
+ /* ffclock_reset sets ffclock_updated to INT8_MAX */
+ if (ffclock_updated == INT8_MAX)
+ ffth->tick_time_lerp = ffth->tick_time;
+
+ if (bintime_cmp(&ffth->tick_time, &ffth->tick_time_lerp, >))
+ forward_jump = 1;
+ else
+ forward_jump = 0;
+
+ bintime_clear(&gap_lerp);
+ if (forward_jump) {
+ gap_lerp = ffth->tick_time;
+ bintime_sub(&gap_lerp, &ffth->tick_time_lerp);
+ } else {
+ gap_lerp = ffth->tick_time_lerp;
+ bintime_sub(&gap_lerp, &ffth->tick_time);
+ }
+
+ /*
+ * The reset from the RTC clock may be far from accurate, and
+ * reducing the gap between real time and interpolated time
+ * could take a very long time if the interpolated clock insists
+ * on strict monotonicity. The clock is reset under very strict
+ * conditions (kernel time is known to be wrong and the
+ * synchronization daemon has been restarted recently).
+ * ffclock_boottime absorbs the jump to ensure boot time is
+ * correct and uptime functions stay consistent.
+ */
+ if (((ffclock_status & FFCLOCK_STA_UNSYNC) == FFCLOCK_STA_UNSYNC) &&
+ ((cest->status & FFCLOCK_STA_UNSYNC) == 0) &&
+ ((cest->status & FFCLOCK_STA_WARMUP) == FFCLOCK_STA_WARMUP)) {
+ if (forward_jump)
+ bintime_add(&ffclock_boottime, &gap_lerp);
+ else
+ bintime_sub(&ffclock_boottime, &gap_lerp);
+ ffth->tick_time_lerp = ffth->tick_time;
+ bintime_clear(&gap_lerp);
+ }
+
+ ffclock_status = cest->status;
+ ffth->period_lerp = cest->period;
+
+ /*
+ * Compute corrected period used for the linear interpolation of
+ * time. The rate of linear interpolation is capped to 5000PPM
+ * (5ms/s).
+ */
+ if (bintime_isset(&gap_lerp)) {
+ ffdelta = cest->update_ffcount;
+ ffdelta -= fftimehands->cest.update_ffcount;
+ ffclock_convert_delta(ffdelta, cest->period, &bt);
+ polling = bt.sec;
+ bt.sec = 0;
+ bt.frac = 5000000 * (uint64_t)18446744073LL;
+ bintime_mul(&bt, polling);
+ if (bintime_cmp(&gap_lerp, &bt, >))
+ gap_lerp = bt;
+
+ /* Approximate 1 sec by 1-(1/2^64) to ease arithmetic */
+ frac = 0;
+ if (gap_lerp.sec > 0) {
+ frac -= 1;
+ frac /= ffdelta / gap_lerp.sec;
+ }
+ frac += gap_lerp.frac / ffdelta;
+
+ if (forward_jump)
+ ffth->period_lerp += frac;
+ else
+ ffth->period_lerp -= frac;
+ }
+
+ ffclock_updated = 0;
+ }
+ if (++ogen == 0)
+ ogen = 1;
+ ffth->gen = ogen;
+ fftimehands = ffth;
+}
+
+/*
+ * Adjust the fftimehands when the timecounter is changed. Stating the obvious,
+ * the old and new hardware counter cannot be read simultaneously. tc_windup()
+ * does read the two counters 'back to back', but a few cycles are effectively
+ * lost, and not accumulated in tick_ffcount. This is a fairly radical
+ * operation for a feed-forward synchronization daemon, and it is its job to not
+ * pushing irrelevant data to the kernel. Because there is no locking here,
+ * simply force to ignore pending or next update to give daemon a chance to
+ * realize the counter has changed.
+ */
+static void
+ffclock_change_tc(struct timehands *th)
+{
+ struct fftimehands *ffth;
+ struct ffclock_estimate *cest;
+ struct timecounter *tc;
+ uint8_t ogen;
+
+ tc = th->th_counter;
+ ffth = fftimehands->next;
+ ogen = ffth->gen;
+ ffth->gen = 0;
+
+ cest = &ffth->cest;
+ bcopy(&(fftimehands->cest), cest, sizeof(struct ffclock_estimate));
+ cest->period = ((1ULL << 63) / tc->tc_frequency ) << 1;
+ cest->errb_abs = 0;
+ cest->errb_rate = 0;
+ cest->status |= FFCLOCK_STA_UNSYNC;
+
+ ffth->tick_ffcount = fftimehands->tick_ffcount;
+ ffth->tick_time_lerp = fftimehands->tick_time_lerp;
+ ffth->tick_time = fftimehands->tick_time;
+ ffth->period_lerp = cest->period;
+
+ /* Do not lock but ignore next update from synchronization daemon. */
+ ffclock_updated--;
+
+ if (++ogen == 0)
+ ogen = 1;
+ ffth->gen = ogen;
+ fftimehands = ffth;
+}
+
+/*
+ * Retrieve feed-forward counter and time of last kernel tick.
+ */
+void
+ffclock_last_tick(ffcounter *ffcount, struct bintime *bt, uint32_t flags)
+{
+ struct fftimehands *ffth;
+ uint8_t gen;
+
+ /*
+ * No locking but check generation has not changed. Also need to make
+ * sure ffdelta is positive, i.e. ffcount > tick_ffcount.
+ */
+ do {
+ ffth = fftimehands;
+ gen = ffth->gen;
+ if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP)
+ *bt = ffth->tick_time_lerp;
+ else
+ *bt = ffth->tick_time;
+ *ffcount = ffth->tick_ffcount;
+ } while (gen == 0 || gen != ffth->gen);
+}
+
+/*
+ * Absolute clock conversion. Low level function to convert ffcounter to
+ * bintime. The ffcounter is converted using the current ffclock period estimate
+ * or the "interpolated period" to ensure monotonicity.
+ * NOTE: this conversion may have been deferred, and the clock updated since the
+ * hardware counter has been read.
+ */
+void
+ffclock_convert_abs(ffcounter ffcount, struct bintime *bt, uint32_t flags)
+{
+ struct fftimehands *ffth;
+ struct bintime bt2;
+ ffcounter ffdelta;
+ uint8_t gen;
+
+ /*
+ * No locking but check generation has not changed. Also need to make
+ * sure ffdelta is positive, i.e. ffcount > tick_ffcount.
+ */
+ do {
+ ffth = fftimehands;
+ gen = ffth->gen;
+ if (ffcount > ffth->tick_ffcount)
+ ffdelta = ffcount - ffth->tick_ffcount;
+ else
+ ffdelta = ffth->tick_ffcount - ffcount;
+
+ if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP) {
+ *bt = ffth->tick_time_lerp;
+ ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt2);
+ } else {
+ *bt = ffth->tick_time;
+ ffclock_convert_delta(ffdelta, ffth->cest.period, &bt2);
+ }
+
+ if (ffcount > ffth->tick_ffcount)
+ bintime_add(bt, &bt2);
+ else
+ bintime_sub(bt, &bt2);
+ } while (gen == 0 || gen != ffth->gen);
+}
+
+/*
+ * Difference clock conversion.
+ * Low level function to convert a time interval measured in RAW counter units
+ * into bintime. The difference clock allows measuring small intervals much more
+ * reliably than the absolute clock.
+ */
+void
+ffclock_convert_diff(ffcounter ffdelta, struct bintime *bt)
+{
+ struct fftimehands *ffth;
+ uint8_t gen;
+
+ /* No locking but check generation has not changed. */
+ do {
+ ffth = fftimehands;
+ gen = ffth->gen;
+ ffclock_convert_delta(ffdelta, ffth->cest.period, bt);
+ } while (gen == 0 || gen != ffth->gen);
+}
+
+/*
+ * Access to current ffcounter value.
+ */
+void
+ffclock_read_counter(ffcounter *ffcount)
+{
+ struct timehands *th;
+ struct fftimehands *ffth;
+ unsigned int gen, delta;
+
+ /*
+ * ffclock_windup() called from tc_windup(), safe to rely on
+ * th->th_generation only, for correct delta and ffcounter.
+ */
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ ffth = fftimehands;
+ delta = tc_delta(th);
+ *ffcount = ffth->tick_ffcount;
+ } while (gen == 0 || gen != th->th_generation);
+
+ *ffcount += delta;
+}
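+
+/*
+ * Usage sketch (illustrative only; variable names and the event placeholder
+ * are assumptions): timestamp an event with the raw counter and convert
+ * later, or measure a short interval with the difference clock.
+ *
+ *   ffcounter before, after;
+ *   struct bintime interval, when;
+ *
+ *   ffclock_read_counter(&before);
+ *   ... event ...
+ *   ffclock_read_counter(&after);
+ *   ffclock_convert_diff(after - before, &interval);
+ *   ffclock_convert_abs(before, &when, FFCLOCK_LERP);
+ */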
+
+void
+binuptime(struct bintime *bt)
+{
+
+ binuptime_fromclock(bt, sysclock_active);
+}
+
+void
+nanouptime(struct timespec *tsp)
+{
+
+ nanouptime_fromclock(tsp, sysclock_active);
+}
+
+void
+microuptime(struct timeval *tvp)
+{
+
+ microuptime_fromclock(tvp, sysclock_active);
+}
+
+void
+bintime(struct bintime *bt)
+{
+
+ bintime_fromclock(bt, sysclock_active);
+}
+
+void
+nanotime(struct timespec *tsp)
+{
+
+ nanotime_fromclock(tsp, sysclock_active);
+}
+
+void
+microtime(struct timeval *tvp)
+{
+
+ microtime_fromclock(tvp, sysclock_active);
+}
+
+void
+getbinuptime(struct bintime *bt)
+{
+
+ getbinuptime_fromclock(bt, sysclock_active);
+}
+
+void
+getnanouptime(struct timespec *tsp)
+{
+
+ getnanouptime_fromclock(tsp, sysclock_active);
+}
+
+void
+getmicrouptime(struct timeval *tvp)
+{
+
+ getmicrouptime_fromclock(tvp, sysclock_active);
+}
+
+void
+getbintime(struct bintime *bt)
+{
+
+ getbintime_fromclock(bt, sysclock_active);
+}
+
+void
+getnanotime(struct timespec *tsp)
+{
+
+ getnanotime_fromclock(tsp, sysclock_active);
+}
+
+void
+getmicrotime(struct timeval *tvp)
+{
+
+ getmicrotime_fromclock(tvp, sysclock_active);
+}
+
+#endif /* FFCLOCK */
+
+/*
+ * This is a clone of getnanotime and used for walltimestamps.
+ * The dtrace_ prefix prevents fbt from creating probes for
+ * it so walltimestamp can be safely used in all fbt probes.
+ */
+void
+dtrace_getnanotime(struct timespec *tsp)
+{
+ struct timehands *th;
+ u_int gen;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ *tsp = th->th_nanotime;
+ } while (gen == 0 || gen != th->th_generation);
+}
+
+/*
+ * System clock currently providing time to the system. Modifiable via sysctl
+ * when the FFCLOCK option is defined.
+ */
+int sysclock_active = SYSCLOCK_FBCK;
+
+/* Internal NTP status and error estimates. */
+extern int time_status;
+extern long time_esterror;
+
+/*
+ * Take a snapshot of sysclock data which can be used to compare system clocks
+ * and generate timestamps after the fact.
+ */
+void
+sysclock_getsnapshot(struct sysclock_snap *clock_snap, int fast)
+{
+ struct fbclock_info *fbi;
+ struct timehands *th;
+ struct bintime bt;
+ unsigned int delta, gen;
+#ifdef FFCLOCK
+ ffcounter ffcount;
+ struct fftimehands *ffth;
+ struct ffclock_info *ffi;
+ struct ffclock_estimate cest;
+
+ ffi = &clock_snap->ff_info;
+#endif
+
+ fbi = &clock_snap->fb_info;
+ delta = 0;
+
+ do {
+ th = timehands;
+ gen = th->th_generation;
+ fbi->th_scale = th->th_scale;
+ fbi->tick_time = th->th_offset;
+#ifdef FFCLOCK
+ ffth = fftimehands;
+ ffi->tick_time = ffth->tick_time_lerp;
+ ffi->tick_time_lerp = ffth->tick_time_lerp;
+ ffi->period = ffth->cest.period;
+ ffi->period_lerp = ffth->period_lerp;
+ clock_snap->ffcount = ffth->tick_ffcount;
+ cest = ffth->cest;
+#endif
+ if (!fast)
+ delta = tc_delta(th);
+ } while (gen == 0 || gen != th->th_generation);
+
+ clock_snap->delta = delta;
+ clock_snap->sysclock_active = sysclock_active;
+
+ /* Record feedback clock status and error. */
+ clock_snap->fb_info.status = time_status;
+ /* XXX: Very crude estimate of feedback clock error. */
+ bt.sec = time_esterror / 1000000;
+ bt.frac = ((time_esterror - bt.sec) * 1000000) *
+ (uint64_t)18446744073709ULL;
+ clock_snap->fb_info.error = bt;
+
+#ifdef FFCLOCK
+ if (!fast)
+ clock_snap->ffcount += delta;
+
+ /* Record feed-forward clock leap second adjustment. */
+ ffi->leapsec_adjustment = cest.leapsec_total;
+ if (clock_snap->ffcount > cest.leapsec_next)
+ ffi->leapsec_adjustment -= cest.leapsec;
+
+ /* Record feed-forward clock status and error. */
+ clock_snap->ff_info.status = cest.status;
+ ffcount = clock_snap->ffcount - cest.update_ffcount;
+ ffclock_convert_delta(ffcount, cest.period, &bt);
+ /* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s]. */
+ bintime_mul(&bt, cest.errb_rate * (uint64_t)18446744073709ULL);
+ /* 18446744073 = int(2^64 / 1e9), since err_abs in [ns]. */
+ bintime_addx(&bt, cest.errb_abs * (uint64_t)18446744073ULL);
+ clock_snap->ff_info.error = bt;
+#endif
+}
+
+/*
+ * Convert a sysclock snapshot into a struct bintime based on the specified
+ * clock source and flags.
+ */
+int
+sysclock_snap2bintime(struct sysclock_snap *cs, struct bintime *bt,
+ int whichclock, uint32_t flags)
+{
+#ifdef FFCLOCK
+ struct bintime bt2;
+ uint64_t period;
+#endif
+
+ switch (whichclock) {
+ case SYSCLOCK_FBCK:
+ *bt = cs->fb_info.tick_time;
+
+ /* If snapshot was created with !fast, delta will be >0. */
+ if (cs->delta > 0)
+ bintime_addx(bt, cs->fb_info.th_scale * cs->delta);
+
+ if ((flags & FBCLOCK_UPTIME) == 0)
+ bintime_add(bt, &boottimebin);
+ break;
+#ifdef FFCLOCK
+ case SYSCLOCK_FFWD:
+ if (flags & FFCLOCK_LERP) {
+ *bt = cs->ff_info.tick_time_lerp;
+ period = cs->ff_info.period_lerp;
+ } else {
+ *bt = cs->ff_info.tick_time;
+ period = cs->ff_info.period;
+ }
+
+ /* If snapshot was created with !fast, delta will be >0. */
+ if (cs->delta > 0) {
+ ffclock_convert_delta(cs->delta, period, &bt2);
+ bintime_add(bt, &bt2);
+ }
+
+ /* Leap second adjustment. */
+ if (flags & FFCLOCK_LEAPSEC)
+ bt->sec -= cs->ff_info.leapsec_adjustment;
+
+ /* Boot time adjustment, for uptime/monotonic clocks. */
+ if (flags & FFCLOCK_UPTIME)
+ bintime_sub(bt, &ffclock_boottime);
+ break;
+#endif
+ default:
+ return (EINVAL);
+ break;
+ }
+
+ return (0);
+}
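+
+/*
+ * Usage sketch (illustrative only; the calling context is an assumption):
+ * capture a snapshot once and derive a timestamp from it later, here using
+ * whichever system clock was active when the snapshot was taken.
+ *
+ *   struct sysclock_snap cs;
+ *   struct bintime bt;
+ *
+ *   sysclock_getsnapshot(&cs, 0);
+ *   ...
+ *   sysclock_snap2bintime(&cs, &bt, cs.sysclock_active, 0);
+ */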
+
+/*
+ * Initialize a new timecounter and possibly use it.
+ */
+void
+tc_init(struct timecounter *tc)
+{
+ u_int u;
+ struct sysctl_oid *tc_root;
+
+ u = tc->tc_frequency / tc->tc_counter_mask;
+ /* XXX: We need some margin here, 10% is a guess */
+ u *= 11;
+ u /= 10;
+ if (u > hz && tc->tc_quality >= 0) {
+ tc->tc_quality = -2000;
+ if (bootverbose) {
+ printf("Timecounter \"%s\" frequency %ju Hz",
+ tc->tc_name, (uintmax_t)tc->tc_frequency);
+ printf(" -- Insufficient hz, needs at least %u\n", u);
+ }
+ } else if (tc->tc_quality >= 0 || bootverbose) {
+ printf("Timecounter \"%s\" frequency %ju Hz quality %d\n",
+ tc->tc_name, (uintmax_t)tc->tc_frequency,
+ tc->tc_quality);
+ }
+
+ tc->tc_next = timecounters;
+ timecounters = tc;
+ /*
+ * Set up sysctl tree for this counter.
+ */
+ tc_root = SYSCTL_ADD_NODE(NULL,
+ SYSCTL_STATIC_CHILDREN(_kern_timecounter_tc), OID_AUTO, tc->tc_name,
+ CTLFLAG_RW, 0, "timecounter description");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+ "mask", CTLFLAG_RD, &(tc->tc_counter_mask), 0,
+ "mask for implemented bits");
+ SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+ "counter", CTLTYPE_UINT | CTLFLAG_RD, tc, sizeof(*tc),
+ sysctl_kern_timecounter_get, "IU", "current timecounter value");
+ SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+ "frequency", CTLTYPE_U64 | CTLFLAG_RD, tc, sizeof(*tc),
+ sysctl_kern_timecounter_freq, "QU", "timecounter frequency");
+ SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+ "quality", CTLFLAG_RD, &(tc->tc_quality), 0,
+ "goodness of time counter");
+ /*
+ * Never automatically use a timecounter with negative quality.
+ * Even though we run on the dummy counter, switching here may be
+ * worse since this timecounter may not be monotonic.
+ */
+ if (tc->tc_quality < 0)
+ return;
+ if (tc->tc_quality < timecounter->tc_quality)
+ return;
+ if (tc->tc_quality == timecounter->tc_quality &&
+ tc->tc_frequency < timecounter->tc_frequency)
+ return;
+ (void)tc->tc_get_timecount(tc);
+ (void)tc->tc_get_timecount(tc);
+ timecounter = tc;
+}
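+
+/*
+ * Usage sketch (illustrative only; the read routine, name, quality and
+ * frequency are assumptions): the usual registration sequence for a
+ * hardware timecounter driver.
+ *
+ *   static u_int
+ *   example_get_timecount(struct timecounter *tc)
+ *   {
+ *           return (EXAMPLE_READ_COUNT());    (placeholder hardware read)
+ *   }
+ *
+ *   static struct timecounter example_tc = {
+ *           .tc_get_timecount = example_get_timecount,
+ *           .tc_counter_mask = ~0u,
+ *           .tc_name = "example",
+ *           .tc_quality = 800,
+ *   };
+ *
+ *   (at attach time)
+ *   example_tc.tc_frequency = measured_frequency;
+ *   tc_init(&example_tc);
+ */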
+
+/* Report the frequency of the current timecounter. */
+uint64_t
+tc_getfrequency(void)
+{
+
+ return (timehands->th_counter->tc_frequency);
+}
+
+/*
+ * Step our concept of UTC. This is done by modifying our estimate of
+ * when we booted.
+ * XXX: not locked.
+ */
+void
+tc_setclock(struct timespec *ts)
+{
+ struct timespec tbef, taft;
+ struct bintime bt, bt2;
+
+ cpu_tick_calibrate(1);
+ nanotime(&tbef);
+ timespec2bintime(ts, &bt);
+ binuptime(&bt2);
+ bintime_sub(&bt, &bt2);
+ bintime_add(&bt2, &boottimebin);
+ boottimebin = bt;
+ bintime2timeval(&bt, &boottime);
+
+ /* XXX fiddle all the little crinkly bits around the fiords... */
+ tc_windup();
+ nanotime(&taft);
+ if (timestepwarnings) {
+ log(LOG_INFO,
+ "Time stepped from %jd.%09ld to %jd.%09ld (%jd.%09ld)\n",
+ (intmax_t)tbef.tv_sec, tbef.tv_nsec,
+ (intmax_t)taft.tv_sec, taft.tv_nsec,
+ (intmax_t)ts->tv_sec, ts->tv_nsec);
+ }
+ cpu_tick_calibrate(1);
+}
+
+/*
+ * Initialize the next struct timehands in the ring and make
+ * it the active timehands. Along the way we might switch to a different
+ * timecounter and/or do seconds processing in NTP. Slightly magic.
+ */
+static void
+tc_windup(void)
+{
+ struct bintime bt;
+ struct timehands *th, *tho;
+ uint64_t scale;
+ u_int delta, ncount, ogen;
+ int i;
+ time_t t;
+
+ /*
+ * Make the next timehands a copy of the current one, but do not
+ * overwrite the generation or next pointer. While we update
+ * the contents, the generation must be zero.
+ */
+ tho = timehands;
+ th = tho->th_next;
+ ogen = th->th_generation;
+ th->th_generation = 0;
+ bcopy(tho, th, offsetof(struct timehands, th_generation));
+
+ /*
+ * Capture a timecounter delta on the current timecounter and if
+ * changing timecounters, a counter value from the new timecounter.
+ * Update the offset fields accordingly.
+ */
+ delta = tc_delta(th);
+ if (th->th_counter != timecounter)
+ ncount = timecounter->tc_get_timecount(timecounter);
+ else
+ ncount = 0;
+#ifdef FFCLOCK
+ ffclock_windup(delta);
+#endif
+ th->th_offset_count += delta;
+ th->th_offset_count &= th->th_counter->tc_counter_mask;
+ while (delta > th->th_counter->tc_frequency) {
+ /* Eat complete unadjusted seconds. */
+ delta -= th->th_counter->tc_frequency;
+ th->th_offset.sec++;
+ }
+ if ((delta > th->th_counter->tc_frequency / 2) &&
+ (th->th_scale * delta < ((uint64_t)1 << 63))) {
+ /* The product th_scale * delta just barely overflows. */
+ th->th_offset.sec++;
+ }
+ bintime_addx(&th->th_offset, th->th_scale * delta);
+
+ /*
+ * Hardware latching timecounters may not generate interrupts on
+ * PPS events, so instead we poll them. There is a finite risk that
+ * the hardware might capture a count which is later than the one we
+ * got above, and therefore possibly in the next NTP second which might
+ * have a different rate than the current NTP second. It doesn't
+ * matter in practice.
+ */
+ if (tho->th_counter->tc_poll_pps)
+ tho->th_counter->tc_poll_pps(tho->th_counter);
+
+ /*
+ * Deal with NTP second processing. The for loop normally
+ * iterates at most once, but in extreme situations it might
+ * keep NTP sane if timeouts are not run for several seconds.
+ * At boot, the time step can be large when the TOD hardware
+ * has been read, so on really large steps, we call
+ * ntp_update_second only twice. We need to call it twice in
+ * case we missed a leap second.
+ */
+ bt = th->th_offset;
+ bintime_add(&bt, &boottimebin);
+ i = bt.sec - tho->th_microtime.tv_sec;
+ if (i > LARGE_STEP)
+ i = 2;
+ for (; i > 0; i--) {
+ t = bt.sec;
+ ntp_update_second(&th->th_adjustment, &bt.sec);
+ if (bt.sec != t)
+ boottimebin.sec += bt.sec - t;
+ }
+ /* Update the UTC timestamps used by the get*() functions. */
+ /* XXX shouldn't do this here. Should force non-`get' versions. */
+ bintime2timeval(&bt, &th->th_microtime);
+ bintime2timespec(&bt, &th->th_nanotime);
+
+ /* Now is a good time to change timecounters. */
+ if (th->th_counter != timecounter) {
+#ifndef __arm__
+ if ((timecounter->tc_flags & TC_FLAGS_C3STOP) != 0)
+ cpu_disable_deep_sleep++;
+ if ((th->th_counter->tc_flags & TC_FLAGS_C3STOP) != 0)
+ cpu_disable_deep_sleep--;
+#endif
+ th->th_counter = timecounter;
+ th->th_offset_count = ncount;
+ tc_min_ticktock_freq = max(1, timecounter->tc_frequency /
+ (((uint64_t)timecounter->tc_counter_mask + 1) / 3));
+#ifdef FFCLOCK
+ ffclock_change_tc(th);
+#endif
+ }
+
+ /*-
+ * Recalculate the scaling factor. We want the number of 1/2^64
+ * fractions of a second per period of the hardware counter, taking
+ * into account the th_adjustment factor which the NTP PLL/adjtime(2)
+ * processing provides us with.
+ *
+ * The th_adjustment is nanoseconds per second with 32 bit binary
+ * fraction and we want 64 bit binary fraction of second:
+ *
+ * x = a * 2^32 / 10^9 = a * 4.294967296
+ *
+ * The range of th_adjustment is +/- 5000PPM so inside a 64bit int
+ * we can only multiply by about 850 without overflowing, that
+ * leaves no suitably precise fractions for multiply before divide.
+ *
+ * Divide before multiply with a fraction of 2199/512 results in a
+ * systematic undercompensation of 10PPM of th_adjustment. On a
+ * 5000PPM adjustment this is a 0.05PPM error. This is acceptable.
+ *
+ * We happily sacrifice the lowest of the 64 bits of our result
+ * to the goddess of code clarity.
+ *
+ */
+ scale = (uint64_t)1 << 63;
+ scale += (th->th_adjustment / 1024) * 2199;
+ scale /= th->th_counter->tc_frequency;
+ th->th_scale = scale * 2;
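+	/*
+	 * Illustrative check of the approximation above (not a normative
+	 * derivation): the exact conversion factor is 2^32 / 10^9 =
+	 * 4.294967296, while (2199 / 1024) * 2 = 4.294921875, i.e. about
+	 * 10.6 PPM low, which is the systematic undercompensation
+	 * mentioned in the comment above.
+	 */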
+
+ /*
+ * Now that the struct timehands is again consistent, set the new
+ * generation number, making sure to not make it zero.
+ */
+ if (++ogen == 0)
+ ogen = 1;
+ th->th_generation = ogen;
+
+ /* Go live with the new struct timehands. */
+#ifdef FFCLOCK
+ switch (sysclock_active) {
+ case SYSCLOCK_FBCK:
+#endif
+ time_second = th->th_microtime.tv_sec;
+ time_uptime = th->th_offset.sec;
+#ifdef FFCLOCK
+ break;
+ case SYSCLOCK_FFWD:
+ time_second = fftimehands->tick_time_lerp.sec;
+ time_uptime = fftimehands->tick_time_lerp.sec - ffclock_boottime.sec;
+ break;
+ }
+#endif
+
+ timehands = th;
+ timekeep_push_vdso();
+}
+
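+/*
+ * For reference, consumers of the timehands ring (the bintime()/
+ * binuptime() family earlier in this file) are expected to follow a
+ * lock-free generation-check pattern along these lines (sketch only,
+ * not the verbatim implementation):
+ *
+ *	do {
+ *		th = timehands;
+ *		gen = th->th_generation;
+ *		*bt = th->th_offset;
+ *		bintime_addx(bt, th->th_scale * tc_delta(th));
+ *	} while (gen == 0 || gen != th->th_generation);
+ *
+ * A zero or changed generation means tc_windup() was rewriting that
+ * slot, so the read is retried.
+ */
+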
+/* Report or change the active timecounter hardware. */
+static int
+sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS)
+{
+ char newname[32];
+ struct timecounter *newtc, *tc;
+ int error;
+
+ tc = timecounter;
+ strlcpy(newname, tc->tc_name, sizeof(newname));
+
+ error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req);
+ if (error != 0 || req->newptr == NULL ||
+ strcmp(newname, tc->tc_name) == 0)
+ return (error);
+ for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
+ if (strcmp(newname, newtc->tc_name) != 0)
+ continue;
+
+ /* Warm up new timecounter. */
+ (void)newtc->tc_get_timecount(newtc);
+ (void)newtc->tc_get_timecount(newtc);
+
+ timecounter = newtc;
+ timekeep_push_vdso();
+ return (0);
+ }
+ return (EINVAL);
+}
+
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RW,
+ 0, 0, sysctl_kern_timecounter_hardware, "A",
+ "Timecounter hardware selected");
+
+
+/* Report the available timecounter hardware. */
+static int
+sysctl_kern_timecounter_choice(SYSCTL_HANDLER_ARGS)
+{
+ char buf[32], *spc;
+ struct timecounter *tc;
+ int error;
+
+ spc = "";
+ error = 0;
+ for (tc = timecounters; error == 0 && tc != NULL; tc = tc->tc_next) {
+ sprintf(buf, "%s%s(%d)",
+ spc, tc->tc_name, tc->tc_quality);
+ error = SYSCTL_OUT(req, buf, strlen(buf));
+ spc = " ";
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, choice, CTLTYPE_STRING | CTLFLAG_RD,
+ 0, 0, sysctl_kern_timecounter_choice, "A", "Timecounter hardware detected");
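+
+/*
+ * Example use from userland (illustrative; the counter names and
+ * quality values shown are hypothetical and vary by machine):
+ *
+ *	# sysctl kern.timecounter.choice
+ *	kern.timecounter.choice: TSC(800) HPET(950) ACPI-fast(900) i8254(0)
+ *	# sysctl kern.timecounter.hardware=HPET
+ *	kern.timecounter.hardware: TSC -> HPET
+ */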
+
+/*
+ * RFC 2783 PPS-API implementation.
+ */
+
+static int
+pps_fetch(struct pps_fetch_args *fapi, struct pps_state *pps)
+{
+ int err, timo;
+ pps_seq_t aseq, cseq;
+ struct timeval tv;
+
+ if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC)
+ return (EINVAL);
+
+ /*
+ * If no timeout is requested, immediately return whatever values were
+ * most recently captured. If timeout seconds is -1, that's a request
+ * to block without a timeout. WITNESS won't let us sleep forever
+ * without a lock (we really don't need a lock), so just repeatedly
+ * sleep a long time.
+ */
+ if (fapi->timeout.tv_sec || fapi->timeout.tv_nsec) {
+ if (fapi->timeout.tv_sec == -1)
+ timo = 0x7fffffff;
+ else {
+ tv.tv_sec = fapi->timeout.tv_sec;
+ tv.tv_usec = fapi->timeout.tv_nsec / 1000;
+ timo = tvtohz(&tv);
+ }
+ aseq = pps->ppsinfo.assert_sequence;
+ cseq = pps->ppsinfo.clear_sequence;
+ while (aseq == pps->ppsinfo.assert_sequence &&
+ cseq == pps->ppsinfo.clear_sequence) {
+ err = tsleep(pps, PCATCH, "ppsfch", timo);
+ if (err == EWOULDBLOCK && fapi->timeout.tv_sec == -1) {
+ continue;
+ } else if (err != 0) {
+ return (err);
+ }
+ }
+ }
+
+ pps->ppsinfo.current_mode = pps->ppsparam.mode;
+ fapi->pps_info_buf = pps->ppsinfo;
+
+ return (0);
+}
+
+int
+pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps)
+{
+ pps_params_t *app;
+ struct pps_fetch_args *fapi;
+#ifdef FFCLOCK
+ struct pps_fetch_ffc_args *fapi_ffc;
+#endif
+#ifdef PPS_SYNC
+ struct pps_kcbind_args *kapi;
+#endif
+
+ KASSERT(pps != NULL, ("NULL pps pointer in pps_ioctl"));
+ switch (cmd) {
+ case PPS_IOC_CREATE:
+ return (0);
+ case PPS_IOC_DESTROY:
+ return (0);
+ case PPS_IOC_SETPARAMS:
+ app = (pps_params_t *)data;
+ if (app->mode & ~pps->ppscap)
+ return (EINVAL);
+#ifdef FFCLOCK
+ /* Ensure only a single clock is selected for ffc timestamp. */
+ if ((app->mode & PPS_TSCLK_MASK) == PPS_TSCLK_MASK)
+ return (EINVAL);
+#endif
+ pps->ppsparam = *app;
+ return (0);
+ case PPS_IOC_GETPARAMS:
+ app = (pps_params_t *)data;
+ *app = pps->ppsparam;
+ app->api_version = PPS_API_VERS_1;
+ return (0);
+ case PPS_IOC_GETCAP:
+ *(int*)data = pps->ppscap;
+ return (0);
+ case PPS_IOC_FETCH:
+ fapi = (struct pps_fetch_args *)data;
+ return (pps_fetch(fapi, pps));
+#ifdef FFCLOCK
+ case PPS_IOC_FETCH_FFCOUNTER:
+ fapi_ffc = (struct pps_fetch_ffc_args *)data;
+ if (fapi_ffc->tsformat && fapi_ffc->tsformat !=
+ PPS_TSFMT_TSPEC)
+ return (EINVAL);
+ if (fapi_ffc->timeout.tv_sec || fapi_ffc->timeout.tv_nsec)
+ return (EOPNOTSUPP);
+ pps->ppsinfo_ffc.current_mode = pps->ppsparam.mode;
+ fapi_ffc->pps_info_buf_ffc = pps->ppsinfo_ffc;
+ /* Overwrite timestamps if feedback clock selected. */
+ switch (pps->ppsparam.mode & PPS_TSCLK_MASK) {
+ case PPS_TSCLK_FBCK:
+ fapi_ffc->pps_info_buf_ffc.assert_timestamp =
+ pps->ppsinfo.assert_timestamp;
+ fapi_ffc->pps_info_buf_ffc.clear_timestamp =
+ pps->ppsinfo.clear_timestamp;
+ break;
+ case PPS_TSCLK_FFWD:
+ break;
+ default:
+ break;
+ }
+ return (0);
+#endif /* FFCLOCK */
+ case PPS_IOC_KCBIND:
+#ifdef PPS_SYNC
+ kapi = (struct pps_kcbind_args *)data;
+ /* XXX Only root should be able to do this */
+ if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC)
+ return (EINVAL);
+ if (kapi->kernel_consumer != PPS_KC_HARDPPS)
+ return (EINVAL);
+ if (kapi->edge & ~pps->ppscap)
+ return (EINVAL);
+ pps->kcmode = kapi->edge;
+ return (0);
+#else
+ return (EOPNOTSUPP);
+#endif
+ default:
+ return (ENOIOCTL);
+ }
+}
+
+void
+pps_init(struct pps_state *pps)
+{
+ pps->ppscap |= PPS_TSFMT_TSPEC | PPS_CANWAIT;
+ if (pps->ppscap & PPS_CAPTUREASSERT)
+ pps->ppscap |= PPS_OFFSETASSERT;
+ if (pps->ppscap & PPS_CAPTURECLEAR)
+ pps->ppscap |= PPS_OFFSETCLEAR;
+#ifdef FFCLOCK
+ pps->ppscap |= PPS_TSCLK_MASK;
+#endif
+}
+
+void
+pps_capture(struct pps_state *pps)
+{
+ struct timehands *th;
+
+ KASSERT(pps != NULL, ("NULL pps pointer in pps_capture"));
+ th = timehands;
+ pps->capgen = th->th_generation;
+ pps->capth = th;
+#ifdef FFCLOCK
+ pps->capffth = fftimehands;
+#endif
+ pps->capcount = th->th_counter->tc_get_timecount(th->th_counter);
+ if (pps->capgen != th->th_generation)
+ pps->capgen = 0;
+}
+
+void
+pps_event(struct pps_state *pps, int event)
+{
+ struct bintime bt;
+ struct timespec ts, *tsp, *osp;
+ u_int tcount, *pcount;
+ int foff, fhard;
+ pps_seq_t *pseq;
+#ifdef FFCLOCK
+ struct timespec *tsp_ffc;
+ pps_seq_t *pseq_ffc;
+ ffcounter *ffcount;
+#endif
+
+ KASSERT(pps != NULL, ("NULL pps pointer in pps_event"));
+ /* If the timecounter was wound up underneath us, bail out. */
+ if (pps->capgen == 0 || pps->capgen != pps->capth->th_generation)
+ return;
+
+ /* Things would be easier with arrays. */
+ if (event == PPS_CAPTUREASSERT) {
+ tsp = &pps->ppsinfo.assert_timestamp;
+ osp = &pps->ppsparam.assert_offset;
+ foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
+ fhard = pps->kcmode & PPS_CAPTUREASSERT;
+ pcount = &pps->ppscount[0];
+ pseq = &pps->ppsinfo.assert_sequence;
+#ifdef FFCLOCK
+ ffcount = &pps->ppsinfo_ffc.assert_ffcount;
+ tsp_ffc = &pps->ppsinfo_ffc.assert_timestamp;
+ pseq_ffc = &pps->ppsinfo_ffc.assert_sequence;
+#endif
+ } else {
+ tsp = &pps->ppsinfo.clear_timestamp;
+ osp = &pps->ppsparam.clear_offset;
+ foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
+ fhard = pps->kcmode & PPS_CAPTURECLEAR;
+ pcount = &pps->ppscount[1];
+ pseq = &pps->ppsinfo.clear_sequence;
+#ifdef FFCLOCK
+ ffcount = &pps->ppsinfo_ffc.clear_ffcount;
+ tsp_ffc = &pps->ppsinfo_ffc.clear_timestamp;
+ pseq_ffc = &pps->ppsinfo_ffc.clear_sequence;
+#endif
+ }
+
+ /*
+ * If the timecounter changed, we cannot compare the count values, so
+ * we have to drop the rest of the PPS-stuff until the next event.
+ */
+ if (pps->ppstc != pps->capth->th_counter) {
+ pps->ppstc = pps->capth->th_counter;
+ *pcount = pps->capcount;
+ pps->ppscount[2] = pps->capcount;
+ return;
+ }
+
+ /* Convert the count to a timespec. */
+ tcount = pps->capcount - pps->capth->th_offset_count;
+ tcount &= pps->capth->th_counter->tc_counter_mask;
+ bt = pps->capth->th_offset;
+ bintime_addx(&bt, pps->capth->th_scale * tcount);
+ bintime_add(&bt, &boottimebin);
+ bintime2timespec(&bt, &ts);
+
+ /* If the timecounter was wound up underneath us, bail out. */
+ if (pps->capgen != pps->capth->th_generation)
+ return;
+
+ *pcount = pps->capcount;
+ (*pseq)++;
+ *tsp = ts;
+
+ if (foff) {
+ timespecadd(tsp, osp);
+ if (tsp->tv_nsec < 0) {
+ tsp->tv_nsec += 1000000000;
+ tsp->tv_sec -= 1;
+ }
+ }
+
+#ifdef FFCLOCK
+ *ffcount = pps->capffth->tick_ffcount + tcount;
+ bt = pps->capffth->tick_time;
+ ffclock_convert_delta(tcount, pps->capffth->cest.period, &bt);
+ bintime_add(&bt, &pps->capffth->tick_time);
+ bintime2timespec(&bt, &ts);
+ (*pseq_ffc)++;
+ *tsp_ffc = ts;
+#endif
+
+#ifdef PPS_SYNC
+ if (fhard) {
+ uint64_t scale;
+
+ /*
+ * Feed the NTP PLL/FLL.
+ * The FLL wants to know how many (hardware) nanoseconds
+ * elapsed since the previous event.
+ */
+ tcount = pps->capcount - pps->ppscount[2];
+ pps->ppscount[2] = pps->capcount;
+ tcount &= pps->capth->th_counter->tc_counter_mask;
+ scale = (uint64_t)1 << 63;
+ scale /= pps->capth->th_counter->tc_frequency;
+ scale *= 2;
+ bt.sec = 0;
+ bt.frac = 0;
+ bintime_addx(&bt, scale * tcount);
+ bintime2timespec(&bt, &ts);
+ hardpps(tsp, ts.tv_nsec + 1000000000 * ts.tv_sec);
+ }
+#endif
+
+ /* Wakeup anyone sleeping in pps_fetch(). */
+ wakeup(pps);
+}
+
+/*
+ * Timecounters need to be updated every so often to prevent the hardware
+ * counter from overflowing. Updating also recalculates the cached values
+ * used by the get*() family of functions, so their precision depends on
+ * the update frequency.
+ */
+
+static int tc_tick;
+SYSCTL_INT(_kern_timecounter, OID_AUTO, tick, CTLFLAG_RD, &tc_tick, 0,
+ "Approximate number of hardclock ticks in a millisecond");
+
+void
+tc_ticktock(int cnt)
+{
+ static int count;
+
+ count += cnt;
+ if (count < tc_tick)
+ return;
+ count = 0;
+ tc_windup();
+}
+
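+/*
+ * Recompute the precision thresholds from tc_timepercentage (an
+ * allowed-error percentage).  Illustrative arithmetic for the code
+ * below (the percentage used here is hypothetical): with
+ * tc_timepercentage = 5, t = 104 / 5 = 20, tc_precexp =
+ * fls(20 + 10) - 1 = 4, and both thresholds derived from hz are
+ * shifted left by 4 bits, i.e. scaled up by 16.
+ */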
+static void __inline
+tc_adjprecision(void)
+{
+ int t;
+
+ if (tc_timepercentage > 0) {
+ t = (99 + tc_timepercentage) / tc_timepercentage;
+ tc_precexp = fls(t + (t >> 1)) - 1;
+ FREQ2BT(hz / tc_tick, &bt_timethreshold);
+ FREQ2BT(hz, &bt_tickthreshold);
+ bintime_shift(&bt_timethreshold, tc_precexp);
+ bintime_shift(&bt_tickthreshold, tc_precexp);
+ } else {
+ tc_precexp = 31;
+ bt_timethreshold.sec = INT_MAX;
+ bt_timethreshold.frac = ~(uint64_t)0;
+ bt_tickthreshold = bt_timethreshold;
+ }
+ sbt_timethreshold = bttosbt(bt_timethreshold);
+ sbt_tickthreshold = bttosbt(bt_tickthreshold);
+}
+
+static int
+sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS)
+{
+ int error, val;
+
+ val = tc_timepercentage;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ tc_timepercentage = val;
+ tc_adjprecision();
+ return (0);
+}
+
+static void
+inittimecounter(void *dummy)
+{
+ u_int p;
+ int tick_rate;
+
+ /*
+ * Set the initial timeout to
+ * max(1, <approx. number of hardclock ticks in a millisecond>).
+ * People should probably not use the sysctl to set the timeout
+ * to smaller than its initial value, since that value is the
+ * smallest reasonable one. If they want better timestamps they
+ * should use the non-"get"* functions.
+ */
+ if (hz > 1000)
+ tc_tick = (hz + 500) / 1000;
+ else
+ tc_tick = 1;
+ tc_adjprecision();
+ FREQ2BT(hz, &tick_bt);
+ tick_sbt = bttosbt(tick_bt);
+ tick_rate = hz / tc_tick;
+ FREQ2BT(tick_rate, &tc_tick_bt);
+ tc_tick_sbt = bttosbt(tc_tick_bt);
+ p = (tc_tick * 1000000) / hz;
+ printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000);
+
+#ifdef FFCLOCK
+ ffclock_init();
+#endif
+ /* warm up new timecounter (again) and get rolling. */
+ (void)timecounter->tc_get_timecount(timecounter);
+ (void)timecounter->tc_get_timecount(timecounter);
+ tc_windup();
+}
+
+SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_SECOND, inittimecounter, NULL);
+
+/* Cpu tick handling -------------------------------------------------*/
+
+static int cpu_tick_variable;
+static uint64_t cpu_tick_frequency;
+
+static uint64_t
+tc_cpu_ticks(void)
+{
+ static uint64_t base;
+ static unsigned last;
+ unsigned u;
+ struct timecounter *tc;
+
+ tc = timehands->th_counter;
+ u = tc->tc_get_timecount(tc) & tc->tc_counter_mask;
+ if (u < last)
+ base += (uint64_t)tc->tc_counter_mask + 1;
+ last = u;
+ return (u + base);
+}
+
+void
+cpu_tick_calibration(void)
+{
+ static time_t last_calib;
+
+ if (time_uptime != last_calib && !(time_uptime & 0xf)) {
+ cpu_tick_calibrate(0);
+ last_calib = time_uptime;
+ }
+}
+
+/*
+ * This function gets called every 16 seconds on only one designated
+ * CPU in the system from hardclock() via cpu_tick_calibration().
+ *
+ * Whenever the real time clock is stepped we get called with reset=1
+ * to make sure we handle suspend/resume and similar events correctly.
+ */
+
+static void
+cpu_tick_calibrate(int reset)
+{
+ static uint64_t c_last;
+ uint64_t c_this, c_delta;
+ static struct bintime t_last;
+ struct bintime t_this, t_delta;
+ uint32_t divi;
+
+ if (reset) {
+ /* The clock was stepped, abort & reset */
+ t_last.sec = 0;
+ return;
+ }
+
+ /* we don't calibrate fixed rate cputicks */
+ if (!cpu_tick_variable)
+ return;
+
+ getbinuptime(&t_this);
+ c_this = cpu_ticks();
+ if (t_last.sec != 0) {
+ c_delta = c_this - c_last;
+ t_delta = t_this;
+ bintime_sub(&t_delta, &t_last);
+ /*
+ * Headroom:
+ * 2^(64-20) / 16[s] =
+ * 2^(44) / 16[s] =
+ * 17.592.186.044.416 / 16 =
+ * 1.099.511.627.776 [Hz]
+ */
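+		/*
+		 * Worked example (hypothetical numbers): for a 2.4 GHz
+		 * cputicker over a 16 s interval, c_delta = 38.4e9 and
+		 * divi = 16 << 20, so (c_delta << 20) / divi =
+		 * c_delta / 16 = 2.4e9 Hz, well within the 64-bit
+		 * headroom computed above.
+		 */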
+ divi = t_delta.sec << 20;
+ divi |= t_delta.frac >> (64 - 20);
+ c_delta <<= 20;
+ c_delta /= divi;
+ if (c_delta > cpu_tick_frequency) {
+ if (0 && bootverbose)
+ printf("cpu_tick increased to %ju Hz\n",
+ c_delta);
+ cpu_tick_frequency = c_delta;
+ }
+ }
+ c_last = c_this;
+ t_last = t_this;
+}
+
+void
+set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var)
+{
+
+ if (func == NULL) {
+ cpu_ticks = tc_cpu_ticks;
+ } else {
+ cpu_tick_frequency = freq;
+ cpu_tick_variable = var;
+ cpu_ticks = func;
+ }
+}
+
+uint64_t
+cpu_tickrate(void)
+{
+
+ if (cpu_ticks == tc_cpu_ticks)
+ return (tc_getfrequency());
+ return (cpu_tick_frequency);
+}
+
+/*
+ * We need to be slightly careful converting cputicks to microseconds.
+ * There is plenty of margin in 64 bits of microseconds (half a million
+ * years) and in 64 bits at 4 GHz (146 years), but if we do a multiply
+ * before divide conversion (to retain precision) we find that the
+ * margin shrinks to 1.5 hours (one millionth of 146y).
+ * With a three-prong approach we never lose significant bits, no
+ * matter what the cputick rate and the length of the time interval are.
+ */
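+
+/*
+ * Illustrative numbers for the branches below (the 4 GHz rate is
+ * hypothetical): 2^64 / 10^6 ~= 1.8e13 ticks, which a 4 GHz cputicker
+ * reaches after roughly 1.3 hours, so beyond that the middle branch
+ * multiplies by only 10^3; 2^64 / 10^3 ~= 1.8e16 ticks (~53 days at
+ * 4 GHz) is where the first branch takes over and divides the rate by
+ * 10^6 instead of multiplying the tick count at all.
+ */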
+
+uint64_t
+cputick2usec(uint64_t tick)
+{
+
+ if (tick > 18446744073709551LL) /* floor(2^64 / 1000) */
+ return (tick / (cpu_tickrate() / 1000000LL));
+ else if (tick > 18446744073709LL) /* floor(2^64 / 1000000) */
+ return ((tick * 1000LL) / (cpu_tickrate() / 1000LL));
+ else
+ return ((tick * 1000000LL) / cpu_tickrate());
+}
+
+cpu_tick_f *cpu_ticks = tc_cpu_ticks;
+
+static int vdso_th_enable = 1;
+static int
+sysctl_fast_gettime(SYSCTL_HANDLER_ARGS)
+{
+ int old_vdso_th_enable, error;
+
+ old_vdso_th_enable = vdso_th_enable;
+ error = sysctl_handle_int(oidp, &old_vdso_th_enable, 0, req);
+ if (error != 0)
+ return (error);
+ vdso_th_enable = old_vdso_th_enable;
+ timekeep_push_vdso();
+ return (0);
+}
+SYSCTL_PROC(_kern_timecounter, OID_AUTO, fast_gettime,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, 0, sysctl_fast_gettime, "I", "Enable fast time of day");
+
+uint32_t
+tc_fill_vdso_timehands(struct vdso_timehands *vdso_th)
+{
+ struct timehands *th;
+ uint32_t enabled;
+
+ th = timehands;
+ vdso_th->th_algo = VDSO_TH_ALGO_1;
+ vdso_th->th_scale = th->th_scale;
+ vdso_th->th_offset_count = th->th_offset_count;
+ vdso_th->th_counter_mask = th->th_counter->tc_counter_mask;
+ vdso_th->th_offset = th->th_offset;
+ vdso_th->th_boottime = boottimebin;
+ enabled = cpu_fill_vdso_timehands(vdso_th);
+ if (!vdso_th_enable)
+ enabled = 0;
+ return (enabled);
+}
+
+#ifdef COMPAT_FREEBSD32
+uint32_t
+tc_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32)
+{
+ struct timehands *th;
+ uint32_t enabled;
+
+ th = timehands;
+ vdso_th32->th_algo = VDSO_TH_ALGO_1;
+ *(uint64_t *)&vdso_th32->th_scale[0] = th->th_scale;
+ vdso_th32->th_offset_count = th->th_offset_count;
+ vdso_th32->th_counter_mask = th->th_counter->tc_counter_mask;
+ vdso_th32->th_offset.sec = th->th_offset.sec;
+ *(uint64_t *)&vdso_th32->th_offset.frac[0] = th->th_offset.frac;
+ vdso_th32->th_boottime.sec = boottimebin.sec;
+ *(uint64_t *)&vdso_th32->th_boottime.frac[0] = boottimebin.frac;
+ enabled = cpu_fill_vdso_timehands32(vdso_th32);
+ if (!vdso_th_enable)
+ enabled = 0;
+ return (enabled);
+}
+#endif
diff --git a/sys/kern/kern_thr.c b/sys/kern/kern_thr.c
new file mode 100644
index 0000000..4270b41
--- /dev/null
+++ b/sys/kern/kern_thr.c
@@ -0,0 +1,555 @@
+/*-
+ * Copyright (c) 2003, Jeffrey Roberson <jeff@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_posix.h"
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/sysctl.h>
+#include <sys/smp.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+#include <sys/ucontext.h>
+#include <sys/thr.h>
+#include <sys/rtprio.h>
+#include <sys/umtx.h>
+#include <sys/limits.h>
+
+#include <machine/frame.h>
+
+#include <security/audit/audit.h>
+
+static SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0,
+ "thread allocation");
+
+static int max_threads_per_proc = 1500;
+SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
+ &max_threads_per_proc, 0, "Limit on threads per proc");
+
+static int max_threads_hits;
+SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD,
+ &max_threads_hits, 0, "kern.threads.max_threads_per_proc hit count");
+
+#ifdef COMPAT_FREEBSD32
+
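+/*
+ * Store a thread ID into a userland "long".  A 32-bit process running
+ * on a 64-bit kernel has a 32-bit long, so the store has to match the
+ * ABI of the current process.
+ */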
+static inline int
+suword_lwpid(void *addr, lwpid_t lwpid)
+{
+ int error;
+
+ if (SV_CURPROC_FLAG(SV_LP64))
+ error = suword(addr, lwpid);
+ else
+ error = suword32(addr, lwpid);
+ return (error);
+}
+
+#else
+#define suword_lwpid suword
+#endif
+
+static int create_thread(struct thread *td, mcontext_t *ctx,
+ void (*start_func)(void *), void *arg,
+ char *stack_base, size_t stack_size,
+ char *tls_base,
+ long *child_tid, long *parent_tid,
+ int flags, struct rtprio *rtp);
+
+/*
+ * System call interface.
+ */
+int
+sys_thr_create(struct thread *td, struct thr_create_args *uap)
+ /* ucontext_t *ctx, long *id, int flags */
+{
+ ucontext_t ctx;
+ int error;
+
+ if ((error = copyin(uap->ctx, &ctx, sizeof(ctx))))
+ return (error);
+
+ error = create_thread(td, &ctx.uc_mcontext, NULL, NULL,
+ NULL, 0, NULL, uap->id, NULL, uap->flags, NULL);
+ return (error);
+}
+
+int
+sys_thr_new(struct thread *td, struct thr_new_args *uap)
+ /* struct thr_param * */
+{
+ struct thr_param param;
+ int error;
+
+ if (uap->param_size < 0 || uap->param_size > sizeof(param))
+ return (EINVAL);
+ bzero(&param, sizeof(param));
+ if ((error = copyin(uap->param, &param, uap->param_size)))
+ return (error);
+ return (kern_thr_new(td, &param));
+}
+
+int
+kern_thr_new(struct thread *td, struct thr_param *param)
+{
+ struct rtprio rtp, *rtpp;
+ int error;
+
+ rtpp = NULL;
+ if (param->rtp != 0) {
+ error = copyin(param->rtp, &rtp, sizeof(struct rtprio));
+ if (error)
+ return (error);
+ rtpp = &rtp;
+ }
+ error = create_thread(td, NULL, param->start_func, param->arg,
+ param->stack_base, param->stack_size, param->tls_base,
+ param->child_tid, param->parent_tid, param->flags,
+ rtpp);
+ return (error);
+}
+
+static int
+create_thread(struct thread *td, mcontext_t *ctx,
+ void (*start_func)(void *), void *arg,
+ char *stack_base, size_t stack_size,
+ char *tls_base,
+ long *child_tid, long *parent_tid,
+ int flags, struct rtprio *rtp)
+{
+ stack_t stack;
+ struct thread *newtd;
+ struct proc *p;
+ int error;
+
+ p = td->td_proc;
+
+	/* There is a race condition here, but it is cheap. */
+ if (p->p_numthreads >= max_threads_per_proc) {
+ ++max_threads_hits;
+ return (EPROCLIM);
+ }
+
+ if (rtp != NULL) {
+ switch(rtp->type) {
+ case RTP_PRIO_REALTIME:
+ case RTP_PRIO_FIFO:
+ /* Only root can set scheduler policy */
+ if (priv_check(td, PRIV_SCHED_SETPOLICY) != 0)
+ return (EPERM);
+ if (rtp->prio > RTP_PRIO_MAX)
+ return (EINVAL);
+ break;
+ case RTP_PRIO_NORMAL:
+ rtp->prio = 0;
+ break;
+ default:
+ return (EINVAL);
+ }
+ }
+
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ error = racct_add(p, RACCT_NTHR, 1);
+ PROC_UNLOCK(td->td_proc);
+ if (error != 0)
+ return (EPROCLIM);
+#endif
+
+ /* Initialize our td */
+ newtd = thread_alloc(0);
+ if (newtd == NULL) {
+ error = ENOMEM;
+ goto fail;
+ }
+
+ cpu_set_upcall(newtd, td);
+
+ /*
+ * Try the copyout as soon as we allocate the td so we don't
+ * have to tear things down in a failure case below.
+	 * Here we copy out the tid to two places, one for the child and
+	 * one for the parent, because pthread can create a detached
+	 * thread; if the parent wants to safely access the child tid, it
+	 * has to provide its own storage, because the child thread may
+	 * exit quickly and the memory would be freed before the parent
+	 * thread could access it.
+ */
+ if ((child_tid != NULL &&
+ suword_lwpid(child_tid, newtd->td_tid)) ||
+ (parent_tid != NULL &&
+ suword_lwpid(parent_tid, newtd->td_tid))) {
+ thread_free(newtd);
+ error = EFAULT;
+ goto fail;
+ }
+
+ bzero(&newtd->td_startzero,
+ __rangeof(struct thread, td_startzero, td_endzero));
+ bcopy(&td->td_startcopy, &newtd->td_startcopy,
+ __rangeof(struct thread, td_startcopy, td_endcopy));
+ newtd->td_proc = td->td_proc;
+ newtd->td_ucred = crhold(td->td_ucred);
+
+ if (ctx != NULL) { /* old way to set user context */
+ error = set_mcontext(newtd, ctx);
+ if (error != 0) {
+ thread_free(newtd);
+ crfree(td->td_ucred);
+ goto fail;
+ }
+ } else {
+ /* Set up our machine context. */
+ stack.ss_sp = stack_base;
+ stack.ss_size = stack_size;
+ /* Set upcall address to user thread entry function. */
+ cpu_set_upcall_kse(newtd, start_func, arg, &stack);
+ /* Setup user TLS address and TLS pointer register. */
+ error = cpu_set_user_tls(newtd, tls_base);
+ if (error != 0) {
+ thread_free(newtd);
+ crfree(td->td_ucred);
+ goto fail;
+ }
+ }
+
+ PROC_LOCK(td->td_proc);
+ td->td_proc->p_flag |= P_HADTHREADS;
+ thread_link(newtd, p);
+ bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name));
+ thread_lock(td);
+ /* let the scheduler know about these things. */
+ sched_fork_thread(td, newtd);
+ thread_unlock(td);
+ if (P_SHOULDSTOP(p))
+ newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
+ PROC_UNLOCK(p);
+
+ tidhash_add(newtd);
+
+ thread_lock(newtd);
+ if (rtp != NULL) {
+ if (!(td->td_pri_class == PRI_TIMESHARE &&
+ rtp->type == RTP_PRIO_NORMAL)) {
+ rtp_to_pri(rtp, newtd);
+ sched_prio(newtd, newtd->td_user_pri);
+ } /* ignore timesharing class */
+ }
+ TD_SET_CAN_RUN(newtd);
+ sched_add(newtd, SRQ_BORING);
+ thread_unlock(newtd);
+
+ return (0);
+
+fail:
+#ifdef RACCT
+ PROC_LOCK(p);
+ racct_sub(p, RACCT_NTHR, 1);
+ PROC_UNLOCK(p);
+#endif
+ return (error);
+}
+
+int
+sys_thr_self(struct thread *td, struct thr_self_args *uap)
+ /* long *id */
+{
+ int error;
+
+ error = suword_lwpid(uap->id, (unsigned)td->td_tid);
+ if (error == -1)
+ return (EFAULT);
+ return (0);
+}
+
+int
+sys_thr_exit(struct thread *td, struct thr_exit_args *uap)
+ /* long *state */
+{
+ struct proc *p;
+
+ p = td->td_proc;
+
+ /* Signal userland that it can free the stack. */
+ if ((void *)uap->state != NULL) {
+ suword_lwpid(uap->state, 1);
+ kern_umtx_wake(td, uap->state, INT_MAX, 0);
+ }
+
+ rw_wlock(&tidhash_lock);
+
+ PROC_LOCK(p);
+
+ /*
+	 * If this is the last thread in the proc, just return; exit()
+	 * will actually be called in the trampoline when the system call
+	 * returns.  Otherwise tear this thread down and exit it here.
+ */
+ if (p->p_numthreads != 1) {
+ racct_sub(p, RACCT_NTHR, 1);
+ LIST_REMOVE(td, td_hash);
+ rw_wunlock(&tidhash_lock);
+ tdsigcleanup(td);
+ PROC_SLOCK(p);
+ thread_stopped(p);
+ thread_exit();
+ /* NOTREACHED */
+ }
+ PROC_UNLOCK(p);
+ rw_wunlock(&tidhash_lock);
+ return (0);
+}
+
+int
+sys_thr_kill(struct thread *td, struct thr_kill_args *uap)
+ /* long id, int sig */
+{
+ ksiginfo_t ksi;
+ struct thread *ttd;
+ struct proc *p;
+ int error;
+
+ p = td->td_proc;
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = uap->sig;
+ ksi.ksi_code = SI_LWP;
+ ksi.ksi_pid = p->p_pid;
+ ksi.ksi_uid = td->td_ucred->cr_ruid;
+ if (uap->id == -1) {
+ if (uap->sig != 0 && !_SIG_VALID(uap->sig)) {
+ error = EINVAL;
+ } else {
+ error = ESRCH;
+ PROC_LOCK(p);
+ FOREACH_THREAD_IN_PROC(p, ttd) {
+ if (ttd != td) {
+ error = 0;
+ if (uap->sig == 0)
+ break;
+ tdksignal(ttd, uap->sig, &ksi);
+ }
+ }
+ PROC_UNLOCK(p);
+ }
+ } else {
+ error = 0;
+ ttd = tdfind((lwpid_t)uap->id, p->p_pid);
+ if (ttd == NULL)
+ return (ESRCH);
+ if (uap->sig == 0)
+ ;
+ else if (!_SIG_VALID(uap->sig))
+ error = EINVAL;
+ else
+ tdksignal(ttd, uap->sig, &ksi);
+ PROC_UNLOCK(ttd->td_proc);
+ }
+ return (error);
+}
+
+int
+sys_thr_kill2(struct thread *td, struct thr_kill2_args *uap)
+ /* pid_t pid, long id, int sig */
+{
+ ksiginfo_t ksi;
+ struct thread *ttd;
+ struct proc *p;
+ int error;
+
+ AUDIT_ARG_SIGNUM(uap->sig);
+
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = uap->sig;
+ ksi.ksi_code = SI_LWP;
+ ksi.ksi_pid = td->td_proc->p_pid;
+ ksi.ksi_uid = td->td_ucred->cr_ruid;
+ if (uap->id == -1) {
+ if ((p = pfind(uap->pid)) == NULL)
+ return (ESRCH);
+ AUDIT_ARG_PROCESS(p);
+ error = p_cansignal(td, p, uap->sig);
+ if (error) {
+ PROC_UNLOCK(p);
+ return (error);
+ }
+ if (uap->sig != 0 && !_SIG_VALID(uap->sig)) {
+ error = EINVAL;
+ } else {
+ error = ESRCH;
+ FOREACH_THREAD_IN_PROC(p, ttd) {
+ if (ttd != td) {
+ error = 0;
+ if (uap->sig == 0)
+ break;
+ tdksignal(ttd, uap->sig, &ksi);
+ }
+ }
+ }
+ PROC_UNLOCK(p);
+ } else {
+ ttd = tdfind((lwpid_t)uap->id, uap->pid);
+ if (ttd == NULL)
+ return (ESRCH);
+ p = ttd->td_proc;
+ AUDIT_ARG_PROCESS(p);
+ error = p_cansignal(td, p, uap->sig);
+ if (uap->sig == 0)
+ ;
+ else if (!_SIG_VALID(uap->sig))
+ error = EINVAL;
+ else
+ tdksignal(ttd, uap->sig, &ksi);
+ PROC_UNLOCK(p);
+ }
+ return (error);
+}
+
+int
+sys_thr_suspend(struct thread *td, struct thr_suspend_args *uap)
+ /* const struct timespec *timeout */
+{
+ struct timespec ts, *tsp;
+ int error;
+
+ tsp = NULL;
+ if (uap->timeout != NULL) {
+ error = umtx_copyin_timeout(uap->timeout, &ts);
+ if (error != 0)
+ return (error);
+ tsp = &ts;
+ }
+
+ return (kern_thr_suspend(td, tsp));
+}
+
+int
+kern_thr_suspend(struct thread *td, struct timespec *tsp)
+{
+ struct proc *p = td->td_proc;
+ struct timeval tv;
+ int error = 0;
+ int timo = 0;
+
+ if (td->td_pflags & TDP_WAKEUP) {
+ td->td_pflags &= ~TDP_WAKEUP;
+ return (0);
+ }
+
+ if (tsp != NULL) {
+ if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
+ error = EWOULDBLOCK;
+ else {
+ TIMESPEC_TO_TIMEVAL(&tv, tsp);
+ timo = tvtohz(&tv);
+ }
+ }
+
+ PROC_LOCK(p);
+ if (error == 0 && (td->td_flags & TDF_THRWAKEUP) == 0)
+ error = msleep((void *)td, &p->p_mtx,
+ PCATCH, "lthr", timo);
+
+ if (td->td_flags & TDF_THRWAKEUP) {
+ thread_lock(td);
+ td->td_flags &= ~TDF_THRWAKEUP;
+ thread_unlock(td);
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ PROC_UNLOCK(p);
+ if (error == EWOULDBLOCK)
+ error = ETIMEDOUT;
+ else if (error == ERESTART) {
+ if (timo != 0)
+ error = EINTR;
+ }
+ return (error);
+}
+
+int
+sys_thr_wake(struct thread *td, struct thr_wake_args *uap)
+ /* long id */
+{
+ struct proc *p;
+ struct thread *ttd;
+
+ if (uap->id == td->td_tid) {
+ td->td_pflags |= TDP_WAKEUP;
+ return (0);
+ }
+
+ p = td->td_proc;
+ ttd = tdfind((lwpid_t)uap->id, p->p_pid);
+ if (ttd == NULL)
+ return (ESRCH);
+ thread_lock(ttd);
+ ttd->td_flags |= TDF_THRWAKEUP;
+ thread_unlock(ttd);
+ wakeup((void *)ttd);
+ PROC_UNLOCK(p);
+ return (0);
+}
+
+int
+sys_thr_set_name(struct thread *td, struct thr_set_name_args *uap)
+{
+ struct proc *p;
+ char name[MAXCOMLEN + 1];
+ struct thread *ttd;
+ int error;
+
+ error = 0;
+ name[0] = '\0';
+ if (uap->name != NULL) {
+ error = copyinstr(uap->name, name, sizeof(name),
+ NULL);
+ if (error)
+ return (error);
+ }
+ p = td->td_proc;
+ ttd = tdfind((lwpid_t)uap->id, p->p_pid);
+ if (ttd == NULL)
+ return (ESRCH);
+ strcpy(ttd->td_name, name);
+#ifdef KTR
+ sched_clear_tdname(ttd);
+#endif
+ PROC_UNLOCK(p);
+ return (error);
+}
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
new file mode 100644
index 0000000..5da4866
--- /dev/null
+++ b/sys/kern/kern_thread.c
@@ -0,0 +1,1054 @@
+/*-
+ * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice(s), this list of conditions and the following disclaimer as
+ * the first lines of this file unmodified other than the possible
+ * addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice(s), this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include "opt_witness.h"
+#include "opt_kdtrace.h"
+#include "opt_hwpmc_hooks.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rangelock.h>
+#include <sys/resourcevar.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/sched.h>
+#include <sys/sleepqueue.h>
+#include <sys/selinfo.h>
+#include <sys/turnstile.h>
+#include <sys/ktr.h>
+#include <sys/rwlock.h>
+#include <sys/umtx.h>
+#include <sys/cpuset.h>
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
+#include <security/audit/audit.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/uma.h>
+#include <sys/eventhandler.h>
+
+SDT_PROVIDER_DECLARE(proc);
+SDT_PROBE_DEFINE(proc, , , lwp_exit, lwp-exit);
+
+
+/*
+ * thread related storage.
+ */
+static uma_zone_t thread_zone;
+
+TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
+static struct mtx zombie_lock;
+MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN);
+
+static void thread_zombie(struct thread *);
+
+#define TID_BUFFER_SIZE 1024
+
+struct mtx tid_lock;
+static struct unrhdr *tid_unrhdr;
+static lwpid_t tid_buffer[TID_BUFFER_SIZE];
+static int tid_head, tid_tail;
+static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash");
+
+struct tidhashhead *tidhashtbl;
+u_long tidhash;
+struct rwlock tidhash_lock;
+
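+/*
+ * Thread IDs normally come from the unit-number allocator (tid_unrhdr),
+ * which hands out numbers above PID_MAX.  Freed TIDs are parked in a
+ * small FIFO ring (tid_buffer) so they are not reused immediately; a
+ * TID is only returned to the allocator once the ring wraps, and the
+ * ring also serves as a fallback source when the allocator is
+ * exhausted.
+ */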
+static lwpid_t
+tid_alloc(void)
+{
+ lwpid_t tid;
+
+ tid = alloc_unr(tid_unrhdr);
+ if (tid != -1)
+ return (tid);
+ mtx_lock(&tid_lock);
+ if (tid_head == tid_tail) {
+ mtx_unlock(&tid_lock);
+ return (-1);
+ }
+ tid = tid_buffer[tid_head];
+ tid_head = (tid_head + 1) % TID_BUFFER_SIZE;
+ mtx_unlock(&tid_lock);
+ return (tid);
+}
+
+static void
+tid_free(lwpid_t tid)
+{
+ lwpid_t tmp_tid = -1;
+
+ mtx_lock(&tid_lock);
+ if ((tid_tail + 1) % TID_BUFFER_SIZE == tid_head) {
+ tmp_tid = tid_buffer[tid_head];
+ tid_head = (tid_head + 1) % TID_BUFFER_SIZE;
+ }
+ tid_buffer[tid_tail] = tid;
+ tid_tail = (tid_tail + 1) % TID_BUFFER_SIZE;
+ mtx_unlock(&tid_lock);
+ if (tmp_tid != -1)
+ free_unr(tid_unrhdr, tmp_tid);
+}
+
+/*
+ * Prepare a thread for use.
+ */
+static int
+thread_ctor(void *mem, int size, void *arg, int flags)
+{
+ struct thread *td;
+
+ td = (struct thread *)mem;
+ td->td_state = TDS_INACTIVE;
+ td->td_oncpu = NOCPU;
+
+ td->td_tid = tid_alloc();
+
+ /*
+ * Note that td_critnest begins life as 1 because the thread is not
+ * running and is thereby implicitly waiting to be on the receiving
+ * end of a context switch.
+ */
+ td->td_critnest = 1;
+ td->td_lend_user_pri = PRI_MAX;
+ EVENTHANDLER_INVOKE(thread_ctor, td);
+#ifdef AUDIT
+ audit_thread_alloc(td);
+#endif
+ umtx_thread_alloc(td);
+ return (0);
+}
+
+/*
+ * Reclaim a thread after use.
+ */
+static void
+thread_dtor(void *mem, int size, void *arg)
+{
+ struct thread *td;
+
+ td = (struct thread *)mem;
+
+#ifdef INVARIANTS
+ /* Verify that this thread is in a safe state to free. */
+ switch (td->td_state) {
+ case TDS_INHIBITED:
+ case TDS_RUNNING:
+ case TDS_CAN_RUN:
+ case TDS_RUNQ:
+ /*
+ * We must never unlink a thread that is in one of
+ * these states, because it is currently active.
+ */
+ panic("bad state for thread unlinking");
+ /* NOTREACHED */
+ case TDS_INACTIVE:
+ break;
+ default:
+ panic("bad thread state");
+ /* NOTREACHED */
+ }
+#endif
+#ifdef AUDIT
+ audit_thread_free(td);
+#endif
+ /* Free all OSD associated to this thread. */
+ osd_thread_exit(td);
+
+ EVENTHANDLER_INVOKE(thread_dtor, td);
+ tid_free(td->td_tid);
+}
+
+/*
+ * Initialize type-stable parts of a thread (when newly created).
+ */
+static int
+thread_init(void *mem, int size, int flags)
+{
+ struct thread *td;
+
+ td = (struct thread *)mem;
+
+ td->td_sleepqueue = sleepq_alloc();
+ td->td_turnstile = turnstile_alloc();
+ td->td_rlqe = NULL;
+ EVENTHANDLER_INVOKE(thread_init, td);
+ td->td_sched = (struct td_sched *)&td[1];
+ umtx_thread_init(td);
+ td->td_kstack = 0;
+ return (0);
+}
+
+/*
+ * Tear down type-stable parts of a thread (just before being discarded).
+ */
+static void
+thread_fini(void *mem, int size)
+{
+ struct thread *td;
+
+ td = (struct thread *)mem;
+ EVENTHANDLER_INVOKE(thread_fini, td);
+ rlqentry_free(td->td_rlqe);
+ turnstile_free(td->td_turnstile);
+ sleepq_free(td->td_sleepqueue);
+ umtx_thread_fini(td);
+ seltdfini(td);
+}
+
+/*
+ * For a newly created process,
+ * link up all the structures and its initial threads etc.
+ * called from:
+ * {arch}/{arch}/machdep.c ia64_init(), init386() etc.
+ * proc_dtor() (should go away)
+ * proc_init()
+ */
+void
+proc_linkup0(struct proc *p, struct thread *td)
+{
+ TAILQ_INIT(&p->p_threads); /* all threads in proc */
+ proc_linkup(p, td);
+}
+
+void
+proc_linkup(struct proc *p, struct thread *td)
+{
+
+ sigqueue_init(&p->p_sigqueue, p);
+ p->p_ksi = ksiginfo_alloc(1);
+ if (p->p_ksi != NULL) {
+ /* XXX p_ksi may be null if ksiginfo zone is not ready */
+ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS;
+ }
+ LIST_INIT(&p->p_mqnotifier);
+ p->p_numthreads = 0;
+ thread_link(td, p);
+}
+
+/*
+ * Initialize global thread allocation resources.
+ */
+void
+threadinit(void)
+{
+
+ mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF);
+
+ /*
+ * pid_max cannot be greater than PID_MAX.
+	 * Leave one number for thread0.
+ */
+ tid_unrhdr = new_unrhdr(PID_MAX + 2, INT_MAX, &tid_lock);
+
+ thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
+ thread_ctor, thread_dtor, thread_init, thread_fini,
+ 16 - 1, 0);
+ tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash);
+ rw_init(&tidhash_lock, "tidhash");
+}
+
+/*
+ * Place an unused thread on the zombie list.
+ * Use the slpq as that must be unused by now.
+ */
+void
+thread_zombie(struct thread *td)
+{
+ mtx_lock_spin(&zombie_lock);
+ TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq);
+ mtx_unlock_spin(&zombie_lock);
+}
+
+/*
+ * Release a thread that has exited after cpu_throw().
+ */
+void
+thread_stash(struct thread *td)
+{
+ atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1);
+ thread_zombie(td);
+}
+
+/*
+ * Reap zombie resources.
+ */
+void
+thread_reap(void)
+{
+ struct thread *td_first, *td_next;
+
+ /*
+	 * Don't even bother to lock if there are none at this instant;
+	 * we really don't care about the next instant.
+ */
+ if (!TAILQ_EMPTY(&zombie_threads)) {
+ mtx_lock_spin(&zombie_lock);
+ td_first = TAILQ_FIRST(&zombie_threads);
+ if (td_first)
+ TAILQ_INIT(&zombie_threads);
+ mtx_unlock_spin(&zombie_lock);
+ while (td_first) {
+ td_next = TAILQ_NEXT(td_first, td_slpq);
+ if (td_first->td_ucred)
+ crfree(td_first->td_ucred);
+ thread_free(td_first);
+ td_first = td_next;
+ }
+ }
+}
+
+/*
+ * Allocate a thread.
+ */
+struct thread *
+thread_alloc(int pages)
+{
+ struct thread *td;
+
+ thread_reap(); /* check if any zombies to get */
+
+ td = (struct thread *)uma_zalloc(thread_zone, M_WAITOK);
+ KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack"));
+ if (!vm_thread_new(td, pages)) {
+ uma_zfree(thread_zone, td);
+ return (NULL);
+ }
+ cpu_thread_alloc(td);
+ return (td);
+}
+
+int
+thread_alloc_stack(struct thread *td, int pages)
+{
+
+ KASSERT(td->td_kstack == 0,
+ ("thread_alloc_stack called on a thread with kstack"));
+ if (!vm_thread_new(td, pages))
+ return (0);
+ cpu_thread_alloc(td);
+ return (1);
+}
+
+/*
+ * Deallocate a thread.
+ */
+void
+thread_free(struct thread *td)
+{
+
+ lock_profile_thread_exit(td);
+ if (td->td_cpuset)
+ cpuset_rel(td->td_cpuset);
+ td->td_cpuset = NULL;
+ cpu_thread_free(td);
+ if (td->td_kstack != 0)
+ vm_thread_dispose(td);
+ uma_zfree(thread_zone, td);
+}
+
+/*
+ * Discard the current thread and exit from its context.
+ * Always called with scheduler locked.
+ *
+ * Because we can't free a thread while we're operating under its context,
+ * push the current thread into our CPU's deadthread holder. This means
+ * we needn't worry about someone else grabbing our context before we
+ * do a cpu_throw().
+ */
+void
+thread_exit(void)
+{
+ uint64_t runtime, new_switchtime;
+ struct thread *td;
+ struct thread *td2;
+ struct proc *p;
+ int wakeup_swapper;
+
+ td = curthread;
+ p = td->td_proc;
+
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ KASSERT(p != NULL, ("thread exiting without a process"));
+ CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td,
+ (long)p->p_pid, td->td_name);
+ KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending"));
+
+#ifdef AUDIT
+ AUDIT_SYSCALL_EXIT(0, td);
+#endif
+ umtx_thread_exit(td);
+ /*
+	 * Drop FPU & debug register state storage, or any other
+	 * architecture-specific resources that would not be present
+	 * in a new, untouched process.
+ */
+ cpu_thread_exit(td); /* XXXSMP */
+
+ /*
+	 * The last thread is left attached to the process so that the
+	 * whole bundle gets recycled.  Skip all this stuff if we never
+	 * had threads.  EXIT clears all signs of other threads when it
+	 * goes to single threading, so the last thread always takes
+	 * the short path.
+ */
+ if (p->p_flag & P_HADTHREADS) {
+ if (p->p_numthreads > 1) {
+ thread_unlink(td);
+ td2 = FIRST_THREAD_IN_PROC(p);
+ sched_exit_thread(td2, td);
+
+ /*
+ * The test below is NOT true if we are the
+ * sole exiting thread. P_STOPPED_SINGLE is unset
+ * in exit1() after it is the only survivor.
+ */
+ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
+ if (p->p_numthreads == p->p_suspcount) {
+ thread_lock(p->p_singlethread);
+ wakeup_swapper = thread_unsuspend_one(
+ p->p_singlethread);
+ thread_unlock(p->p_singlethread);
+ if (wakeup_swapper)
+ kick_proc0();
+ }
+ }
+
+ atomic_add_int(&td->td_proc->p_exitthreads, 1);
+ PCPU_SET(deadthread, td);
+ } else {
+ /*
+ * The last thread is exiting.. but not through exit()
+ */
+ panic ("thread_exit: Last thread exiting on its own");
+ }
+ }
+#ifdef HWPMC_HOOKS
+ /*
+ * If this thread is part of a process that is being tracked by hwpmc(4),
+ * inform the module of the thread's impending exit.
+ */
+ if (PMC_PROC_IS_USING_PMCS(td->td_proc))
+ PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
+#endif
+ PROC_UNLOCK(p);
+
+ /* Do the same timestamp bookkeeping that mi_switch() would do. */
+ new_switchtime = cpu_ticks();
+ runtime = new_switchtime - PCPU_GET(switchtime);
+ td->td_runtime += runtime;
+ td->td_incruntime += runtime;
+ PCPU_SET(switchtime, new_switchtime);
+ PCPU_SET(switchticks, ticks);
+ PCPU_INC(cnt.v_swtch);
+
+ /* Save our resource usage in our process. */
+ td->td_ru.ru_nvcsw++;
+ ruxagg(p, td);
+ rucollect(&p->p_ru, &td->td_ru);
+
+ thread_lock(td);
+ PROC_SUNLOCK(p);
+ td->td_state = TDS_INACTIVE;
+#ifdef WITNESS
+ witness_thread_exit(td);
+#endif
+ CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td);
+ sched_throw(td);
+ panic("I'm a teapot!");
+ /* NOTREACHED */
+}
+
+/*
+ * Do any thread specific cleanups that may be needed in wait()
+ * called with Giant, proc and schedlock not held.
+ */
+void
+thread_wait(struct proc *p)
+{
+ struct thread *td;
+
+ mtx_assert(&Giant, MA_NOTOWNED);
+ KASSERT((p->p_numthreads == 1), ("Multiple threads in wait1()"));
+ td = FIRST_THREAD_IN_PROC(p);
+ /* Lock the last thread so we spin until it exits cpu_throw(). */
+ thread_lock(td);
+ thread_unlock(td);
+ /* Wait for any remaining threads to exit cpu_throw(). */
+ while (p->p_exitthreads)
+ sched_relinquish(curthread);
+ lock_profile_thread_exit(td);
+ cpuset_rel(td->td_cpuset);
+ td->td_cpuset = NULL;
+ cpu_thread_clean(td);
+ crfree(td->td_ucred);
+ thread_reap(); /* check for zombie threads etc. */
+}
+
+/*
+ * Link a thread to a process.
+ * Set up anything that needs to be initialized for it to
+ * be used by the process.
+ */
+void
+thread_link(struct thread *td, struct proc *p)
+{
+
+ /*
+ * XXX This can't be enabled because it's called for proc0 before
+ * its lock has been created.
+ * PROC_LOCK_ASSERT(p, MA_OWNED);
+ */
+ td->td_state = TDS_INACTIVE;
+ td->td_proc = p;
+ td->td_flags = TDF_INMEM;
+
+ LIST_INIT(&td->td_contested);
+ LIST_INIT(&td->td_lprof[0]);
+ LIST_INIT(&td->td_lprof[1]);
+ sigqueue_init(&td->td_sigqueue, p);
+ callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
+ TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
+ p->p_numthreads++;
+}
+
+/*
+ * Convert a process with one thread to an unthreaded process.
+ */
+void
+thread_unthread(struct thread *td)
+{
+ struct proc *p = td->td_proc;
+
+ KASSERT((p->p_numthreads == 1), ("Unthreading with >1 threads"));
+ p->p_flag &= ~P_HADTHREADS;
+}
+
+/*
+ * Called from:
+ * thread_exit()
+ */
+void
+thread_unlink(struct thread *td)
+{
+ struct proc *p = td->td_proc;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ TAILQ_REMOVE(&p->p_threads, td, td_plist);
+ p->p_numthreads--;
+ /* could clear a few other things here */
+ /* Must NOT clear links to proc! */
+}
+
+static int
+calc_remaining(struct proc *p, int mode)
+{
+ int remaining;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ if (mode == SINGLE_EXIT)
+ remaining = p->p_numthreads;
+ else if (mode == SINGLE_BOUNDARY)
+ remaining = p->p_numthreads - p->p_boundary_count;
+ else if (mode == SINGLE_NO_EXIT)
+ remaining = p->p_numthreads - p->p_suspcount;
+ else
+ panic("calc_remaining: wrong mode %d", mode);
+ return (remaining);
+}
+
+/*
+ * Enforce single-threading.
+ *
+ * Returns 1 if the caller must abort (another thread is waiting to
+ * exit the process or similar). Process is locked!
+ * Returns 0 when you are successfully the only thread running.
+ * A process has successfully single-threaded in the suspend mode when
+ * there are no threads in user mode.  Threads in the kernel must be
+ * allowed to continue until they get to the user boundary.  They may even
+ * copy out their return values and data before suspending.  They may,
+ * however, be accelerated in reaching the user boundary as we will wake up
+ * any sleeping threads that are interruptible (PCATCH).
+ */
+int
+thread_single(int mode)
+{
+ struct thread *td;
+ struct thread *td2;
+ struct proc *p;
+ int remaining, wakeup_swapper;
+
+ td = curthread;
+ p = td->td_proc;
+ mtx_assert(&Giant, MA_NOTOWNED);
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ if ((p->p_flag & P_HADTHREADS) == 0)
+ return (0);
+
+ /* Is someone already single threading? */
+ if (p->p_singlethread != NULL && p->p_singlethread != td)
+ return (1);
+
+ if (mode == SINGLE_EXIT) {
+ p->p_flag |= P_SINGLE_EXIT;
+ p->p_flag &= ~P_SINGLE_BOUNDARY;
+ } else {
+ p->p_flag &= ~P_SINGLE_EXIT;
+ if (mode == SINGLE_BOUNDARY)
+ p->p_flag |= P_SINGLE_BOUNDARY;
+ else
+ p->p_flag &= ~P_SINGLE_BOUNDARY;
+ }
+ p->p_flag |= P_STOPPED_SINGLE;
+ PROC_SLOCK(p);
+ p->p_singlethread = td;
+ remaining = calc_remaining(p, mode);
+ while (remaining != 1) {
+ if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE)
+ goto stopme;
+ wakeup_swapper = 0;
+ FOREACH_THREAD_IN_PROC(p, td2) {
+ if (td2 == td)
+ continue;
+ thread_lock(td2);
+ td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
+ if (TD_IS_INHIBITED(td2)) {
+ switch (mode) {
+ case SINGLE_EXIT:
+ if (TD_IS_SUSPENDED(td2))
+ wakeup_swapper |=
+ thread_unsuspend_one(td2);
+ if (TD_ON_SLEEPQ(td2) &&
+ (td2->td_flags & TDF_SINTR))
+ wakeup_swapper |=
+ sleepq_abort(td2, EINTR);
+ break;
+ case SINGLE_BOUNDARY:
+ if (TD_IS_SUSPENDED(td2) &&
+ !(td2->td_flags & TDF_BOUNDARY))
+ wakeup_swapper |=
+ thread_unsuspend_one(td2);
+ if (TD_ON_SLEEPQ(td2) &&
+ (td2->td_flags & TDF_SINTR))
+ wakeup_swapper |=
+ sleepq_abort(td2, ERESTART);
+ break;
+ case SINGLE_NO_EXIT:
+ if (TD_IS_SUSPENDED(td2) &&
+ !(td2->td_flags & TDF_BOUNDARY))
+ wakeup_swapper |=
+ thread_unsuspend_one(td2);
+ if (TD_ON_SLEEPQ(td2) &&
+ (td2->td_flags & TDF_SINTR))
+ wakeup_swapper |=
+ sleepq_abort(td2, ERESTART);
+ break;
+ default:
+ break;
+ }
+ }
+#ifdef SMP
+ else if (TD_IS_RUNNING(td2) && td != td2) {
+ forward_signal(td2);
+ }
+#endif
+ thread_unlock(td2);
+ }
+ if (wakeup_swapper)
+ kick_proc0();
+ remaining = calc_remaining(p, mode);
+
+ /*
+ * Maybe we suspended some threads.. was it enough?
+ */
+ if (remaining == 1)
+ break;
+
+stopme:
+ /*
+ * Wake us up when everyone else has suspended.
+	 * In the meantime we suspend as well.
+ */
+ thread_suspend_switch(td);
+ remaining = calc_remaining(p, mode);
+ }
+ if (mode == SINGLE_EXIT) {
+ /*
+ * We have gotten rid of all the other threads and we
+ * are about to either exit or exec. In either case,
+ * we try our utmost to revert to being a non-threaded
+ * process.
+ */
+ p->p_singlethread = NULL;
+ p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT);
+ thread_unthread(td);
+ }
+ PROC_SUNLOCK(p);
+ return (0);
+}
+
+/*
+ * Called in from locations that can safely check to see
+ * whether we have to suspend or at least throttle for a
+ * single-thread event (e.g. fork).
+ *
+ * Such locations include userret().
+ * If the "return_instead" argument is non-zero, the thread must be able to
+ * accept 0 (caller may continue), or 1 (caller must abort) as a result.
+ *
+ * The 'return_instead' argument tells the function if it may do a
+ * thread_exit() or suspend, or whether the caller must abort and back
+ * out instead.
+ *
+ * If the thread that set the single_threading request has set the
+ * P_SINGLE_EXIT bit in the process flags then this call will never return
+ * if 'return_instead' is false, but will exit.
+ *
+ * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
+ *---------------+--------------------+---------------------
+ * 0 | returns 0 | returns 0 or 1
+ * | when ST ends | immediately
+ *---------------+--------------------+---------------------
+ * 1 | thread exits | returns 1
+ * | | immediately
+ * 0 = thread_exit() or suspension ok,
+ * other = return error instead of stopping the thread.
+ *
+ * While a full suspension is under effect, even a single threading
+ * thread would be suspended if it made this call (but it shouldn't).
+ * This call should only be made from places where
+ * thread_exit() would be safe as that may be the outcome unless
+ * return_instead is set.
+ */
+int
+thread_suspend_check(int return_instead)
+{
+ struct thread *td;
+ struct proc *p;
+ int wakeup_swapper;
+
+ td = curthread;
+ p = td->td_proc;
+ mtx_assert(&Giant, MA_NOTOWNED);
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ while (P_SHOULDSTOP(p) ||
+ ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
+ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
+ KASSERT(p->p_singlethread != NULL,
+ ("singlethread not set"));
+ /*
+ * The only suspension in action is a
+ * single-threading. Single threader need not stop.
+ * XXX Should be safe to access unlocked
+ * as it can only be set to be true by us.
+ */
+ if (p->p_singlethread == td)
+ return (0); /* Exempt from stopping. */
+ }
+ if ((p->p_flag & P_SINGLE_EXIT) && return_instead)
+ return (EINTR);
+
+ /* Should we goto user boundary if we didn't come from there? */
+ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE &&
+ (p->p_flag & P_SINGLE_BOUNDARY) && return_instead)
+ return (ERESTART);
+
+ /*
+ * Ignore suspend requests for stop signals if they
+ * are deferred.
+ */
+ if (P_SHOULDSTOP(p) == P_STOPPED_SIG &&
+ td->td_flags & TDF_SBDRY) {
+ KASSERT(return_instead,
+ ("TDF_SBDRY set for unsafe thread_suspend_check"));
+ return (0);
+ }
+
+ /*
+ * If the process is waiting for us to exit,
+ * this thread should just suicide.
+ * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
+ */
+ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
+ PROC_UNLOCK(p);
+ tidhash_remove(td);
+ PROC_LOCK(p);
+ tdsigcleanup(td);
+ PROC_SLOCK(p);
+ thread_stopped(p);
+ thread_exit();
+ }
+
+ PROC_SLOCK(p);
+ thread_stopped(p);
+ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
+ if (p->p_numthreads == p->p_suspcount + 1) {
+ thread_lock(p->p_singlethread);
+ wakeup_swapper =
+ thread_unsuspend_one(p->p_singlethread);
+ thread_unlock(p->p_singlethread);
+ if (wakeup_swapper)
+ kick_proc0();
+ }
+ }
+ PROC_UNLOCK(p);
+ thread_lock(td);
+ /*
+ * When a thread suspends, it just
+ * gets taken off all queues.
+ */
+ thread_suspend_one(td);
+ if (return_instead == 0) {
+ p->p_boundary_count++;
+ td->td_flags |= TDF_BOUNDARY;
+ }
+ PROC_SUNLOCK(p);
+ mi_switch(SW_INVOL | SWT_SUSPEND, NULL);
+ if (return_instead == 0)
+ td->td_flags &= ~TDF_BOUNDARY;
+ thread_unlock(td);
+ PROC_LOCK(p);
+ if (return_instead == 0) {
+ PROC_SLOCK(p);
+ p->p_boundary_count--;
+ PROC_SUNLOCK(p);
+ }
+ }
+ return (0);
+}
+
+void
+thread_suspend_switch(struct thread *td)
+{
+ struct proc *p;
+
+ p = td->td_proc;
+ KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ /*
+ * We implement thread_suspend_one in stages here to avoid
+ * dropping the proc lock while the thread lock is owned.
+ */
+ thread_stopped(p);
+ p->p_suspcount++;
+ PROC_UNLOCK(p);
+ thread_lock(td);
+ td->td_flags &= ~TDF_NEEDSUSPCHK;
+ TD_SET_SUSPENDED(td);
+ sched_sleep(td, 0);
+ PROC_SUNLOCK(p);
+ DROP_GIANT();
+ mi_switch(SW_VOL | SWT_SUSPEND, NULL);
+ thread_unlock(td);
+ PICKUP_GIANT();
+ PROC_LOCK(p);
+ PROC_SLOCK(p);
+}
+
+void
+thread_suspend_one(struct thread *td)
+{
+ struct proc *p = td->td_proc;
+
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
+ p->p_suspcount++;
+ td->td_flags &= ~TDF_NEEDSUSPCHK;
+ TD_SET_SUSPENDED(td);
+ sched_sleep(td, 0);
+}
+
+int
+thread_unsuspend_one(struct thread *td)
+{
+ struct proc *p = td->td_proc;
+
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended"));
+ TD_CLR_SUSPENDED(td);
+ p->p_suspcount--;
+ return (setrunnable(td));
+}
+
+/*
+ * Allow all threads blocked by single threading to continue running.
+ */
+void
+thread_unsuspend(struct proc *p)
+{
+ struct thread *td;
+ int wakeup_swapper;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ PROC_SLOCK_ASSERT(p, MA_OWNED);
+ wakeup_swapper = 0;
+ if (!P_SHOULDSTOP(p)) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (TD_IS_SUSPENDED(td)) {
+ wakeup_swapper |= thread_unsuspend_one(td);
+ }
+ thread_unlock(td);
+ }
+ } else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) &&
+ (p->p_numthreads == p->p_suspcount)) {
+ /*
+ * Stopping everything also did the job for the single
+ * threading request. Now we've downgraded to single-threaded,
+ * let it continue.
+ */
+ thread_lock(p->p_singlethread);
+ wakeup_swapper = thread_unsuspend_one(p->p_singlethread);
+ thread_unlock(p->p_singlethread);
+ }
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+/*
+ * End the single-threading mode.
+ */
+void
+thread_single_end(void)
+{
+ struct thread *td;
+ struct proc *p;
+ int wakeup_swapper;
+
+ td = curthread;
+ p = td->td_proc;
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY);
+ PROC_SLOCK(p);
+ p->p_singlethread = NULL;
+ wakeup_swapper = 0;
+ /*
+ * If there are other threads they may now run,
+ * unless of course there is a blanket 'stop order'
+ * on the process. The single threader must be allowed
+ * to continue however as this is a bad place to stop.
+ */
+ if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (TD_IS_SUSPENDED(td)) {
+ wakeup_swapper |= thread_unsuspend_one(td);
+ }
+ thread_unlock(td);
+ }
+ }
+ PROC_SUNLOCK(p);
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+struct thread *
+thread_find(struct proc *p, lwpid_t tid)
+{
+ struct thread *td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (td->td_tid == tid)
+ break;
+ }
+ return (td);
+}
+
+/* Locate a thread by number; return with proc lock held. */
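+/*
+ * Lookups that have to scan more than RUN_THRESH entries move the
+ * found thread to the head of its hash chain (when the lock can be
+ * upgraded), so that frequently looked-up threads stay cheap to find.
+ */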
+struct thread *
+tdfind(lwpid_t tid, pid_t pid)
+{
+#define RUN_THRESH 16
+ struct thread *td;
+ int run = 0;
+
+ rw_rlock(&tidhash_lock);
+ LIST_FOREACH(td, TIDHASH(tid), td_hash) {
+ if (td->td_tid == tid) {
+ if (pid != -1 && td->td_proc->p_pid != pid) {
+ td = NULL;
+ break;
+ }
+ PROC_LOCK(td->td_proc);
+ if (td->td_proc->p_state == PRS_NEW) {
+ PROC_UNLOCK(td->td_proc);
+ td = NULL;
+ break;
+ }
+ if (run > RUN_THRESH) {
+ if (rw_try_upgrade(&tidhash_lock)) {
+ LIST_REMOVE(td, td_hash);
+ LIST_INSERT_HEAD(TIDHASH(td->td_tid),
+ td, td_hash);
+ rw_wunlock(&tidhash_lock);
+ return (td);
+ }
+ }
+ break;
+ }
+ run++;
+ }
+ rw_runlock(&tidhash_lock);
+ return (td);
+}
+
+void
+tidhash_add(struct thread *td)
+{
+ rw_wlock(&tidhash_lock);
+ LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
+ rw_wunlock(&tidhash_lock);
+}
+
+void
+tidhash_remove(struct thread *td)
+{
+ rw_wlock(&tidhash_lock);
+ LIST_REMOVE(td, td_hash);
+ rw_wunlock(&tidhash_lock);
+}
diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c
new file mode 100644
index 0000000..3aaed60
--- /dev/null
+++ b/sys/kern/kern_time.c
@@ -0,0 +1,1648 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_time.c 8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/clock.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/eventhandler.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/sleepqueue.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+#include <sys/time.h>
+#include <sys/timers.h>
+#include <sys/timetc.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
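+/*
+ * CPU-time clock IDs are encoded with the high bit set: CPUCLOCK_BIT
+ * marks a CPU-time clock, CPUCLOCK_PROCESS_BIT selects a process (rather
+ * than thread) clock, and the remaining bits carry the pid or tid.
+ */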
+#define MAX_CLOCKS (CLOCK_MONOTONIC+1)
+#define CPUCLOCK_BIT 0x80000000
+#define CPUCLOCK_PROCESS_BIT 0x40000000
+#define CPUCLOCK_ID_MASK (~(CPUCLOCK_BIT|CPUCLOCK_PROCESS_BIT))
+#define MAKE_THREAD_CPUCLOCK(tid) (CPUCLOCK_BIT|(tid))
+#define MAKE_PROCESS_CPUCLOCK(pid) \
+ (CPUCLOCK_BIT|CPUCLOCK_PROCESS_BIT|(pid))
+
+static struct kclock posix_clocks[MAX_CLOCKS];
+static uma_zone_t itimer_zone = NULL;
+
+/*
+ * Time of day and interval timer support.
+ *
+ * These routines provide the kernel entry points to get and set
+ * the time-of-day and per-process interval timers. Subroutines
+ * here provide support for adding and subtracting timeval structures
+ * and decrementing interval timers, optionally reloading the interval
+ * timers when they expire.
+ */
+
+static int settime(struct thread *, struct timeval *);
+static void timevalfix(struct timeval *);
+
+static void itimer_start(void);
+static int itimer_init(void *, int, int);
+static void itimer_fini(void *, int);
+static void itimer_enter(struct itimer *);
+static void itimer_leave(struct itimer *);
+static struct itimer *itimer_find(struct proc *, int);
+static void itimers_alloc(struct proc *);
+static void itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp);
+static void itimers_event_hook_exit(void *arg, struct proc *p);
+static int realtimer_create(struct itimer *);
+static int realtimer_gettime(struct itimer *, struct itimerspec *);
+static int realtimer_settime(struct itimer *, int,
+ struct itimerspec *, struct itimerspec *);
+static int realtimer_delete(struct itimer *);
+static void realtimer_clocktime(clockid_t, struct timespec *);
+static void realtimer_expire(void *);
+
+int register_posix_clock(int, struct kclock *);
+void itimer_fire(struct itimer *it);
+int itimespecfix(struct timespec *ts);
+
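+/*
+ * Dispatch a timer operation to the kclock backend registered for the
+ * given clock id (see register_posix_clock()).
+ */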
+#define CLOCK_CALL(clock, call, arglist) \
+ ((*posix_clocks[clock].call) arglist)
+
+SYSINIT(posix_timer, SI_SUB_P1003_1B, SI_ORDER_FIRST+4, itimer_start, NULL);
+
+
+static int
+settime(struct thread *td, struct timeval *tv)
+{
+ struct timeval delta, tv1, tv2;
+ static struct timeval maxtime, laststep;
+ struct timespec ts;
+ int s;
+
+ s = splclock();
+ microtime(&tv1);
+ delta = *tv;
+ timevalsub(&delta, &tv1);
+
+ /*
+ * If the system is secure, we do not allow the time to be
+ * set to a value earlier than 1 second less than the highest
+ * time we have yet seen. The worst a miscreant can do in
+ * this circumstance is "freeze" time. He couldn't go
+ * back to the past.
+ *
+ * We similarly do not allow the clock to be stepped more
+ * than one second, nor more than once per second. This allows
+ * a miscreant to make the clock march double-time, but no worse.
+ */
+ if (securelevel_gt(td->td_ucred, 1) != 0) {
+ if (delta.tv_sec < 0 || delta.tv_usec < 0) {
+ /*
+ * Update maxtime to latest time we've seen.
+ */
+ if (tv1.tv_sec > maxtime.tv_sec)
+ maxtime = tv1;
+ tv2 = *tv;
+ timevalsub(&tv2, &maxtime);
+ if (tv2.tv_sec < -1) {
+ tv->tv_sec = maxtime.tv_sec - 1;
+ printf("Time adjustment clamped to -1 second\n");
+ }
+ } else {
+ if (tv1.tv_sec == laststep.tv_sec) {
+ splx(s);
+ return (EPERM);
+ }
+ if (delta.tv_sec > 1) {
+ tv->tv_sec = tv1.tv_sec + 1;
+ printf("Time adjustment clamped to +1 second\n");
+ }
+ laststep = *tv;
+ }
+ }
+
+ ts.tv_sec = tv->tv_sec;
+ ts.tv_nsec = tv->tv_usec * 1000;
+ mtx_lock(&Giant);
+ tc_setclock(&ts);
+ resettodr();
+ mtx_unlock(&Giant);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct clock_getcpuclockid2_args {
+ id_t id;
+ int which;
+ clockid_t *clock_id;
+};
+#endif
+/* ARGSUSED */
+int
+sys_clock_getcpuclockid2(struct thread *td, struct clock_getcpuclockid2_args *uap)
+{
+ clockid_t clk_id;
+ int error;
+
+ error = kern_clock_getcpuclockid2(td, uap->id, uap->which, &clk_id);
+ if (error == 0)
+ error = copyout(&clk_id, uap->clock_id, sizeof(clockid_t));
+ return (error);
+}
+
+int
+kern_clock_getcpuclockid2(struct thread *td, id_t id, int which,
+ clockid_t *clk_id)
+{
+ struct proc *p;
+ pid_t pid;
+ lwpid_t tid;
+ int error;
+
+ switch (which) {
+ case CPUCLOCK_WHICH_PID:
+ if (id != 0) {
+ p = pfind(id);
+ if (p == NULL)
+ return (ESRCH);
+ error = p_cansee(td, p);
+ PROC_UNLOCK(p);
+ if (error != 0)
+ return (error);
+ pid = id;
+ } else {
+ pid = td->td_proc->p_pid;
+ }
+ *clk_id = MAKE_PROCESS_CPUCLOCK(pid);
+ return (0);
+ case CPUCLOCK_WHICH_TID:
+ tid = id == 0 ? td->td_tid : id;
+ *clk_id = MAKE_THREAD_CPUCLOCK(tid);
+ return (0);
+ default:
+ return (EINVAL);
+ }
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct clock_gettime_args {
+ clockid_t clock_id;
+ struct timespec *tp;
+};
+#endif
+/* ARGSUSED */
+int
+sys_clock_gettime(struct thread *td, struct clock_gettime_args *uap)
+{
+ struct timespec ats;
+ int error;
+
+ error = kern_clock_gettime(td, uap->clock_id, &ats);
+ if (error == 0)
+ error = copyout(&ats, uap->tp, sizeof(ats));
+
+ return (error);
+}
+
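+/*
+ * Convert a raw cputick count into a timespec, going through
+ * microseconds via cputick2usec().
+ */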
+static inline void
+cputick2timespec(uint64_t runtime, struct timespec *ats)
+{
+ runtime = cputick2usec(runtime);
+ ats->tv_sec = runtime / 1000000;
+ ats->tv_nsec = runtime % 1000000 * 1000;
+}
+
+static void
+get_thread_cputime(struct thread *targettd, struct timespec *ats)
+{
+ uint64_t runtime, curtime, switchtime;
+
+ if (targettd == NULL) { /* current thread */
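+ /*
+ * Sample the per-CPU switch time and the current tick count
+ * in a critical section so the time accumulated since the
+ * last context switch can be added to td_runtime below.
+ */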
+ critical_enter();
+ switchtime = PCPU_GET(switchtime);
+ curtime = cpu_ticks();
+ runtime = curthread->td_runtime;
+ critical_exit();
+ runtime += curtime - switchtime;
+ } else {
+ thread_lock(targettd);
+ runtime = targettd->td_runtime;
+ thread_unlock(targettd);
+ }
+ cputick2timespec(runtime, ats);
+}
+
+static void
+get_process_cputime(struct proc *targetp, struct timespec *ats)
+{
+ uint64_t runtime;
+ struct rusage ru;
+
+ PROC_SLOCK(targetp);
+ rufetch(targetp, &ru);
+ runtime = targetp->p_rux.rux_runtime;
+ PROC_SUNLOCK(targetp);
+ cputick2timespec(runtime, ats);
+}
+
+static int
+get_cputime(struct thread *td, clockid_t clock_id, struct timespec *ats)
+{
+ struct proc *p, *p2;
+ struct thread *td2;
+ lwpid_t tid;
+ pid_t pid;
+ int error;
+
+ p = td->td_proc;
+ if ((clock_id & CPUCLOCK_PROCESS_BIT) == 0) {
+ tid = clock_id & CPUCLOCK_ID_MASK;
+ td2 = tdfind(tid, p->p_pid);
+ if (td2 == NULL)
+ return (EINVAL);
+ get_thread_cputime(td2, ats);
+ PROC_UNLOCK(td2->td_proc);
+ } else {
+ pid = clock_id & CPUCLOCK_ID_MASK;
+ error = pget(pid, PGET_CANSEE, &p2);
+ if (error != 0)
+ return (EINVAL);
+ get_process_cputime(p2, ats);
+ PROC_UNLOCK(p2);
+ }
+ return (0);
+}
+
+int
+kern_clock_gettime(struct thread *td, clockid_t clock_id, struct timespec *ats)
+{
+ struct timeval sys, user;
+ struct proc *p;
+
+ p = td->td_proc;
+ switch (clock_id) {
+ case CLOCK_REALTIME: /* Default to precise. */
+ case CLOCK_REALTIME_PRECISE:
+ nanotime(ats);
+ break;
+ case CLOCK_REALTIME_FAST:
+ getnanotime(ats);
+ break;
+ case CLOCK_VIRTUAL:
+ PROC_LOCK(p);
+ PROC_SLOCK(p);
+ calcru(p, &user, &sys);
+ PROC_SUNLOCK(p);
+ PROC_UNLOCK(p);
+ TIMEVAL_TO_TIMESPEC(&user, ats);
+ break;
+ case CLOCK_PROF:
+ PROC_LOCK(p);
+ PROC_SLOCK(p);
+ calcru(p, &user, &sys);
+ PROC_SUNLOCK(p);
+ PROC_UNLOCK(p);
+ timevaladd(&user, &sys);
+ TIMEVAL_TO_TIMESPEC(&user, ats);
+ break;
+ case CLOCK_MONOTONIC: /* Default to precise. */
+ case CLOCK_MONOTONIC_PRECISE:
+ case CLOCK_UPTIME:
+ case CLOCK_UPTIME_PRECISE:
+ nanouptime(ats);
+ break;
+ case CLOCK_UPTIME_FAST:
+ case CLOCK_MONOTONIC_FAST:
+ getnanouptime(ats);
+ break;
+ case CLOCK_SECOND:
+ ats->tv_sec = time_second;
+ ats->tv_nsec = 0;
+ break;
+ case CLOCK_THREAD_CPUTIME_ID:
+ get_thread_cputime(NULL, ats);
+ break;
+ case CLOCK_PROCESS_CPUTIME_ID:
+ PROC_LOCK(p);
+ get_process_cputime(p, ats);
+ PROC_UNLOCK(p);
+ break;
+ default:
+ if ((int)clock_id >= 0)
+ return (EINVAL);
+ return (get_cputime(td, clock_id, ats));
+ }
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct clock_settime_args {
+ clockid_t clock_id;
+ const struct timespec *tp;
+};
+#endif
+/* ARGSUSED */
+int
+sys_clock_settime(struct thread *td, struct clock_settime_args *uap)
+{
+ struct timespec ats;
+ int error;
+
+ if ((error = copyin(uap->tp, &ats, sizeof(ats))) != 0)
+ return (error);
+ return (kern_clock_settime(td, uap->clock_id, &ats));
+}
+
+int
+kern_clock_settime(struct thread *td, clockid_t clock_id, struct timespec *ats)
+{
+ struct timeval atv;
+ int error;
+
+ if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0)
+ return (error);
+ if (clock_id != CLOCK_REALTIME)
+ return (EINVAL);
+ if (ats->tv_nsec < 0 || ats->tv_nsec >= 1000000000)
+ return (EINVAL);
+ /* XXX Don't convert nsec->usec and back */
+ TIMESPEC_TO_TIMEVAL(&atv, ats);
+ error = settime(td, &atv);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct clock_getres_args {
+ clockid_t clock_id;
+ struct timespec *tp;
+};
+#endif
+int
+sys_clock_getres(struct thread *td, struct clock_getres_args *uap)
+{
+ struct timespec ts;
+ int error;
+
+ if (uap->tp == NULL)
+ return (0);
+
+ error = kern_clock_getres(td, uap->clock_id, &ts);
+ if (error == 0)
+ error = copyout(&ts, uap->tp, sizeof(ts));
+ return (error);
+}
+
+int
+kern_clock_getres(struct thread *td, clockid_t clock_id, struct timespec *ts)
+{
+
+ ts->tv_sec = 0;
+ switch (clock_id) {
+ case CLOCK_REALTIME:
+ case CLOCK_REALTIME_FAST:
+ case CLOCK_REALTIME_PRECISE:
+ case CLOCK_MONOTONIC:
+ case CLOCK_MONOTONIC_FAST:
+ case CLOCK_MONOTONIC_PRECISE:
+ case CLOCK_UPTIME:
+ case CLOCK_UPTIME_FAST:
+ case CLOCK_UPTIME_PRECISE:
+ /*
+ * Round up the result of the division cheaply by adding 1.
+ * Rounding up is especially important if rounding down
+ * would give 0. Perfect rounding is unimportant.
+ */
+ ts->tv_nsec = 1000000000 / tc_getfrequency() + 1;
+ break;
+ case CLOCK_VIRTUAL:
+ case CLOCK_PROF:
+ /* Accurately round up here because we can do so cheaply. */
+ ts->tv_nsec = (1000000000 + hz - 1) / hz;
+ break;
+ case CLOCK_SECOND:
+ ts->tv_sec = 1;
+ ts->tv_nsec = 0;
+ break;
+ case CLOCK_THREAD_CPUTIME_ID:
+ case CLOCK_PROCESS_CPUTIME_ID:
+ cputime:
+ /* sync with cputick2usec */
+ ts->tv_nsec = 1000000 / cpu_tickrate();
+ if (ts->tv_nsec == 0)
+ ts->tv_nsec = 1000;
+ break;
+ default:
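+ /*
+ * Negative clock IDs are the CPU-time clocks encoded with
+ * CPUCLOCK_BIT above; report the same resolution for them.
+ */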
+ if ((int)clock_id < 0)
+ goto cputime;
+ return (EINVAL);
+ }
+ return (0);
+}
+
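+/*
+ * Per-CPU wait channels for nanosleep(); only their addresses are used,
+ * giving each CPU a distinct sleep identifier.
+ */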
+static uint8_t nanowait[MAXCPU];
+
+int
+kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt)
+{
+ struct timespec ts;
+ sbintime_t sbt, sbtt, prec, tmp;
+ time_t over;
+ int error;
+
+ if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000)
+ return (EINVAL);
+ if (rqt->tv_sec < 0 || (rqt->tv_sec == 0 && rqt->tv_nsec == 0))
+ return (0);
+ ts = *rqt;
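+ /*
+ * Clamp extremely long requests so the sbintime conversion below
+ * cannot overflow; the clipped seconds are added back to the
+ * reported remaining time.
+ */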
+ if (ts.tv_sec > INT32_MAX / 2) {
+ over = ts.tv_sec - INT32_MAX / 2;
+ ts.tv_sec -= over;
+ } else
+ over = 0;
+ tmp = tstosbt(ts);
+ prec = tmp;
+ prec >>= tc_precexp;
+ if (TIMESEL(&sbt, tmp))
+ sbt += tc_tick_sbt;
+ sbt += tmp;
+ error = tsleep_sbt(&nanowait[curcpu], PWAIT | PCATCH, "nanslp",
+ sbt, prec, C_ABSOLUTE);
+ if (error != EWOULDBLOCK) {
+ if (error == ERESTART)
+ error = EINTR;
+ TIMESEL(&sbtt, tmp);
+ if (rmt != NULL) {
+ ts = sbttots(sbt - sbtt);
+ ts.tv_sec += over;
+ if (ts.tv_sec < 0)
+ timespecclear(&ts);
+ *rmt = ts;
+ }
+ if (sbtt >= sbt)
+ return (0);
+ return (error);
+ }
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct nanosleep_args {
+ struct timespec *rqtp;
+ struct timespec *rmtp;
+};
+#endif
+/* ARGSUSED */
+int
+sys_nanosleep(struct thread *td, struct nanosleep_args *uap)
+{
+ struct timespec rmt, rqt;
+ int error;
+
+ error = copyin(uap->rqtp, &rqt, sizeof(rqt));
+ if (error)
+ return (error);
+
+ if (uap->rmtp &&
+ !useracc((caddr_t)uap->rmtp, sizeof(rmt), VM_PROT_WRITE))
+ return (EFAULT);
+ error = kern_nanosleep(td, &rqt, &rmt);
+ if (error && uap->rmtp) {
+ int error2;
+
+ error2 = copyout(&rmt, uap->rmtp, sizeof(rmt));
+ if (error2)
+ error = error2;
+ }
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct gettimeofday_args {
+ struct timeval *tp;
+ struct timezone *tzp;
+};
+#endif
+/* ARGSUSED */
+int
+sys_gettimeofday(struct thread *td, struct gettimeofday_args *uap)
+{
+ struct timeval atv;
+ struct timezone rtz;
+ int error = 0;
+
+ if (uap->tp) {
+ microtime(&atv);
+ error = copyout(&atv, uap->tp, sizeof (atv));
+ }
+ if (error == 0 && uap->tzp != NULL) {
+ rtz.tz_minuteswest = tz_minuteswest;
+ rtz.tz_dsttime = tz_dsttime;
+ error = copyout(&rtz, uap->tzp, sizeof (rtz));
+ }
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct settimeofday_args {
+ struct timeval *tv;
+ struct timezone *tzp;
+};
+#endif
+/* ARGSUSED */
+int
+sys_settimeofday(struct thread *td, struct settimeofday_args *uap)
+{
+ struct timeval atv, *tvp;
+ struct timezone atz, *tzp;
+ int error;
+
+ if (uap->tv) {
+ error = copyin(uap->tv, &atv, sizeof(atv));
+ if (error)
+ return (error);
+ tvp = &atv;
+ } else
+ tvp = NULL;
+ if (uap->tzp) {
+ error = copyin(uap->tzp, &atz, sizeof(atz));
+ if (error)
+ return (error);
+ tzp = &atz;
+ } else
+ tzp = NULL;
+ return (kern_settimeofday(td, tvp, tzp));
+}
+
+int
+kern_settimeofday(struct thread *td, struct timeval *tv, struct timezone *tzp)
+{
+ int error;
+
+ error = priv_check(td, PRIV_SETTIMEOFDAY);
+ if (error)
+ return (error);
+ /* Verify all parameters before changing time. */
+ if (tv) {
+ if (tv->tv_usec < 0 || tv->tv_usec >= 1000000)
+ return (EINVAL);
+ error = settime(td, tv);
+ }
+ if (tzp && error == 0) {
+ tz_minuteswest = tzp->tz_minuteswest;
+ tz_dsttime = tzp->tz_dsttime;
+ }
+ return (error);
+}
+
+/*
+ * Get value of an interval timer. The process virtual and profiling virtual
+ * time timers are kept in the p_stats area, since they can be swapped out.
+ * These are kept internally in the way they are specified externally: in
+ * time until they expire.
+ *
+ * The real time interval timer is kept in the process table slot for the
+ * process, and its value (it_value) is kept as an absolute time rather than
+ * as a delta, so that it is easy to keep periodic real-time signals from
+ * drifting.
+ *
+ * Virtual time timers are processed in the hardclock() routine of
+ * kern_clock.c. The real time timer is processed by a timeout routine,
+ * called from the softclock() routine. Since a callout may be delayed in
+ * real time due to interrupt processing in the system, it is possible for
+ * the real time timeout routine (realitexpire, given below), to be delayed
+ * in real time past when it is supposed to occur. It does not suffice,
+ * therefore, to reload the real timer .it_value from the real time timers
+ * .it_interval. Rather, we compute the next time in absolute time the timer
+ * should go off.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getitimer_args {
+ u_int which;
+ struct itimerval *itv;
+};
+#endif
+int
+sys_getitimer(struct thread *td, struct getitimer_args *uap)
+{
+ struct itimerval aitv;
+ int error;
+
+ error = kern_getitimer(td, uap->which, &aitv);
+ if (error != 0)
+ return (error);
+ return (copyout(&aitv, uap->itv, sizeof (struct itimerval)));
+}
+
+int
+kern_getitimer(struct thread *td, u_int which, struct itimerval *aitv)
+{
+ struct proc *p = td->td_proc;
+ struct timeval ctv;
+
+ if (which > ITIMER_PROF)
+ return (EINVAL);
+
+ if (which == ITIMER_REAL) {
+ /*
+ * Convert from absolute to relative time in .it_value
+ * part of real time timer. If time for real time timer
+ * has passed return 0, else return difference between
+ * current time and time for the timer to go off.
+ */
+ PROC_LOCK(p);
+ *aitv = p->p_realtimer;
+ PROC_UNLOCK(p);
+ if (timevalisset(&aitv->it_value)) {
+ microuptime(&ctv);
+ if (timevalcmp(&aitv->it_value, &ctv, <))
+ timevalclear(&aitv->it_value);
+ else
+ timevalsub(&aitv->it_value, &ctv);
+ }
+ } else {
+ PROC_SLOCK(p);
+ *aitv = p->p_stats->p_timer[which];
+ PROC_SUNLOCK(p);
+ }
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setitimer_args {
+ u_int which;
+ struct itimerval *itv, *oitv;
+};
+#endif
+int
+sys_setitimer(struct thread *td, struct setitimer_args *uap)
+{
+ struct itimerval aitv, oitv;
+ int error;
+
+ if (uap->itv == NULL) {
+ uap->itv = uap->oitv;
+ return (sys_getitimer(td, (struct getitimer_args *)uap));
+ }
+
+ if ((error = copyin(uap->itv, &aitv, sizeof(struct itimerval))))
+ return (error);
+ error = kern_setitimer(td, uap->which, &aitv, &oitv);
+ if (error != 0 || uap->oitv == NULL)
+ return (error);
+ return (copyout(&oitv, uap->oitv, sizeof(struct itimerval)));
+}
+
+int
+kern_setitimer(struct thread *td, u_int which, struct itimerval *aitv,
+ struct itimerval *oitv)
+{
+ struct proc *p = td->td_proc;
+ struct timeval ctv;
+ sbintime_t sbt, pr;
+
+ if (aitv == NULL)
+ return (kern_getitimer(td, which, oitv));
+
+ if (which > ITIMER_PROF)
+ return (EINVAL);
+ if (itimerfix(&aitv->it_value) ||
+ aitv->it_value.tv_sec > INT32_MAX / 2)
+ return (EINVAL);
+ if (!timevalisset(&aitv->it_value))
+ timevalclear(&aitv->it_interval);
+ else if (itimerfix(&aitv->it_interval) ||
+ aitv->it_interval.tv_sec > INT32_MAX / 2)
+ return (EINVAL);
+
+ if (which == ITIMER_REAL) {
+ PROC_LOCK(p);
+ if (timevalisset(&p->p_realtimer.it_value))
+ callout_stop(&p->p_itcallout);
+ microuptime(&ctv);
+ if (timevalisset(&aitv->it_value)) {
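+ /*
+ * Allow the callout a precision of the requested interval
+ * scaled down by 2^tc_precexp, then convert the value to an
+ * absolute uptime before arming the callout.
+ */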
+ pr = tvtosbt(aitv->it_value) >> tc_precexp;
+ timevaladd(&aitv->it_value, &ctv);
+ sbt = tvtosbt(aitv->it_value);
+ callout_reset_sbt(&p->p_itcallout, sbt, pr,
+ realitexpire, p, C_ABSOLUTE);
+ }
+ *oitv = p->p_realtimer;
+ p->p_realtimer = *aitv;
+ PROC_UNLOCK(p);
+ if (timevalisset(&oitv->it_value)) {
+ if (timevalcmp(&oitv->it_value, &ctv, <))
+ timevalclear(&oitv->it_value);
+ else
+ timevalsub(&oitv->it_value, &ctv);
+ }
+ } else {
+ PROC_SLOCK(p);
+ *oitv = p->p_stats->p_timer[which];
+ p->p_stats->p_timer[which] = *aitv;
+ PROC_SUNLOCK(p);
+ }
+ return (0);
+}
+
+/*
+ * Real interval timer expired:
+ * send process whose timer expired an alarm signal.
+ * If time is not set up to reload, then just return.
+ * Else compute next time timer should go off which is > current time.
+ * This is where delay in processing this timeout causes multiple
+ * SIGALRM calls to be compressed into one.
+ * tvtohz() always adds 1 to allow for the time until the next clock
+ * interrupt being strictly less than 1 clock tick, but we don't want
+ * that here since we want to appear to be in sync with the clock
+ * interrupt even when we're delayed.
+ */
+void
+realitexpire(void *arg)
+{
+ struct proc *p;
+ struct timeval ctv;
+ sbintime_t isbt;
+
+ p = (struct proc *)arg;
+ kern_psignal(p, SIGALRM);
+ if (!timevalisset(&p->p_realtimer.it_interval)) {
+ timevalclear(&p->p_realtimer.it_value);
+ if (p->p_flag & P_WEXIT)
+ wakeup(&p->p_itcallout);
+ return;
+ }
+ isbt = tvtosbt(p->p_realtimer.it_interval);
+ if (isbt >= sbt_timethreshold)
+ getmicrouptime(&ctv);
+ else
+ microuptime(&ctv);
+ do {
+ timevaladd(&p->p_realtimer.it_value,
+ &p->p_realtimer.it_interval);
+ } while (timevalcmp(&p->p_realtimer.it_value, &ctv, <=));
+ callout_reset_sbt(&p->p_itcallout, tvtosbt(p->p_realtimer.it_value),
+ isbt >> tc_precexp, realitexpire, p, C_ABSOLUTE);
+}
+
+/*
+ * Check that a proposed value to load into the .it_value or
+ * .it_interval part of an interval timer is acceptable, and
+ * fix it to have at least minimal value (i.e. if it is less
+ * than the resolution of the clock, round it up.)
+ */
+int
+itimerfix(struct timeval *tv)
+{
+
+ if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000)
+ return (EINVAL);
+ if (tv->tv_sec == 0 && tv->tv_usec != 0 &&
+ tv->tv_usec < (u_int)tick / 16)
+ tv->tv_usec = (u_int)tick / 16;
+ return (0);
+}
+
+/*
+ * Decrement an interval timer by a specified number
+ * of microseconds, which must be less than a second,
+ * i.e. < 1000000. If the timer expires, then reload
+ * it. In this case, carry over (usec - old value) to
+ * reduce the value reloaded into the timer so that
+ * the timer does not drift. This routine assumes
+ * that it is called in a context where the timers
+ * on which it is operating cannot change in value.
+ */
+int
+itimerdecr(struct itimerval *itp, int usec)
+{
+
+ if (itp->it_value.tv_usec < usec) {
+ if (itp->it_value.tv_sec == 0) {
+ /* expired, and already in next interval */
+ usec -= itp->it_value.tv_usec;
+ goto expire;
+ }
+ itp->it_value.tv_usec += 1000000;
+ itp->it_value.tv_sec--;
+ }
+ itp->it_value.tv_usec -= usec;
+ usec = 0;
+ if (timevalisset(&itp->it_value))
+ return (1);
+ /* expired, exactly at end of interval */
+expire:
+ if (timevalisset(&itp->it_interval)) {
+ itp->it_value = itp->it_interval;
+ itp->it_value.tv_usec -= usec;
+ if (itp->it_value.tv_usec < 0) {
+ itp->it_value.tv_usec += 1000000;
+ itp->it_value.tv_sec--;
+ }
+ } else
+ itp->it_value.tv_usec = 0; /* sec is already 0 */
+ return (0);
+}
+
+/*
+ * Add and subtract routines for timevals.
+ * N.B.: subtract routine doesn't deal with
+ * results which are before the beginning,
+ * it just gets very confused in this case.
+ * Caveat emptor.
+ */
+void
+timevaladd(struct timeval *t1, const struct timeval *t2)
+{
+
+ t1->tv_sec += t2->tv_sec;
+ t1->tv_usec += t2->tv_usec;
+ timevalfix(t1);
+}
+
+void
+timevalsub(struct timeval *t1, const struct timeval *t2)
+{
+
+ t1->tv_sec -= t2->tv_sec;
+ t1->tv_usec -= t2->tv_usec;
+ timevalfix(t1);
+}
+
+static void
+timevalfix(struct timeval *t1)
+{
+
+ if (t1->tv_usec < 0) {
+ t1->tv_sec--;
+ t1->tv_usec += 1000000;
+ }
+ if (t1->tv_usec >= 1000000) {
+ t1->tv_sec++;
+ t1->tv_usec -= 1000000;
+ }
+}
+
+/*
+ * ratecheck(): simple time-based rate-limit checking.
+ */
+int
+ratecheck(struct timeval *lasttime, const struct timeval *mininterval)
+{
+ struct timeval tv, delta;
+ int rv = 0;
+
+ getmicrouptime(&tv); /* NB: 10ms precision */
+ delta = tv;
+ timevalsub(&delta, lasttime);
+
+ /*
+ * The check for 0,0 ensures the message will be seen at least once,
+ * even if the interval is huge.
+ */
+ if (timevalcmp(&delta, mininterval, >=) ||
+ (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) {
+ *lasttime = tv;
+ rv = 1;
+ }
+
+ return (rv);
+}
+
+/*
+ * ppsratecheck(): packets (or events) per second limitation.
+ *
+ * Return 0 if the limit is to be enforced (e.g. the caller
+ * should drop a packet because of the rate limitation).
+ *
+ * maxpps of 0 always causes zero to be returned. maxpps of -1
+ * always causes 1 to be returned; this effectively defeats rate
+ * limiting.
+ *
+ * Note that we maintain the struct timeval for compatibility
+ * with other BSD systems.  We reuse the storage and just monitor
+ * clock ticks for minimal overhead.
+ */
+int
+ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
+{
+ int now;
+
+ /*
+ * Reset the last time and counter if this is the first call
+ * or more than a second has passed since the last update of
+ * lasttime.
+ */
+ now = ticks;
+ if (lasttime->tv_sec == 0 || (u_int)(now - lasttime->tv_sec) >= hz) {
+ lasttime->tv_sec = now;
+ *curpps = 1;
+ return (maxpps != 0);
+ } else {
+ (*curpps)++; /* NB: ignore potential overflow */
+ return (maxpps < 0 || *curpps < maxpps);
+ }
+}
+
+static void
+itimer_start(void)
+{
+ struct kclock rt_clock = {
+ .timer_create = realtimer_create,
+ .timer_delete = realtimer_delete,
+ .timer_settime = realtimer_settime,
+ .timer_gettime = realtimer_gettime,
+ .event_hook = NULL
+ };
+
+ itimer_zone = uma_zcreate("itimer", sizeof(struct itimer),
+ NULL, NULL, itimer_init, itimer_fini, UMA_ALIGN_PTR, 0);
+ register_posix_clock(CLOCK_REALTIME, &rt_clock);
+ register_posix_clock(CLOCK_MONOTONIC, &rt_clock);
+ p31b_setcfg(CTL_P1003_1B_TIMERS, 200112L);
+ p31b_setcfg(CTL_P1003_1B_DELAYTIMER_MAX, INT_MAX);
+ p31b_setcfg(CTL_P1003_1B_TIMER_MAX, TIMER_MAX);
+ EVENTHANDLER_REGISTER(process_exit, itimers_event_hook_exit,
+ (void *)ITIMER_EV_EXIT, EVENTHANDLER_PRI_ANY);
+ EVENTHANDLER_REGISTER(process_exec, itimers_event_hook_exec,
+ (void *)ITIMER_EV_EXEC, EVENTHANDLER_PRI_ANY);
+}
+
+int
+register_posix_clock(int clockid, struct kclock *clk)
+{
+ if ((unsigned)clockid >= MAX_CLOCKS) {
+ printf("%s: invalid clockid\n", __func__);
+ return (0);
+ }
+ posix_clocks[clockid] = *clk;
+ return (1);
+}
+
+static int
+itimer_init(void *mem, int size, int flags)
+{
+ struct itimer *it;
+
+ it = (struct itimer *)mem;
+ mtx_init(&it->it_mtx, "itimer lock", NULL, MTX_DEF);
+ return (0);
+}
+
+static void
+itimer_fini(void *mem, int size)
+{
+ struct itimer *it;
+
+ it = (struct itimer *)mem;
+ mtx_destroy(&it->it_mtx);
+}
+
+static void
+itimer_enter(struct itimer *it)
+{
+
+ mtx_assert(&it->it_mtx, MA_OWNED);
+ it->it_usecount++;
+}
+
+static void
+itimer_leave(struct itimer *it)
+{
+
+ mtx_assert(&it->it_mtx, MA_OWNED);
+ KASSERT(it->it_usecount > 0, ("invalid it_usecount"));
+
+ if (--it->it_usecount == 0 && (it->it_flags & ITF_WANTED) != 0)
+ wakeup(it);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_create_args {
+ clockid_t clock_id;
+ struct sigevent * evp;
+ int * timerid;
+};
+#endif
+int
+sys_ktimer_create(struct thread *td, struct ktimer_create_args *uap)
+{
+ struct sigevent *evp, ev;
+ int id;
+ int error;
+
+ if (uap->evp == NULL) {
+ evp = NULL;
+ } else {
+ error = copyin(uap->evp, &ev, sizeof(ev));
+ if (error != 0)
+ return (error);
+ evp = &ev;
+ }
+ error = kern_ktimer_create(td, uap->clock_id, evp, &id, -1);
+ if (error == 0) {
+ error = copyout(&id, uap->timerid, sizeof(int));
+ if (error != 0)
+ kern_ktimer_delete(td, id);
+ }
+ return (error);
+}
+
+int
+kern_ktimer_create(struct thread *td, clockid_t clock_id, struct sigevent *evp,
+ int *timerid, int preset_id)
+{
+ struct proc *p = td->td_proc;
+ struct itimer *it;
+ int id;
+ int error;
+
+ if (clock_id < 0 || clock_id >= MAX_CLOCKS)
+ return (EINVAL);
+
+ if (posix_clocks[clock_id].timer_create == NULL)
+ return (EINVAL);
+
+ if (evp != NULL) {
+ if (evp->sigev_notify != SIGEV_NONE &&
+ evp->sigev_notify != SIGEV_SIGNAL &&
+ evp->sigev_notify != SIGEV_THREAD_ID)
+ return (EINVAL);
+ if ((evp->sigev_notify == SIGEV_SIGNAL ||
+ evp->sigev_notify == SIGEV_THREAD_ID) &&
+ !_SIG_VALID(evp->sigev_signo))
+ return (EINVAL);
+ }
+
+ if (p->p_itimers == NULL)
+ itimers_alloc(p);
+
+ it = uma_zalloc(itimer_zone, M_WAITOK);
+ it->it_flags = 0;
+ it->it_usecount = 0;
+ it->it_active = 0;
+ timespecclear(&it->it_time.it_value);
+ timespecclear(&it->it_time.it_interval);
+ it->it_overrun = 0;
+ it->it_overrun_last = 0;
+ it->it_clockid = clock_id;
+ it->it_timerid = -1;
+ it->it_proc = p;
+ ksiginfo_init(&it->it_ksi);
+ it->it_ksi.ksi_flags |= KSI_INS | KSI_EXT;
+ error = CLOCK_CALL(clock_id, timer_create, (it));
+ if (error != 0)
+ goto out;
+
+ PROC_LOCK(p);
+ if (preset_id != -1) {
+ KASSERT(preset_id >= 0 && preset_id < 3, ("invalid preset_id"));
+ id = preset_id;
+ if (p->p_itimers->its_timers[id] != NULL) {
+ PROC_UNLOCK(p);
+ error = 0;
+ goto out;
+ }
+ } else {
+ /*
+ * Find a free timer slot, skipping those reserved
+ * for setitimer().
+ */
+ for (id = 3; id < TIMER_MAX; id++)
+ if (p->p_itimers->its_timers[id] == NULL)
+ break;
+ if (id == TIMER_MAX) {
+ PROC_UNLOCK(p);
+ error = EAGAIN;
+ goto out;
+ }
+ }
+ it->it_timerid = id;
+ p->p_itimers->its_timers[id] = it;
+ if (evp != NULL)
+ it->it_sigev = *evp;
+ else {
+ it->it_sigev.sigev_notify = SIGEV_SIGNAL;
+ switch (clock_id) {
+ default:
+ case CLOCK_REALTIME:
+ it->it_sigev.sigev_signo = SIGALRM;
+ break;
+ case CLOCK_VIRTUAL:
+ it->it_sigev.sigev_signo = SIGVTALRM;
+ break;
+ case CLOCK_PROF:
+ it->it_sigev.sigev_signo = SIGPROF;
+ break;
+ }
+ it->it_sigev.sigev_value.sival_int = id;
+ }
+
+ if (it->it_sigev.sigev_notify == SIGEV_SIGNAL ||
+ it->it_sigev.sigev_notify == SIGEV_THREAD_ID) {
+ it->it_ksi.ksi_signo = it->it_sigev.sigev_signo;
+ it->it_ksi.ksi_code = SI_TIMER;
+ it->it_ksi.ksi_value = it->it_sigev.sigev_value;
+ it->it_ksi.ksi_timerid = id;
+ }
+ PROC_UNLOCK(p);
+ *timerid = id;
+ return (0);
+
+out:
+ ITIMER_LOCK(it);
+ CLOCK_CALL(it->it_clockid, timer_delete, (it));
+ ITIMER_UNLOCK(it);
+ uma_zfree(itimer_zone, it);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_delete_args {
+ int timerid;
+};
+#endif
+int
+sys_ktimer_delete(struct thread *td, struct ktimer_delete_args *uap)
+{
+
+ return (kern_ktimer_delete(td, uap->timerid));
+}
+
+static struct itimer *
+itimer_find(struct proc *p, int timerid)
+{
+ struct itimer *it;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ if ((p->p_itimers == NULL) ||
+ (timerid < 0) || (timerid >= TIMER_MAX) ||
+ (it = p->p_itimers->its_timers[timerid]) == NULL) {
+ return (NULL);
+ }
+ ITIMER_LOCK(it);
+ if ((it->it_flags & ITF_DELETING) != 0) {
+ ITIMER_UNLOCK(it);
+ it = NULL;
+ }
+ return (it);
+}
+
+int
+kern_ktimer_delete(struct thread *td, int timerid)
+{
+ struct proc *p = td->td_proc;
+ struct itimer *it;
+
+ PROC_LOCK(p);
+ it = itimer_find(p, timerid);
+ if (it == NULL) {
+ PROC_UNLOCK(p);
+ return (EINVAL);
+ }
+ PROC_UNLOCK(p);
+
+ it->it_flags |= ITF_DELETING;
+ while (it->it_usecount > 0) {
+ it->it_flags |= ITF_WANTED;
+ msleep(it, &it->it_mtx, PPAUSE, "itimer", 0);
+ }
+ it->it_flags &= ~ITF_WANTED;
+ CLOCK_CALL(it->it_clockid, timer_delete, (it));
+ ITIMER_UNLOCK(it);
+
+ PROC_LOCK(p);
+ if (KSI_ONQ(&it->it_ksi))
+ sigqueue_take(&it->it_ksi);
+ p->p_itimers->its_timers[timerid] = NULL;
+ PROC_UNLOCK(p);
+ uma_zfree(itimer_zone, it);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_settime_args {
+ int timerid;
+ int flags;
+ const struct itimerspec * value;
+ struct itimerspec * ovalue;
+};
+#endif
+int
+sys_ktimer_settime(struct thread *td, struct ktimer_settime_args *uap)
+{
+ struct itimerspec val, oval, *ovalp;
+ int error;
+
+ error = copyin(uap->value, &val, sizeof(val));
+ if (error != 0)
+ return (error);
+ ovalp = uap->ovalue != NULL ? &oval : NULL;
+ error = kern_ktimer_settime(td, uap->timerid, uap->flags, &val, ovalp);
+ if (error == 0 && uap->ovalue != NULL)
+ error = copyout(ovalp, uap->ovalue, sizeof(*ovalp));
+ return (error);
+}
+
+int
+kern_ktimer_settime(struct thread *td, int timer_id, int flags,
+ struct itimerspec *val, struct itimerspec *oval)
+{
+ struct proc *p;
+ struct itimer *it;
+ int error;
+
+ p = td->td_proc;
+ PROC_LOCK(p);
+ if (timer_id < 3 || (it = itimer_find(p, timer_id)) == NULL) {
+ PROC_UNLOCK(p);
+ error = EINVAL;
+ } else {
+ PROC_UNLOCK(p);
+ itimer_enter(it);
+ error = CLOCK_CALL(it->it_clockid, timer_settime, (it,
+ flags, val, oval));
+ itimer_leave(it);
+ ITIMER_UNLOCK(it);
+ }
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_gettime_args {
+ int timerid;
+ struct itimerspec * value;
+};
+#endif
+int
+sys_ktimer_gettime(struct thread *td, struct ktimer_gettime_args *uap)
+{
+ struct itimerspec val;
+ int error;
+
+ error = kern_ktimer_gettime(td, uap->timerid, &val);
+ if (error == 0)
+ error = copyout(&val, uap->value, sizeof(val));
+ return (error);
+}
+
+int
+kern_ktimer_gettime(struct thread *td, int timer_id, struct itimerspec *val)
+{
+ struct proc *p;
+ struct itimer *it;
+ int error;
+
+ p = td->td_proc;
+ PROC_LOCK(p);
+ if (timer_id < 3 || (it = itimer_find(p, timer_id)) == NULL) {
+ PROC_UNLOCK(p);
+ error = EINVAL;
+ } else {
+ PROC_UNLOCK(p);
+ itimer_enter(it);
+ error = CLOCK_CALL(it->it_clockid, timer_gettime, (it, val));
+ itimer_leave(it);
+ ITIMER_UNLOCK(it);
+ }
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_getoverrun_args {
+ int timerid;
+};
+#endif
+int
+sys_ktimer_getoverrun(struct thread *td, struct ktimer_getoverrun_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct itimer *it;
+ int error;
+
+ PROC_LOCK(p);
+ if (uap->timerid < 3 ||
+ (it = itimer_find(p, uap->timerid)) == NULL) {
+ PROC_UNLOCK(p);
+ error = EINVAL;
+ } else {
+ td->td_retval[0] = it->it_overrun_last;
+ ITIMER_UNLOCK(it);
+ PROC_UNLOCK(p);
+ error = 0;
+ }
+ return (error);
+}
+
+static int
+realtimer_create(struct itimer *it)
+{
+ callout_init_mtx(&it->it_callout, &it->it_mtx, 0);
+ return (0);
+}
+
+static int
+realtimer_delete(struct itimer *it)
+{
+ mtx_assert(&it->it_mtx, MA_OWNED);
+
+ /*
+ * Clear the timer's value and interval to tell realtimer_expire()
+ * not to rearm the timer.
+ */
+ timespecclear(&it->it_time.it_value);
+ timespecclear(&it->it_time.it_interval);
+ ITIMER_UNLOCK(it);
+ callout_drain(&it->it_callout);
+ ITIMER_LOCK(it);
+ return (0);
+}
+
+static int
+realtimer_gettime(struct itimer *it, struct itimerspec *ovalue)
+{
+ struct timespec cts;
+
+ mtx_assert(&it->it_mtx, MA_OWNED);
+
+ realtimer_clocktime(it->it_clockid, &cts);
+ *ovalue = it->it_time;
+ if (ovalue->it_value.tv_sec != 0 || ovalue->it_value.tv_nsec != 0) {
+ timespecsub(&ovalue->it_value, &cts);
+ if (ovalue->it_value.tv_sec < 0 ||
+ (ovalue->it_value.tv_sec == 0 &&
+ ovalue->it_value.tv_nsec == 0)) {
+ ovalue->it_value.tv_sec = 0;
+ ovalue->it_value.tv_nsec = 1;
+ }
+ }
+ return (0);
+}
+
+static int
+realtimer_settime(struct itimer *it, int flags,
+ struct itimerspec *value, struct itimerspec *ovalue)
+{
+ struct timespec cts, ts;
+ struct timeval tv;
+ struct itimerspec val;
+
+ mtx_assert(&it->it_mtx, MA_OWNED);
+
+ val = *value;
+ if (itimespecfix(&val.it_value))
+ return (EINVAL);
+
+ if (timespecisset(&val.it_value)) {
+ if (itimespecfix(&val.it_interval))
+ return (EINVAL);
+ } else {
+ timespecclear(&val.it_interval);
+ }
+
+ if (ovalue != NULL)
+ realtimer_gettime(it, ovalue);
+
+ it->it_time = val;
+ if (timespecisset(&val.it_value)) {
+ realtimer_clocktime(it->it_clockid, &cts);
+ ts = val.it_value;
+ if ((flags & TIMER_ABSTIME) == 0) {
+ /* Convert to absolute time. */
+ timespecadd(&it->it_time.it_value, &cts);
+ } else {
+ timespecsub(&ts, &cts);
+ /*
+ * We don't care if ts is negative; tvtohz() will
+ * fix it.
+ */
+ }
+ TIMESPEC_TO_TIMEVAL(&tv, &ts);
+ callout_reset(&it->it_callout, tvtohz(&tv),
+ realtimer_expire, it);
+ } else {
+ callout_stop(&it->it_callout);
+ }
+
+ return (0);
+}
+
+static void
+realtimer_clocktime(clockid_t id, struct timespec *ts)
+{
+ if (id == CLOCK_REALTIME)
+ getnanotime(ts);
+ else /* CLOCK_MONOTONIC */
+ getnanouptime(ts);
+}
+
+int
+itimer_accept(struct proc *p, int timerid, ksiginfo_t *ksi)
+{
+ struct itimer *it;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ it = itimer_find(p, timerid);
+ if (it != NULL) {
+ ksi->ksi_overrun = it->it_overrun;
+ it->it_overrun_last = it->it_overrun;
+ it->it_overrun = 0;
+ ITIMER_UNLOCK(it);
+ return (0);
+ }
+ return (EINVAL);
+}
+
+int
+itimespecfix(struct timespec *ts)
+{
+
+ if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
+ return (EINVAL);
+ if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000)
+ ts->tv_nsec = tick * 1000;
+ return (0);
+}
+
+/* Timeout callback for realtime timer */
+static void
+realtimer_expire(void *arg)
+{
+ struct timespec cts, ts;
+ struct timeval tv;
+ struct itimer *it;
+
+ it = (struct itimer *)arg;
+
+ realtimer_clocktime(it->it_clockid, &cts);
+ /* Only fire if time is reached. */
+ if (timespeccmp(&cts, &it->it_time.it_value, >=)) {
+ if (timespecisset(&it->it_time.it_interval)) {
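+ /*
+ * Advance the expiration time by whole intervals until it
+ * is in the future, counting each missed period as an
+ * overrun.
+ */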
+ timespecadd(&it->it_time.it_value,
+ &it->it_time.it_interval);
+ while (timespeccmp(&cts, &it->it_time.it_value, >=)) {
+ if (it->it_overrun < INT_MAX)
+ it->it_overrun++;
+ else
+ it->it_ksi.ksi_errno = ERANGE;
+ timespecadd(&it->it_time.it_value,
+ &it->it_time.it_interval);
+ }
+ } else {
+ /* Single-shot timer; do not rearm. */
+ timespecclear(&it->it_time.it_value);
+ }
+ if (timespecisset(&it->it_time.it_value)) {
+ ts = it->it_time.it_value;
+ timespecsub(&ts, &cts);
+ TIMESPEC_TO_TIMEVAL(&tv, &ts);
+ callout_reset(&it->it_callout, tvtohz(&tv),
+ realtimer_expire, it);
+ }
+ itimer_enter(it);
+ ITIMER_UNLOCK(it);
+ itimer_fire(it);
+ ITIMER_LOCK(it);
+ itimer_leave(it);
+ } else if (timespecisset(&it->it_time.it_value)) {
+ ts = it->it_time.it_value;
+ timespecsub(&ts, &cts);
+ TIMESPEC_TO_TIMEVAL(&tv, &ts);
+ callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire,
+ it);
+ }
+}
+
+void
+itimer_fire(struct itimer *it)
+{
+ struct proc *p = it->it_proc;
+ struct thread *td;
+
+ if (it->it_sigev.sigev_notify == SIGEV_SIGNAL ||
+ it->it_sigev.sigev_notify == SIGEV_THREAD_ID) {
+ if (sigev_findtd(p, &it->it_sigev, &td) != 0) {
+ ITIMER_LOCK(it);
+ timespecclear(&it->it_time.it_value);
+ timespecclear(&it->it_time.it_interval);
+ callout_stop(&it->it_callout);
+ ITIMER_UNLOCK(it);
+ return;
+ }
+ if (!KSI_ONQ(&it->it_ksi)) {
+ it->it_ksi.ksi_errno = 0;
+ ksiginfo_set_sigev(&it->it_ksi, &it->it_sigev);
+ tdsendsignal(p, td, it->it_ksi.ksi_signo, &it->it_ksi);
+ } else {
+ if (it->it_overrun < INT_MAX)
+ it->it_overrun++;
+ else
+ it->it_ksi.ksi_errno = ERANGE;
+ }
+ PROC_UNLOCK(p);
+ }
+}
+
+static void
+itimers_alloc(struct proc *p)
+{
+ struct itimers *its;
+ int i;
+
+ its = malloc(sizeof (struct itimers), M_SUBPROC, M_WAITOK | M_ZERO);
+ LIST_INIT(&its->its_virtual);
+ LIST_INIT(&its->its_prof);
+ TAILQ_INIT(&its->its_worklist);
+ for (i = 0; i < TIMER_MAX; i++)
+ its->its_timers[i] = NULL;
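+ /* Install the new table unless another thread raced us to it. */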
+ PROC_LOCK(p);
+ if (p->p_itimers == NULL) {
+ p->p_itimers = its;
+ PROC_UNLOCK(p);
+ } else {
+ PROC_UNLOCK(p);
+ free(its, M_SUBPROC);
+ }
+}
+
+static void
+itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
+{
+ itimers_event_hook_exit(arg, p);
+}
+
+/* Clean up timers when certain process events (exec, exit) occur. */
+static void
+itimers_event_hook_exit(void *arg, struct proc *p)
+{
+ struct itimers *its;
+ struct itimer *it;
+ int event = (int)(intptr_t)arg;
+ int i;
+
+ if (p->p_itimers != NULL) {
+ its = p->p_itimers;
+ for (i = 0; i < MAX_CLOCKS; ++i) {
+ if (posix_clocks[i].event_hook != NULL)
+ CLOCK_CALL(i, event_hook, (p, i, event));
+ }
+ /*
+ * According to SUSv3, XSI interval timers should be inherited
+ * across exec by the new image.
+ */
+ if (event == ITIMER_EV_EXEC)
+ i = 3;
+ else if (event == ITIMER_EV_EXIT)
+ i = 0;
+ else
+ panic("unhandled event");
+ for (; i < TIMER_MAX; ++i) {
+ if ((it = its->its_timers[i]) != NULL)
+ kern_ktimer_delete(curthread, i);
+ }
+ if (its->its_timers[0] == NULL &&
+ its->its_timers[1] == NULL &&
+ its->its_timers[2] == NULL) {
+ free(its, M_SUBPROC);
+ p->p_itimers = NULL;
+ }
+ }
+}
diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c
new file mode 100644
index 0000000..e3580fc
--- /dev/null
+++ b/sys/kern/kern_timeout.c
@@ -0,0 +1,1433 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * From: @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_callout_profiling.h"
+#include "opt_kdtrace.h"
+#if defined(__arm__)
+#include "opt_timer.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/callout.h>
+#include <sys/file.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sdt.h>
+#include <sys/sleepqueue.h>
+#include <sys/sysctl.h>
+#include <sys/smp.h>
+
+#ifdef SMP
+#include <machine/cpu.h>
+#endif
+
+#ifndef NO_EVENTTIMERS
+DPCPU_DECLARE(sbintime_t, hardclocktime);
+#endif
+
+SDT_PROVIDER_DEFINE(callout_execute);
+SDT_PROBE_DEFINE1(callout_execute, kernel, , callout_start, callout-start,
+ "struct callout *");
+SDT_PROBE_DEFINE1(callout_execute, kernel, , callout_end, callout-end,
+ "struct callout *");
+
+#ifdef CALLOUT_PROFILING
+static int avg_depth;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0,
+ "Average number of items examined per softclock call. Units = 1/1000");
+static int avg_gcalls;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0,
+ "Average number of Giant callouts made per softclock call. Units = 1/1000");
+static int avg_lockcalls;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0,
+ "Average number of lock callouts made per softclock call. Units = 1/1000");
+static int avg_mpcalls;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0,
+ "Average number of MP callouts made per softclock call. Units = 1/1000");
+static int avg_depth_dir;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0,
+ "Average number of direct callouts examined per callout_process call. "
+ "Units = 1/1000");
+static int avg_lockcalls_dir;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD,
+ &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per "
+ "callout_process call. Units = 1/1000");
+static int avg_mpcalls_dir;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir,
+ 0, "Average number of MP direct callouts made per callout_process call. "
+ "Units = 1/1000");
+#endif
+
+static int ncallout;
+SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN, &ncallout, 0,
+ "Number of entries in callwheel and size of timeout() preallocation");
+
+/*
+ * TODO:
+ * allocate more timeout table slots when table overflows.
+ */
+u_int callwheelsize, callwheelmask;
+
+/*
+ * The callout cpu exec entities represent the information necessary to
+ * describe the state of callouts currently running on the CPU and the
+ * information needed to migrate callouts to a new callout cpu.  In
+ * particular, the first entry of the cc_exec_entity array holds the state
+ * for callouts running in SWI thread context, while the second one holds
+ * the state for callouts running directly from hardware interrupt context.
+ * Caching this information is important for deferring migration when the
+ * migrating callout is already running.
+ */
+struct cc_exec {
+ struct callout *cc_next;
+ struct callout *cc_curr;
+#ifdef SMP
+ void (*ce_migration_func)(void *);
+ void *ce_migration_arg;
+ int ce_migration_cpu;
+ sbintime_t ce_migration_time;
+ sbintime_t ce_migration_prec;
+#endif
+ bool cc_cancel;
+ bool cc_waiting;
+};
+
+/*
+ * There is one struct callout_cpu per cpu, holding all relevant
+ * state for the callout processing thread on the individual CPU.
+ */
+struct callout_cpu {
+ struct mtx_padalign cc_lock;
+ struct cc_exec cc_exec_entity[2];
+ struct callout *cc_callout;
+ struct callout_list *cc_callwheel;
+ struct callout_tailq cc_expireq;
+ struct callout_slist cc_callfree;
+ sbintime_t cc_firstevent;
+ sbintime_t cc_lastscan;
+ void *cc_cookie;
+ u_int cc_bucket;
+};
+
+#define cc_exec_curr cc_exec_entity[0].cc_curr
+#define cc_exec_next cc_exec_entity[0].cc_next
+#define cc_exec_cancel cc_exec_entity[0].cc_cancel
+#define cc_exec_waiting cc_exec_entity[0].cc_waiting
+#define cc_exec_curr_dir cc_exec_entity[1].cc_curr
+#define cc_exec_next_dir cc_exec_entity[1].cc_next
+#define cc_exec_cancel_dir cc_exec_entity[1].cc_cancel
+#define cc_exec_waiting_dir cc_exec_entity[1].cc_waiting
+
+#ifdef SMP
+#define cc_migration_func cc_exec_entity[0].ce_migration_func
+#define cc_migration_arg cc_exec_entity[0].ce_migration_arg
+#define cc_migration_cpu cc_exec_entity[0].ce_migration_cpu
+#define cc_migration_time cc_exec_entity[0].ce_migration_time
+#define cc_migration_prec cc_exec_entity[0].ce_migration_prec
+#define cc_migration_func_dir cc_exec_entity[1].ce_migration_func
+#define cc_migration_arg_dir cc_exec_entity[1].ce_migration_arg
+#define cc_migration_cpu_dir cc_exec_entity[1].ce_migration_cpu
+#define cc_migration_time_dir cc_exec_entity[1].ce_migration_time
+#define cc_migration_prec_dir cc_exec_entity[1].ce_migration_prec
+
+struct callout_cpu cc_cpu[MAXCPU];
+#define CPUBLOCK MAXCPU
+#define CC_CPU(cpu) (&cc_cpu[(cpu)])
+#define CC_SELF() CC_CPU(PCPU_GET(cpuid))
+#else
+struct callout_cpu cc_cpu;
+#define CC_CPU(cpu) &cc_cpu
+#define CC_SELF() &cc_cpu
+#endif
+#define CC_LOCK(cc) mtx_lock_spin(&(cc)->cc_lock)
+#define CC_UNLOCK(cc) mtx_unlock_spin(&(cc)->cc_lock)
+#define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED)
+
+static int timeout_cpu;
+
+static void callout_cpu_init(struct callout_cpu *cc);
+static void softclock_call_cc(struct callout *c, struct callout_cpu *cc,
+#ifdef CALLOUT_PROFILING
+ int *mpcalls, int *lockcalls, int *gcalls,
+#endif
+ int direct);
+
+static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
+
+/**
+ * Locked by cc_lock:
+ * cc_curr - If a callout is in progress, it is cc_curr.
+ * If cc_curr is non-NULL, threads waiting in
+ * callout_drain() will be woken up as soon as the
+ * relevant callout completes.
+ * cc_cancel - Changing to 1 with both callout_lock and cc_lock held
+ * guarantees that the current callout will not run.
+ * The softclock() function sets this to 0 before it
+ * drops callout_lock to acquire c_lock, and it calls
+ * the handler only if cc_cancel is still 0 after
+ * cc_lock is successfully acquired.
+ * cc_waiting - If a thread is waiting in callout_drain(), then
+ * cc_waiting is nonzero. Set only when
+ * cc_curr is non-NULL.
+ */
+
+/*
+ * Resets the execution entity tied to a specific callout cpu.
+ */
+static void
+cc_cce_cleanup(struct callout_cpu *cc, int direct)
+{
+
+ cc->cc_exec_entity[direct].cc_curr = NULL;
+ cc->cc_exec_entity[direct].cc_next = NULL;
+ cc->cc_exec_entity[direct].cc_cancel = false;
+ cc->cc_exec_entity[direct].cc_waiting = false;
+#ifdef SMP
+ cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK;
+ cc->cc_exec_entity[direct].ce_migration_time = 0;
+ cc->cc_exec_entity[direct].ce_migration_prec = 0;
+ cc->cc_exec_entity[direct].ce_migration_func = NULL;
+ cc->cc_exec_entity[direct].ce_migration_arg = NULL;
+#endif
+}
+
+/*
+ * Checks if migration is requested by a specific callout cpu.
+ */
+static int
+cc_cce_migrating(struct callout_cpu *cc, int direct)
+{
+
+#ifdef SMP
+ return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK);
+#else
+ return (0);
+#endif
+}
+
+/*
+ * Kernel low level callwheel initialization
+ * called on cpu0 during kernel startup.
+ */
+static void
+callout_callwheel_init(void *dummy)
+{
+ struct callout_cpu *cc;
+
+ /*
+ * Calculate the size of the callout wheel and the preallocated
+ * timeout() structures.
+ * XXX: Clip ncallout to what the previous maxusers-based formula
+ * yields at its maximum of 384 users.  This is still huge, but
+ * acceptable.
+ */
+ ncallout = imin(16 + maxproc + maxfiles, 18508);
+ TUNABLE_INT_FETCH("kern.ncallout", &ncallout);
+
+ /*
+ * Calculate callout wheel size, should be next power of two higher
+ * than 'ncallout'.
+ */
+ callwheelsize = 1 << fls(ncallout);
+ callwheelmask = callwheelsize - 1;
+
+ /*
+ * Only cpu0 handles timeout(9) and receives a preallocation.
+ *
+ * XXX: Once all timeout(9) consumers are converted this can
+ * be removed.
+ */
+ timeout_cpu = PCPU_GET(cpuid);
+ cc = CC_CPU(timeout_cpu);
+ cc->cc_callout = malloc(ncallout * sizeof(struct callout),
+ M_CALLOUT, M_WAITOK);
+ callout_cpu_init(cc);
+}
+SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL);
+
+/*
+ * Initialize the per-cpu callout structures.
+ */
+static void
+callout_cpu_init(struct callout_cpu *cc)
+{
+ struct callout *c;
+ int i;
+
+ mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
+ SLIST_INIT(&cc->cc_callfree);
+ cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize,
+ M_CALLOUT, M_WAITOK);
+ for (i = 0; i < callwheelsize; i++)
+ LIST_INIT(&cc->cc_callwheel[i]);
+ TAILQ_INIT(&cc->cc_expireq);
+ cc->cc_firstevent = INT64_MAX;
+ for (i = 0; i < 2; i++)
+ cc_cce_cleanup(cc, i);
+ if (cc->cc_callout == NULL) /* Only cpu0 handles timeout(9) */
+ return;
+ for (i = 0; i < ncallout; i++) {
+ c = &cc->cc_callout[i];
+ callout_init(c, 0);
+ c->c_flags = CALLOUT_LOCAL_ALLOC;
+ SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
+ }
+}
+
+#ifdef SMP
+/*
+ * Switches the cpu tied to a specific callout.
+ * The function expects the incoming callout cpu to be locked and
+ * returns with the outgoing callout cpu locked.
+ */
+static struct callout_cpu *
+callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu)
+{
+ struct callout_cpu *new_cc;
+
+ MPASS(c != NULL && cc != NULL);
+ CC_LOCK_ASSERT(cc);
+
+ /*
+ * Disable interrupts and preemption while the callout cpu is
+ * marked blocked in order to avoid deadlocks, since a preempting
+ * thread may try to acquire the callout cpu lock.
+ */
+ c->c_cpu = CPUBLOCK;
+ spinlock_enter();
+ CC_UNLOCK(cc);
+ new_cc = CC_CPU(new_cpu);
+ CC_LOCK(new_cc);
+ spinlock_exit();
+ c->c_cpu = new_cpu;
+ return (new_cc);
+}
+#endif
+
+/*
+ * Start standard softclock thread.
+ */
+static void
+start_softclock(void *dummy)
+{
+ struct callout_cpu *cc;
+#ifdef SMP
+ int cpu;
+#endif
+
+ cc = CC_CPU(timeout_cpu);
+ if (swi_add(&clk_intr_event, "clock", softclock, cc, SWI_CLOCK,
+ INTR_MPSAFE, &cc->cc_cookie))
+ panic("died while creating standard software ithreads");
+#ifdef SMP
+ CPU_FOREACH(cpu) {
+ if (cpu == timeout_cpu)
+ continue;
+ cc = CC_CPU(cpu);
+ cc->cc_callout = NULL; /* Only cpu0 handles timeout(9). */
+ callout_cpu_init(cc);
+ if (swi_add(NULL, "clock", softclock, cc, SWI_CLOCK,
+ INTR_MPSAFE, &cc->cc_cookie))
+ panic("died while creating standard software ithreads");
+ }
+#endif
+}
+SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL);
+
+#define CC_HASH_SHIFT 8
+
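+/*
+ * Hash an sbintime into a callwheel bucket index: consecutive buckets
+ * each cover 1/2^CC_HASH_SHIFT (1/256) of a second.
+ */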
+static inline u_int
+callout_hash(sbintime_t sbt)
+{
+
+ return (sbt >> (32 - CC_HASH_SHIFT));
+}
+
+static inline u_int
+callout_get_bucket(sbintime_t sbt)
+{
+
+ return (callout_hash(sbt) & callwheelmask);
+}
+
+void
+callout_process(sbintime_t now)
+{
+ struct callout *tmp, *tmpn;
+ struct callout_cpu *cc;
+ struct callout_list *sc;
+ sbintime_t first, last, max, tmp_max;
+ uint32_t lookahead;
+ u_int firstb, lastb, nowb;
+#ifdef CALLOUT_PROFILING
+ int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0;
+#endif
+
+ cc = CC_SELF();
+ mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
+
+ /* Compute the buckets of the last scan and present times. */
+ firstb = callout_hash(cc->cc_lastscan);
+ cc->cc_lastscan = now;
+ nowb = callout_hash(now);
+
+ /* Compute the last bucket and minimum time of the bucket after it. */
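+ /*
+ * The farther the wheel has advanced since the last scan, the
+ * larger the window of future events aggregated in this pass.
+ */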
+ if (nowb == firstb)
+ lookahead = (SBT_1S / 16);
+ else if (nowb - firstb == 1)
+ lookahead = (SBT_1S / 8);
+ else
+ lookahead = (SBT_1S / 2);
+ first = last = now;
+ first += (lookahead / 2);
+ last += lookahead;
+ last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT));
+ lastb = callout_hash(last) - 1;
+ max = last;
+
+ /*
+ * Check if we wrapped around the entire wheel since the last scan.
+ * If so, we need to scan the whole wheel for pending callouts.
+ */
+ if (lastb - firstb >= callwheelsize) {
+ lastb = firstb + callwheelsize - 1;
+ if (nowb - firstb >= callwheelsize)
+ nowb = lastb;
+ }
+
+ /* Iterate callwheel from firstb to nowb and then up to lastb. */
+ do {
+ sc = &cc->cc_callwheel[firstb & callwheelmask];
+ tmp = LIST_FIRST(sc);
+ while (tmp != NULL) {
+ /* Run the callout if its time has already been reached. */
+ if (tmp->c_time <= now) {
+ /*
+ * Consumer told us the callout may be run
+ * directly from hardware interrupt context.
+ */
+ if (tmp->c_flags & CALLOUT_DIRECT) {
+#ifdef CALLOUT_PROFILING
+ ++depth_dir;
+#endif
+ cc->cc_exec_next_dir =
+ LIST_NEXT(tmp, c_links.le);
+ cc->cc_bucket = firstb & callwheelmask;
+ LIST_REMOVE(tmp, c_links.le);
+ softclock_call_cc(tmp, cc,
+#ifdef CALLOUT_PROFILING
+ &mpcalls_dir, &lockcalls_dir, NULL,
+#endif
+ 1);
+ tmp = cc->cc_exec_next_dir;
+ } else {
+ tmpn = LIST_NEXT(tmp, c_links.le);
+ LIST_REMOVE(tmp, c_links.le);
+ TAILQ_INSERT_TAIL(&cc->cc_expireq,
+ tmp, c_links.tqe);
+ tmp->c_flags |= CALLOUT_PROCESSED;
+ tmp = tmpn;
+ }
+ continue;
+ }
+ /* Skip events in the distant future. */
+ if (tmp->c_time >= max)
+ goto next;
+ /*
+ * The event's minimal time is later than the present
+ * maximal time, so it cannot be aggregated.
+ */
+ if (tmp->c_time > last) {
+ lastb = nowb;
+ goto next;
+ }
+ /* Update first and last time, respecting this event. */
+ if (tmp->c_time < first)
+ first = tmp->c_time;
+ tmp_max = tmp->c_time + tmp->c_precision;
+ if (tmp_max < last)
+ last = tmp_max;
+next:
+ tmp = LIST_NEXT(tmp, c_links.le);
+ }
+ /* Proceed with the next bucket. */
+ firstb++;
+ /*
+ * Stop if we looked past the present time and found
+ * an event we cannot execute now, or if we looked far
+ * enough into the future.
+ */
+ } while (((int)(firstb - lastb)) <= 0);
+ cc->cc_firstevent = last;
+#ifndef NO_EVENTTIMERS
+ cpu_new_callout(curcpu, last, first);
+#endif
+#ifdef CALLOUT_PROFILING
+ avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8;
+ avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8;
+ avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8;
+#endif
+ mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
+ /*
+ * swi_sched acquires the thread lock, so we don't want to call it
+ * with cc_lock held; incorrect locking order.
+ */
+ if (!TAILQ_EMPTY(&cc->cc_expireq))
+ swi_sched(cc->cc_cookie, 0);
+}
+
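+/*
+ * Lock the per-CPU callout structure that currently owns the callout,
+ * re-checking c_cpu after the lock is taken in case the callout was
+ * migrated to another CPU in the meantime.
+ */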
+static struct callout_cpu *
+callout_lock(struct callout *c)
+{
+ struct callout_cpu *cc;
+ int cpu;
+
+ for (;;) {
+ cpu = c->c_cpu;
+#ifdef SMP
+ if (cpu == CPUBLOCK) {
+ while (c->c_cpu == CPUBLOCK)
+ cpu_spinwait();
+ continue;
+ }
+#endif
+ cc = CC_CPU(cpu);
+ CC_LOCK(cc);
+ if (cpu == c->c_cpu)
+ break;
+ CC_UNLOCK(cc);
+ }
+ return (cc);
+}
+
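+/*
+ * Insert a callout into the callwheel of the given per-CPU callout
+ * structure and, if needed, tell the event timer code about the new
+ * earliest event.
+ */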
+static void
+callout_cc_add(struct callout *c, struct callout_cpu *cc,
+ sbintime_t sbt, sbintime_t precision, void (*func)(void *),
+ void *arg, int cpu, int flags)
+{
+ int bucket;
+
+ CC_LOCK_ASSERT(cc);
+ if (sbt < cc->cc_lastscan)
+ sbt = cc->cc_lastscan;
+ c->c_arg = arg;
+ c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING);
+ if (flags & C_DIRECT_EXEC)
+ c->c_flags |= CALLOUT_DIRECT;
+ c->c_flags &= ~CALLOUT_PROCESSED;
+ c->c_func = func;
+ c->c_time = sbt;
+ c->c_precision = precision;
+ bucket = callout_get_bucket(c->c_time);
+ CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x",
+ c, (int)(c->c_precision >> 32),
+ (u_int)(c->c_precision & 0xffffffff));
+ LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le);
+ if (cc->cc_bucket == bucket)
+ cc->cc_exec_next_dir = c;
+#ifndef NO_EVENTTIMERS
+ /*
+	 * Inform the eventtimers(4) subsystem that a new callout has
+	 * been inserted, but only if really required.
+ */
+ sbt = c->c_time + c->c_precision;
+ if (sbt < cc->cc_firstevent) {
+ cc->cc_firstevent = sbt;
+ cpu_new_callout(cpu, sbt, c->c_time);
+ }
+#endif
+}
+
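+/*
+ * Return a timeout(9)-style (locally allocated) callout to the per-CPU
+ * free list; caller-owned callouts are left untouched.
+ */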
+static void
+callout_cc_del(struct callout *c, struct callout_cpu *cc)
+{
+
+ if ((c->c_flags & CALLOUT_LOCAL_ALLOC) == 0)
+ return;
+ c->c_func = NULL;
+ SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
+}
+
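+/*
+ * Execute a single callout: release the callout cpu lock, take the
+ * consumer's lock (if any), run the handler, and then deal with drain
+ * waiters and deferred migration.  "direct" is nonzero when invoked
+ * from hardware interrupt context via callout_process() rather than
+ * from the softclock SWI.
+ */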
+static void
+softclock_call_cc(struct callout *c, struct callout_cpu *cc,
+#ifdef CALLOUT_PROFILING
+ int *mpcalls, int *lockcalls, int *gcalls,
+#endif
+ int direct)
+{
+ void (*c_func)(void *);
+ void *c_arg;
+ struct lock_class *class;
+ struct lock_object *c_lock;
+ int c_flags, sharedlock;
+#ifdef SMP
+ struct callout_cpu *new_cc;
+ void (*new_func)(void *);
+ void *new_arg;
+ int flags, new_cpu;
+ sbintime_t new_prec, new_time;
+#endif
+#if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
+ sbintime_t sbt1, sbt2;
+ struct timespec ts2;
+ static sbintime_t maxdt = 2 * SBT_1MS; /* 2 msec */
+ static timeout_t *lastfunc;
+#endif
+
+ KASSERT((c->c_flags & (CALLOUT_PENDING | CALLOUT_ACTIVE)) ==
+ (CALLOUT_PENDING | CALLOUT_ACTIVE),
+ ("softclock_call_cc: pend|act %p %x", c, c->c_flags));
+ class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
+ sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ? 0 : 1;
+ c_lock = c->c_lock;
+ c_func = c->c_func;
+ c_arg = c->c_arg;
+ c_flags = c->c_flags;
+ if (c->c_flags & CALLOUT_LOCAL_ALLOC)
+ c->c_flags = CALLOUT_LOCAL_ALLOC;
+ else
+ c->c_flags &= ~CALLOUT_PENDING;
+ cc->cc_exec_entity[direct].cc_curr = c;
+ cc->cc_exec_entity[direct].cc_cancel = false;
+ CC_UNLOCK(cc);
+ if (c_lock != NULL) {
+ class->lc_lock(c_lock, sharedlock);
+ /*
+ * The callout may have been cancelled
+ * while we switched locks.
+ */
+ if (cc->cc_exec_entity[direct].cc_cancel) {
+ class->lc_unlock(c_lock);
+ goto skip;
+ }
+ /* The callout cannot be stopped now. */
+ cc->cc_exec_entity[direct].cc_cancel = true;
+ if (c_lock == &Giant.lock_object) {
+#ifdef CALLOUT_PROFILING
+ (*gcalls)++;
+#endif
+ CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p",
+ c, c_func, c_arg);
+ } else {
+#ifdef CALLOUT_PROFILING
+ (*lockcalls)++;
+#endif
+ CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p",
+ c, c_func, c_arg);
+ }
+ } else {
+#ifdef CALLOUT_PROFILING
+ (*mpcalls)++;
+#endif
+ CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
+ c, c_func, c_arg);
+ }
+#if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
+ sbt1 = sbinuptime();
+#endif
+ THREAD_NO_SLEEPING();
+ SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0);
+ c_func(c_arg);
+ SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0);
+ THREAD_SLEEPING_OK();
+#if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
+ sbt2 = sbinuptime();
+ sbt2 -= sbt1;
+ if (sbt2 > maxdt) {
+ if (lastfunc != c_func || sbt2 > maxdt * 2) {
+ ts2 = sbttots(sbt2);
+ printf(
+ "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
+ c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec);
+ }
+ maxdt = sbt2;
+ lastfunc = c_func;
+ }
+#endif
+ CTR1(KTR_CALLOUT, "callout %p finished", c);
+ if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
+ class->lc_unlock(c_lock);
+skip:
+ CC_LOCK(cc);
+ KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr"));
+ cc->cc_exec_entity[direct].cc_curr = NULL;
+ if (cc->cc_exec_entity[direct].cc_waiting) {
+ /*
+		 * There is someone waiting for the callout to complete.
+		 * If the callout was scheduled for migration, just
+		 * cancel it.
+ */
+ if (cc_cce_migrating(cc, direct)) {
+ cc_cce_cleanup(cc, direct);
+
+ /*
+			 * It should be asserted here that the callout is
+			 * not destroyed, but that is not easy to do.
+ */
+ c->c_flags &= ~CALLOUT_DFRMIGRATION;
+ }
+ cc->cc_exec_entity[direct].cc_waiting = false;
+ CC_UNLOCK(cc);
+ wakeup(&cc->cc_exec_entity[direct].cc_waiting);
+ CC_LOCK(cc);
+ } else if (cc_cce_migrating(cc, direct)) {
+ KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0,
+ ("Migrating legacy callout %p", c));
+#ifdef SMP
+ /*
+		 * If the callout was scheduled for migration,
+		 * just perform it now.
+ */
+ new_cpu = cc->cc_exec_entity[direct].ce_migration_cpu;
+ new_time = cc->cc_exec_entity[direct].ce_migration_time;
+ new_prec = cc->cc_exec_entity[direct].ce_migration_prec;
+ new_func = cc->cc_exec_entity[direct].ce_migration_func;
+ new_arg = cc->cc_exec_entity[direct].ce_migration_arg;
+ cc_cce_cleanup(cc, direct);
+
+ /*
+		 * It should be asserted here that the callout is not
+		 * destroyed, but that is not easy to do.
+		 *
+		 * First, handle deferred callout stops.
+ */
+ if ((c->c_flags & CALLOUT_DFRMIGRATION) == 0) {
+ CTR3(KTR_CALLOUT,
+ "deferred cancelled %p func %p arg %p",
+ c, new_func, new_arg);
+ callout_cc_del(c, cc);
+ return;
+ }
+ c->c_flags &= ~CALLOUT_DFRMIGRATION;
+
+ new_cc = callout_cpu_switch(c, cc, new_cpu);
+ flags = (direct) ? C_DIRECT_EXEC : 0;
+ callout_cc_add(c, new_cc, new_time, new_prec, new_func,
+ new_arg, new_cpu, flags);
+ CC_UNLOCK(new_cc);
+ CC_LOCK(cc);
+#else
+ panic("migration should not happen");
+#endif
+ }
+ /*
+ * If the current callout is locally allocated (from
+ * timeout(9)) then put it on the freelist.
+ *
+	 * Note: we need to check the cached copy of c_flags because,
+	 * if the callout was not locally allocated, it is not safe to
+	 * dereference the callout pointer.
+ */
+ KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0 ||
+ c->c_flags == CALLOUT_LOCAL_ALLOC,
+ ("corrupted callout"));
+ if (c_flags & CALLOUT_LOCAL_ALLOC)
+ callout_cc_del(c, cc);
+}
+
+/*
+ * The callout mechanism is based on the work of Adam M. Costello and
+ * George Varghese, published in a technical report entitled "Redesigning
+ * the BSD Callout and Timer Facilities" and modified slightly for inclusion
+ * in FreeBSD by Justin T. Gibbs. The original work on the data structures
+ * used in this implementation was published by G. Varghese and T. Lauck in
+ * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
+ * the Efficient Implementation of a Timer Facility" in the Proceedings of
+ * the 11th ACM Annual Symposium on Operating Systems Principles,
+ * Austin, Texas Nov 1987.
+ */
+
+/*
+ * Software (low priority) clock interrupt.
+ * Run periodic events from timeout queue.
+ */
+void
+softclock(void *arg)
+{
+ struct callout_cpu *cc;
+ struct callout *c;
+#ifdef CALLOUT_PROFILING
+ int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0;
+#endif
+
+ cc = (struct callout_cpu *)arg;
+ CC_LOCK(cc);
+ while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) {
+ TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
+ softclock_call_cc(c, cc,
+#ifdef CALLOUT_PROFILING
+ &mpcalls, &lockcalls, &gcalls,
+#endif
+ 0);
+#ifdef CALLOUT_PROFILING
+ ++depth;
+#endif
+ }
+#ifdef CALLOUT_PROFILING
+ avg_depth += (depth * 1000 - avg_depth) >> 8;
+ avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
+ avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
+ avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
+#endif
+ CC_UNLOCK(cc);
+}
+
+/*
+ * timeout --
+ * Execute a function after a specified length of time.
+ *
+ * untimeout --
+ * Cancel previous timeout function call.
+ *
+ * callout_handle_init --
+ * Initialize a handle so that using it with untimeout is benign.
+ *
+ * See AT&T BCI Driver Reference Manual for specification. This
+ * implementation differs from that one in that although an
+ * identification value is returned from timeout, the original
+ * arguments to timeout as well as the identifier are used to
+ * identify entries for untimeout.
+ */
+struct callout_handle
+timeout(ftn, arg, to_ticks)
+ timeout_t *ftn;
+ void *arg;
+ int to_ticks;
+{
+ struct callout_cpu *cc;
+ struct callout *new;
+ struct callout_handle handle;
+
+ cc = CC_CPU(timeout_cpu);
+ CC_LOCK(cc);
+ /* Fill in the next free callout structure. */
+ new = SLIST_FIRST(&cc->cc_callfree);
+ if (new == NULL)
+ /* XXX Attempt to malloc first */
+ panic("timeout table full");
+ SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle);
+ callout_reset(new, to_ticks, ftn, arg);
+ handle.callout = new;
+ CC_UNLOCK(cc);
+
+ return (handle);
+}
+
+void
+untimeout(ftn, arg, handle)
+ timeout_t *ftn;
+ void *arg;
+ struct callout_handle handle;
+{
+ struct callout_cpu *cc;
+
+ /*
+ * Check for a handle that was initialized
+ * by callout_handle_init, but never used
+ * for a real timeout.
+ */
+ if (handle.callout == NULL)
+ return;
+
+ cc = callout_lock(handle.callout);
+ if (handle.callout->c_func == ftn && handle.callout->c_arg == arg)
+ callout_stop(handle.callout);
+ CC_UNLOCK(cc);
+}
+
+void
+callout_handle_init(struct callout_handle *handle)
+{
+ handle->callout = NULL;
+}
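+
+/*
+ * Illustrative only (foo_tick and sc are made-up names): a legacy
+ * consumer arms a one-shot timeout that calls foo_tick(sc) after
+ * about one second and cancels it with the returned handle:
+ *
+ *	struct callout_handle h;
+ *
+ *	callout_handle_init(&h);
+ *	h = timeout(foo_tick, sc, hz);
+ *	...
+ *	untimeout(foo_tick, sc, h);
+ */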
+
+/*
+ * New interface; clients allocate their own callout structures.
+ *
+ * callout_reset() - establish or change a timeout
+ * callout_stop() - disestablish a timeout
+ * callout_init() - initialize a callout structure so that it can
+ * safely be passed to callout_reset() and callout_stop()
+ *
+ * <sys/callout.h> defines three convenience macros:
+ *
+ * callout_active() - returns truth if callout has not been stopped,
+ * drained, or deactivated since the last time the callout was
+ * reset.
+ * callout_pending() - returns truth if callout is still waiting for timeout
+ * callout_deactivate() - marks the callout as having been serviced
+ */
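+/*
+ * Illustrative only (foo_fn and sc are made-up names): a typical
+ * consumer initializes an MPSAFE callout, arms it to run foo_fn(sc)
+ * in about 100ms, and later cancels it:
+ *
+ *	struct callout co;
+ *
+ *	callout_init(&co, 1);
+ *	callout_reset(&co, hz / 10, foo_fn, sc);
+ *	...
+ *	callout_stop(&co);
+ */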
+int
+callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision,
+ void (*ftn)(void *), void *arg, int cpu, int flags)
+{
+ sbintime_t to_sbt, pr;
+ struct callout_cpu *cc;
+ int cancelled, direct;
+
+ cancelled = 0;
+ if (flags & C_ABSOLUTE) {
+ to_sbt = sbt;
+ } else {
+ if ((flags & C_HARDCLOCK) && (sbt < tick_sbt))
+ sbt = tick_sbt;
+ if ((flags & C_HARDCLOCK) ||
+#ifdef NO_EVENTTIMERS
+ sbt >= sbt_timethreshold) {
+ to_sbt = getsbinuptime();
+
+ /* Add safety belt for the case of hz > 1000. */
+ to_sbt += tc_tick_sbt - tick_sbt;
+#else
+ sbt >= sbt_tickthreshold) {
+ /*
+			 * Obtain the time of the last hardclock() call on
+			 * this CPU directly from kern_clocksource.c.
+			 * The value is per-CPU, but it is the same on all
+			 * active CPUs.
+ */
+#ifdef __LP64__
+ to_sbt = DPCPU_GET(hardclocktime);
+#else
+ spinlock_enter();
+ to_sbt = DPCPU_GET(hardclocktime);
+ spinlock_exit();
+#endif
+#endif
+ if ((flags & C_HARDCLOCK) == 0)
+ to_sbt += tick_sbt;
+ } else
+ to_sbt = sbinuptime();
+ to_sbt += sbt;
+ pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp :
+ sbt >> C_PRELGET(flags));
+ if (pr > precision)
+ precision = pr;
+ }
+ /*
+ * Don't allow migration of pre-allocated callouts lest they
+ * become unbalanced.
+ */
+ if (c->c_flags & CALLOUT_LOCAL_ALLOC)
+ cpu = c->c_cpu;
+ direct = (c->c_flags & CALLOUT_DIRECT) != 0;
+ KASSERT(!direct || c->c_lock == NULL,
+ ("%s: direct callout %p has lock", __func__, c));
+ cc = callout_lock(c);
+ if (cc->cc_exec_entity[direct].cc_curr == c) {
+ /*
+ * We're being asked to reschedule a callout which is
+ * currently in progress. If there is a lock then we
+ * can cancel the callout if it has not really started.
+ */
+ if (c->c_lock != NULL && !cc->cc_exec_entity[direct].cc_cancel)
+ cancelled = cc->cc_exec_entity[direct].cc_cancel = true;
+ if (cc->cc_exec_entity[direct].cc_waiting) {
+ /*
+ * Someone has called callout_drain to kill this
+ * callout. Don't reschedule.
+ */
+ CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
+ cancelled ? "cancelled" : "failed to cancel",
+ c, c->c_func, c->c_arg);
+ CC_UNLOCK(cc);
+ return (cancelled);
+ }
+ }
+ if (c->c_flags & CALLOUT_PENDING) {
+ if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
+ if (cc->cc_exec_next_dir == c)
+ cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le);
+ LIST_REMOVE(c, c_links.le);
+ } else
+ TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
+ cancelled = 1;
+ c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
+ }
+
+#ifdef SMP
+ /*
+	 * If the callout must migrate, try to perform it immediately.
+ * If the callout is currently running, just defer the migration
+ * to a more appropriate moment.
+ */
+ if (c->c_cpu != cpu) {
+ if (cc->cc_exec_entity[direct].cc_curr == c) {
+ cc->cc_exec_entity[direct].ce_migration_cpu = cpu;
+ cc->cc_exec_entity[direct].ce_migration_time
+ = to_sbt;
+ cc->cc_exec_entity[direct].ce_migration_prec
+ = precision;
+ cc->cc_exec_entity[direct].ce_migration_func = ftn;
+ cc->cc_exec_entity[direct].ce_migration_arg = arg;
+ c->c_flags |= CALLOUT_DFRMIGRATION;
+ CTR6(KTR_CALLOUT,
+ "migration of %p func %p arg %p in %d.%08x to %u deferred",
+ c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
+ (u_int)(to_sbt & 0xffffffff), cpu);
+ CC_UNLOCK(cc);
+ return (cancelled);
+ }
+ cc = callout_cpu_switch(c, cc, cpu);
+ }
+#endif
+
+ callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags);
+ CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x",
+ cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
+ (u_int)(to_sbt & 0xffffffff));
+ CC_UNLOCK(cc);
+
+ return (cancelled);
+}
+
+/*
+ * Common idioms that can be optimized in the future.
+ */
+int
+callout_schedule_on(struct callout *c, int to_ticks, int cpu)
+{
+ return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu);
+}
+
+int
+callout_schedule(struct callout *c, int to_ticks)
+{
+ return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu);
+}
+
+int
+_callout_stop_safe(c, safe)
+ struct callout *c;
+ int safe;
+{
+ struct callout_cpu *cc, *old_cc;
+ struct lock_class *class;
+ int direct, sq_locked, use_lock;
+
+ /*
+ * Some old subsystems don't hold Giant while running a callout_stop(),
+ * so just discard this check for the moment.
+ */
+ if (!safe && c->c_lock != NULL) {
+ if (c->c_lock == &Giant.lock_object)
+ use_lock = mtx_owned(&Giant);
+ else {
+ use_lock = 1;
+ class = LOCK_CLASS(c->c_lock);
+ class->lc_assert(c->c_lock, LA_XLOCKED);
+ }
+ } else
+ use_lock = 0;
+ direct = (c->c_flags & CALLOUT_DIRECT) != 0;
+ sq_locked = 0;
+ old_cc = NULL;
+again:
+ cc = callout_lock(c);
+
+ /*
+ * If the callout was migrating while the callout cpu lock was
+ * dropped, just drop the sleepqueue lock and check the states
+ * again.
+ */
+ if (sq_locked != 0 && cc != old_cc) {
+#ifdef SMP
+ CC_UNLOCK(cc);
+ sleepq_release(&old_cc->cc_exec_entity[direct].cc_waiting);
+ sq_locked = 0;
+ old_cc = NULL;
+ goto again;
+#else
+ panic("migration should not happen");
+#endif
+ }
+
+ /*
+ * If the callout isn't pending, it's not on the queue, so
+ * don't attempt to remove it from the queue. We can try to
+ * stop it by other means however.
+ */
+ if (!(c->c_flags & CALLOUT_PENDING)) {
+ c->c_flags &= ~CALLOUT_ACTIVE;
+
+ /*
+ * If it wasn't on the queue and it isn't the current
+ * callout, then we can't stop it, so just bail.
+ */
+ if (cc->cc_exec_entity[direct].cc_curr != c) {
+ CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
+ c, c->c_func, c->c_arg);
+ CC_UNLOCK(cc);
+ if (sq_locked)
+ sleepq_release(
+ &cc->cc_exec_entity[direct].cc_waiting);
+ return (0);
+ }
+
+ if (safe) {
+ /*
+ * The current callout is running (or just
+ * about to run) and blocking is allowed, so
+ * just wait for the current invocation to
+ * finish.
+ */
+ while (cc->cc_exec_entity[direct].cc_curr == c) {
+ /*
+ * Use direct calls to sleepqueue interface
+ * instead of cv/msleep in order to avoid
+ * a LOR between cc_lock and sleepqueue
+				 * chain spinlocks. This piece of code
+				 * effectively emulates an msleep_spin() call.
+ *
+ * If we already have the sleepqueue chain
+ * locked, then we can safely block. If we
+ * don't already have it locked, however,
+ * we have to drop the cc_lock to lock
+ * it. This opens several races, so we
+ * restart at the beginning once we have
+ * both locks. If nothing has changed, then
+ * we will end up back here with sq_locked
+ * set.
+ */
+ if (!sq_locked) {
+ CC_UNLOCK(cc);
+ sleepq_lock(
+ &cc->cc_exec_entity[direct].cc_waiting);
+ sq_locked = 1;
+ old_cc = cc;
+ goto again;
+ }
+
+ /*
+				 * Migration could be cancelled here, but
+				 * since it is not clear when that will
+				 * actually happen, just let softclock()
+				 * take care of it.
+ */
+ cc->cc_exec_entity[direct].cc_waiting = true;
+ DROP_GIANT();
+ CC_UNLOCK(cc);
+ sleepq_add(
+ &cc->cc_exec_entity[direct].cc_waiting,
+ &cc->cc_lock.lock_object, "codrain",
+ SLEEPQ_SLEEP, 0);
+ sleepq_wait(
+ &cc->cc_exec_entity[direct].cc_waiting,
+ 0);
+ sq_locked = 0;
+ old_cc = NULL;
+
+ /* Reacquire locks previously released. */
+ PICKUP_GIANT();
+ CC_LOCK(cc);
+ }
+ } else if (use_lock &&
+ !cc->cc_exec_entity[direct].cc_cancel) {
+ /*
+			 * The current callout is waiting for its
+			 * lock, which we hold. Cancel the callout
+ * and return. After our caller drops the
+ * lock, the callout will be skipped in
+ * softclock().
+ */
+ cc->cc_exec_entity[direct].cc_cancel = true;
+ CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
+ c, c->c_func, c->c_arg);
+ KASSERT(!cc_cce_migrating(cc, direct),
+ ("callout wrongly scheduled for migration"));
+ CC_UNLOCK(cc);
+ KASSERT(!sq_locked, ("sleepqueue chain locked"));
+ return (1);
+ } else if ((c->c_flags & CALLOUT_DFRMIGRATION) != 0) {
+ c->c_flags &= ~CALLOUT_DFRMIGRATION;
+ CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
+ c, c->c_func, c->c_arg);
+ CC_UNLOCK(cc);
+ return (1);
+ }
+ CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
+ c, c->c_func, c->c_arg);
+ CC_UNLOCK(cc);
+ KASSERT(!sq_locked, ("sleepqueue chain still locked"));
+ return (0);
+ }
+ if (sq_locked)
+ sleepq_release(&cc->cc_exec_entity[direct].cc_waiting);
+
+ c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
+
+ CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
+ c, c->c_func, c->c_arg);
+ if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
+ if (cc->cc_exec_next_dir == c)
+ cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le);
+ LIST_REMOVE(c, c_links.le);
+ } else
+ TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
+ callout_cc_del(c, cc);
+
+ CC_UNLOCK(cc);
+ return (1);
+}
+
+void
+callout_init(c, mpsafe)
+ struct callout *c;
+ int mpsafe;
+{
+ bzero(c, sizeof *c);
+ if (mpsafe) {
+ c->c_lock = NULL;
+ c->c_flags = CALLOUT_RETURNUNLOCKED;
+ } else {
+ c->c_lock = &Giant.lock_object;
+ c->c_flags = 0;
+ }
+ c->c_cpu = timeout_cpu;
+}
+
+void
+_callout_init_lock(c, lock, flags)
+ struct callout *c;
+ struct lock_object *lock;
+ int flags;
+{
+ bzero(c, sizeof *c);
+ c->c_lock = lock;
+ KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0,
+ ("callout_init_lock: bad flags %d", flags));
+ KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0,
+ ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock"));
+ KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags &
+ (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class",
+ __func__));
+ c->c_flags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK);
+ c->c_cpu = timeout_cpu;
+}
+
+#ifdef APM_FIXUP_CALLTODO
+/*
+ * Adjust the kernel calltodo timeout list. This routine is used after
+ * an APM resume to recalculate the calltodo timer list values with the
+ * number of hz's we have been sleeping. The next hardclock() will detect
+ * that there are fired timers and run softclock() to execute them.
+ *
+ * Please note, I have not done an exhaustive analysis of what code this
+ * might break. I am motivated to have my select()'s and alarm()'s that
+ * have expired during suspend firing upon resume so that the applications
+ * which set the timer can do the maintenance the timer was for as close
+ * as possible to the originally intended time. Testing this code for a
+ * week showed that resuming from a suspend resulted in 22 to 25 timers
+ * firing, which seemed independent of whether the suspend was 2 hours or
+ * 2 days. Your mileage may vary. - Ken Key <key@cs.utk.edu>
+ */
+void
+adjust_timeout_calltodo(time_change)
+ struct timeval *time_change;
+{
+ register struct callout *p;
+ unsigned long delta_ticks;
+
+ /*
+ * How many ticks were we asleep?
+ * (stolen from tvtohz()).
+ */
+
+ /* Don't do anything */
+ if (time_change->tv_sec < 0)
+ return;
+ else if (time_change->tv_sec <= LONG_MAX / 1000000)
+ delta_ticks = (time_change->tv_sec * 1000000 +
+ time_change->tv_usec + (tick - 1)) / tick + 1;
+ else if (time_change->tv_sec <= LONG_MAX / hz)
+ delta_ticks = time_change->tv_sec * hz +
+ (time_change->tv_usec + (tick - 1)) / tick + 1;
+ else
+ delta_ticks = LONG_MAX;
+
+ if (delta_ticks > INT_MAX)
+ delta_ticks = INT_MAX;
+
+ /*
+ * Now rip through the timer calltodo list looking for timers
+ * to expire.
+ */
+
+ /* don't collide with softclock() */
+ CC_LOCK(cc);
+ for (p = calltodo.c_next; p != NULL; p = p->c_next) {
+ p->c_time -= delta_ticks;
+
+ /* Break if the timer had more time on it than delta_ticks */
+ if (p->c_time > 0)
+ break;
+
+ /* take back the ticks the timer didn't use (p->c_time <= 0) */
+ delta_ticks = -p->c_time;
+ }
+ CC_UNLOCK(cc);
+
+ return;
+}
+#endif /* APM_FIXUP_CALLTODO */
+
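+/*
+ * Find the index of the most significant set bit of an sbintime_t,
+ * after scaling the value by 1.5; used to build the logarithmic
+ * histograms in the sysctl handler below.
+ */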
+static int
+flssbt(sbintime_t sbt)
+{
+
+ sbt += (uint64_t)sbt >> 1;
+ if (sizeof(long) >= sizeof(sbintime_t))
+ return (flsl(sbt));
+ if (sbt >= SBT_1S)
+ return (flsl(((uint64_t)sbt) >> 32) + 32);
+ return (flsl(sbt));
+}
+
+/*
+ * Dump an immediate statistics snapshot of the scheduled callouts.
+ */
+static int
+sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS)
+{
+ struct callout *tmp;
+ struct callout_cpu *cc;
+ struct callout_list *sc;
+ sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t;
+ int ct[64], cpr[64], ccpbk[32];
+ int error, val, i, count, tcum, pcum, maxc, c, medc;
+#ifdef SMP
+ int cpu;
+#endif
+
+ val = 0;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ count = maxc = 0;
+ st = spr = maxt = maxpr = 0;
+ bzero(ccpbk, sizeof(ccpbk));
+ bzero(ct, sizeof(ct));
+ bzero(cpr, sizeof(cpr));
+ now = sbinuptime();
+#ifdef SMP
+ CPU_FOREACH(cpu) {
+ cc = CC_CPU(cpu);
+#else
+ cc = CC_CPU(timeout_cpu);
+#endif
+ CC_LOCK(cc);
+ for (i = 0; i < callwheelsize; i++) {
+ sc = &cc->cc_callwheel[i];
+ c = 0;
+ LIST_FOREACH(tmp, sc, c_links.le) {
+ c++;
+ t = tmp->c_time - now;
+ if (t < 0)
+ t = 0;
+ st += t / SBT_1US;
+ spr += tmp->c_precision / SBT_1US;
+ if (t > maxt)
+ maxt = t;
+ if (tmp->c_precision > maxpr)
+ maxpr = tmp->c_precision;
+ ct[flssbt(t)]++;
+ cpr[flssbt(tmp->c_precision)]++;
+ }
+ if (c > maxc)
+ maxc = c;
+ ccpbk[fls(c + c / 2)]++;
+ count += c;
+ }
+ CC_UNLOCK(cc);
+#ifdef SMP
+ }
+#endif
+
+ for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++)
+ tcum += ct[i];
+ medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
+ for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++)
+ pcum += cpr[i];
+ medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
+ for (i = 0, c = 0; i < 32 && c < count / 2; i++)
+ c += ccpbk[i];
+ medc = (i >= 2) ? (1 << (i - 2)) : 0;
+
+ printf("Scheduled callouts statistic snapshot:\n");
+ printf(" Callouts: %6d Buckets: %6d*%-3d Bucket size: 0.%06ds\n",
+ count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT);
+ printf(" C/Bk: med %5d avg %6d.%06jd max %6d\n",
+ medc,
+ count / callwheelsize / mp_ncpus,
+ (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000,
+ maxc);
+ printf(" Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
+ medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32,
+ (st / count) / 1000000, (st / count) % 1000000,
+ maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32);
+ printf(" Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
+ medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32,
+ (spr / count) / 1000000, (spr / count) % 1000000,
+ maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32);
+ printf(" Distribution: \tbuckets\t time\t tcum\t"
+ " prec\t pcum\n");
+ for (i = 0, tcum = pcum = 0; i < 64; i++) {
+ if (ct[i] == 0 && cpr[i] == 0)
+ continue;
+ t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0;
+ tcum += ct[i];
+ pcum += cpr[i];
+ printf(" %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n",
+ t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32,
+ i - 1 - (32 - CC_HASH_SHIFT),
+ ct[i], tcum, cpr[i], pcum);
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern, OID_AUTO, callout_stat,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ 0, 0, sysctl_kern_callout_stat, "I",
+ "Dump immediate statistic snapshot of the scheduled callouts");
diff --git a/sys/kern/kern_umtx.c b/sys/kern/kern_umtx.c
new file mode 100644
index 0000000..0e21383
--- /dev/null
+++ b/sys/kern/kern_umtx.c
@@ -0,0 +1,3918 @@
+/*-
+ * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
+ * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_umtx_profiling.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/syscallsubr.h>
+#include <sys/eventhandler.h>
+#include <sys/umtx.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+
+#include <machine/cpu.h>
+
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32_proto.h>
+#endif
+
+#define _UMUTEX_TRY 1
+#define _UMUTEX_WAIT 2
+
+#ifdef UMTX_PROFILING
+#define UPROF_PERC_BIGGER(w, f, sw, sf) \
+ (((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
+#endif
+
+/* Priority inheritance mutex info. */
+struct umtx_pi {
+ /* Owner thread */
+ struct thread *pi_owner;
+
+ /* Reference count */
+ int pi_refcount;
+
+	/* List entry to link umtx_pi structures held by a thread */
+ TAILQ_ENTRY(umtx_pi) pi_link;
+
+ /* List entry in hash */
+ TAILQ_ENTRY(umtx_pi) pi_hashlink;
+
+ /* List for waiters */
+ TAILQ_HEAD(,umtx_q) pi_blocked;
+
+ /* Identify a userland lock object */
+ struct umtx_key pi_key;
+};
+
+/* A userland synchronization object user. */
+struct umtx_q {
+ /* Linked list for the hash. */
+ TAILQ_ENTRY(umtx_q) uq_link;
+
+ /* Umtx key. */
+ struct umtx_key uq_key;
+
+ /* Umtx flags. */
+ int uq_flags;
+#define UQF_UMTXQ 0x0001
+
+	/* The thread that is waiting. */
+ struct thread *uq_thread;
+
+ /*
+	 * The PI mutex this thread is blocked on. Reads can use the
+	 * chain lock or umtx_lock; writes must hold both the chain
+	 * lock and umtx_lock.
+ */
+ struct umtx_pi *uq_pi_blocked;
+
+ /* On blocked list */
+ TAILQ_ENTRY(umtx_q) uq_lockq;
+
+ /* Thread contending with us */
+ TAILQ_HEAD(,umtx_pi) uq_pi_contested;
+
+ /* Inherited priority from PP mutex */
+ u_char uq_inherited_pri;
+
+ /* Spare queue ready to be reused */
+ struct umtxq_queue *uq_spare_queue;
+
+	/* The queue we are on */
+ struct umtxq_queue *uq_cur_queue;
+};
+
+TAILQ_HEAD(umtxq_head, umtx_q);
+
+/* Per-key wait-queue */
+struct umtxq_queue {
+ struct umtxq_head head;
+ struct umtx_key key;
+ LIST_ENTRY(umtxq_queue) link;
+ int length;
+};
+
+LIST_HEAD(umtxq_list, umtxq_queue);
+
+/* Userland lock object's wait-queue chain */
+struct umtxq_chain {
+ /* Lock for this chain. */
+ struct mtx uc_lock;
+
+ /* List of sleep queues. */
+ struct umtxq_list uc_queue[2];
+#define UMTX_SHARED_QUEUE 0
+#define UMTX_EXCLUSIVE_QUEUE 1
+
+ LIST_HEAD(, umtxq_queue) uc_spare_queue;
+
+ /* Busy flag */
+ char uc_busy;
+
+ /* Chain lock waiters */
+ int uc_waiters;
+
+	/* All PIs in the list */
+ TAILQ_HEAD(,umtx_pi) uc_pi_list;
+
+#ifdef UMTX_PROFILING
+ u_int length;
+ u_int max_length;
+#endif
+};
+
+#define UMTXQ_LOCKED_ASSERT(uc) mtx_assert(&(uc)->uc_lock, MA_OWNED)
+#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))
+
+/*
+ * Don't propagate time-sharing priority; there is a security reason.
+ * A user could simply create a PI mutex, let thread A lock the mutex,
+ * and let another thread B block on the mutex. Because B is sleeping,
+ * its priority would be boosted, which would boost A's priority via
+ * priority propagation as well and never lower it again, even if A
+ * were using 100% CPU. That would be unfair to other processes.
+ */
+
+#define UPRI(td) (((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
+ (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
+ PRI_MAX_TIMESHARE : (td)->td_user_pri)
+
+#define GOLDEN_RATIO_PRIME 2654404609U
+#define UMTX_CHAINS 512
+#define UMTX_SHIFTS (__WORD_BIT - 9)
+
+#define GET_SHARE(flags) \
+ (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
+
+#define BUSY_SPINS 200
+
+struct abs_timeout {
+ int clockid;
+ struct timespec cur;
+ struct timespec end;
+};
+
+static uma_zone_t umtx_pi_zone;
+static struct umtxq_chain umtxq_chains[2][UMTX_CHAINS];
+static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
+static int umtx_pi_allocated;
+
+static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
+SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
+ &umtx_pi_allocated, 0, "Allocated umtx_pi");
+
+#ifdef UMTX_PROFILING
+static long max_length;
+SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
+static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
+#endif
+
+static void umtxq_sysinit(void *);
+static void umtxq_hash(struct umtx_key *key);
+static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
+static void umtxq_lock(struct umtx_key *key);
+static void umtxq_unlock(struct umtx_key *key);
+static void umtxq_busy(struct umtx_key *key);
+static void umtxq_unbusy(struct umtx_key *key);
+static void umtxq_insert_queue(struct umtx_q *uq, int q);
+static void umtxq_remove_queue(struct umtx_q *uq, int q);
+static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
+static int umtxq_count(struct umtx_key *key);
+static struct umtx_pi *umtx_pi_alloc(int);
+static void umtx_pi_free(struct umtx_pi *pi);
+static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
+static void umtx_thread_cleanup(struct thread *td);
+static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
+ struct image_params *imgp __unused);
+SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
+
+#define umtxq_signal(key, nwake) umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
+#define umtxq_insert(uq) umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
+#define umtxq_remove(uq) umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
+
+static struct mtx umtx_lock;
+
+#ifdef UMTX_PROFILING
+static void
+umtx_init_profiling(void)
+{
+ struct sysctl_oid *chain_oid;
+ char chain_name[10];
+ int i;
+
+ for (i = 0; i < UMTX_CHAINS; ++i) {
+ snprintf(chain_name, sizeof(chain_name), "%d", i);
+ chain_oid = SYSCTL_ADD_NODE(NULL,
+ SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
+ chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
+ SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
+ "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
+ SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
+ "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
+ }
+}
+
+static int
+sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
+{
+ char buf[512];
+ struct sbuf sb;
+ struct umtxq_chain *uc;
+ u_int fract, i, j, tot, whole;
+ u_int sf0, sf1, sf2, sf3, sf4;
+ u_int si0, si1, si2, si3, si4;
+ u_int sw0, sw1, sw2, sw3, sw4;
+
+ sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
+ for (i = 0; i < 2; i++) {
+ tot = 0;
+ for (j = 0; j < UMTX_CHAINS; ++j) {
+ uc = &umtxq_chains[i][j];
+ mtx_lock(&uc->uc_lock);
+ tot += uc->max_length;
+ mtx_unlock(&uc->uc_lock);
+ }
+ if (tot == 0)
+ sbuf_printf(&sb, "%u) Empty ", i);
+ else {
+ sf0 = sf1 = sf2 = sf3 = sf4 = 0;
+ si0 = si1 = si2 = si3 = si4 = 0;
+ sw0 = sw1 = sw2 = sw3 = sw4 = 0;
+ for (j = 0; j < UMTX_CHAINS; j++) {
+ uc = &umtxq_chains[i][j];
+ mtx_lock(&uc->uc_lock);
+ whole = uc->max_length * 100;
+ mtx_unlock(&uc->uc_lock);
+ fract = (whole % tot) * 100;
+ if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
+ sf0 = fract;
+ si0 = j;
+ sw0 = whole;
+ } else if (UPROF_PERC_BIGGER(whole, fract, sw1,
+ sf1)) {
+ sf1 = fract;
+ si1 = j;
+ sw1 = whole;
+ } else if (UPROF_PERC_BIGGER(whole, fract, sw2,
+ sf2)) {
+ sf2 = fract;
+ si2 = j;
+ sw2 = whole;
+ } else if (UPROF_PERC_BIGGER(whole, fract, sw3,
+ sf3)) {
+ sf3 = fract;
+ si3 = j;
+ sw3 = whole;
+ } else if (UPROF_PERC_BIGGER(whole, fract, sw4,
+ sf4)) {
+ sf4 = fract;
+ si4 = j;
+ sw4 = whole;
+ }
+ }
+ sbuf_printf(&sb, "queue %u:\n", i);
+ sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
+ sf0 / tot, si0);
+ sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
+ sf1 / tot, si1);
+ sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
+ sf2 / tot, si2);
+ sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
+ sf3 / tot, si3);
+ sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
+ sf4 / tot, si4);
+ }
+ }
+ sbuf_trim(&sb);
+ sbuf_finish(&sb);
+ sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
+ sbuf_delete(&sb);
+ return (0);
+}
+
+static int
+sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
+{
+ struct umtxq_chain *uc;
+ u_int i, j;
+ int clear, error;
+
+ clear = 0;
+ error = sysctl_handle_int(oidp, &clear, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+
+ if (clear != 0) {
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < UMTX_CHAINS; ++j) {
+ uc = &umtxq_chains[i][j];
+ mtx_lock(&uc->uc_lock);
+ uc->length = 0;
+ uc->max_length = 0;
+ mtx_unlock(&uc->uc_lock);
+ }
+ }
+ }
+ return (0);
+}
+
+SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
+ sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
+SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
+ sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
+#endif
+
+static void
+umtxq_sysinit(void *arg __unused)
+{
+ int i, j;
+
+ umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < UMTX_CHAINS; ++j) {
+ mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
+ MTX_DEF | MTX_DUPOK);
+ LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
+ LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
+ LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
+ TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
+ umtxq_chains[i][j].uc_busy = 0;
+ umtxq_chains[i][j].uc_waiters = 0;
+#ifdef UMTX_PROFILING
+ umtxq_chains[i][j].length = 0;
+ umtxq_chains[i][j].max_length = 0;
+#endif
+ }
+ }
+#ifdef UMTX_PROFILING
+ umtx_init_profiling();
+#endif
+ mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
+ EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
+ EVENTHANDLER_PRI_ANY);
+}
+
+struct umtx_q *
+umtxq_alloc(void)
+{
+ struct umtx_q *uq;
+
+ uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
+ uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&uq->uq_spare_queue->head);
+ TAILQ_INIT(&uq->uq_pi_contested);
+ uq->uq_inherited_pri = PRI_MAX;
+ return (uq);
+}
+
+void
+umtxq_free(struct umtx_q *uq)
+{
+ MPASS(uq->uq_spare_queue != NULL);
+ free(uq->uq_spare_queue, M_UMTX);
+ free(uq, M_UMTX);
+}
+
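+/*
+ * Hash a umtx key into one of UMTX_CHAINS buckets using a
+ * multiplicative (golden-ratio) hash of the key's address information.
+ */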
+static inline void
+umtxq_hash(struct umtx_key *key)
+{
+ unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
+ key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
+}
+
+static inline struct umtxq_chain *
+umtxq_getchain(struct umtx_key *key)
+{
+ if (key->type <= TYPE_SEM)
+ return (&umtxq_chains[1][key->hash]);
+ return (&umtxq_chains[0][key->hash]);
+}
+
+/*
+ * Lock a chain.
+ */
+static inline void
+umtxq_lock(struct umtx_key *key)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(key);
+ mtx_lock(&uc->uc_lock);
+}
+
+/*
+ * Unlock a chain.
+ */
+static inline void
+umtxq_unlock(struct umtx_key *key)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(key);
+ mtx_unlock(&uc->uc_lock);
+}
+
+/*
+ * Set the chain to the busy state when the following operation
+ * may block (so a kernel mutex cannot be used).
+ */
+static inline void
+umtxq_busy(struct umtx_key *key)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(key);
+ mtx_assert(&uc->uc_lock, MA_OWNED);
+ if (uc->uc_busy) {
+#ifdef SMP
+ if (smp_cpus > 1) {
+ int count = BUSY_SPINS;
+ if (count > 0) {
+ umtxq_unlock(key);
+ while (uc->uc_busy && --count > 0)
+ cpu_spinwait();
+ umtxq_lock(key);
+ }
+ }
+#endif
+ while (uc->uc_busy) {
+ uc->uc_waiters++;
+ msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
+ uc->uc_waiters--;
+ }
+ }
+ uc->uc_busy = 1;
+}
+
+/*
+ * Unbusy a chain.
+ */
+static inline void
+umtxq_unbusy(struct umtx_key *key)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(key);
+ mtx_assert(&uc->uc_lock, MA_OWNED);
+ KASSERT(uc->uc_busy != 0, ("not busy"));
+ uc->uc_busy = 0;
+ if (uc->uc_waiters)
+ wakeup_one(uc);
+}
+
+static struct umtxq_queue *
+umtxq_queue_lookup(struct umtx_key *key, int q)
+{
+ struct umtxq_queue *uh;
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ LIST_FOREACH(uh, &uc->uc_queue[q], link) {
+ if (umtx_key_match(&uh->key, key))
+ return (uh);
+ }
+
+ return (NULL);
+}
+
+static inline void
+umtxq_insert_queue(struct umtx_q *uq, int q)
+{
+ struct umtxq_queue *uh;
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(&uq->uq_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
+ uh = umtxq_queue_lookup(&uq->uq_key, q);
+ if (uh != NULL) {
+ LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
+ } else {
+ uh = uq->uq_spare_queue;
+ uh->key = uq->uq_key;
+ LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
+#ifdef UMTX_PROFILING
+ uc->length++;
+ if (uc->length > uc->max_length) {
+ uc->max_length = uc->length;
+ if (uc->max_length > max_length)
+ max_length = uc->max_length;
+ }
+#endif
+ }
+ uq->uq_spare_queue = NULL;
+
+ TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
+ uh->length++;
+ uq->uq_flags |= UQF_UMTXQ;
+ uq->uq_cur_queue = uh;
+ return;
+}
+
+static inline void
+umtxq_remove_queue(struct umtx_q *uq, int q)
+{
+ struct umtxq_chain *uc;
+ struct umtxq_queue *uh;
+
+ uc = umtxq_getchain(&uq->uq_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ if (uq->uq_flags & UQF_UMTXQ) {
+ uh = uq->uq_cur_queue;
+ TAILQ_REMOVE(&uh->head, uq, uq_link);
+ uh->length--;
+ uq->uq_flags &= ~UQF_UMTXQ;
+ if (TAILQ_EMPTY(&uh->head)) {
+ KASSERT(uh->length == 0,
+ ("inconsistent umtxq_queue length"));
+#ifdef UMTX_PROFILING
+ uc->length--;
+#endif
+ LIST_REMOVE(uh, link);
+ } else {
+ uh = LIST_FIRST(&uc->uc_spare_queue);
+ KASSERT(uh != NULL, ("uc_spare_queue is empty"));
+ LIST_REMOVE(uh, link);
+ }
+ uq->uq_spare_queue = uh;
+ uq->uq_cur_queue = NULL;
+ }
+}
+
+/*
+ * Return the number of threads waiting on the key's shared queue.
+ */
+static int
+umtxq_count(struct umtx_key *key)
+{
+ struct umtxq_chain *uc;
+ struct umtxq_queue *uh;
+
+ uc = umtxq_getchain(key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
+ if (uh != NULL)
+ return (uh->length);
+ return (0);
+}
+
+/*
+ * Return the number of PI waiters on the key and, via *first, the
+ * first of them.
+ */
+static int
+umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
+{
+ struct umtxq_chain *uc;
+ struct umtxq_queue *uh;
+
+ *first = NULL;
+ uc = umtxq_getchain(key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
+ if (uh != NULL) {
+ *first = TAILQ_FIRST(&uh->head);
+ return (uh->length);
+ }
+ return (0);
+}
+
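+/*
+ * Check whether the thread has a pending suspension or single-threading
+ * request and translate it into EINTR or ERESTART so callers' retry
+ * loops can break out instead of looping forever.
+ */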
+static int
+umtxq_check_susp(struct thread *td)
+{
+ struct proc *p;
+ int error;
+
+ /*
+ * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
+ * eventually break the lockstep loop.
+ */
+ if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
+ return (0);
+ error = 0;
+ p = td->td_proc;
+ PROC_LOCK(p);
+ if (P_SHOULDSTOP(p) ||
+ ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
+ if (p->p_flag & P_SINGLE_EXIT)
+ error = EINTR;
+ else
+ error = ERESTART;
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+/*
+ * Wake up threads waiting on a userland object.
+ */
+
+static int
+umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
+{
+ struct umtxq_chain *uc;
+ struct umtxq_queue *uh;
+ struct umtx_q *uq;
+ int ret;
+
+ ret = 0;
+ uc = umtxq_getchain(key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ uh = umtxq_queue_lookup(key, q);
+ if (uh != NULL) {
+ while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
+ umtxq_remove_queue(uq, q);
+ wakeup(uq);
+ if (++ret >= n_wake)
+ return (ret);
+ }
+ }
+ return (ret);
+}
+
+
+/*
+ * Wake up the specified thread.
+ */
+static inline void
+umtxq_signal_thread(struct umtx_q *uq)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(&uq->uq_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ umtxq_remove(uq);
+ wakeup(uq);
+}
+
+static inline int
+tstohz(const struct timespec *tsp)
+{
+ struct timeval tv;
+
+ TIMESPEC_TO_TIMEVAL(&tv, tsp);
+ return tvtohz(&tv);
+}
+
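+/*
+ * Helpers for umtx timeouts: record the absolute end time in the
+ * requested clock and convert the time remaining into a tick count
+ * suitable for msleep().
+ */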
+static void
+abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
+ const struct timespec *timeout)
+{
+
+ timo->clockid = clockid;
+ if (!absolute) {
+ kern_clock_gettime(curthread, clockid, &timo->end);
+ timo->cur = timo->end;
+ timespecadd(&timo->end, timeout);
+ } else {
+ timo->end = *timeout;
+ kern_clock_gettime(curthread, clockid, &timo->cur);
+ }
+}
+
+static void
+abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
+{
+
+ abs_timeout_init(timo, umtxtime->_clockid,
+ (umtxtime->_flags & UMTX_ABSTIME) != 0,
+ &umtxtime->_timeout);
+}
+
+static inline void
+abs_timeout_update(struct abs_timeout *timo)
+{
+ kern_clock_gettime(curthread, timo->clockid, &timo->cur);
+}
+
+static int
+abs_timeout_gethz(struct abs_timeout *timo)
+{
+ struct timespec tts;
+
+ if (timespeccmp(&timo->end, &timo->cur, <=))
+ return (-1);
+ tts = timo->end;
+ timespecsub(&tts, &timo->cur);
+ return (tstohz(&tts));
+}
+
+/*
+ * Put the thread into a sleep state; before sleeping, check whether
+ * the thread has been removed from the umtx queue.
+ */
+static inline int
+umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
+{
+ struct umtxq_chain *uc;
+ int error, timo;
+
+ uc = umtxq_getchain(&uq->uq_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ for (;;) {
+ if (!(uq->uq_flags & UQF_UMTXQ))
+ return (0);
+ if (abstime != NULL) {
+ timo = abs_timeout_gethz(abstime);
+ if (timo < 0)
+ return (ETIMEDOUT);
+ } else
+ timo = 0;
+ error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
+ if (error != EWOULDBLOCK) {
+ umtxq_lock(&uq->uq_key);
+ break;
+ }
+ if (abstime != NULL)
+ abs_timeout_update(abstime);
+ umtxq_lock(&uq->uq_key);
+ }
+ return (error);
+}
+
+/*
+ * Convert a userspace address into a unique logical address.
+ */
+int
+umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
+{
+ struct thread *td = curthread;
+ vm_map_t map;
+ vm_map_entry_t entry;
+ vm_pindex_t pindex;
+ vm_prot_t prot;
+ boolean_t wired;
+
+ key->type = type;
+ if (share == THREAD_SHARE) {
+ key->shared = 0;
+ key->info.private.vs = td->td_proc->p_vmspace;
+ key->info.private.addr = (uintptr_t)addr;
+ } else {
+ MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
+ map = &td->td_proc->p_vmspace->vm_map;
+ if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
+ &entry, &key->info.shared.object, &pindex, &prot,
+ &wired) != KERN_SUCCESS) {
+ return EFAULT;
+ }
+
+ if ((share == PROCESS_SHARE) ||
+ (share == AUTO_SHARE &&
+ VM_INHERIT_SHARE == entry->inheritance)) {
+ key->shared = 1;
+ key->info.shared.offset = entry->offset + entry->start -
+ (vm_offset_t)addr;
+ vm_object_reference(key->info.shared.object);
+ } else {
+ key->shared = 0;
+ key->info.private.vs = td->td_proc->p_vmspace;
+ key->info.private.addr = (uintptr_t)addr;
+ }
+ vm_map_lookup_done(map, entry);
+ }
+
+ umtxq_hash(key);
+ return (0);
+}
+
+/*
+ * Release key.
+ */
+void
+umtx_key_release(struct umtx_key *key)
+{
+ if (key->shared)
+ vm_object_deallocate(key->info.shared.object);
+}
+
+/*
+ * Lock a umtx object.
+ */
+static int
+do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
+ const struct timespec *timeout)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ u_long owner;
+ u_long old;
+ int error = 0;
+
+ uq = td->td_umtxq;
+ if (timeout != NULL)
+ abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
+
+ /*
+	 * Care must be exercised when dealing with the umtx structure:
+	 * any access to it can fault.
+ */
+ for (;;) {
+ /*
+ * Try the uncontested case. This should be done in userland.
+ */
+ owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
+
+ /* The acquire succeeded. */
+ if (owner == UMTX_UNOWNED)
+ return (0);
+
+ /* The address was invalid. */
+ if (owner == -1)
+ return (EFAULT);
+
+ /* If no one owns it but it is contested try to acquire it. */
+ if (owner == UMTX_CONTESTED) {
+ owner = casuword(&umtx->u_owner,
+ UMTX_CONTESTED, id | UMTX_CONTESTED);
+
+ if (owner == UMTX_CONTESTED)
+ return (0);
+
+ /* The address was invalid. */
+ if (owner == -1)
+ return (EFAULT);
+
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+
+ /* If this failed the lock has changed, restart. */
+ continue;
+ }
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ break;
+
+ if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
+ AUTO_SHARE, &uq->uq_key)) != 0)
+ return (error);
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * Set the contested bit so that a release in user space
+ * knows to use the system call for unlock. If this fails
+		 * either someone else has acquired the lock or it has been
+ * released.
+ */
+ old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
+
+ /* The address was invalid. */
+ if (old == -1) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (EFAULT);
+ }
+
+ /*
+		 * If we set the contested bit, sleep. Otherwise the lock
+		 * changed and we need to retry, or we lost a race to the
+		 * thread unlocking the umtx.
+ */
+ umtxq_lock(&uq->uq_key);
+ if (old == owner)
+ error = umtxq_sleep(uq, "umtx", timeout == NULL ? NULL :
+ &timo);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+
+ if (error == 0)
+ error = umtxq_check_susp(td);
+ }
+
+ if (timeout == NULL) {
+ /* Mutex locking is restarted if it is interrupted. */
+ if (error == EINTR)
+ error = ERESTART;
+ } else {
+ /* Timed-locking is not restarted. */
+ if (error == ERESTART)
+ error = EINTR;
+ }
+ return (error);
+}
+
+/*
+ * Unlock a umtx object.
+ */
+static int
+do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
+{
+ struct umtx_key key;
+ u_long owner;
+ u_long old;
+ int error;
+ int count;
+
+ /*
+ * Make sure we own this mtx.
+ */
+ owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
+ if (owner == -1)
+ return (EFAULT);
+
+ if ((owner & ~UMTX_CONTESTED) != id)
+ return (EPERM);
+
+ /* This should be done in userland */
+ if ((owner & UMTX_CONTESTED) == 0) {
+ old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
+ if (old == -1)
+ return (EFAULT);
+ if (old == owner)
+ return (0);
+ owner = old;
+ }
+
+ /* We should only ever be in here for contested locks */
+ if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
+ &key)) != 0)
+ return (error);
+
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ count = umtxq_count(&key);
+ umtxq_unlock(&key);
+
+ /*
+ * When unlocking the umtx, it must be marked as unowned if
+	 * zero or only one thread is waiting for it.
+ * Otherwise, it must be marked as contested.
+ */
+ old = casuword(&umtx->u_owner, owner,
+ count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
+ umtxq_lock(&key);
+ umtxq_signal(&key,1);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ if (old == -1)
+ return (EFAULT);
+ if (old != owner)
+ return (EINVAL);
+ return (0);
+}
+
+#ifdef COMPAT_FREEBSD32
+
+/*
+ * Lock a umtx object.
+ */
+static int
+do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id,
+ const struct timespec *timeout)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ uint32_t owner;
+ uint32_t old;
+ int error = 0;
+
+ uq = td->td_umtxq;
+
+ if (timeout != NULL)
+ abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
+
+ /*
+	 * Care must be exercised when dealing with the umtx structure:
+	 * any access to it can fault.
+ */
+ for (;;) {
+ /*
+ * Try the uncontested case. This should be done in userland.
+ */
+ owner = casuword32(m, UMUTEX_UNOWNED, id);
+
+ /* The acquire succeeded. */
+ if (owner == UMUTEX_UNOWNED)
+ return (0);
+
+ /* The address was invalid. */
+ if (owner == -1)
+ return (EFAULT);
+
+ /* If no one owns it but it is contested try to acquire it. */
+ if (owner == UMUTEX_CONTESTED) {
+ owner = casuword32(m,
+ UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+ if (owner == UMUTEX_CONTESTED)
+ return (0);
+
+ /* The address was invalid. */
+ if (owner == -1)
+ return (EFAULT);
+
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+
+ /* If this failed the lock has changed, restart. */
+ continue;
+ }
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ return (error);
+
+ if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
+ AUTO_SHARE, &uq->uq_key)) != 0)
+ return (error);
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * Set the contested bit so that a release in user space
+ * knows to use the system call for unlock. If this fails
+		 * either someone else has acquired the lock or it has been
+ * released.
+ */
+ old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
+
+ /* The address was invalid. */
+ if (old == -1) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (EFAULT);
+ }
+
+ /*
+		 * If we set the contested bit, sleep. Otherwise the lock
+		 * changed and we need to retry, or we lost a race to the
+		 * thread unlocking the umtx.
+ */
+ umtxq_lock(&uq->uq_key);
+ if (old == owner)
+ error = umtxq_sleep(uq, "umtx", timeout == NULL ?
+ NULL : &timo);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+
+ if (error == 0)
+ error = umtxq_check_susp(td);
+ }
+
+ if (timeout == NULL) {
+ /* Mutex locking is restarted if it is interrupted. */
+ if (error == EINTR)
+ error = ERESTART;
+ } else {
+ /* Timed-locking is not restarted. */
+ if (error == ERESTART)
+ error = EINTR;
+ }
+ return (error);
+}
+
+/*
+ * Unlock a umtx object.
+ */
+static int
+do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
+{
+ struct umtx_key key;
+ uint32_t owner;
+ uint32_t old;
+ int error;
+ int count;
+
+ /*
+ * Make sure we own this mtx.
+ */
+ owner = fuword32(m);
+ if (owner == -1)
+ return (EFAULT);
+
+ if ((owner & ~UMUTEX_CONTESTED) != id)
+ return (EPERM);
+
+ /* This should be done in userland */
+ if ((owner & UMUTEX_CONTESTED) == 0) {
+ old = casuword32(m, owner, UMUTEX_UNOWNED);
+ if (old == -1)
+ return (EFAULT);
+ if (old == owner)
+ return (0);
+ owner = old;
+ }
+
+ /* We should only ever be in here for contested locks */
+ if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
+ &key)) != 0)
+ return (error);
+
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ count = umtxq_count(&key);
+ umtxq_unlock(&key);
+
+ /*
+ * When unlocking the umtx, it must be marked as unowned if
+	 * zero or only one thread is waiting for it.
+ * Otherwise, it must be marked as contested.
+ */
+ old = casuword32(m, owner,
+ count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
+ umtxq_lock(&key);
+ umtxq_signal(&key,1);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ if (old == -1)
+ return (EFAULT);
+ if (old != owner)
+ return (EINVAL);
+ return (0);
+}
+#endif
+
+/*
+ * Fetch and compare a value; sleep on the address if it has not changed.
+ */
+static int
+do_wait(struct thread *td, void *addr, u_long id,
+ struct _umtx_time *timeout, int compat32, int is_private)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ u_long tmp;
+ int error = 0;
+
+ uq = td->td_umtxq;
+ if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
+ is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
+ return (error);
+
+ if (timeout != NULL)
+ abs_timeout_init2(&timo, timeout);
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unlock(&uq->uq_key);
+ if (compat32 == 0)
+ tmp = fuword(addr);
+ else
+ tmp = (unsigned int)fuword32(addr);
+ umtxq_lock(&uq->uq_key);
+ if (tmp == id)
+ error = umtxq_sleep(uq, "uwait", timeout == NULL ?
+ NULL : &timo);
+ if ((uq->uq_flags & UQF_UMTXQ) == 0)
+ error = 0;
+ else
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ if (error == ERESTART)
+ error = EINTR;
+ return (error);
+}
+
+/*
+ * Wake up threads sleeping on the specified address.
+ */
+int
+kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
+{
+ struct umtx_key key;
+ int ret;
+
+ if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
+ is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
+ return (ret);
+ umtxq_lock(&key);
+ ret = umtxq_signal(&key, n_wake);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ return (0);
+}
+
+/*
+ * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
+ */
+static int
+do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
+ struct _umtx_time *timeout, int mode)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ uint32_t owner, old, id;
+ int error = 0;
+
+ id = td->td_tid;
+ uq = td->td_umtxq;
+
+ if (timeout != NULL)
+ abs_timeout_init2(&timo, timeout);
+
+ /*
+	 * Care must be exercised when dealing with the umtx structure:
+	 * any access to it can fault.
+ */
+ for (;;) {
+ owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
+ if (mode == _UMUTEX_WAIT) {
+ if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
+ return (0);
+ } else {
+ /*
+ * Try the uncontested case. This should be done in userland.
+ */
+ owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
+
+ /* The acquire succeeded. */
+ if (owner == UMUTEX_UNOWNED)
+ return (0);
+
+ /* The address was invalid. */
+ if (owner == -1)
+ return (EFAULT);
+
+ /* If no one owns it but it is contested try to acquire it. */
+ if (owner == UMUTEX_CONTESTED) {
+ owner = casuword32(&m->m_owner,
+ UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+
+ if (owner == UMUTEX_CONTESTED)
+ return (0);
+
+ /* The address was invalid. */
+ if (owner == -1)
+ return (EFAULT);
+
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ return (error);
+
+ /* If this failed the lock has changed, restart. */
+ continue;
+ }
+ }
+
+ if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
+ (owner & ~UMUTEX_CONTESTED) == id)
+ return (EDEADLK);
+
+ if (mode == _UMUTEX_TRY)
+ return (EBUSY);
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ return (error);
+
+ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
+ GET_SHARE(flags), &uq->uq_key)) != 0)
+ return (error);
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * Set the contested bit so that a release in user space
+ * knows to use the system call for unlock. If this fails,
+ * either someone else has acquired the lock or it has been
+ * released.
+ */
+ old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
+
+ /* The address was invalid. */
+ if (old == -1) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_remove(uq);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (EFAULT);
+ }
+
+ /*
+ * If we successfully set the contested bit, sleep. Otherwise
+ * the lock changed and we need to retry, or we lost a race to
+ * the thread unlocking the umtx.
+ */
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ if (old == owner)
+ error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
+ NULL : &timo);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+
+ if (error == 0)
+ error = umtxq_check_susp(td);
+ }
+
+ return (0);
+}
+
+/*
+ * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
+ */
+static int
+do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
+{
+ struct umtx_key key;
+ uint32_t owner, old, id;
+ int error;
+ int count;
+
+ id = td->td_tid;
+ /*
+ * Make sure we own this mtx.
+ */
+ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+ if (owner == -1)
+ return (EFAULT);
+
+ if ((owner & ~UMUTEX_CONTESTED) != id)
+ return (EPERM);
+
+ if ((owner & UMUTEX_CONTESTED) == 0) {
+ old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
+ if (old == -1)
+ return (EFAULT);
+ if (old == owner)
+ return (0);
+ owner = old;
+ }
+
+ /* We should only ever be in here for contested locks */
+ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
+ &key)) != 0)
+ return (error);
+
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ count = umtxq_count(&key);
+ umtxq_unlock(&key);
+
+ /*
+ * When unlocking the umtx, it must be marked as unowned if
+ * there is at most one thread waiting for it; otherwise it
+ * must be marked as contested.
+ */
+ old = casuword32(&m->m_owner, owner,
+ count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
+ umtxq_lock(&key);
+ umtxq_signal(&key, 1);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ if (old == -1)
+ return (EFAULT);
+ if (old != owner)
+ return (EINVAL);
+ return (0);
+}
+
+/*
+ * Check if the mutex is available and wake up a waiter;
+ * this is only for a simple mutex.
+ */
+static int
+do_wake_umutex(struct thread *td, struct umutex *m)
+{
+ struct umtx_key key;
+ uint32_t owner;
+ uint32_t flags;
+ int error;
+ int count;
+
+ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+ if (owner == -1)
+ return (EFAULT);
+
+ if ((owner & ~UMUTEX_CONTESTED) != 0)
+ return (0);
+
+ flags = fuword32(&m->m_flags);
+
+ /* We should only ever be in here for contested locks */
+ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
+ &key)) != 0)
+ return (error);
+
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ count = umtxq_count(&key);
+ umtxq_unlock(&key);
+
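+ /*
+ * If at most one thread is waiting, try to clear the
+ * contested bit so that userland can use the fast path again.
+ */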
+ if (count <= 1)
+ owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
+
+ umtxq_lock(&key);
+ if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
+ umtxq_signal(&key, 1);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ return (0);
+}
+
+/*
+ * Check if the mutex has waiters and try to fix the contention bit.
+ */
+static int
+do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
+{
+ struct umtx_key key;
+ uint32_t owner, old;
+ int type;
+ int error;
+ int count;
+
+ switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
+ case 0:
+ type = TYPE_NORMAL_UMUTEX;
+ break;
+ case UMUTEX_PRIO_INHERIT:
+ type = TYPE_PI_UMUTEX;
+ break;
+ case UMUTEX_PRIO_PROTECT:
+ type = TYPE_PP_UMUTEX;
+ break;
+ default:
+ return (EINVAL);
+ }
+ if ((error = umtx_key_get(m, type, GET_SHARE(flags),
+ &key)) != 0)
+ return (error);
+
+ owner = 0;
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ count = umtxq_count(&key);
+ umtxq_unlock(&key);
+ /*
+ * Only repair the contention bit if there is a waiter; this means
+ * the mutex is still being referenced by userland code. Otherwise,
+ * do not update any memory.
+ */
+ if (count > 1) {
+ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+ while ((owner & UMUTEX_CONTESTED) == 0) {
+ old = casuword32(&m->m_owner, owner,
+ owner | UMUTEX_CONTESTED);
+ if (old == owner)
+ break;
+ owner = old;
+ if (old == -1)
+ break;
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ }
+ } else if (count == 1) {
+ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+ while ((owner & ~UMUTEX_CONTESTED) != 0 &&
+ (owner & UMUTEX_CONTESTED) == 0) {
+ old = casuword32(&m->m_owner, owner,
+ owner | UMUTEX_CONTESTED);
+ if (old == owner)
+ break;
+ owner = old;
+ if (old == -1)
+ break;
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ }
+ }
+ umtxq_lock(&key);
+ if (owner == -1) {
+ error = EFAULT;
+ umtxq_signal(&key, INT_MAX);
+ }
+ else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
+ umtxq_signal(&key, 1);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ return (error);
+}
+
+static inline struct umtx_pi *
+umtx_pi_alloc(int flags)
+{
+ struct umtx_pi *pi;
+
+ pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
+ TAILQ_INIT(&pi->pi_blocked);
+ atomic_add_int(&umtx_pi_allocated, 1);
+ return (pi);
+}
+
+static inline void
+umtx_pi_free(struct umtx_pi *pi)
+{
+ uma_zfree(umtx_pi_zone, pi);
+ atomic_add_int(&umtx_pi_allocated, -1);
+}
+
+/*
+ * Adjust the thread's position on a pi_state after its priority has been
+ * changed.
+ */
+static int
+umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
+{
+ struct umtx_q *uq, *uq1, *uq2;
+ struct thread *td1;
+
+ mtx_assert(&umtx_lock, MA_OWNED);
+ if (pi == NULL)
+ return (0);
+
+ uq = td->td_umtxq;
+
+ /*
+ * Check if the thread needs to be moved on the blocked chain.
+ * It needs to be moved if either its priority is lower than
+ * the previous thread or higher than the next thread.
+ */
+ uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
+ uq2 = TAILQ_NEXT(uq, uq_lockq);
+ if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
+ (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
+ /*
+ * Remove thread from blocked chain and determine where
+ * it should be moved to.
+ */
+ TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
+ TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
+ td1 = uq1->uq_thread;
+ MPASS(td1->td_proc->p_magic == P_MAGIC);
+ if (UPRI(td1) > UPRI(td))
+ break;
+ }
+
+ if (uq1 == NULL)
+ TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
+ else
+ TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
+ }
+ return (1);
+}
+
+/*
+ * Propagate priority when a thread is blocked on POSIX
+ * PI mutex.
+ */
+static void
+umtx_propagate_priority(struct thread *td)
+{
+ struct umtx_q *uq;
+ struct umtx_pi *pi;
+ int pri;
+
+ mtx_assert(&umtx_lock, MA_OWNED);
+ pri = UPRI(td);
+ uq = td->td_umtxq;
+ pi = uq->uq_pi_blocked;
+ if (pi == NULL)
+ return;
+
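+ /*
+ * Walk the chain of PI mutex owners, lending this thread's
+ * priority to each owner whose lent priority is lower; stop
+ * when an owner already has an equal or higher priority, or
+ * is not blocked on another PI mutex.
+ */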
+ for (;;) {
+ td = pi->pi_owner;
+ if (td == NULL || td == curthread)
+ return;
+
+ MPASS(td->td_proc != NULL);
+ MPASS(td->td_proc->p_magic == P_MAGIC);
+
+ thread_lock(td);
+ if (td->td_lend_user_pri > pri)
+ sched_lend_user_prio(td, pri);
+ else {
+ thread_unlock(td);
+ break;
+ }
+ thread_unlock(td);
+
+ /*
+ * Pick up the lock that td is blocked on.
+ */
+ uq = td->td_umtxq;
+ pi = uq->uq_pi_blocked;
+ if (pi == NULL)
+ break;
+ /* Resort td on the list if needed. */
+ umtx_pi_adjust_thread(pi, td);
+ }
+}
+
+/*
+ * Unpropagate priority for a PI mutex when a thread blocked on
+ * it is interrupted by a signal or resumed by others.
+ */
+static void
+umtx_repropagate_priority(struct umtx_pi *pi)
+{
+ struct umtx_q *uq, *uq_owner;
+ struct umtx_pi *pi2;
+ int pri;
+
+ mtx_assert(&umtx_lock, MA_OWNED);
+
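+ /*
+ * For each owner along the blocked chain, recompute the
+ * priority to lend as the highest priority among the threads
+ * blocked on any of its contested PI mutexes, and by its own
+ * inherited priority.
+ */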
+ while (pi != NULL && pi->pi_owner != NULL) {
+ pri = PRI_MAX;
+ uq_owner = pi->pi_owner->td_umtxq;
+
+ TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
+ uq = TAILQ_FIRST(&pi2->pi_blocked);
+ if (uq != NULL) {
+ if (pri > UPRI(uq->uq_thread))
+ pri = UPRI(uq->uq_thread);
+ }
+ }
+
+ if (pri > uq_owner->uq_inherited_pri)
+ pri = uq_owner->uq_inherited_pri;
+ thread_lock(pi->pi_owner);
+ sched_lend_user_prio(pi->pi_owner, pri);
+ thread_unlock(pi->pi_owner);
+ if ((pi = uq_owner->uq_pi_blocked) != NULL)
+ umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
+ }
+}
+
+/*
+ * Insert a PI mutex into owned list.
+ */
+static void
+umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
+{
+ struct umtx_q *uq_owner;
+
+ uq_owner = owner->td_umtxq;
+ mtx_assert(&umtx_lock, MA_OWNED);
+ if (pi->pi_owner != NULL)
+ panic("pi_ower != NULL");
+ pi->pi_owner = owner;
+ TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
+}
+
+/*
+ * Claim ownership of a PI mutex.
+ */
+static int
+umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
+{
+ struct umtx_q *uq, *uq_owner;
+
+ uq_owner = owner->td_umtxq;
+ mtx_lock_spin(&umtx_lock);
+ if (pi->pi_owner == owner) {
+ mtx_unlock_spin(&umtx_lock);
+ return (0);
+ }
+
+ if (pi->pi_owner != NULL) {
+ /*
+ * userland may have already messed up the mutex, sigh.
+ */
+ mtx_unlock_spin(&umtx_lock);
+ return (EPERM);
+ }
+ umtx_pi_setowner(pi, owner);
+ uq = TAILQ_FIRST(&pi->pi_blocked);
+ if (uq != NULL) {
+ int pri;
+
+ pri = UPRI(uq->uq_thread);
+ thread_lock(owner);
+ if (pri < UPRI(owner))
+ sched_lend_user_prio(owner, pri);
+ thread_unlock(owner);
+ }
+ mtx_unlock_spin(&umtx_lock);
+ return (0);
+}
+
+/*
+ * Adjust a thread's position in the blocked list of its PI mutex;
+ * this may trigger a new round of priority propagation.
+ */
+void
+umtx_pi_adjust(struct thread *td, u_char oldpri)
+{
+ struct umtx_q *uq;
+ struct umtx_pi *pi;
+
+ uq = td->td_umtxq;
+ mtx_lock_spin(&umtx_lock);
+ /*
+ * Pick up the lock that td is blocked on.
+ */
+ pi = uq->uq_pi_blocked;
+ if (pi != NULL) {
+ umtx_pi_adjust_thread(pi, td);
+ umtx_repropagate_priority(pi);
+ }
+ mtx_unlock_spin(&umtx_lock);
+}
+
+/*
+ * Sleep on a PI mutex.
+ */
+static int
+umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
+ uint32_t owner, const char *wmesg, struct abs_timeout *timo)
+{
+ struct umtxq_chain *uc;
+ struct thread *td, *td1;
+ struct umtx_q *uq1;
+ int pri;
+ int error = 0;
+
+ td = uq->uq_thread;
+ KASSERT(td == curthread, ("inconsistent uq_thread"));
+ uc = umtxq_getchain(&uq->uq_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ UMTXQ_BUSY_ASSERT(uc);
+ umtxq_insert(uq);
+ mtx_lock_spin(&umtx_lock);
+ if (pi->pi_owner == NULL) {
+ mtx_unlock_spin(&umtx_lock);
+ /* XXX Only look up thread in current process. */
+ td1 = tdfind(owner, curproc->p_pid);
+ mtx_lock_spin(&umtx_lock);
+ if (td1 != NULL) {
+ if (pi->pi_owner == NULL)
+ umtx_pi_setowner(pi, td1);
+ PROC_UNLOCK(td1->td_proc);
+ }
+ }
+
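+ /*
+ * Insert the thread into the PI blocked list in priority
+ * order, so the highest-priority waiter is at the head.
+ */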
+ TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
+ pri = UPRI(uq1->uq_thread);
+ if (pri > UPRI(td))
+ break;
+ }
+
+ if (uq1 != NULL)
+ TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
+ else
+ TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
+
+ uq->uq_pi_blocked = pi;
+ thread_lock(td);
+ td->td_flags |= TDF_UPIBLOCKED;
+ thread_unlock(td);
+ umtx_propagate_priority(td);
+ mtx_unlock_spin(&umtx_lock);
+ umtxq_unbusy(&uq->uq_key);
+
+ error = umtxq_sleep(uq, wmesg, timo);
+ umtxq_remove(uq);
+
+ mtx_lock_spin(&umtx_lock);
+ uq->uq_pi_blocked = NULL;
+ thread_lock(td);
+ td->td_flags &= ~TDF_UPIBLOCKED;
+ thread_unlock(td);
+ TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
+ umtx_repropagate_priority(pi);
+ mtx_unlock_spin(&umtx_lock);
+ umtxq_unlock(&uq->uq_key);
+
+ return (error);
+}
+
+/*
+ * Increase the reference count of a PI mutex.
+ */
+static void
+umtx_pi_ref(struct umtx_pi *pi)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(&pi->pi_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ pi->pi_refcount++;
+}
+
+/*
+ * Decrease the reference count of a PI mutex; if the count
+ * drops to zero, its memory is freed.
+ */
+static void
+umtx_pi_unref(struct umtx_pi *pi)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(&pi->pi_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
+ if (--pi->pi_refcount == 0) {
+ mtx_lock_spin(&umtx_lock);
+ if (pi->pi_owner != NULL) {
+ TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
+ pi, pi_link);
+ pi->pi_owner = NULL;
+ }
+ KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
+ ("blocked queue not empty"));
+ mtx_unlock_spin(&umtx_lock);
+ TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
+ umtx_pi_free(pi);
+ }
+}
+
+/*
+ * Find a PI mutex in hash table.
+ */
+static struct umtx_pi *
+umtx_pi_lookup(struct umtx_key *key)
+{
+ struct umtxq_chain *uc;
+ struct umtx_pi *pi;
+
+ uc = umtxq_getchain(key);
+ UMTXQ_LOCKED_ASSERT(uc);
+
+ TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
+ if (umtx_key_match(&pi->pi_key, key)) {
+ return (pi);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Insert a PI mutex into hash table.
+ */
+static inline void
+umtx_pi_insert(struct umtx_pi *pi)
+{
+ struct umtxq_chain *uc;
+
+ uc = umtxq_getchain(&pi->pi_key);
+ UMTXQ_LOCKED_ASSERT(uc);
+ TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
+}
+
+/*
+ * Lock a PI mutex.
+ */
+static int
+do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
+ struct _umtx_time *timeout, int try)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ struct umtx_pi *pi, *new_pi;
+ uint32_t id, owner, old;
+ int error;
+
+ id = td->td_tid;
+ uq = td->td_umtxq;
+
+ if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
+ &uq->uq_key)) != 0)
+ return (error);
+
+ if (timeout != NULL)
+ abs_timeout_init2(&timo, timeout);
+
+ umtxq_lock(&uq->uq_key);
+ pi = umtx_pi_lookup(&uq->uq_key);
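+ /*
+ * If there is no PI record for this key, allocate one. Try a
+ * non-sleeping allocation first; if that fails, drop the queue
+ * lock, allocate with M_WAITOK and re-check whether a racing
+ * thread installed a record in the meantime.
+ */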
+ if (pi == NULL) {
+ new_pi = umtx_pi_alloc(M_NOWAIT);
+ if (new_pi == NULL) {
+ umtxq_unlock(&uq->uq_key);
+ new_pi = umtx_pi_alloc(M_WAITOK);
+ umtxq_lock(&uq->uq_key);
+ pi = umtx_pi_lookup(&uq->uq_key);
+ if (pi != NULL) {
+ umtx_pi_free(new_pi);
+ new_pi = NULL;
+ }
+ }
+ if (new_pi != NULL) {
+ new_pi->pi_key = uq->uq_key;
+ umtx_pi_insert(new_pi);
+ pi = new_pi;
+ }
+ }
+ umtx_pi_ref(pi);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * Care must be exercised when dealing with umtx structure. It
+ * can fault on any access.
+ */
+ for (;;) {
+ /*
+ * Try the uncontested case. This should be done in userland.
+ */
+ owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
+
+ /* The acquire succeeded. */
+ if (owner == UMUTEX_UNOWNED) {
+ error = 0;
+ break;
+ }
+
+ /* The address was invalid. */
+ if (owner == -1) {
+ error = EFAULT;
+ break;
+ }
+
+ /* If no one owns it but it is contested try to acquire it. */
+ if (owner == UMUTEX_CONTESTED) {
+ owner = casuword32(&m->m_owner,
+ UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+
+ if (owner == UMUTEX_CONTESTED) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ error = umtx_pi_claim(pi, td);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ break;
+ }
+
+ /* The address was invalid. */
+ if (owner == -1) {
+ error = EFAULT;
+ break;
+ }
+
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+
+ /* If this failed the lock has changed, restart. */
+ continue;
+ }
+
+ if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
+ (owner & ~UMUTEX_CONTESTED) == id) {
+ error = EDEADLK;
+ break;
+ }
+
+ if (try != 0) {
+ error = EBUSY;
+ break;
+ }
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ break;
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * Set the contested bit so that a release in user space
+ * knows to use the system call for unlock. If this fails,
+ * either someone else has acquired the lock or it has been
+ * released.
+ */
+ old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
+
+ /* The address was invalid. */
+ if (old == -1) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ error = EFAULT;
+ break;
+ }
+
+ umtxq_lock(&uq->uq_key);
+ /*
+ * If we successfully set the contested bit, sleep. Otherwise
+ * the lock changed and we need to retry, or we lost a race to
+ * the thread unlocking the umtx.
+ */
+ if (old == owner)
+ error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
+ "umtxpi", timeout == NULL ? NULL : &timo);
+ else {
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ }
+
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ }
+
+ umtxq_lock(&uq->uq_key);
+ umtx_pi_unref(pi);
+ umtxq_unlock(&uq->uq_key);
+
+ umtx_key_release(&uq->uq_key);
+ return (error);
+}
+
+/*
+ * Unlock a PI mutex.
+ */
+static int
+do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
+{
+ struct umtx_key key;
+ struct umtx_q *uq_first, *uq_first2, *uq_me;
+ struct umtx_pi *pi, *pi2;
+ uint32_t owner, old, id;
+ int error;
+ int count;
+ int pri;
+
+ id = td->td_tid;
+ /*
+ * Make sure we own this mtx.
+ */
+ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+ if (owner == -1)
+ return (EFAULT);
+
+ if ((owner & ~UMUTEX_CONTESTED) != id)
+ return (EPERM);
+
+ /* This should be done in userland */
+ if ((owner & UMUTEX_CONTESTED) == 0) {
+ old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
+ if (old == -1)
+ return (EFAULT);
+ if (old == owner)
+ return (0);
+ owner = old;
+ }
+
+ /* We should only ever be in here for contested locks */
+ if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
+ &key)) != 0)
+ return (error);
+
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ count = umtxq_count_pi(&key, &uq_first);
+ if (uq_first != NULL) {
+ mtx_lock_spin(&umtx_lock);
+ pi = uq_first->uq_pi_blocked;
+ KASSERT(pi != NULL, ("pi == NULL?"));
+ if (pi->pi_owner != curthread) {
+ mtx_unlock_spin(&umtx_lock);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ /* userland messed up the mutex */
+ return (EPERM);
+ }
+ uq_me = curthread->td_umtxq;
+ pi->pi_owner = NULL;
+ TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
+ /* get highest priority thread which is still sleeping. */
+ uq_first = TAILQ_FIRST(&pi->pi_blocked);
+ while (uq_first != NULL &&
+ (uq_first->uq_flags & UQF_UMTXQ) == 0) {
+ uq_first = TAILQ_NEXT(uq_first, uq_lockq);
+ }
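+ /*
+ * Recompute the priority lent to us from the PI mutexes
+ * we still own, now that this one is no longer ours.
+ */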
+ pri = PRI_MAX;
+ TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
+ uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
+ if (uq_first2 != NULL) {
+ if (pri > UPRI(uq_first2->uq_thread))
+ pri = UPRI(uq_first2->uq_thread);
+ }
+ }
+ thread_lock(curthread);
+ sched_lend_user_prio(curthread, pri);
+ thread_unlock(curthread);
+ mtx_unlock_spin(&umtx_lock);
+ if (uq_first)
+ umtxq_signal_thread(uq_first);
+ }
+ umtxq_unlock(&key);
+
+ /*
+ * When unlocking the umtx, it must be marked as unowned if
+ * there is at most one thread waiting for it; otherwise it
+ * must be marked as contested.
+ */
+ old = casuword32(&m->m_owner, owner,
+ count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
+
+ umtxq_lock(&key);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ if (old == -1)
+ return (EFAULT);
+ if (old != owner)
+ return (EINVAL);
+ return (0);
+}
+
+/*
+ * Lock a PP mutex.
+ */
+static int
+do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
+ struct _umtx_time *timeout, int try)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq, *uq2;
+ struct umtx_pi *pi;
+ uint32_t ceiling;
+ uint32_t owner, id;
+ int error, pri, old_inherited_pri, su;
+
+ id = td->td_tid;
+ uq = td->td_umtxq;
+ if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
+ &uq->uq_key)) != 0)
+ return (error);
+
+ if (timeout != NULL)
+ abs_timeout_init2(&timo, timeout);
+
+ su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
+ for (;;) {
+ old_inherited_pri = uq->uq_inherited_pri;
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
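+ /*
+ * Translate the userland ceiling into the kernel real-time
+ * priority range; out-of-range ceilings are rejected below.
+ */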
+ ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
+ if (ceiling > RTP_PRIO_MAX) {
+ error = EINVAL;
+ goto out;
+ }
+
+ mtx_lock_spin(&umtx_lock);
+ if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
+ mtx_unlock_spin(&umtx_lock);
+ error = EINVAL;
+ goto out;
+ }
+ if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
+ uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
+ thread_lock(td);
+ if (uq->uq_inherited_pri < UPRI(td))
+ sched_lend_user_prio(td, uq->uq_inherited_pri);
+ thread_unlock(td);
+ }
+ mtx_unlock_spin(&umtx_lock);
+
+ owner = casuword32(&m->m_owner,
+ UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+
+ if (owner == UMUTEX_CONTESTED) {
+ error = 0;
+ break;
+ }
+
+ /* The address was invalid. */
+ if (owner == -1) {
+ error = EFAULT;
+ break;
+ }
+
+ if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
+ (owner & ~UMUTEX_CONTESTED) == id) {
+ error = EDEADLK;
+ break;
+ }
+
+ if (try != 0) {
+ error = EBUSY;
+ break;
+ }
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ break;
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unbusy(&uq->uq_key);
+ error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
+ NULL : &timo);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+
+ mtx_lock_spin(&umtx_lock);
+ uq->uq_inherited_pri = old_inherited_pri;
+ pri = PRI_MAX;
+ TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
+ uq2 = TAILQ_FIRST(&pi->pi_blocked);
+ if (uq2 != NULL) {
+ if (pri > UPRI(uq2->uq_thread))
+ pri = UPRI(uq2->uq_thread);
+ }
+ }
+ if (pri > uq->uq_inherited_pri)
+ pri = uq->uq_inherited_pri;
+ thread_lock(td);
+ sched_lend_user_prio(td, pri);
+ thread_unlock(td);
+ mtx_unlock_spin(&umtx_lock);
+ }
+
+ if (error != 0) {
+ mtx_lock_spin(&umtx_lock);
+ uq->uq_inherited_pri = old_inherited_pri;
+ pri = PRI_MAX;
+ TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
+ uq2 = TAILQ_FIRST(&pi->pi_blocked);
+ if (uq2 != NULL) {
+ if (pri > UPRI(uq2->uq_thread))
+ pri = UPRI(uq2->uq_thread);
+ }
+ }
+ if (pri > uq->uq_inherited_pri)
+ pri = uq->uq_inherited_pri;
+ thread_lock(td);
+ sched_lend_user_prio(td, pri);
+ thread_unlock(td);
+ mtx_unlock_spin(&umtx_lock);
+ }
+
+out:
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (error);
+}
+
+/*
+ * Unlock a PP mutex.
+ */
+static int
+do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
+{
+ struct umtx_key key;
+ struct umtx_q *uq, *uq2;
+ struct umtx_pi *pi;
+ uint32_t owner, id;
+ uint32_t rceiling;
+ int error, pri, new_inherited_pri, su;
+
+ id = td->td_tid;
+ uq = td->td_umtxq;
+ su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
+
+ /*
+ * Make sure we own this mtx.
+ */
+ owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+ if (owner == -1)
+ return (EFAULT);
+
+ if ((owner & ~UMUTEX_CONTESTED) != id)
+ return (EPERM);
+
+ error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
+ if (error != 0)
+ return (error);
+
+ if (rceiling == -1)
+ new_inherited_pri = PRI_MAX;
+ else {
+ rceiling = RTP_PRIO_MAX - rceiling;
+ if (rceiling > RTP_PRIO_MAX)
+ return (EINVAL);
+ new_inherited_pri = PRI_MIN_REALTIME + rceiling;
+ }
+
+ if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
+ &key)) != 0)
+ return (error);
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ umtxq_unlock(&key);
+ /*
+ * For a priority protected mutex, always set the unlocked state
+ * to UMUTEX_CONTESTED so that userland always enters the kernel
+ * to lock the mutex; this is necessary because the thread priority
+ * has to be adjusted for such a mutex.
+ */
+ error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
+ UMUTEX_CONTESTED);
+
+ umtxq_lock(&key);
+ if (error == 0)
+ umtxq_signal(&key, 1);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+
+ if (error == -1)
+ error = EFAULT;
+ else {
+ mtx_lock_spin(&umtx_lock);
+ if (su != 0)
+ uq->uq_inherited_pri = new_inherited_pri;
+ pri = PRI_MAX;
+ TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
+ uq2 = TAILQ_FIRST(&pi->pi_blocked);
+ if (uq2 != NULL) {
+ if (pri > UPRI(uq2->uq_thread))
+ pri = UPRI(uq2->uq_thread);
+ }
+ }
+ if (pri > uq->uq_inherited_pri)
+ pri = uq->uq_inherited_pri;
+ thread_lock(td);
+ sched_lend_user_prio(td, pri);
+ thread_unlock(td);
+ mtx_unlock_spin(&umtx_lock);
+ }
+ umtx_key_release(&key);
+ return (error);
+}
+
+static int
+do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
+ uint32_t *old_ceiling)
+{
+ struct umtx_q *uq;
+ uint32_t save_ceiling;
+ uint32_t owner, id;
+ uint32_t flags;
+ int error;
+
+ flags = fuword32(&m->m_flags);
+ if ((flags & UMUTEX_PRIO_PROTECT) == 0)
+ return (EINVAL);
+ if (ceiling > RTP_PRIO_MAX)
+ return (EINVAL);
+ id = td->td_tid;
+ uq = td->td_umtxq;
+ if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
+ &uq->uq_key)) != 0)
+ return (error);
+ for (;;) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ save_ceiling = fuword32(&m->m_ceilings[0]);
+
+ owner = casuword32(&m->m_owner,
+ UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+
+ if (owner == UMUTEX_CONTESTED) {
+ suword32(&m->m_ceilings[0], ceiling);
+ suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
+ UMUTEX_CONTESTED);
+ error = 0;
+ break;
+ }
+
+ /* The address was invalid. */
+ if (owner == -1) {
+ error = EFAULT;
+ break;
+ }
+
+ if ((owner & ~UMUTEX_CONTESTED) == id) {
+ suword32(&m->m_ceilings[0], ceiling);
+ error = 0;
+ break;
+ }
+
+ /*
+ * If we caught a signal, we have retried and now
+ * exit immediately.
+ */
+ if (error != 0)
+ break;
+
+ /*
+ * If we successfully set the contested bit, sleep. Otherwise
+ * the lock changed and we need to retry, or we lost a race to
+ * the thread unlocking the umtx.
+ */
+ umtxq_lock(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unbusy(&uq->uq_key);
+ error = umtxq_sleep(uq, "umtxpp", NULL);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ }
+ umtxq_lock(&uq->uq_key);
+ if (error == 0)
+ umtxq_signal(&uq->uq_key, INT_MAX);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ if (error == 0 && old_ceiling != NULL)
+ suword32(old_ceiling, save_ceiling);
+ return (error);
+}
+
+/*
+ * Lock a userland POSIX mutex.
+ */
+static int
+do_lock_umutex(struct thread *td, struct umutex *m,
+ struct _umtx_time *timeout, int mode)
+{
+ uint32_t flags;
+ int error;
+
+ flags = fuword32(&m->m_flags);
+ if (flags == -1)
+ return (EFAULT);
+
+ switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
+ case 0:
+ error = do_lock_normal(td, m, flags, timeout, mode);
+ break;
+ case UMUTEX_PRIO_INHERIT:
+ error = do_lock_pi(td, m, flags, timeout, mode);
+ break;
+ case UMUTEX_PRIO_PROTECT:
+ error = do_lock_pp(td, m, flags, timeout, mode);
+ break;
+ default:
+ return (EINVAL);
+ }
+ if (timeout == NULL) {
+ if (error == EINTR && mode != _UMUTEX_WAIT)
+ error = ERESTART;
+ } else {
+ /* Timed-locking is not restarted. */
+ if (error == ERESTART)
+ error = EINTR;
+ }
+ return (error);
+}
+
+/*
+ * Unlock a userland POSIX mutex.
+ */
+static int
+do_unlock_umutex(struct thread *td, struct umutex *m)
+{
+ uint32_t flags;
+
+ flags = fuword32(&m->m_flags);
+ if (flags == -1)
+ return (EFAULT);
+
+ switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
+ case 0:
+ return (do_unlock_normal(td, m, flags));
+ case UMUTEX_PRIO_INHERIT:
+ return (do_unlock_pi(td, m, flags));
+ case UMUTEX_PRIO_PROTECT:
+ return (do_unlock_pp(td, m, flags));
+ }
+
+ return (EINVAL);
+}
+
+static int
+do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
+ struct timespec *timeout, u_long wflags)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ uint32_t flags;
+ uint32_t clockid;
+ int error;
+
+ uq = td->td_umtxq;
+ flags = fuword32(&cv->c_flags);
+ error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
+ if (error != 0)
+ return (error);
+
+ if ((wflags & CVWAIT_CLOCKID) != 0) {
+ clockid = fuword32(&cv->c_clockid);
+ if (clockid < CLOCK_REALTIME ||
+ clockid >= CLOCK_THREAD_CPUTIME_ID) {
+ /* hmm, only HW clock id will work. */
+ return (EINVAL);
+ }
+ } else {
+ clockid = CLOCK_REALTIME;
+ }
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * Set c_has_waiters to 1 before releasing the user mutex, but
+ * avoid touching the cache line when it is not necessary.
+ */
+ if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
+ suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ error = do_unlock_umutex(td, m);
+
+ if (timeout != NULL)
+ abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
+ timeout);
+
+ umtxq_lock(&uq->uq_key);
+ if (error == 0) {
+ error = umtxq_sleep(uq, "ucond", timeout == NULL ?
+ NULL : &timo);
+ }
+
+ if ((uq->uq_flags & UQF_UMTXQ) == 0)
+ error = 0;
+ else {
+ /*
+ * This must be a timeout, an interruption by a signal,
+ * or a spurious wakeup; clear the c_has_waiters flag
+ * when necessary.
+ */
+ umtxq_busy(&uq->uq_key);
+ if ((uq->uq_flags & UQF_UMTXQ) != 0) {
+ int oldlen = uq->uq_cur_queue->length;
+ umtxq_remove(uq);
+ if (oldlen == 1) {
+ umtxq_unlock(&uq->uq_key);
+ suword32(
+ __DEVOLATILE(uint32_t *,
+ &cv->c_has_waiters), 0);
+ umtxq_lock(&uq->uq_key);
+ }
+ }
+ umtxq_unbusy(&uq->uq_key);
+ if (error == ERESTART)
+ error = EINTR;
+ }
+
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (error);
+}
+
+/*
+ * Signal a userland condition variable.
+ */
+static int
+do_cv_signal(struct thread *td, struct ucond *cv)
+{
+ struct umtx_key key;
+ int error, cnt, nwake;
+ uint32_t flags;
+
+ flags = fuword32(&cv->c_flags);
+ if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
+ return (error);
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ cnt = umtxq_count(&key);
+ nwake = umtxq_signal(&key, 1);
+ if (cnt <= nwake) {
+ umtxq_unlock(&key);
+ error = suword32(
+ __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
+ umtxq_lock(&key);
+ }
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ return (error);
+}
+
+static int
+do_cv_broadcast(struct thread *td, struct ucond *cv)
+{
+ struct umtx_key key;
+ int error;
+ uint32_t flags;
+
+ flags = fuword32(&cv->c_flags);
+ if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
+ return (error);
+
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ umtxq_signal(&key, INT_MAX);
+ umtxq_unlock(&key);
+
+ error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
+
+ umtxq_lock(&key);
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+
+ umtx_key_release(&key);
+ return (error);
+}
+
+static int
+do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ uint32_t flags, wrflags;
+ int32_t state, oldstate;
+ int32_t blocked_readers;
+ int error;
+
+ uq = td->td_umtxq;
+ flags = fuword32(&rwlock->rw_flags);
+ error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
+ if (error != 0)
+ return (error);
+
+ if (timeout != NULL)
+ abs_timeout_init2(&timo, timeout);
+
+ wrflags = URWLOCK_WRITE_OWNER;
+ if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
+ wrflags |= URWLOCK_WRITE_WAITERS;
+
+ for (;;) {
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+ /* try to lock it */
+ while (!(state & wrflags)) {
+ if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
+ umtx_key_release(&uq->uq_key);
+ return (EAGAIN);
+ }
+ oldstate = casuword32(&rwlock->rw_state, state, state + 1);
+ if (oldstate == -1) {
+ umtx_key_release(&uq->uq_key);
+ return (EFAULT);
+ }
+ if (oldstate == state) {
+ umtx_key_release(&uq->uq_key);
+ return (0);
+ }
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ state = oldstate;
+ }
+
+ if (error)
+ break;
+
+ /* grab monitor lock */
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * re-read the state, in case it changed between the try-lock above
+ * and the check below
+ */
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+
+ /* set read contention bit */
+ while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
+ oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
+ if (oldstate == -1) {
+ error = EFAULT;
+ break;
+ }
+ if (oldstate == state)
+ goto sleep;
+ state = oldstate;
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ }
+ if (error != 0) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ break;
+ }
+
+ /* the state changed while setting the flags, restart */
+ if (!(state & wrflags)) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ continue;
+ }
+
+sleep:
+ /* contention bit is set, before sleeping, increase read waiter count */
+ blocked_readers = fuword32(&rwlock->rw_blocked_readers);
+ suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
+
+ while (state & wrflags) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unbusy(&uq->uq_key);
+
+ error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
+ NULL : &timo);
+
+ umtxq_busy(&uq->uq_key);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ if (error)
+ break;
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+ }
+
+ /* decrease read waiter count, and may clear read contention bit */
+ blocked_readers = fuword32(&rwlock->rw_blocked_readers);
+ suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
+ if (blocked_readers == 1) {
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+ for (;;) {
+ oldstate = casuword32(&rwlock->rw_state, state,
+ state & ~URWLOCK_READ_WAITERS);
+ if (oldstate == -1) {
+ error = EFAULT;
+ break;
+ }
+ if (oldstate == state)
+ break;
+ state = oldstate;
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ }
+ }
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ if (error != 0)
+ break;
+ }
+ umtx_key_release(&uq->uq_key);
+ if (error == ERESTART)
+ error = EINTR;
+ return (error);
+}
+
+static int
+do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ uint32_t flags;
+ int32_t state, oldstate;
+ int32_t blocked_writers;
+ int32_t blocked_readers;
+ int error;
+
+ uq = td->td_umtxq;
+ flags = fuword32(&rwlock->rw_flags);
+ error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
+ if (error != 0)
+ return (error);
+
+ if (timeout != NULL)
+ abs_timeout_init2(&timo, timeout);
+
+ blocked_readers = 0;
+ for (;;) {
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+ while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
+ oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
+ if (oldstate == -1) {
+ umtx_key_release(&uq->uq_key);
+ return (EFAULT);
+ }
+ if (oldstate == state) {
+ umtx_key_release(&uq->uq_key);
+ return (0);
+ }
+ state = oldstate;
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ }
+
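+ /*
+ * If we are bailing out with an error and the lock has
+ * neither a write owner nor other write waiters, wake up
+ * any blocked readers.
+ */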
+ if (error) {
+ if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
+ blocked_readers != 0) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ }
+
+ break;
+ }
+
+ /* grab monitor lock */
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+
+ /*
+ * re-read the state, in case it changed between the try-lock above
+ * and the check below
+ */
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+
+ while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
+ (state & URWLOCK_WRITE_WAITERS) == 0) {
+ oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
+ if (oldstate == -1) {
+ error = EFAULT;
+ break;
+ }
+ if (oldstate == state)
+ goto sleep;
+ state = oldstate;
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ }
+ if (error != 0) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ break;
+ }
+
+ if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ break;
+ continue;
+ }
+sleep:
+ blocked_writers = fuword32(&rwlock->rw_blocked_writers);
+ suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
+
+ while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
+ umtxq_unbusy(&uq->uq_key);
+
+ error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
+ NULL : &timo);
+
+ umtxq_busy(&uq->uq_key);
+ umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
+ umtxq_unlock(&uq->uq_key);
+ if (error)
+ break;
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+ }
+
+ blocked_writers = fuword32(&rwlock->rw_blocked_writers);
+ suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
+ if (blocked_writers == 1) {
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+ for (;;) {
+ oldstate = casuword32(&rwlock->rw_state, state,
+ state & ~URWLOCK_WRITE_WAITERS);
+ if (oldstate == -1) {
+ error = EFAULT;
+ break;
+ }
+ if (oldstate == state)
+ break;
+ state = oldstate;
+ error = umtxq_check_susp(td);
+ /*
+ * We are leaving URWLOCK_WRITE_WAITERS set
+ * behind, but this should not harm
+ * correctness.
+ */
+ if (error != 0)
+ break;
+ }
+ blocked_readers = fuword32(&rwlock->rw_blocked_readers);
+ } else
+ blocked_readers = 0;
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ }
+
+ umtx_key_release(&uq->uq_key);
+ if (error == ERESTART)
+ error = EINTR;
+ return (error);
+}
+
+static int
+do_rw_unlock(struct thread *td, struct urwlock *rwlock)
+{
+ struct umtx_q *uq;
+ uint32_t flags;
+ int32_t state, oldstate;
+ int error, q, count;
+
+ uq = td->td_umtxq;
+ flags = fuword32(&rwlock->rw_flags);
+ error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
+ if (error != 0)
+ return (error);
+
+ state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+ if (state & URWLOCK_WRITE_OWNER) {
+ for (;;) {
+ oldstate = casuword32(&rwlock->rw_state, state,
+ state & ~URWLOCK_WRITE_OWNER);
+ if (oldstate == -1) {
+ error = EFAULT;
+ goto out;
+ }
+ if (oldstate != state) {
+ state = oldstate;
+ if (!(oldstate & URWLOCK_WRITE_OWNER)) {
+ error = EPERM;
+ goto out;
+ }
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ goto out;
+ } else
+ break;
+ }
+ } else if (URWLOCK_READER_COUNT(state) != 0) {
+ for (;;) {
+ oldstate = casuword32(&rwlock->rw_state, state,
+ state - 1);
+ if (oldstate == -1) {
+ error = EFAULT;
+ goto out;
+ }
+ if (oldstate != state) {
+ state = oldstate;
+ if (URWLOCK_READER_COUNT(oldstate) == 0) {
+ error = EPERM;
+ goto out;
+ }
+ error = umtxq_check_susp(td);
+ if (error != 0)
+ goto out;
+ } else
+ break;
+ }
+ } else {
+ error = EPERM;
+ goto out;
+ }
+
+ count = 0;
+
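+ /*
+ * Decide which waiters to wake: writers are preferred unless
+ * the lock was created with URWLOCK_PREFER_READER.
+ */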
+ if (!(flags & URWLOCK_PREFER_READER)) {
+ if (state & URWLOCK_WRITE_WAITERS) {
+ count = 1;
+ q = UMTX_EXCLUSIVE_QUEUE;
+ } else if (state & URWLOCK_READ_WAITERS) {
+ count = INT_MAX;
+ q = UMTX_SHARED_QUEUE;
+ }
+ } else {
+ if (state & URWLOCK_READ_WAITERS) {
+ count = INT_MAX;
+ q = UMTX_SHARED_QUEUE;
+ } else if (state & URWLOCK_WRITE_WAITERS) {
+ count = 1;
+ q = UMTX_EXCLUSIVE_QUEUE;
+ }
+ }
+
+ if (count) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_signal_queue(&uq->uq_key, count, q);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_unlock(&uq->uq_key);
+ }
+out:
+ umtx_key_release(&uq->uq_key);
+ return (error);
+}
+
+static int
+do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
+{
+ struct abs_timeout timo;
+ struct umtx_q *uq;
+ uint32_t flags, count;
+ int error;
+
+ uq = td->td_umtxq;
+ flags = fuword32(&sem->_flags);
+ error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
+ if (error != 0)
+ return (error);
+
+ if (timeout != NULL)
+ abs_timeout_init2(&timo, timeout);
+
+ umtxq_lock(&uq->uq_key);
+ umtxq_busy(&uq->uq_key);
+ umtxq_insert(uq);
+ umtxq_unlock(&uq->uq_key);
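+ /*
+ * Advertise a waiter before re-reading the count, so that a
+ * concurrent do_sem_wake() cannot miss this thread.
+ */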
+ casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
+ count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
+ if (count != 0) {
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+ umtxq_remove(uq);
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (0);
+ }
+ umtxq_lock(&uq->uq_key);
+ umtxq_unbusy(&uq->uq_key);
+
+ error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
+
+ if ((uq->uq_flags & UQF_UMTXQ) == 0)
+ error = 0;
+ else {
+ umtxq_remove(uq);
+ /* A relative timeout cannot be restarted. */
+ if (error == ERESTART && timeout != NULL &&
+ (timeout->_flags & UMTX_ABSTIME) == 0)
+ error = EINTR;
+ }
+ umtxq_unlock(&uq->uq_key);
+ umtx_key_release(&uq->uq_key);
+ return (error);
+}
+
+/*
+ * Signal a userland semaphore.
+ */
+static int
+do_sem_wake(struct thread *td, struct _usem *sem)
+{
+ struct umtx_key key;
+ int error, cnt;
+ uint32_t flags;
+
+ flags = fuword32(&sem->_flags);
+ if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
+ return (error);
+ umtxq_lock(&key);
+ umtxq_busy(&key);
+ cnt = umtxq_count(&key);
+ if (cnt > 0) {
+ umtxq_signal(&key, 1);
+ /*
+ * A count greater than 0 means the memory is still being
+ * referenced by user code, so we can safely update the
+ * _has_waiters flag.
+ */
+ if (cnt == 1) {
+ umtxq_unlock(&key);
+ error = suword32(
+ __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
+ umtxq_lock(&key);
+ }
+ }
+ umtxq_unbusy(&key);
+ umtxq_unlock(&key);
+ umtx_key_release(&key);
+ return (error);
+}
+
+int
+sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
+ /* struct umtx *umtx */
+{
+ return do_lock_umtx(td, uap->umtx, td->td_tid, 0);
+}
+
+int
+sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
+ /* struct umtx *umtx */
+{
+ return do_unlock_umtx(td, uap->umtx, td->td_tid);
+}
+
+inline int
+umtx_copyin_timeout(const void *addr, struct timespec *tsp)
+{
+ int error;
+
+ error = copyin(addr, tsp, sizeof(struct timespec));
+ if (error == 0) {
+ if (tsp->tv_sec < 0 ||
+ tsp->tv_nsec >= 1000000000 ||
+ tsp->tv_nsec < 0)
+ error = EINVAL;
+ }
+ return (error);
+}
+
+static inline int
+umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
+{
+ int error;
+
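+ /*
+ * A bare timespec implies a relative CLOCK_REALTIME timeout;
+ * a larger buffer carries a full _umtx_time structure.
+ */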
+ if (size <= sizeof(struct timespec)) {
+ tp->_clockid = CLOCK_REALTIME;
+ tp->_flags = 0;
+ error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
+ } else
+ error = copyin(addr, tp, sizeof(struct _umtx_time));
+ if (error != 0)
+ return (error);
+ if (tp->_timeout.tv_sec < 0 ||
+ tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
+ return (EINVAL);
+ return (0);
+}
+
+static int
+__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct timespec *ts, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ ts = NULL;
+ else {
+ error = umtx_copyin_timeout(uap->uaddr2, &timeout);
+ if (error != 0)
+ return (error);
+ ts = &timeout;
+ }
+ return (do_lock_umtx(td, uap->obj, uap->val, ts));
+}
+
+static int
+__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
+{
+ return (do_unlock_umtx(td, uap->obj, uap->val));
+}
+
+static int
+__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time timeout, *tm_p;
+ int error;
+
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time(
+ uap->uaddr2, (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
+}
+
+static int
+__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time timeout, *tm_p;
+ int error;
+
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time(
+ uap->uaddr2, (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
+}
+
+static int
+__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time(
+ uap->uaddr2, (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
+}
+
+static int
+__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
+{
+ return (kern_umtx_wake(td, uap->obj, uap->val, 0));
+}
+
+#define BATCH_SIZE 128
+static int
+__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
+{
+ int count = uap->val;
+ void *uaddrs[BATCH_SIZE];
+ char **upp = (char **)uap->obj;
+ int tocopy;
+ int error = 0;
+ int i, pos = 0;
+
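+ /*
+ * Copy the user-supplied array of addresses in batches of
+ * BATCH_SIZE and wake all private waiters on each address.
+ */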
+ while (count > 0) {
+ tocopy = count;
+ if (tocopy > BATCH_SIZE)
+ tocopy = BATCH_SIZE;
+ error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
+ if (error != 0)
+ break;
+ for (i = 0; i < tocopy; ++i)
+ kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
+ count -= tocopy;
+ pos += tocopy;
+ }
+ return (error);
+}
+
+static int
+__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
+{
+ return (kern_umtx_wake(td, uap->obj, uap->val, 1));
+}
+
+static int
+__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time(
+ uap->uaddr2, (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_lock_umutex(td, uap->obj, tm_p, 0);
+}
+
+static int
+__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
+}
+
+static int
+__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time(
+ uap->uaddr2, (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
+}
+
+static int
+__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_wake_umutex(td, uap->obj);
+}
+
+static int
+__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_unlock_umutex(td, uap->obj);
+}
+
+static int
+__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
+}
+
+static int
+__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct timespec *ts, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ ts = NULL;
+ else {
+ error = umtx_copyin_timeout(uap->uaddr2, &timeout);
+ if (error != 0)
+ return (error);
+ ts = &timeout;
+ }
+ return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
+}
+
+static int
+__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_cv_signal(td, uap->obj);
+}
+
+static int
+__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_cv_broadcast(td, uap->obj);
+}
+
+static int
+__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL) {
+ error = do_rw_rdlock(td, uap->obj, uap->val, 0);
+ } else {
+ error = umtx_copyin_umtx_time(uap->uaddr2,
+ (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
+ }
+ return (error);
+}
+
+static int
+__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL) {
+ error = do_rw_wrlock(td, uap->obj, 0);
+ } else {
+ error = umtx_copyin_umtx_time(uap->uaddr2,
+ (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+
+ error = do_rw_wrlock(td, uap->obj, &timeout);
+ }
+ return (error);
+}
+
+static int
+__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_rw_unlock(td, uap->obj);
+}
+
+static int
+__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time(
+ uap->uaddr2, (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return (do_sem_wait(td, uap->obj, tm_p));
+}
+
+static int
+__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_sem_wake(td, uap->obj);
+}
+
+static int
+__umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+ return do_wake2_umutex(td, uap->obj, uap->val);
+}
+
+typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
+
+static _umtx_op_func op_table[] = {
+ __umtx_op_lock_umtx, /* UMTX_OP_LOCK */
+ __umtx_op_unlock_umtx, /* UMTX_OP_UNLOCK */
+ __umtx_op_wait, /* UMTX_OP_WAIT */
+ __umtx_op_wake, /* UMTX_OP_WAKE */
+ __umtx_op_trylock_umutex, /* UMTX_OP_MUTEX_TRYLOCK */
+ __umtx_op_lock_umutex, /* UMTX_OP_MUTEX_LOCK */
+ __umtx_op_unlock_umutex, /* UMTX_OP_MUTEX_UNLOCK */
+ __umtx_op_set_ceiling, /* UMTX_OP_SET_CEILING */
+ __umtx_op_cv_wait, /* UMTX_OP_CV_WAIT*/
+ __umtx_op_cv_signal, /* UMTX_OP_CV_SIGNAL */
+ __umtx_op_cv_broadcast, /* UMTX_OP_CV_BROADCAST */
+ __umtx_op_wait_uint, /* UMTX_OP_WAIT_UINT */
+ __umtx_op_rw_rdlock, /* UMTX_OP_RW_RDLOCK */
+ __umtx_op_rw_wrlock, /* UMTX_OP_RW_WRLOCK */
+ __umtx_op_rw_unlock, /* UMTX_OP_RW_UNLOCK */
+ __umtx_op_wait_uint_private, /* UMTX_OP_WAIT_UINT_PRIVATE */
+ __umtx_op_wake_private, /* UMTX_OP_WAKE_PRIVATE */
+ __umtx_op_wait_umutex, /* UMTX_OP_UMUTEX_WAIT */
+ __umtx_op_wake_umutex, /* UMTX_OP_UMUTEX_WAKE */
+ __umtx_op_sem_wait, /* UMTX_OP_SEM_WAIT */
+ __umtx_op_sem_wake, /* UMTX_OP_SEM_WAKE */
+ __umtx_op_nwake_private, /* UMTX_OP_NWAKE_PRIVATE */
+ __umtx_op_wake2_umutex /* UMTX_OP_UMUTEX_WAKE2 */
+};
+
+int
+sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
+{
+ if ((unsigned)uap->op < UMTX_OP_MAX)
+ return (*op_table[uap->op])(td, uap);
+ return (EINVAL);
+}
+
+#ifdef COMPAT_FREEBSD32
+int
+freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
+ /* struct umtx *umtx */
+{
+ return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
+}
+
+int
+freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
+ /* struct umtx *umtx */
+{
+ return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
+}
+
+struct timespec32 {
+ int32_t tv_sec;
+ int32_t tv_nsec;
+};
+
+struct umtx_time32 {
+ struct timespec32 timeout;
+ uint32_t flags;
+ uint32_t clockid;
+};
+
+static inline int
+umtx_copyin_timeout32(void *addr, struct timespec *tsp)
+{
+ struct timespec32 ts32;
+ int error;
+
+ error = copyin(addr, &ts32, sizeof(struct timespec32));
+ if (error == 0) {
+ if (ts32.tv_sec < 0 ||
+ ts32.tv_nsec >= 1000000000 ||
+ ts32.tv_nsec < 0)
+ error = EINVAL;
+ else {
+ tsp->tv_sec = ts32.tv_sec;
+ tsp->tv_nsec = ts32.tv_nsec;
+ }
+ }
+ return (error);
+}
+
+static inline int
+umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
+{
+ struct umtx_time32 t32;
+ int error;
+
+ t32.clockid = CLOCK_REALTIME;
+ t32.flags = 0;
+ if (size <= sizeof(struct timespec32))
+ error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
+ else
+ error = copyin(addr, &t32, sizeof(struct umtx_time32));
+ if (error != 0)
+ return (error);
+ if (t32.timeout.tv_sec < 0 ||
+ t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
+ return (EINVAL);
+ tp->_timeout.tv_sec = t32.timeout.tv_sec;
+ tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
+ tp->_flags = t32.flags;
+ tp->_clockid = t32.clockid;
+ return (0);
+}
+
+static int
+__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct timespec *ts, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ ts = NULL;
+ else {
+ error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
+ if (error != 0)
+ return (error);
+ ts = &timeout;
+ }
+ return (do_lock_umtx32(td, uap->obj, uap->val, ts));
+}
+
+static int
+__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
+}
+
+static int
+__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time32(uap->uaddr2,
+ (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
+}
+
+static int
+__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time(uap->uaddr2,
+ (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_lock_umutex(td, uap->obj, tm_p, 0);
+}
+
+static int
+__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time32(uap->uaddr2,
+ (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
+}
+
+static int
+__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct timespec *ts, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ ts = NULL;
+ else {
+ error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
+ if (error != 0)
+ return (error);
+ ts = &timeout;
+ }
+ return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
+}
+
+static int
+__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL) {
+ error = do_rw_rdlock(td, uap->obj, uap->val, 0);
+ } else {
+ error = umtx_copyin_umtx_time32(uap->uaddr2,
+ (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
+ }
+ return (error);
+}
+
+static int
+__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL) {
+ error = do_rw_wrlock(td, uap->obj, 0);
+ } else {
+ error = umtx_copyin_umtx_time32(uap->uaddr2,
+ (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ error = do_rw_wrlock(td, uap->obj, &timeout);
+ }
+ return (error);
+}
+
+static int
+__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time32(
+ uap->uaddr2, (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
+}
+
+static int
+__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+ struct _umtx_time *tm_p, timeout;
+ int error;
+
+ /* Allow a null timespec (wait forever). */
+ if (uap->uaddr2 == NULL)
+ tm_p = NULL;
+ else {
+ error = umtx_copyin_umtx_time32(uap->uaddr2,
+ (size_t)uap->uaddr1, &timeout);
+ if (error != 0)
+ return (error);
+ tm_p = &timeout;
+ }
+ return (do_sem_wait(td, uap->obj, tm_p));
+}
+
+static int
+__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
+{
+ int count = uap->val;
+ uint32_t uaddrs[BATCH_SIZE];
+ uint32_t **upp = (uint32_t **)uap->obj;
+ int tocopy;
+ int error = 0;
+ int i, pos = 0;
+
+ while (count > 0) {
+ tocopy = count;
+ if (tocopy > BATCH_SIZE)
+ tocopy = BATCH_SIZE;
+ error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
+ if (error != 0)
+ break;
+ for (i = 0; i < tocopy; ++i)
+ kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
+ INT_MAX, 1);
+ count -= tocopy;
+ pos += tocopy;
+ }
+ return (error);
+}
+
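+/*
+ * Dispatch table for the 32-bit compat ABI, indexed by UMTX_OP_*.
+ * Operations that copy in a timespec or an address array get the compat32
+ * wrappers defined above; the remainder reuse the native handlers.
+ */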
+static _umtx_op_func op_table_compat32[] = {
+ __umtx_op_lock_umtx_compat32, /* UMTX_OP_LOCK */
+ __umtx_op_unlock_umtx_compat32, /* UMTX_OP_UNLOCK */
+ __umtx_op_wait_compat32, /* UMTX_OP_WAIT */
+ __umtx_op_wake, /* UMTX_OP_WAKE */
+ __umtx_op_trylock_umutex, /* UMTX_OP_MUTEX_TRYLOCK */
+ __umtx_op_lock_umutex_compat32, /* UMTX_OP_MUTEX_LOCK */
+ __umtx_op_unlock_umutex, /* UMTX_OP_MUTEX_UNLOCK */
+ __umtx_op_set_ceiling, /* UMTX_OP_SET_CEILING */
+ __umtx_op_cv_wait_compat32, /* UMTX_OP_CV_WAIT*/
+ __umtx_op_cv_signal, /* UMTX_OP_CV_SIGNAL */
+ __umtx_op_cv_broadcast, /* UMTX_OP_CV_BROADCAST */
+ __umtx_op_wait_compat32, /* UMTX_OP_WAIT_UINT */
+ __umtx_op_rw_rdlock_compat32, /* UMTX_OP_RW_RDLOCK */
+ __umtx_op_rw_wrlock_compat32, /* UMTX_OP_RW_WRLOCK */
+ __umtx_op_rw_unlock, /* UMTX_OP_RW_UNLOCK */
+ __umtx_op_wait_uint_private_compat32, /* UMTX_OP_WAIT_UINT_PRIVATE */
+ __umtx_op_wake_private, /* UMTX_OP_WAKE_PRIVATE */
+ __umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
+ __umtx_op_wake_umutex, /* UMTX_OP_UMUTEX_WAKE */
+ __umtx_op_sem_wait_compat32, /* UMTX_OP_SEM_WAIT */
+ __umtx_op_sem_wake, /* UMTX_OP_SEM_WAKE */
+ __umtx_op_nwake_private32, /* UMTX_OP_NWAKE_PRIVATE */
+ __umtx_op_wake2_umutex /* UMTX_OP_UMUTEX_WAKE2 */
+};
+
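+/*
+ * Entry point for _umtx_op(2) from 32-bit processes: bounds-check the
+ * opcode and dispatch through the compat32 table.  For the timed
+ * operations, uap->uaddr1 carries the size of the user time structure.
+ */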
+int
+freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
+{
+ if ((unsigned)uap->op < UMTX_OP_MAX)
+ return (*op_table_compat32[uap->op])(td,
+ (struct _umtx_op_args *)uap);
+ return (EINVAL);
+}
+#endif
+
+void
+umtx_thread_init(struct thread *td)
+{
+ td->td_umtxq = umtxq_alloc();
+ td->td_umtxq->uq_thread = td;
+}
+
+void
+umtx_thread_fini(struct thread *td)
+{
+ umtxq_free(td->td_umtxq);
+}
+
+/*
+ * Called when a new thread is created, e.g. via fork().
+ */
+void
+umtx_thread_alloc(struct thread *td)
+{
+ struct umtx_q *uq;
+
+ uq = td->td_umtxq;
+ uq->uq_inherited_pri = PRI_MAX;
+
+ KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
+ KASSERT(uq->uq_thread == td, ("uq_thread != td"));
+ KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
+ KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
+}
+
+/*
+ * exec() hook.
+ */
+static void
+umtx_exec_hook(void *arg __unused, struct proc *p __unused,
+ struct image_params *imgp __unused)
+{
+ umtx_thread_cleanup(curthread);
+}
+
+/*
+ * thread_exit() hook.
+ */
+void
+umtx_thread_exit(struct thread *td)
+{
+ umtx_thread_cleanup(td);
+}
+
+/*
+ * clean up umtx data.
+ */
+static void
+umtx_thread_cleanup(struct thread *td)
+{
+ struct umtx_q *uq;
+ struct umtx_pi *pi;
+
+ if ((uq = td->td_umtxq) == NULL)
+ return;
+
+ mtx_lock_spin(&umtx_lock);
+ uq->uq_inherited_pri = PRI_MAX;
+ while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
+ pi->pi_owner = NULL;
+ TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
+ }
+ mtx_unlock_spin(&umtx_lock);
+ thread_lock(td);
+ sched_lend_user_prio(td, PRI_MAX);
+ thread_unlock(td);
+}
diff --git a/sys/kern/kern_uuid.c b/sys/kern/kern_uuid.c
new file mode 100644
index 0000000..fd4027b
--- /dev/null
+++ b/sys/kern/kern_uuid.c
@@ -0,0 +1,426 @@
+/*-
+ * Copyright (c) 2002 Marcel Moolenaar
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/endian.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sbuf.h>
+#include <sys/socket.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/jail.h>
+#include <sys/uuid.h>
+
+#include <net/if.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/vnet.h>
+
+/*
+ * See also:
+ * http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt
+ * http://www.opengroup.org/onlinepubs/009629399/apdxa.htm
+ *
+ * Note that the generator state is itself a UUID, but the time and clock
+ * sequence fields are written in the native byte order.
+ */
+
+CTASSERT(sizeof(struct uuid) == 16);
+
+/* We use an alternative, more convenient representation in the generator. */
+struct uuid_private {
+ union {
+ uint64_t ll; /* internal. */
+ struct {
+ uint32_t low;
+ uint16_t mid;
+ uint16_t hi;
+ } x;
+ } time;
+ uint16_t seq; /* Big-endian. */
+ uint16_t node[UUID_NODE_LEN>>1];
+};
+
+CTASSERT(sizeof(struct uuid_private) == 16);
+
+struct uuid_macaddr {
+ uint16_t state;
+#define UUID_ETHER_EMPTY 0
+#define UUID_ETHER_RANDOM 1
+#define UUID_ETHER_UNIQUE 2
+ uint16_t node[UUID_NODE_LEN>>1];
+};
+
+static struct uuid_private uuid_last;
+
+#define UUID_NETHER 4
+static struct uuid_macaddr uuid_ether[UUID_NETHER];
+
+static struct mtx uuid_mutex;
+MTX_SYSINIT(uuid_lock, &uuid_mutex, "UUID generator mutex lock", MTX_DEF);
+
+/*
+ * Return the first MAC address added in the array. If it's empty, then
+ * construct a sufficiently random multicast MAC address first. Any
+ * addresses added later will bump the random MAC address up to the next
+ * index.
+ */
+static void
+uuid_node(uint16_t *node)
+{
+ int i;
+
+ if (uuid_ether[0].state == UUID_ETHER_EMPTY) {
+ for (i = 0; i < (UUID_NODE_LEN>>1); i++)
+ uuid_ether[0].node[i] = (uint16_t)arc4random();
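+ /* Mark the randomly generated node id by setting the multicast bit. */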
+ *((uint8_t*)uuid_ether[0].node) |= 0x01;
+ uuid_ether[0].state = UUID_ETHER_RANDOM;
+ }
+ for (i = 0; i < (UUID_NODE_LEN>>1); i++)
+ node[i] = uuid_ether[0].node[i];
+}
+
+/*
+ * Get the current time as a 60 bit count of 100-nanosecond intervals
+ * since 00:00:00.00, October 15, 1582. We apply a magic offset to convert
+ * the Unix time since 00:00:00.00, January 1, 1970 to the date of the
+ * Gregorian reform to the Christian calendar.
+ */
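+/*
+ * Illustrative arithmetic (not part of the original code): the offset used
+ * below, 0x01B21DD213814000, is the number of 100-nanosecond intervals in
+ * the 141427 days between 1582-10-15 and 1970-01-01, i.e.
+ * 141427 * 86400 * 10^7 = 122192928000000000.
+ */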
+static uint64_t
+uuid_time(void)
+{
+ struct bintime bt;
+ uint64_t time = 0x01B21DD213814000LL;
+
+ bintime(&bt);
+ time += (uint64_t)bt.sec * 10000000LL;
+ time += (10000000LL * (uint32_t)(bt.frac >> 32)) >> 32;
+ return (time & ((1LL << 60) - 1LL));
+}
+
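+/*
+ * Generate `count' consecutive version 1 (time-based) UUIDs.  Under the
+ * generator lock the clock sequence is re-randomized when the state is
+ * uninitialized or the node id has changed, and incremented when the clock
+ * has not advanced; the timestamp range [time, time + count) is reserved
+ * so the loop below can fill in per-UUID time fields without the lock.
+ */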
+struct uuid *
+kern_uuidgen(struct uuid *store, size_t count)
+{
+ struct uuid_private uuid;
+ uint64_t time;
+ size_t n;
+
+ mtx_lock(&uuid_mutex);
+
+ uuid_node(uuid.node);
+ time = uuid_time();
+
+ if (uuid_last.time.ll == 0LL || uuid_last.node[0] != uuid.node[0] ||
+ uuid_last.node[1] != uuid.node[1] ||
+ uuid_last.node[2] != uuid.node[2])
+ uuid.seq = (uint16_t)arc4random() & 0x3fff;
+ else if (uuid_last.time.ll >= time)
+ uuid.seq = (uuid_last.seq + 1) & 0x3fff;
+ else
+ uuid.seq = uuid_last.seq;
+
+ uuid_last = uuid;
+ uuid_last.time.ll = (time + count - 1) & ((1LL << 60) - 1LL);
+
+ mtx_unlock(&uuid_mutex);
+
+ /* Set sequence and variant and deal with byte order. */
+ uuid.seq = htobe16(uuid.seq | 0x8000);
+
+ for (n = 0; n < count; n++) {
+ /* Set time and version (=1). */
+ uuid.time.x.low = (uint32_t)time;
+ uuid.time.x.mid = (uint16_t)(time >> 32);
+ uuid.time.x.hi = ((uint16_t)(time >> 48) & 0xfff) | (1 << 12);
+ store[n] = *(struct uuid *)&uuid;
+ time++;
+ }
+
+ return (store);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct uuidgen_args {
+ struct uuid *store;
+ int count;
+};
+#endif
+int
+sys_uuidgen(struct thread *td, struct uuidgen_args *uap)
+{
+ struct uuid *store;
+ size_t count;
+ int error;
+
+ /*
+ * Limit the number of UUIDs that can be created at the same time
+ * to some arbitrary number. This isn't really necessary, but I
+ * like to have some sort of upper-bound that's less than 2G :-)
+ * XXX probably needs to be tunable.
+ */
+ if (uap->count < 1 || uap->count > 2048)
+ return (EINVAL);
+
+ count = uap->count;
+ store = malloc(count * sizeof(struct uuid), M_TEMP, M_WAITOK);
+ kern_uuidgen(store, count);
+ error = copyout(store, uap->store, count * sizeof(struct uuid));
+ free(store, M_TEMP);
+ return (error);
+}
+
+int
+uuid_ether_add(const uint8_t *addr)
+{
+ int i, sum;
+
+ /*
+ * Validate input. No multicast (flag 0x1), no locally administered
+ * (flag 0x2) and no 'all-zeroes' addresses.
+ */
+ if (addr[0] & 0x03)
+ return (EINVAL);
+ sum = 0;
+ for (i = 0; i < UUID_NODE_LEN; i++)
+ sum += addr[i];
+ if (sum == 0)
+ return (EINVAL);
+
+ mtx_lock(&uuid_mutex);
+
+ /* Make sure the MAC isn't known already and that there's space. */
+ i = 0;
+ while (i < UUID_NETHER && uuid_ether[i].state == UUID_ETHER_UNIQUE) {
+ if (!bcmp(addr, uuid_ether[i].node, UUID_NODE_LEN)) {
+ mtx_unlock(&uuid_mutex);
+ return (EEXIST);
+ }
+ i++;
+ }
+ if (i == UUID_NETHER) {
+ mtx_unlock(&uuid_mutex);
+ return (ENOSPC);
+ }
+
+ /* Insert the MAC at index i, pushing the random entry (if any) up a slot. */
+ if (uuid_ether[i].state == UUID_ETHER_RANDOM && i < UUID_NETHER - 1)
+ uuid_ether[i + 1] = uuid_ether[i];
+ uuid_ether[i].state = UUID_ETHER_UNIQUE;
+ bcopy(addr, uuid_ether[i].node, UUID_NODE_LEN);
+ mtx_unlock(&uuid_mutex);
+ return (0);
+}
+
+int
+uuid_ether_del(const uint8_t *addr)
+{
+ int i;
+
+ mtx_lock(&uuid_mutex);
+ i = 0;
+ while (i < UUID_NETHER && uuid_ether[i].state == UUID_ETHER_UNIQUE &&
+ bcmp(addr, uuid_ether[i].node, UUID_NODE_LEN))
+ i++;
+ if (i == UUID_NETHER || uuid_ether[i].state != UUID_ETHER_UNIQUE) {
+ mtx_unlock(&uuid_mutex);
+ return (ENOENT);
+ }
+
+ /* Remove it by shifting higher index entries down. */
+ while (i < UUID_NETHER - 1 && uuid_ether[i].state != UUID_ETHER_EMPTY) {
+ uuid_ether[i] = uuid_ether[i + 1];
+ i++;
+ }
+ if (uuid_ether[i].state != UUID_ETHER_EMPTY) {
+ uuid_ether[i].state = UUID_ETHER_EMPTY;
+ bzero(uuid_ether[i].node, UUID_NODE_LEN);
+ }
+ mtx_unlock(&uuid_mutex);
+ return (0);
+}
+
+int
+snprintf_uuid(char *buf, size_t sz, struct uuid *uuid)
+{
+ struct uuid_private *id;
+ int cnt;
+
+ id = (struct uuid_private *)uuid;
+ cnt = snprintf(buf, sz, "%08x-%04x-%04x-%04x-%04x%04x%04x",
+ id->time.x.low, id->time.x.mid, id->time.x.hi, be16toh(id->seq),
+ be16toh(id->node[0]), be16toh(id->node[1]), be16toh(id->node[2]));
+ return (cnt);
+}
+
+int
+printf_uuid(struct uuid *uuid)
+{
+ char buf[38];
+
+ snprintf_uuid(buf, sizeof(buf), uuid);
+ return (printf("%s", buf));
+}
+
+int
+sbuf_printf_uuid(struct sbuf *sb, struct uuid *uuid)
+{
+ char buf[38];
+
+ snprintf_uuid(buf, sizeof(buf), uuid);
+ return (sbuf_printf(sb, "%s", buf));
+}
+
+/*
+ * Encode/Decode UUID into byte-stream.
+ * http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt
+ *
+ * 0 1 2 3
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | time_low |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | time_mid | time_hi_and_version |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |clk_seq_hi_res | clk_seq_low | node (0-1) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | node (2-5) |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+
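+/*
+ * Worked example (derived from the functions below): for the UUID printed
+ * as "00112233-4455-6677-8899-aabbccddeeff", be_uuid_enc() emits the bytes
+ * 00 11 22 33 44 55 66 77 88 99 aa bb cc dd ee ff, while le_uuid_enc()
+ * emits 33 22 11 00 55 44 77 66 88 99 aa bb cc dd ee ff; only the first
+ * three fields are byte-swapped.
+ */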
+void
+le_uuid_enc(void *buf, struct uuid const *uuid)
+{
+ u_char *p;
+ int i;
+
+ p = buf;
+ le32enc(p, uuid->time_low);
+ le16enc(p + 4, uuid->time_mid);
+ le16enc(p + 6, uuid->time_hi_and_version);
+ p[8] = uuid->clock_seq_hi_and_reserved;
+ p[9] = uuid->clock_seq_low;
+ for (i = 0; i < _UUID_NODE_LEN; i++)
+ p[10 + i] = uuid->node[i];
+}
+
+void
+le_uuid_dec(void const *buf, struct uuid *uuid)
+{
+ u_char const *p;
+ int i;
+
+ p = buf;
+ uuid->time_low = le32dec(p);
+ uuid->time_mid = le16dec(p + 4);
+ uuid->time_hi_and_version = le16dec(p + 6);
+ uuid->clock_seq_hi_and_reserved = p[8];
+ uuid->clock_seq_low = p[9];
+ for (i = 0; i < _UUID_NODE_LEN; i++)
+ uuid->node[i] = p[10 + i];
+}
+
+void
+be_uuid_enc(void *buf, struct uuid const *uuid)
+{
+ u_char *p;
+ int i;
+
+ p = buf;
+ be32enc(p, uuid->time_low);
+ be16enc(p + 4, uuid->time_mid);
+ be16enc(p + 6, uuid->time_hi_and_version);
+ p[8] = uuid->clock_seq_hi_and_reserved;
+ p[9] = uuid->clock_seq_low;
+ for (i = 0; i < _UUID_NODE_LEN; i++)
+ p[10 + i] = uuid->node[i];
+}
+
+void
+be_uuid_dec(void const *buf, struct uuid *uuid)
+{
+ u_char const *p;
+ int i;
+
+ p = buf;
+ uuid->time_low = be32dec(p);
+ uuid->time_mid = be16dec(p + 4);
+ uuid->time_hi_and_version = be16dec(p + 6);
+ uuid->clock_seq_hi_and_reserved = p[8];
+ uuid->clock_seq_low = p[9];
+ for (i = 0; i < _UUID_NODE_LEN; i++)
+ uuid->node[i] = p[10 + i];
+}
+
+int
+parse_uuid(const char *str, struct uuid *uuid)
+{
+ u_int c[11];
+ int n;
+
+ /* An empty string represents a nil UUID. */
+ if (*str == '\0') {
+ bzero(uuid, sizeof(*uuid));
+ return (0);
+ }
+
+ /* The UUID string representation has a fixed length. */
+ if (strlen(str) != 36)
+ return (EINVAL);
+
+ /*
+ * We only work with "new" UUIDs. New UUIDs have the form:
+ * 01234567-89ab-cdef-0123-456789abcdef
+ * The so called "old" UUIDs, which we don't support, have the form:
+ * 0123456789ab.cd.ef.01.23.45.67.89.ab
+ */
+ if (str[8] != '-')
+ return (EINVAL);
+
+ n = sscanf(str, "%8x-%4x-%4x-%2x%2x-%2x%2x%2x%2x%2x%2x", c + 0, c + 1,
+ c + 2, c + 3, c + 4, c + 5, c + 6, c + 7, c + 8, c + 9, c + 10);
+ /* Make sure we have all conversions. */
+ if (n != 11)
+ return (EINVAL);
+
+ /* Successful scan. Build the UUID. */
+ uuid->time_low = c[0];
+ uuid->time_mid = c[1];
+ uuid->time_hi_and_version = c[2];
+ uuid->clock_seq_hi_and_reserved = c[3];
+ uuid->clock_seq_low = c[4];
+ for (n = 0; n < 6; n++)
+ uuid->node[n] = c[n + 5];
+
+ /* Check semantics... */
+ return (((c[3] & 0x80) != 0x00 && /* variant 0? */
+ (c[3] & 0xc0) != 0x80 && /* variant 1? */
+ (c[3] & 0xe0) != 0xc0) ? EINVAL : 0); /* variant 2? */
+}
diff --git a/sys/kern/kern_xxx.c b/sys/kern/kern_xxx.c
new file mode 100644
index 0000000..095e3ff
--- /dev/null
+++ b/sys/kern/kern_xxx.c
@@ -0,0 +1,471 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_xxx.c 8.2 (Berkeley) 11/14/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/utsname.h>
+
+#include <vm/vm_param.h>
+
+#if defined(COMPAT_43)
+
+#ifndef _SYS_SYSPROTO_H_
+struct gethostname_args {
+ char *hostname;
+ u_int len;
+};
+#endif
+/* ARGSUSED */
+int
+ogethostname(td, uap)
+ struct thread *td;
+ struct gethostname_args *uap;
+{
+ int name[2];
+ size_t len = uap->len;
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_HOSTNAME;
+ return (userland_sysctl(td, name, 2, uap->hostname, &len,
+ 1, 0, 0, 0, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct sethostname_args {
+ char *hostname;
+ u_int len;
+};
+#endif
+/* ARGSUSED */
+int
+osethostname(td, uap)
+ struct thread *td;
+ register struct sethostname_args *uap;
+{
+ int name[2];
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_HOSTNAME;
+ return (userland_sysctl(td, name, 2, 0, 0, 0, uap->hostname,
+ uap->len, 0, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ogethostid_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+ogethostid(td, uap)
+ struct thread *td;
+ struct ogethostid_args *uap;
+{
+ size_t len = sizeof(long);
+ int name[2];
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_HOSTID;
+ return (kernel_sysctl(td, name, 2, (long *)td->td_retval, &len,
+ NULL, 0, NULL, 0));
+}
+#endif /* COMPAT_43 */
+
+#ifdef COMPAT_43
+#ifndef _SYS_SYSPROTO_H_
+struct osethostid_args {
+ long hostid;
+};
+#endif
+/* ARGSUSED */
+int
+osethostid(td, uap)
+ struct thread *td;
+ struct osethostid_args *uap;
+{
+ int name[2];
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_HOSTID;
+ return (kernel_sysctl(td, name, 2, NULL, NULL, &uap->hostid,
+ sizeof(uap->hostid), NULL, 0));
+}
+
+int
+oquota(td, uap)
+ struct thread *td;
+ struct oquota_args *uap;
+{
+
+ return (ENOSYS);
+}
+
+#define KINFO_PROC (0<<8)
+#define KINFO_RT (1<<8)
+#define KINFO_VNODE (2<<8)
+#define KINFO_FILE (3<<8)
+#define KINFO_METER (4<<8)
+#define KINFO_LOADAVG (5<<8)
+#define KINFO_CLOCKRATE (6<<8)
+
+/* Non-standard BSDI extension - only present on their 4.3 net-2 releases */
+#define KINFO_BSDI_SYSINFO (101<<8)
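+/*
+ * ogetkerninfo() dispatches on the class encoded in bits 8-15 of `op'
+ * (masked with 0xff00 below); the low byte and, for KINFO_RT, bits 16-23
+ * carry class-specific arguments.
+ */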
+
+/*
+ * XXX this is bloat, but I hope it's better here than on the potentially
+ * limited kernel stack... -Peter
+ */
+
+static struct {
+ int bsdi_machine; /* "i386" on BSD/386 */
+/* ^^^ this is an offset to the string, relative to the struct start */
+ char *pad0;
+ long pad1;
+ long pad2;
+ long pad3;
+ u_long pad4;
+ u_long pad5;
+ u_long pad6;
+
+ int bsdi_ostype; /* "BSD/386" on BSD/386 */
+ int bsdi_osrelease; /* "1.1" on BSD/386 */
+ long pad7;
+ long pad8;
+ char *pad9;
+
+ long pad10;
+ long pad11;
+ int pad12;
+ long pad13;
+ quad_t pad14;
+ long pad15;
+
+ struct timeval pad16;
+ /* we don't set this, because BSDI's uname used gethostname() instead */
+ int bsdi_hostname; /* hostname on BSD/386 */
+
+ /* the actual string data is appended here */
+
+} bsdi_si;
+
+/*
+ * this data is appended to the end of the bsdi_si structure during copyout.
+ * The "char *" offsets are relative to the base of the bsdi_si struct.
+ * This contains "FreeBSD\02.0-BUILT-nnnnnn\0i386\0", and these strings
+ * should not exceed the length of the buffer here... (or else!! :-)
+ */
+static char bsdi_strings[80]; /* It had better be less than this! */
+
+#ifndef _SYS_SYSPROTO_H_
+struct getkerninfo_args {
+ int op;
+ char *where;
+ size_t *size;
+ int arg;
+};
+#endif
+int
+ogetkerninfo(struct thread *td, struct getkerninfo_args *uap)
+{
+ int error, name[6];
+ size_t size;
+ u_int needed = 0;
+
+ switch (uap->op & 0xff00) {
+
+ case KINFO_RT:
+ name[0] = CTL_NET;
+ name[1] = PF_ROUTE;
+ name[2] = 0;
+ name[3] = (uap->op & 0xff0000) >> 16;
+ name[4] = uap->op & 0xff;
+ name[5] = uap->arg;
+ error = userland_sysctl(td, name, 6, uap->where, uap->size,
+ 0, 0, 0, &size, 0);
+ break;
+
+ case KINFO_VNODE:
+ name[0] = CTL_KERN;
+ name[1] = KERN_VNODE;
+ error = userland_sysctl(td, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size, 0);
+ break;
+
+ case KINFO_PROC:
+ name[0] = CTL_KERN;
+ name[1] = KERN_PROC;
+ name[2] = uap->op & 0xff;
+ name[3] = uap->arg;
+ error = userland_sysctl(td, name, 4, uap->where, uap->size,
+ 0, 0, 0, &size, 0);
+ break;
+
+ case KINFO_FILE:
+ name[0] = CTL_KERN;
+ name[1] = KERN_FILE;
+ error = userland_sysctl(td, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size, 0);
+ break;
+
+ case KINFO_METER:
+ name[0] = CTL_VM;
+ name[1] = VM_TOTAL;
+ error = userland_sysctl(td, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size, 0);
+ break;
+
+ case KINFO_LOADAVG:
+ name[0] = CTL_VM;
+ name[1] = VM_LOADAVG;
+ error = userland_sysctl(td, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size, 0);
+ break;
+
+ case KINFO_CLOCKRATE:
+ name[0] = CTL_KERN;
+ name[1] = KERN_CLOCKRATE;
+ error = userland_sysctl(td, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size, 0);
+ break;
+
+ case KINFO_BSDI_SYSINFO: {
+ /*
+ * this is pretty crude, but it's just enough for uname()
+ * from BSDI's 1.x libc to work.
+ *
+ * *size gives the size of the buffer before the call, and
+ * the amount of data copied after a successful call.
+ * If successful, the return value is the amount of data
+ * available, which can be larger than *size.
+ *
+ * BSDI's 2.x product apparently fails with ENOMEM if *size
+ * is too small.
+ */
+
+ u_int left;
+ char *s;
+
+ bzero((char *)&bsdi_si, sizeof(bsdi_si));
+ bzero(bsdi_strings, sizeof(bsdi_strings));
+
+ s = bsdi_strings;
+
+ bsdi_si.bsdi_ostype = (s - bsdi_strings) + sizeof(bsdi_si);
+ strcpy(s, ostype);
+ s += strlen(s) + 1;
+
+ bsdi_si.bsdi_osrelease = (s - bsdi_strings) + sizeof(bsdi_si);
+ strcpy(s, osrelease);
+ s += strlen(s) + 1;
+
+ bsdi_si.bsdi_machine = (s - bsdi_strings) + sizeof(bsdi_si);
+ strcpy(s, machine);
+ s += strlen(s) + 1;
+
+ needed = sizeof(bsdi_si) + (s - bsdi_strings);
+
+ if ((uap->where == NULL) || (uap->size == NULL)) {
+ /* process is asking how much buffer to supply.. */
+ size = needed;
+ error = 0;
+ break;
+ }
+
+ if ((error = copyin(uap->size, &size, sizeof(size))) != 0)
+ break;
+
+ /* if too much buffer supplied, trim it down */
+ if (size > needed)
+ size = needed;
+
+ /* how much of the buffer is remaining */
+ left = size;
+
+ if ((error = copyout((char *)&bsdi_si, uap->where, left)) != 0)
+ break;
+
+ /* is there any point in continuing? */
+ if (left > sizeof(bsdi_si)) {
+ left -= sizeof(bsdi_si);
+ error = copyout(&bsdi_strings,
+ uap->where + sizeof(bsdi_si), left);
+ }
+ break;
+ }
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ if (error == 0) {
+ td->td_retval[0] = needed ? needed : size;
+ if (uap->size) {
+ error = copyout(&size, uap->size, sizeof(size));
+ }
+ }
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+#ifdef COMPAT_FREEBSD4
+/*
+ * This is the FreeBSD-1.1 compatible uname(2) interface. These days it is
+ * done in libc as a wrapper around a bunch of sysctl's. This must maintain
+ * the old 1.1 binary ABI.
+ */
+#if SYS_NMLN != 32
+#error "FreeBSD-1.1 uname syscall has been broken"
+#endif
+#ifndef _SYS_SYSPROTO_H_
+struct uname_args {
+ struct utsname *name;
+};
+#endif
+/* ARGSUSED */
+int
+freebsd4_uname(struct thread *td, struct freebsd4_uname_args *uap)
+{
+ int name[2], error;
+ size_t len;
+ char *s, *us;
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_OSTYPE;
+ len = sizeof (uap->name->sysname);
+ error = userland_sysctl(td, name, 2, uap->name->sysname, &len,
+ 1, 0, 0, 0, 0);
+ if (error)
+ return (error);
+ subyte( uap->name->sysname + sizeof(uap->name->sysname) - 1, 0);
+
+ name[1] = KERN_HOSTNAME;
+ len = sizeof uap->name->nodename;
+ error = userland_sysctl(td, name, 2, uap->name->nodename, &len,
+ 1, 0, 0, 0, 0);
+ if (error)
+ return (error);
+ subyte( uap->name->nodename + sizeof(uap->name->nodename) - 1, 0);
+
+ name[1] = KERN_OSRELEASE;
+ len = sizeof uap->name->release;
+ error = userland_sysctl(td, name, 2, uap->name->release, &len,
+ 1, 0, 0, 0, 0);
+ if (error)
+ return (error);
+ subyte( uap->name->release + sizeof(uap->name->release) - 1, 0);
+
+/*
+ name = KERN_VERSION;
+ len = sizeof uap->name->version;
+ error = userland_sysctl(td, name, 2, uap->name->version, &len,
+ 1, 0, 0, 0, 0);
+ if (error)
+ return (error);
+ subyte( uap->name->version + sizeof(uap->name->version) - 1, 0);
+*/
+
+/*
+ * this stupid hackery to make the version field look like FreeBSD 1.1
+ */
+ for(s = version; *s && *s != '#'; s++);
+
+ for(us = uap->name->version; *s && *s != ':'; s++) {
+ error = subyte( us++, *s);
+ if (error)
+ return (error);
+ }
+ error = subyte( us++, 0);
+ if (error)
+ return (error);
+
+ name[0] = CTL_HW;
+ name[1] = HW_MACHINE;
+ len = sizeof uap->name->machine;
+ error = userland_sysctl(td, name, 2, uap->name->machine, &len,
+ 1, 0, 0, 0, 0);
+ if (error)
+ return (error);
+ subyte( uap->name->machine + sizeof(uap->name->machine) - 1, 0);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getdomainname_args {
+ char *domainname;
+ int len;
+};
+#endif
+/* ARGSUSED */
+int
+freebsd4_getdomainname(struct thread *td,
+ struct freebsd4_getdomainname_args *uap)
+{
+ int name[2];
+ size_t len = uap->len;
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_NISDOMAINNAME;
+ return (userland_sysctl(td, name, 2, uap->domainname, &len,
+ 1, 0, 0, 0, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setdomainname_args {
+ char *domainname;
+ int len;
+};
+#endif
+/* ARGSUSED */
+int
+freebsd4_setdomainname(struct thread *td,
+ struct freebsd4_setdomainname_args *uap)
+{
+ int name[2];
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_NISDOMAINNAME;
+ return (userland_sysctl(td, name, 2, 0, 0, 0, uap->domainname,
+ uap->len, 0, 0));
+}
+#endif /* COMPAT_FREEBSD4 */
diff --git a/sys/kern/ksched.c b/sys/kern/ksched.c
new file mode 100644
index 0000000..efb673e
--- /dev/null
+++ b/sys/kern/ksched.c
@@ -0,0 +1,292 @@
+/*-
+ * Copyright (c) 1996, 1997
+ * HD Associates, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* ksched: Soft real time scheduling based on "rtprio".
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_posix.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+#include <sys/resource.h>
+#include <sys/sched.h>
+
+FEATURE(kposix_priority_scheduling, "POSIX P1003.1B realtime extensions");
+
+/* ksched: Real-time extension to support POSIX priority scheduling.
+ */
+
+struct ksched {
+ struct timespec rr_interval;
+};
+
+int
+ksched_attach(struct ksched **p)
+{
+ struct ksched *ksched = p31b_malloc(sizeof(*ksched));
+
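+ /*
+ * sched_rr_interval() gives the round-robin quantum in scheduler
+ * ticks; 1000000000 / hz is the number of nanoseconds per tick.
+ */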
+ ksched->rr_interval.tv_sec = 0;
+ ksched->rr_interval.tv_nsec = 1000000000L / hz * sched_rr_interval();
+
+ *p = ksched;
+ return 0;
+}
+
+int
+ksched_detach(struct ksched *ks)
+{
+ p31b_free(ks);
+
+ return 0;
+}
+
+/*
+ * XXX About priorities
+ *
+ * POSIX 1003.1b requires that numerically higher priorities be of
+ * higher priority. It also permits sched_setparam to be
+ * implementation defined for SCHED_OTHER. I don't like
+ * the notion of inverted priorities for normal processes when
+ * you can use "setpriority" for that.
+ *
+ */
+
+/* Macros to convert between the Unix convention (numerically lower is
+ * higher priority) and POSIX 1003.1b (numerically higher is higher
+ * priority).
+ */
+
+#define p4prio_to_rtpprio(P) (RTP_PRIO_MAX - (P))
+#define rtpprio_to_p4prio(P) (RTP_PRIO_MAX - (P))
+
+#define p4prio_to_tsprio(P) ((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) - (P))
+#define tsprio_to_p4prio(P) ((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) - (P))
+
+/* These improve readability a bit for me:
+ */
+#define P1B_PRIO_MIN rtpprio_to_p4prio(RTP_PRIO_MAX)
+#define P1B_PRIO_MAX rtpprio_to_p4prio(RTP_PRIO_MIN)
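+
+/*
+ * Worked example (illustrative, assuming RTP_PRIO_MIN == 0 and
+ * RTP_PRIO_MAX == 31 as in <sys/rtprio.h>): P1B_PRIO_MIN is 0 and
+ * P1B_PRIO_MAX is 31, and a POSIX priority P maps to rtprio 31 - P, so the
+ * strongest POSIX priority (31) becomes the numerically lowest, i.e.
+ * strongest, rtprio (0).
+ */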
+
+static __inline int
+getscheduler(struct ksched *ksched, struct thread *td, int *policy)
+{
+ struct rtprio rtp;
+ int e = 0;
+
+ pri_to_rtp(td, &rtp);
+ switch (rtp.type)
+ {
+ case RTP_PRIO_FIFO:
+ *policy = SCHED_FIFO;
+ break;
+
+ case RTP_PRIO_REALTIME:
+ *policy = SCHED_RR;
+ break;
+
+ default:
+ *policy = SCHED_OTHER;
+ break;
+ }
+
+ return e;
+}
+
+int
+ksched_setparam(struct ksched *ksched,
+ struct thread *td, const struct sched_param *param)
+{
+ int policy;
+ int e;
+
+ e = getscheduler(ksched, td, &policy);
+
+ if (e == 0)
+ {
+ e = ksched_setscheduler(ksched, td, policy, param);
+ }
+
+ return e;
+}
+
+int
+ksched_getparam(struct ksched *ksched,
+ struct thread *td, struct sched_param *param)
+{
+ struct rtprio rtp;
+
+ pri_to_rtp(td, &rtp);
+ if (RTP_PRIO_IS_REALTIME(rtp.type))
+ param->sched_priority = rtpprio_to_p4prio(rtp.prio);
+ else {
+ if (PRI_MIN_TIMESHARE < rtp.prio)
+ /*
+ * The interactive score has pushed it to the realtime
+ * minimum, so we must show the maximum (64, most likely).
+ */
+ param->sched_priority = (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE);
+ else
+ param->sched_priority = tsprio_to_p4prio(rtp.prio);
+ }
+ return 0;
+}
+
+/*
+ * XXX The priority and scheduler modifications should
+ * be moved into published interfaces in kern/kern_sync.
+ *
+ * The permissions to modify process p were checked in "p31b_proc()".
+ *
+ */
+int
+ksched_setscheduler(struct ksched *ksched,
+ struct thread *td, int policy, const struct sched_param *param)
+{
+ int e = 0;
+ struct rtprio rtp;
+
+ switch(policy)
+ {
+ case SCHED_RR:
+ case SCHED_FIFO:
+
+ if (param->sched_priority >= P1B_PRIO_MIN &&
+ param->sched_priority <= P1B_PRIO_MAX)
+ {
+ rtp.prio = p4prio_to_rtpprio(param->sched_priority);
+ rtp.type = (policy == SCHED_FIFO)
+ ? RTP_PRIO_FIFO : RTP_PRIO_REALTIME;
+
+ rtp_to_pri(&rtp, td);
+ }
+ else
+ e = EPERM;
+
+
+ break;
+
+ case SCHED_OTHER:
+ if (param->sched_priority >= 0 &&
+ param->sched_priority <= (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE)) {
+ rtp.type = RTP_PRIO_NORMAL;
+ rtp.prio = p4prio_to_tsprio(param->sched_priority);
+ rtp_to_pri(&rtp, td);
+ } else
+ e = EINVAL;
+
+ break;
+
+ default:
+ e = EINVAL;
+ break;
+ }
+
+ return e;
+}
+
+int
+ksched_getscheduler(struct ksched *ksched, struct thread *td, int *policy)
+{
+ return getscheduler(ksched, td, policy);
+}
+
+/* ksched_yield: Yield the CPU.
+ */
+int
+ksched_yield(struct ksched *ksched)
+{
+ sched_relinquish(curthread);
+ return 0;
+}
+
+int
+ksched_get_priority_max(struct ksched *ksched, int policy, int *prio)
+{
+ int e = 0;
+
+ switch (policy)
+ {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ *prio = RTP_PRIO_MAX;
+ break;
+
+ case SCHED_OTHER:
+ *prio = PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
+ break;
+
+ default:
+ e = EINVAL;
+ }
+
+ return e;
+}
+
+int
+ksched_get_priority_min(struct ksched *ksched, int policy, int *prio)
+{
+ int e = 0;
+
+ switch (policy)
+ {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ *prio = P1B_PRIO_MIN;
+ break;
+
+ case SCHED_OTHER:
+ *prio = 0;
+ break;
+
+ default:
+ e = EINVAL;
+ }
+
+ return e;
+}
+
+int
+ksched_rr_get_interval(struct ksched *ksched,
+ struct thread *td, struct timespec *timespec)
+{
+ *timespec = ksched->rr_interval;
+
+ return 0;
+}
diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c
new file mode 100644
index 0000000..6252a8d
--- /dev/null
+++ b/sys/kern/link_elf.c
@@ -0,0 +1,1605 @@
+/*-
+ * Copyright (c) 1998-2000 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_gdb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#ifdef GPROF
+#include <sys/gmon.h>
+#endif
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/mount.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/namei.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#include <sys/linker.h>
+
+#include <machine/elf.h>
+
+#include <net/vnet.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#ifdef SPARSE_MAPPING
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#endif
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+#include <sys/link_elf.h>
+
+#ifdef DDB_CTF
+#include <net/zlib.h>
+#endif
+
+#include "linker_if.h"
+
+#define MAXSEGS 4
+
+typedef struct elf_file {
+ struct linker_file lf; /* Common fields */
+ int preloaded; /* Was file pre-loaded */
+ caddr_t address; /* Relocation address */
+#ifdef SPARSE_MAPPING
+ vm_object_t object; /* VM object to hold file pages */
+#endif
+ Elf_Dyn *dynamic; /* Symbol table etc. */
+ Elf_Hashelt nbuckets; /* DT_HASH info */
+ Elf_Hashelt nchains;
+ const Elf_Hashelt *buckets;
+ const Elf_Hashelt *chains;
+ caddr_t hash;
+ caddr_t strtab; /* DT_STRTAB */
+ int strsz; /* DT_STRSZ */
+ const Elf_Sym *symtab; /* DT_SYMTAB */
+ Elf_Addr *got; /* DT_PLTGOT */
+ const Elf_Rel *pltrel; /* DT_JMPREL */
+ int pltrelsize; /* DT_PLTRELSZ */
+ const Elf_Rela *pltrela; /* DT_JMPREL */
+ int pltrelasize; /* DT_PLTRELSZ */
+ const Elf_Rel *rel; /* DT_REL */
+ int relsize; /* DT_RELSZ */
+ const Elf_Rela *rela; /* DT_RELA */
+ int relasize; /* DT_RELASZ */
+ caddr_t modptr;
+ const Elf_Sym *ddbsymtab; /* The symbol table we are using */
+ long ddbsymcnt; /* Number of symbols */
+ caddr_t ddbstrtab; /* String table */
+ long ddbstrcnt; /* number of bytes in string table */
+ caddr_t symbase; /* malloc'ed symbol base */
+ caddr_t strbase; /* malloc'ed string base */
+ caddr_t ctftab; /* CTF table */
+ long ctfcnt; /* number of bytes in CTF table */
+ caddr_t ctfoff; /* CTF offset table */
+ caddr_t typoff; /* Type offset table */
+ long typlen; /* Number of type entries. */
+ Elf_Addr pcpu_start; /* Pre-relocation pcpu set start. */
+ Elf_Addr pcpu_stop; /* Pre-relocation pcpu set stop. */
+ Elf_Addr pcpu_base; /* Relocated pcpu set address. */
+#ifdef VIMAGE
+ Elf_Addr vnet_start; /* Pre-relocation vnet set start. */
+ Elf_Addr vnet_stop; /* Pre-relocation vnet set stop. */
+ Elf_Addr vnet_base; /* Relocated vnet set address. */
+#endif
+#ifdef GDB
+ struct link_map gdb; /* hooks for gdb */
+#endif
+} *elf_file_t;
+
+struct elf_set {
+ Elf_Addr es_start;
+ Elf_Addr es_stop;
+ Elf_Addr es_base;
+ TAILQ_ENTRY(elf_set) es_link;
+};
+
+TAILQ_HEAD(elf_set_head, elf_set);
+
+#include <kern/kern_ctf.c>
+
+static int link_elf_link_common_finish(linker_file_t);
+static int link_elf_link_preload(linker_class_t cls,
+ const char *, linker_file_t *);
+static int link_elf_link_preload_finish(linker_file_t);
+static int link_elf_load_file(linker_class_t, const char *,
+ linker_file_t *);
+static int link_elf_lookup_symbol(linker_file_t, const char *,
+ c_linker_sym_t *);
+static int link_elf_symbol_values(linker_file_t, c_linker_sym_t,
+ linker_symval_t *);
+static int link_elf_search_symbol(linker_file_t, caddr_t,
+ c_linker_sym_t *, long *);
+
+static void link_elf_unload_file(linker_file_t);
+static void link_elf_unload_preload(linker_file_t);
+static int link_elf_lookup_set(linker_file_t, const char *,
+ void ***, void ***, int *);
+static int link_elf_each_function_name(linker_file_t,
+ int (*)(const char *, void *), void *);
+static int link_elf_each_function_nameval(linker_file_t,
+ linker_function_nameval_callback_t, void *);
+static void link_elf_reloc_local(linker_file_t);
+static long link_elf_symtab_get(linker_file_t, const Elf_Sym **);
+static long link_elf_strtab_get(linker_file_t, caddr_t *);
+static Elf_Addr elf_lookup(linker_file_t, Elf_Size, int);
+
+static kobj_method_t link_elf_methods[] = {
+ KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol),
+ KOBJMETHOD(linker_symbol_values, link_elf_symbol_values),
+ KOBJMETHOD(linker_search_symbol, link_elf_search_symbol),
+ KOBJMETHOD(linker_unload, link_elf_unload_file),
+ KOBJMETHOD(linker_load_file, link_elf_load_file),
+ KOBJMETHOD(linker_link_preload, link_elf_link_preload),
+ KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish),
+ KOBJMETHOD(linker_lookup_set, link_elf_lookup_set),
+ KOBJMETHOD(linker_each_function_name, link_elf_each_function_name),
+ KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval),
+ KOBJMETHOD(linker_ctf_get, link_elf_ctf_get),
+ KOBJMETHOD(linker_symtab_get, link_elf_symtab_get),
+ KOBJMETHOD(linker_strtab_get, link_elf_strtab_get),
+ { 0, 0 }
+};
+
+static struct linker_class link_elf_class = {
+#if ELF_TARG_CLASS == ELFCLASS32
+ "elf32",
+#else
+ "elf64",
+#endif
+ link_elf_methods, sizeof(struct elf_file)
+};
+
+static int parse_dynamic(elf_file_t);
+static int relocate_file(elf_file_t);
+static int link_elf_preload_parse_symbols(elf_file_t);
+
+static struct elf_set_head set_pcpu_list;
+#ifdef VIMAGE
+static struct elf_set_head set_vnet_list;
+#endif
+
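+/*
+ * Track the pre-relocation address range and the relocated base of each
+ * loaded file's pcpu (and, with VIMAGE, vnet) linker set.  The lists are
+ * kept sorted by start address so the set covering a given address can be
+ * found with a single ordered walk.
+ */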
+static void
+elf_set_add(struct elf_set_head *list, Elf_Addr start, Elf_Addr stop, Elf_Addr base)
+{
+ struct elf_set *set, *iter;
+
+ set = malloc(sizeof(*set), M_LINKER, M_WAITOK);
+ set->es_start = start;
+ set->es_stop = stop;
+ set->es_base = base;
+
+ TAILQ_FOREACH(iter, list, es_link) {
+
+ KASSERT((set->es_start < iter->es_start && set->es_stop < iter->es_stop) ||
+ (set->es_start > iter->es_start && set->es_stop > iter->es_stop),
+ ("linker sets intersection: to insert: 0x%jx-0x%jx; inserted: 0x%jx-0x%jx",
+ (uintmax_t)set->es_start, (uintmax_t)set->es_stop,
+ (uintmax_t)iter->es_start, (uintmax_t)iter->es_stop));
+
+ if (iter->es_start > set->es_start) {
+ TAILQ_INSERT_BEFORE(iter, set, es_link);
+ break;
+ }
+ }
+
+ if (iter == NULL)
+ TAILQ_INSERT_TAIL(list, set, es_link);
+}
+
+static int
+elf_set_find(struct elf_set_head *list, Elf_Addr addr, Elf_Addr *start, Elf_Addr *base)
+{
+ struct elf_set *set;
+
+ TAILQ_FOREACH(set, list, es_link) {
+ if (addr < set->es_start)
+ return (0);
+ if (addr < set->es_stop) {
+ *start = set->es_start;
+ *base = set->es_base;
+ return (1);
+ }
+ }
+
+ return (0);
+}
+
+static void
+elf_set_delete(struct elf_set_head *list, Elf_Addr start)
+{
+ struct elf_set *set;
+
+ TAILQ_FOREACH(set, list, es_link) {
+ if (start < set->es_start)
+ break;
+ if (start == set->es_start) {
+ TAILQ_REMOVE(list, set, es_link);
+ free(set, M_LINKER);
+ return;
+ }
+ }
+ KASSERT(0, ("deleting unknown linker set (start = 0x%jx)",
+ (uintmax_t)start));
+}
+
+#ifdef GDB
+static void r_debug_state(struct r_debug *, struct link_map *);
+
+/*
+ * A list of loaded modules for GDB to use for loading symbols.
+ */
+struct r_debug r_debug;
+
+#define GDB_STATE(s) do { \
+ r_debug.r_state = s; r_debug_state(NULL, NULL); \
+} while (0)
+
+/*
+ * Function for the debugger to set a breakpoint on to gain control.
+ */
+static void
+r_debug_state(struct r_debug *dummy_one __unused,
+ struct link_map *dummy_two __unused)
+{
+}
+
+static void
+link_elf_add_gdb(struct link_map *l)
+{
+ struct link_map *prev;
+
+ l->l_next = NULL;
+
+ if (r_debug.r_map == NULL) {
+ /* Add first. */
+ l->l_prev = NULL;
+ r_debug.r_map = l;
+ } else {
+ /* Append to list. */
+ for (prev = r_debug.r_map;
+ prev->l_next != NULL;
+ prev = prev->l_next)
+ ;
+ l->l_prev = prev;
+ prev->l_next = l;
+ }
+}
+
+static void
+link_elf_delete_gdb(struct link_map *l)
+{
+ if (l->l_prev == NULL) {
+ /* Remove first. */
+ if ((r_debug.r_map = l->l_next) != NULL)
+ l->l_next->l_prev = NULL;
+ } else {
+ /* Remove any but first. */
+ if ((l->l_prev->l_next = l->l_next) != NULL)
+ l->l_next->l_prev = l->l_prev;
+ }
+}
+#endif /* GDB */
+
+#ifdef __ia64__
+Elf_Addr link_elf_get_gp(linker_file_t);
+#endif
+
+/*
+ * The kernel symbol table starts here.
+ */
+extern struct _dynamic _DYNAMIC;
+
+static void
+link_elf_error(const char *filename, const char *s)
+{
+ if (filename == NULL)
+ printf("kldload: %s\n", s);
+ else
+ printf("kldload: %s: %s\n", filename, s);
+}
+
+/*
+ * Actions performed after linking/loading both the preloaded kernel and any
+ * modules, whether preloaded or dynamically loaded.
+ */
+static int
+link_elf_link_common_finish(linker_file_t lf)
+{
+#ifdef GDB
+ elf_file_t ef = (elf_file_t)lf;
+ char *newfilename;
+#endif
+ int error;
+
+ /* Notify MD code that a module is being loaded. */
+ error = elf_cpu_load_file(lf);
+ if (error != 0)
+ return (error);
+
+#ifdef GDB
+ GDB_STATE(RT_ADD);
+ ef->gdb.l_addr = lf->address;
+ newfilename = malloc(strlen(lf->filename) + 1, M_LINKER, M_WAITOK);
+ strcpy(newfilename, lf->filename);
+ ef->gdb.l_name = newfilename;
+ ef->gdb.l_ld = ef->dynamic;
+ link_elf_add_gdb(&ef->gdb);
+ GDB_STATE(RT_CONSISTENT);
+#endif
+
+ return (0);
+}
+
+static void
+link_elf_init(void* arg)
+{
+ Elf_Dyn *dp;
+ caddr_t modptr, baseptr, sizeptr;
+ elf_file_t ef;
+ char *modname;
+
+ linker_add_class(&link_elf_class);
+
+ dp = (Elf_Dyn *)&_DYNAMIC;
+ modname = NULL;
+ modptr = preload_search_by_type("elf" __XSTRING(__ELF_WORD_SIZE) " kernel");
+ if (modptr == NULL)
+ modptr = preload_search_by_type("elf kernel");
+ if (modptr != NULL)
+ modname = (char *)preload_search_info(modptr, MODINFO_NAME);
+ if (modname == NULL)
+ modname = "kernel";
+ linker_kernel_file = linker_make_file(modname, &link_elf_class);
+ if (linker_kernel_file == NULL)
+ panic("%s: Can't create linker structures for kernel",
+ __func__);
+
+ ef = (elf_file_t) linker_kernel_file;
+ ef->preloaded = 1;
+ ef->address = 0;
+#ifdef SPARSE_MAPPING
+ ef->object = 0;
+#endif
+ ef->dynamic = dp;
+
+ if (dp != NULL)
+ parse_dynamic(ef);
+ linker_kernel_file->address = (caddr_t) KERNBASE;
+ linker_kernel_file->size = -(intptr_t)linker_kernel_file->address;
+
+ if (modptr != NULL) {
+ ef->modptr = modptr;
+ baseptr = preload_search_info(modptr, MODINFO_ADDR);
+ if (baseptr != NULL)
+ linker_kernel_file->address = *(caddr_t *)baseptr;
+ sizeptr = preload_search_info(modptr, MODINFO_SIZE);
+ if (sizeptr != NULL)
+ linker_kernel_file->size = *(size_t *)sizeptr;
+ }
+ (void)link_elf_preload_parse_symbols(ef);
+
+#ifdef GDB
+ r_debug.r_map = NULL;
+ r_debug.r_brk = r_debug_state;
+ r_debug.r_state = RT_CONSISTENT;
+#endif
+
+ (void)link_elf_link_common_finish(linker_kernel_file);
+ linker_kernel_file->flags |= LINKER_FILE_LINKED;
+ TAILQ_INIT(&set_pcpu_list);
+#ifdef VIMAGE
+ TAILQ_INIT(&set_vnet_list);
+#endif
+}
+
+SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_THIRD, link_elf_init, 0);
+
+static int
+link_elf_preload_parse_symbols(elf_file_t ef)
+{
+ caddr_t pointer;
+ caddr_t ssym, esym, base;
+ caddr_t strtab;
+ int strcnt;
+ Elf_Sym *symtab;
+ int symcnt;
+
+ if (ef->modptr == NULL)
+ return (0);
+ pointer = preload_search_info(ef->modptr,
+ MODINFO_METADATA | MODINFOMD_SSYM);
+ if (pointer == NULL)
+ return (0);
+ ssym = *(caddr_t *)pointer;
+ pointer = preload_search_info(ef->modptr,
+ MODINFO_METADATA | MODINFOMD_ESYM);
+ if (pointer == NULL)
+ return (0);
+ esym = *(caddr_t *)pointer;
+
+ base = ssym;
+
+ symcnt = *(long *)base;
+ base += sizeof(long);
+ symtab = (Elf_Sym *)base;
+ base += roundup(symcnt, sizeof(long));
+
+ if (base > esym || base < ssym) {
+ printf("Symbols are corrupt!\n");
+ return (EINVAL);
+ }
+
+ strcnt = *(long *)base;
+ base += sizeof(long);
+ strtab = base;
+ base += roundup(strcnt, sizeof(long));
+
+ if (base > esym || base < ssym) {
+ printf("Symbols are corrupt!\n");
+ return (EINVAL);
+ }
+
+ ef->ddbsymtab = symtab;
+ ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
+ ef->ddbstrtab = strtab;
+ ef->ddbstrcnt = strcnt;
+
+ return (0);
+}
+
+static int
+parse_dynamic(elf_file_t ef)
+{
+ Elf_Dyn *dp;
+ int plttype = DT_REL;
+
+ for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) {
+ switch (dp->d_tag) {
+ case DT_HASH:
+ {
+ /* From src/libexec/rtld-elf/rtld.c */
+ const Elf_Hashelt *hashtab = (const Elf_Hashelt *)
+ (ef->address + dp->d_un.d_ptr);
+ ef->nbuckets = hashtab[0];
+ ef->nchains = hashtab[1];
+ ef->buckets = hashtab + 2;
+ ef->chains = ef->buckets + ef->nbuckets;
+ break;
+ }
+ case DT_STRTAB:
+ ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_STRSZ:
+ ef->strsz = dp->d_un.d_val;
+ break;
+ case DT_SYMTAB:
+ ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_SYMENT:
+ if (dp->d_un.d_val != sizeof(Elf_Sym))
+ return (ENOEXEC);
+ break;
+ case DT_PLTGOT:
+ ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_REL:
+ ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_RELSZ:
+ ef->relsize = dp->d_un.d_val;
+ break;
+ case DT_RELENT:
+ if (dp->d_un.d_val != sizeof(Elf_Rel))
+ return (ENOEXEC);
+ break;
+ case DT_JMPREL:
+ ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_PLTRELSZ:
+ ef->pltrelsize = dp->d_un.d_val;
+ break;
+ case DT_RELA:
+ ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr);
+ break;
+ case DT_RELASZ:
+ ef->relasize = dp->d_un.d_val;
+ break;
+ case DT_RELAENT:
+ if (dp->d_un.d_val != sizeof(Elf_Rela))
+ return (ENOEXEC);
+ break;
+ case DT_PLTREL:
+ plttype = dp->d_un.d_val;
+ if (plttype != DT_REL && plttype != DT_RELA)
+ return (ENOEXEC);
+ break;
+#ifdef GDB
+ case DT_DEBUG:
+ dp->d_un.d_ptr = (Elf_Addr)&r_debug;
+ break;
+#endif
+ }
+ }
+
+ if (plttype == DT_RELA) {
+ ef->pltrela = (const Elf_Rela *)ef->pltrel;
+ ef->pltrel = NULL;
+ ef->pltrelasize = ef->pltrelsize;
+ ef->pltrelsize = 0;
+ }
+
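+ /*
+ * For DT_HASH the number of chain entries equals the number of
+ * dynamic symbols, so nchains doubles as the symbol count for the
+ * ddb symbol table defaults below.
+ */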
+ ef->ddbsymtab = ef->symtab;
+ ef->ddbsymcnt = ef->nchains;
+ ef->ddbstrtab = ef->strtab;
+ ef->ddbstrcnt = ef->strsz;
+
+ return (0);
+}
+
+static int
+parse_dpcpu(elf_file_t ef)
+{
+ int count;
+ int error;
+
+ ef->pcpu_start = 0;
+ ef->pcpu_stop = 0;
+ error = link_elf_lookup_set(&ef->lf, "pcpu", (void ***)&ef->pcpu_start,
+ (void ***)&ef->pcpu_stop, &count);
+ /* Error just means there is no pcpu set to relocate. */
+ if (error != 0)
+ return (0);
+ count *= sizeof(void *);
+ /*
+ * Allocate space in the primary pcpu area. Copy in our
+ * initialization from the data section and then initialize
+ * all per-cpu storage from that.
+ */
+ ef->pcpu_base = (Elf_Addr)(uintptr_t)dpcpu_alloc(count);
+ if (ef->pcpu_base == 0)
+ return (ENOSPC);
+ memcpy((void *)ef->pcpu_base, (void *)ef->pcpu_start, count);
+ dpcpu_copy((void *)ef->pcpu_base, count);
+ elf_set_add(&set_pcpu_list, ef->pcpu_start, ef->pcpu_stop,
+ ef->pcpu_base);
+
+ return (0);
+}
+
+#ifdef VIMAGE
+static int
+parse_vnet(elf_file_t ef)
+{
+ int count;
+ int error;
+
+ ef->vnet_start = 0;
+ ef->vnet_stop = 0;
+ error = link_elf_lookup_set(&ef->lf, "vnet", (void ***)&ef->vnet_start,
+ (void ***)&ef->vnet_stop, &count);
+ /* Error just means there is no vnet data set to relocate. */
+ if (error != 0)
+ return (0);
+ count *= sizeof(void *);
+ /*
+ * Allocate space in the primary vnet area. Copy in our
+ * initialization from the data section and then initialize
+ * all per-vnet storage from that.
+ */
+ ef->vnet_base = (Elf_Addr)(uintptr_t)vnet_data_alloc(count);
+ if (ef->vnet_base == 0)
+ return (ENOSPC);
+ memcpy((void *)ef->vnet_base, (void *)ef->vnet_start, count);
+ vnet_data_copy((void *)ef->vnet_base, count);
+ elf_set_add(&set_vnet_list, ef->vnet_start, ef->vnet_stop,
+ ef->vnet_base);
+
+ return (0);
+}
+#endif
+
+static int
+link_elf_link_preload(linker_class_t cls,
+ const char* filename, linker_file_t *result)
+{
+ caddr_t modptr, baseptr, sizeptr, dynptr;
+ char *type;
+ elf_file_t ef;
+ linker_file_t lf;
+ int error;
+ vm_offset_t dp;
+
+ /* Look to see if we have the file preloaded */
+ modptr = preload_search_by_name(filename);
+ if (modptr == NULL)
+ return (ENOENT);
+
+ type = (char *)preload_search_info(modptr, MODINFO_TYPE);
+ baseptr = preload_search_info(modptr, MODINFO_ADDR);
+ sizeptr = preload_search_info(modptr, MODINFO_SIZE);
+ dynptr = preload_search_info(modptr,
+ MODINFO_METADATA | MODINFOMD_DYNAMIC);
+ if (type == NULL ||
+ (strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE) " module") != 0 &&
+ strcmp(type, "elf module") != 0))
+ return (EFTYPE);
+ if (baseptr == NULL || sizeptr == NULL || dynptr == NULL)
+ return (EINVAL);
+
+ lf = linker_make_file(filename, &link_elf_class);
+ if (lf == NULL)
+ return (ENOMEM);
+
+ ef = (elf_file_t) lf;
+ ef->preloaded = 1;
+ ef->modptr = modptr;
+ ef->address = *(caddr_t *)baseptr;
+#ifdef SPARSE_MAPPING
+ ef->object = 0;
+#endif
+ dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr;
+ ef->dynamic = (Elf_Dyn *)dp;
+ lf->address = ef->address;
+ lf->size = *(size_t *)sizeptr;
+
+ error = parse_dynamic(ef);
+ if (error == 0)
+ error = parse_dpcpu(ef);
+#ifdef VIMAGE
+ if (error == 0)
+ error = parse_vnet(ef);
+#endif
+ if (error != 0) {
+ linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ return (error);
+ }
+ link_elf_reloc_local(lf);
+ *result = lf;
+ return (0);
+}
+
+static int
+link_elf_link_preload_finish(linker_file_t lf)
+{
+ elf_file_t ef;
+ int error;
+
+ ef = (elf_file_t) lf;
+ error = relocate_file(ef);
+ if (error != 0)
+ return (error);
+ (void)link_elf_preload_parse_symbols(ef);
+
+ return (link_elf_link_common_finish(lf));
+}
+
+static int
+link_elf_load_file(linker_class_t cls, const char* filename,
+ linker_file_t* result)
+{
+ struct nameidata nd;
+ struct thread* td = curthread; /* XXX */
+ Elf_Ehdr *hdr;
+ caddr_t firstpage;
+ int nbytes, i;
+ Elf_Phdr *phdr;
+ Elf_Phdr *phlimit;
+ Elf_Phdr *segs[MAXSEGS];
+ int nsegs;
+ Elf_Phdr *phdyn;
+ Elf_Phdr *phphdr;
+ caddr_t mapbase;
+ size_t mapsize;
+ Elf_Off base_offset;
+ Elf_Addr base_vaddr;
+ Elf_Addr base_vlimit;
+ int error = 0;
+ ssize_t resid;
+ int flags;
+ elf_file_t ef;
+ linker_file_t lf;
+ Elf_Shdr *shdr;
+ int symtabindex;
+ int symstrindex;
+ int symcnt;
+ int strcnt;
+
+ shdr = NULL;
+ lf = NULL;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp->v_type != VREG) {
+ error = ENOEXEC;
+ firstpage = NULL;
+ goto out;
+ }
+#ifdef MAC
+ error = mac_kld_check_load(curthread->td_ucred, nd.ni_vp);
+ if (error != 0) {
+ firstpage = NULL;
+ goto out;
+ }
+#endif
+
+ /*
+ * Read the elf header from the file.
+ */
+ firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK);
+ hdr = (Elf_Ehdr *)firstpage;
+ error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ nbytes = PAGE_SIZE - resid;
+ if (error != 0)
+ goto out;
+
+ if (!IS_ELF(*hdr)) {
+ error = ENOEXEC;
+ goto out;
+ }
+
+ if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
+ hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
+ link_elf_error(filename, "Unsupported file layout");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_ident[EI_VERSION] != EV_CURRENT ||
+ hdr->e_version != EV_CURRENT) {
+ link_elf_error(filename, "Unsupported file version");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) {
+ error = ENOSYS;
+ goto out;
+ }
+ if (hdr->e_machine != ELF_TARG_MACH) {
+ link_elf_error(filename, "Unsupported machine");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /*
+ * We rely on the program header being in the first page.
+ * This is not strictly required by the ABI specification, but
+ * it seems to always be true in practice. And it simplifies
+ * things considerably.
+ */
+ if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) &&
+ (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) &&
+ (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes)))
+ link_elf_error(filename, "Unreadable program headers");
+
+ /*
+ * Scan the program header entries, and save key information.
+ *
+ * We rely on there being exactly two load segments, text and data,
+ * in that order.
+ */
+ phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff);
+ phlimit = phdr + hdr->e_phnum;
+ nsegs = 0;
+ phdyn = NULL;
+ phphdr = NULL;
+ while (phdr < phlimit) {
+ switch (phdr->p_type) {
+ case PT_LOAD:
+ if (nsegs == MAXSEGS) {
+ link_elf_error(filename, "Too many sections");
+ error = ENOEXEC;
+ goto out;
+ }
+ /*
+ * XXX: We just trust that they come in the right order??
+ */
+ segs[nsegs] = phdr;
+ ++nsegs;
+ break;
+
+ case PT_PHDR:
+ phphdr = phdr;
+ break;
+
+ case PT_DYNAMIC:
+ phdyn = phdr;
+ break;
+
+ case PT_INTERP:
+ error = ENOSYS;
+ goto out;
+ }
+
+ ++phdr;
+ }
+ if (phdyn == NULL) {
+ link_elf_error(filename, "Object is not dynamically-linked");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (nsegs == 0) {
+ link_elf_error(filename, "No sections");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /*
+ * Allocate the entire address space of the object, to stake
+ * out our contiguous region, and to establish the base
+ * address for relocation.
+ */
+ base_offset = trunc_page(segs[0]->p_offset);
+ base_vaddr = trunc_page(segs[0]->p_vaddr);
+ base_vlimit = round_page(segs[nsegs - 1]->p_vaddr +
+ segs[nsegs - 1]->p_memsz);
+ mapsize = base_vlimit - base_vaddr;
+
+ lf = linker_make_file(filename, &link_elf_class);
+ if (lf == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ ef = (elf_file_t) lf;
+#ifdef SPARSE_MAPPING
+ ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT);
+ if (ef->object == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ ef->address = (caddr_t) vm_map_min(kernel_map);
+ error = vm_map_find(kernel_map, ef->object, 0,
+ (vm_offset_t *) &ef->address, mapsize, 1,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error != 0) {
+ vm_object_deallocate(ef->object);
+ ef->object = 0;
+ goto out;
+ }
+#else
+ ef->address = malloc(mapsize, M_LINKER, M_WAITOK);
+#endif
+ mapbase = ef->address;
+
+ /*
+ * Read the text and data sections and zero the bss.
+ */
+ for (i = 0; i < nsegs; i++) {
+ caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr;
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ segbase, segs[i]->p_filesz, segs[i]->p_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error != 0)
+ goto out;
+ bzero(segbase + segs[i]->p_filesz,
+ segs[i]->p_memsz - segs[i]->p_filesz);
+
+#ifdef SPARSE_MAPPING
+ /*
+ * Wire down the pages
+ */
+ error = vm_map_wire(kernel_map,
+ (vm_offset_t) segbase,
+ (vm_offset_t) segbase + segs[i]->p_memsz,
+ VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
+ if (error != KERN_SUCCESS) {
+ error = ENOMEM;
+ goto out;
+ }
+#endif
+ }
+
+#ifdef GPROF
+ /* Update profiling information with the new text segment. */
+ mtx_lock(&Giant);
+ kmupetext((uintfptr_t)(mapbase + segs[0]->p_vaddr - base_vaddr +
+ segs[0]->p_memsz));
+ mtx_unlock(&Giant);
+#endif
+
+ ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr);
+
+ lf->address = ef->address;
+ lf->size = mapsize;
+
+ error = parse_dynamic(ef);
+ if (error != 0)
+ goto out;
+ error = parse_dpcpu(ef);
+ if (error != 0)
+ goto out;
+#ifdef VIMAGE
+ error = parse_vnet(ef);
+ if (error != 0)
+ goto out;
+#endif
+ link_elf_reloc_local(lf);
+
+ VOP_UNLOCK(nd.ni_vp, 0);
+ error = linker_load_dependencies(lf);
+ vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
+ if (error != 0)
+ goto out;
+ error = relocate_file(ef);
+ if (error != 0)
+ goto out;
+
+ /*
+	 * Try to load the symbol table if it's present. (You can
+	 * strip it!)
+ */
+ nbytes = hdr->e_shnum * hdr->e_shentsize;
+ if (nbytes == 0 || hdr->e_shoff == 0)
+ goto nosyms;
+ shdr = malloc(nbytes, M_LINKER, M_WAITOK | M_ZERO);
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ (caddr_t)shdr, nbytes, hdr->e_shoff,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error != 0)
+ goto out;
+ symtabindex = -1;
+ symstrindex = -1;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if (shdr[i].sh_type == SHT_SYMTAB) {
+ symtabindex = i;
+ symstrindex = shdr[i].sh_link;
+ }
+ }
+ if (symtabindex < 0 || symstrindex < 0)
+ goto nosyms;
+
+ symcnt = shdr[symtabindex].sh_size;
+ ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK);
+ strcnt = shdr[symstrindex].sh_size;
+ ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK);
+
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ ef->symbase, symcnt, shdr[symtabindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error != 0)
+ goto out;
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ ef->strbase, strcnt, shdr[symstrindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error != 0)
+ goto out;
+
+ ef->ddbsymcnt = symcnt / sizeof(Elf_Sym);
+ ef->ddbsymtab = (const Elf_Sym *)ef->symbase;
+ ef->ddbstrcnt = strcnt;
+ ef->ddbstrtab = ef->strbase;
+
+nosyms:
+ error = link_elf_link_common_finish(lf);
+ if (error != 0)
+ goto out;
+
+ *result = lf;
+
+out:
+ VOP_UNLOCK(nd.ni_vp, 0);
+ vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+ if (error != 0 && lf != NULL)
+ linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ if (shdr != NULL)
+ free(shdr, M_LINKER);
+ if (firstpage != NULL)
+ free(firstpage, M_LINKER);
+
+ return (error);
+}
+
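+/*
+ * Map a kernel virtual address that falls inside this file's dpcpu (or,
+ * with VIMAGE, vnet) linker set to the address of the runtime copy of
+ * that data.
+ */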
+Elf_Addr
+elf_relocaddr(linker_file_t lf, Elf_Addr x)
+{
+ elf_file_t ef;
+
+ ef = (elf_file_t)lf;
+ if (x >= ef->pcpu_start && x < ef->pcpu_stop)
+ return ((x - ef->pcpu_start) + ef->pcpu_base);
+#ifdef VIMAGE
+ if (x >= ef->vnet_start && x < ef->vnet_stop)
+ return ((x - ef->vnet_start) + ef->vnet_base);
+#endif
+ return (x);
+}
+
+static void
+link_elf_unload_file(linker_file_t file)
+{
+ elf_file_t ef = (elf_file_t) file;
+
+ if (ef->pcpu_base != 0) {
+ dpcpu_free((void *)ef->pcpu_base,
+ ef->pcpu_stop - ef->pcpu_start);
+ elf_set_delete(&set_pcpu_list, ef->pcpu_start);
+ }
+#ifdef VIMAGE
+ if (ef->vnet_base != 0) {
+ vnet_data_free((void *)ef->vnet_base,
+ ef->vnet_stop - ef->vnet_start);
+ elf_set_delete(&set_vnet_list, ef->vnet_start);
+ }
+#endif
+#ifdef GDB
+ if (ef->gdb.l_ld != NULL) {
+ GDB_STATE(RT_DELETE);
+ free((void *)(uintptr_t)ef->gdb.l_name, M_LINKER);
+ link_elf_delete_gdb(&ef->gdb);
+ GDB_STATE(RT_CONSISTENT);
+ }
+#endif
+
+ /* Notify MD code that a module is being unloaded. */
+ elf_cpu_unload_file(file);
+
+ if (ef->preloaded) {
+ link_elf_unload_preload(file);
+ return;
+ }
+
+#ifdef SPARSE_MAPPING
+ if (ef->object != NULL) {
+ vm_map_remove(kernel_map, (vm_offset_t) ef->address,
+ (vm_offset_t) ef->address
+ + (ef->object->size << PAGE_SHIFT));
+ }
+#else
+ if (ef->address != NULL)
+ free(ef->address, M_LINKER);
+#endif
+ if (ef->symbase != NULL)
+ free(ef->symbase, M_LINKER);
+ if (ef->strbase != NULL)
+ free(ef->strbase, M_LINKER);
+ if (ef->ctftab != NULL)
+ free(ef->ctftab, M_LINKER);
+ if (ef->ctfoff != NULL)
+ free(ef->ctfoff, M_LINKER);
+ if (ef->typoff != NULL)
+ free(ef->typoff, M_LINKER);
+}
+
+static void
+link_elf_unload_preload(linker_file_t file)
+{
+ if (file->filename != NULL)
+ preload_delete_name(file->filename);
+}
+
+static const char *
+symbol_name(elf_file_t ef, Elf_Size r_info)
+{
+ const Elf_Sym *ref;
+
+ if (ELF_R_SYM(r_info)) {
+ ref = ef->symtab + ELF_R_SYM(r_info);
+ return (ef->strtab + ref->st_name);
+ }
+ return (NULL);
+}
+
+static int
+relocate_file(elf_file_t ef)
+{
+ const Elf_Rel *rellim;
+ const Elf_Rel *rel;
+ const Elf_Rela *relalim;
+ const Elf_Rela *rela;
+ const char *symname;
+
+ /* Perform relocations without addend if there are any: */
+ rel = ef->rel;
+ if (rel != NULL) {
+ rellim = (const Elf_Rel *)
+ ((const char *)ef->rel + ef->relsize);
+ while (rel < rellim) {
+ if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rel,
+ ELF_RELOC_REL, elf_lookup)) {
+ symname = symbol_name(ef, rel->r_info);
+				printf("link_elf: symbol %s undefined\n",
+				    symname);
+ return (ENOENT);
+ }
+ rel++;
+ }
+ }
+
+ /* Perform relocations with addend if there are any: */
+ rela = ef->rela;
+ if (rela != NULL) {
+ relalim = (const Elf_Rela *)
+ ((const char *)ef->rela + ef->relasize);
+ while (rela < relalim) {
+ if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rela,
+ ELF_RELOC_RELA, elf_lookup)) {
+ symname = symbol_name(ef, rela->r_info);
+ printf("link_elf: symbol %s undefined\n",
+ symname);
+ return (ENOENT);
+ }
+ rela++;
+ }
+ }
+
+ /* Perform PLT relocations without addend if there are any: */
+ rel = ef->pltrel;
+ if (rel != NULL) {
+ rellim = (const Elf_Rel *)
+ ((const char *)ef->pltrel + ef->pltrelsize);
+ while (rel < rellim) {
+ if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rel,
+ ELF_RELOC_REL, elf_lookup)) {
+ symname = symbol_name(ef, rel->r_info);
+ printf("link_elf: symbol %s undefined\n",
+ symname);
+ return (ENOENT);
+ }
+ rel++;
+ }
+ }
+
+ /* Perform relocations with addend if there are any: */
+ rela = ef->pltrela;
+ if (rela != NULL) {
+ relalim = (const Elf_Rela *)
+ ((const char *)ef->pltrela + ef->pltrelasize);
+ while (rela < relalim) {
+ if (elf_reloc(&ef->lf, (Elf_Addr)ef->address, rela,
+ ELF_RELOC_RELA, elf_lookup)) {
+ symname = symbol_name(ef, rela->r_info);
+ printf("link_elf: symbol %s undefined\n",
+ symname);
+ return (ENOENT);
+ }
+ rela++;
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Hash function for symbol table lookup. Don't even think about changing
+ * this. It is specified by the System V ABI.
+ */
+static unsigned long
+elf_hash(const char *name)
+{
+ const unsigned char *p = (const unsigned char *) name;
+ unsigned long h = 0;
+ unsigned long g;
+
+ while (*p != '\0') {
+ h = (h << 4) + *p++;
+ if ((g = h & 0xf0000000) != 0)
+ h ^= g >> 24;
+ h &= ~g;
+ }
+ return (h);
+}
+
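+/*
+ * Look up a symbol by name: first through the ELF hash table built by the
+ * static linker and, if that fails, by an exhaustive scan of the loaded
+ * (ddb) symbol table.
+ */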
+static int
+link_elf_lookup_symbol(linker_file_t lf, const char* name, c_linker_sym_t* sym)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ unsigned long symnum;
+ const Elf_Sym* symp;
+ const char *strp;
+ unsigned long hash;
+ int i;
+
+ /* If we don't have a hash, bail. */
+ if (ef->buckets == NULL || ef->nbuckets == 0) {
+ printf("link_elf_lookup_symbol: missing symbol hash table\n");
+ return (ENOENT);
+ }
+
+ /* First, search hashed global symbols */
+ hash = elf_hash(name);
+ symnum = ef->buckets[hash % ef->nbuckets];
+
+ while (symnum != STN_UNDEF) {
+ if (symnum >= ef->nchains) {
+ printf("%s: corrupt symbol table\n", __func__);
+ return (ENOENT);
+ }
+
+ symp = ef->symtab + symnum;
+ if (symp->st_name == 0) {
+ printf("%s: corrupt symbol table\n", __func__);
+ return (ENOENT);
+ }
+
+ strp = ef->strtab + symp->st_name;
+
+ if (strcmp(name, strp) == 0) {
+ if (symp->st_shndx != SHN_UNDEF ||
+ (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC)) {
+ *sym = (c_linker_sym_t) symp;
+ return (0);
+ }
+ return (ENOENT);
+ }
+
+ symnum = ef->chains[symnum];
+ }
+
+ /* If we have not found it, look at the full table (if loaded) */
+ if (ef->symtab == ef->ddbsymtab)
+ return (ENOENT);
+
+ /* Exhaustive search */
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ strp = ef->ddbstrtab + symp->st_name;
+ if (strcmp(name, strp) == 0) {
+ if (symp->st_shndx != SHN_UNDEF ||
+ (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC)) {
+ *sym = (c_linker_sym_t) symp;
+ return (0);
+ }
+ return (ENOENT);
+ }
+ }
+
+ return (ENOENT);
+}
+
+static int
+link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym,
+ linker_symval_t *symval)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ const Elf_Sym* es = (const Elf_Sym*) sym;
+
+ if (es >= ef->symtab && es < (ef->symtab + ef->nchains)) {
+ symval->name = ef->strtab + es->st_name;
+ symval->value = (caddr_t) ef->address + es->st_value;
+ symval->size = es->st_size;
+ return (0);
+ }
+ if (ef->symtab == ef->ddbsymtab)
+ return (ENOENT);
+ if (es >= ef->ddbsymtab && es < (ef->ddbsymtab + ef->ddbsymcnt)) {
+ symval->name = ef->ddbstrtab + es->st_name;
+ symval->value = (caddr_t) ef->address + es->st_value;
+ symval->size = es->st_size;
+ return (0);
+ }
+ return (ENOENT);
+}
+
+static int
+link_elf_search_symbol(linker_file_t lf, caddr_t value,
+ c_linker_sym_t *sym, long *diffp)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ u_long off = (uintptr_t) (void *) value;
+ u_long diff = off;
+ u_long st_value;
+ const Elf_Sym* es;
+ const Elf_Sym* best = 0;
+ int i;
+
+ for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) {
+ if (es->st_name == 0)
+ continue;
+ st_value = es->st_value + (uintptr_t) (void *) ef->address;
+ if (off >= st_value) {
+ if (off - st_value < diff) {
+ diff = off - st_value;
+ best = es;
+ if (diff == 0)
+ break;
+ } else if (off - st_value == diff) {
+ best = es;
+ }
+ }
+ }
+ if (best == 0)
+ *diffp = off;
+ else
+ *diffp = diff;
+ *sym = (c_linker_sym_t) best;
+
+ return (0);
+}
+
+/*
+ * Look up a linker set on an ELF system.
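+ * The set bounds are taken from the "__start_set_<name>" and
+ * "__stop_set_<name>" symbols that the static linker provides for each set.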
+ */
+static int
+link_elf_lookup_set(linker_file_t lf, const char *name,
+ void ***startp, void ***stopp, int *countp)
+{
+ c_linker_sym_t sym;
+ linker_symval_t symval;
+ char *setsym;
+ void **start, **stop;
+ int len, error = 0, count;
+
+ len = strlen(name) + sizeof("__start_set_"); /* sizeof includes \0 */
+ setsym = malloc(len, M_LINKER, M_WAITOK);
+
+ /* get address of first entry */
+ snprintf(setsym, len, "%s%s", "__start_set_", name);
+ error = link_elf_lookup_symbol(lf, setsym, &sym);
+ if (error != 0)
+ goto out;
+ link_elf_symbol_values(lf, sym, &symval);
+ if (symval.value == 0) {
+ error = ESRCH;
+ goto out;
+ }
+ start = (void **)symval.value;
+
+ /* get address of last entry */
+ snprintf(setsym, len, "%s%s", "__stop_set_", name);
+ error = link_elf_lookup_symbol(lf, setsym, &sym);
+ if (error != 0)
+ goto out;
+ link_elf_symbol_values(lf, sym, &symval);
+ if (symval.value == 0) {
+ error = ESRCH;
+ goto out;
+ }
+ stop = (void **)symval.value;
+
+ /* and the number of entries */
+ count = stop - start;
+
+ /* and copy out */
+ if (startp != NULL)
+ *startp = start;
+ if (stopp != NULL)
+ *stopp = stop;
+ if (countp != NULL)
+ *countp = count;
+
+out:
+ free(setsym, M_LINKER);
+ return (error);
+}
+
+static int
+link_elf_each_function_name(linker_file_t file,
+ int (*callback)(const char *, void *), void *opaque)
+{
+ elf_file_t ef = (elf_file_t)file;
+ const Elf_Sym *symp;
+ int i, error;
+
+ /* Exhaustive search */
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ if (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC) {
+ error = callback(ef->ddbstrtab + symp->st_name, opaque);
+ if (error != 0)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+static int
+link_elf_each_function_nameval(linker_file_t file,
+ linker_function_nameval_callback_t callback, void *opaque)
+{
+ linker_symval_t symval;
+ elf_file_t ef = (elf_file_t)file;
+ const Elf_Sym* symp;
+ int i, error;
+
+ /* Exhaustive search */
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ if (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC) {
+ error = link_elf_symbol_values(file,
+ (c_linker_sym_t) symp, &symval);
+ if (error != 0)
+ return (error);
+ error = callback(file, i, &symval, opaque);
+ if (error != 0)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+#ifdef __ia64__
+/*
+ * Each KLD has its own GP. The GP value for each load module is given by
+ * DT_PLTGOT on ia64. We need GP to construct function descriptors, but
+ * don't have direct access to the ELF file structure. The link_elf_get_gp()
+ * function returns the GP given a pointer to a generic linker file struct.
+ */
+Elf_Addr
+link_elf_get_gp(linker_file_t lf)
+{
+ elf_file_t ef = (elf_file_t)lf;
+ return ((Elf_Addr)ef->got);
+}
+#endif
+
+const Elf_Sym *
+elf_get_sym(linker_file_t lf, Elf_Size symidx)
+{
+ elf_file_t ef = (elf_file_t)lf;
+
+ if (symidx >= ef->nchains)
+ return (NULL);
+ return (ef->symtab + symidx);
+}
+
+const char *
+elf_get_symname(linker_file_t lf, Elf_Size symidx)
+{
+ elf_file_t ef = (elf_file_t)lf;
+ const Elf_Sym *sym;
+
+ if (symidx >= ef->nchains)
+ return (NULL);
+ sym = ef->symtab + symidx;
+ return (ef->strtab + sym->st_name);
+}
+
+/*
+ * Symbol lookup function that can be used when the symbol index is known
+ * (i.e. in relocations). It uses the symbol index directly instead of doing
+ * a full hash-table-based lookup when that is valid, for example for local
+ * symbols. This is not only more efficient, it is also more correct: it is
+ * not always the case that the symbol can be found through the hash table.
+ */
+static Elf_Addr
+elf_lookup(linker_file_t lf, Elf_Size symidx, int deps)
+{
+ elf_file_t ef = (elf_file_t)lf;
+ const Elf_Sym *sym;
+ const char *symbol;
+ Elf_Addr addr, start, base;
+
+ /* Don't even try to lookup the symbol if the index is bogus. */
+ if (symidx >= ef->nchains)
+ return (0);
+
+ sym = ef->symtab + symidx;
+
+ /*
+ * Don't do a full lookup when the symbol is local. It may even
+ * fail because it may not be found through the hash table.
+ */
+ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) {
+ /* Force lookup failure when we have an insanity. */
+ if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0)
+ return (0);
+ return ((Elf_Addr)ef->address + sym->st_value);
+ }
+
+ /*
+ * XXX we can avoid doing a hash table based lookup for global
+ * symbols as well. This however is not always valid, so we'll
+ * just do it the hard way for now. Performance tweaks can
+ * always be added.
+ */
+
+ symbol = ef->strtab + sym->st_name;
+
+ /* Force a lookup failure if the symbol name is bogus. */
+ if (*symbol == 0)
+ return (0);
+
+ addr = ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps));
+
+ if (elf_set_find(&set_pcpu_list, addr, &start, &base))
+ addr = addr - start + base;
+#ifdef VIMAGE
+ else if (elf_set_find(&set_vnet_list, addr, &start, &base))
+ addr = addr - start + base;
+#endif
+	return (addr);
+}
+
+static void
+link_elf_reloc_local(linker_file_t lf)
+{
+ const Elf_Rel *rellim;
+ const Elf_Rel *rel;
+ const Elf_Rela *relalim;
+ const Elf_Rela *rela;
+ elf_file_t ef = (elf_file_t)lf;
+
+ /* Perform relocations without addend if there are any: */
+ if ((rel = ef->rel) != NULL) {
+ rellim = (const Elf_Rel *)((const char *)ef->rel + ef->relsize);
+ while (rel < rellim) {
+ elf_reloc_local(lf, (Elf_Addr)ef->address, rel,
+ ELF_RELOC_REL, elf_lookup);
+ rel++;
+ }
+ }
+
+ /* Perform relocations with addend if there are any: */
+ if ((rela = ef->rela) != NULL) {
+ relalim = (const Elf_Rela *)
+ ((const char *)ef->rela + ef->relasize);
+ while (rela < relalim) {
+ elf_reloc_local(lf, (Elf_Addr)ef->address, rela,
+ ELF_RELOC_RELA, elf_lookup);
+ rela++;
+ }
+ }
+}
+
+static long
+link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab)
+{
+ elf_file_t ef = (elf_file_t)lf;
+
+ *symtab = ef->ddbsymtab;
+
+ if (*symtab == NULL)
+ return (0);
+
+ return (ef->ddbsymcnt);
+}
+
+static long
+link_elf_strtab_get(linker_file_t lf, caddr_t *strtab)
+{
+ elf_file_t ef = (elf_file_t)lf;
+
+ *strtab = ef->ddbstrtab;
+
+ if (*strtab == NULL)
+ return (0);
+
+ return (ef->ddbstrcnt);
+}
diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c
new file mode 100644
index 0000000..a9208df
--- /dev/null
+++ b/sys/kern/link_elf_obj.c
@@ -0,0 +1,1375 @@
+/*-
+ * Copyright (c) 1998-2000 Doug Rabson
+ * Copyright (c) 2004 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/namei.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#include <sys/linker.h>
+
+#include <machine/elf.h>
+
+#include <net/vnet.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+#include <sys/link_elf.h>
+
+#ifdef DDB_CTF
+#include <net/zlib.h>
+#endif
+
+#include "linker_if.h"
+
+typedef struct {
+ void *addr;
+ Elf_Off size;
+ int flags;
+ int sec; /* Original section */
+ char *name;
+} Elf_progent;
+
+typedef struct {
+ Elf_Rel *rel;
+ int nrel;
+ int sec;
+} Elf_relent;
+
+typedef struct {
+ Elf_Rela *rela;
+ int nrela;
+ int sec;
+} Elf_relaent;
+
+typedef struct elf_file {
+ struct linker_file lf; /* Common fields */
+
+ int preloaded;
+ caddr_t address; /* Relocation address */
+ vm_object_t object; /* VM object to hold file pages */
+ Elf_Shdr *e_shdr;
+
+ Elf_progent *progtab;
+ int nprogtab;
+
+ Elf_relaent *relatab;
+ int nrelatab;
+
+ Elf_relent *reltab;
+ int nreltab;
+
+ Elf_Sym *ddbsymtab; /* The symbol table we are using */
+ long ddbsymcnt; /* Number of symbols */
+ caddr_t ddbstrtab; /* String table */
+ long ddbstrcnt; /* number of bytes in string table */
+
+ caddr_t shstrtab; /* Section name string table */
+ long shstrcnt; /* number of bytes in string table */
+
+ caddr_t ctftab; /* CTF table */
+ long ctfcnt; /* number of bytes in CTF table */
+ caddr_t ctfoff; /* CTF offset table */
+ caddr_t typoff; /* Type offset table */
+ long typlen; /* Number of type entries. */
+
+} *elf_file_t;
+
+#include <kern/kern_ctf.c>
+
+static int link_elf_link_preload(linker_class_t cls,
+ const char *, linker_file_t *);
+static int link_elf_link_preload_finish(linker_file_t);
+static int link_elf_load_file(linker_class_t, const char *, linker_file_t *);
+static int link_elf_lookup_symbol(linker_file_t, const char *,
+ c_linker_sym_t *);
+static int link_elf_symbol_values(linker_file_t, c_linker_sym_t,
+ linker_symval_t *);
+static int link_elf_search_symbol(linker_file_t, caddr_t value,
+ c_linker_sym_t *sym, long *diffp);
+
+static void link_elf_unload_file(linker_file_t);
+static int link_elf_lookup_set(linker_file_t, const char *,
+ void ***, void ***, int *);
+static int link_elf_each_function_name(linker_file_t,
+ int (*)(const char *, void *), void *);
+static int link_elf_each_function_nameval(linker_file_t,
+ linker_function_nameval_callback_t,
+ void *);
+static void link_elf_reloc_local(linker_file_t);
+static long link_elf_symtab_get(linker_file_t, const Elf_Sym **);
+static long link_elf_strtab_get(linker_file_t, caddr_t *);
+
+static Elf_Addr elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps);
+
+static kobj_method_t link_elf_methods[] = {
+ KOBJMETHOD(linker_lookup_symbol, link_elf_lookup_symbol),
+ KOBJMETHOD(linker_symbol_values, link_elf_symbol_values),
+ KOBJMETHOD(linker_search_symbol, link_elf_search_symbol),
+ KOBJMETHOD(linker_unload, link_elf_unload_file),
+ KOBJMETHOD(linker_load_file, link_elf_load_file),
+ KOBJMETHOD(linker_link_preload, link_elf_link_preload),
+ KOBJMETHOD(linker_link_preload_finish, link_elf_link_preload_finish),
+ KOBJMETHOD(linker_lookup_set, link_elf_lookup_set),
+ KOBJMETHOD(linker_each_function_name, link_elf_each_function_name),
+ KOBJMETHOD(linker_each_function_nameval, link_elf_each_function_nameval),
+ KOBJMETHOD(linker_ctf_get, link_elf_ctf_get),
+ KOBJMETHOD(linker_symtab_get, link_elf_symtab_get),
+ KOBJMETHOD(linker_strtab_get, link_elf_strtab_get),
+ { 0, 0 }
+};
+
+static struct linker_class link_elf_class = {
+#if ELF_TARG_CLASS == ELFCLASS32
+ "elf32_obj",
+#else
+ "elf64_obj",
+#endif
+ link_elf_methods, sizeof(struct elf_file)
+};
+
+static int relocate_file(elf_file_t ef);
+
+static void
+link_elf_error(const char *filename, const char *s)
+{
+ if (filename == NULL)
+ printf("kldload: %s\n", s);
+ else
+ printf("kldload: %s: %s\n", filename, s);
+}
+
+static void
+link_elf_init(void *arg)
+{
+
+ linker_add_class(&link_elf_class);
+}
+
+SYSINIT(link_elf_obj, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0);
+
+static int
+link_elf_link_preload(linker_class_t cls, const char *filename,
+ linker_file_t *result)
+{
+ Elf_Ehdr *hdr;
+ Elf_Shdr *shdr;
+ Elf_Sym *es;
+ void *modptr, *baseptr, *sizeptr;
+ char *type;
+ elf_file_t ef;
+ linker_file_t lf;
+ Elf_Addr off;
+ int error, i, j, pb, ra, rl, shstrindex, symstrindex, symtabindex;
+
+ /* Look to see if we have the file preloaded */
+ modptr = preload_search_by_name(filename);
+ if (modptr == NULL)
+ return ENOENT;
+
+ type = (char *)preload_search_info(modptr, MODINFO_TYPE);
+ baseptr = preload_search_info(modptr, MODINFO_ADDR);
+ sizeptr = preload_search_info(modptr, MODINFO_SIZE);
+ hdr = (Elf_Ehdr *)preload_search_info(modptr, MODINFO_METADATA |
+ MODINFOMD_ELFHDR);
+ shdr = (Elf_Shdr *)preload_search_info(modptr, MODINFO_METADATA |
+ MODINFOMD_SHDR);
+ if (type == NULL || (strcmp(type, "elf" __XSTRING(__ELF_WORD_SIZE)
+ " obj module") != 0 &&
+ strcmp(type, "elf obj module") != 0)) {
+ return (EFTYPE);
+ }
+ if (baseptr == NULL || sizeptr == NULL || hdr == NULL ||
+ shdr == NULL)
+ return (EINVAL);
+
+ lf = linker_make_file(filename, &link_elf_class);
+ if (lf == NULL)
+ return (ENOMEM);
+
+ ef = (elf_file_t)lf;
+ ef->preloaded = 1;
+ ef->address = *(caddr_t *)baseptr;
+ lf->address = *(caddr_t *)baseptr;
+ lf->size = *(size_t *)sizeptr;
+
+ if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
+ hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
+ hdr->e_ident[EI_VERSION] != EV_CURRENT ||
+ hdr->e_version != EV_CURRENT ||
+ hdr->e_type != ET_REL ||
+ hdr->e_machine != ELF_TARG_MACH) {
+ error = EFTYPE;
+ goto out;
+ }
+ ef->e_shdr = shdr;
+
+ /* Scan the section header for information and table sizing. */
+ symtabindex = -1;
+ symstrindex = -1;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ switch (shdr[i].sh_type) {
+ case SHT_PROGBITS:
+ case SHT_NOBITS:
+ ef->nprogtab++;
+ break;
+ case SHT_SYMTAB:
+ symtabindex = i;
+ symstrindex = shdr[i].sh_link;
+ break;
+ case SHT_REL:
+ ef->nreltab++;
+ break;
+ case SHT_RELA:
+ ef->nrelatab++;
+ break;
+ }
+ }
+
+ shstrindex = hdr->e_shstrndx;
+ if (ef->nprogtab == 0 || symstrindex < 0 ||
+ symstrindex >= hdr->e_shnum ||
+ shdr[symstrindex].sh_type != SHT_STRTAB || shstrindex == 0 ||
+ shstrindex >= hdr->e_shnum ||
+ shdr[shstrindex].sh_type != SHT_STRTAB) {
+ printf("%s: bad/missing section headers\n", filename);
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /* Allocate space for tracking the load chunks */
+ if (ef->nprogtab != 0)
+ ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab),
+ M_LINKER, M_WAITOK | M_ZERO);
+ if (ef->nreltab != 0)
+ ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab),
+ M_LINKER, M_WAITOK | M_ZERO);
+ if (ef->nrelatab != 0)
+ ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab),
+ M_LINKER, M_WAITOK | M_ZERO);
+ if ((ef->nprogtab != 0 && ef->progtab == NULL) ||
+ (ef->nreltab != 0 && ef->reltab == NULL) ||
+ (ef->nrelatab != 0 && ef->relatab == NULL)) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ /* XXX, relocate the sh_addr fields saved by the loader. */
+ off = 0;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if (shdr[i].sh_addr != 0 && (off == 0 || shdr[i].sh_addr < off))
+ off = shdr[i].sh_addr;
+ }
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if (shdr[i].sh_addr != 0)
+ shdr[i].sh_addr = shdr[i].sh_addr - off +
+ (Elf_Addr)ef->address;
+ }
+
+ ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym);
+ ef->ddbsymtab = (Elf_Sym *)shdr[symtabindex].sh_addr;
+ ef->ddbstrcnt = shdr[symstrindex].sh_size;
+ ef->ddbstrtab = (char *)shdr[symstrindex].sh_addr;
+ ef->shstrcnt = shdr[shstrindex].sh_size;
+ ef->shstrtab = (char *)shdr[shstrindex].sh_addr;
+
+ /* Now fill out progtab and the relocation tables. */
+ pb = 0;
+ rl = 0;
+ ra = 0;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ switch (shdr[i].sh_type) {
+ case SHT_PROGBITS:
+ case SHT_NOBITS:
+ ef->progtab[pb].addr = (void *)shdr[i].sh_addr;
+ if (shdr[i].sh_type == SHT_PROGBITS)
+ ef->progtab[pb].name = "<<PROGBITS>>";
+ else
+ ef->progtab[pb].name = "<<NOBITS>>";
+ ef->progtab[pb].size = shdr[i].sh_size;
+ ef->progtab[pb].sec = i;
+ if (ef->shstrtab && shdr[i].sh_name != 0)
+ ef->progtab[pb].name =
+ ef->shstrtab + shdr[i].sh_name;
+ if (ef->progtab[pb].name != NULL &&
+ !strcmp(ef->progtab[pb].name, DPCPU_SETNAME)) {
+ void *dpcpu;
+
+ dpcpu = dpcpu_alloc(shdr[i].sh_size);
+ if (dpcpu == NULL) {
+ error = ENOSPC;
+ goto out;
+ }
+ memcpy(dpcpu, ef->progtab[pb].addr,
+ ef->progtab[pb].size);
+ dpcpu_copy(dpcpu, shdr[i].sh_size);
+ ef->progtab[pb].addr = dpcpu;
+#ifdef VIMAGE
+ } else if (ef->progtab[pb].name != NULL &&
+ !strcmp(ef->progtab[pb].name, VNET_SETNAME)) {
+ void *vnet_data;
+
+ vnet_data = vnet_data_alloc(shdr[i].sh_size);
+ if (vnet_data == NULL) {
+ error = ENOSPC;
+ goto out;
+ }
+ memcpy(vnet_data, ef->progtab[pb].addr,
+ ef->progtab[pb].size);
+ vnet_data_copy(vnet_data, shdr[i].sh_size);
+ ef->progtab[pb].addr = vnet_data;
+#endif
+ }
+
+ /* Update all symbol values with the offset. */
+ for (j = 0; j < ef->ddbsymcnt; j++) {
+ es = &ef->ddbsymtab[j];
+ if (es->st_shndx != i)
+ continue;
+ es->st_value += (Elf_Addr)ef->progtab[pb].addr;
+ }
+ pb++;
+ break;
+ case SHT_REL:
+ ef->reltab[rl].rel = (Elf_Rel *)shdr[i].sh_addr;
+ ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel);
+ ef->reltab[rl].sec = shdr[i].sh_info;
+ rl++;
+ break;
+ case SHT_RELA:
+ ef->relatab[ra].rela = (Elf_Rela *)shdr[i].sh_addr;
+ ef->relatab[ra].nrela =
+ shdr[i].sh_size / sizeof(Elf_Rela);
+ ef->relatab[ra].sec = shdr[i].sh_info;
+ ra++;
+ break;
+ }
+ }
+ if (pb != ef->nprogtab)
+ panic("lost progbits");
+ if (rl != ef->nreltab)
+ panic("lost reltab");
+ if (ra != ef->nrelatab)
+ panic("lost relatab");
+
+ /* Local intra-module relocations */
+ link_elf_reloc_local(lf);
+
+ *result = lf;
+ return (0);
+
+out:
+ /* preload not done this way */
+ linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ return (error);
+}
+
+static int
+link_elf_link_preload_finish(linker_file_t lf)
+{
+ elf_file_t ef;
+ int error;
+
+ ef = (elf_file_t)lf;
+ error = relocate_file(ef);
+ if (error)
+ return error;
+
+ /* Notify MD code that a module is being loaded. */
+ error = elf_cpu_load_file(lf);
+ if (error)
+ return (error);
+
+ return (0);
+}
+
+static int
+link_elf_load_file(linker_class_t cls, const char *filename,
+ linker_file_t *result)
+{
+ struct nameidata nd;
+ struct thread *td = curthread; /* XXX */
+ Elf_Ehdr *hdr;
+ Elf_Shdr *shdr;
+ Elf_Sym *es;
+ int nbytes, i, j;
+ vm_offset_t mapbase;
+ size_t mapsize;
+ int error = 0;
+ ssize_t resid;
+ int flags;
+ elf_file_t ef;
+ linker_file_t lf;
+ int symtabindex;
+ int symstrindex;
+ int shstrindex;
+ int nsym;
+ int pb, rl, ra;
+ int alignmask;
+
+ shdr = NULL;
+ lf = NULL;
+ mapsize = 0;
+ hdr = NULL;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error)
+ return error;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp->v_type != VREG) {
+ error = ENOEXEC;
+ goto out;
+ }
+#ifdef MAC
+ error = mac_kld_check_load(td->td_ucred, nd.ni_vp);
+ if (error) {
+ goto out;
+ }
+#endif
+
+ /* Read the elf header from the file. */
+ hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK);
+ error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)hdr, sizeof(*hdr), 0,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error)
+ goto out;
+ if (resid != 0){
+ error = ENOEXEC;
+ goto out;
+ }
+
+ if (!IS_ELF(*hdr)) {
+ error = ENOEXEC;
+ goto out;
+ }
+
+ if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS
+ || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) {
+ link_elf_error(filename, "Unsupported file layout");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_ident[EI_VERSION] != EV_CURRENT
+ || hdr->e_version != EV_CURRENT) {
+ link_elf_error(filename, "Unsupported file version");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (hdr->e_type != ET_REL) {
+ error = ENOSYS;
+ goto out;
+ }
+ if (hdr->e_machine != ELF_TARG_MACH) {
+ link_elf_error(filename, "Unsupported machine");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ lf = linker_make_file(filename, &link_elf_class);
+ if (!lf) {
+ error = ENOMEM;
+ goto out;
+ }
+ ef = (elf_file_t) lf;
+ ef->nprogtab = 0;
+ ef->e_shdr = 0;
+ ef->nreltab = 0;
+ ef->nrelatab = 0;
+
+ /* Allocate and read in the section header */
+ nbytes = hdr->e_shnum * hdr->e_shentsize;
+ if (nbytes == 0 || hdr->e_shoff == 0 ||
+ hdr->e_shentsize != sizeof(Elf_Shdr)) {
+ error = ENOEXEC;
+ goto out;
+ }
+ shdr = malloc(nbytes, M_LINKER, M_WAITOK);
+ ef->e_shdr = shdr;
+ error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, &resid, td);
+ if (error)
+ goto out;
+ if (resid) {
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /* Scan the section header for information and table sizing. */
+ nsym = 0;
+ symtabindex = -1;
+ symstrindex = -1;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if (shdr[i].sh_size == 0)
+ continue;
+ switch (shdr[i].sh_type) {
+ case SHT_PROGBITS:
+ case SHT_NOBITS:
+ ef->nprogtab++;
+ break;
+ case SHT_SYMTAB:
+ nsym++;
+ symtabindex = i;
+ symstrindex = shdr[i].sh_link;
+ break;
+ case SHT_REL:
+ ef->nreltab++;
+ break;
+ case SHT_RELA:
+ ef->nrelatab++;
+ break;
+ case SHT_STRTAB:
+ break;
+ }
+ }
+ if (ef->nprogtab == 0) {
+ link_elf_error(filename, "file has no contents");
+ error = ENOEXEC;
+ goto out;
+ }
+ if (nsym != 1) {
+ /* Only allow one symbol table for now */
+ link_elf_error(filename, "file has no valid symbol table");
+ error = ENOEXEC;
+ goto out;
+ }
+	if (symstrindex < 0 || symstrindex >= hdr->e_shnum ||
+ shdr[symstrindex].sh_type != SHT_STRTAB) {
+ link_elf_error(filename, "file has invalid symbol strings");
+ error = ENOEXEC;
+ goto out;
+ }
+
+ /* Allocate space for tracking the load chunks */
+ if (ef->nprogtab != 0)
+ ef->progtab = malloc(ef->nprogtab * sizeof(*ef->progtab),
+ M_LINKER, M_WAITOK | M_ZERO);
+ if (ef->nreltab != 0)
+ ef->reltab = malloc(ef->nreltab * sizeof(*ef->reltab),
+ M_LINKER, M_WAITOK | M_ZERO);
+ if (ef->nrelatab != 0)
+ ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab),
+ M_LINKER, M_WAITOK | M_ZERO);
+
+ if (symtabindex == -1)
+ panic("lost symbol table index");
+ /* Allocate space for and load the symbol table */
+ ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym);
+ ef->ddbsymtab = malloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK);
+ error = vn_rdwr(UIO_READ, nd.ni_vp, (void *)ef->ddbsymtab,
+ shdr[symtabindex].sh_size, shdr[symtabindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error)
+ goto out;
+ if (resid != 0){
+ error = EINVAL;
+ goto out;
+ }
+
+ if (symstrindex == -1)
+ panic("lost symbol string index");
+ /* Allocate space for and load the symbol strings */
+ ef->ddbstrcnt = shdr[symstrindex].sh_size;
+ ef->ddbstrtab = malloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK);
+ error = vn_rdwr(UIO_READ, nd.ni_vp, ef->ddbstrtab,
+ shdr[symstrindex].sh_size, shdr[symstrindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error)
+ goto out;
+ if (resid != 0){
+ error = EINVAL;
+ goto out;
+ }
+
+ /* Do we have a string table for the section names? */
+ shstrindex = -1;
+ if (hdr->e_shstrndx != 0 &&
+ shdr[hdr->e_shstrndx].sh_type == SHT_STRTAB) {
+ shstrindex = hdr->e_shstrndx;
+ ef->shstrcnt = shdr[shstrindex].sh_size;
+ ef->shstrtab = malloc(shdr[shstrindex].sh_size, M_LINKER,
+ M_WAITOK);
+ error = vn_rdwr(UIO_READ, nd.ni_vp, ef->shstrtab,
+ shdr[shstrindex].sh_size, shdr[shstrindex].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error)
+ goto out;
+ if (resid != 0){
+ error = EINVAL;
+ goto out;
+ }
+ }
+
+ /* Size up code/data(progbits) and bss(nobits). */
+ alignmask = 0;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if (shdr[i].sh_size == 0)
+ continue;
+ switch (shdr[i].sh_type) {
+ case SHT_PROGBITS:
+ case SHT_NOBITS:
+ alignmask = shdr[i].sh_addralign - 1;
+ mapsize += alignmask;
+ mapsize &= ~alignmask;
+ mapsize += shdr[i].sh_size;
+ break;
+ }
+ }
+
+ /*
+ * We know how much space we need for the text/data/bss/etc.
+	 * This stuff needs to be in a single chunk so that profiling etc.
+	 * can get the bounds and gdb can associate offsets with modules.
+ */
+ ef->object = vm_object_allocate(OBJT_DEFAULT,
+ round_page(mapsize) >> PAGE_SHIFT);
+ if (ef->object == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ ef->address = (caddr_t) vm_map_min(kernel_map);
+
+ /*
+ * In order to satisfy amd64's architectural requirements on the
+ * location of code and data in the kernel's address space, request a
+ * mapping that is above the kernel.
+ */
+#ifdef __amd64__
+ mapbase = KERNBASE;
+#else
+ mapbase = VM_MIN_KERNEL_ADDRESS;
+#endif
+ error = vm_map_find(kernel_map, ef->object, 0, &mapbase,
+ round_page(mapsize), TRUE, VM_PROT_ALL, VM_PROT_ALL, FALSE);
+ if (error) {
+ vm_object_deallocate(ef->object);
+ ef->object = 0;
+ goto out;
+ }
+
+ /* Wire the pages */
+ error = vm_map_wire(kernel_map, mapbase,
+ mapbase + round_page(mapsize),
+ VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
+ if (error != KERN_SUCCESS) {
+ error = ENOMEM;
+ goto out;
+ }
+
+ /* Inform the kld system about the situation */
+ lf->address = ef->address = (caddr_t)mapbase;
+ lf->size = mapsize;
+
+ /*
+ * Now load code/data(progbits), zero bss(nobits), allocate space for
+ * and load relocs
+ */
+ pb = 0;
+ rl = 0;
+ ra = 0;
+ alignmask = 0;
+ for (i = 0; i < hdr->e_shnum; i++) {
+ if (shdr[i].sh_size == 0)
+ continue;
+ switch (shdr[i].sh_type) {
+ case SHT_PROGBITS:
+ case SHT_NOBITS:
+ alignmask = shdr[i].sh_addralign - 1;
+ mapbase += alignmask;
+ mapbase &= ~alignmask;
+ if (ef->shstrtab && shdr[i].sh_name != 0)
+ ef->progtab[pb].name =
+ ef->shstrtab + shdr[i].sh_name;
+ else if (shdr[i].sh_type == SHT_PROGBITS)
+ ef->progtab[pb].name = "<<PROGBITS>>";
+ else
+ ef->progtab[pb].name = "<<NOBITS>>";
+ if (ef->progtab[pb].name != NULL &&
+ !strcmp(ef->progtab[pb].name, DPCPU_SETNAME))
+ ef->progtab[pb].addr =
+ dpcpu_alloc(shdr[i].sh_size);
+#ifdef VIMAGE
+ else if (ef->progtab[pb].name != NULL &&
+ !strcmp(ef->progtab[pb].name, VNET_SETNAME))
+ ef->progtab[pb].addr =
+ vnet_data_alloc(shdr[i].sh_size);
+#endif
+ else
+ ef->progtab[pb].addr =
+ (void *)(uintptr_t)mapbase;
+ if (ef->progtab[pb].addr == NULL) {
+ error = ENOSPC;
+ goto out;
+ }
+ ef->progtab[pb].size = shdr[i].sh_size;
+ ef->progtab[pb].sec = i;
+ if (shdr[i].sh_type == SHT_PROGBITS) {
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ ef->progtab[pb].addr,
+ shdr[i].sh_size, shdr[i].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
+ NOCRED, &resid, td);
+ if (error)
+ goto out;
+ if (resid != 0){
+ error = EINVAL;
+ goto out;
+ }
+ /* Initialize the per-cpu or vnet area. */
+ if (ef->progtab[pb].addr != (void *)mapbase &&
+ !strcmp(ef->progtab[pb].name, DPCPU_SETNAME))
+ dpcpu_copy(ef->progtab[pb].addr,
+ shdr[i].sh_size);
+#ifdef VIMAGE
+ else if (ef->progtab[pb].addr !=
+ (void *)mapbase &&
+ !strcmp(ef->progtab[pb].name, VNET_SETNAME))
+ vnet_data_copy(ef->progtab[pb].addr,
+ shdr[i].sh_size);
+#endif
+ } else
+ bzero(ef->progtab[pb].addr, shdr[i].sh_size);
+
+ /* Update all symbol values with the offset. */
+ for (j = 0; j < ef->ddbsymcnt; j++) {
+ es = &ef->ddbsymtab[j];
+ if (es->st_shndx != i)
+ continue;
+ es->st_value += (Elf_Addr)ef->progtab[pb].addr;
+ }
+ mapbase += shdr[i].sh_size;
+ pb++;
+ break;
+ case SHT_REL:
+ ef->reltab[rl].rel = malloc(shdr[i].sh_size, M_LINKER,
+ M_WAITOK);
+ ef->reltab[rl].nrel = shdr[i].sh_size / sizeof(Elf_Rel);
+ ef->reltab[rl].sec = shdr[i].sh_info;
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ (void *)ef->reltab[rl].rel,
+ shdr[i].sh_size, shdr[i].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error)
+ goto out;
+ if (resid != 0){
+ error = EINVAL;
+ goto out;
+ }
+ rl++;
+ break;
+ case SHT_RELA:
+ ef->relatab[ra].rela = malloc(shdr[i].sh_size, M_LINKER,
+ M_WAITOK);
+ ef->relatab[ra].nrela =
+ shdr[i].sh_size / sizeof(Elf_Rela);
+ ef->relatab[ra].sec = shdr[i].sh_info;
+ error = vn_rdwr(UIO_READ, nd.ni_vp,
+ (void *)ef->relatab[ra].rela,
+ shdr[i].sh_size, shdr[i].sh_offset,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
+ &resid, td);
+ if (error)
+ goto out;
+ if (resid != 0){
+ error = EINVAL;
+ goto out;
+ }
+ ra++;
+ break;
+ }
+ }
+ if (pb != ef->nprogtab)
+ panic("lost progbits");
+ if (rl != ef->nreltab)
+ panic("lost reltab");
+ if (ra != ef->nrelatab)
+ panic("lost relatab");
+ if (mapbase != (vm_offset_t)ef->address + mapsize)
+ panic("mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)\n",
+ (u_long)mapbase, ef->address, (u_long)mapsize,
+ (u_long)(vm_offset_t)ef->address + mapsize);
+
+ /* Local intra-module relocations */
+ link_elf_reloc_local(lf);
+
+ /* Pull in dependencies */
+ VOP_UNLOCK(nd.ni_vp, 0);
+ error = linker_load_dependencies(lf);
+ vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
+ if (error)
+ goto out;
+
+ /* External relocations */
+ error = relocate_file(ef);
+ if (error)
+ goto out;
+
+ /* Notify MD code that a module is being loaded. */
+ error = elf_cpu_load_file(lf);
+ if (error)
+ goto out;
+
+ *result = lf;
+
+out:
+ VOP_UNLOCK(nd.ni_vp, 0);
+ vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+ if (error && lf)
+ linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+ if (hdr)
+ free(hdr, M_LINKER);
+
+ return error;
+}
+
+static void
+link_elf_unload_file(linker_file_t file)
+{
+ elf_file_t ef = (elf_file_t) file;
+ int i;
+
+ /* Notify MD code that a module is being unloaded. */
+ elf_cpu_unload_file(file);
+
+ if (ef->progtab) {
+ for (i = 0; i < ef->nprogtab; i++) {
+ if (ef->progtab[i].size == 0)
+ continue;
+ if (ef->progtab[i].name == NULL)
+ continue;
+ if (!strcmp(ef->progtab[i].name, DPCPU_SETNAME))
+ dpcpu_free(ef->progtab[i].addr,
+ ef->progtab[i].size);
+#ifdef VIMAGE
+ else if (!strcmp(ef->progtab[i].name, VNET_SETNAME))
+ vnet_data_free(ef->progtab[i].addr,
+ ef->progtab[i].size);
+#endif
+ }
+ }
+ if (ef->preloaded) {
+ if (ef->reltab)
+ free(ef->reltab, M_LINKER);
+ if (ef->relatab)
+ free(ef->relatab, M_LINKER);
+ if (ef->progtab)
+ free(ef->progtab, M_LINKER);
+ if (ef->ctftab)
+ free(ef->ctftab, M_LINKER);
+ if (ef->ctfoff)
+ free(ef->ctfoff, M_LINKER);
+ if (ef->typoff)
+ free(ef->typoff, M_LINKER);
+ if (file->filename != NULL)
+ preload_delete_name(file->filename);
+ /* XXX reclaim module memory? */
+ return;
+ }
+
+ for (i = 0; i < ef->nreltab; i++)
+ if (ef->reltab[i].rel)
+ free(ef->reltab[i].rel, M_LINKER);
+ for (i = 0; i < ef->nrelatab; i++)
+ if (ef->relatab[i].rela)
+ free(ef->relatab[i].rela, M_LINKER);
+ if (ef->reltab)
+ free(ef->reltab, M_LINKER);
+ if (ef->relatab)
+ free(ef->relatab, M_LINKER);
+ if (ef->progtab)
+ free(ef->progtab, M_LINKER);
+
+ if (ef->object) {
+ vm_map_remove(kernel_map, (vm_offset_t) ef->address,
+ (vm_offset_t) ef->address +
+ (ef->object->size << PAGE_SHIFT));
+ }
+ if (ef->e_shdr)
+ free(ef->e_shdr, M_LINKER);
+ if (ef->ddbsymtab)
+ free(ef->ddbsymtab, M_LINKER);
+ if (ef->ddbstrtab)
+ free(ef->ddbstrtab, M_LINKER);
+ if (ef->shstrtab)
+ free(ef->shstrtab, M_LINKER);
+ if (ef->ctftab)
+ free(ef->ctftab, M_LINKER);
+ if (ef->ctfoff)
+ free(ef->ctfoff, M_LINKER);
+ if (ef->typoff)
+ free(ef->typoff, M_LINKER);
+}
+
+static const char *
+symbol_name(elf_file_t ef, Elf_Size r_info)
+{
+ const Elf_Sym *ref;
+
+ if (ELF_R_SYM(r_info)) {
+ ref = ef->ddbsymtab + ELF_R_SYM(r_info);
+ return ef->ddbstrtab + ref->st_name;
+ } else
+ return NULL;
+}
+
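+/*
+ * Return the load address of the program section that was read from ELF
+ * section 'sec', or 0 if no progtab entry matches.
+ */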
+static Elf_Addr
+findbase(elf_file_t ef, int sec)
+{
+ int i;
+ Elf_Addr base = 0;
+
+ for (i = 0; i < ef->nprogtab; i++) {
+ if (sec == ef->progtab[i].sec) {
+ base = (Elf_Addr)ef->progtab[i].addr;
+ break;
+ }
+ }
+ return base;
+}
+
+static int
+relocate_file(elf_file_t ef)
+{
+ const Elf_Rel *rellim;
+ const Elf_Rel *rel;
+ const Elf_Rela *relalim;
+ const Elf_Rela *rela;
+ const char *symname;
+ const Elf_Sym *sym;
+ int i;
+ Elf_Size symidx;
+ Elf_Addr base;
+
+ /* Perform relocations without addend if there are any: */
+ for (i = 0; i < ef->nreltab; i++) {
+ rel = ef->reltab[i].rel;
+ if (rel == NULL)
+ panic("lost a reltab!");
+ rellim = rel + ef->reltab[i].nrel;
+ base = findbase(ef, ef->reltab[i].sec);
+ if (base == 0)
+ panic("lost base for reltab");
+ for ( ; rel < rellim; rel++) {
+ symidx = ELF_R_SYM(rel->r_info);
+ if (symidx >= ef->ddbsymcnt)
+ continue;
+ sym = ef->ddbsymtab + symidx;
+ /* Local relocs are already done */
+ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL)
+ continue;
+ if (elf_reloc(&ef->lf, base, rel, ELF_RELOC_REL,
+ elf_obj_lookup)) {
+ symname = symbol_name(ef, rel->r_info);
+ printf("link_elf_obj: symbol %s undefined\n",
+ symname);
+ return ENOENT;
+ }
+ }
+ }
+
+ /* Perform relocations with addend if there are any: */
+ for (i = 0; i < ef->nrelatab; i++) {
+ rela = ef->relatab[i].rela;
+ if (rela == NULL)
+ panic("lost a relatab!");
+ relalim = rela + ef->relatab[i].nrela;
+ base = findbase(ef, ef->relatab[i].sec);
+ if (base == 0)
+ panic("lost base for relatab");
+ for ( ; rela < relalim; rela++) {
+ symidx = ELF_R_SYM(rela->r_info);
+ if (symidx >= ef->ddbsymcnt)
+ continue;
+ sym = ef->ddbsymtab + symidx;
+ /* Local relocs are already done */
+ if (ELF_ST_BIND(sym->st_info) == STB_LOCAL)
+ continue;
+ if (elf_reloc(&ef->lf, base, rela, ELF_RELOC_RELA,
+ elf_obj_lookup)) {
+ symname = symbol_name(ef, rela->r_info);
+ printf("link_elf_obj: symbol %s undefined\n",
+ symname);
+ return ENOENT;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int
+link_elf_lookup_symbol(linker_file_t lf, const char *name, c_linker_sym_t *sym)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ const Elf_Sym *symp;
+ const char *strp;
+ int i;
+
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ strp = ef->ddbstrtab + symp->st_name;
+ if (symp->st_shndx != SHN_UNDEF && strcmp(name, strp) == 0) {
+ *sym = (c_linker_sym_t) symp;
+ return 0;
+ }
+ }
+ return ENOENT;
+}
+
+static int
+link_elf_symbol_values(linker_file_t lf, c_linker_sym_t sym,
+ linker_symval_t *symval)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ const Elf_Sym *es = (const Elf_Sym*) sym;
+
+ if (es >= ef->ddbsymtab && es < (ef->ddbsymtab + ef->ddbsymcnt)) {
+ symval->name = ef->ddbstrtab + es->st_name;
+ symval->value = (caddr_t)es->st_value;
+ symval->size = es->st_size;
+ return 0;
+ }
+ return ENOENT;
+}
+
+static int
+link_elf_search_symbol(linker_file_t lf, caddr_t value,
+ c_linker_sym_t *sym, long *diffp)
+{
+ elf_file_t ef = (elf_file_t) lf;
+ u_long off = (uintptr_t) (void *) value;
+ u_long diff = off;
+ u_long st_value;
+ const Elf_Sym *es;
+ const Elf_Sym *best = 0;
+ int i;
+
+ for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) {
+ if (es->st_name == 0)
+ continue;
+ st_value = es->st_value;
+ if (off >= st_value) {
+ if (off - st_value < diff) {
+ diff = off - st_value;
+ best = es;
+ if (diff == 0)
+ break;
+ } else if (off - st_value == diff) {
+ best = es;
+ }
+ }
+ }
+ if (best == 0)
+ *diffp = off;
+ else
+ *diffp = diff;
+ *sym = (c_linker_sym_t) best;
+
+ return 0;
+}
+
+/*
+ * Look up a linker set on an ELF system.
+ */
+static int
+link_elf_lookup_set(linker_file_t lf, const char *name,
+ void ***startp, void ***stopp, int *countp)
+{
+ elf_file_t ef = (elf_file_t)lf;
+ void **start, **stop;
+ int i, count;
+
+ /* Relative to section number */
+ for (i = 0; i < ef->nprogtab; i++) {
+ if ((strncmp(ef->progtab[i].name, "set_", 4) == 0) &&
+ strcmp(ef->progtab[i].name + 4, name) == 0) {
+ start = (void **)ef->progtab[i].addr;
+ stop = (void **)((char *)ef->progtab[i].addr +
+ ef->progtab[i].size);
+ count = stop - start;
+ if (startp)
+ *startp = start;
+ if (stopp)
+ *stopp = stop;
+ if (countp)
+ *countp = count;
+ return (0);
+ }
+ }
+ return (ESRCH);
+}
+
+static int
+link_elf_each_function_name(linker_file_t file,
+ int (*callback)(const char *, void *), void *opaque)
+{
+ elf_file_t ef = (elf_file_t)file;
+ const Elf_Sym *symp;
+ int i, error;
+
+ /* Exhaustive search */
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ if (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC) {
+ error = callback(ef->ddbstrtab + symp->st_name, opaque);
+ if (error)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+static int
+link_elf_each_function_nameval(linker_file_t file,
+ linker_function_nameval_callback_t callback, void *opaque)
+{
+ linker_symval_t symval;
+ elf_file_t ef = (elf_file_t)file;
+ const Elf_Sym* symp;
+ int i, error;
+
+ /* Exhaustive search */
+ for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
+ if (symp->st_value != 0 &&
+ ELF_ST_TYPE(symp->st_info) == STT_FUNC) {
+ error = link_elf_symbol_values(file, (c_linker_sym_t) symp, &symval);
+ if (error)
+ return (error);
+ error = callback(file, i, &symval, opaque);
+ if (error)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Symbol lookup function that can be used when the symbol index is known
+ * (i.e. in relocations). It uses the symbol index directly instead of doing
+ * a full hash-table-based lookup when that is valid, for example for local
+ * symbols. This is not only more efficient, it is also more correct: it is
+ * not always the case that the symbol can be found through the hash table.
+ */
+static Elf_Addr
+elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps)
+{
+ elf_file_t ef = (elf_file_t)lf;
+ const Elf_Sym *sym;
+ const char *symbol;
+ Elf_Addr ret;
+
+ /* Don't even try to lookup the symbol if the index is bogus. */
+ if (symidx >= ef->ddbsymcnt)
+ return (0);
+
+ sym = ef->ddbsymtab + symidx;
+
+ /* Quick answer if there is a definition included. */
+ if (sym->st_shndx != SHN_UNDEF)
+ return (sym->st_value);
+
+ /* If we get here, then it is undefined and needs a lookup. */
+ switch (ELF_ST_BIND(sym->st_info)) {
+ case STB_LOCAL:
+ /* Local, but undefined? huh? */
+ return (0);
+
+ case STB_GLOBAL:
+ /* Relative to Data or Function name */
+ symbol = ef->ddbstrtab + sym->st_name;
+
+ /* Force a lookup failure if the symbol name is bogus. */
+ if (*symbol == 0)
+ return (0);
+ ret = ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps));
+ return ret;
+
+ case STB_WEAK:
+ printf("link_elf_obj: Weak symbols not supported\n");
+ return (0);
+
+ default:
+ return (0);
+ }
+}
+
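+/*
+ * Resolve undefined "__start_<set>" and "__stop_<set>" symbols against the
+ * matching progtab entry so that linker sets cover the loaded section
+ * contents before local relocations are applied.
+ */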
+static void
+link_elf_fix_link_set(elf_file_t ef)
+{
+ static const char startn[] = "__start_";
+ static const char stopn[] = "__stop_";
+ Elf_Sym *sym;
+ const char *sym_name, *linkset_name;
+ Elf_Addr startp, stopp;
+ Elf_Size symidx;
+ int start, i;
+
+ startp = stopp = 0;
+ for (symidx = 1 /* zero entry is special */;
+ symidx < ef->ddbsymcnt; symidx++) {
+ sym = ef->ddbsymtab + symidx;
+ if (sym->st_shndx != SHN_UNDEF)
+ continue;
+
+ sym_name = ef->ddbstrtab + sym->st_name;
+ if (strncmp(sym_name, startn, sizeof(startn) - 1) == 0) {
+ start = 1;
+ linkset_name = sym_name + sizeof(startn) - 1;
+ }
+ else if (strncmp(sym_name, stopn, sizeof(stopn) - 1) == 0) {
+ start = 0;
+ linkset_name = sym_name + sizeof(stopn) - 1;
+ }
+ else
+ continue;
+
+ for (i = 0; i < ef->nprogtab; i++) {
+ if (strcmp(ef->progtab[i].name, linkset_name) == 0) {
+ startp = (Elf_Addr)ef->progtab[i].addr;
+ stopp = (Elf_Addr)(startp + ef->progtab[i].size);
+ break;
+ }
+ }
+ if (i == ef->nprogtab)
+ continue;
+
+ sym->st_value = start ? startp : stopp;
+ sym->st_shndx = i;
+ }
+}
+
+static void
+link_elf_reloc_local(linker_file_t lf)
+{
+ elf_file_t ef = (elf_file_t)lf;
+ const Elf_Rel *rellim;
+ const Elf_Rel *rel;
+ const Elf_Rela *relalim;
+ const Elf_Rela *rela;
+ const Elf_Sym *sym;
+ Elf_Addr base;
+ int i;
+ Elf_Size symidx;
+
+ link_elf_fix_link_set(ef);
+
+ /* Perform relocations without addend if there are any: */
+ for (i = 0; i < ef->nreltab; i++) {
+ rel = ef->reltab[i].rel;
+ if (rel == NULL)
+ panic("lost a reltab!");
+ rellim = rel + ef->reltab[i].nrel;
+ base = findbase(ef, ef->reltab[i].sec);
+ if (base == 0)
+ panic("lost base for reltab");
+ for ( ; rel < rellim; rel++) {
+ symidx = ELF_R_SYM(rel->r_info);
+ if (symidx >= ef->ddbsymcnt)
+ continue;
+ sym = ef->ddbsymtab + symidx;
+ /* Only do local relocs */
+ if (ELF_ST_BIND(sym->st_info) != STB_LOCAL)
+ continue;
+ elf_reloc_local(lf, base, rel, ELF_RELOC_REL,
+ elf_obj_lookup);
+ }
+ }
+
+ /* Perform relocations with addend if there are any: */
+ for (i = 0; i < ef->nrelatab; i++) {
+ rela = ef->relatab[i].rela;
+ if (rela == NULL)
+ panic("lost a relatab!");
+ relalim = rela + ef->relatab[i].nrela;
+ base = findbase(ef, ef->relatab[i].sec);
+ if (base == 0)
+ panic("lost base for relatab");
+ for ( ; rela < relalim; rela++) {
+ symidx = ELF_R_SYM(rela->r_info);
+ if (symidx >= ef->ddbsymcnt)
+ continue;
+ sym = ef->ddbsymtab + symidx;
+ /* Only do local relocs */
+ if (ELF_ST_BIND(sym->st_info) != STB_LOCAL)
+ continue;
+ elf_reloc_local(lf, base, rela, ELF_RELOC_RELA,
+ elf_obj_lookup);
+ }
+ }
+}
+
+static long
+link_elf_symtab_get(linker_file_t lf, const Elf_Sym **symtab)
+{
+ elf_file_t ef = (elf_file_t)lf;
+
+ *symtab = ef->ddbsymtab;
+
+ if (*symtab == NULL)
+ return (0);
+
+ return (ef->ddbsymcnt);
+}
+
+static long
+link_elf_strtab_get(linker_file_t lf, caddr_t *strtab)
+{
+ elf_file_t ef = (elf_file_t)lf;
+
+ *strtab = ef->ddbstrtab;
+
+ if (*strtab == NULL)
+ return (0);
+
+ return (ef->ddbstrcnt);
+}
diff --git a/sys/kern/linker_if.m b/sys/kern/linker_if.m
new file mode 100644
index 0000000..3df592c
--- /dev/null
+++ b/sys/kern/linker_if.m
@@ -0,0 +1,145 @@
+#-
+# Copyright (c) 2000 Doug Rabson
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/linker.h>
+
+INTERFACE linker;
+
+#
+# Look up a symbol in the file's symbol table. If the symbol is not
+# found then return ENOENT, otherwise zero.
+#
+METHOD int lookup_symbol {
+ linker_file_t file;
+ const char* name;
+ c_linker_sym_t* symp;
+};
+
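+#
+# Return the name, value and size of the given symbol in *valp.
+#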
+METHOD int symbol_values {
+ linker_file_t file;
+ c_linker_sym_t sym;
+ linker_symval_t* valp;
+};
+
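+#
+# Search for the symbol whose value is closest to (at or below) the given
+# address, returning it in *symp and the distance in *diffp.
+#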
+METHOD int search_symbol {
+ linker_file_t file;
+ caddr_t value;
+ c_linker_sym_t* symp;
+ long* diffp;
+};
+
+#
+# Call the callback with each specified function defined in the file.
+# Stop and return the error if the callback returns an error.
+#
+METHOD int each_function_name {
+ linker_file_t file;
+ linker_function_name_callback_t callback;
+ void* opaque;
+};
+
+#
+# Call the callback with each specified function and its value
+# defined in the file.
+# Stop and return the error if the callback returns an error.
+#
+METHOD int each_function_nameval {
+ linker_file_t file;
+ linker_function_nameval_callback_t callback;
+ void* opaque;
+};
+
+#
+# Search for a linker set in a file. Return a pointer to the first
+# entry (which is itself a pointer), and the number of entries.
+# "stop" points to the entry beyond the last valid entry.
+# If count, start or stop are NULL, they are not returned.
+#
+METHOD int lookup_set {
+ linker_file_t file;
+ const char* name;
+ void*** start;
+ void*** stop;
+ int* count;
+};
+
+#
+# Unload a file, releasing dependencies and freeing storage.
+#
+METHOD void unload {
+ linker_file_t file;
+};
+
+#
+# Load CTF data if necessary and if there is a .SUNW_ctf section
+# in the ELF file, returning info in the linker CTF structure.
+#
+METHOD int ctf_get {
+ linker_file_t file;
+ linker_ctf_t *lc;
+};
+
+#
+# Get the symbol table, returning it in **symtab. Return the
+# number of symbols, or zero if there is no symbol table.
+#
+METHOD long symtab_get {
+ linker_file_t file;
+ const Elf_Sym **symtab;
+};
+
+#
+# Get the string table, returning it in *strtab. Return the
+# size (in bytes) of the string table, or zero if there is no string table.
+#
+METHOD long strtab_get {
+ linker_file_t file;
+ caddr_t *strtab;
+};
+
+#
+# Load a file, returning the new linker_file_t in *result. If
+# the class does not recognise the file type, zero should be
+# returned, without modifying *result. If the file is
+# recognised, the file should be loaded, *result set to the new
+# file and zero returned. If some other error is detected an
+# appropriate errno should be returned.
+#
+STATICMETHOD int load_file {
+ linker_class_t cls;
+ const char* filename;
+ linker_file_t* result;
+};
+STATICMETHOD int link_preload {
+ linker_class_t cls;
+ const char* filename;
+ linker_file_t* result;
+};
+METHOD int link_preload_finish {
+ linker_file_t file;
+};
diff --git a/sys/kern/makesyscalls.sh b/sys/kern/makesyscalls.sh
new file mode 100644
index 0000000..21e6046
--- /dev/null
+++ b/sys/kern/makesyscalls.sh
@@ -0,0 +1,653 @@
+#! /bin/sh -
+# @(#)makesyscalls.sh 8.1 (Berkeley) 6/10/93
+# $FreeBSD$
+
+set -e
+
+# name of compat options:
+compat=COMPAT_43
+compat4=COMPAT_FREEBSD4
+compat6=COMPAT_FREEBSD6
+compat7=COMPAT_FREEBSD7
+
+# output files:
+sysnames="syscalls.c"
+sysproto="../sys/sysproto.h"
+sysproto_h=_SYS_SYSPROTO_H_
+syshdr="../sys/syscall.h"
+sysmk="../sys/syscall.mk"
+syssw="init_sysent.c"
+syscallprefix="SYS_"
+switchname="sysent"
+namesname="syscallnames"
+systrace="systrace_args.c"
+
+# tmp files:
+sysaue="sysent.aue.$$"
+sysdcl="sysent.dcl.$$"
+syscompat="sysent.compat.$$"
+syscompatdcl="sysent.compatdcl.$$"
+syscompat4="sysent.compat4.$$"
+syscompat4dcl="sysent.compat4dcl.$$"
+syscompat6="sysent.compat6.$$"
+syscompat6dcl="sysent.compat6dcl.$$"
+syscompat7="sysent.compat7.$$"
+syscompat7dcl="sysent.compat7dcl.$$"
+sysent="sysent.switch.$$"
+sysinc="sysinc.switch.$$"
+sysarg="sysarg.switch.$$"
+sysprotoend="sysprotoend.$$"
+systracetmp="systrace.$$"
+systraceret="systraceret.$$"
+
+if [ -r capabilities.conf ]; then
+ capenabled=`cat capabilities.conf | grep -v "^#" | grep -v "^$"`
+ capenabled=`echo $capenabled | sed 's/ /,/g'`
+else
+ capenabled=""
+fi
+
+trap "rm $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $syscompat7 $syscompat7dcl $sysent $sysinc $sysarg $sysprotoend $systracetmp $systraceret" 0
+
+touch $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $syscompat7 $syscompat7dcl $sysent $sysinc $sysarg $sysprotoend $systracetmp $systraceret
+
+case $# in
+ 0) echo "usage: $0 input-file <config-file>" 1>&2
+ exit 1
+ ;;
+esac
+
+if [ -n "$2" -a -f "$2" ]; then
+ . $2
+fi
+
+sed -e '
+s/\$//g
+:join
+ /\\$/{a\
+
+ N
+ s/\\\n//
+ b join
+ }
+2,${
+ /^#/!s/\([{}()*,]\)/ \1 /g
+}
+' < $1 | awk "
+ BEGIN {
+ sysaue = \"$sysaue\"
+ sysdcl = \"$sysdcl\"
+ sysproto = \"$sysproto\"
+ sysprotoend = \"$sysprotoend\"
+ sysproto_h = \"$sysproto_h\"
+ syscompat = \"$syscompat\"
+ syscompatdcl = \"$syscompatdcl\"
+ syscompat4 = \"$syscompat4\"
+ syscompat4dcl = \"$syscompat4dcl\"
+ syscompat6 = \"$syscompat6\"
+ syscompat6dcl = \"$syscompat6dcl\"
+ syscompat7 = \"$syscompat7\"
+ syscompat7dcl = \"$syscompat7dcl\"
+ sysent = \"$sysent\"
+ syssw = \"$syssw\"
+ sysinc = \"$sysinc\"
+ sysarg = \"$sysarg\"
+ sysnames = \"$sysnames\"
+ syshdr = \"$syshdr\"
+ sysmk = \"$sysmk\"
+ systrace = \"$systrace\"
+ systracetmp = \"$systracetmp\"
+ systraceret = \"$systraceret\"
+ compat = \"$compat\"
+ compat4 = \"$compat4\"
+ compat6 = \"$compat6\"
+ compat7 = \"$compat7\"
+ syscallprefix = \"$syscallprefix\"
+ switchname = \"$switchname\"
+ namesname = \"$namesname\"
+ infile = \"$1\"
+ capenabled_string = \"$capenabled\"
+ "'
+
+ split(capenabled_string, capenabled, ",");
+
+ printf "/*\n * System call switch table.\n *\n" > syssw
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > syssw
+ printf " * $%s$\n", "FreeBSD" > syssw
+
+ printf "/*\n * System call prototypes.\n *\n" > sysarg
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarg
+ printf " * $%s$\n", "FreeBSD" > sysarg
+
+ printf "\n#ifdef %s\n\n", compat > syscompat
+ printf "\n#ifdef %s\n\n", compat4 > syscompat4
+ printf "\n#ifdef %s\n\n", compat6 > syscompat6
+ printf "\n#ifdef %s\n\n", compat7 > syscompat7
+
+ printf "/*\n * System call names.\n *\n" > sysnames
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames
+ printf " * $%s$\n", "FreeBSD" > sysnames
+
+ printf "/*\n * System call numbers.\n *\n" > syshdr
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshdr
+ printf " * $%s$\n", "FreeBSD" > syshdr
+ printf "# FreeBSD system call names.\n" > sysmk
+ printf "# DO NOT EDIT-- this file is automatically generated.\n" > sysmk
+ printf "# $%s$\n", "FreeBSD" > sysmk
+
+	printf "/*\n * System call argument to DTrace register array conversion.\n *\n" > systrace
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > systrace
+ printf " * $%s$\n", "FreeBSD" > systrace
+ }
+ NR == 1 {
+ gsub("[$]FreeBSD: ", "", $0)
+ gsub(" [$]", "", $0)
+
+ printf " * created from%s\n */\n\n", $0 > syssw
+
+ printf "\n/* The casts are bogus but will do for now. */\n" > sysent
+ printf "struct sysent %s[] = {\n",switchname > sysent
+
+ printf " * created from%s\n */\n\n", $0 > sysarg
+ printf "#ifndef %s\n", sysproto_h > sysarg
+ printf "#define\t%s\n\n", sysproto_h > sysarg
+ printf "#include <sys/signal.h>\n" > sysarg
+ printf "#include <sys/acl.h>\n" > sysarg
+ printf "#include <sys/cpuset.h>\n" > sysarg
+ printf "#include <sys/_ffcounter.h>\n" > sysarg
+ printf "#include <sys/_semaphore.h>\n" > sysarg
+ printf "#include <sys/ucontext.h>\n\n" > sysarg
+ printf "#include <bsm/audit_kevents.h>\n\n" > sysarg
+ printf "struct proc;\n\n" > sysarg
+ printf "struct thread;\n\n" > sysarg
+ printf "#define\tPAD_(t)\t(sizeof(register_t) <= sizeof(t) ? \\\n" > sysarg
+ printf "\t\t0 : sizeof(register_t) - sizeof(t))\n\n" > sysarg
+ printf "#if BYTE_ORDER == LITTLE_ENDIAN\n"> sysarg
+ printf "#define\tPADL_(t)\t0\n" > sysarg
+ printf "#define\tPADR_(t)\tPAD_(t)\n" > sysarg
+ printf "#else\n" > sysarg
+ printf "#define\tPADL_(t)\tPAD_(t)\n" > sysarg
+ printf "#define\tPADR_(t)\t0\n" > sysarg
+ printf "#endif\n\n" > sysarg
+
+ printf " * created from%s\n */\n\n", $0 > sysnames
+ printf "const char *%s[] = {\n", namesname > sysnames
+
+ printf " * created from%s\n */\n\n", $0 > syshdr
+
+ printf "# created from%s\nMIASM = ", $0 > sysmk
+
+ printf " * This file is part of the DTrace syscall provider.\n */\n\n" > systrace
+ printf "static void\nsystrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)\n{\n" > systrace
+ printf "\tint64_t *iarg = (int64_t *) uarg;\n" > systrace
+ printf "\tswitch (sysnum) {\n" > systrace
+
+ printf "static void\nsystrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)\n{\n\tconst char *p = NULL;\n" > systracetmp
+ printf "\tswitch (sysnum) {\n" > systracetmp
+
+ printf "static void\nsystrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)\n{\n\tconst char *p = NULL;\n" > systraceret
+ printf "\tswitch (sysnum) {\n" > systraceret
+
+ next
+ }
+ NF == 0 || $1 ~ /^;/ {
+ next
+ }
+ $1 ~ /^#[ ]*include/ {
+ print > sysinc
+ next
+ }
+ $1 ~ /^#[ ]*if/ {
+ print > sysent
+ print > sysdcl
+ print > sysarg
+ print > syscompat
+ print > syscompat4
+ print > syscompat6
+ print > syscompat7
+ print > sysnames
+ print > systrace
+ print > systracetmp
+ print > systraceret
+ savesyscall = syscall
+ next
+ }
+ $1 ~ /^#[ ]*else/ {
+ print > sysent
+ print > sysdcl
+ print > sysarg
+ print > syscompat
+ print > syscompat4
+ print > syscompat6
+ print > syscompat7
+ print > sysnames
+ print > systrace
+ print > systracetmp
+ print > systraceret
+ syscall = savesyscall
+ next
+ }
+ $1 ~ /^#/ {
+ print > sysent
+ print > sysdcl
+ print > sysarg
+ print > syscompat
+ print > syscompat4
+ print > syscompat6
+ print > syscompat7
+ print > sysnames
+ print > systrace
+ print > systracetmp
+ print > systraceret
+ next
+ }
+ syscall != $1 {
+ printf "%s: line %d: syscall number out of sync at %d\n",
+ infile, NR, syscall
+ printf "line is:\n"
+ print
+ exit 1
+ }
+ # Returns true if the type "name" is the first flag in the type field
+ function type(name, flags, n) {
+ n = split($3, flags, /\|/)
+ return (n > 0 && flags[1] == name)
+ }
+ # Returns true if the flag "name" is set in the type field
+ function flag(name, flags, i, n) {
+ n = split($3, flags, /\|/)
+ for (i = 1; i <= n; i++)
+ if (flags[i] == name)
+ return 1
+ return 0
+ }
+ function align_sysent_comment(column) {
+ printf("\t") > sysent
+ column = column + 8 - column % 8
+ while (column < 56) {
+ printf("\t") > sysent
+ column = column + 8
+ }
+ }
+ function parserr(was, wanted) {
+ printf "%s: line %d: unexpected %s (expected %s)\n",
+ infile, NR, was, wanted
+ exit 1
+ }
+ function parseline() {
+ f=4 # toss number, type, audit event
+ argc= 0;
+ argssize = "0"
+ thr_flag = "SY_THR_STATIC"
+ if (flag("NOTSTATIC")) {
+ thr_flag = "SY_THR_ABSENT"
+ }
+ if ($NF != "}") {
+ funcalias=$(NF-2)
+ argalias=$(NF-1)
+ rettype=$NF
+ end=NF-3
+ } else {
+ funcalias=""
+ argalias=""
+ rettype="int"
+ end=NF
+ }
+ if (flag("NODEF")) {
+ auditev="AUE_NULL"
+ funcname=$4
+ argssize = "AS(" $6 ")"
+ return
+ }
+ if ($f != "{")
+ parserr($f, "{")
+ f++
+ if ($end != "}")
+ parserr($end, "}")
+ end--
+ if ($end != ";")
+ parserr($end, ";")
+ end--
+ if ($end != ")")
+ parserr($end, ")")
+ end--
+
+ syscallret=$f
+ f++
+
+ funcname=$f
+
+ #
+ # We now know the func name, so define a flags field for it.
+ # Do this before any other processing as we may return early
+ # from it.
+ #
+ for (cap in capenabled) {
+ if (funcname == capenabled[cap]) {
+ flags = "SYF_CAPENABLED";
+ }
+ }
+
+ if (funcalias == "")
+ funcalias = funcname
+ if (argalias == "") {
+ argalias = funcname "_args"
+ if (flag("COMPAT"))
+ argalias = "o" argalias
+ if (flag("COMPAT4"))
+ argalias = "freebsd4_" argalias
+ if (flag("COMPAT6"))
+ argalias = "freebsd6_" argalias
+ if (flag("COMPAT7"))
+ argalias = "freebsd7_" argalias
+ }
+ f++
+
+ if ($f != "(")
+		parserr($f, "(")
+ f++
+
+ if (f == end) {
+ if ($f != "void")
+ parserr($f, "argument definition")
+ return
+ }
+
+ while (f <= end) {
+ argc++
+ argtype[argc]=""
+ oldf=""
+ while (f < end && $(f+1) != ",") {
+ if (argtype[argc] != "" && oldf != "*")
+ argtype[argc] = argtype[argc]" ";
+ argtype[argc] = argtype[argc]$f;
+ oldf = $f;
+ f++
+ }
+ if (argtype[argc] == "")
+ parserr($f, "argument definition")
+ argname[argc]=$f;
+ f += 2; # skip name, and any comma
+ }
+ if (argc != 0)
+ argssize = "AS(" argalias ")"
+ }
+ { comment = $4
+ if (NF < 7)
+ for (i = 5; i <= NF; i++)
+ comment = comment " " $i
+ }
+
+ #
+ # The AUE_ audit event identifier.
+ #
+ {
+ auditev = $2;
+ }
+
+ #
+ # The flags, if any.
+ #
+ {
+ flags = "0";
+ }
+
+ type("STD") || type("NODEF") || type("NOARGS") || type("NOPROTO") \
+ || type("NOSTD") {
+ parseline()
+ printf("\t/* %s */\n\tcase %d: {\n", funcname, syscall) > systrace
+ printf("\t/* %s */\n\tcase %d:\n", funcname, syscall) > systracetmp
+ printf("\t/* %s */\n\tcase %d:\n", funcname, syscall) > systraceret
+ if (argc > 0) {
+ printf("\t\tswitch(ndx) {\n") > systracetmp
+ printf("\t\tstruct %s *p = params;\n", argalias) > systrace
+ for (i = 1; i <= argc; i++) {
+ printf("\t\tcase %d:\n\t\t\tp = \"%s\";\n\t\t\tbreak;\n", i - 1, argtype[i]) > systracetmp
+ if (index(argtype[i], "*") > 0 || argtype[i] == "caddr_t")
+ printf("\t\tuarg[%d] = (intptr_t) p->%s; /* %s */\n", \
+ i - 1, \
+ argname[i], argtype[i]) > systrace
+ else if (substr(argtype[i], 1, 1) == "u" || argtype[i] == "size_t")
+ printf("\t\tuarg[%d] = p->%s; /* %s */\n", \
+ i - 1, \
+ argname[i], argtype[i]) > systrace
+ else
+ printf("\t\tiarg[%d] = p->%s; /* %s */\n", \
+ i - 1, \
+ argname[i], argtype[i]) > systrace
+ }
+ printf("\t\tdefault:\n\t\t\tbreak;\n\t\t};\n") > systracetmp
+
+ printf("\t\tif (ndx == 0 || ndx == 1)\n") > systraceret
+ printf("\t\t\tp = \"%s\";\n", syscallret) > systraceret
+ printf("\t\tbreak;\n") > systraceret
+ }
+ printf("\t\t*n_args = %d;\n\t\tbreak;\n\t}\n", argc) > systrace
+ printf("\t\tbreak;\n") > systracetmp
+ if (argc != 0 && !flag("NOARGS") && !flag("NOPROTO") && \
+ !flag("NODEF")) {
+ printf("struct %s {\n", argalias) > sysarg
+ for (i = 1; i <= argc; i++)
+ printf("\tchar %s_l_[PADL_(%s)]; " \
+ "%s %s; char %s_r_[PADR_(%s)];\n",
+ argname[i], argtype[i],
+ argtype[i], argname[i],
+ argname[i], argtype[i]) > sysarg
+ printf("};\n") > sysarg
+ }
+ else if (!flag("NOARGS") && !flag("NOPROTO") && !flag("NODEF"))
+ printf("struct %s {\n\tregister_t dummy;\n};\n",
+ argalias) > sysarg
+ if (!flag("NOPROTO") && !flag("NODEF")) {
+ if (funcname == "nosys" || funcname == "lkmnosys" ||
+ funcname == "sysarch" || funcname ~ /^freebsd/ ||
+ funcname ~ /^linux/ || funcname ~ /^svr4/ ||
+ funcname ~ /^ibcs2/ || funcname ~ /^xenix/) {
+ printf("%s\t%s(struct thread *, struct %s *)",
+ rettype, funcname, argalias) > sysdcl
+ } else {
+ printf("%s\tsys_%s(struct thread *, struct %s *)",
+ rettype, funcname, argalias) > sysdcl
+ }
+ printf(";\n") > sysdcl
+ printf("#define\t%sAUE_%s\t%s\n", syscallprefix,
+ funcalias, auditev) > sysaue
+ }
+ printf("\t{ %s, (sy_call_t *)", argssize) > sysent
+ column = 8 + 2 + length(argssize) + 15
+ if (flag("NOSTD")) {
+ printf("%s },", "lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT") > sysent
+ column = column + length("lkmressys") + length("AUE_NULL") + 3
+ } else {
+ if (funcname == "nosys" || funcname == "sysarch" ||
+ funcname == "lkmnosys" || funcname ~ /^freebsd/ ||
+ funcname ~ /^linux/ || funcname ~ /^svr4/ ||
+ funcname ~ /^ibcs2/ || funcname ~ /^xenix/) {
+ printf("%s, %s, NULL, 0, 0, %s, %s },", funcname, auditev, flags, thr_flag) > sysent
+ column = column + length(funcname) + length(auditev) + length(flags) + 3
+ } else {
+ printf("sys_%s, %s, NULL, 0, 0, %s, %s },", funcname, auditev, flags, thr_flag) > sysent
+ column = column + length(funcname) + length(auditev) + length(flags) + 3 + 4
+ }
+ }
+ align_sysent_comment(column)
+ printf("/* %d = %s */\n", syscall, funcalias) > sysent
+ printf("\t\"%s\",\t\t\t/* %d = %s */\n",
+ funcalias, syscall, funcalias) > sysnames
+ if (!flag("NODEF")) {
+ printf("#define\t%s%s\t%d\n", syscallprefix,
+ funcalias, syscall) > syshdr
+ printf(" \\\n\t%s.o", funcalias) > sysmk
+ }
+ syscall++
+ next
+ }
+ type("COMPAT") || type("COMPAT4") || type("COMPAT6") || \
+ type("COMPAT7") {
+ if (flag("COMPAT")) {
+ ncompat++
+ out = syscompat
+ outdcl = syscompatdcl
+ wrap = "compat"
+ prefix = "o"
+ descr = "old"
+ } else if (flag("COMPAT4")) {
+ ncompat4++
+ out = syscompat4
+ outdcl = syscompat4dcl
+ wrap = "compat4"
+ prefix = "freebsd4_"
+ descr = "freebsd4"
+ } else if (flag("COMPAT6")) {
+ ncompat6++
+ out = syscompat6
+ outdcl = syscompat6dcl
+ wrap = "compat6"
+ prefix = "freebsd6_"
+ descr = "freebsd6"
+ } else if (flag("COMPAT7")) {
+ ncompat7++
+ out = syscompat7
+ outdcl = syscompat7dcl
+ wrap = "compat7"
+ prefix = "freebsd7_"
+ descr = "freebsd7"
+ }
+ parseline()
+ if (argc != 0 && !flag("NOARGS") && !flag("NOPROTO") && \
+ !flag("NODEF")) {
+ printf("struct %s {\n", argalias) > out
+ for (i = 1; i <= argc; i++)
+ printf("\tchar %s_l_[PADL_(%s)]; %s %s; " \
+ "char %s_r_[PADR_(%s)];\n",
+ argname[i], argtype[i],
+ argtype[i], argname[i],
+ argname[i], argtype[i]) > out
+ printf("};\n") > out
+ }
+ else if (!flag("NOARGS") && !flag("NOPROTO") && !flag("NODEF"))
+ printf("struct %s {\n\tregister_t dummy;\n};\n",
+ argalias) > sysarg
+ if (!flag("NOPROTO") && !flag("NODEF")) {
+ printf("%s\t%s%s(struct thread *, struct %s *);\n",
+ rettype, prefix, funcname, argalias) > outdcl
+ printf("#define\t%sAUE_%s%s\t%s\n", syscallprefix,
+ prefix, funcname, auditev) > sysaue
+ }
+ if (flag("NOSTD")) {
+ printf("\t{ %s, (sy_call_t *)%s, %s, NULL, 0, 0, 0, SY_THR_ABSENT },",
+ "0", "lkmressys", "AUE_NULL") > sysent
+ align_sysent_comment(8 + 2 + length("0") + 15 + \
+ length("lkmressys") + length("AUE_NULL") + 3)
+ } else {
+ printf("\t{ %s(%s,%s), %s, NULL, 0, 0, %s, %s },",
+ wrap, argssize, funcname, auditev, flags, thr_flag) > sysent
+ align_sysent_comment(8 + 9 + length(argssize) + 1 + \
+ length(funcname) + length(auditev) + \
+ length(flags) + 4)
+ }
+ printf("/* %d = %s %s */\n", syscall, descr, funcalias) > sysent
+ printf("\t\"%s.%s\",\t\t/* %d = %s %s */\n",
+ wrap, funcalias, syscall, descr, funcalias) > sysnames
+ if (flag("COMPAT")) {
+ printf("\t\t\t\t/* %d is old %s */\n",
+ syscall, funcalias) > syshdr
+ } else if (!flag("NODEF")) {
+ printf("#define\t%s%s%s\t%d\n", syscallprefix,
+ prefix, funcalias, syscall) > syshdr
+ printf(" \\\n\t%s%s.o", prefix, funcalias) > sysmk
+ }
+ syscall++
+ next
+ }
+ type("OBSOL") {
+ printf("\t{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },") > sysent
+ align_sysent_comment(34)
+ printf("/* %d = obsolete %s */\n", syscall, comment) > sysent
+ printf("\t\"obs_%s\",\t\t\t/* %d = obsolete %s */\n",
+ $4, syscall, comment) > sysnames
+ printf("\t\t\t\t/* %d is obsolete %s */\n",
+ syscall, comment) > syshdr
+ syscall++
+ next
+ }
+ type("UNIMPL") {
+ printf("\t{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },\t\t\t/* %d = %s */\n",
+ syscall, comment) > sysent
+ printf("\t\"#%d\",\t\t\t/* %d = %s */\n",
+ syscall, syscall, comment) > sysnames
+ syscall++
+ next
+ }
+ {
+ printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $3
+ exit 1
+ }
+ END {
+ printf "\n#define AS(name) (sizeof(struct name) / sizeof(register_t))\n" > sysinc
+
+ if (ncompat != 0 || ncompat4 != 0 || ncompat6 != 0 || ncompat7 != 0)
+ printf "#include \"opt_compat.h\"\n\n" > syssw
+
+ if (ncompat != 0) {
+ printf "\n#ifdef %s\n", compat > sysinc
+ printf "#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)\n" > sysinc
+ printf "#else\n" > sysinc
+ printf "#define compat(n, name) 0, (sy_call_t *)nosys\n" > sysinc
+ printf "#endif\n" > sysinc
+ }
+
+ if (ncompat4 != 0) {
+ printf "\n#ifdef %s\n", compat4 > sysinc
+ printf "#define compat4(n, name) n, (sy_call_t *)__CONCAT(freebsd4_,name)\n" > sysinc
+ printf "#else\n" > sysinc
+ printf "#define compat4(n, name) 0, (sy_call_t *)nosys\n" > sysinc
+ printf "#endif\n" > sysinc
+ }
+
+ if (ncompat6 != 0) {
+ printf "\n#ifdef %s\n", compat6 > sysinc
+ printf "#define compat6(n, name) n, (sy_call_t *)__CONCAT(freebsd6_,name)\n" > sysinc
+ printf "#else\n" > sysinc
+ printf "#define compat6(n, name) 0, (sy_call_t *)nosys\n" > sysinc
+ printf "#endif\n" > sysinc
+ }
+
+ if (ncompat7 != 0) {
+ printf "\n#ifdef %s\n", compat7 > sysinc
+ printf "#define compat7(n, name) n, (sy_call_t *)__CONCAT(freebsd7_,name)\n" > sysinc
+ printf "#else\n" > sysinc
+ printf "#define compat7(n, name) 0, (sy_call_t *)nosys\n" > sysinc
+ printf "#endif\n" > sysinc
+ }
+
+ printf("\n#endif /* %s */\n\n", compat) > syscompatdcl
+ printf("\n#endif /* %s */\n\n", compat4) > syscompat4dcl
+ printf("\n#endif /* %s */\n\n", compat6) > syscompat6dcl
+ printf("\n#endif /* %s */\n\n", compat7) > syscompat7dcl
+
+ printf("\n#undef PAD_\n") > sysprotoend
+ printf("#undef PADL_\n") > sysprotoend
+ printf("#undef PADR_\n") > sysprotoend
+ printf("\n#endif /* !%s */\n", sysproto_h) > sysprotoend
+
+ printf("\n") > sysmk
+ printf("};\n") > sysent
+ printf("};\n") > sysnames
+ printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall) \
+ > syshdr
+ printf "\tdefault:\n\t\t*n_args = 0;\n\t\tbreak;\n\t};\n}\n" > systrace
+ printf "\tdefault:\n\t\tbreak;\n\t};\n\tif (p != NULL)\n\t\tstrlcpy(desc, p, descsz);\n}\n" > systracetmp
+ printf "\tdefault:\n\t\tbreak;\n\t};\n\tif (p != NULL)\n\t\tstrlcpy(desc, p, descsz);\n}\n" > systraceret
+ } '
+
+cat $sysinc $sysent >> $syssw
+cat $sysarg $sysdcl \
+ $syscompat $syscompatdcl \
+ $syscompat4 $syscompat4dcl \
+ $syscompat6 $syscompat6dcl \
+ $syscompat7 $syscompat7dcl \
+ $sysaue $sysprotoend > $sysproto
+cat $systracetmp >> $systrace
+cat $systraceret >> $systrace
+
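For reference, the script consumes syscalls.master-style lines of the form "number  audit-event  type  { prototype }". A hedged illustration of one STD entry and the init_sysent.c row the rules above would emit for it, assuming the syscall is not listed in capabilities.conf (the number shown is illustrative):

    /* syscalls.master input line:
     *   36   AUE_SYNC   STD   { int sync(void); }
     * row produced by the STD branch's printf format: */
    { 0, (sy_call_t *)sys_sync, AUE_SYNC, NULL, 0, 0, 0, SY_THR_STATIC },    /* 36 = sync */

Because the prototype takes no arguments, sysproto.h would also receive a dummy struct sync_args { register_t dummy; } and syscall.h a #define SYS_sync with the same number.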
diff --git a/sys/kern/md4c.c b/sys/kern/md4c.c
new file mode 100644
index 0000000..84a294a
--- /dev/null
+++ b/sys/kern/md4c.c
@@ -0,0 +1,288 @@
+/* MD4C.C - RSA Data Security, Inc., MD4 message-digest algorithm
+ */
+
+/*-
+ Copyright (C) 1990-2, RSA Data Security, Inc. All rights reserved.
+
+ License to copy and use this software is granted provided that it
+ is identified as the "RSA Data Security, Inc. MD4 Message-Digest
+ Algorithm" in all material mentioning or referencing this software
+ or this function.
+
+ License is also granted to make and use derivative works provided
+ that such works are identified as "derived from the RSA Data
+ Security, Inc. MD4 Message-Digest Algorithm" in all material
+ mentioning or referencing the derived work.
+
+ RSA Data Security, Inc. makes no representations concerning either
+ the merchantability of this software or the suitability of this
+ software for any particular purpose. It is provided "as is"
+ without express or implied warranty of any kind.
+
+ These notices must be retained in any copies of any part of this
+ documentation and/or software.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/md4.h>
+
+typedef unsigned char *POINTER;
+typedef u_int16_t UINT2;
+typedef u_int32_t UINT4;
+
+#define PROTO_LIST(list) list
+
+/* Constants for MD4Transform routine.
+ */
+#define S11 3
+#define S12 7
+#define S13 11
+#define S14 19
+#define S21 3
+#define S22 5
+#define S23 9
+#define S24 13
+#define S31 3
+#define S32 9
+#define S33 11
+#define S34 15
+
+static void MD4Transform PROTO_LIST ((UINT4 [4], const unsigned char [64]));
+static void Encode PROTO_LIST
+ ((unsigned char *, UINT4 *, unsigned int));
+static void Decode PROTO_LIST
+ ((UINT4 *, const unsigned char *, unsigned int));
+
+static unsigned char PADDING[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* F, G and H are basic MD4 functions.
+ */
+#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
+#define G(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+
+/* ROTATE_LEFT rotates x left n bits.
+ */
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+/* FF, GG and HH are transformations for rounds 1, 2 and 3 */
+/* Rotation is separate from addition to prevent recomputation */
+#define FF(a, b, c, d, x, s) { \
+ (a) += F ((b), (c), (d)) + (x); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ }
+#define GG(a, b, c, d, x, s) { \
+ (a) += G ((b), (c), (d)) + (x) + (UINT4)0x5a827999; \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ }
+#define HH(a, b, c, d, x, s) { \
+ (a) += H ((b), (c), (d)) + (x) + (UINT4)0x6ed9eba1; \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ }
+
+/* MD4 initialization. Begins an MD4 operation, writing a new context.
+ */
+void MD4Init (context)
+MD4_CTX *context; /* context */
+{
+ context->count[0] = context->count[1] = 0;
+
+ /* Load magic initialization constants.
+ */
+ context->state[0] = 0x67452301;
+ context->state[1] = 0xefcdab89;
+ context->state[2] = 0x98badcfe;
+ context->state[3] = 0x10325476;
+}
+
+/* MD4 block update operation. Continues an MD4 message-digest
+ operation, processing another message block, and updating the
+ context.
+ */
+void MD4Update (context, input, inputLen)
+MD4_CTX *context; /* context */
+const unsigned char *input; /* input block */
+unsigned int inputLen; /* length of input block */
+{
+ unsigned int i, index, partLen;
+
+ /* Compute number of bytes mod 64 */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3F);
+ /* Update number of bits */
+ if ((context->count[0] += ((UINT4)inputLen << 3))
+ < ((UINT4)inputLen << 3))
+ context->count[1]++;
+ context->count[1] += ((UINT4)inputLen >> 29);
+
+ partLen = 64 - index;
+ /* Transform as many times as possible.
+ */
+ if (inputLen >= partLen) {
+ bcopy(input, &context->buffer[index], partLen);
+ MD4Transform (context->state, context->buffer);
+
+ for (i = partLen; i + 63 < inputLen; i += 64)
+ MD4Transform (context->state, &input[i]);
+
+ index = 0;
+ }
+ else
+ i = 0;
+
+ /* Buffer remaining input */
+ bcopy(&input[i], &context->buffer[index], inputLen-i);
+}
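A quick worked example of the buffering above (MD5Update in md5c.c follows the same logic): with 10 bytes already buffered (index = 10) and inputLen = 200, partLen is 54, so the first transform consumes 54 input bytes to complete a block; the loop then transforms the 64-byte blocks starting at input offsets 54 and 118, and the trailing 18 bytes (200 - 182) are copied into the context buffer to await the next call.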
+
+/* MD4 padding. */
+void MD4Pad (context)
+MD4_CTX *context; /* context */
+{
+ unsigned char bits[8];
+ unsigned int index, padLen;
+
+ /* Save number of bits */
+ Encode (bits, context->count, 8);
+
+ /* Pad out to 56 mod 64.
+ */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3f);
+ padLen = (index < 56) ? (56 - index) : (120 - index);
+ MD4Update (context, PADDING, padLen);
+
+ /* Append length (before padding) */
+ MD4Update (context, bits, 8);
+}
+
+/* MD4 finalization. Ends an MD4 message-digest operation, writing the
+   message digest and zeroizing the context.
+ */
+void MD4Final (digest, context)
+unsigned char digest[16]; /* message digest */
+MD4_CTX *context; /* context */
+{
+ /* Do padding */
+ MD4Pad (context);
+
+ /* Store state in digest */
+ Encode (digest, context->state, 16);
+
+ /* Zeroize sensitive information.
+ */
+ bzero((POINTER)context, sizeof (*context));
+}
+
+/* MD4 basic transformation. Transforms state based on block.
+ */
+static void MD4Transform (state, block)
+UINT4 state[4];
+const unsigned char block[64];
+{
+ UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
+
+ Decode (x, block, 64);
+
+ /* Round 1 */
+ FF (a, b, c, d, x[ 0], S11); /* 1 */
+ FF (d, a, b, c, x[ 1], S12); /* 2 */
+ FF (c, d, a, b, x[ 2], S13); /* 3 */
+ FF (b, c, d, a, x[ 3], S14); /* 4 */
+ FF (a, b, c, d, x[ 4], S11); /* 5 */
+ FF (d, a, b, c, x[ 5], S12); /* 6 */
+ FF (c, d, a, b, x[ 6], S13); /* 7 */
+ FF (b, c, d, a, x[ 7], S14); /* 8 */
+ FF (a, b, c, d, x[ 8], S11); /* 9 */
+ FF (d, a, b, c, x[ 9], S12); /* 10 */
+ FF (c, d, a, b, x[10], S13); /* 11 */
+ FF (b, c, d, a, x[11], S14); /* 12 */
+ FF (a, b, c, d, x[12], S11); /* 13 */
+ FF (d, a, b, c, x[13], S12); /* 14 */
+ FF (c, d, a, b, x[14], S13); /* 15 */
+ FF (b, c, d, a, x[15], S14); /* 16 */
+
+ /* Round 2 */
+ GG (a, b, c, d, x[ 0], S21); /* 17 */
+ GG (d, a, b, c, x[ 4], S22); /* 18 */
+ GG (c, d, a, b, x[ 8], S23); /* 19 */
+ GG (b, c, d, a, x[12], S24); /* 20 */
+ GG (a, b, c, d, x[ 1], S21); /* 21 */
+ GG (d, a, b, c, x[ 5], S22); /* 22 */
+ GG (c, d, a, b, x[ 9], S23); /* 23 */
+ GG (b, c, d, a, x[13], S24); /* 24 */
+ GG (a, b, c, d, x[ 2], S21); /* 25 */
+ GG (d, a, b, c, x[ 6], S22); /* 26 */
+ GG (c, d, a, b, x[10], S23); /* 27 */
+ GG (b, c, d, a, x[14], S24); /* 28 */
+ GG (a, b, c, d, x[ 3], S21); /* 29 */
+ GG (d, a, b, c, x[ 7], S22); /* 30 */
+ GG (c, d, a, b, x[11], S23); /* 31 */
+ GG (b, c, d, a, x[15], S24); /* 32 */
+
+ /* Round 3 */
+ HH (a, b, c, d, x[ 0], S31); /* 33 */
+ HH (d, a, b, c, x[ 8], S32); /* 34 */
+ HH (c, d, a, b, x[ 4], S33); /* 35 */
+ HH (b, c, d, a, x[12], S34); /* 36 */
+ HH (a, b, c, d, x[ 2], S31); /* 37 */
+ HH (d, a, b, c, x[10], S32); /* 38 */
+ HH (c, d, a, b, x[ 6], S33); /* 39 */
+ HH (b, c, d, a, x[14], S34); /* 40 */
+ HH (a, b, c, d, x[ 1], S31); /* 41 */
+ HH (d, a, b, c, x[ 9], S32); /* 42 */
+ HH (c, d, a, b, x[ 5], S33); /* 43 */
+ HH (b, c, d, a, x[13], S34); /* 44 */
+ HH (a, b, c, d, x[ 3], S31); /* 45 */
+ HH (d, a, b, c, x[11], S32); /* 46 */
+ HH (c, d, a, b, x[ 7], S33); /* 47 */
+ HH (b, c, d, a, x[15], S34); /* 48 */
+
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+
+ /* Zeroize sensitive information.
+ */
+ bzero((POINTER)x, sizeof (x));
+}
+
+/* Encodes input (UINT4) into output (unsigned char). Assumes len is
+ a multiple of 4.
+ */
+static void Encode (output, input, len)
+unsigned char *output;
+UINT4 *input;
+unsigned int len;
+{
+ unsigned int i, j;
+
+ for (i = 0, j = 0; j < len; i++, j += 4) {
+ output[j] = (unsigned char)(input[i] & 0xff);
+ output[j+1] = (unsigned char)((input[i] >> 8) & 0xff);
+ output[j+2] = (unsigned char)((input[i] >> 16) & 0xff);
+ output[j+3] = (unsigned char)((input[i] >> 24) & 0xff);
+ }
+}
+
+/* Decodes input (unsigned char) into output (UINT4). Assumes len is
+ a multiple of 4.
+ */
+static void Decode (output, input, len)
+
+UINT4 *output;
+const unsigned char *input;
+unsigned int len;
+{
+ unsigned int i, j;
+
+ for (i = 0, j = 0; j < len; i++, j += 4)
+ output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) |
+ (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24);
+}
diff --git a/sys/kern/md5c.c b/sys/kern/md5c.c
new file mode 100644
index 0000000..50e2022
--- /dev/null
+++ b/sys/kern/md5c.c
@@ -0,0 +1,340 @@
+/*-
+ * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
+ *
+ * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+ * rights reserved.
+ *
+ * License to copy and use this software is granted provided that it
+ * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+ * Algorithm" in all material mentioning or referencing this software
+ * or this function.
+ *
+ * License is also granted to make and use derivative works provided
+ * that such works are identified as "derived from the RSA Data
+ * Security, Inc. MD5 Message-Digest Algorithm" in all material
+ * mentioning or referencing the derived work.
+ *
+ * RSA Data Security, Inc. makes no representations concerning either
+ * the merchantability of this software or the suitability of this
+ * software for any particular purpose. It is provided "as is"
+ * without express or implied warranty of any kind.
+ *
+ * These notices must be retained in any copies of any part of this
+ * documentation and/or software.
+ *
+ * This code is the same as the code published by RSA Inc. It has been
+ * edited for clarity and style only.
+ */
+
+/*
+ * This file should be kept in sync with src/lib/libmd/md5c.c
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+#include <machine/endian.h>
+#include <sys/endian.h>
+#include <sys/md5.h>
+
+static void MD5Transform(u_int32_t [4], const unsigned char [64]);
+
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+#define Encode memcpy
+#define Decode memcpy
+#else
+
+/*
+ * Encodes input (u_int32_t) into output (unsigned char). Assumes len is
+ * a multiple of 4.
+ */
+
+static void
+Encode (unsigned char *output, u_int32_t *input, unsigned int len)
+{
+ unsigned int i;
+ uint32_t ip;
+
+ for (i = 0; i < len / 4; i++) {
+ ip = input[i];
+ *output++ = ip;
+ *output++ = ip >> 8;
+ *output++ = ip >> 16;
+ *output++ = ip >> 24;
+ }
+}
+
+/*
+ * Decodes input (unsigned char) into output (u_int32_t). Assumes len is
+ * a multiple of 4.
+ */
+
+static void
+Decode (u_int32_t *output, const unsigned char *input, unsigned int len)
+{
+ unsigned int i;
+
+ for (i = 0; i < len; i += 4) {
+ *output++ = input[i] | (input[i+1] << 8) | (input[i+2] << 16) |
+ (input[i+3] << 24);
+ }
+}
+#endif
+
+static unsigned char PADDING[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* F, G, H and I are basic MD5 functions. */
+#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | (~z)))
+
+/* ROTATE_LEFT rotates x left n bits. */
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+/*
+ * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
+ * Rotation is separate from addition to prevent recomputation.
+ */
+#define FF(a, b, c, d, x, s, ac) { \
+ (a) += F ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define GG(a, b, c, d, x, s, ac) { \
+ (a) += G ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define HH(a, b, c, d, x, s, ac) { \
+ (a) += H ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define II(a, b, c, d, x, s, ac) { \
+ (a) += I ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+
+/* MD5 initialization. Begins an MD5 operation, writing a new context. */
+
+void
+MD5Init (context)
+ MD5_CTX *context;
+{
+
+ context->count[0] = context->count[1] = 0;
+
+ /* Load magic initialization constants. */
+ context->state[0] = 0x67452301;
+ context->state[1] = 0xefcdab89;
+ context->state[2] = 0x98badcfe;
+ context->state[3] = 0x10325476;
+}
+
+/*
+ * MD5 block update operation. Continues an MD5 message-digest
+ * operation, processing another message block, and updating the
+ * context.
+ */
+
+void
+MD5Update (context, in, inputLen)
+ MD5_CTX *context;
+ const void *in;
+ unsigned int inputLen;
+{
+ unsigned int i, index, partLen;
+ const unsigned char *input = in;
+
+ /* Compute number of bytes mod 64 */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3F);
+
+ /* Update number of bits */
+ if ((context->count[0] += ((u_int32_t)inputLen << 3))
+ < ((u_int32_t)inputLen << 3))
+ context->count[1]++;
+ context->count[1] += ((u_int32_t)inputLen >> 29);
+
+ partLen = 64 - index;
+
+ /* Transform as many times as possible. */
+ if (inputLen >= partLen) {
+ memcpy((void *)&context->buffer[index], (const void *)input,
+ partLen);
+ MD5Transform (context->state, context->buffer);
+
+ for (i = partLen; i + 63 < inputLen; i += 64)
+ MD5Transform (context->state, &input[i]);
+
+ index = 0;
+ }
+ else
+ i = 0;
+
+ /* Buffer remaining input */
+ memcpy ((void *)&context->buffer[index], (const void *)&input[i],
+ inputLen-i);
+}
+
+/*
+ * MD5 padding. Adds padding followed by original length.
+ */
+
+static void
+MD5Pad (MD5_CTX *context)
+{
+ unsigned char bits[8];
+ unsigned int index, padLen;
+
+ /* Save number of bits */
+ Encode (bits, context->count, 8);
+
+ /* Pad out to 56 mod 64. */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3f);
+ padLen = (index < 56) ? (56 - index) : (120 - index);
+ MD5Update (context, PADDING, padLen);
+
+ /* Append length (before padding) */
+ MD5Update (context, bits, 8);
+}
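As a worked example of the padLen arithmetic: if 60 bytes have been hashed so far, index is 60 and padLen = 120 - 60 = 60, leaving the buffer at 120 bytes, which is 56 mod 64; the eight length bytes appended next complete a 64-byte block. If index were 20 instead, padLen would be 56 - 20 = 36, again landing on 56 mod 64 before the length is appended.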
+
+/*
+ * MD5 finalization. Ends an MD5 message-digest operation, writing the
+ * message digest and zeroizing the context.
+ */
+
+void
+MD5Final (digest, context)
+ unsigned char digest[16];
+ MD5_CTX *context;
+{
+ /* Do padding. */
+ MD5Pad (context);
+
+ /* Store state in digest */
+ Encode (digest, context->state, 16);
+
+ /* Zeroize sensitive information. */
+ memset ((void *)context, 0, sizeof (*context));
+}
+
+/* MD5 basic transformation. Transforms state based on block. */
+
+static void
+MD5Transform (state, block)
+ u_int32_t state[4];
+ const unsigned char block[64];
+{
+ u_int32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16];
+
+ Decode (x, block, 64);
+
+ /* Round 1 */
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+ FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
+ FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
+ FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
+ FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
+ FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
+ FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
+ FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
+ FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
+ FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
+ FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
+ FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
+ FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
+ FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
+ FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
+ FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
+ FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
+
+ /* Round 2 */
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+ GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
+ GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
+ GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
+ GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
+ GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
+ GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */
+ GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
+ GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
+ GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
+ GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
+ GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
+ GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
+ GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
+ GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
+ GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
+ GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
+
+ /* Round 3 */
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+ HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
+ HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
+ HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
+ HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
+ HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
+ HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
+ HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
+ HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
+ HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
+ HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
+ HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
+ HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */
+ HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
+ HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
+ HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
+ HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
+
+ /* Round 4 */
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+ II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
+ II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
+ II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
+ II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
+ II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
+ II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
+ II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
+ II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
+ II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
+ II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
+ II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
+ II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
+ II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
+ II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
+ II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
+ II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
+
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+
+ /* Zeroize sensitive information. */
+ memset ((void *)x, 0, sizeof (x));
+}
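Putting the pieces together, the intended calling sequence is init, one or more updates, then final. A minimal sketch against the declarations in this file (the buffer and its length are placeholders):

    #include <sys/md5.h>

    static void
    digest_example(const void *buf, unsigned int len, unsigned char digest[16])
    {
        MD5_CTX ctx;

        MD5Init(&ctx);              /* load the magic initial state */
        MD5Update(&ctx, buf, len);  /* may be called repeatedly to stream data in */
        MD5Final(digest, &ctx);     /* pads, writes the 16 digest bytes, zeroizes ctx */
    }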
diff --git a/sys/kern/p1003_1b.c b/sys/kern/p1003_1b.c
new file mode 100644
index 0000000..fb89efc
--- /dev/null
+++ b/sys/kern/p1003_1b.c
@@ -0,0 +1,315 @@
+/*-
+ * Copyright (c) 1996, 1997, 1998
+ * HD Associates, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* p1003_1b: Real Time common code.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_posix.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+
+MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B");
+
+/* The system calls return ENOSYS if an entry is called that is not supported
+ * at run time.  I am also logging since some programs start to use this when
+ * they shouldn't. That will be removed if annoying.
+ */
+int
+syscall_not_present(struct thread *td, const char *s, struct nosys_args *uap)
+{
+ log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n",
+ td->td_name, td->td_proc->p_pid, s);
+
+ /* a " return nosys(p, uap); " here causes a core dump.
+ */
+
+ return ENOSYS;
+}
+
+#if !defined(_KPOSIX_PRIORITY_SCHEDULING)
+
+/* Not configured but loadable via a module:
+ */
+
+static int
+sched_attach(void)
+{
+ return 0;
+}
+
+SYSCALL_NOT_PRESENT_GEN(sched_setparam)
+SYSCALL_NOT_PRESENT_GEN(sched_getparam)
+SYSCALL_NOT_PRESENT_GEN(sched_setscheduler)
+SYSCALL_NOT_PRESENT_GEN(sched_getscheduler)
+SYSCALL_NOT_PRESENT_GEN(sched_yield)
+SYSCALL_NOT_PRESENT_GEN(sched_get_priority_max)
+SYSCALL_NOT_PRESENT_GEN(sched_get_priority_min)
+SYSCALL_NOT_PRESENT_GEN(sched_rr_get_interval)
+#else
+
+/* Configured in kernel version:
+ */
+static struct ksched *ksched;
+
+static int
+sched_attach(void)
+{
+ int ret = ksched_attach(&ksched);
+
+ if (ret == 0)
+ p31b_setcfg(CTL_P1003_1B_PRIORITY_SCHEDULING, 200112L);
+
+ return ret;
+}
+
+int
+sys_sched_setparam(struct thread *td, struct sched_setparam_args *uap)
+{
+ struct thread *targettd;
+ struct proc *targetp;
+ int e;
+ struct sched_param sched_param;
+
+ e = copyin(uap->param, &sched_param, sizeof(sched_param));
+ if (e)
+ return (e);
+
+ if (uap->pid == 0) {
+ targetp = td->td_proc;
+ targettd = td;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL)
+ return (ESRCH);
+ targettd = FIRST_THREAD_IN_PROC(targetp);
+ }
+
+ e = p_cansched(td, targetp);
+ if (e == 0) {
+ e = ksched_setparam(ksched, targettd,
+ (const struct sched_param *)&sched_param);
+ }
+ PROC_UNLOCK(targetp);
+ return (e);
+}
+
+int
+sys_sched_getparam(struct thread *td, struct sched_getparam_args *uap)
+{
+ int e;
+ struct sched_param sched_param;
+ struct thread *targettd;
+ struct proc *targetp;
+
+ if (uap->pid == 0) {
+ targetp = td->td_proc;
+ targettd = td;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL) {
+ return (ESRCH);
+ }
+ targettd = FIRST_THREAD_IN_PROC(targetp);
+ }
+
+ e = p_cansee(td, targetp);
+ if (e == 0) {
+ e = ksched_getparam(ksched, targettd, &sched_param);
+ }
+ PROC_UNLOCK(targetp);
+ if (e == 0)
+ e = copyout(&sched_param, uap->param, sizeof(sched_param));
+ return (e);
+}
+
+int
+sys_sched_setscheduler(struct thread *td, struct sched_setscheduler_args *uap)
+{
+ int e;
+ struct sched_param sched_param;
+ struct thread *targettd;
+ struct proc *targetp;
+
+ /* Don't allow non root user to set a scheduler policy. */
+ e = priv_check(td, PRIV_SCHED_SET);
+ if (e)
+ return (e);
+
+ e = copyin(uap->param, &sched_param, sizeof(sched_param));
+ if (e)
+ return (e);
+
+ if (uap->pid == 0) {
+ targetp = td->td_proc;
+ targettd = td;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL)
+ return (ESRCH);
+ targettd = FIRST_THREAD_IN_PROC(targetp);
+ }
+
+ e = p_cansched(td, targetp);
+ if (e == 0) {
+ e = ksched_setscheduler(ksched, targettd,
+ uap->policy, (const struct sched_param *)&sched_param);
+ }
+ PROC_UNLOCK(targetp);
+ return (e);
+}
+
+int
+sys_sched_getscheduler(struct thread *td, struct sched_getscheduler_args *uap)
+{
+ int e, policy;
+ struct thread *targettd;
+ struct proc *targetp;
+
+ if (uap->pid == 0) {
+ targetp = td->td_proc;
+ targettd = td;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(uap->pid);
+ if (targetp == NULL)
+ return (ESRCH);
+ targettd = FIRST_THREAD_IN_PROC(targetp);
+ }
+
+ e = p_cansee(td, targetp);
+ if (e == 0) {
+ e = ksched_getscheduler(ksched, targettd, &policy);
+ td->td_retval[0] = policy;
+ }
+ PROC_UNLOCK(targetp);
+
+ return (e);
+}
+
+int
+sys_sched_yield(struct thread *td, struct sched_yield_args *uap)
+{
+
+ sched_relinquish(curthread);
+ return 0;
+}
+
+int
+sys_sched_get_priority_max(struct thread *td,
+ struct sched_get_priority_max_args *uap)
+{
+ int error, prio;
+
+ error = ksched_get_priority_max(ksched, uap->policy, &prio);
+ td->td_retval[0] = prio;
+ return (error);
+}
+
+int
+sys_sched_get_priority_min(struct thread *td,
+ struct sched_get_priority_min_args *uap)
+{
+ int error, prio;
+
+ error = ksched_get_priority_min(ksched, uap->policy, &prio);
+ td->td_retval[0] = prio;
+ return (error);
+}
+
+int
+sys_sched_rr_get_interval(struct thread *td,
+ struct sched_rr_get_interval_args *uap)
+{
+ struct timespec timespec;
+ int error;
+
+ error = kern_sched_rr_get_interval(td, uap->pid, &timespec);
+ if (error == 0)
+ error = copyout(&timespec, uap->interval, sizeof(timespec));
+ return (error);
+}
+
+int
+kern_sched_rr_get_interval(struct thread *td, pid_t pid,
+ struct timespec *ts)
+{
+ int e;
+ struct thread *targettd;
+ struct proc *targetp;
+
+ if (pid == 0) {
+ targettd = td;
+ targetp = td->td_proc;
+ PROC_LOCK(targetp);
+ } else {
+ targetp = pfind(pid);
+ if (targetp == NULL)
+ return (ESRCH);
+ targettd = FIRST_THREAD_IN_PROC(targetp);
+ }
+
+ e = p_cansee(td, targetp);
+ if (e == 0)
+ e = ksched_rr_get_interval(ksched, targettd, ts);
+ PROC_UNLOCK(targetp);
+ return (e);
+}
+
+#endif
+
+static void
+p31binit(void *notused)
+{
+ (void) sched_attach();
+ p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE);
+}
+
+SYSINIT(p31b, SI_SUB_P1003_1B, SI_ORDER_FIRST, p31binit, NULL);
diff --git a/sys/kern/posix4_mib.c b/sys/kern/posix4_mib.c
new file mode 100644
index 0000000..e299787
--- /dev/null
+++ b/sys/kern/posix4_mib.c
@@ -0,0 +1,183 @@
+/*-
+ * Copyright (c) 1998
+ * HD Associates, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+
+static int facility[CTL_P1003_1B_MAXID - 1];
+static int facility_initialized[CTL_P1003_1B_MAXID - 1];
+
+static int p31b_sysctl_proc(SYSCTL_HANDLER_ARGS);
+
+/* OID_AUTO isn't working with sysconf(3). I guess I'd have to
+ * modify it to do a lookup by name from the index.
+ * For now I've left it a top-level sysctl.
+ */
+
+#if 1
+
+SYSCTL_DECL(_p1003_1b);
+
+#define P1B_SYSCTL(num, name) \
+ SYSCTL_INT(_p1003_1b, num, name, CTLFLAG_RD | CTLFLAG_CAPRD, \
+ facility + num - 1, 0, "");
+#define P1B_SYSCTL_RW(num, name) \
+ SYSCTL_PROC(_p1003_1b, num, name, CTLTYPE_INT | CTLFLAG_RW, NULL, num, \
+ p31b_sysctl_proc, "I", "");
+
+#else
+
+SYSCTL_DECL(_kern_p1003_1b);
+
+#define P1B_SYSCTL(num, name) \
+ SYSCTL_INT(_kern_p1003_1b, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_CAPRD, \
+ facility + num - 1, 0, "");
+#define P1B_SYSCTL_RW(num, name) \
+ SYSCTL_PROC(_p1003_1b, OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW, NULL, \
+ num, p31b_sysctl_proc, "I", "");
+SYSCTL_NODE(_kern, OID_AUTO, p1003_1b, CTLFLAG_RW, 0, "P1003.1B");
+
+#endif
+
+SYSCTL_INT(_p1003_1b, CTL_P1003_1B_ASYNCHRONOUS_IO, \
+ asynchronous_io, CTLFLAG_RD, &async_io_version, 0, "");
+P1B_SYSCTL(CTL_P1003_1B_MAPPED_FILES, mapped_files);
+P1B_SYSCTL(CTL_P1003_1B_MEMLOCK, memlock);
+P1B_SYSCTL(CTL_P1003_1B_MEMLOCK_RANGE, memlock_range);
+P1B_SYSCTL(CTL_P1003_1B_MEMORY_PROTECTION, memory_protection);
+P1B_SYSCTL(CTL_P1003_1B_MESSAGE_PASSING, message_passing);
+P1B_SYSCTL(CTL_P1003_1B_PRIORITIZED_IO, prioritized_io);
+P1B_SYSCTL(CTL_P1003_1B_PRIORITY_SCHEDULING, priority_scheduling);
+P1B_SYSCTL(CTL_P1003_1B_REALTIME_SIGNALS, realtime_signals);
+P1B_SYSCTL(CTL_P1003_1B_SEMAPHORES, semaphores);
+P1B_SYSCTL(CTL_P1003_1B_FSYNC, fsync);
+P1B_SYSCTL(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, shared_memory_objects);
+P1B_SYSCTL(CTL_P1003_1B_SYNCHRONIZED_IO, synchronized_io);
+P1B_SYSCTL(CTL_P1003_1B_TIMERS, timers);
+P1B_SYSCTL(CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max);
+P1B_SYSCTL(CTL_P1003_1B_AIO_MAX, aio_max);
+P1B_SYSCTL(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, aio_prio_delta_max);
+P1B_SYSCTL(CTL_P1003_1B_DELAYTIMER_MAX, delaytimer_max);
+P1B_SYSCTL(CTL_P1003_1B_MQ_OPEN_MAX, mq_open_max);
+P1B_SYSCTL(CTL_P1003_1B_PAGESIZE, pagesize);
+P1B_SYSCTL(CTL_P1003_1B_RTSIG_MAX, rtsig_max);
+P1B_SYSCTL_RW(CTL_P1003_1B_SEM_NSEMS_MAX, sem_nsems_max);
+P1B_SYSCTL(CTL_P1003_1B_SEM_VALUE_MAX, sem_value_max);
+P1B_SYSCTL(CTL_P1003_1B_SIGQUEUE_MAX, sigqueue_max);
+P1B_SYSCTL(CTL_P1003_1B_TIMER_MAX, timer_max);
+
+#define P31B_VALID(num) ((num) >= 1 && (num) < CTL_P1003_1B_MAXID)
+
+static int
+p31b_sysctl_proc(SYSCTL_HANDLER_ARGS)
+{
+ int error, num, val;
+
+ num = arg2;
+ if (!P31B_VALID(num))
+ return (EINVAL);
+	val = facility_initialized[num - 1] ? facility[num - 1] : 0;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error == 0 && req->newptr != NULL && facility_initialized[num - 1])
+ facility[num - 1] = val;
+ return (error);
+}
+
+/* p31b_setcfg: Set the configuration
+ */
+void
+p31b_setcfg(int num, int value)
+{
+
+ if (P31B_VALID(num)) {
+ facility[num - 1] = value;
+ facility_initialized[num - 1] = 1;
+ }
+}
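Subsystems use this hook to advertise an optional POSIX facility once it is actually available, in the same way sched_attach() in p1003_1b.c above reports priority scheduling. A hypothetical caller (the constant and the 200112L revision value mirror the ones already used in this file):

    /* e.g. from a message-queue module's initialization routine */
    p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, 200112L);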
+
+void
+p31b_unsetcfg(int num)
+{
+
+ facility[num - 1] = 0;
+	facility_initialized[num - 1] = 0;
+}
+
+int
+p31b_getcfg(int num)
+{
+
+ if (P31B_VALID(num))
+ return (facility[num - 1]);
+ return (0);
+}
+
+int
+p31b_iscfg(int num)
+{
+
+ if (P31B_VALID(num))
+ return (facility_initialized[num - 1]);
+ return (0);
+}
+
+/*
+ * Turn on indications for standard (non-configurable) kernel features.
+ */
+static void
+p31b_set_standard(void *dummy)
+{
+
+ p31b_setcfg(CTL_P1003_1B_FSYNC, 200112L);
+ p31b_setcfg(CTL_P1003_1B_MAPPED_FILES, 200112L);
+ p31b_setcfg(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, 200112L);
+ p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE);
+ if (!p31b_iscfg(CTL_P1003_1B_AIO_LISTIO_MAX))
+ p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
+ if (!p31b_iscfg(CTL_P1003_1B_AIO_MAX))
+ p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
+ if (!p31b_iscfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX))
+ p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
+}
+
+SYSINIT(p31b_set_standard, SI_SUB_P1003_1B, SI_ORDER_ANY, p31b_set_standard,
+ 0);
+
diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c
new file mode 100644
index 0000000..7c7d481
--- /dev/null
+++ b/sys/kern/sched_4bsd.c
@@ -0,0 +1,1784 @@
+/*-
+ * Copyright (c) 1982, 1986, 1990, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_hwpmc_hooks.h"
+#include "opt_sched.h"
+#include "opt_kdtrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/kthread.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/sx.h>
+#include <sys/turnstile.h>
+#include <sys/umtx.h>
+#include <machine/pcb.h>
+#include <machine/smp.h>
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+int dtrace_vtime_active;
+dtrace_vtime_switch_func_t dtrace_vtime_switch_func;
+#endif
+
+/*
+ * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
+ * the range 100-256 Hz (approximately).
+ */
+#define ESTCPULIM(e) \
+ min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
+ RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
+#ifdef SMP
+#define INVERSE_ESTCPU_WEIGHT (8 * smp_cpus)
+#else
+#define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */
+#endif
+#define NICE_WEIGHT 1 /* Priorities per nice level. */
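To make the clamp concrete: on a uniprocessor kernel, and assuming the stock values of RQ_PPQ = 4 and a 40-step nice range (PRIO_MIN = -20, PRIO_MAX = 20), ESTCPULIM(e) works out to min(e, 8 * (1 * 40 - 4) + 8 - 1) = min(e, 295), which (roughly speaking) keeps the estimated-CPU value from pushing the priority later derived from it past the timeshare range.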
+
+#define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
+
+/*
+ * The schedulable entity that runs a context.
+ * This is an extension to the thread structure and is tailored to
+ * the requirements of this scheduler
+ */
+struct td_sched {
+ fixpt_t ts_pctcpu; /* (j) %cpu during p_swtime. */
+ int ts_cpticks; /* (j) Ticks of cpu time. */
+ int ts_slptime; /* (j) Seconds !RUNNING. */
+ int ts_slice; /* Remaining part of time slice. */
+ int ts_flags;
+ struct runq *ts_runq; /* runq the thread is currently on */
+#ifdef KTR
+ char ts_name[TS_NAME_LEN];
+#endif
+};
+
+/* flags kept in td_flags */
+#define TDF_DIDRUN TDF_SCHED0 /* thread actually ran. */
+#define TDF_BOUND TDF_SCHED1 /* Bound to one CPU. */
+#define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */
+
+/* flags kept in ts_flags */
+#define TSF_AFFINITY 0x0001 /* Has a non-"full" CPU set. */
+
+#define SKE_RUNQ_PCPU(ts) \
+ ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)
+
+#define THREAD_CAN_SCHED(td, cpu) \
+ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
+
+static struct td_sched td_sched0;
+struct mtx sched_lock;
+
+static int realstathz = 127; /* stathz is sometimes 0 and run off of hz. */
+static int sched_tdcnt; /* Total runnable threads in the system. */
+static int sched_slice = 12; /* Thread run time before rescheduling. */
+
+static void setup_runqs(void);
+static void schedcpu(void);
+static void schedcpu_thread(void);
+static void sched_priority(struct thread *td, u_char prio);
+static void sched_setup(void *dummy);
+static void maybe_resched(struct thread *td);
+static void updatepri(struct thread *td);
+static void resetpriority(struct thread *td);
+static void resetpriority_thread(struct thread *td);
+#ifdef SMP
+static int sched_pickcpu(struct thread *td);
+static int forward_wakeup(int cpunum);
+static void kick_other_cpu(int pri, int cpuid);
+#endif
+
+static struct kproc_desc sched_kp = {
+ "schedcpu",
+ schedcpu_thread,
+ NULL
+};
+SYSINIT(schedcpu, SI_SUB_LAST, SI_ORDER_FIRST, kproc_start,
+ &sched_kp);
+SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
+
+static void sched_initticks(void *dummy);
+SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
+ NULL);
+
+/*
+ * Global run queue.
+ */
+static struct runq runq;
+
+#ifdef SMP
+/*
+ * Per-CPU run queues
+ */
+static struct runq runq_pcpu[MAXCPU];
+long runq_length[MAXCPU];
+
+static cpuset_t idle_cpus_mask;
+#endif
+
+struct pcpuidlestat {
+ u_int idlecalls;
+ u_int oldidlecalls;
+};
+static DPCPU_DEFINE(struct pcpuidlestat, idlestat);
+
+static void
+setup_runqs(void)
+{
+#ifdef SMP
+ int i;
+
+ for (i = 0; i < MAXCPU; ++i)
+ runq_init(&runq_pcpu[i]);
+#endif
+
+ runq_init(&runq);
+}
+
+static int
+sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
+{
+ int error, new_val, period;
+
+ period = 1000000 / realstathz;
+ new_val = period * sched_slice;
+ error = sysctl_handle_int(oidp, &new_val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (new_val <= 0)
+ return (EINVAL);
+ sched_slice = imax(1, (new_val + period / 2) / period);
+ hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
+ realstathz);
+ return (0);
+}
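+
+/*
+ * Example, assuming the common stathz of 127 (so realstathz == 127 and
+ * sched_slice == 12 after sched_initticks()): reading kern.sched.quantum
+ * reports (1000000 / 127) * 12 ~= 94488 us, i.e. a ~94 ms quantum, and
+ * writing, say, 50000 rounds to the nearest whole stathz tick, giving
+ * sched_slice == 6 (~47 ms).
+ */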
+
+SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");
+
+SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
+ "Scheduler name");
+SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
+ NULL, 0, sysctl_kern_quantum, "I",
+ "Quantum for timeshare threads in microseconds");
+SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
+ "Quantum for timeshare threads in stathz ticks");
+#ifdef SMP
+/* Enable forwarding of wakeups to all other cpus */
+static SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL,
+ "Kernel SMP");
+
+static int runq_fuzz = 1;
+SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
+
+static int forward_wakeup_enabled = 1;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
+ &forward_wakeup_enabled, 0,
+ "Forwarding of wakeup to idle CPUs");
+
+static int forward_wakeups_requested = 0;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
+ &forward_wakeups_requested, 0,
+ "Requests for Forwarding of wakeup to idle CPUs");
+
+static int forward_wakeups_delivered = 0;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
+ &forward_wakeups_delivered, 0,
+ "Completed Forwarding of wakeup to idle CPUs");
+
+static int forward_wakeup_use_mask = 1;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
+ &forward_wakeup_use_mask, 0,
+ "Use the mask of idle cpus");
+
+static int forward_wakeup_use_loop = 0;
+SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
+ &forward_wakeup_use_loop, 0,
+ "Use a loop to find idle cpus");
+
+#endif
+#if 0
+static int sched_followon = 0;
+SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
+ &sched_followon, 0,
+ "allow threads to share a quantum");
+#endif
+
+SDT_PROVIDER_DEFINE(sched);
+
+SDT_PROBE_DEFINE3(sched, , , change_pri, change-pri, "struct thread *",
+ "struct proc *", "uint8_t");
+SDT_PROBE_DEFINE3(sched, , , dequeue, dequeue, "struct thread *",
+ "struct proc *", "void *");
+SDT_PROBE_DEFINE4(sched, , , enqueue, enqueue, "struct thread *",
+ "struct proc *", "void *", "int");
+SDT_PROBE_DEFINE4(sched, , , lend_pri, lend-pri, "struct thread *",
+ "struct proc *", "uint8_t", "struct thread *");
+SDT_PROBE_DEFINE2(sched, , , load_change, load-change, "int", "int");
+SDT_PROBE_DEFINE2(sched, , , off_cpu, off-cpu, "struct thread *",
+ "struct proc *");
+SDT_PROBE_DEFINE(sched, , , on_cpu, on-cpu);
+SDT_PROBE_DEFINE(sched, , , remain_cpu, remain-cpu);
+SDT_PROBE_DEFINE2(sched, , , surrender, surrender, "struct thread *",
+ "struct proc *");
+
+static __inline void
+sched_load_add(void)
+{
+
+ sched_tdcnt++;
+ KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
+ SDT_PROBE2(sched, , , load_change, NOCPU, sched_tdcnt);
+}
+
+static __inline void
+sched_load_rem(void)
+{
+
+ sched_tdcnt--;
+ KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
+ SDT_PROBE2(sched, , , load_change, NOCPU, sched_tdcnt);
+}
+/*
+ * Arrange to reschedule if necessary, taking the priorities and
+ * schedulers into account.
+ */
+static void
+maybe_resched(struct thread *td)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ if (td->td_priority < curthread->td_priority)
+ curthread->td_flags |= TDF_NEEDRESCHED;
+}
+
+/*
+ * This function is called when a thread is about to be put on a run queue
+ * because it has been made runnable or its priority has been adjusted. It
+ * determines whether the current thread should immediately be preempted in
+ * favor of the new thread. If so, it switches to the new thread and
+ * eventually returns true. If not, it returns false so that the caller may
+ * place the thread on an appropriate run queue.
+ */
+int
+maybe_preempt(struct thread *td)
+{
+#ifdef PREEMPTION
+ struct thread *ctd;
+ int cpri, pri;
+
+ /*
+ * The new thread should not preempt the current thread if any of the
+ * following conditions are true:
+ *
+ * - The kernel is in the throes of crashing (panicstr).
+ * - The current thread has a higher (numerically lower) or
+ * equivalent priority. Note that this prevents curthread from
+ * trying to preempt to itself.
+ * - It is too early in the boot for context switches (cold is set).
+ * - The current thread has an inhibitor set or is in the process of
+ * exiting. In this case, the current thread is about to switch
+	 *   out anyway, so there's no point in preempting. If we did,
+	 *   the current thread would not be properly resumed either, so
+	 *   just avoid that whole landmine.
+	 * - The new thread's priority is not a realtime priority, the
+	 *   current thread's priority is not an idle priority, and
+	 *   FULL_PREEMPTION is disabled.
+ *
+ * If all of these conditions are false, but the current thread is in
+ * a nested critical section, then we have to defer the preemption
+ * until we exit the critical section. Otherwise, switch immediately
+ * to the new thread.
+ */
+ ctd = curthread;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT((td->td_inhibitors == 0),
+ ("maybe_preempt: trying to run inhibited thread"));
+ pri = td->td_priority;
+ cpri = ctd->td_priority;
+ if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
+ TD_IS_INHIBITED(ctd))
+ return (0);
+#ifndef FULL_PREEMPTION
+ if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
+ return (0);
+#endif
+
+ if (ctd->td_critnest > 1) {
+ CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
+ ctd->td_critnest);
+ ctd->td_owepreempt = 1;
+ return (0);
+ }
+ /*
+ * Thread is runnable but not yet put on system run queue.
+ */
+ MPASS(ctd->td_lock == td->td_lock);
+ MPASS(TD_ON_RUNQ(td));
+ TD_SET_RUNNING(td);
+ CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
+ td->td_proc->p_pid, td->td_name);
+ mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, td);
+ /*
+ * td's lock pointer may have changed. We have to return with it
+ * locked.
+ */
+ spinlock_enter();
+ thread_unlock(ctd);
+ thread_lock(td);
+ spinlock_exit();
+ return (1);
+#else
+ return (0);
+#endif
+}
+
+/*
+ * Constants for digital decay and forget:
+ * 90% of (td_estcpu) usage in 5 * loadav time
+ * 95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
+ * Note that, as ps(1) mentions, this can let percentages
+ * total over 100% (I've seen 137.9% for 3 processes).
+ *
+ * Note that schedclock() updates td_estcpu and p_cpticks asynchronously.
+ *
+ * We wish to decay away 90% of td_estcpu in (5 * loadavg) seconds.
+ * That is, the system wants to compute a value of decay such
+ * that the following for loop:
+ * for (i = 0; i < (5 * loadavg); i++)
+ * td_estcpu *= decay;
+ * will compute
+ * td_estcpu *= 0.1;
+ * for all values of loadavg:
+ *
+ * Mathematically this loop can be expressed by saying:
+ * decay ** (5 * loadavg) ~= .1
+ *
+ * The system computes decay as:
+ * decay = (2 * loadavg) / (2 * loadavg + 1)
+ *
+ * We wish to prove that the system's computation of decay
+ * will always fulfill the equation:
+ * decay ** (5 * loadavg) ~= .1
+ *
+ * If we compute b as:
+ * b = 2 * loadavg
+ * then
+ * decay = b / (b + 1)
+ *
+ * We now need to prove two things:
+ * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
+ * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
+ *
+ * Facts:
+ * For x close to zero, exp(x) =~ 1 + x, since
+ * exp(x) = 0! + x**1/1! + x**2/2! + ... .
+ * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
+ * For x close to zero, ln(1+x) =~ x, since
+ * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1
+ * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
+ * ln(.1) =~ -2.30
+ *
+ * Proof of (1):
+ * Solve (factor)**(power) =~ .1 given power (5*loadav):
+ * solving for factor,
+ * ln(factor) =~ (-2.30/5*loadav), or
+ * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
+ * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
+ *
+ * Proof of (2):
+ * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
+ * solving for power,
+ * power*ln(b/(b+1)) =~ -2.30, or
+ * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED
+ *
+ * Actual power values for the implemented algorithm are as follows:
+ * loadav: 1 2 3 4
+ * power: 5.68 10.32 14.94 19.55
+ */
+
+/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
+#define loadfactor(loadav) (2 * (loadav))
+#define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE))
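+
+/*
+ * Worked example, assuming a load average of 1.0: loadfactor() is then 2.0
+ * in FSCALE fixed point, so decay_cpu() multiplies td_estcpu by
+ * 2 / (2 + 1) == 2/3 on each pass. schedcpu() runs once a second, so after
+ * about 5.7 passes only ~10% of the original estimate remains, matching the
+ * loadav == 1 entry in the table above.
+ */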
+
+/* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
+static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
+SYSCTL_UINT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
+
+/*
+ * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
+ * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
+ * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
+ *
+ * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
+ * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
+ *
+ * If you don't want to bother with the faster/more-accurate formula, you
+ * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
+ * (more general) method of calculating the %age of CPU used by a process.
+ */
+#define CCPU_SHIFT 11
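+
+/*
+ * For intuition: ccpu / FSCALE == exp(-1/20) ~= 0.95123 and schedcpu()
+ * multiplies ts_pctcpu by it once a second, so after 60 seconds only
+ * exp(-3) ~= 5% of an old sample remains -- the "decay 95% in 60 seconds"
+ * behaviour noted above. The >> FSHIFT in that multiplication rescales the
+ * fixed-point product back into FSCALE units.
+ */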
+
+/*
+ * Recompute process priorities every hz ticks (i.e., once a second).
+ * MP-safe, called without the Giant mutex.
+ */
+/* ARGSUSED */
+static void
+schedcpu(void)
+{
+ register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
+ struct thread *td;
+ struct proc *p;
+ struct td_sched *ts;
+ int awake;
+
+ sx_slock(&allproc_lock);
+ FOREACH_PROC_IN_SYSTEM(p) {
+ PROC_LOCK(p);
+ if (p->p_state == PRS_NEW) {
+ PROC_UNLOCK(p);
+ continue;
+ }
+ FOREACH_THREAD_IN_PROC(p, td) {
+ awake = 0;
+ thread_lock(td);
+ ts = td->td_sched;
+ /*
+ * Increment sleep time (if sleeping). We
+ * ignore overflow, as above.
+ */
+ /*
+ * The td_sched slptimes are not touched in wakeup
+ * because the thread may not HAVE everything in
+ * memory? XXX I think this is out of date.
+ */
+ if (TD_ON_RUNQ(td)) {
+ awake = 1;
+ td->td_flags &= ~TDF_DIDRUN;
+ } else if (TD_IS_RUNNING(td)) {
+ awake = 1;
+ /* Do not clear TDF_DIDRUN */
+ } else if (td->td_flags & TDF_DIDRUN) {
+ awake = 1;
+ td->td_flags &= ~TDF_DIDRUN;
+ }
+
+ /*
+ * ts_pctcpu is only for ps and ttyinfo().
+ */
+ ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT;
+ /*
+ * If the td_sched has been idle the entire second,
+ * stop recalculating its priority until
+ * it wakes up.
+ */
+ if (ts->ts_cpticks != 0) {
+#if (FSHIFT >= CCPU_SHIFT)
+ ts->ts_pctcpu += (realstathz == 100)
+ ? ((fixpt_t) ts->ts_cpticks) <<
+ (FSHIFT - CCPU_SHIFT) :
+ 100 * (((fixpt_t) ts->ts_cpticks)
+ << (FSHIFT - CCPU_SHIFT)) / realstathz;
+#else
+ ts->ts_pctcpu += ((FSCALE - ccpu) *
+ (ts->ts_cpticks *
+ FSCALE / realstathz)) >> FSHIFT;
+#endif
+ ts->ts_cpticks = 0;
+ }
+ /*
+ * If there are ANY running threads in this process,
+ * then don't count it as sleeping.
+ * XXX: this is broken.
+ */
+ if (awake) {
+ if (ts->ts_slptime > 1) {
+ /*
+ * In an ideal world, this should not
+ * happen, because whoever woke us
+ * up from the long sleep should have
+ * unwound the slptime and reset our
+ * priority before we run at the stale
+ * priority. Should KASSERT at some
+ * point when all the cases are fixed.
+ */
+ updatepri(td);
+ }
+ ts->ts_slptime = 0;
+ } else
+ ts->ts_slptime++;
+ if (ts->ts_slptime > 1) {
+ thread_unlock(td);
+ continue;
+ }
+ td->td_estcpu = decay_cpu(loadfac, td->td_estcpu);
+ resetpriority(td);
+ resetpriority_thread(td);
+ thread_unlock(td);
+ }
+ PROC_UNLOCK(p);
+ }
+ sx_sunlock(&allproc_lock);
+}
+
+/*
+ * Main loop for a kthread that executes schedcpu once a second.
+ */
+static void
+schedcpu_thread(void)
+{
+
+ for (;;) {
+ schedcpu();
+ pause("-", hz);
+ }
+}
+
+/*
+ * Recalculate the priority of a process after it has slept for a while.
+ * For all load averages >= 1 and max td_estcpu of 255, sleeping for at
+ * least six times the loadfactor will decay td_estcpu to zero.
+ */
+static void
+updatepri(struct thread *td)
+{
+ struct td_sched *ts;
+ fixpt_t loadfac;
+ unsigned int newcpu;
+
+ ts = td->td_sched;
+ loadfac = loadfactor(averunnable.ldavg[0]);
+ if (ts->ts_slptime > 5 * loadfac)
+ td->td_estcpu = 0;
+ else {
+ newcpu = td->td_estcpu;
+ ts->ts_slptime--; /* was incremented in schedcpu() */
+ while (newcpu && --ts->ts_slptime)
+ newcpu = decay_cpu(loadfac, newcpu);
+ td->td_estcpu = newcpu;
+ }
+}
+
+/*
+ * Compute the priority of a process when running in user mode.
+ * Arrange to reschedule if the resulting priority is better
+ * than that of the current process.
+ */
+static void
+resetpriority(struct thread *td)
+{
+ register unsigned int newpriority;
+
+ if (td->td_pri_class == PRI_TIMESHARE) {
+ newpriority = PUSER + td->td_estcpu / INVERSE_ESTCPU_WEIGHT +
+ NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN);
+ newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
+ PRI_MAX_TIMESHARE);
+ sched_user_prio(td, newpriority);
+ }
+}
+
+/*
+ * Update the thread's priority when the associated process's user
+ * priority changes.
+ */
+static void
+resetpriority_thread(struct thread *td)
+{
+
+ /* Only change threads with a time sharing user priority. */
+ if (td->td_priority < PRI_MIN_TIMESHARE ||
+ td->td_priority > PRI_MAX_TIMESHARE)
+ return;
+
+ /* XXX the whole needresched thing is broken, but not silly. */
+ maybe_resched(td);
+
+ sched_prio(td, td->td_user_pri);
+}
+
+/* ARGSUSED */
+static void
+sched_setup(void *dummy)
+{
+
+ setup_runqs();
+
+ /* Account for thread0. */
+ sched_load_add();
+}
+
+/*
+ * This routine determines time constants after stathz and hz are setup.
+ */
+static void
+sched_initticks(void *dummy)
+{
+
+ realstathz = stathz ? stathz : hz;
+ sched_slice = realstathz / 10; /* ~100ms */
+ hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
+ realstathz);
+}
+
+/* External interfaces start here */
+
+/*
+ * Very early in the boot some setup of scheduler-specific
+ * parts of proc0 and of some scheduler resources needs to be done.
+ * Called from:
+ * proc0_init()
+ */
+void
+schedinit(void)
+{
+ /*
+ * Set up the scheduler specific parts of proc0.
+ */
+ proc0.p_sched = NULL; /* XXX */
+ thread0.td_sched = &td_sched0;
+ thread0.td_lock = &sched_lock;
+ td_sched0.ts_slice = sched_slice;
+ mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
+}
+
+int
+sched_runnable(void)
+{
+#ifdef SMP
+ return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
+#else
+ return runq_check(&runq);
+#endif
+}
+
+int
+sched_rr_interval(void)
+{
+
+ /* Convert sched_slice from stathz to hz. */
+ return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz));
+}
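+
+/*
+ * E.g. with hz == 1000, stathz == 127 and the default sched_slice of 12,
+ * sched_rr_interval() reports (12 * 1000 + 63) / 127 == 94 ticks, i.e.
+ * roughly the ~100 ms slice enforced by sched_clock() below.
+ */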
+
+/*
+ * We adjust the priority of the current process. The priority of
+ * a process gets worse as it accumulates CPU time. The cpu usage
+ * estimator (td_estcpu) is increased here. resetpriority() will
+ * compute a different priority each time td_estcpu increases by
+ * INVERSE_ESTCPU_WEIGHT (until MAXPRI is reached). The cpu usage
+ * estimator ramps up
+ * quite quickly when the process is running (linearly), and decays
+ * away exponentially, at a rate which is proportionally slower when
+ * the system is busy. The basic principle is that the system will
+ * 90% forget that the process used a lot of CPU time in 5 * loadav
+ * seconds. This causes the system to favor processes which haven't
+ * run much recently, and to round-robin among other processes.
+ */
+void
+sched_clock(struct thread *td)
+{
+ struct pcpuidlestat *stat;
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
+
+ ts->ts_cpticks++;
+ td->td_estcpu = ESTCPULIM(td->td_estcpu + 1);
+ if ((td->td_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
+ resetpriority(td);
+ resetpriority_thread(td);
+ }
+
+ /*
+ * Force a context switch if the current thread has used up a full
+ * time slice (default is 100ms).
+ */
+ if (!TD_IS_IDLETHREAD(td) && --ts->ts_slice <= 0) {
+ ts->ts_slice = sched_slice;
+ td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND;
+ }
+
+ stat = DPCPU_PTR(idlestat);
+ stat->oldidlecalls = stat->idlecalls;
+ stat->idlecalls = 0;
+}
+
+/*
+ * Charge child's scheduling CPU usage to parent.
+ */
+void
+sched_exit(struct proc *p, struct thread *td)
+{
+
+ KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "proc exit",
+ "prio:%d", td->td_priority);
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
+}
+
+void
+sched_exit_thread(struct thread *td, struct thread *child)
+{
+
+ KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "exit",
+ "prio:%d", child->td_priority);
+ thread_lock(td);
+ td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu);
+ thread_unlock(td);
+ thread_lock(child);
+ if ((child->td_flags & TDF_NOLOAD) == 0)
+ sched_load_rem();
+ thread_unlock(child);
+}
+
+void
+sched_fork(struct thread *td, struct thread *childtd)
+{
+ sched_fork_thread(td, childtd);
+}
+
+void
+sched_fork_thread(struct thread *td, struct thread *childtd)
+{
+ struct td_sched *ts;
+
+ childtd->td_estcpu = td->td_estcpu;
+ childtd->td_lock = &sched_lock;
+ childtd->td_cpuset = cpuset_ref(td->td_cpuset);
+ childtd->td_priority = childtd->td_base_pri;
+ ts = childtd->td_sched;
+ bzero(ts, sizeof(*ts));
+ ts->ts_flags |= (td->td_sched->ts_flags & TSF_AFFINITY);
+ ts->ts_slice = 1;
+}
+
+void
+sched_nice(struct proc *p, int nice)
+{
+ struct thread *td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ p->p_nice = nice;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ resetpriority(td);
+ resetpriority_thread(td);
+ thread_unlock(td);
+ }
+}
+
+void
+sched_class(struct thread *td, int class)
+{
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ td->td_pri_class = class;
+}
+
+/*
+ * Adjust the priority of a thread.
+ */
+static void
+sched_priority(struct thread *td, u_char prio)
+{
+
+ KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change",
+ "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED,
+ sched_tdname(curthread));
+ SDT_PROBE3(sched, , , change_pri, td, td->td_proc, prio);
+ if (td != curthread && prio > td->td_priority) {
+ KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
+ "lend prio", "prio:%d", td->td_priority, "new prio:%d",
+ prio, KTR_ATTR_LINKED, sched_tdname(td));
+ SDT_PROBE4(sched, , , lend_pri, td, td->td_proc, prio,
+ curthread);
+ }
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ if (td->td_priority == prio)
+ return;
+ td->td_priority = prio;
+ if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) {
+ sched_rem(td);
+ sched_add(td, SRQ_BORING);
+ }
+}
+
+/*
+ * Update a thread's priority when it is lent another thread's
+ * priority.
+ */
+void
+sched_lend_prio(struct thread *td, u_char prio)
+{
+
+ td->td_flags |= TDF_BORROWING;
+ sched_priority(td, prio);
+}
+
+/*
+ * Restore a thread's priority when priority propagation is
+ * over. The prio argument is the minimum priority the thread
+ * needs to have to satisfy other possible priority lending
+ * requests. If the thread's regular priority is less
+ * important than prio, the thread will keep a priority
+ * boost of prio.
+ */
+void
+sched_unlend_prio(struct thread *td, u_char prio)
+{
+ u_char base_pri;
+
+ if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
+ td->td_base_pri <= PRI_MAX_TIMESHARE)
+ base_pri = td->td_user_pri;
+ else
+ base_pri = td->td_base_pri;
+ if (prio >= base_pri) {
+ td->td_flags &= ~TDF_BORROWING;
+ sched_prio(td, base_pri);
+ } else
+ sched_lend_prio(td, prio);
+}
+
+void
+sched_prio(struct thread *td, u_char prio)
+{
+ u_char oldprio;
+
+ /* First, update the base priority. */
+ td->td_base_pri = prio;
+
+ /*
+ * If the thread is borrowing another thread's priority, don't ever
+ * lower the priority.
+ */
+ if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
+ return;
+
+ /* Change the real priority. */
+ oldprio = td->td_priority;
+ sched_priority(td, prio);
+
+ /*
+ * If the thread is on a turnstile, then let the turnstile update
+ * its state.
+ */
+ if (TD_ON_LOCK(td) && oldprio != prio)
+ turnstile_adjust(td, oldprio);
+}
+
+void
+sched_user_prio(struct thread *td, u_char prio)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ td->td_base_user_pri = prio;
+ if (td->td_lend_user_pri <= prio)
+ return;
+ td->td_user_pri = prio;
+}
+
+void
+sched_lend_user_prio(struct thread *td, u_char prio)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ td->td_lend_user_pri = prio;
+ td->td_user_pri = min(prio, td->td_base_user_pri);
+ if (td->td_priority > td->td_user_pri)
+ sched_prio(td, td->td_user_pri);
+ else if (td->td_priority != td->td_user_pri)
+ td->td_flags |= TDF_NEEDRESCHED;
+}
+
+void
+sched_sleep(struct thread *td, int pri)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ td->td_slptick = ticks;
+ td->td_sched->ts_slptime = 0;
+ if (pri != 0 && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
+ sched_prio(td, pri);
+ if (TD_IS_SUSPENDED(td) || pri >= PSOCK)
+ td->td_flags |= TDF_CANSWAP;
+}
+
+void
+sched_switch(struct thread *td, struct thread *newtd, int flags)
+{
+ struct mtx *tmtx;
+ struct td_sched *ts;
+ struct proc *p;
+ int preempted;
+
+ tmtx = NULL;
+ ts = td->td_sched;
+ p = td->td_proc;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ /*
+ * Switch to the sched lock to fix things up and pick
+ * a new thread.
+ * Block the td_lock in order to avoid breaking the critical path.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ tmtx = thread_lock_block(td);
+ }
+
+ if ((td->td_flags & TDF_NOLOAD) == 0)
+ sched_load_rem();
+
+ td->td_lastcpu = td->td_oncpu;
+ preempted = !(td->td_flags & TDF_SLICEEND);
+ td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
+ td->td_owepreempt = 0;
+ td->td_oncpu = NOCPU;
+
+ /*
+ * At the last moment, if this thread is still marked RUNNING,
+ * then put it back on the run queue as it has not been suspended
+	 * or stopped or anything else similar. We never put the idle
+ * threads on the run queue, however.
+ */
+ if (td->td_flags & TDF_IDLETD) {
+ TD_SET_CAN_RUN(td);
+#ifdef SMP
+ CPU_CLR(PCPU_GET(cpuid), &idle_cpus_mask);
+#endif
+ } else {
+ if (TD_IS_RUNNING(td)) {
+ /* Put us back on the run queue. */
+ sched_add(td, preempted ?
+ SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
+ SRQ_OURSELF|SRQ_YIELDING);
+ }
+ }
+ if (newtd) {
+ /*
+ * The thread we are about to run needs to be counted
+ * as if it had been added to the run queue and selected.
+ * It came from:
+ * * A preemption
+ * * An upcall
+ * * A followon
+ */
+ KASSERT((newtd->td_inhibitors == 0),
+ ("trying to run inhibited thread"));
+ newtd->td_flags |= TDF_DIDRUN;
+ TD_SET_RUNNING(newtd);
+ if ((newtd->td_flags & TDF_NOLOAD) == 0)
+ sched_load_add();
+ } else {
+ newtd = choosethread();
+ MPASS(newtd->td_lock == &sched_lock);
+ }
+
+ if (td != newtd) {
+#ifdef HWPMC_HOOKS
+ if (PMC_PROC_IS_USING_PMCS(td->td_proc))
+ PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
+#endif
+
+ SDT_PROBE2(sched, , , off_cpu, td, td->td_proc);
+
+ /* I feel sleepy */
+ lock_profile_release_lock(&sched_lock.lock_object);
+#ifdef KDTRACE_HOOKS
+ /*
+ * If DTrace has set the active vtime enum to anything
+ * other than INACTIVE (0), then it should have set the
+ * function to call.
+ */
+ if (dtrace_vtime_active)
+ (*dtrace_vtime_switch_func)(newtd);
+#endif
+
+ cpu_switch(td, newtd, tmtx != NULL ? tmtx : td->td_lock);
+ lock_profile_obtain_lock_success(&sched_lock.lock_object,
+ 0, 0, __FILE__, __LINE__);
+ /*
+ * Where am I? What year is it?
+ * We are in the same thread that went to sleep above,
+ * but any amount of time may have passed. All our context
+ * will still be available as will local variables.
+ * PCPU values however may have changed as we may have
+ * changed CPU so don't trust cached values of them.
+ * New threads will go to fork_exit() instead of here
+ * so if you change things here you may need to change
+ * things there too.
+ *
+ * If the thread above was exiting it will never wake
+ * up again here, so either it has saved everything it
+ * needed to, or the thread_wait() or wait() will
+ * need to reap it.
+ */
+
+ SDT_PROBE0(sched, , , on_cpu);
+#ifdef HWPMC_HOOKS
+ if (PMC_PROC_IS_USING_PMCS(td->td_proc))
+ PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
+#endif
+ } else
+ SDT_PROBE0(sched, , , remain_cpu);
+
+#ifdef SMP
+ if (td->td_flags & TDF_IDLETD)
+ CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask);
+#endif
+ sched_lock.mtx_lock = (uintptr_t)td;
+ td->td_oncpu = PCPU_GET(cpuid);
+ MPASS(td->td_lock == &sched_lock);
+}
+
+void
+sched_wakeup(struct thread *td)
+{
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
+ td->td_flags &= ~TDF_CANSWAP;
+ if (ts->ts_slptime > 1) {
+ updatepri(td);
+ resetpriority(td);
+ }
+ td->td_slptick = 0;
+ ts->ts_slptime = 0;
+ ts->ts_slice = sched_slice;
+ sched_add(td, SRQ_BORING);
+}
+
+#ifdef SMP
+static int
+forward_wakeup(int cpunum)
+{
+ struct pcpu *pc;
+ cpuset_t dontuse, map, map2;
+ u_int id, me;
+ int iscpuset;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+
+ CTR0(KTR_RUNQ, "forward_wakeup()");
+
+ if ((!forward_wakeup_enabled) ||
+ (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
+ return (0);
+ if (!smp_started || cold || panicstr)
+ return (0);
+
+ forward_wakeups_requested++;
+
+ /*
+ * Check the idle mask we received against what we calculated
+ * before in the old version.
+ */
+ me = PCPU_GET(cpuid);
+
+ /* Don't bother if we should be doing it ourself. */
+ if (CPU_ISSET(me, &idle_cpus_mask) &&
+ (cpunum == NOCPU || me == cpunum))
+ return (0);
+
+ CPU_SETOF(me, &dontuse);
+ CPU_OR(&dontuse, &stopped_cpus);
+ CPU_OR(&dontuse, &hlt_cpus_mask);
+ CPU_ZERO(&map2);
+ if (forward_wakeup_use_loop) {
+ STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
+ id = pc->pc_cpuid;
+ if (!CPU_ISSET(id, &dontuse) &&
+ pc->pc_curthread == pc->pc_idlethread) {
+ CPU_SET(id, &map2);
+ }
+ }
+ }
+
+ if (forward_wakeup_use_mask) {
+ map = idle_cpus_mask;
+ CPU_NAND(&map, &dontuse);
+
+ /* If they are both on, compare and use loop if different. */
+ if (forward_wakeup_use_loop) {
+ if (CPU_CMP(&map, &map2)) {
+ printf("map != map2, loop method preferred\n");
+ map = map2;
+ }
+ }
+ } else {
+ map = map2;
+ }
+
+ /* If we only allow a specific CPU, then mask off all the others. */
+ if (cpunum != NOCPU) {
+ KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
+ iscpuset = CPU_ISSET(cpunum, &map);
+ if (iscpuset == 0)
+ CPU_ZERO(&map);
+ else
+ CPU_SETOF(cpunum, &map);
+ }
+ if (!CPU_EMPTY(&map)) {
+ forward_wakeups_delivered++;
+ STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
+ id = pc->pc_cpuid;
+ if (!CPU_ISSET(id, &map))
+ continue;
+ if (cpu_idle_wakeup(pc->pc_cpuid))
+ CPU_CLR(id, &map);
+ }
+ if (!CPU_EMPTY(&map))
+ ipi_selected(map, IPI_AST);
+ return (1);
+ }
+ if (cpunum == NOCPU)
+ printf("forward_wakeup: Idle processor not found\n");
+ return (0);
+}
+
+static void
+kick_other_cpu(int pri, int cpuid)
+{
+ struct pcpu *pcpu;
+ int cpri;
+
+ pcpu = pcpu_find(cpuid);
+ if (CPU_ISSET(cpuid, &idle_cpus_mask)) {
+ forward_wakeups_delivered++;
+ if (!cpu_idle_wakeup(cpuid))
+ ipi_cpu(cpuid, IPI_AST);
+ return;
+ }
+
+ cpri = pcpu->pc_curthread->td_priority;
+ if (pri >= cpri)
+ return;
+
+#if defined(IPI_PREEMPTION) && defined(PREEMPTION)
+#if !defined(FULL_PREEMPTION)
+ if (pri <= PRI_MAX_ITHD)
+#endif /* ! FULL_PREEMPTION */
+ {
+ ipi_cpu(cpuid, IPI_PREEMPT);
+ return;
+ }
+#endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */
+
+ pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
+ ipi_cpu(cpuid, IPI_AST);
+ return;
+}
+#endif /* SMP */
+
+#ifdef SMP
+static int
+sched_pickcpu(struct thread *td)
+{
+ int best, cpu;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+
+ if (THREAD_CAN_SCHED(td, td->td_lastcpu))
+ best = td->td_lastcpu;
+ else
+ best = NOCPU;
+ CPU_FOREACH(cpu) {
+ if (!THREAD_CAN_SCHED(td, cpu))
+ continue;
+
+ if (best == NOCPU)
+ best = cpu;
+ else if (runq_length[cpu] < runq_length[best])
+ best = cpu;
+ }
+ KASSERT(best != NOCPU, ("no valid CPUs"));
+
+ return (best);
+}
+#endif
+
+void
+sched_add(struct thread *td, int flags)
+#ifdef SMP
+{
+ cpuset_t tidlemsk;
+ struct td_sched *ts;
+ u_int cpu, cpuid;
+ int forwarded = 0;
+ int single_cpu = 0;
+
+ ts = td->td_sched;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT((td->td_inhibitors == 0),
+ ("sched_add: trying to run inhibited thread"));
+ KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
+ ("sched_add: bad thread state"));
+ KASSERT(td->td_flags & TDF_INMEM,
+ ("sched_add: thread swapped out"));
+
+ KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
+ "prio:%d", td->td_priority, KTR_ATTR_LINKED,
+ sched_tdname(curthread));
+ KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
+ KTR_ATTR_LINKED, sched_tdname(td));
+ SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
+ flags & SRQ_PREEMPTED);
+
+ /*
+ * Now that the thread is moving to the run-queue, set the lock
+ * to the scheduler's lock.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_lock_set(td, &sched_lock);
+ }
+ TD_SET_RUNQ(td);
+
+ /*
+ * If SMP is started and the thread is pinned or otherwise limited to
+ * a specific set of CPUs, queue the thread to a per-CPU run queue.
+ * Otherwise, queue the thread to the global run queue.
+ *
+ * If SMP has not yet been started we must use the global run queue
+ * as per-CPU state may not be initialized yet and we may crash if we
+ * try to access the per-CPU run queues.
+ */
+ if (smp_started && (td->td_pinned != 0 || td->td_flags & TDF_BOUND ||
+ ts->ts_flags & TSF_AFFINITY)) {
+ if (td->td_pinned != 0)
+ cpu = td->td_lastcpu;
+ else if (td->td_flags & TDF_BOUND) {
+ /* Find CPU from bound runq. */
+ KASSERT(SKE_RUNQ_PCPU(ts),
+ ("sched_add: bound td_sched not on cpu runq"));
+ cpu = ts->ts_runq - &runq_pcpu[0];
+ } else
+ /* Find a valid CPU for our cpuset */
+ cpu = sched_pickcpu(td);
+ ts->ts_runq = &runq_pcpu[cpu];
+ single_cpu = 1;
+ CTR3(KTR_RUNQ,
+ "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td,
+ cpu);
+ } else {
+ CTR2(KTR_RUNQ,
+ "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts,
+ td);
+ cpu = NOCPU;
+ ts->ts_runq = &runq;
+ }
+
+ cpuid = PCPU_GET(cpuid);
+ if (single_cpu && cpu != cpuid) {
+ kick_other_cpu(td->td_priority, cpu);
+ } else {
+ if (!single_cpu) {
+ tidlemsk = idle_cpus_mask;
+ CPU_NAND(&tidlemsk, &hlt_cpus_mask);
+ CPU_CLR(cpuid, &tidlemsk);
+
+ if (!CPU_ISSET(cpuid, &idle_cpus_mask) &&
+ ((flags & SRQ_INTR) == 0) &&
+ !CPU_EMPTY(&tidlemsk))
+ forwarded = forward_wakeup(cpu);
+ }
+
+ if (!forwarded) {
+ if ((flags & SRQ_YIELDING) == 0 && maybe_preempt(td))
+ return;
+ else
+ maybe_resched(td);
+ }
+ }
+
+ if ((td->td_flags & TDF_NOLOAD) == 0)
+ sched_load_add();
+ runq_add(ts->ts_runq, td, flags);
+ if (cpu != NOCPU)
+ runq_length[cpu]++;
+}
+#else /* SMP */
+{
+ struct td_sched *ts;
+
+ ts = td->td_sched;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT((td->td_inhibitors == 0),
+ ("sched_add: trying to run inhibited thread"));
+ KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
+ ("sched_add: bad thread state"));
+ KASSERT(td->td_flags & TDF_INMEM,
+ ("sched_add: thread swapped out"));
+ KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
+ "prio:%d", td->td_priority, KTR_ATTR_LINKED,
+ sched_tdname(curthread));
+ KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
+ KTR_ATTR_LINKED, sched_tdname(td));
+ SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
+ flags & SRQ_PREEMPTED);
+
+ /*
+ * Now that the thread is moving to the run-queue, set the lock
+ * to the scheduler's lock.
+ */
+ if (td->td_lock != &sched_lock) {
+ mtx_lock_spin(&sched_lock);
+ thread_lock_set(td, &sched_lock);
+ }
+ TD_SET_RUNQ(td);
+ CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
+ ts->ts_runq = &runq;
+
+ /*
+ * If we are yielding (on the way out anyhow) or the thread
+	 * being saved is US, then don't try to be smart about preemption
+	 * or kicking off another CPU as it won't help and may hinder.
+	 * In the YIELDING case, we are about to run whoever is being
+	 * put in the queue anyhow, and in the OURSELF case, we are
+	 * putting ourselves on the run queue, which also only happens
+ * when we are about to yield.
+ */
+ if ((flags & SRQ_YIELDING) == 0) {
+ if (maybe_preempt(td))
+ return;
+ }
+ if ((td->td_flags & TDF_NOLOAD) == 0)
+ sched_load_add();
+ runq_add(ts->ts_runq, td, flags);
+ maybe_resched(td);
+}
+#endif /* SMP */
+
+void
+sched_rem(struct thread *td)
+{
+ struct td_sched *ts;
+
+ ts = td->td_sched;
+ KASSERT(td->td_flags & TDF_INMEM,
+ ("sched_rem: thread swapped out"));
+ KASSERT(TD_ON_RUNQ(td),
+ ("sched_rem: thread not on run queue"));
+ mtx_assert(&sched_lock, MA_OWNED);
+ KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
+ "prio:%d", td->td_priority, KTR_ATTR_LINKED,
+ sched_tdname(curthread));
+ SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
+
+ if ((td->td_flags & TDF_NOLOAD) == 0)
+ sched_load_rem();
+#ifdef SMP
+ if (ts->ts_runq != &runq)
+ runq_length[ts->ts_runq - runq_pcpu]--;
+#endif
+ runq_remove(ts->ts_runq, td);
+ TD_SET_CAN_RUN(td);
+}
+
+/*
+ * Select threads to run. Note that running threads still consume a
+ * slot.
+ */
+struct thread *
+sched_choose(void)
+{
+ struct thread *td;
+ struct runq *rq;
+
+ mtx_assert(&sched_lock, MA_OWNED);
+#ifdef SMP
+ struct thread *tdcpu;
+
+ rq = &runq;
+ td = runq_choose_fuzz(&runq, runq_fuzz);
+ tdcpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);
+
+ if (td == NULL ||
+ (tdcpu != NULL &&
+ tdcpu->td_priority < td->td_priority)) {
+ CTR2(KTR_RUNQ, "choosing td %p from pcpu runq %d", tdcpu,
+ PCPU_GET(cpuid));
+ td = tdcpu;
+ rq = &runq_pcpu[PCPU_GET(cpuid)];
+ } else {
+ CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", td);
+ }
+
+#else
+ rq = &runq;
+ td = runq_choose(&runq);
+#endif
+
+ if (td) {
+#ifdef SMP
+ if (td == tdcpu)
+ runq_length[PCPU_GET(cpuid)]--;
+#endif
+ runq_remove(rq, td);
+ td->td_flags |= TDF_DIDRUN;
+
+ KASSERT(td->td_flags & TDF_INMEM,
+ ("sched_choose: thread swapped out"));
+ return (td);
+ }
+ return (PCPU_GET(idlethread));
+}
+
+void
+sched_preempt(struct thread *td)
+{
+
+ SDT_PROBE2(sched, , , surrender, td, td->td_proc);
+ thread_lock(td);
+ if (td->td_critnest > 1)
+ td->td_owepreempt = 1;
+ else
+ mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, NULL);
+ thread_unlock(td);
+}
+
+void
+sched_userret(struct thread *td)
+{
+ /*
+ * XXX we cheat slightly on the locking here to avoid locking in
+ * the usual case. Setting td_priority here is essentially an
+ * incomplete workaround for not setting it properly elsewhere.
+ * Now that some interrupt handlers are threads, not setting it
+ * properly elsewhere can clobber it in the window between setting
+ * it here and returning to user mode, so don't waste time setting
+ * it perfectly here.
+ */
+ KASSERT((td->td_flags & TDF_BORROWING) == 0,
+ ("thread with borrowed priority returning to userland"));
+ if (td->td_priority != td->td_user_pri) {
+ thread_lock(td);
+ td->td_priority = td->td_user_pri;
+ td->td_base_pri = td->td_user_pri;
+ thread_unlock(td);
+ }
+}
+
+void
+sched_bind(struct thread *td, int cpu)
+{
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
+ KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
+
+ ts = td->td_sched;
+
+ td->td_flags |= TDF_BOUND;
+#ifdef SMP
+ ts->ts_runq = &runq_pcpu[cpu];
+ if (PCPU_GET(cpuid) == cpu)
+ return;
+
+ mi_switch(SW_VOL, NULL);
+#endif
+}
+
+void
+sched_unbind(struct thread* td)
+{
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT(td == curthread, ("sched_unbind: can only bind curthread"));
+ td->td_flags &= ~TDF_BOUND;
+}
+
+int
+sched_is_bound(struct thread *td)
+{
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ return (td->td_flags & TDF_BOUND);
+}
+
+void
+sched_relinquish(struct thread *td)
+{
+ thread_lock(td);
+ mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
+ thread_unlock(td);
+}
+
+int
+sched_load(void)
+{
+ return (sched_tdcnt);
+}
+
+int
+sched_sizeof_proc(void)
+{
+ return (sizeof(struct proc));
+}
+
+int
+sched_sizeof_thread(void)
+{
+ return (sizeof(struct thread) + sizeof(struct td_sched));
+}
+
+fixpt_t
+sched_pctcpu(struct thread *td)
+{
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
+ return (ts->ts_pctcpu);
+}
+
+#ifdef RACCT
+/*
+ * Calculates the contribution to the thread cpu usage for the latest
+ * (unfinished) second.
+ */
+fixpt_t
+sched_pctcpu_delta(struct thread *td)
+{
+ struct td_sched *ts;
+ fixpt_t delta;
+ int realstathz;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
+ delta = 0;
+ realstathz = stathz ? stathz : hz;
+ if (ts->ts_cpticks != 0) {
+#if (FSHIFT >= CCPU_SHIFT)
+ delta = (realstathz == 100)
+ ? ((fixpt_t) ts->ts_cpticks) <<
+ (FSHIFT - CCPU_SHIFT) :
+ 100 * (((fixpt_t) ts->ts_cpticks)
+ << (FSHIFT - CCPU_SHIFT)) / realstathz;
+#else
+ delta = ((FSCALE - ccpu) *
+ (ts->ts_cpticks *
+ FSCALE / realstathz)) >> FSHIFT;
+#endif
+ }
+
+ return (delta);
+}
+#endif
+
+void
+sched_tick(int cnt)
+{
+}
+
+/*
+ * The actual idle process.
+ */
+void
+sched_idletd(void *dummy)
+{
+ struct pcpuidlestat *stat;
+
+ THREAD_NO_SLEEPING();
+ stat = DPCPU_PTR(idlestat);
+ for (;;) {
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ while (sched_runnable() == 0) {
+ cpu_idle(stat->idlecalls + stat->oldidlecalls > 64);
+ stat->idlecalls++;
+ }
+
+ mtx_lock_spin(&sched_lock);
+ mi_switch(SW_VOL | SWT_IDLE, NULL);
+ mtx_unlock_spin(&sched_lock);
+ }
+}
+
+/*
+ * A CPU is entering the scheduler for the first time or a thread is exiting.
+ */
+void
+sched_throw(struct thread *td)
+{
+ /*
+ * Correct spinlock nesting. The idle thread context that we are
+ * borrowing was created so that it would start out with a single
+ * spin lock (sched_lock) held in fork_trampoline(). Since we've
+ * explicitly acquired locks in this function, the nesting count
+ * is now 2 rather than 1. Since we are nested, calling
+ * spinlock_exit() will simply adjust the counts without allowing
+ * spin lock using code to interrupt us.
+ */
+ if (td == NULL) {
+ mtx_lock_spin(&sched_lock);
+ spinlock_exit();
+ PCPU_SET(switchtime, cpu_ticks());
+ PCPU_SET(switchticks, ticks);
+ } else {
+ lock_profile_release_lock(&sched_lock.lock_object);
+ MPASS(td->td_lock == &sched_lock);
+ }
+ mtx_assert(&sched_lock, MA_OWNED);
+ KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
+ cpu_throw(td, choosethread()); /* doesn't return */
+}
+
+void
+sched_fork_exit(struct thread *td)
+{
+
+ /*
+ * Finish setting up thread glue so that it begins execution in a
+ * non-nested critical section with sched_lock held but not recursed.
+ */
+ td->td_oncpu = PCPU_GET(cpuid);
+ sched_lock.mtx_lock = (uintptr_t)td;
+ lock_profile_obtain_lock_success(&sched_lock.lock_object,
+ 0, 0, __FILE__, __LINE__);
+ THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
+}
+
+char *
+sched_tdname(struct thread *td)
+{
+#ifdef KTR
+ struct td_sched *ts;
+
+ ts = td->td_sched;
+ if (ts->ts_name[0] == '\0')
+ snprintf(ts->ts_name, sizeof(ts->ts_name),
+ "%s tid %d", td->td_name, td->td_tid);
+ return (ts->ts_name);
+#else
+ return (td->td_name);
+#endif
+}
+
+#ifdef KTR
+void
+sched_clear_tdname(struct thread *td)
+{
+ struct td_sched *ts;
+
+ ts = td->td_sched;
+ ts->ts_name[0] = '\0';
+}
+#endif
+
+void
+sched_affinity(struct thread *td)
+{
+#ifdef SMP
+ struct td_sched *ts;
+ int cpu;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ /*
+ * Set the TSF_AFFINITY flag if there is at least one CPU this
+ * thread can't run on.
+ */
+ ts = td->td_sched;
+ ts->ts_flags &= ~TSF_AFFINITY;
+ CPU_FOREACH(cpu) {
+ if (!THREAD_CAN_SCHED(td, cpu)) {
+ ts->ts_flags |= TSF_AFFINITY;
+ break;
+ }
+ }
+
+ /*
+ * If this thread can run on all CPUs, nothing else to do.
+ */
+ if (!(ts->ts_flags & TSF_AFFINITY))
+ return;
+
+ /* Pinned threads and bound threads should be left alone. */
+ if (td->td_pinned != 0 || td->td_flags & TDF_BOUND)
+ return;
+
+ switch (td->td_state) {
+ case TDS_RUNQ:
+ /*
+ * If we are on a per-CPU runqueue that is in the set,
+ * then nothing needs to be done.
+ */
+ if (ts->ts_runq != &runq &&
+ THREAD_CAN_SCHED(td, ts->ts_runq - runq_pcpu))
+ return;
+
+ /* Put this thread on a valid per-CPU runqueue. */
+ sched_rem(td);
+ sched_add(td, SRQ_BORING);
+ break;
+ case TDS_RUNNING:
+ /*
+ * See if our current CPU is in the set. If not, force a
+ * context switch.
+ */
+ if (THREAD_CAN_SCHED(td, td->td_oncpu))
+ return;
+
+ td->td_flags |= TDF_NEEDRESCHED;
+ if (td != curthread)
+ ipi_cpu(cpu, IPI_AST);
+ break;
+ default:
+ break;
+ }
+#endif
+}
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
new file mode 100644
index 0000000..cba9d80
--- /dev/null
+++ b/sys/kern/sched_ule.c
@@ -0,0 +1,2911 @@
+/*-
+ * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This file implements the ULE scheduler. ULE supports independent CPU
+ * run queues and fine grain locking. It has superior interactive
+ * performance under load even on uni-processor systems.
+ *
+ * etymology:
+ * ULE is the last three letters in schedule. It owes its name to a
+ * generic user created for a scheduling system by Paul Mikesell at
+ * Isilon Systems and a general lack of creativity on the part of the author.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_hwpmc_hooks.h"
+#include "opt_kdtrace.h"
+#include "opt_sched.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resource.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/smp.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/turnstile.h>
+#include <sys/umtx.h>
+#include <sys/vmmeter.h>
+#include <sys/cpuset.h>
+#include <sys/sbuf.h>
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+int dtrace_vtime_active;
+dtrace_vtime_switch_func_t dtrace_vtime_switch_func;
+#endif
+
+#include <machine/cpu.h>
+#include <machine/smp.h>
+
+#if defined(__powerpc__) && defined(BOOKE_E500)
+#error "This architecture is not currently compatible with ULE"
+#endif
+
+#define KTR_ULE 0
+
+#define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
+#define TDQ_NAME_LEN (sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU)))
+#define TDQ_LOADNAME_LEN (sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load"))
+
+/*
+ * Thread scheduler specific section. All fields are protected
+ * by the thread lock.
+ */
+struct td_sched {
+ struct runq *ts_runq; /* Run-queue we're queued on. */
+ short ts_flags; /* TSF_* flags. */
+ u_char ts_cpu; /* CPU that we have affinity for. */
+ int ts_rltick; /* Real last tick, for affinity. */
+ int ts_slice; /* Ticks of slice remaining. */
+ u_int ts_slptime; /* Number of ticks we vol. slept */
+ u_int ts_runtime; /* Number of ticks we were running */
+ int ts_ltick; /* Last tick that we were running on */
+ int ts_ftick; /* First tick that we were running on */
+ int ts_ticks; /* Tick count */
+#ifdef KTR
+ char ts_name[TS_NAME_LEN];
+#endif
+};
+/* flags kept in ts_flags */
+#define TSF_BOUND 0x0001 /* Thread can not migrate. */
+#define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */
+
+static struct td_sched td_sched0;
+
+#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0)
+#define THREAD_CAN_SCHED(td, cpu) \
+ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
+
+/*
+ * Priority ranges used for interactive and non-interactive timeshare
+ * threads. The timeshare priorities are split up into four ranges.
+ * The first range handles interactive threads. The last three ranges
+ * (NHALF, x, and NHALF) handle non-interactive threads with the outer
+ * ranges supporting nice values.
+ */
+#define PRI_TIMESHARE_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
+#define PRI_INTERACT_RANGE ((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2)
+#define PRI_BATCH_RANGE (PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE)
+
+#define PRI_MIN_INTERACT PRI_MIN_TIMESHARE
+#define PRI_MAX_INTERACT (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1)
+#define PRI_MIN_BATCH (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE)
+#define PRI_MAX_BATCH PRI_MAX_TIMESHARE
+
+/*
+ * Cpu percentage computation macros and defines.
+ *
+ * SCHED_TICK_SECS: Number of seconds to average the cpu usage across.
+ * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across.
+ * SCHED_TICK_MAX: Maximum number of ticks before scaling back.
+ * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results.
+ * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count.
+ * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks.
+ */
+#define SCHED_TICK_SECS 10
+#define SCHED_TICK_TARG (hz * SCHED_TICK_SECS)
+#define SCHED_TICK_MAX (SCHED_TICK_TARG + hz)
+#define SCHED_TICK_SHIFT 10
+#define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT)
+#define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz))
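+
+/*
+ * E.g. with hz == 1000 the usage window spans SCHED_TICK_TARG == 10000
+ * hz ticks (~10 seconds), ts_ticks is kept scaled by 2^SCHED_TICK_SHIFT
+ * (1024) to preserve precision, and SCHED_TICK_MAX allows roughly one
+ * extra second of accumulation before the counters are scaled back.
+ */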
+
+/*
+ * These macros determine priorities for non-interactive threads. They are
+ * assigned a priority based on their recent cpu utilization as expressed
+ * by the ratio of ticks to the tick total. NHALF priorities at the start
+ * and end of the MIN to MAX timeshare range are only reachable with negative
+ * or positive nice respectively.
+ *
+ * PRI_RANGE: Priority range for utilization dependent priorities.
+ * PRI_NRESV: Number of nice values.
+ * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total.
+ * PRI_NICE: Determines the part of the priority inherited from nice.
+ */
+#define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN)
+#define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2)
+#define SCHED_PRI_MIN (PRI_MIN_BATCH + SCHED_PRI_NHALF)
+#define SCHED_PRI_MAX (PRI_MAX_BATCH - SCHED_PRI_NHALF)
+#define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN + 1)
+#define SCHED_PRI_TICKS(ts) \
+ (SCHED_TICK_HZ((ts)) / \
+ (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
+#define SCHED_PRI_NICE(nice) (nice)
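+
+/*
+ * Roughly speaking, SCHED_PRI_TICKS() maps the ratio of a thread's recent
+ * cpu ticks to the length of its sampling window onto [0, SCHED_PRI_RANGE),
+ * so a fully cpu-bound batch thread drifts toward SCHED_PRI_MAX while a
+ * mostly idle one stays near SCHED_PRI_MIN. SCHED_PRI_NICE() then adds the
+ * nice value on top of that.
+ */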
+
+/*
+ * These determine the interactivity of a process. Interactivity differs from
+ * cpu utilization in that it expresses the voluntary time slept vs time ran
+ * while cpu utilization includes all time not running. This more accurately
+ * models the intent of the thread.
+ *
+ * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate
+ * before throttling back.
+ * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time.
+ * INTERACT_MAX: Maximum interactivity value. Smaller is better.
+ * INTERACT_THRESH: Threshold for placement on the current runq.
+ */
+#define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT)
+#define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT)
+#define SCHED_INTERACT_MAX (100)
+#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2)
+#define SCHED_INTERACT_THRESH (30)
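+
+/*
+ * Roughly: a thread whose voluntary sleep time dominates its run time ends
+ * up with a score well below SCHED_INTERACT_THRESH and is treated as
+ * interactive (and queued in the realtime range), while a cpu hog scores
+ * toward SCHED_INTERACT_MAX and is handled as batch work.
+ */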
+
+/*
+ * These parameters determine the slice behavior for batch work.
+ */
+#define SCHED_SLICE_DEFAULT_DIVISOR 10 /* ~94 ms, 12 stathz ticks. */
+#define SCHED_SLICE_MIN_DIVISOR 6 /* DEFAULT/MIN = ~16 ms. */
+
+/* Flags kept in td_flags. */
+#define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */
+
+/*
+ * tickincr: Converts a stathz tick into a hz domain scaled by
+ * the shift factor. Without the shift the error rate
+ * due to rounding would be unacceptably high.
+ * realstathz:		stathz is sometimes 0, so we run off of hz then.
+ * sched_slice: Runtime of each thread before rescheduling.
+ * preempt_thresh: Priority threshold for preemption and remote IPIs.
+ */
+static int sched_interact = SCHED_INTERACT_THRESH;
+static int tickincr = 8 << SCHED_TICK_SHIFT;
+static int realstathz = 127; /* reset during boot. */
+static int sched_slice = 10; /* reset during boot. */
+static int sched_slice_min = 1; /* reset during boot. */
+#ifdef PREEMPTION
+#ifdef FULL_PREEMPTION
+static int preempt_thresh = PRI_MAX_IDLE;
+#else
+static int preempt_thresh = PRI_MIN_KERN;
+#endif
+#else
+static int preempt_thresh = 0;
+#endif
+static int static_boost = PRI_MIN_BATCH;
+static int sched_idlespins = 10000;
+static int sched_idlespinthresh = -1;
+
+/*
+ * tdq - per processor runqs and statistics. All fields are protected by the
+ * tdq_lock. The load and lowpri may be accessed without the lock to avoid
+ * excess locking in sched_pickcpu().
+ */
+struct tdq {
+ /*
+ * Ordered to improve efficiency of cpu_search() and switch().
+ * tdq_lock is padded to avoid false sharing with tdq_load and
+ * tdq_cpu_idle.
+ */
+ struct mtx_padalign tdq_lock; /* run queue lock. */
+ struct cpu_group *tdq_cg; /* Pointer to cpu topology. */
+ volatile int tdq_load; /* Aggregate load. */
+ volatile int tdq_cpu_idle; /* cpu_idle() is active. */
+ int tdq_sysload; /* For loadavg, !ITHD load. */
+ int tdq_transferable; /* Transferable thread count. */
+ short tdq_switchcnt; /* Switches this tick. */
+ short tdq_oldswitchcnt; /* Switches last tick. */
+ u_char tdq_lowpri; /* Lowest priority thread. */
+ u_char tdq_ipipending; /* IPI pending. */
+ u_char tdq_idx; /* Current insert index. */
+ u_char tdq_ridx; /* Current removal index. */
+ struct runq tdq_realtime; /* real-time run queue. */
+ struct runq tdq_timeshare; /* timeshare run queue. */
+ struct runq tdq_idle; /* Queue of IDLE threads. */
+ char tdq_name[TDQ_NAME_LEN];
+#ifdef KTR
+ char tdq_loadname[TDQ_LOADNAME_LEN];
+#endif
+} __aligned(64);
+
+/* Idle thread states and config. */
+#define TDQ_RUNNING 1
+#define TDQ_IDLE 2
+
+#ifdef SMP
+struct cpu_group *cpu_top; /* CPU topology */
+
+#define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000))
+#define SCHED_AFFINITY(ts, t) ((ts)->ts_rltick > ticks - ((t) * affinity))
+
+/*
+ * Run-time tunables.
+ */
+static int rebalance = 1;
+static int balance_interval = 128; /* Default set in sched_initticks(). */
+static int affinity;
+static int steal_idle = 1;
+static int steal_thresh = 2;
+
+/*
+ * One thread queue per processor.
+ */
+static struct tdq tdq_cpu[MAXCPU];
+static struct tdq *balance_tdq;
+static int balance_ticks;
+static DPCPU_DEFINE(uint32_t, randomval);
+
+#define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)])
+#define TDQ_CPU(x) (&tdq_cpu[(x)])
+#define TDQ_ID(x) ((int)((x) - tdq_cpu))
+#else /* !SMP */
+static struct tdq tdq_cpu;
+
+#define TDQ_ID(x) (0)
+#define TDQ_SELF() (&tdq_cpu)
+#define TDQ_CPU(x) (&tdq_cpu)
+#endif
+
+#define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type))
+#define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t)))
+#define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
+#define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t)))
+#define TDQ_LOCKPTR(t) ((struct mtx *)(&(t)->tdq_lock))
+
+static void sched_priority(struct thread *);
+static void sched_thread_priority(struct thread *, u_char);
+static int sched_interact_score(struct thread *);
+static void sched_interact_update(struct thread *);
+static void sched_interact_fork(struct thread *);
+static void sched_pctcpu_update(struct td_sched *, int);
+
+/* Operations on per processor queues */
+static struct thread *tdq_choose(struct tdq *);
+static void tdq_setup(struct tdq *);
+static void tdq_load_add(struct tdq *, struct thread *);
+static void tdq_load_rem(struct tdq *, struct thread *);
+static __inline void tdq_runq_add(struct tdq *, struct thread *, int);
+static __inline void tdq_runq_rem(struct tdq *, struct thread *);
+static inline int sched_shouldpreempt(int, int, int);
+void tdq_print(int cpu);
+static void runq_print(struct runq *rq);
+static void tdq_add(struct tdq *, struct thread *, int);
+#ifdef SMP
+static int tdq_move(struct tdq *, struct tdq *);
+static int tdq_idled(struct tdq *);
+static void tdq_notify(struct tdq *, struct thread *);
+static struct thread *tdq_steal(struct tdq *, int);
+static struct thread *runq_steal(struct runq *, int);
+static int sched_pickcpu(struct thread *, int);
+static void sched_balance(void);
+static int sched_balance_pair(struct tdq *, struct tdq *);
+static inline struct tdq *sched_setcpu(struct thread *, int, int);
+static inline void thread_unblock_switch(struct thread *, struct mtx *);
+static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);
+static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS);
+static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb,
+ struct cpu_group *cg, int indent);
+#endif
+
+static void sched_setup(void *dummy);
+SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
+
+static void sched_initticks(void *dummy);
+SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
+ NULL);
+
+SDT_PROVIDER_DEFINE(sched);
+
+SDT_PROBE_DEFINE3(sched, , , change_pri, change-pri, "struct thread *",
+ "struct proc *", "uint8_t");
+SDT_PROBE_DEFINE3(sched, , , dequeue, dequeue, "struct thread *",
+ "struct proc *", "void *");
+SDT_PROBE_DEFINE4(sched, , , enqueue, enqueue, "struct thread *",
+ "struct proc *", "void *", "int");
+SDT_PROBE_DEFINE4(sched, , , lend_pri, lend-pri, "struct thread *",
+ "struct proc *", "uint8_t", "struct thread *");
+SDT_PROBE_DEFINE2(sched, , , load_change, load-change, "int", "int");
+SDT_PROBE_DEFINE2(sched, , , off_cpu, off-cpu, "struct thread *",
+ "struct proc *");
+SDT_PROBE_DEFINE(sched, , , on_cpu, on-cpu);
+SDT_PROBE_DEFINE(sched, , , remain_cpu, remain-cpu);
+SDT_PROBE_DEFINE2(sched, , , surrender, surrender, "struct thread *",
+ "struct proc *");
+
+/*
+ * Print the threads waiting on a run-queue.
+ */
+static void
+runq_print(struct runq *rq)
+{
+ struct rqhead *rqh;
+ struct thread *td;
+ int pri;
+ int j;
+ int i;
+
+ for (i = 0; i < RQB_LEN; i++) {
+ printf("\t\trunq bits %d 0x%zx\n",
+ i, rq->rq_status.rqb_bits[i]);
+ for (j = 0; j < RQB_BPW; j++)
+ if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
+ pri = j + (i << RQB_L2BPW);
+ rqh = &rq->rq_queues[pri];
+ TAILQ_FOREACH(td, rqh, td_runq) {
+ printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
+ td, td->td_name, td->td_priority,
+ td->td_rqindex, pri);
+ }
+ }
+ }
+}
+
+/*
+ * Print the status of a per-cpu thread queue. Should be a ddb show cmd.
+ */
+void
+tdq_print(int cpu)
+{
+ struct tdq *tdq;
+
+ tdq = TDQ_CPU(cpu);
+
+ printf("tdq %d:\n", TDQ_ID(tdq));
+ printf("\tlock %p\n", TDQ_LOCKPTR(tdq));
+ printf("\tLock name: %s\n", tdq->tdq_name);
+ printf("\tload: %d\n", tdq->tdq_load);
+ printf("\tswitch cnt: %d\n", tdq->tdq_switchcnt);
+ printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt);
+ printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
+ printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
+ printf("\tload transferable: %d\n", tdq->tdq_transferable);
+ printf("\tlowest priority: %d\n", tdq->tdq_lowpri);
+ printf("\trealtime runq:\n");
+ runq_print(&tdq->tdq_realtime);
+ printf("\ttimeshare runq:\n");
+ runq_print(&tdq->tdq_timeshare);
+ printf("\tidle runq:\n");
+ runq_print(&tdq->tdq_idle);
+}
+
+static inline int
+sched_shouldpreempt(int pri, int cpri, int remote)
+{
+ /*
+ * If the new priority is not better than the current priority there is
+ * nothing to do.
+ */
+ if (pri >= cpri)
+ return (0);
+ /*
+ * Always preempt idle.
+ */
+ if (cpri >= PRI_MIN_IDLE)
+ return (1);
+ /*
+ * If preemption is disabled don't preempt others.
+ */
+ if (preempt_thresh == 0)
+ return (0);
+ /*
+ * Preempt if we exceed the threshold.
+ */
+ if (pri <= preempt_thresh)
+ return (1);
+ /*
+	 * If we're interactive or better and a non-interactive or worse
+	 * thread is running, preempt only on remote processors.
+ */
+ if (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT)
+ return (1);
+ return (0);
+}
+
+/*
+ * Add a thread to the actual run-queue. Keeps transferable counts up to
+ * date with what is actually on the run-queue. Selects the correct
+ * queue position for timeshare threads.
+ */
+static __inline void
+tdq_runq_add(struct tdq *tdq, struct thread *td, int flags)
+{
+ struct td_sched *ts;
+ u_char pri;
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ pri = td->td_priority;
+ ts = td->td_sched;
+ TD_SET_RUNQ(td);
+ if (THREAD_CAN_MIGRATE(td)) {
+ tdq->tdq_transferable++;
+ ts->ts_flags |= TSF_XFERABLE;
+ }
+ if (pri < PRI_MIN_BATCH) {
+ ts->ts_runq = &tdq->tdq_realtime;
+ } else if (pri <= PRI_MAX_BATCH) {
+ ts->ts_runq = &tdq->tdq_timeshare;
+ KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH,
+ ("Invalid priority %d on timeshare runq", pri));
+ /*
+		 * This queue contains only priorities between PRI_MIN_BATCH
+		 * and PRI_MAX_BATCH.  Use the whole range of queue indices
+		 * to represent these values.
+ */
+ if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
+ pri = RQ_NQS * (pri - PRI_MIN_BATCH) / PRI_BATCH_RANGE;
+ pri = (pri + tdq->tdq_idx) % RQ_NQS;
+ /*
+ * This effectively shortens the queue by one so we
+ * can have a one slot difference between idx and
+ * ridx while we wait for threads to drain.
+ */
+ if (tdq->tdq_ridx != tdq->tdq_idx &&
+ pri == tdq->tdq_ridx)
+ pri = (unsigned char)(pri - 1) % RQ_NQS;
+ } else
+ pri = tdq->tdq_ridx;
+ runq_add_pri(ts->ts_runq, td, pri, flags);
+ return;
+ } else
+ ts->ts_runq = &tdq->tdq_idle;
+ runq_add(ts->ts_runq, td, flags);
+}
+
+/*
+ * Remove a thread from a run-queue. This typically happens when a thread
+ * is selected to run. Running threads are not on the queue and the
+ * transferable count does not reflect them.
+ */
+static __inline void
+tdq_runq_rem(struct tdq *tdq, struct thread *td)
+{
+ struct td_sched *ts;
+
+ ts = td->td_sched;
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ KASSERT(ts->ts_runq != NULL,
+ ("tdq_runq_remove: thread %p null ts_runq", td));
+ if (ts->ts_flags & TSF_XFERABLE) {
+ tdq->tdq_transferable--;
+ ts->ts_flags &= ~TSF_XFERABLE;
+ }
+ if (ts->ts_runq == &tdq->tdq_timeshare) {
+ if (tdq->tdq_idx != tdq->tdq_ridx)
+ runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx);
+ else
+ runq_remove_idx(ts->ts_runq, td, NULL);
+ } else
+ runq_remove(ts->ts_runq, td);
+}
+
+/*
+ * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load
+ * for this thread to the referenced thread queue.
+ */
+static void
+tdq_load_add(struct tdq *tdq, struct thread *td)
+{
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ tdq->tdq_load++;
+ if ((td->td_flags & TDF_NOLOAD) == 0)
+ tdq->tdq_sysload++;
+ KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
+ SDT_PROBE2(sched, , , load_change, (int)TDQ_ID(tdq), tdq->tdq_load);
+}
+
+/*
+ * Remove the load from a thread that is transitioning to a sleep state or
+ * exiting.
+ */
+static void
+tdq_load_rem(struct tdq *tdq, struct thread *td)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ KASSERT(tdq->tdq_load != 0,
+ ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
+
+ tdq->tdq_load--;
+ if ((td->td_flags & TDF_NOLOAD) == 0)
+ tdq->tdq_sysload--;
+ KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
+ SDT_PROBE2(sched, , , load_change, (int)TDQ_ID(tdq), tdq->tdq_load);
+}
+
+/*
+ * Bound timeshare latency by decreasing slice size as load increases. We
+ * consider the maximum latency as the sum of the threads waiting to run
+ * aside from curthread and target no more than sched_slice latency but
+ * no less than sched_slice_min runtime.
+ */
+static inline int
+tdq_slice(struct tdq *tdq)
+{
+ int load;
+
+ /*
+	 * It is safe to use tdq_sysload here because this is called from
+ * contexts where timeshare threads are running and so there
+ * cannot be higher priority load in the system.
+ */
+ load = tdq->tdq_sysload - 1;
+ if (load >= SCHED_SLICE_MIN_DIVISOR)
+ return (sched_slice_min);
+ if (load <= 1)
+ return (sched_slice);
+ return (sched_slice / load);
+}
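+
+/*
+ * Illustration of the scaling above: when little else is runnable
+ * (tdq_sysload - 1 <= 1) a thread receives the full sched_slice; with
+ * N runnable timeshare threads it receives roughly sched_slice / N;
+ * and once N reaches SCHED_SLICE_MIN_DIVISOR the slice bottoms out at
+ * sched_slice_min so each thread still gets a useful amount of
+ * runtime before its slice expires.
+ */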
+
+/*
+ * Set lowpri to its exact value by searching the run-queue and
+ * evaluating curthread. curthread may be passed as an optimization.
+ */
+static void
+tdq_setlowpri(struct tdq *tdq, struct thread *ctd)
+{
+ struct thread *td;
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ if (ctd == NULL)
+ ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread;
+ td = tdq_choose(tdq);
+ if (td == NULL || td->td_priority > ctd->td_priority)
+ tdq->tdq_lowpri = ctd->td_priority;
+ else
+ tdq->tdq_lowpri = td->td_priority;
+}
+
+#ifdef SMP
+struct cpu_search {
+ cpuset_t cs_mask;
+ u_int cs_prefer;
+ int cs_pri; /* Min priority for low. */
+ int cs_limit; /* Max load for low, min load for high. */
+ int cs_cpu;
+ int cs_load;
+};
+
+#define CPU_SEARCH_LOWEST 0x1
+#define CPU_SEARCH_HIGHEST 0x2
+#define CPU_SEARCH_BOTH (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST)
+
+#define CPUSET_FOREACH(cpu, mask) \
+ for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++) \
+ if (CPU_ISSET(cpu, &mask))
+
+static __inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low,
+ struct cpu_search *high, const int match);
+int cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low);
+int cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high);
+int cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
+ struct cpu_search *high);
+
+/*
+ * Search the tree of cpu_groups for the lowest or highest loaded cpu
+ * according to the match argument. This routine actually compares the
+ * load on all paths through the tree and finds the least loaded cpu on
+ * the least loaded path, which may differ from the least loaded cpu in
+ * the system. This balances work among caches and busses.
+ *
+ * This inline is instantiated in three forms below using constants for the
+ * match argument. It is reduced to the minimum set for each case. It is
+ * also recursive to the depth of the tree.
+ */
+static __inline int
+cpu_search(const struct cpu_group *cg, struct cpu_search *low,
+ struct cpu_search *high, const int match)
+{
+ struct cpu_search lgroup;
+ struct cpu_search hgroup;
+ cpuset_t cpumask;
+ struct cpu_group *child;
+ struct tdq *tdq;
+ int cpu, i, hload, lload, load, total, rnd, *rndptr;
+
+ total = 0;
+ cpumask = cg->cg_mask;
+ if (match & CPU_SEARCH_LOWEST) {
+ lload = INT_MAX;
+ lgroup = *low;
+ }
+ if (match & CPU_SEARCH_HIGHEST) {
+ hload = INT_MIN;
+ hgroup = *high;
+ }
+
+ /* Iterate through the child CPU groups and then remaining CPUs. */
+ for (i = cg->cg_children, cpu = mp_maxid; ; ) {
+ if (i == 0) {
+#ifdef HAVE_INLINE_FFSL
+ cpu = CPU_FFS(&cpumask) - 1;
+#else
+ while (cpu >= 0 && !CPU_ISSET(cpu, &cpumask))
+ cpu--;
+#endif
+ if (cpu < 0)
+ break;
+ child = NULL;
+ } else
+ child = &cg->cg_child[i - 1];
+
+ if (match & CPU_SEARCH_LOWEST)
+ lgroup.cs_cpu = -1;
+ if (match & CPU_SEARCH_HIGHEST)
+ hgroup.cs_cpu = -1;
+ if (child) { /* Handle child CPU group. */
+ CPU_NAND(&cpumask, &child->cg_mask);
+ switch (match) {
+ case CPU_SEARCH_LOWEST:
+ load = cpu_search_lowest(child, &lgroup);
+ break;
+ case CPU_SEARCH_HIGHEST:
+ load = cpu_search_highest(child, &hgroup);
+ break;
+ case CPU_SEARCH_BOTH:
+ load = cpu_search_both(child, &lgroup, &hgroup);
+ break;
+ }
+ } else { /* Handle child CPU. */
+ CPU_CLR(cpu, &cpumask);
+ tdq = TDQ_CPU(cpu);
+ load = tdq->tdq_load * 256;
+ rndptr = DPCPU_PTR(randomval);
+ rnd = (*rndptr = *rndptr * 69069 + 5) >> 26;
+ if (match & CPU_SEARCH_LOWEST) {
+ if (cpu == low->cs_prefer)
+ load -= 64;
+			/* If this CPU is allowed, record its load. */
+ if (tdq->tdq_lowpri > lgroup.cs_pri &&
+ tdq->tdq_load <= lgroup.cs_limit &&
+ CPU_ISSET(cpu, &lgroup.cs_mask)) {
+ lgroup.cs_cpu = cpu;
+ lgroup.cs_load = load - rnd;
+ }
+ }
+ if (match & CPU_SEARCH_HIGHEST)
+ if (tdq->tdq_load >= hgroup.cs_limit &&
+ tdq->tdq_transferable &&
+ CPU_ISSET(cpu, &hgroup.cs_mask)) {
+ hgroup.cs_cpu = cpu;
+ hgroup.cs_load = load - rnd;
+ }
+ }
+ total += load;
+
+		/* We have info for this child; compare it with the best so far. */
+ if (match & CPU_SEARCH_LOWEST) {
+ if (lgroup.cs_cpu >= 0 &&
+ (load < lload ||
+ (load == lload && lgroup.cs_load < low->cs_load))) {
+ lload = load;
+ low->cs_cpu = lgroup.cs_cpu;
+ low->cs_load = lgroup.cs_load;
+ }
+ }
+ if (match & CPU_SEARCH_HIGHEST)
+ if (hgroup.cs_cpu >= 0 &&
+ (load > hload ||
+ (load == hload && hgroup.cs_load > high->cs_load))) {
+ hload = load;
+ high->cs_cpu = hgroup.cs_cpu;
+ high->cs_load = hgroup.cs_load;
+ }
+ if (child) {
+ i--;
+ if (i == 0 && CPU_EMPTY(&cpumask))
+ break;
+ }
+#ifndef HAVE_INLINE_FFSL
+ else
+ cpu--;
+#endif
+ }
+ return (total);
+}
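+
+/*
+ * Note on the weighting above: each CPU contributes tdq_load * 256 to
+ * the path total, the caller's preferred CPU is discounted by 64 so
+ * affinity wins close calls, and a small per-CPU pseudo-random value
+ * (a linear congruential step reduced to a few bits) is subtracted so
+ * that ties between equally loaded CPUs are broken differently from
+ * search to search.
+ */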
+
+/*
+ * cpu_search instantiations must pass constants to maintain the inline
+ * optimization.
+ */
+int
+cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low)
+{
+ return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
+}
+
+int
+cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high)
+{
+ return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
+}
+
+int
+cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
+ struct cpu_search *high)
+{
+ return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
+}
+
+/*
+ * Find the cpu with the least load via the least loaded path that has a
+ * lowpri greater than pri.  A pri of -1 indicates any priority is
+ * acceptable.
+ */
+static inline int
+sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload,
+ int prefer)
+{
+ struct cpu_search low;
+
+ low.cs_cpu = -1;
+ low.cs_prefer = prefer;
+ low.cs_mask = mask;
+ low.cs_pri = pri;
+ low.cs_limit = maxload;
+ cpu_search_lowest(cg, &low);
+ return low.cs_cpu;
+}
+
+/*
+ * Find the cpu with the highest load via the highest loaded path.
+ */
+static inline int
+sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload)
+{
+ struct cpu_search high;
+
+ high.cs_cpu = -1;
+ high.cs_mask = mask;
+ high.cs_limit = minload;
+ cpu_search_highest(cg, &high);
+ return high.cs_cpu;
+}
+
+/*
+ * Simultaneously find the highest and lowest loaded cpu reachable via
+ * cg.
+ */
+static inline void
+sched_both(const struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
+{
+ struct cpu_search high;
+ struct cpu_search low;
+
+ low.cs_cpu = -1;
+ low.cs_prefer = -1;
+ low.cs_pri = -1;
+ low.cs_limit = INT_MAX;
+ low.cs_mask = mask;
+ high.cs_cpu = -1;
+ high.cs_limit = -1;
+ high.cs_mask = mask;
+ cpu_search_both(cg, &low, &high);
+ *lowcpu = low.cs_cpu;
+ *highcpu = high.cs_cpu;
+ return;
+}
+
+static void
+sched_balance_group(struct cpu_group *cg)
+{
+ cpuset_t hmask, lmask;
+ int high, low, anylow;
+
+ CPU_FILL(&hmask);
+ for (;;) {
+ high = sched_highest(cg, hmask, 1);
+		/* Stop if there is no more CPU with transferable threads. */
+ if (high == -1)
+ break;
+ CPU_CLR(high, &hmask);
+ CPU_COPY(&hmask, &lmask);
+ /* Stop if there is no more CPU left for low. */
+ if (CPU_EMPTY(&lmask))
+ break;
+ anylow = 1;
+nextlow:
+ low = sched_lowest(cg, lmask, -1,
+ TDQ_CPU(high)->tdq_load - 1, high);
+		/* Stop if a full search found no less loaded CPU. */
+ if (anylow && low == -1)
+ break;
+ /* Go to next high if we found no less loaded CPU. */
+ if (low == -1)
+ continue;
+ /* Transfer thread from high to low. */
+ if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) {
+ /* CPU that got thread can no longer be a donor. */
+ CPU_CLR(low, &hmask);
+ } else {
+ /*
+			 * If the move failed, then there are no threads on
+			 * high that can run on this low CPU.  Drop it from
+			 * the low mask and look for a different one.
+ */
+ CPU_CLR(low, &lmask);
+ anylow = 0;
+ goto nextlow;
+ }
+ }
+}
+
+static void
+sched_balance(void)
+{
+ struct tdq *tdq;
+
+ /*
+ * Select a random time between .5 * balance_interval and
+ * 1.5 * balance_interval.
+ */
+ balance_ticks = max(balance_interval / 2, 1);
+ balance_ticks += random() % balance_interval;
+ if (smp_started == 0 || rebalance == 0)
+ return;
+ tdq = TDQ_SELF();
+ TDQ_UNLOCK(tdq);
+ sched_balance_group(cpu_top);
+ TDQ_LOCK(tdq);
+}
+
+/*
+ * Lock two thread queues using their address to maintain lock order.
+ */
+static void
+tdq_lock_pair(struct tdq *one, struct tdq *two)
+{
+ if (one < two) {
+ TDQ_LOCK(one);
+ TDQ_LOCK_FLAGS(two, MTX_DUPOK);
+ } else {
+ TDQ_LOCK(two);
+ TDQ_LOCK_FLAGS(one, MTX_DUPOK);
+ }
+}
+
+/*
+ * Unlock two thread queues. Order is not important here.
+ */
+static void
+tdq_unlock_pair(struct tdq *one, struct tdq *two)
+{
+ TDQ_UNLOCK(one);
+ TDQ_UNLOCK(two);
+}
+
+/*
+ * Transfer load between two imbalanced thread queues.
+ */
+static int
+sched_balance_pair(struct tdq *high, struct tdq *low)
+{
+ int moved;
+ int cpu;
+
+ tdq_lock_pair(high, low);
+ moved = 0;
+ /*
+ * Determine what the imbalance is and then adjust that to how many
+ * threads we actually have to give up (transferable).
+ */
+ if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load &&
+ (moved = tdq_move(high, low)) > 0) {
+ /*
+ * In case the target isn't the current cpu IPI it to force a
+ * reschedule with the new workload.
+ */
+ cpu = TDQ_ID(low);
+ if (cpu != PCPU_GET(cpuid))
+ ipi_cpu(cpu, IPI_PREEMPT);
+ }
+ tdq_unlock_pair(high, low);
+ return (moved);
+}
+
+/*
+ * Move a thread from one thread queue to another.
+ */
+static int
+tdq_move(struct tdq *from, struct tdq *to)
+{
+ struct td_sched *ts;
+ struct thread *td;
+ struct tdq *tdq;
+ int cpu;
+
+ TDQ_LOCK_ASSERT(from, MA_OWNED);
+ TDQ_LOCK_ASSERT(to, MA_OWNED);
+
+ tdq = from;
+ cpu = TDQ_ID(to);
+ td = tdq_steal(tdq, cpu);
+ if (td == NULL)
+ return (0);
+ ts = td->td_sched;
+ /*
+ * Although the run queue is locked the thread may be blocked. Lock
+ * it to clear this and acquire the run-queue lock.
+ */
+ thread_lock(td);
+ /* Drop recursive lock on from acquired via thread_lock(). */
+ TDQ_UNLOCK(from);
+ sched_rem(td);
+ ts->ts_cpu = cpu;
+ td->td_lock = TDQ_LOCKPTR(to);
+ tdq_add(to, td, SRQ_YIELDING);
+ return (1);
+}
+
+/*
+ * This tdq has idled. Try to steal a thread from another cpu and switch
+ * to it.
+ */
+static int
+tdq_idled(struct tdq *tdq)
+{
+ struct cpu_group *cg;
+ struct tdq *steal;
+ cpuset_t mask;
+ int thresh;
+ int cpu;
+
+ if (smp_started == 0 || steal_idle == 0)
+ return (1);
+ CPU_FILL(&mask);
+ CPU_CLR(PCPU_GET(cpuid), &mask);
+ /* We don't want to be preempted while we're iterating. */
+ spinlock_enter();
+ for (cg = tdq->tdq_cg; cg != NULL; ) {
+ if ((cg->cg_flags & CG_FLAG_THREAD) == 0)
+ thresh = steal_thresh;
+ else
+ thresh = 1;
+ cpu = sched_highest(cg, mask, thresh);
+ if (cpu == -1) {
+ cg = cg->cg_parent;
+ continue;
+ }
+ steal = TDQ_CPU(cpu);
+ CPU_CLR(cpu, &mask);
+ tdq_lock_pair(tdq, steal);
+ if (steal->tdq_load < thresh || steal->tdq_transferable == 0) {
+ tdq_unlock_pair(tdq, steal);
+ continue;
+ }
+ /*
+ * If a thread was added while interrupts were disabled don't
+ * steal one here. If we fail to acquire one due to affinity
+ * restrictions loop again with this cpu removed from the
+ * set.
+ */
+ if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) {
+ tdq_unlock_pair(tdq, steal);
+ continue;
+ }
+ spinlock_exit();
+ TDQ_UNLOCK(steal);
+ mi_switch(SW_VOL | SWT_IDLE, NULL);
+ thread_unlock(curthread);
+
+ return (0);
+ }
+ spinlock_exit();
+ return (1);
+}
+
+/*
+ * Notify a remote cpu of new work. Sends an IPI if criteria are met.
+ */
+static void
+tdq_notify(struct tdq *tdq, struct thread *td)
+{
+ struct thread *ctd;
+ int pri;
+ int cpu;
+
+ if (tdq->tdq_ipipending)
+ return;
+ cpu = td->td_sched->ts_cpu;
+ pri = td->td_priority;
+ ctd = pcpu_find(cpu)->pc_curthread;
+ if (!sched_shouldpreempt(pri, ctd->td_priority, 1))
+ return;
+ if (TD_IS_IDLETHREAD(ctd)) {
+ /*
+ * If the MD code has an idle wakeup routine try that before
+ * falling back to IPI.
+ */
+ if (!tdq->tdq_cpu_idle || cpu_idle_wakeup(cpu))
+ return;
+ }
+ tdq->tdq_ipipending = 1;
+ ipi_cpu(cpu, IPI_PREEMPT);
+}
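+
+/*
+ * To recap the checks above: no IPI is sent when one is already
+ * pending for the target, when the preemption test fails against the
+ * remote curthread, or when the remote CPU is idle but either has not
+ * yet entered its low-power idle path (tdq_cpu_idle is clear) or the
+ * MD idle wakeup hook already handled it; in those cases the idle
+ * loop will notice the new load on its own.
+ */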
+
+/*
+ * Steals load from a timeshare queue. Honors the rotating queue head
+ * index.
+ */
+static struct thread *
+runq_steal_from(struct runq *rq, int cpu, u_char start)
+{
+ struct rqbits *rqb;
+ struct rqhead *rqh;
+ struct thread *td, *first;
+ int bit;
+ int pri;
+ int i;
+
+ rqb = &rq->rq_status;
+	bit = start & (RQB_BPW - 1);
+ pri = 0;
+ first = NULL;
+again:
+ for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
+ if (rqb->rqb_bits[i] == 0)
+ continue;
+ if (bit != 0) {
+ for (pri = bit; pri < RQB_BPW; pri++)
+ if (rqb->rqb_bits[i] & (1ul << pri))
+ break;
+ if (pri >= RQB_BPW)
+ continue;
+ } else
+ pri = RQB_FFS(rqb->rqb_bits[i]);
+ pri += (i << RQB_L2BPW);
+ rqh = &rq->rq_queues[pri];
+ TAILQ_FOREACH(td, rqh, td_runq) {
+ if (first && THREAD_CAN_MIGRATE(td) &&
+ THREAD_CAN_SCHED(td, cpu))
+ return (td);
+ first = td;
+ }
+ }
+ if (start != 0) {
+ start = 0;
+ goto again;
+ }
+
+ if (first && THREAD_CAN_MIGRATE(first) &&
+ THREAD_CAN_SCHED(first, cpu))
+ return (first);
+ return (NULL);
+}
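+
+/*
+ * Note that the search above never returns the first thread it
+ * examines right away: it records it in 'first' and prefers a later
+ * queued thread that is both migratable and allowed on the stealing
+ * CPU, falling back to whatever 'first' holds only when nothing else
+ * qualifies.
+ */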
+
+/*
+ * Steals load from a standard linear queue.
+ */
+static struct thread *
+runq_steal(struct runq *rq, int cpu)
+{
+ struct rqhead *rqh;
+ struct rqbits *rqb;
+ struct thread *td;
+ int word;
+ int bit;
+
+ rqb = &rq->rq_status;
+ for (word = 0; word < RQB_LEN; word++) {
+ if (rqb->rqb_bits[word] == 0)
+ continue;
+ for (bit = 0; bit < RQB_BPW; bit++) {
+ if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
+ continue;
+ rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
+ TAILQ_FOREACH(td, rqh, td_runq)
+ if (THREAD_CAN_MIGRATE(td) &&
+ THREAD_CAN_SCHED(td, cpu))
+ return (td);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Attempt to steal a thread in priority order from a thread queue.
+ */
+static struct thread *
+tdq_steal(struct tdq *tdq, int cpu)
+{
+ struct thread *td;
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL)
+ return (td);
+ if ((td = runq_steal_from(&tdq->tdq_timeshare,
+ cpu, tdq->tdq_ridx)) != NULL)
+ return (td);
+ return (runq_steal(&tdq->tdq_idle, cpu));
+}
+
+/*
+ * Sets the thread lock and ts_cpu to match the requested cpu. Unlocks the
+ * current lock and returns with the assigned queue locked.
+ */
+static inline struct tdq *
+sched_setcpu(struct thread *td, int cpu, int flags)
+{
+ struct tdq *tdq;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ tdq = TDQ_CPU(cpu);
+ td->td_sched->ts_cpu = cpu;
+ /*
+ * If the lock matches just return the queue.
+ */
+ if (td->td_lock == TDQ_LOCKPTR(tdq))
+ return (tdq);
+#ifdef notyet
+ /*
+ * If the thread isn't running its lockptr is a
+ * turnstile or a sleepqueue. We can just lock_set without
+ * blocking.
+ */
+ if (TD_CAN_RUN(td)) {
+ TDQ_LOCK(tdq);
+ thread_lock_set(td, TDQ_LOCKPTR(tdq));
+ return (tdq);
+ }
+#endif
+ /*
+ * The hard case, migration, we need to block the thread first to
+ * prevent order reversals with other cpus locks.
+ */
+ spinlock_enter();
+ thread_lock_block(td);
+ TDQ_LOCK(tdq);
+ thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
+ spinlock_exit();
+ return (tdq);
+}
+
+SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding");
+SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity");
+SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity");
+SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load");
+SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu");
+SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration");
+
+static int
+sched_pickcpu(struct thread *td, int flags)
+{
+ struct cpu_group *cg, *ccg;
+ struct td_sched *ts;
+ struct tdq *tdq;
+ cpuset_t mask;
+ int cpu, pri, self;
+
+ self = PCPU_GET(cpuid);
+ ts = td->td_sched;
+ if (smp_started == 0)
+ return (self);
+ /*
+ * Don't migrate a running thread from sched_switch().
+ */
+ if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td))
+ return (ts->ts_cpu);
+ /*
+ * Prefer to run interrupt threads on the processors that generate
+ * the interrupt.
+ */
+ pri = td->td_priority;
+ if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
+ curthread->td_intr_nesting_level && ts->ts_cpu != self) {
+ SCHED_STAT_INC(pickcpu_intrbind);
+ ts->ts_cpu = self;
+ if (TDQ_CPU(self)->tdq_lowpri > pri) {
+ SCHED_STAT_INC(pickcpu_affinity);
+ return (ts->ts_cpu);
+ }
+ }
+ /*
+ * If the thread can run on the last cpu and the affinity has not
+ * expired or it is idle run it there.
+ */
+ tdq = TDQ_CPU(ts->ts_cpu);
+ cg = tdq->tdq_cg;
+ if (THREAD_CAN_SCHED(td, ts->ts_cpu) &&
+ tdq->tdq_lowpri >= PRI_MIN_IDLE &&
+ SCHED_AFFINITY(ts, CG_SHARE_L2)) {
+ if (cg->cg_flags & CG_FLAG_THREAD) {
+ CPUSET_FOREACH(cpu, cg->cg_mask) {
+ if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
+ break;
+ }
+ } else
+ cpu = INT_MAX;
+ if (cpu > mp_maxid) {
+ SCHED_STAT_INC(pickcpu_idle_affinity);
+ return (ts->ts_cpu);
+ }
+ }
+ /*
+ * Search for the last level cache CPU group in the tree.
+ * Skip caches with expired affinity time and SMT groups.
+ * Affinity to higher level caches will be handled less aggressively.
+ */
+ for (ccg = NULL; cg != NULL; cg = cg->cg_parent) {
+ if (cg->cg_flags & CG_FLAG_THREAD)
+ continue;
+ if (!SCHED_AFFINITY(ts, cg->cg_level))
+ continue;
+ ccg = cg;
+ }
+ if (ccg != NULL)
+ cg = ccg;
+ cpu = -1;
+	/* Search the group for the least loaded idle CPU we can run now. */
+ mask = td->td_cpuset->cs_mask;
+ if (cg != NULL && cg != cpu_top &&
+ CPU_CMP(&cg->cg_mask, &cpu_top->cg_mask) != 0)
+ cpu = sched_lowest(cg, mask, max(pri, PRI_MAX_TIMESHARE),
+ INT_MAX, ts->ts_cpu);
+	/* Search globally for the least loaded CPU we can run now. */
+ if (cpu == -1)
+ cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu);
+	/* Search globally for the least loaded CPU. */
+ if (cpu == -1)
+ cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu);
+ KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu."));
+ /*
+ * Compare the lowest loaded cpu to current cpu.
+ */
+ if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri &&
+ TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE &&
+ TDQ_CPU(self)->tdq_load <= TDQ_CPU(cpu)->tdq_load + 1) {
+ SCHED_STAT_INC(pickcpu_local);
+ cpu = self;
+ } else
+ SCHED_STAT_INC(pickcpu_lowest);
+ if (cpu != ts->ts_cpu)
+ SCHED_STAT_INC(pickcpu_migration);
+ return (cpu);
+}
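+
+/*
+ * The search above runs from strongest to weakest preference:
+ * interrupt threads are bound softly to the current CPU, the last CPU
+ * (or, within an SMT group, an idle sibling) is reused while it is
+ * idle and the affinity window has not expired, then the least loaded
+ * CPU within the last-level cache group is tried, then the least
+ * loaded CPU system-wide that could run the thread at its priority,
+ * and finally the least loaded CPU regardless of priority.  A
+ * comparably loaded current CPU is preferred at the very end to avoid
+ * a pointless migration.
+ */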
+#endif
+
+/*
+ * Pick the highest priority task we have and return it.
+ */
+static struct thread *
+tdq_choose(struct tdq *tdq)
+{
+ struct thread *td;
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ td = runq_choose(&tdq->tdq_realtime);
+ if (td != NULL)
+ return (td);
+ td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
+ if (td != NULL) {
+ KASSERT(td->td_priority >= PRI_MIN_BATCH,
+ ("tdq_choose: Invalid priority on timeshare queue %d",
+ td->td_priority));
+ return (td);
+ }
+ td = runq_choose(&tdq->tdq_idle);
+ if (td != NULL) {
+ KASSERT(td->td_priority >= PRI_MIN_IDLE,
+ ("tdq_choose: Invalid priority on idle queue %d",
+ td->td_priority));
+ return (td);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Initialize a thread queue.
+ */
+static void
+tdq_setup(struct tdq *tdq)
+{
+
+ if (bootverbose)
+ printf("ULE: setup cpu %d\n", TDQ_ID(tdq));
+ runq_init(&tdq->tdq_realtime);
+ runq_init(&tdq->tdq_timeshare);
+ runq_init(&tdq->tdq_idle);
+ snprintf(tdq->tdq_name, sizeof(tdq->tdq_name),
+ "sched lock %d", (int)TDQ_ID(tdq));
+ mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock",
+ MTX_SPIN | MTX_RECURSE);
+#ifdef KTR
+ snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname),
+ "CPU %d load", (int)TDQ_ID(tdq));
+#endif
+}
+
+#ifdef SMP
+static void
+sched_setup_smp(void)
+{
+ struct tdq *tdq;
+ int i;
+
+ cpu_top = smp_topo();
+ CPU_FOREACH(i) {
+ tdq = TDQ_CPU(i);
+ tdq_setup(tdq);
+ tdq->tdq_cg = smp_topo_find(cpu_top, i);
+ if (tdq->tdq_cg == NULL)
+ panic("Can't find cpu group for %d\n", i);
+ }
+ balance_tdq = TDQ_SELF();
+ sched_balance();
+}
+#endif
+
+/*
+ * Setup the thread queues and initialize the topology based on MD
+ * information.
+ */
+static void
+sched_setup(void *dummy)
+{
+ struct tdq *tdq;
+
+ tdq = TDQ_SELF();
+#ifdef SMP
+ sched_setup_smp();
+#else
+ tdq_setup(tdq);
+#endif
+
+ /* Add thread0's load since it's running. */
+ TDQ_LOCK(tdq);
+ thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
+ tdq_load_add(tdq, &thread0);
+ tdq->tdq_lowpri = thread0.td_priority;
+ TDQ_UNLOCK(tdq);
+}
+
+/*
+ * This routine determines time constants after stathz and hz are set up.
+ */
+/* ARGSUSED */
+static void
+sched_initticks(void *dummy)
+{
+ int incr;
+
+ realstathz = stathz ? stathz : hz;
+ sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR;
+ sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR;
+ hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
+ realstathz);
+
+ /*
+ * tickincr is shifted out by 10 to avoid rounding errors due to
+ * hz not being evenly divisible by stathz on all platforms.
+ */
+ incr = (hz << SCHED_TICK_SHIFT) / realstathz;
+ /*
+ * This does not work for values of stathz that are more than
+ * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen.
+ */
+ if (incr == 0)
+ incr = 1;
+ tickincr = incr;
+#ifdef SMP
+ /*
+ * Set the default balance interval now that we know
+ * what realstathz is.
+ */
+ balance_interval = realstathz;
+ affinity = SCHED_AFFINITY_DEFAULT;
+#endif
+ if (sched_idlespinthresh < 0)
+ sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz;
+}
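+
+/*
+ * Worked example of the fixed-point math above, assuming the common
+ * hz = 1000, stathz = 127 configuration: tickincr = (1000 << 10) / 127,
+ * about 8062, i.e. roughly 7.87 hz ticks per stathz tick carried with
+ * ten bits of fraction.
+ */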
+
+/*
+ * This is the core of the interactivity algorithm. Determines a score based
+ * on past behavior. It is the ratio of sleep time to run time scaled to
+ * a [0, 100] integer. This is the voluntary sleep time of a process, which
+ * differs from the cpu usage because it does not account for time spent
+ * waiting on a run-queue. Would be prettier if we had floating point.
+ */
+static int
+sched_interact_score(struct thread *td)
+{
+ struct td_sched *ts;
+ int div;
+
+ ts = td->td_sched;
+ /*
+ * The score is only needed if this is likely to be an interactive
+ * task. Don't go through the expense of computing it if there's
+ * no chance.
+ */
+ if (sched_interact <= SCHED_INTERACT_HALF &&
+ ts->ts_runtime >= ts->ts_slptime)
+ return (SCHED_INTERACT_HALF);
+
+ if (ts->ts_runtime > ts->ts_slptime) {
+ div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
+ return (SCHED_INTERACT_HALF +
+ (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
+ }
+ if (ts->ts_slptime > ts->ts_runtime) {
+ div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
+ return (ts->ts_runtime / div);
+ }
+ /* runtime == slptime */
+ if (ts->ts_runtime)
+ return (SCHED_INTERACT_HALF);
+
+ /*
+ * This can happen if slptime and runtime are 0.
+ */
+ return (0);
+}
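+
+/*
+ * Put differently, the code above evaluates to roughly
+ * SCHED_INTERACT_HALF * runtime / slptime for threads that sleep more
+ * than they run (scores below HALF) and to roughly
+ * HALF * (2 - slptime / runtime) for threads that run more than they
+ * sleep (scores above HALF); a thread that sleeps three times as long
+ * as it runs therefore scores about a third of SCHED_INTERACT_HALF.
+ */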
+
+/*
+ * Scale the scheduling priority according to the "interactivity" of this
+ * process.
+ */
+static void
+sched_priority(struct thread *td)
+{
+ int score;
+ int pri;
+
+ if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)
+ return;
+ /*
+ * If the score is interactive we place the thread in the realtime
+ * queue with a priority that is less than kernel and interrupt
+ * priorities. These threads are not subject to nice restrictions.
+ *
+ * Scores greater than this are placed on the normal timeshare queue
+ * where the priority is partially decided by the most recent cpu
+ * utilization and the rest is decided by nice value.
+ *
+ * The nice value of the process has a linear effect on the calculated
+ * score. Negative nice values make it easier for a thread to be
+ * considered interactive.
+ */
+ score = imax(0, sched_interact_score(td) + td->td_proc->p_nice);
+ if (score < sched_interact) {
+ pri = PRI_MIN_INTERACT;
+ pri += ((PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) /
+ sched_interact) * score;
+ KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT,
+ ("sched_priority: invalid interactive priority %d score %d",
+ pri, score));
+ } else {
+ pri = SCHED_PRI_MIN;
+ if (td->td_sched->ts_ticks)
+ pri += min(SCHED_PRI_TICKS(td->td_sched),
+ SCHED_PRI_RANGE);
+ pri += SCHED_PRI_NICE(td->td_proc->p_nice);
+ KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH,
+ ("sched_priority: invalid priority %d: nice %d, "
+ "ticks %d ftick %d ltick %d tick pri %d",
+ pri, td->td_proc->p_nice, td->td_sched->ts_ticks,
+ td->td_sched->ts_ftick, td->td_sched->ts_ltick,
+ SCHED_PRI_TICKS(td->td_sched)));
+ }
+ sched_user_prio(td, pri);
+
+ return;
+}
+
+/*
+ * This routine enforces a maximum limit on the amount of scheduling history
+ * kept. It is called after either the slptime or runtime is adjusted. This
+ * function is ugly due to integer math.
+ */
+static void
+sched_interact_update(struct thread *td)
+{
+ struct td_sched *ts;
+ u_int sum;
+
+ ts = td->td_sched;
+ sum = ts->ts_runtime + ts->ts_slptime;
+ if (sum < SCHED_SLP_RUN_MAX)
+ return;
+ /*
+ * This only happens from two places:
+ * 1) We have added an unusual amount of run time from fork_exit.
+ * 2) We have added an unusual amount of sleep time from sched_sleep().
+ */
+ if (sum > SCHED_SLP_RUN_MAX * 2) {
+ if (ts->ts_runtime > ts->ts_slptime) {
+ ts->ts_runtime = SCHED_SLP_RUN_MAX;
+ ts->ts_slptime = 1;
+ } else {
+ ts->ts_slptime = SCHED_SLP_RUN_MAX;
+ ts->ts_runtime = 1;
+ }
+ return;
+ }
+ /*
+ * If we have exceeded by more than 1/5th then the algorithm below
+ * will not bring us back into range. Dividing by two here forces
+	 * us back to no more than SCHED_SLP_RUN_MAX while keeping the ratio.
+ */
+ if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
+ ts->ts_runtime /= 2;
+ ts->ts_slptime /= 2;
+ return;
+ }
+ ts->ts_runtime = (ts->ts_runtime / 5) * 4;
+ ts->ts_slptime = (ts->ts_slptime / 5) * 4;
+}
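+
+/*
+ * In the common case above both totals are scaled back by 4/5 once
+ * their sum reaches SCHED_SLP_RUN_MAX (or halved when the cap was
+ * overshot by more than a fifth), which decays old history while
+ * preserving the runtime/slptime ratio; only the doubled-cap case
+ * from fork_exit() or sched_sleep() clamps the values outright.
+ */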
+
+/*
+ * Scale back the interactivity history when a child thread is created. The
+ * history is inherited from the parent but the thread may behave totally
+ * differently. For example, a shell spawning a compiler process. We want
+ * to learn that the compiler is behaving badly very quickly.
+ */
+static void
+sched_interact_fork(struct thread *td)
+{
+ int ratio;
+ int sum;
+
+ sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime;
+ if (sum > SCHED_SLP_RUN_FORK) {
+ ratio = sum / SCHED_SLP_RUN_FORK;
+ td->td_sched->ts_runtime /= ratio;
+ td->td_sched->ts_slptime /= ratio;
+ }
+}
+
+/*
+ * Called from proc0_init() to setup the scheduler fields.
+ */
+void
+schedinit(void)
+{
+
+ /*
+ * Set up the scheduler specific parts of proc0.
+ */
+ proc0.p_sched = NULL; /* XXX */
+ thread0.td_sched = &td_sched0;
+ td_sched0.ts_ltick = ticks;
+ td_sched0.ts_ftick = ticks;
+ td_sched0.ts_slice = 0;
+}
+
+/*
+ * This is only somewhat accurate since, given many processes of the
+ * same priority, they will switch when their slices run out, which
+ * will be at most sched_slice stathz ticks.
+ */
+int
+sched_rr_interval(void)
+{
+
+ /* Convert sched_slice from stathz to hz. */
+ return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz));
+}
+
+/*
+ * Update the percent cpu tracking information when it is requested or
+ * the total history exceeds the maximum. We keep a sliding history of
+ * tick counts that slowly decays. This is less precise than the 4BSD
+ * mechanism since it is driven by less regular and less frequent events.
+ */
+static void
+sched_pctcpu_update(struct td_sched *ts, int run)
+{
+ int t = ticks;
+
+ if (t - ts->ts_ltick >= SCHED_TICK_TARG) {
+ ts->ts_ticks = 0;
+ ts->ts_ftick = t - SCHED_TICK_TARG;
+ } else if (t - ts->ts_ftick >= SCHED_TICK_MAX) {
+ ts->ts_ticks = (ts->ts_ticks / (ts->ts_ltick - ts->ts_ftick)) *
+ (ts->ts_ltick - (t - SCHED_TICK_TARG));
+ ts->ts_ftick = t - SCHED_TICK_TARG;
+ }
+ if (run)
+ ts->ts_ticks += (t - ts->ts_ltick) << SCHED_TICK_SHIFT;
+ ts->ts_ltick = t;
+}
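+
+/*
+ * The history above is a sliding window: once the span since ts_ftick
+ * reaches SCHED_TICK_MAX the accumulated count is scaled down so that
+ * roughly SCHED_TICK_TARG ticks of history remain, and a thread whose
+ * counters have not been touched for SCHED_TICK_TARG ticks starts
+ * over with an empty history.
+ */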
+
+/*
+ * Adjust the priority of a thread. Move it to the appropriate run-queue
+ * if necessary. This is the back-end for several priority related
+ * functions.
+ */
+static void
+sched_thread_priority(struct thread *td, u_char prio)
+{
+ struct td_sched *ts;
+ struct tdq *tdq;
+ int oldpri;
+
+ KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio",
+ "prio:%d", td->td_priority, "new prio:%d", prio,
+ KTR_ATTR_LINKED, sched_tdname(curthread));
+ SDT_PROBE3(sched, , , change_pri, td, td->td_proc, prio);
+ if (td != curthread && prio < td->td_priority) {
+ KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
+ "lend prio", "prio:%d", td->td_priority, "new prio:%d",
+ prio, KTR_ATTR_LINKED, sched_tdname(td));
+ SDT_PROBE4(sched, , , lend_pri, td, td->td_proc, prio,
+ curthread);
+ }
+ ts = td->td_sched;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ if (td->td_priority == prio)
+ return;
+ /*
+ * If the priority has been elevated due to priority
+ * propagation, we may have to move ourselves to a new
+ * queue. This could be optimized to not re-add in some
+ * cases.
+ */
+ if (TD_ON_RUNQ(td) && prio < td->td_priority) {
+ sched_rem(td);
+ td->td_priority = prio;
+ sched_add(td, SRQ_BORROWING);
+ return;
+ }
+ /*
+ * If the thread is currently running we may have to adjust the lowpri
+ * information so other cpus are aware of our current priority.
+ */
+ if (TD_IS_RUNNING(td)) {
+ tdq = TDQ_CPU(ts->ts_cpu);
+ oldpri = td->td_priority;
+ td->td_priority = prio;
+ if (prio < tdq->tdq_lowpri)
+ tdq->tdq_lowpri = prio;
+ else if (tdq->tdq_lowpri == oldpri)
+ tdq_setlowpri(tdq, td);
+ return;
+ }
+ td->td_priority = prio;
+}
+
+/*
+ * Update a thread's priority when it is lent another thread's
+ * priority.
+ */
+void
+sched_lend_prio(struct thread *td, u_char prio)
+{
+
+ td->td_flags |= TDF_BORROWING;
+ sched_thread_priority(td, prio);
+}
+
+/*
+ * Restore a thread's priority when priority propagation is
+ * over. The prio argument is the minimum priority the thread
+ * needs to have to satisfy other possible priority lending
+ * requests. If the thread's regular priority is less
+ * important than prio, the thread will keep a priority boost
+ * of prio.
+ */
+void
+sched_unlend_prio(struct thread *td, u_char prio)
+{
+ u_char base_pri;
+
+ if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
+ td->td_base_pri <= PRI_MAX_TIMESHARE)
+ base_pri = td->td_user_pri;
+ else
+ base_pri = td->td_base_pri;
+ if (prio >= base_pri) {
+ td->td_flags &= ~TDF_BORROWING;
+ sched_thread_priority(td, base_pri);
+ } else
+ sched_lend_prio(td, prio);
+}
+
+/*
+ * Standard entry for setting the priority to an absolute value.
+ */
+void
+sched_prio(struct thread *td, u_char prio)
+{
+ u_char oldprio;
+
+ /* First, update the base priority. */
+ td->td_base_pri = prio;
+
+ /*
+ * If the thread is borrowing another thread's priority, don't
+ * ever lower the priority.
+ */
+ if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
+ return;
+
+ /* Change the real priority. */
+ oldprio = td->td_priority;
+ sched_thread_priority(td, prio);
+
+ /*
+ * If the thread is on a turnstile, then let the turnstile update
+ * its state.
+ */
+ if (TD_ON_LOCK(td) && oldprio != prio)
+ turnstile_adjust(td, oldprio);
+}
+
+/*
+ * Set the base user priority; this does not affect the current
+ * running priority.
+ */
+void
+sched_user_prio(struct thread *td, u_char prio)
+{
+
+ td->td_base_user_pri = prio;
+ if (td->td_lend_user_pri <= prio)
+ return;
+ td->td_user_pri = prio;
+}
+
+void
+sched_lend_user_prio(struct thread *td, u_char prio)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ td->td_lend_user_pri = prio;
+ td->td_user_pri = min(prio, td->td_base_user_pri);
+ if (td->td_priority > td->td_user_pri)
+ sched_prio(td, td->td_user_pri);
+ else if (td->td_priority != td->td_user_pri)
+ td->td_flags |= TDF_NEEDRESCHED;
+}
+
+/*
+ * Handle migration from sched_switch(). This happens only for
+ * cpu binding.
+ */
+static struct mtx *
+sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
+{
+ struct tdq *tdn;
+
+ tdn = TDQ_CPU(td->td_sched->ts_cpu);
+#ifdef SMP
+ tdq_load_rem(tdq, td);
+ /*
+ * Do the lock dance required to avoid LOR. We grab an extra
+ * spinlock nesting to prevent preemption while we're
+ * not holding either run-queue lock.
+ */
+ spinlock_enter();
+ thread_lock_block(td); /* This releases the lock on tdq. */
+
+ /*
+ * Acquire both run-queue locks before placing the thread on the new
+ * run-queue to avoid deadlocks created by placing a thread with a
+ * blocked lock on the run-queue of a remote processor. The deadlock
+ * occurs when a third processor attempts to lock the two queues in
+ * question while the target processor is spinning with its own
+ * run-queue lock held while waiting for the blocked lock to clear.
+ */
+ tdq_lock_pair(tdn, tdq);
+ tdq_add(tdn, td, flags);
+ tdq_notify(tdn, td);
+ TDQ_UNLOCK(tdn);
+ spinlock_exit();
+#endif
+ return (TDQ_LOCKPTR(tdn));
+}
+
+/*
+ * Variant of thread_lock_unblock() that does not assume td_lock
+ * is blocked.
+ */
+static inline void
+thread_unblock_switch(struct thread *td, struct mtx *mtx)
+{
+ atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
+ (uintptr_t)mtx);
+}
+
+/*
+ * Switch threads. This function has to handle threads coming in while
+ * blocked for some reason, running, or idle. It also must deal with
+ * migrating a thread from one queue to another as running threads may
+ * be assigned elsewhere via binding.
+ */
+void
+sched_switch(struct thread *td, struct thread *newtd, int flags)
+{
+ struct tdq *tdq;
+ struct td_sched *ts;
+ struct mtx *mtx;
+ int srqflag;
+ int cpuid, preempted;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT(newtd == NULL, ("sched_switch: Unsupported newtd argument"));
+
+ cpuid = PCPU_GET(cpuid);
+ tdq = TDQ_CPU(cpuid);
+ ts = td->td_sched;
+ mtx = td->td_lock;
+ sched_pctcpu_update(ts, 1);
+ ts->ts_rltick = ticks;
+ td->td_lastcpu = td->td_oncpu;
+ td->td_oncpu = NOCPU;
+ preempted = !(td->td_flags & TDF_SLICEEND);
+ td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
+ td->td_owepreempt = 0;
+ if (!TD_IS_IDLETHREAD(td))
+ tdq->tdq_switchcnt++;
+ /*
+ * The lock pointer in an idle thread should never change. Reset it
+ * to CAN_RUN as well.
+ */
+ if (TD_IS_IDLETHREAD(td)) {
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ TD_SET_CAN_RUN(td);
+ } else if (TD_IS_RUNNING(td)) {
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ srqflag = preempted ?
+ SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
+ SRQ_OURSELF|SRQ_YIELDING;
+#ifdef SMP
+ if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu))
+ ts->ts_cpu = sched_pickcpu(td, 0);
+#endif
+ if (ts->ts_cpu == cpuid)
+ tdq_runq_add(tdq, td, srqflag);
+ else {
+ KASSERT(THREAD_CAN_MIGRATE(td) ||
+ (ts->ts_flags & TSF_BOUND) != 0,
+ ("Thread %p shouldn't migrate", td));
+ mtx = sched_switch_migrate(tdq, td, srqflag);
+ }
+ } else {
+ /* This thread must be going to sleep. */
+ TDQ_LOCK(tdq);
+ mtx = thread_lock_block(td);
+ tdq_load_rem(tdq, td);
+ }
+ /*
+ * We enter here with the thread blocked and assigned to the
+ * appropriate cpu run-queue or sleep-queue and with the current
+ * thread-queue locked.
+ */
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
+ newtd = choosethread();
+ /*
+ * Call the MD code to switch contexts if necessary.
+ */
+ if (td != newtd) {
+#ifdef HWPMC_HOOKS
+ if (PMC_PROC_IS_USING_PMCS(td->td_proc))
+ PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
+#endif
+ SDT_PROBE2(sched, , , off_cpu, newtd, newtd->td_proc);
+ lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
+ TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
+ sched_pctcpu_update(newtd->td_sched, 0);
+
+#ifdef KDTRACE_HOOKS
+ /*
+ * If DTrace has set the active vtime enum to anything
+ * other than INACTIVE (0), then it should have set the
+ * function to call.
+ */
+ if (dtrace_vtime_active)
+ (*dtrace_vtime_switch_func)(newtd);
+#endif
+
+ cpu_switch(td, newtd, mtx);
+ /*
+ * We may return from cpu_switch on a different cpu. However,
+ * we always return with td_lock pointing to the current cpu's
+ * run queue lock.
+ */
+ cpuid = PCPU_GET(cpuid);
+ tdq = TDQ_CPU(cpuid);
+ lock_profile_obtain_lock_success(
+ &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
+
+ SDT_PROBE0(sched, , , on_cpu);
+#ifdef HWPMC_HOOKS
+ if (PMC_PROC_IS_USING_PMCS(td->td_proc))
+ PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
+#endif
+ } else {
+ thread_unblock_switch(td, mtx);
+ SDT_PROBE0(sched, , , remain_cpu);
+ }
+ /*
+ * Assert that all went well and return.
+ */
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ td->td_oncpu = cpuid;
+}
+
+/*
+ * Adjust thread priorities as a result of a nice request.
+ */
+void
+sched_nice(struct proc *p, int nice)
+{
+ struct thread *td;
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+
+ p->p_nice = nice;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ sched_priority(td);
+ sched_prio(td, td->td_base_user_pri);
+ thread_unlock(td);
+ }
+}
+
+/*
+ * Record the sleep time for the interactivity scorer.
+ */
+void
+sched_sleep(struct thread *td, int prio)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ td->td_slptick = ticks;
+ if (TD_IS_SUSPENDED(td) || prio >= PSOCK)
+ td->td_flags |= TDF_CANSWAP;
+ if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE)
+ return;
+ if (static_boost == 1 && prio)
+ sched_prio(td, prio);
+ else if (static_boost && td->td_priority > static_boost)
+ sched_prio(td, static_boost);
+}
+
+/*
+ * Schedule a thread to resume execution and record how long it voluntarily
+ * slept. We also update the pctcpu, interactivity, and priority.
+ */
+void
+sched_wakeup(struct thread *td)
+{
+ struct td_sched *ts;
+ int slptick;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
+ td->td_flags &= ~TDF_CANSWAP;
+ /*
+ * If we slept for more than a tick update our interactivity and
+ * priority.
+ */
+ slptick = td->td_slptick;
+ td->td_slptick = 0;
+ if (slptick && slptick != ticks) {
+ ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT;
+ sched_interact_update(td);
+ sched_pctcpu_update(ts, 0);
+ }
+ /*
+ * Reset the slice value since we slept and advanced the round-robin.
+ */
+ ts->ts_slice = 0;
+ sched_add(td, SRQ_BORING);
+}
+
+/*
+ * Penalize the parent for creating a new child and initialize the child's
+ * priority.
+ */
+void
+sched_fork(struct thread *td, struct thread *child)
+{
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ sched_pctcpu_update(td->td_sched, 1);
+ sched_fork_thread(td, child);
+ /*
+ * Penalize the parent and child for forking.
+ */
+ sched_interact_fork(child);
+ sched_priority(child);
+ td->td_sched->ts_runtime += tickincr;
+ sched_interact_update(td);
+ sched_priority(td);
+}
+
+/*
+ * Fork a new thread, may be within the same process.
+ */
+void
+sched_fork_thread(struct thread *td, struct thread *child)
+{
+ struct td_sched *ts;
+ struct td_sched *ts2;
+ struct tdq *tdq;
+
+ tdq = TDQ_SELF();
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ /*
+ * Initialize child.
+ */
+ ts = td->td_sched;
+ ts2 = child->td_sched;
+ child->td_lock = TDQ_LOCKPTR(tdq);
+ child->td_cpuset = cpuset_ref(td->td_cpuset);
+ ts2->ts_cpu = ts->ts_cpu;
+ ts2->ts_flags = 0;
+ /*
+	 * Grab our parent's cpu estimation information.
+ */
+ ts2->ts_ticks = ts->ts_ticks;
+ ts2->ts_ltick = ts->ts_ltick;
+ ts2->ts_ftick = ts->ts_ftick;
+ /*
+ * Do not inherit any borrowed priority from the parent.
+ */
+ child->td_priority = child->td_base_pri;
+ /*
+ * And update interactivity score.
+ */
+ ts2->ts_slptime = ts->ts_slptime;
+ ts2->ts_runtime = ts->ts_runtime;
+ /* Attempt to quickly learn interactivity. */
+ ts2->ts_slice = tdq_slice(tdq) - sched_slice_min;
+#ifdef KTR
+ bzero(ts2->ts_name, sizeof(ts2->ts_name));
+#endif
+}
+
+/*
+ * Adjust the priority class of a thread.
+ */
+void
+sched_class(struct thread *td, int class)
+{
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ if (td->td_pri_class == class)
+ return;
+ td->td_pri_class = class;
+}
+
+/*
+ * Return some of the child's priority and interactivity to the parent.
+ */
+void
+sched_exit(struct proc *p, struct thread *child)
+{
+ struct thread *td;
+
+ KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit",
+ "prio:%d", child->td_priority);
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ td = FIRST_THREAD_IN_PROC(p);
+ sched_exit_thread(td, child);
+}
+
+/*
+ * Penalize another thread for the time spent on this one. This helps to
+ * worsen the priority and interactivity of processes which schedule batch
+ * jobs such as make. This has little effect on the make process itself but
+ * causes new processes spawned by it to receive worse scores immediately.
+ */
+void
+sched_exit_thread(struct thread *td, struct thread *child)
+{
+
+ KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit",
+ "prio:%d", child->td_priority);
+ /*
+ * Give the child's runtime to the parent without returning the
+ * sleep time as a penalty to the parent. This causes shells that
+ * launch expensive things to mark their children as expensive.
+ */
+ thread_lock(td);
+ td->td_sched->ts_runtime += child->td_sched->ts_runtime;
+ sched_interact_update(td);
+ sched_priority(td);
+ thread_unlock(td);
+}
+
+void
+sched_preempt(struct thread *td)
+{
+ struct tdq *tdq;
+
+ SDT_PROBE2(sched, , , surrender, td, td->td_proc);
+
+ thread_lock(td);
+ tdq = TDQ_SELF();
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ tdq->tdq_ipipending = 0;
+ if (td->td_priority > tdq->tdq_lowpri) {
+ int flags;
+
+ flags = SW_INVOL | SW_PREEMPT;
+ if (td->td_critnest > 1)
+ td->td_owepreempt = 1;
+ else if (TD_IS_IDLETHREAD(td))
+ mi_switch(flags | SWT_REMOTEWAKEIDLE, NULL);
+ else
+ mi_switch(flags | SWT_REMOTEPREEMPT, NULL);
+ }
+ thread_unlock(td);
+}
+
+/*
+ * Fix priorities on return to user-space. Priorities may be elevated due
+ * to static priorities in msleep() or similar.
+ */
+void
+sched_userret(struct thread *td)
+{
+ /*
+ * XXX we cheat slightly on the locking here to avoid locking in
+ * the usual case. Setting td_priority here is essentially an
+ * incomplete workaround for not setting it properly elsewhere.
+ * Now that some interrupt handlers are threads, not setting it
+ * properly elsewhere can clobber it in the window between setting
+ * it here and returning to user mode, so don't waste time setting
+ * it perfectly here.
+ */
+ KASSERT((td->td_flags & TDF_BORROWING) == 0,
+ ("thread with borrowed priority returning to userland"));
+ if (td->td_priority != td->td_user_pri) {
+ thread_lock(td);
+ td->td_priority = td->td_user_pri;
+ td->td_base_pri = td->td_user_pri;
+ tdq_setlowpri(TDQ_SELF(), td);
+ thread_unlock(td);
+ }
+}
+
+/*
+ * Handle a stathz tick. This is really only relevant for timeshare
+ * threads.
+ */
+void
+sched_clock(struct thread *td)
+{
+ struct tdq *tdq;
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ tdq = TDQ_SELF();
+#ifdef SMP
+ /*
+ * We run the long term load balancer infrequently on the first cpu.
+ */
+ if (balance_tdq == tdq) {
+ if (balance_ticks && --balance_ticks == 0)
+ sched_balance();
+ }
+#endif
+ /*
+	 * Save the old switch count so we have a record of the last tick's
+	 * activity.  Initialize the new switch count based on our load; if
+	 * there is some activity, seed it to reflect that.
+ */
+ tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt;
+ tdq->tdq_switchcnt = tdq->tdq_load;
+ /*
+ * Advance the insert index once for each tick to ensure that all
+ * threads get a chance to run.
+ */
+ if (tdq->tdq_idx == tdq->tdq_ridx) {
+ tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
+ if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
+ tdq->tdq_ridx = tdq->tdq_idx;
+ }
+ ts = td->td_sched;
+ sched_pctcpu_update(ts, 1);
+ if (td->td_pri_class & PRI_FIFO_BIT)
+ return;
+ if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) {
+ /*
+ * We used a tick; charge it to the thread so
+ * that we can compute our interactivity.
+ */
+ td->td_sched->ts_runtime += tickincr;
+ sched_interact_update(td);
+ sched_priority(td);
+ }
+
+ /*
+ * Force a context switch if the current thread has used up a full
+ * time slice (default is 100ms).
+ */
+ if (!TD_IS_IDLETHREAD(td) && ++ts->ts_slice >= tdq_slice(tdq)) {
+ ts->ts_slice = 0;
+ td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND;
+ }
+}
+
+/*
+ * Called once per hz tick.
+ */
+void
+sched_tick(int cnt)
+{
+
+}
+
+/*
+ * Return whether the current CPU has runnable tasks. Used for in-kernel
+ * cooperative idle threads.
+ */
+int
+sched_runnable(void)
+{
+ struct tdq *tdq;
+ int load;
+
+ load = 1;
+
+ tdq = TDQ_SELF();
+ if ((curthread->td_flags & TDF_IDLETD) != 0) {
+ if (tdq->tdq_load > 0)
+ goto out;
+ } else
+ if (tdq->tdq_load - 1 > 0)
+ goto out;
+ load = 0;
+out:
+ return (load);
+}
+
+/*
+ * Choose the highest priority thread to run. The thread is removed from
+ * the run-queue while running; the load, however, remains.  For SMP we set
+ * the tdq in the global idle bitmask if it idles here.
+ */
+struct thread *
+sched_choose(void)
+{
+ struct thread *td;
+ struct tdq *tdq;
+
+ tdq = TDQ_SELF();
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ td = tdq_choose(tdq);
+ if (td) {
+ tdq_runq_rem(tdq, td);
+ tdq->tdq_lowpri = td->td_priority;
+ return (td);
+ }
+ tdq->tdq_lowpri = PRI_MAX_IDLE;
+ return (PCPU_GET(idlethread));
+}
+
+/*
+ * Set owepreempt if necessary.  Preemption never happens directly in
+ * ULE; we always request it once we exit a critical section.
+ */
+static inline void
+sched_setpreempt(struct thread *td)
+{
+ struct thread *ctd;
+ int cpri;
+ int pri;
+
+ THREAD_LOCK_ASSERT(curthread, MA_OWNED);
+
+ ctd = curthread;
+ pri = td->td_priority;
+ cpri = ctd->td_priority;
+ if (pri < cpri)
+ ctd->td_flags |= TDF_NEEDRESCHED;
+ if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
+ return;
+ if (!sched_shouldpreempt(pri, cpri, 0))
+ return;
+ ctd->td_owepreempt = 1;
+}
+
+/*
+ * Add a thread to a thread queue. Select the appropriate runq and add the
+ * thread to it. This is the internal function called when the tdq is
+ * predetermined.
+ */
+void
+tdq_add(struct tdq *tdq, struct thread *td, int flags)
+{
+
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ KASSERT((td->td_inhibitors == 0),
+ ("sched_add: trying to run inhibited thread"));
+ KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
+ ("sched_add: bad thread state"));
+ KASSERT(td->td_flags & TDF_INMEM,
+ ("sched_add: thread swapped out"));
+
+ if (td->td_priority < tdq->tdq_lowpri)
+ tdq->tdq_lowpri = td->td_priority;
+ tdq_runq_add(tdq, td, flags);
+ tdq_load_add(tdq, td);
+}
+
+/*
+ * Select the target thread queue and add a thread to it. Request
+ * preemption or IPI a remote processor if required.
+ */
+void
+sched_add(struct thread *td, int flags)
+{
+ struct tdq *tdq;
+#ifdef SMP
+ int cpu;
+#endif
+
+ KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
+ "prio:%d", td->td_priority, KTR_ATTR_LINKED,
+ sched_tdname(curthread));
+ KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
+ KTR_ATTR_LINKED, sched_tdname(td));
+ SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
+ flags & SRQ_PREEMPTED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ /*
+ * Recalculate the priority before we select the target cpu or
+ * run-queue.
+ */
+ if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
+ sched_priority(td);
+#ifdef SMP
+ /*
+ * Pick the destination cpu and if it isn't ours transfer to the
+ * target cpu.
+ */
+ cpu = sched_pickcpu(td, flags);
+ tdq = sched_setcpu(td, cpu, flags);
+ tdq_add(tdq, td, flags);
+ if (cpu != PCPU_GET(cpuid)) {
+ tdq_notify(tdq, td);
+ return;
+ }
+#else
+ tdq = TDQ_SELF();
+ TDQ_LOCK(tdq);
+ /*
+ * Now that the thread is moving to the run-queue, set the lock
+ * to the scheduler's lock.
+ */
+ thread_lock_set(td, TDQ_LOCKPTR(tdq));
+ tdq_add(tdq, td, flags);
+#endif
+ if (!(flags & SRQ_YIELDING))
+ sched_setpreempt(td);
+}
+
+/*
+ * Remove a thread from a run-queue without running it. This is used
+ * when we're stealing a thread from a remote queue. Otherwise all threads
+ * exit by calling sched_exit_thread() and sched_throw() themselves.
+ */
+void
+sched_rem(struct thread *td)
+{
+ struct tdq *tdq;
+
+ KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
+ "prio:%d", td->td_priority);
+ SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
+ tdq = TDQ_CPU(td->td_sched->ts_cpu);
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ KASSERT(TD_ON_RUNQ(td),
+ ("sched_rem: thread not on run queue"));
+ tdq_runq_rem(tdq, td);
+ tdq_load_rem(tdq, td);
+ TD_SET_CAN_RUN(td);
+ if (td->td_priority == tdq->tdq_lowpri)
+ tdq_setlowpri(tdq, NULL);
+}
+
+/*
+ * Fetch cpu utilization information. Updates on demand.
+ */
+fixpt_t
+sched_pctcpu(struct thread *td)
+{
+ fixpt_t pctcpu;
+ struct td_sched *ts;
+
+ pctcpu = 0;
+ ts = td->td_sched;
+ if (ts == NULL)
+ return (0);
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ sched_pctcpu_update(ts, TD_IS_RUNNING(td));
+ if (ts->ts_ticks) {
+ int rtick;
+
+ /* How many rtick per second ? */
+ rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
+ pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
+ }
+
+ return (pctcpu);
+}
+
+/*
+ * Enforce affinity settings for a thread. Called after adjustments to
+ * cpumask.
+ */
+void
+sched_affinity(struct thread *td)
+{
+#ifdef SMP
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ ts = td->td_sched;
+ if (THREAD_CAN_SCHED(td, ts->ts_cpu))
+ return;
+ if (TD_ON_RUNQ(td)) {
+ sched_rem(td);
+ sched_add(td, SRQ_BORING);
+ return;
+ }
+ if (!TD_IS_RUNNING(td))
+ return;
+ /*
+ * Force a switch before returning to userspace. If the
+ * target thread is not running locally send an ipi to force
+ * the issue.
+ */
+ td->td_flags |= TDF_NEEDRESCHED;
+ if (td != curthread)
+ ipi_cpu(ts->ts_cpu, IPI_PREEMPT);
+#endif
+}
+
+/*
+ * Bind a thread to a target cpu.
+ */
+void
+sched_bind(struct thread *td, int cpu)
+{
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
+ KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
+ ts = td->td_sched;
+ if (ts->ts_flags & TSF_BOUND)
+ sched_unbind(td);
+ KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td));
+ ts->ts_flags |= TSF_BOUND;
+ sched_pin();
+ if (PCPU_GET(cpuid) == cpu)
+ return;
+ ts->ts_cpu = cpu;
+ /* When we return from mi_switch we'll be on the correct cpu. */
+ mi_switch(SW_VOL, NULL);
+}
+
+/*
+ * Release a bound thread.
+ */
+void
+sched_unbind(struct thread *td)
+{
+ struct td_sched *ts;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT(td == curthread, ("sched_unbind: can only bind curthread"));
+ ts = td->td_sched;
+ if ((ts->ts_flags & TSF_BOUND) == 0)
+ return;
+ ts->ts_flags &= ~TSF_BOUND;
+ sched_unpin();
+}
+
+int
+sched_is_bound(struct thread *td)
+{
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ return (td->td_sched->ts_flags & TSF_BOUND);
+}
+
+/*
+ * Basic yield call.
+ */
+void
+sched_relinquish(struct thread *td)
+{
+ thread_lock(td);
+ mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
+ thread_unlock(td);
+}
+
+/*
+ * Return the total system load.
+ */
+int
+sched_load(void)
+{
+#ifdef SMP
+ int total;
+ int i;
+
+ total = 0;
+ CPU_FOREACH(i)
+ total += TDQ_CPU(i)->tdq_sysload;
+ return (total);
+#else
+ return (TDQ_SELF()->tdq_sysload);
+#endif
+}
+
+int
+sched_sizeof_proc(void)
+{
+ return (sizeof(struct proc));
+}
+
+int
+sched_sizeof_thread(void)
+{
+ return (sizeof(struct thread) + sizeof(struct td_sched));
+}
+
+#ifdef SMP
+#define TDQ_IDLESPIN(tdq) \
+ ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0)
+#else
+#define TDQ_IDLESPIN(tdq) 1
+#endif
+
+/*
+ * The actual idle process.
+ */
+void
+sched_idletd(void *dummy)
+{
+ struct thread *td;
+ struct tdq *tdq;
+ int oldswitchcnt, switchcnt;
+ int i;
+
+ mtx_assert(&Giant, MA_NOTOWNED);
+ td = curthread;
+ tdq = TDQ_SELF();
+ THREAD_NO_SLEEPING();
+ oldswitchcnt = -1;
+ for (;;) {
+ if (tdq->tdq_load) {
+ thread_lock(td);
+ mi_switch(SW_VOL | SWT_IDLE, NULL);
+ thread_unlock(td);
+ }
+ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+#ifdef SMP
+ if (switchcnt != oldswitchcnt) {
+ oldswitchcnt = switchcnt;
+ if (tdq_idled(tdq) == 0)
+ continue;
+ }
+ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+#else
+ oldswitchcnt = switchcnt;
+#endif
+ /*
+ * If we're switching very frequently, spin while checking
+ * for load rather than entering a low power state that
+ * may require an IPI. However, don't do any busy
+ * loops while on SMT machines as this simply steals
+ * cycles from cores doing useful work.
+ */
+ if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) {
+ for (i = 0; i < sched_idlespins; i++) {
+ if (tdq->tdq_load)
+ break;
+ cpu_spinwait();
+ }
+ }
+
+ /* If there was context switch during spin, restart it. */
+ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+ if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt)
+ continue;
+
+ /* Run main MD idle handler. */
+ tdq->tdq_cpu_idle = 1;
+ cpu_idle(switchcnt * 4 > sched_idlespinthresh);
+ tdq->tdq_cpu_idle = 0;
+
+ /*
+ * Account for thread-less hardware interrupts and
+ * other wakeup reasons as if they were context switches.
+ */
+ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+ if (switchcnt != oldswitchcnt)
+ continue;
+ tdq->tdq_switchcnt++;
+ oldswitchcnt++;
+ }
+}
+
+/*
+ * A CPU is entering for the first time or a thread is exiting.
+ */
+void
+sched_throw(struct thread *td)
+{
+ struct thread *newtd;
+ struct tdq *tdq;
+
+ tdq = TDQ_SELF();
+ if (td == NULL) {
+ /* Correct spinlock nesting and acquire the correct lock. */
+ TDQ_LOCK(tdq);
+ spinlock_exit();
+ PCPU_SET(switchtime, cpu_ticks());
+ PCPU_SET(switchticks, ticks);
+ } else {
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ tdq_load_rem(tdq, td);
+ lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
+ }
+ KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
+ newtd = choosethread();
+ TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
+ cpu_throw(td, newtd); /* doesn't return */
+}
+
+/*
+ * This is called from fork_exit(). Just acquire the correct locks and
+ * let fork do the rest of the work.
+ */
+void
+sched_fork_exit(struct thread *td)
+{
+ struct td_sched *ts;
+ struct tdq *tdq;
+ int cpuid;
+
+ /*
+ * Finish setting up thread glue so that it begins execution in a
+ * non-nested critical section with the scheduler lock held.
+ */
+ cpuid = PCPU_GET(cpuid);
+ tdq = TDQ_CPU(cpuid);
+ ts = td->td_sched;
+ if (TD_IS_IDLETHREAD(td))
+ td->td_lock = TDQ_LOCKPTR(tdq);
+ MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+ td->td_oncpu = cpuid;
+ TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
+ lock_profile_obtain_lock_success(
+ &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
+}
+
+/*
+ * Create on first use to catch odd startup conditions.
+ */
+char *
+sched_tdname(struct thread *td)
+{
+#ifdef KTR
+ struct td_sched *ts;
+
+ ts = td->td_sched;
+ if (ts->ts_name[0] == '\0')
+ snprintf(ts->ts_name, sizeof(ts->ts_name),
+ "%s tid %d", td->td_name, td->td_tid);
+ return (ts->ts_name);
+#else
+ return (td->td_name);
+#endif
+}
+
+#ifdef KTR
+void
+sched_clear_tdname(struct thread *td)
+{
+ struct td_sched *ts;
+
+ ts = td->td_sched;
+ ts->ts_name[0] = '\0';
+}
+#endif
+
+#ifdef SMP
+
+/*
+ * Build the CPU topology dump string. This function is called
+ * recursively to collect the topology tree.
+ */
+static int
+sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg,
+ int indent)
+{
+ char cpusetbuf[CPUSETBUFSIZ];
+ int i, first;
+
+ sbuf_printf(sb, "%*s<group level=\"%d\" cache-level=\"%d\">\n", indent,
+ "", 1 + indent / 2, cg->cg_level);
+ sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"%s\">", indent, "",
+ cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask));
+ first = TRUE;
+ for (i = 0; i < MAXCPU; i++) {
+ if (CPU_ISSET(i, &cg->cg_mask)) {
+ if (!first)
+ sbuf_printf(sb, ", ");
+ else
+ first = FALSE;
+ sbuf_printf(sb, "%d", i);
+ }
+ }
+ sbuf_printf(sb, "</cpu>\n");
+
+ if (cg->cg_flags != 0) {
+ sbuf_printf(sb, "%*s <flags>", indent, "");
+ if ((cg->cg_flags & CG_FLAG_HTT) != 0)
+ sbuf_printf(sb, "<flag name=\"HTT\">HTT group</flag>");
+ if ((cg->cg_flags & CG_FLAG_THREAD) != 0)
+ sbuf_printf(sb, "<flag name=\"THREAD\">THREAD group</flag>");
+ if ((cg->cg_flags & CG_FLAG_SMT) != 0)
+ sbuf_printf(sb, "<flag name=\"SMT\">SMT group</flag>");
+ sbuf_printf(sb, "</flags>\n");
+ }
+
+ if (cg->cg_children > 0) {
+ sbuf_printf(sb, "%*s <children>\n", indent, "");
+ for (i = 0; i < cg->cg_children; i++)
+ sysctl_kern_sched_topology_spec_internal(sb,
+ &cg->cg_child[i], indent+2);
+ sbuf_printf(sb, "%*s </children>\n", indent, "");
+ }
+ sbuf_printf(sb, "%*s</group>\n", indent, "");
+ return (0);
+}
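+
+/*
+ * For reference, the sbuf built above looks roughly like the following
+ * (a hypothetical 4-CPU box with two dual-core packages; the actual
+ * levels, masks and flags depend on the detected topology, and the
+ * outer <groups> wrapper is added by the handler below):
+ *
+ *	<groups>
+ *	 <group level="1" cache-level="0">
+ *	  <cpu count="4" mask="f">0, 1, 2, 3</cpu>
+ *	  <children>
+ *	   <group level="2" cache-level="2">
+ *	    <cpu count="2" mask="3">0, 1</cpu>
+ *	   </group>
+ *	   <group level="2" cache-level="2">
+ *	    <cpu count="2" mask="c">2, 3</cpu>
+ *	   </group>
+ *	  </children>
+ *	 </group>
+ *	</groups>
+ */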
+
+/*
+ * Sysctl handler for retrieving the topology dump. It's a wrapper for
+ * the recursive sysctl_kern_sched_topology_spec_internal().
+ */
+static int
+sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf *topo;
+ int err;
+
+ KASSERT(cpu_top != NULL, ("cpu_top isn't initialized"));
+
+ topo = sbuf_new(NULL, NULL, 500, SBUF_AUTOEXTEND);
+ if (topo == NULL)
+ return (ENOMEM);
+
+ sbuf_printf(topo, "<groups>\n");
+ err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1);
+ sbuf_printf(topo, "</groups>\n");
+
+ if (err == 0) {
+ sbuf_finish(topo);
+ err = SYSCTL_OUT(req, sbuf_data(topo), sbuf_len(topo));
+ }
+ sbuf_delete(topo);
+ return (err);
+}
+
+#endif
+
+static int
+sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
+{
+ int error, new_val, period;
+
+ period = 1000000 / realstathz;
+ new_val = period * sched_slice;
+ error = sysctl_handle_int(oidp, &new_val, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (new_val <= 0)
+ return (EINVAL);
+ sched_slice = imax(1, (new_val + period / 2) / period);
+ sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR;
+ hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
+ realstathz);
+ return (0);
+}
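+
+/*
+ * Worked example of the conversion above (illustrative numbers): with
+ * realstathz = 128, one stathz tick is 1000000 / 128 = 7812 us. Reading
+ * the OID while sched_slice = 10 reports 10 * 7812 = 78120 us. Writing
+ * 20000 (us) back yields sched_slice = imax(1, (20000 + 3906) / 7812) = 3,
+ * i.e. the new quantum is rounded to the nearest whole stathz tick and
+ * never drops below one tick.
+ */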
+
+SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
+SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
+ "Scheduler name");
+SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
+ NULL, 0, sysctl_kern_quantum, "I",
+ "Quantum for timeshare threads in microseconds");
+SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
+ "Quantum for timeshare threads in stathz ticks");
+SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
+ "Interactivity score threshold");
+SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW,
+ &preempt_thresh, 0,
+ "Maximal (lowest) priority for preemption");
+SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0,
+ "Assign static kernel priorities to sleeping threads");
+SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0,
+ "Number of times idle thread will spin waiting for new work");
+SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW,
+ &sched_idlespinthresh, 0,
+ "Threshold before we will permit idle thread spinning");
+#ifdef SMP
+SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
+ "Number of hz ticks to keep thread affinity for");
+SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
+ "Enables the long-term load balancer");
+SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
+ &balance_interval, 0,
+ "Average period in stathz ticks to run the long-term balancer");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
+ "Attempts to steal work from other cores before idling");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
+ "Minimum load on remote CPU before we'll steal");
+SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING |
+ CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A",
+ "XML dump of detected CPU topology");
+#endif
+
+/* ps compat. All cpu percentages from ULE are weighted. */
+static int ccpu = 0;
+SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
diff --git a/sys/kern/serdev_if.m b/sys/kern/serdev_if.m
new file mode 100644
index 0000000..fbf4363
--- /dev/null
+++ b/sys/kern/serdev_if.m
@@ -0,0 +1,94 @@
+#-
+# Copyright (c) 2006 Marcel Moolenaar
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+#
+
+#include <sys/bus.h>
+#include <sys/serial.h>
+
+# The serdev interface is used by umbrella drivers and children thereof to
+# establish a more intimate relationship, necessary for efficient handling
+# of multiple (concurrent) serial communication channels. Examples include
+# serial communications controller (SCC) drivers, multi-I/O adapter drivers
+# and intelligent multi-port serial drivers. Methods specifically deal
+# with interrupt handling and configuration. Conceptually, the umbrella
+# driver is responsible for the overall operation of the hardware and uses
+# child drivers to handle each individual channel.
+# The serdev interface is intended to inherit the device interface.
+
+INTERFACE serdev;
+
+# Default implementations of some methods.
+CODE {
+ static serdev_intr_t *
+ default_ihand(device_t dev, int ipend)
+ {
+ return (NULL);
+ }
+
+ static int
+ default_ipend(device_t dev)
+ {
+ return (-1);
+ }
+
+ static int
+ default_sysdev(device_t dev)
+ {
+ return (0);
+ }
+};
+
+# ihand() - Query serial device interrupt handler.
+# This method is called by the umbrella driver to obtain function pointers
+# to interrupt handlers for each individual interrupt source. This allows
+# the umbrella driver to control the servicing of interrupts between the
+# different channels in the most flexible way.
+METHOD serdev_intr_t* ihand {
+ device_t dev;
+ int ipend;
+} DEFAULT default_ihand;
+
+# ipend() - Query pending interrupt status.
+# This method is called by the umbrella driver to obtain interrupt status
+# for the UART in question. This allows the umbrella driver to build a
+# matrix and service the interrupts in the most flexible way by calling
+# interrupt handlers collected with the ihand() method.
+METHOD int ipend {
+ device_t dev;
+} DEFAULT default_ipend;
+
+# sysdev() - Query system device status.
+# This method may be called by the umbrella driver for each child driver
+# to establish whether a particular channel and mode is currently being
+# used for system-specific purposes. If this is the case, the hardware
+# is not reset and the channel will not change its operation mode.
+# The return value is !0 if the channel and mode are used for a system
+# device and 0 otherwise.
+METHOD int sysdev {
+ device_t dev;
+} DEFAULT default_sysdev;
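+
+# Illustrative sketch (not part of this interface file): the interface
+# compiler turns the methods above into SERDEV_IHAND(), SERDEV_IPEND()
+# and SERDEV_SYSDEV() kobj calls. A hypothetical umbrella driver could
+# collect its children's handlers at attach time and poll for work from
+# its own interrupt routine, roughly like this (the names and the
+# meaning of the 'ipend' bits are assumptions, not dictated here):
+#
+#	/* attach: remember per-source handlers from the child. */
+#	for (src = 0; src < SC_NSRC; src++)
+#		sc->sc_ihand[src] = SERDEV_IHAND(child, 1 << src);
+#
+#	/* attach: leave system devices (e.g. consoles) untouched. */
+#	if (SERDEV_SYSDEV(child))
+#		skip_hardware_reset = 1;
+#
+#	/* interrupt: ask the child what is pending and dispatch. */
+#	ipend = SERDEV_IPEND(child);
+#	if (ipend > 0) {
+#		/* call the matching sc->sc_ihand[] entries */
+#	}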
+
diff --git a/sys/kern/stack_protector.c b/sys/kern/stack_protector.c
new file mode 100644
index 0000000..b5f9973
--- /dev/null
+++ b/sys/kern/stack_protector.c
@@ -0,0 +1,31 @@
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/libkern.h>
+
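+/*
+ * Minimal stack-smashing-protection (SSP) runtime. When the kernel is
+ * built with -fstack-protector, the compiler places a canary taken from
+ * __stack_chk_guard into each protected stack frame and verifies it on
+ * function return; on a mismatch it calls __stack_chk_fail(), which
+ * here simply panics. The SYSINIT below fills the guard with random
+ * bytes once the random subsystem is available.
+ */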
+long __stack_chk_guard[8] = {};
+void __stack_chk_fail(void);
+
+void
+__stack_chk_fail(void)
+{
+
+ panic("stack overflow detected; backtrace may be corrupted");
+}
+
+#define __arraycount(__x) (sizeof(__x) / sizeof(__x[0]))
+static void
+__stack_chk_init(void *dummy __unused)
+{
+ size_t i;
+ long guard[__arraycount(__stack_chk_guard)];
+
+ arc4rand(guard, sizeof(guard), 0);
+ for (i = 0; i < __arraycount(guard); i++)
+ __stack_chk_guard[i] = guard[i];
+}
+SYSINIT(stack_chk, SI_SUB_RANDOM, SI_ORDER_ANY, __stack_chk_init, NULL);
diff --git a/sys/kern/subr_acl_nfs4.c b/sys/kern/subr_acl_nfs4.c
new file mode 100644
index 0000000..ef378a0
--- /dev/null
+++ b/sys/kern/subr_acl_nfs4.c
@@ -0,0 +1,1417 @@
+/*-
+ * Copyright (c) 2008-2010 Edward Tomasz Napierała <trasz@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * ACL support routines specific to NFSv4 access control lists. These are
+ * utility routines for code common across file systems implementing NFSv4
+ * ACLs.
+ */
+
+#ifdef _KERNEL
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/systm.h>
+#include <sys/mount.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/acl.h>
+#else
+#include <errno.h>
+#include <assert.h>
+#include <sys/acl.h>
+#include <sys/stat.h>
+#define KASSERT(a, b) assert(a)
+#define CTASSERT(a)
+
+#endif /* !_KERNEL */
+
+#ifdef _KERNEL
+
+static void acl_nfs4_trivial_from_mode(struct acl *aclp, mode_t mode);
+
+static int acl_nfs4_old_semantics = 0;
+
+SYSCTL_INT(_vfs, OID_AUTO, acl_nfs4_old_semantics, CTLFLAG_RW,
+ &acl_nfs4_old_semantics, 0, "Use pre-PSARC/2010/029 NFSv4 ACL semantics");
+
+static struct {
+ accmode_t accmode;
+ int mask;
+} accmode2mask[] = {{VREAD, ACL_READ_DATA},
+ {VWRITE, ACL_WRITE_DATA},
+ {VAPPEND, ACL_APPEND_DATA},
+ {VEXEC, ACL_EXECUTE},
+ {VREAD_NAMED_ATTRS, ACL_READ_NAMED_ATTRS},
+ {VWRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS},
+ {VDELETE_CHILD, ACL_DELETE_CHILD},
+ {VREAD_ATTRIBUTES, ACL_READ_ATTRIBUTES},
+ {VWRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES},
+ {VDELETE, ACL_DELETE},
+ {VREAD_ACL, ACL_READ_ACL},
+ {VWRITE_ACL, ACL_WRITE_ACL},
+ {VWRITE_OWNER, ACL_WRITE_OWNER},
+ {VSYNCHRONIZE, ACL_SYNCHRONIZE},
+ {0, 0}};
+
+static int
+_access_mask_from_accmode(accmode_t accmode)
+{
+ int access_mask = 0, i;
+
+ for (i = 0; accmode2mask[i].accmode != 0; i++) {
+ if (accmode & accmode2mask[i].accmode)
+ access_mask |= accmode2mask[i].mask;
+ }
+
+ /*
+ * VAPPEND is just a modifier for VWRITE; if the caller asked
+ * for 'VAPPEND | VWRITE', we want to check for ACL_APPEND_DATA only.
+ */
+ if (access_mask & ACL_APPEND_DATA)
+ access_mask &= ~ACL_WRITE_DATA;
+
+ return (access_mask);
+}
+
+/*
+ * Return 0 iff access is allowed, 1 otherwise.
+ */
+static int
+_acl_denies(const struct acl *aclp, int access_mask, struct ucred *cred,
+ int file_uid, int file_gid, int *denied_explicitly)
+{
+ int i;
+ const struct acl_entry *entry;
+
+ if (denied_explicitly != NULL)
+ *denied_explicitly = 0;
+
+ KASSERT(aclp->acl_cnt <= ACL_MAX_ENTRIES,
+ ("aclp->acl_cnt <= ACL_MAX_ENTRIES"));
+
+ for (i = 0; i < aclp->acl_cnt; i++) {
+ entry = &(aclp->acl_entry[i]);
+
+ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW &&
+ entry->ae_entry_type != ACL_ENTRY_TYPE_DENY)
+ continue;
+ if (entry->ae_flags & ACL_ENTRY_INHERIT_ONLY)
+ continue;
+ switch (entry->ae_tag) {
+ case ACL_USER_OBJ:
+ if (file_uid != cred->cr_uid)
+ continue;
+ break;
+ case ACL_USER:
+ if (entry->ae_id != cred->cr_uid)
+ continue;
+ break;
+ case ACL_GROUP_OBJ:
+ if (!groupmember(file_gid, cred))
+ continue;
+ break;
+ case ACL_GROUP:
+ if (!groupmember(entry->ae_id, cred))
+ continue;
+ break;
+ default:
+ KASSERT(entry->ae_tag == ACL_EVERYONE,
+ ("entry->ae_tag == ACL_EVERYONE"));
+ }
+
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_DENY) {
+ if (entry->ae_perm & access_mask) {
+ if (denied_explicitly != NULL)
+ *denied_explicitly = 1;
+ return (1);
+ }
+ }
+
+ access_mask &= ~(entry->ae_perm);
+ if (access_mask == 0)
+ return (0);
+ }
+
+ if (access_mask == 0)
+ return (0);
+
+ return (1);
+}
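+
+/*
+ * Worked example for the loop above, with a hypothetical two-entry ACL
+ * "user:1001:deny:write_data" followed by
+ * "everyone@:allow:read_data/write_data": a request by uid 1001 for
+ * ACL_READ_DATA succeeds, because the DENY entry covers none of the
+ * requested bits and the everyone@ ALLOW entry clears the remaining
+ * mask; a request for ACL_WRITE_DATA hits the DENY entry first, so the
+ * function returns 1 with *denied_explicitly set.
+ */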
+
+int
+vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid,
+ struct acl *aclp, accmode_t accmode, struct ucred *cred, int *privused)
+{
+ accmode_t priv_granted = 0;
+ int denied, explicitly_denied, access_mask, is_directory,
+ must_be_owner = 0;
+ mode_t file_mode = 0;
+
+ KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND |
+ VEXPLICIT_DENY | VREAD_NAMED_ATTRS | VWRITE_NAMED_ATTRS |
+ VDELETE_CHILD | VREAD_ATTRIBUTES | VWRITE_ATTRIBUTES | VDELETE |
+ VREAD_ACL | VWRITE_ACL | VWRITE_OWNER | VSYNCHRONIZE)) == 0,
+ ("invalid bit in accmode"));
+ KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
+ ("VAPPEND without VWRITE"));
+
+ if (privused != NULL)
+ *privused = 0;
+
+ if (accmode & VADMIN)
+ must_be_owner = 1;
+
+ /*
+ * Ignore VSYNCHRONIZE permission.
+ */
+ accmode &= ~VSYNCHRONIZE;
+
+ access_mask = _access_mask_from_accmode(accmode);
+
+ if (type == VDIR)
+ is_directory = 1;
+ else
+ is_directory = 0;
+
+ /*
+ * The file owner is always allowed to read and write the ACL
+ * and basic attributes. This prevents a situation where the
+ * owner would change the ACL in a way that prevents them from
+ * undoing the change.
+ */
+ if (file_uid == cred->cr_uid)
+ access_mask &= ~(ACL_READ_ACL | ACL_WRITE_ACL |
+ ACL_READ_ATTRIBUTES | ACL_WRITE_ATTRIBUTES);
+
+ /*
+ * Ignore append permission for regular files; use write
+ * permission instead.
+ */
+ if (!is_directory && (access_mask & ACL_APPEND_DATA)) {
+ access_mask &= ~ACL_APPEND_DATA;
+ access_mask |= ACL_WRITE_DATA;
+ }
+
+ denied = _acl_denies(aclp, access_mask, cred, file_uid, file_gid,
+ &explicitly_denied);
+
+ if (must_be_owner) {
+ if (file_uid != cred->cr_uid)
+ denied = EPERM;
+ }
+
+ /*
+ * For VEXEC, ensure that at least one execute bit is set for
+ * non-directories. We have to check the mode here to stay
+ * consistent with execve(2). See the test in
+ * exec_check_permissions().
+ */
+ acl_nfs4_sync_mode_from_acl(&file_mode, aclp);
+ if (!denied && !is_directory && (accmode & VEXEC) &&
+ (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)
+ denied = EACCES;
+
+ if (!denied)
+ return (0);
+
+ /*
+ * Access failed. If it was not denied explicitly and the
+ * VEXPLICIT_DENY flag was specified, allow access.
+ */
+ if ((accmode & VEXPLICIT_DENY) && explicitly_denied == 0)
+ return (0);
+
+ accmode &= ~VEXPLICIT_DENY;
+
+ /*
+ * No match. Try to use privileges, if there are any.
+ */
+ if (is_directory) {
+ if ((accmode & VEXEC) && !priv_check_cred(cred,
+ PRIV_VFS_LOOKUP, 0))
+ priv_granted |= VEXEC;
+ } else {
+ /*
+ * Ensure that at least one execute bit is on. Otherwise,
+ * a privileged user will always succeed, and we don't want
+ * this to happen unless the file really is executable.
+ */
+ if ((accmode & VEXEC) && (file_mode &
+ (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
+ !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
+ priv_granted |= VEXEC;
+ }
+
+ if ((accmode & VREAD) && !priv_check_cred(cred, PRIV_VFS_READ, 0))
+ priv_granted |= VREAD;
+
+ if ((accmode & (VWRITE | VAPPEND | VDELETE_CHILD)) &&
+ !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
+ priv_granted |= (VWRITE | VAPPEND | VDELETE_CHILD);
+
+ if ((accmode & VADMIN_PERMS) &&
+ !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
+ priv_granted |= VADMIN_PERMS;
+
+ if ((accmode & VSTAT_PERMS) &&
+ !priv_check_cred(cred, PRIV_VFS_STAT, 0))
+ priv_granted |= VSTAT_PERMS;
+
+ if ((accmode & priv_granted) == accmode) {
+ if (privused != NULL)
+ *privused = 1;
+
+ return (0);
+ }
+
+ if (accmode & (VADMIN_PERMS | VDELETE_CHILD | VDELETE))
+ denied = EPERM;
+ else
+ denied = EACCES;
+
+ return (denied);
+}
+#endif /* _KERNEL */
+
+static int
+_acl_entry_matches(struct acl_entry *entry, acl_tag_t tag, acl_perm_t perm,
+ acl_entry_type_t entry_type)
+{
+ if (entry->ae_tag != tag)
+ return (0);
+
+ if (entry->ae_id != ACL_UNDEFINED_ID)
+ return (0);
+
+ if (entry->ae_perm != perm)
+ return (0);
+
+ if (entry->ae_entry_type != entry_type)
+ return (0);
+
+ if (entry->ae_flags != 0)
+ return (0);
+
+ return (1);
+}
+
+static struct acl_entry *
+_acl_append(struct acl *aclp, acl_tag_t tag, acl_perm_t perm,
+ acl_entry_type_t entry_type)
+{
+ struct acl_entry *entry;
+
+ KASSERT(aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES,
+ ("aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES"));
+
+ entry = &(aclp->acl_entry[aclp->acl_cnt]);
+ aclp->acl_cnt++;
+
+ entry->ae_tag = tag;
+ entry->ae_id = ACL_UNDEFINED_ID;
+ entry->ae_perm = perm;
+ entry->ae_entry_type = entry_type;
+ entry->ae_flags = 0;
+
+ return (entry);
+}
+
+static struct acl_entry *
+_acl_duplicate_entry(struct acl *aclp, int entry_index)
+{
+ int i;
+
+ KASSERT(aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES,
+ ("aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES"));
+
+ for (i = aclp->acl_cnt; i > entry_index; i--)
+ aclp->acl_entry[i] = aclp->acl_entry[i - 1];
+
+ aclp->acl_cnt++;
+
+ return (&(aclp->acl_entry[entry_index + 1]));
+}
+
+static void
+acl_nfs4_sync_acl_from_mode_draft(struct acl *aclp, mode_t mode,
+ int file_owner_id)
+{
+ int i, meets, must_append;
+ struct acl_entry *entry, *copy, *previous,
+ *a1, *a2, *a3, *a4, *a5, *a6;
+ mode_t amode;
+ const int READ = 04;
+ const int WRITE = 02;
+ const int EXEC = 01;
+
+ KASSERT(aclp->acl_cnt <= ACL_MAX_ENTRIES,
+ ("aclp->acl_cnt <= ACL_MAX_ENTRIES"));
+
+ /*
+ * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt
+ *
+ * 3.16.6.3. Applying a Mode to an Existing ACL
+ */
+
+ /*
+ * 1. For each ACE:
+ */
+ for (i = 0; i < aclp->acl_cnt; i++) {
+ entry = &(aclp->acl_entry[i]);
+
+ /*
+ * 1.1. If the type is neither ALLOW nor DENY - skip.
+ */
+ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW &&
+ entry->ae_entry_type != ACL_ENTRY_TYPE_DENY)
+ continue;
+
+ /*
+ * 1.2. If ACL_ENTRY_INHERIT_ONLY is set - skip.
+ */
+ if (entry->ae_flags & ACL_ENTRY_INHERIT_ONLY)
+ continue;
+
+ /*
+ * 1.3. If ACL_ENTRY_FILE_INHERIT or ACL_ENTRY_DIRECTORY_INHERIT
+ * are set:
+ */
+ if (entry->ae_flags &
+ (ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT)) {
+ /*
+ * 1.3.1. A copy of the current ACE is made, and placed
+ * in the ACL immediately following the current
+ * ACE.
+ */
+ copy = _acl_duplicate_entry(aclp, i);
+
+ /*
+ * 1.3.2. In the first ACE, the flag
+ * ACL_ENTRY_INHERIT_ONLY is set.
+ */
+ entry->ae_flags |= ACL_ENTRY_INHERIT_ONLY;
+
+ /*
+ * 1.3.3. In the second ACE, the following flags
+ * are cleared:
+ * ACL_ENTRY_FILE_INHERIT,
+ * ACL_ENTRY_DIRECTORY_INHERIT,
+ * ACL_ENTRY_NO_PROPAGATE_INHERIT.
+ */
+ copy->ae_flags &= ~(ACL_ENTRY_FILE_INHERIT |
+ ACL_ENTRY_DIRECTORY_INHERIT |
+ ACL_ENTRY_NO_PROPAGATE_INHERIT);
+
+ /*
+ * The algorithm continues on with the second ACE.
+ */
+ i++;
+ entry = copy;
+ }
+
+ /*
+ * 1.4. If it's owner@, group@ or everyone@ entry, clear
+ * ACL_READ_DATA, ACL_WRITE_DATA, ACL_APPEND_DATA
+ * and ACL_EXECUTE. Continue to the next entry.
+ */
+ if (entry->ae_tag == ACL_USER_OBJ ||
+ entry->ae_tag == ACL_GROUP_OBJ ||
+ entry->ae_tag == ACL_EVERYONE) {
+ entry->ae_perm &= ~(ACL_READ_DATA | ACL_WRITE_DATA |
+ ACL_APPEND_DATA | ACL_EXECUTE);
+ continue;
+ }
+
+ /*
+ * 1.5. Otherwise, if the "who" field did not match one
+ * of OWNER@, GROUP@, EVERYONE@:
+ *
+ * 1.5.1. If the type is ALLOW, check the preceding ACE.
+ * If it does not meet all of the following criteria:
+ */
+ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW)
+ continue;
+
+ meets = 0;
+ if (i > 0) {
+ meets = 1;
+ previous = &(aclp->acl_entry[i - 1]);
+
+ /*
+ * 1.5.1.1. The type field is DENY,
+ */
+ if (previous->ae_entry_type != ACL_ENTRY_TYPE_DENY)
+ meets = 0;
+
+ /*
+ * 1.5.1.2. The "who" field is the same as the current
+ * ACE,
+ *
+ * 1.5.1.3. The flag bit ACE4_IDENTIFIER_GROUP
+ * is the same as it is in the current ACE,
+ * and no other flag bits are set,
+ */
+ if (previous->ae_id != entry->ae_id ||
+ previous->ae_tag != entry->ae_tag)
+ meets = 0;
+
+ if (previous->ae_flags)
+ meets = 0;
+
+ /*
+ * 1.5.1.4. The mask bits are a subset of the mask bits
+ * of the current ACE, and are also a subset of
+ * the following: ACL_READ_DATA,
+ * ACL_WRITE_DATA, ACL_APPEND_DATA, ACL_EXECUTE
+ */
+ if (previous->ae_perm & ~(entry->ae_perm))
+ meets = 0;
+
+ if (previous->ae_perm & ~(ACL_READ_DATA |
+ ACL_WRITE_DATA | ACL_APPEND_DATA | ACL_EXECUTE))
+ meets = 0;
+ }
+
+ if (!meets) {
+ /*
+ * Then the ACE of type DENY, with a who equal
+ * to the current ACE, flag bits equal to
+ * (<current ACE flags> & <ACE_IDENTIFIER_GROUP>)
+ * and no mask bits, is prepended.
+ */
+ previous = entry;
+ entry = _acl_duplicate_entry(aclp, i);
+
+ /* Adjust counter, as we've just added an entry. */
+ i++;
+
+ previous->ae_tag = entry->ae_tag;
+ previous->ae_id = entry->ae_id;
+ previous->ae_flags = entry->ae_flags;
+ previous->ae_perm = 0;
+ previous->ae_entry_type = ACL_ENTRY_TYPE_DENY;
+ }
+
+ /*
+ * 1.5.2. The following modifications are made to the prepended
+ * ACE. The intent is to mask the following ACE
+ * to disallow ACL_READ_DATA, ACL_WRITE_DATA,
+ * ACL_APPEND_DATA, or ACL_EXECUTE, based upon the group
+ * permissions of the new mode. As a special case,
+ * if the ACE matches the current owner of the file,
+ * the owner bits are used, rather than the group bits.
+ * This is reflected in the algorithm below.
+ */
+ amode = mode >> 3;
+
+ /*
+ * If ACE4_IDENTIFIER_GROUP is not set, and the "who" field
+ * in ACE matches the owner of the file, we shift amode three
+ * more bits, in order to have the owner permission bits
+ * placed in the three low order bits of amode.
+ */
+ if (entry->ae_tag == ACL_USER && entry->ae_id == file_owner_id)
+ amode = amode >> 3;
+
+ if (entry->ae_perm & ACL_READ_DATA) {
+ if (amode & READ)
+ previous->ae_perm &= ~ACL_READ_DATA;
+ else
+ previous->ae_perm |= ACL_READ_DATA;
+ }
+
+ if (entry->ae_perm & ACL_WRITE_DATA) {
+ if (amode & WRITE)
+ previous->ae_perm &= ~ACL_WRITE_DATA;
+ else
+ previous->ae_perm |= ACL_WRITE_DATA;
+ }
+
+ if (entry->ae_perm & ACL_APPEND_DATA) {
+ if (amode & WRITE)
+ previous->ae_perm &= ~ACL_APPEND_DATA;
+ else
+ previous->ae_perm |= ACL_APPEND_DATA;
+ }
+
+ if (entry->ae_perm & ACL_EXECUTE) {
+ if (amode & EXEC)
+ previous->ae_perm &= ~ACL_EXECUTE;
+ else
+ previous->ae_perm |= ACL_EXECUTE;
+ }
+
+ /*
+ * 1.5.3. If ACE4_IDENTIFIER_GROUP is set in the flags
+ * of the ALLOW ace:
+ *
+ * XXX: This point is not in Falkner's draft.
+ */
+ if (entry->ae_tag == ACL_GROUP &&
+ entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW) {
+ mode_t extramode, ownermode;
+ extramode = (mode >> 3) & 07;
+ ownermode = mode >> 6;
+ extramode &= ~ownermode;
+
+ if (extramode) {
+ if (extramode & READ) {
+ entry->ae_perm &= ~ACL_READ_DATA;
+ previous->ae_perm &= ~ACL_READ_DATA;
+ }
+
+ if (extramode & WRITE) {
+ entry->ae_perm &=
+ ~(ACL_WRITE_DATA | ACL_APPEND_DATA);
+ previous->ae_perm &=
+ ~(ACL_WRITE_DATA | ACL_APPEND_DATA);
+ }
+
+ if (extramode & EXEC) {
+ entry->ae_perm &= ~ACL_EXECUTE;
+ previous->ae_perm &= ~ACL_EXECUTE;
+ }
+ }
+ }
+ }
+
+ /*
+ * 2. If there are at least six ACEs, the final six ACEs are examined.
+ * If they are not equal to what we want, append six ACEs.
+ */
+ must_append = 0;
+ if (aclp->acl_cnt < 6) {
+ must_append = 1;
+ } else {
+ a6 = &(aclp->acl_entry[aclp->acl_cnt - 1]);
+ a5 = &(aclp->acl_entry[aclp->acl_cnt - 2]);
+ a4 = &(aclp->acl_entry[aclp->acl_cnt - 3]);
+ a3 = &(aclp->acl_entry[aclp->acl_cnt - 4]);
+ a2 = &(aclp->acl_entry[aclp->acl_cnt - 5]);
+ a1 = &(aclp->acl_entry[aclp->acl_cnt - 6]);
+
+ if (!_acl_entry_matches(a1, ACL_USER_OBJ, 0,
+ ACL_ENTRY_TYPE_DENY))
+ must_append = 1;
+ if (!_acl_entry_matches(a2, ACL_USER_OBJ, ACL_WRITE_ACL |
+ ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES |
+ ACL_WRITE_NAMED_ATTRS, ACL_ENTRY_TYPE_ALLOW))
+ must_append = 1;
+ if (!_acl_entry_matches(a3, ACL_GROUP_OBJ, 0,
+ ACL_ENTRY_TYPE_DENY))
+ must_append = 1;
+ if (!_acl_entry_matches(a4, ACL_GROUP_OBJ, 0,
+ ACL_ENTRY_TYPE_ALLOW))
+ must_append = 1;
+ if (!_acl_entry_matches(a5, ACL_EVERYONE, ACL_WRITE_ACL |
+ ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES |
+ ACL_WRITE_NAMED_ATTRS, ACL_ENTRY_TYPE_DENY))
+ must_append = 1;
+ if (!_acl_entry_matches(a6, ACL_EVERYONE, ACL_READ_ACL |
+ ACL_READ_ATTRIBUTES | ACL_READ_NAMED_ATTRS |
+ ACL_SYNCHRONIZE, ACL_ENTRY_TYPE_ALLOW))
+ must_append = 1;
+ }
+
+ if (must_append) {
+ KASSERT(aclp->acl_cnt + 6 <= ACL_MAX_ENTRIES,
+ ("aclp->acl_cnt <= ACL_MAX_ENTRIES"));
+
+ a1 = _acl_append(aclp, ACL_USER_OBJ, 0, ACL_ENTRY_TYPE_DENY);
+ a2 = _acl_append(aclp, ACL_USER_OBJ, ACL_WRITE_ACL |
+ ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES |
+ ACL_WRITE_NAMED_ATTRS, ACL_ENTRY_TYPE_ALLOW);
+ a3 = _acl_append(aclp, ACL_GROUP_OBJ, 0, ACL_ENTRY_TYPE_DENY);
+ a4 = _acl_append(aclp, ACL_GROUP_OBJ, 0, ACL_ENTRY_TYPE_ALLOW);
+ a5 = _acl_append(aclp, ACL_EVERYONE, ACL_WRITE_ACL |
+ ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES |
+ ACL_WRITE_NAMED_ATTRS, ACL_ENTRY_TYPE_DENY);
+ a6 = _acl_append(aclp, ACL_EVERYONE, ACL_READ_ACL |
+ ACL_READ_ATTRIBUTES | ACL_READ_NAMED_ATTRS |
+ ACL_SYNCHRONIZE, ACL_ENTRY_TYPE_ALLOW);
+
+ KASSERT(a1 != NULL && a2 != NULL && a3 != NULL && a4 != NULL &&
+ a5 != NULL && a6 != NULL, ("couldn't append to ACL."));
+ }
+
+ /*
+ * 3. The final six ACEs are adjusted according to the incoming mode.
+ */
+ if (mode & S_IRUSR)
+ a2->ae_perm |= ACL_READ_DATA;
+ else
+ a1->ae_perm |= ACL_READ_DATA;
+ if (mode & S_IWUSR)
+ a2->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ else
+ a1->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ if (mode & S_IXUSR)
+ a2->ae_perm |= ACL_EXECUTE;
+ else
+ a1->ae_perm |= ACL_EXECUTE;
+
+ if (mode & S_IRGRP)
+ a4->ae_perm |= ACL_READ_DATA;
+ else
+ a3->ae_perm |= ACL_READ_DATA;
+ if (mode & S_IWGRP)
+ a4->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ else
+ a3->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ if (mode & S_IXGRP)
+ a4->ae_perm |= ACL_EXECUTE;
+ else
+ a3->ae_perm |= ACL_EXECUTE;
+
+ if (mode & S_IROTH)
+ a6->ae_perm |= ACL_READ_DATA;
+ else
+ a5->ae_perm |= ACL_READ_DATA;
+ if (mode & S_IWOTH)
+ a6->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ else
+ a5->ae_perm |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ if (mode & S_IXOTH)
+ a6->ae_perm |= ACL_EXECUTE;
+ else
+ a5->ae_perm |= ACL_EXECUTE;
+}
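+
+/*
+ * Worked example for steps 2 and 3 (starting from an empty ACL, so the
+ * six baseline entries are appended first): for mode 0644 the final six
+ * ACEs come out as
+ *
+ *	owner@:deny:execute
+ *	owner@:allow:read_data/write_data/append_data/write_acl/
+ *	    write_owner/write_attributes/write_named_attrs
+ *	group@:deny:write_data/append_data/execute
+ *	group@:allow:read_data
+ *	everyone@:deny:write_data/append_data/execute/write_acl/
+ *	    write_owner/write_attributes/write_named_attrs
+ *	everyone@:allow:read_data/read_acl/read_attributes/
+ *	    read_named_attrs/synchronize
+ *
+ * which is the "canonical six" encoding of rw-r--r--.
+ */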
+
+#ifdef _KERNEL
+void
+acl_nfs4_sync_acl_from_mode(struct acl *aclp, mode_t mode,
+ int file_owner_id)
+{
+
+ if (acl_nfs4_old_semantics)
+ acl_nfs4_sync_acl_from_mode_draft(aclp, mode, file_owner_id);
+ else
+ acl_nfs4_trivial_from_mode(aclp, mode);
+}
+#endif /* _KERNEL */
+
+void
+acl_nfs4_sync_mode_from_acl(mode_t *_mode, const struct acl *aclp)
+{
+ int i;
+ mode_t old_mode = *_mode, mode = 0, seen = 0;
+ const struct acl_entry *entry;
+
+ KASSERT(aclp->acl_cnt <= ACL_MAX_ENTRIES,
+ ("aclp->acl_cnt <= ACL_MAX_ENTRIES"));
+
+ /*
+ * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt
+ *
+ * 3.16.6.1. Recomputing mode upon SETATTR of ACL
+ */
+
+ for (i = 0; i < aclp->acl_cnt; i++) {
+ entry = &(aclp->acl_entry[i]);
+
+ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW &&
+ entry->ae_entry_type != ACL_ENTRY_TYPE_DENY)
+ continue;
+
+ if (entry->ae_flags & ACL_ENTRY_INHERIT_ONLY)
+ continue;
+
+ if (entry->ae_tag == ACL_USER_OBJ) {
+ if ((entry->ae_perm & ACL_READ_DATA) &&
+ ((seen & S_IRUSR) == 0)) {
+ seen |= S_IRUSR;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IRUSR;
+ }
+ if ((entry->ae_perm & ACL_WRITE_DATA) &&
+ ((seen & S_IWUSR) == 0)) {
+ seen |= S_IWUSR;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IWUSR;
+ }
+ if ((entry->ae_perm & ACL_EXECUTE) &&
+ ((seen & S_IXUSR) == 0)) {
+ seen |= S_IXUSR;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IXUSR;
+ }
+ } else if (entry->ae_tag == ACL_GROUP_OBJ) {
+ if ((entry->ae_perm & ACL_READ_DATA) &&
+ ((seen & S_IRGRP) == 0)) {
+ seen |= S_IRGRP;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IRGRP;
+ }
+ if ((entry->ae_perm & ACL_WRITE_DATA) &&
+ ((seen & S_IWGRP) == 0)) {
+ seen |= S_IWGRP;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IWGRP;
+ }
+ if ((entry->ae_perm & ACL_EXECUTE) &&
+ ((seen & S_IXGRP) == 0)) {
+ seen |= S_IXGRP;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IXGRP;
+ }
+ } else if (entry->ae_tag == ACL_EVERYONE) {
+ if (entry->ae_perm & ACL_READ_DATA) {
+ if ((seen & S_IRUSR) == 0) {
+ seen |= S_IRUSR;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IRUSR;
+ }
+ if ((seen & S_IRGRP) == 0) {
+ seen |= S_IRGRP;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IRGRP;
+ }
+ if ((seen & S_IROTH) == 0) {
+ seen |= S_IROTH;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IROTH;
+ }
+ }
+ if (entry->ae_perm & ACL_WRITE_DATA) {
+ if ((seen & S_IWUSR) == 0) {
+ seen |= S_IWUSR;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IWUSR;
+ }
+ if ((seen & S_IWGRP) == 0) {
+ seen |= S_IWGRP;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IWGRP;
+ }
+ if ((seen & S_IWOTH) == 0) {
+ seen |= S_IWOTH;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IWOTH;
+ }
+ }
+ if (entry->ae_perm & ACL_EXECUTE) {
+ if ((seen & S_IXUSR) == 0) {
+ seen |= S_IXUSR;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IXUSR;
+ }
+ if ((seen & S_IXGRP) == 0) {
+ seen |= S_IXGRP;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IXGRP;
+ }
+ if ((seen & S_IXOTH) == 0) {
+ seen |= S_IXOTH;
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ mode |= S_IXOTH;
+ }
+ }
+ }
+ }
+
+ *_mode = mode | (old_mode & ACL_PRESERVE_MASK);
+}
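+
+/*
+ * Worked example for the loop above, with a hypothetical two-entry ACL
+ * "owner@:deny:write_data" followed by
+ * "everyone@:allow:read_data/write_data": the owner write bit is decided
+ * by the first entry that mentions it, so S_IWUSR stays clear; the
+ * everyone@ entry then sets S_IRUSR, S_IRGRP, S_IROTH, S_IWGRP and
+ * S_IWOTH, giving 0466 (plus whatever non-rwx bits ACL_PRESERVE_MASK
+ * keeps from the old mode).
+ */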
+
+#ifdef _KERNEL
+/*
+ * Calculate inherited ACL in a manner compatible with NFSv4 Minor Version 1,
+ * draft-ietf-nfsv4-minorversion1-03.txt.
+ */
+static void
+acl_nfs4_compute_inherited_acl_draft(const struct acl *parent_aclp,
+ struct acl *child_aclp, mode_t mode, int file_owner_id,
+ int is_directory)
+{
+ int i, flags;
+ const struct acl_entry *parent_entry;
+ struct acl_entry *entry, *copy;
+
+ KASSERT(child_aclp->acl_cnt == 0, ("child_aclp->acl_cnt == 0"));
+ KASSERT(parent_aclp->acl_cnt <= ACL_MAX_ENTRIES,
+ ("parent_aclp->acl_cnt <= ACL_MAX_ENTRIES"));
+
+ /*
+ * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt
+ *
+ * 3.16.6.2. Applying the mode given to CREATE or OPEN
+ * to an inherited ACL
+ */
+
+ /*
+ * 1. Form an ACL that is the concatenation of all inheritable ACEs.
+ */
+ for (i = 0; i < parent_aclp->acl_cnt; i++) {
+ parent_entry = &(parent_aclp->acl_entry[i]);
+ flags = parent_entry->ae_flags;
+
+ /*
+ * Entry is not inheritable at all.
+ */
+ if ((flags & (ACL_ENTRY_DIRECTORY_INHERIT |
+ ACL_ENTRY_FILE_INHERIT)) == 0)
+ continue;
+
+ /*
+ * We're creating a file, but entry is not inheritable
+ * by files.
+ */
+ if (!is_directory && (flags & ACL_ENTRY_FILE_INHERIT) == 0)
+ continue;
+
+ /*
+ * Entry is inheritable only by files, but has NO_PROPAGATE
+ * flag set, and we're creating a directory, so it wouldn't
+ * propagate to any file in that directory anyway.
+ */
+ if (is_directory &&
+ (flags & ACL_ENTRY_DIRECTORY_INHERIT) == 0 &&
+ (flags & ACL_ENTRY_NO_PROPAGATE_INHERIT))
+ continue;
+
+ KASSERT(child_aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES,
+ ("child_aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES"));
+ child_aclp->acl_entry[child_aclp->acl_cnt] = *parent_entry;
+ child_aclp->acl_cnt++;
+ }
+
+ /*
+ * 2. For each entry in the new ACL, adjust its flags, possibly
+ * creating two entries in place of one.
+ */
+ for (i = 0; i < child_aclp->acl_cnt; i++) {
+ entry = &(child_aclp->acl_entry[i]);
+
+ /*
+ * This is not in the specification, but SunOS
+ * apparently does that.
+ */
+ if (((entry->ae_flags & ACL_ENTRY_NO_PROPAGATE_INHERIT) ||
+ !is_directory) &&
+ entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER);
+
+ /*
+ * 2.A. If the ACL_ENTRY_NO_PROPAGATE_INHERIT is set, or if the object
+ * being created is not a directory, then clear the
+ * following flags: ACL_ENTRY_NO_PROPAGATE_INHERIT,
+ * ACL_ENTRY_FILE_INHERIT, ACL_ENTRY_DIRECTORY_INHERIT,
+ * ACL_ENTRY_INHERIT_ONLY.
+ */
+ if (entry->ae_flags & ACL_ENTRY_NO_PROPAGATE_INHERIT ||
+ !is_directory) {
+ entry->ae_flags &= ~(ACL_ENTRY_NO_PROPAGATE_INHERIT |
+ ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT |
+ ACL_ENTRY_INHERIT_ONLY);
+
+ /*
+ * Continue on to the next ACE.
+ */
+ continue;
+ }
+
+ /*
+ * 2.B. If the object is a directory and ACL_ENTRY_FILE_INHERIT
+ * is set, but ACL_ENTRY_DIRECTORY_INHERIT is not set, ensure
+ * that ACL_ENTRY_INHERIT_ONLY is set. Continue to the
+ * next ACE. Otherwise...
+ */
+ /*
+ * XXX: Read it again and make sure what the "otherwise"
+ * applies to.
+ */
+ if (is_directory &&
+ (entry->ae_flags & ACL_ENTRY_FILE_INHERIT) &&
+ ((entry->ae_flags & ACL_ENTRY_DIRECTORY_INHERIT) == 0)) {
+ entry->ae_flags |= ACL_ENTRY_INHERIT_ONLY;
+ continue;
+ }
+
+ /*
+ * 2.C. If the type of the ACE is neither ALLOW nor DENY,
+ * then continue.
+ */
+ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW &&
+ entry->ae_entry_type != ACL_ENTRY_TYPE_DENY)
+ continue;
+
+ /*
+ * 2.D. Copy the original ACE into a second, adjacent ACE.
+ */
+ copy = _acl_duplicate_entry(child_aclp, i);
+
+ /*
+ * 2.E. On the first ACE, ensure that ACL_ENTRY_INHERIT_ONLY
+ * is set.
+ */
+ entry->ae_flags |= ACL_ENTRY_INHERIT_ONLY;
+
+ /*
+ * 2.F. On the second ACE, clear the following flags:
+ * ACL_ENTRY_NO_PROPAGATE_INHERIT, ACL_ENTRY_FILE_INHERIT,
+ * ACL_ENTRY_DIRECTORY_INHERIT, ACL_ENTRY_INHERIT_ONLY.
+ */
+ copy->ae_flags &= ~(ACL_ENTRY_NO_PROPAGATE_INHERIT |
+ ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT |
+ ACL_ENTRY_INHERIT_ONLY);
+
+ /*
+ * 2.G. On the second ACE, if the type is ALLOW,
+ * an implementation MAY clear the following
+ * mask bits: ACL_WRITE_ACL, ACL_WRITE_OWNER.
+ */
+ if (copy->ae_entry_type == ACL_ENTRY_TYPE_ALLOW)
+ copy->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER);
+
+ /*
+ * Increment the counter to skip the copied entry.
+ */
+ i++;
+ }
+
+ /*
+ * 3. To ensure that the mode is honored, apply the algorithm described
+ * in Section 3.16.6.3, using the mode that is to be used for file
+ * creation.
+ */
+ acl_nfs4_sync_acl_from_mode(child_aclp, mode, file_owner_id);
+}
+#endif /* _KERNEL */
+
+/*
+ * Populate the ACL with entries inherited from parent_aclp.
+ */
+static void
+acl_nfs4_inherit_entries(const struct acl *parent_aclp,
+ struct acl *child_aclp, mode_t mode, int file_owner_id,
+ int is_directory)
+{
+ int i, flags, tag;
+ const struct acl_entry *parent_entry;
+ struct acl_entry *entry;
+
+ KASSERT(parent_aclp->acl_cnt <= ACL_MAX_ENTRIES,
+ ("parent_aclp->acl_cnt <= ACL_MAX_ENTRIES"));
+
+ for (i = 0; i < parent_aclp->acl_cnt; i++) {
+ parent_entry = &(parent_aclp->acl_entry[i]);
+ flags = parent_entry->ae_flags;
+ tag = parent_entry->ae_tag;
+
+ /*
+ * Don't inherit owner@, group@, or everyone@ entries.
+ */
+ if (tag == ACL_USER_OBJ || tag == ACL_GROUP_OBJ ||
+ tag == ACL_EVERYONE)
+ continue;
+
+ /*
+ * Entry is not inheritable at all.
+ */
+ if ((flags & (ACL_ENTRY_DIRECTORY_INHERIT |
+ ACL_ENTRY_FILE_INHERIT)) == 0)
+ continue;
+
+ /*
+ * We're creating a file, but entry is not inheritable
+ * by files.
+ */
+ if (!is_directory && (flags & ACL_ENTRY_FILE_INHERIT) == 0)
+ continue;
+
+ /*
+ * Entry is inheritable only by files, but has NO_PROPAGATE
+ * flag set, and we're creating a directory, so it wouldn't
+ * propagate to any file in that directory anyway.
+ */
+ if (is_directory &&
+ (flags & ACL_ENTRY_DIRECTORY_INHERIT) == 0 &&
+ (flags & ACL_ENTRY_NO_PROPAGATE_INHERIT))
+ continue;
+
+ /*
+ * Entry qualifies for being inherited.
+ */
+ KASSERT(child_aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES,
+ ("child_aclp->acl_cnt + 1 <= ACL_MAX_ENTRIES"));
+ entry = &(child_aclp->acl_entry[child_aclp->acl_cnt]);
+ *entry = *parent_entry;
+ child_aclp->acl_cnt++;
+
+ entry->ae_flags &= ~ACL_ENTRY_INHERIT_ONLY;
+
+ /*
+ * If the type of the ACE is neither ALLOW nor DENY,
+ * then leave it as it is and proceed to the next one.
+ */
+ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW &&
+ entry->ae_entry_type != ACL_ENTRY_TYPE_DENY)
+ continue;
+
+ /*
+ * If the ACL_ENTRY_NO_PROPAGATE_INHERIT is set, or if
+ * the object being created is not a directory, then clear
+ * the following flags: ACL_ENTRY_NO_PROPAGATE_INHERIT,
+ * ACL_ENTRY_FILE_INHERIT, ACL_ENTRY_DIRECTORY_INHERIT,
+ * ACL_ENTRY_INHERIT_ONLY.
+ */
+ if (entry->ae_flags & ACL_ENTRY_NO_PROPAGATE_INHERIT ||
+ !is_directory) {
+ entry->ae_flags &= ~(ACL_ENTRY_NO_PROPAGATE_INHERIT |
+ ACL_ENTRY_FILE_INHERIT | ACL_ENTRY_DIRECTORY_INHERIT |
+ ACL_ENTRY_INHERIT_ONLY);
+ }
+
+ /*
+ * If the object is a directory and ACL_ENTRY_FILE_INHERIT
+ * is set, but ACL_ENTRY_DIRECTORY_INHERIT is not set, ensure
+ * that ACL_ENTRY_INHERIT_ONLY is set.
+ */
+ if (is_directory &&
+ (entry->ae_flags & ACL_ENTRY_FILE_INHERIT) &&
+ ((entry->ae_flags & ACL_ENTRY_DIRECTORY_INHERIT) == 0)) {
+ entry->ae_flags |= ACL_ENTRY_INHERIT_ONLY;
+ }
+
+ if (entry->ae_entry_type == ACL_ENTRY_TYPE_ALLOW &&
+ (entry->ae_flags & ACL_ENTRY_INHERIT_ONLY) == 0) {
+ /*
+ * Some permissions must never be inherited.
+ */
+ entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER |
+ ACL_WRITE_NAMED_ATTRS | ACL_WRITE_ATTRIBUTES);
+
+ /*
+ * Others must be masked according to the file mode.
+ */
+ if ((mode & S_IRGRP) == 0)
+ entry->ae_perm &= ~ACL_READ_DATA;
+ if ((mode & S_IWGRP) == 0)
+ entry->ae_perm &=
+ ~(ACL_WRITE_DATA | ACL_APPEND_DATA);
+ if ((mode & S_IXGRP) == 0)
+ entry->ae_perm &= ~ACL_EXECUTE;
+ }
+ }
+}
+
+/*
+ * Calculate inherited ACL in a manner compatible with PSARC/2010/029.
+ * It is also used to calculate a trivial ACL, by inheriting from
+ * a NULL ACL.
+ */
+static void
+acl_nfs4_compute_inherited_acl_psarc(const struct acl *parent_aclp,
+ struct acl *aclp, mode_t mode, int file_owner_id, int is_directory)
+{
+ acl_perm_t user_allow_first = 0, user_deny = 0, group_deny = 0;
+ acl_perm_t user_allow, group_allow, everyone_allow;
+
+ KASSERT(aclp->acl_cnt == 0, ("aclp->acl_cnt == 0"));
+
+ user_allow = group_allow = everyone_allow = ACL_READ_ACL |
+ ACL_READ_ATTRIBUTES | ACL_READ_NAMED_ATTRS | ACL_SYNCHRONIZE;
+ user_allow |= ACL_WRITE_ACL | ACL_WRITE_OWNER | ACL_WRITE_ATTRIBUTES |
+ ACL_WRITE_NAMED_ATTRS;
+
+ if (mode & S_IRUSR)
+ user_allow |= ACL_READ_DATA;
+ if (mode & S_IWUSR)
+ user_allow |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ if (mode & S_IXUSR)
+ user_allow |= ACL_EXECUTE;
+
+ if (mode & S_IRGRP)
+ group_allow |= ACL_READ_DATA;
+ if (mode & S_IWGRP)
+ group_allow |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ if (mode & S_IXGRP)
+ group_allow |= ACL_EXECUTE;
+
+ if (mode & S_IROTH)
+ everyone_allow |= ACL_READ_DATA;
+ if (mode & S_IWOTH)
+ everyone_allow |= (ACL_WRITE_DATA | ACL_APPEND_DATA);
+ if (mode & S_IXOTH)
+ everyone_allow |= ACL_EXECUTE;
+
+ user_deny = ((group_allow | everyone_allow) & ~user_allow);
+ group_deny = everyone_allow & ~group_allow;
+ user_allow_first = group_deny & ~user_deny;
+
+ if (user_allow_first != 0)
+ _acl_append(aclp, ACL_USER_OBJ, user_allow_first,
+ ACL_ENTRY_TYPE_ALLOW);
+ if (user_deny != 0)
+ _acl_append(aclp, ACL_USER_OBJ, user_deny,
+ ACL_ENTRY_TYPE_DENY);
+ if (group_deny != 0)
+ _acl_append(aclp, ACL_GROUP_OBJ, group_deny,
+ ACL_ENTRY_TYPE_DENY);
+
+ if (parent_aclp != NULL)
+ acl_nfs4_inherit_entries(parent_aclp, aclp, mode,
+ file_owner_id, is_directory);
+
+ _acl_append(aclp, ACL_USER_OBJ, user_allow, ACL_ENTRY_TYPE_ALLOW);
+ _acl_append(aclp, ACL_GROUP_OBJ, group_allow, ACL_ENTRY_TYPE_ALLOW);
+ _acl_append(aclp, ACL_EVERYONE, everyone_allow, ACL_ENTRY_TYPE_ALLOW);
+}
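+
+/*
+ * Worked example (no parent ACL): for mode 0606 everyone@ is allowed
+ * bits that group@ is not, so the routine emits, in order,
+ *
+ *	owner@:allow:read_data/write_data/append_data	(user_allow_first)
+ *	group@:deny:read_data/write_data/append_data	(group_deny)
+ *	owner@:allow:<the full user_allow set>
+ *	group@:allow:read_acl/read_attributes/read_named_attrs/synchronize
+ *	everyone@:allow:read_data/write_data/append_data/read_acl/...
+ *
+ * The leading owner@ ALLOW is there so that an owner who also happens
+ * to be a member of the file's group is not caught by the group@ DENY
+ * before reaching the owner@ ALLOW near the end. For a mode like 0644
+ * all three deny masks are zero and only the final three ALLOW entries
+ * are produced.
+ */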
+
+#ifdef _KERNEL
+void
+acl_nfs4_compute_inherited_acl(const struct acl *parent_aclp,
+ struct acl *child_aclp, mode_t mode, int file_owner_id,
+ int is_directory)
+{
+
+ if (acl_nfs4_old_semantics)
+ acl_nfs4_compute_inherited_acl_draft(parent_aclp, child_aclp,
+ mode, file_owner_id, is_directory);
+ else
+ acl_nfs4_compute_inherited_acl_psarc(parent_aclp, child_aclp,
+ mode, file_owner_id, is_directory);
+}
+#endif /* _KERNEL */
+
+/*
+ * Calculate trivial ACL in a manner compatible with PSARC/2010/029.
+ * Note that this results in an ACL different from (but semantically
+ * equal to) the "canonical six" trivial ACL computed using the algorithm
+ * described in draft-ietf-nfsv4-minorversion1-03.txt, 3.16.6.2.
+ */
+static void
+acl_nfs4_trivial_from_mode(struct acl *aclp, mode_t mode)
+{
+
+ aclp->acl_cnt = 0;
+ acl_nfs4_compute_inherited_acl_psarc(NULL, aclp, mode, -1, -1);
+}
+
+#ifndef _KERNEL
+/*
+ * This routine is used by libc to implement acl_strip_np(3)
+ * and acl_is_trivial_np(3).
+ */
+void
+acl_nfs4_trivial_from_mode_libc(struct acl *aclp, int mode, int canonical_six)
+{
+
+ aclp->acl_cnt = 0;
+ if (canonical_six)
+ acl_nfs4_sync_acl_from_mode_draft(aclp, mode, -1);
+ else
+ acl_nfs4_trivial_from_mode(aclp, mode);
+}
+#endif /* !_KERNEL */
+
+#ifdef _KERNEL
+static int
+_acls_are_equal(const struct acl *a, const struct acl *b)
+{
+ int i;
+ const struct acl_entry *entrya, *entryb;
+
+ if (a->acl_cnt != b->acl_cnt)
+ return (0);
+
+ for (i = 0; i < b->acl_cnt; i++) {
+ entrya = &(a->acl_entry[i]);
+ entryb = &(b->acl_entry[i]);
+
+ if (entrya->ae_tag != entryb->ae_tag ||
+ entrya->ae_id != entryb->ae_id ||
+ entrya->ae_perm != entryb->ae_perm ||
+ entrya->ae_entry_type != entryb->ae_entry_type ||
+ entrya->ae_flags != entryb->ae_flags)
+ return (0);
+ }
+
+ return (1);
+}
+
+/*
+ * This routine is used to determine whether to remove the extended
+ * attribute that stores the ACL contents.
+ */
+int
+acl_nfs4_is_trivial(const struct acl *aclp, int file_owner_id)
+{
+ int trivial;
+ mode_t tmpmode = 0;
+ struct acl *tmpaclp;
+
+ if (aclp->acl_cnt > 6)
+ return (0);
+
+ /*
+ * Compute the mode from the ACL, then compute new ACL from that mode.
+ * If the ACLs are identical, then the ACL is trivial.
+ *
+ * XXX: I guess there is a faster way to do this. However, even
+ * this slow implementation significantly speeds things up
+ * for files that don't have non-trivial ACLs - it's critical
+ * for performance not to use EAs when they are not needed.
+ *
+ * First try the PSARC/2010/029 semantics.
+ */
+ tmpaclp = acl_alloc(M_WAITOK | M_ZERO);
+ acl_nfs4_sync_mode_from_acl(&tmpmode, aclp);
+ acl_nfs4_trivial_from_mode(tmpaclp, tmpmode);
+ trivial = _acls_are_equal(aclp, tmpaclp);
+ if (trivial) {
+ acl_free(tmpaclp);
+ return (trivial);
+ }
+
+ /*
+ * Check if it's a draft-ietf-nfsv4-minorversion1-03.txt trivial ACL.
+ */
+ tmpaclp->acl_cnt = 0;
+ acl_nfs4_sync_acl_from_mode_draft(tmpaclp, tmpmode, file_owner_id);
+ trivial = _acls_are_equal(aclp, tmpaclp);
+ acl_free(tmpaclp);
+
+ return (trivial);
+}
+#endif /* _KERNEL */
+
+int
+acl_nfs4_check(const struct acl *aclp, int is_directory)
+{
+ int i;
+ const struct acl_entry *entry;
+
+ /*
+ * The spec doesn't seem to say anything about ACL validity.
+ * It seems there is not much to do here. There is not even a need
+ * to count "owner@" or "everyone@" (ACL_USER_OBJ and ACL_EVERYONE)
+ * entries, as there can be several of them and that's perfectly
+ * valid. There can be none of them too. Really.
+ */
+
+ if (aclp->acl_cnt > ACL_MAX_ENTRIES || aclp->acl_cnt <= 0)
+ return (EINVAL);
+
+ for (i = 0; i < aclp->acl_cnt; i++) {
+ entry = &(aclp->acl_entry[i]);
+
+ switch (entry->ae_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_EVERYONE:
+ if (entry->ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ break;
+
+ case ACL_USER:
+ case ACL_GROUP:
+ if (entry->ae_id == ACL_UNDEFINED_ID)
+ return (EINVAL);
+ break;
+
+ default:
+ return (EINVAL);
+ }
+
+ if ((entry->ae_perm | ACL_NFS4_PERM_BITS) != ACL_NFS4_PERM_BITS)
+ return (EINVAL);
+
+ /*
+ * Disallow ACL_ENTRY_TYPE_AUDIT and ACL_ENTRY_TYPE_ALARM for now.
+ */
+ if (entry->ae_entry_type != ACL_ENTRY_TYPE_ALLOW &&
+ entry->ae_entry_type != ACL_ENTRY_TYPE_DENY)
+ return (EINVAL);
+
+ if ((entry->ae_flags | ACL_FLAGS_BITS) != ACL_FLAGS_BITS)
+ return (EINVAL);
+
+ /* Disallow unimplemented flags. */
+ if (entry->ae_flags & (ACL_ENTRY_SUCCESSFUL_ACCESS |
+ ACL_ENTRY_FAILED_ACCESS))
+ return (EINVAL);
+
+ /* Disallow flags not allowed for ordinary files. */
+ if (!is_directory) {
+ if (entry->ae_flags & (ACL_ENTRY_FILE_INHERIT |
+ ACL_ENTRY_DIRECTORY_INHERIT |
+ ACL_ENTRY_NO_PROPAGATE_INHERIT | ACL_ENTRY_INHERIT_ONLY))
+ return (EINVAL);
+ }
+ }
+
+ return (0);
+}
+
+#ifdef _KERNEL
+static int
+acl_nfs4_modload(module_t module, int what, void *arg)
+{
+ int ret;
+
+ ret = 0;
+
+ switch (what) {
+ case MOD_LOAD:
+ case MOD_SHUTDOWN:
+ break;
+
+ case MOD_QUIESCE:
+ /* XXX TODO */
+ ret = 0;
+ break;
+
+ case MOD_UNLOAD:
+ /* XXX TODO */
+ ret = 0;
+ break;
+ default:
+ ret = EINVAL;
+ break;
+ }
+
+ return (ret);
+}
+
+static moduledata_t acl_nfs4_mod = {
+ "acl_nfs4",
+ acl_nfs4_modload,
+ NULL
+};
+
+/*
+ * XXX TODO: which subsystem, order?
+ */
+DECLARE_MODULE(acl_nfs4, acl_nfs4_mod, SI_SUB_VFS, SI_ORDER_FIRST);
+MODULE_VERSION(acl_nfs4, 1);
+#endif /* _KERNEL */
diff --git a/sys/kern/subr_acl_posix1e.c b/sys/kern/subr_acl_posix1e.c
new file mode 100644
index 0000000..3200932
--- /dev/null
+++ b/sys/kern/subr_acl_posix1e.c
@@ -0,0 +1,691 @@
+/*-
+ * Copyright (c) 1999-2006 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Developed by the TrustedBSD Project.
+ *
+ * ACL support routines specific to POSIX.1e access control lists. These are
+ * utility routines for code common across file systems implementing POSIX.1e
+ * ACLs.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/systm.h>
+#include <sys/mount.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+
+/*
+ * Implement a version of vaccess() that understands POSIX.1e ACL semantics;
+ * the access ACL has already been prepared for evaluation by the file system
+ * and is passed via 'uid', 'gid', and 'acl'. Return 0 on success, else an
+ * errno value.
+ */
+int
+vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid,
+ struct acl *acl, accmode_t accmode, struct ucred *cred, int *privused)
+{
+ struct acl_entry *acl_other, *acl_mask;
+ accmode_t dac_granted;
+ accmode_t priv_granted;
+ accmode_t acl_mask_granted;
+ int group_matched, i;
+
+ KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
+ ("invalid bit in accmode"));
+ KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
+ ("VAPPEND without VWRITE"));
+
+ /*
+ * Look for a normal, non-privileged way to access the file/directory
+ * as requested. If it exists, go with that. Otherwise, attempt to
+ * use privileges granted via priv_granted. In some cases, which
+ * privileges to use may be ambiguous due to "best match", in which
+ * case fall back on first match for the time being.
+ */
+ if (privused != NULL)
+ *privused = 0;
+
+ /*
+ * Determine privileges now, but don't apply until we've found a DAC
+ * entry that matches but has failed to allow access.
+ *
+ * XXXRW: Ideally, we'd determine the privileges required before
+ * asking for them.
+ */
+ priv_granted = 0;
+
+ if (type == VDIR) {
+ if ((accmode & VEXEC) && !priv_check_cred(cred,
+ PRIV_VFS_LOOKUP, 0))
+ priv_granted |= VEXEC;
+ } else {
+ /*
+ * Ensure that at least one execute bit is on. Otherwise,
+ * a privileged user will always succeed, and we don't want
+ * this to happen unless the file really is executable.
+ */
+ if ((accmode & VEXEC) && (acl_posix1e_acl_to_mode(acl) &
+ (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
+ !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
+ priv_granted |= VEXEC;
+ }
+
+ if ((accmode & VREAD) && !priv_check_cred(cred, PRIV_VFS_READ, 0))
+ priv_granted |= VREAD;
+
+ if (((accmode & VWRITE) || (accmode & VAPPEND)) &&
+ !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
+ priv_granted |= (VWRITE | VAPPEND);
+
+ if ((accmode & VADMIN) && !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
+ priv_granted |= VADMIN;
+
+ /*
+ * The owner matches if the effective uid associated with the
+ * credential matches that of the ACL_USER_OBJ entry. While we're
+ * doing the first scan, also cache the location of the ACL_MASK and
+ * ACL_OTHER entries, preventing some future iterations.
+ */
+ acl_mask = acl_other = NULL;
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_USER_OBJ:
+ if (file_uid != cred->cr_uid)
+ break;
+ dac_granted = 0;
+ dac_granted |= VADMIN;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+
+ /*
+ * XXXRW: Do privilege lookup here.
+ */
+ if ((accmode & (dac_granted | priv_granted)) ==
+ accmode) {
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+ goto error;
+
+ case ACL_MASK:
+ acl_mask = &acl->acl_entry[i];
+ break;
+
+ case ACL_OTHER:
+ acl_other = &acl->acl_entry[i];
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ /*
+ * An ACL_OTHER entry should always exist in a valid access ACL. If
+ * it doesn't, then generate a serious failure. For now, this means
+ * a debugging message and EPERM, but in the future it should probably
+ * be a panic.
+ */
+ if (acl_other == NULL) {
+ /*
+ * XXX This should never happen
+ */
+ printf("vaccess_acl_posix1e: ACL_OTHER missing\n");
+ return (EPERM);
+ }
+
+ /*
+ * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields are
+ * masked by an ACL_MASK entry, if any. As such, first identify the
+ * ACL_MASK field, then iterate through identifying potential user
+ * matches, then group matches. If there is no ACL_MASK, assume that
+ * the mask allows all requests to succeed.
+ */
+ if (acl_mask != NULL) {
+ acl_mask_granted = 0;
+ if (acl_mask->ae_perm & ACL_EXECUTE)
+ acl_mask_granted |= VEXEC;
+ if (acl_mask->ae_perm & ACL_READ)
+ acl_mask_granted |= VREAD;
+ if (acl_mask->ae_perm & ACL_WRITE)
+ acl_mask_granted |= (VWRITE | VAPPEND);
+ } else
+ acl_mask_granted = VEXEC | VREAD | VWRITE | VAPPEND;
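+
+ /*
+ * Illustrative example: with a hypothetical access ACL containing
+ * "user:alice:rw-" and "mask::r--", a request by alice for VWRITE
+ * fails even though her ACL_USER entry grants write, because
+ * dac_granted is ANDed with acl_mask_granted (VREAD only) in the
+ * ACL_USER check below.
+ */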
+
+ /*
+ * Check ACL_USER ACL entries. There will either be one or no
+ * matches; if there is one, we accept or reject based on the
+ * match; otherwise, we continue on to groups.
+ */
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_USER:
+ if (acl->acl_entry[i].ae_id != cred->cr_uid)
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ dac_granted &= acl_mask_granted;
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+ /*
+ * XXXRW: Do privilege lookup here.
+ */
+ if ((accmode & (dac_granted | priv_granted)) !=
+ accmode)
+ goto error;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+ }
+
+ /*
+ * Group match is best-match, not first-match, so find a "best"
+ * match. Iterate across, testing each potential group match. Make
+ * sure we keep track of whether we found a match or not, so that we
+ * know if we should try again with any available privilege, or if we
+ * should move on to ACL_OTHER.
+ */
+ group_matched = 0;
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_GROUP_OBJ:
+ if (!groupmember(file_gid, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ dac_granted &= acl_mask_granted;
+
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+
+ group_matched = 1;
+ break;
+
+ case ACL_GROUP:
+ if (!groupmember(acl->acl_entry[i].ae_id, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ dac_granted &= acl_mask_granted;
+
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+
+ group_matched = 1;
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ if (group_matched == 1) {
+ /*
+ * There was a match, but it did not grant rights via pure
+ * DAC. Try again, this time with privilege.
+ */
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_GROUP_OBJ:
+ if (!groupmember(file_gid, cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ dac_granted &= acl_mask_granted;
+
+ /*
+ * XXXRW: Do privilege lookup here.
+ */
+ if ((accmode & (dac_granted | priv_granted))
+ != accmode)
+ break;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+
+ case ACL_GROUP:
+ if (!groupmember(acl->acl_entry[i].ae_id,
+ cred))
+ break;
+ dac_granted = 0;
+ if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl->acl_entry[i].ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+ dac_granted &= acl_mask_granted;
+
+ /*
+ * XXXRW: Do privilege lookup here.
+ */
+ if ((accmode & (dac_granted | priv_granted))
+ != accmode)
+ break;
+
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+
+ default:
+ break;
+ }
+ }
+ /*
+ * Even with privilege, group membership was not sufficient.
+ * Return failure.
+ */
+ goto error;
+ }
+
+ /*
+ * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER.
+ */
+ dac_granted = 0;
+ if (acl_other->ae_perm & ACL_EXECUTE)
+ dac_granted |= VEXEC;
+ if (acl_other->ae_perm & ACL_READ)
+ dac_granted |= VREAD;
+ if (acl_other->ae_perm & ACL_WRITE)
+ dac_granted |= (VWRITE | VAPPEND);
+
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+ /*
+ * XXXRW: Do privilege lookup here.
+ */
+ if ((accmode & (dac_granted | priv_granted)) == accmode) {
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+
+error:
+ return ((accmode & VADMIN) ? EPERM : EACCES);
+}
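+
+/*
+ * Worked example (illustrative): a credential requesting VREAD|VWRITE
+ * that matches an ACL_USER entry granting both read and write is still
+ * subject to the ACL_MASK computed above; if the mask lacks ACL_WRITE,
+ * dac_granted is reduced to VREAD and, absent the PRIV_VFS_WRITE
+ * privilege, the request fails with EACCES even though the matching
+ * entry itself grants write.
+ */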
+
+/*
+ * For the purposes of filesystems maintaining the _OBJ entries in an inode
+ * with a mode_t field, this routine converts a mode_t entry to an
+ * acl_perm_t.
+ */
+acl_perm_t
+acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode)
+{
+ acl_perm_t perm = 0;
+
+ switch(tag) {
+ case ACL_USER_OBJ:
+ if (mode & S_IXUSR)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IRUSR)
+ perm |= ACL_READ;
+ if (mode & S_IWUSR)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ case ACL_GROUP_OBJ:
+ if (mode & S_IXGRP)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IRGRP)
+ perm |= ACL_READ;
+ if (mode & S_IWGRP)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ case ACL_OTHER:
+ if (mode & S_IXOTH)
+ perm |= ACL_EXECUTE;
+ if (mode & S_IROTH)
+ perm |= ACL_READ;
+ if (mode & S_IWOTH)
+ perm |= ACL_WRITE;
+ return (perm);
+
+ default:
+ printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag);
+ return (0);
+ }
+}
+
+/*
+ * Given inode information (uid, gid, mode), return an acl entry of the
+ * appropriate type.
+ */
+struct acl_entry
+acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode)
+{
+ struct acl_entry acl_entry;
+
+ acl_entry.ae_tag = tag;
+ acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode);
+ acl_entry.ae_entry_type = 0;
+ acl_entry.ae_flags = 0;
+ switch(tag) {
+ case ACL_USER_OBJ:
+ acl_entry.ae_id = uid;
+ break;
+
+ case ACL_GROUP_OBJ:
+ acl_entry.ae_id = gid;
+ break;
+
+ case ACL_OTHER:
+ acl_entry.ae_id = ACL_UNDEFINED_ID;
+ break;
+
+ default:
+ acl_entry.ae_id = ACL_UNDEFINED_ID;
+ printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag);
+ }
+
+ return (acl_entry);
+}
+
+/*
+ * Utility function to generate a file mode given appropriate ACL entries.
+ */
+mode_t
+acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry,
+ struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry)
+{
+ mode_t mode;
+
+ mode = 0;
+ if (acl_user_obj_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXUSR;
+ if (acl_user_obj_entry->ae_perm & ACL_READ)
+ mode |= S_IRUSR;
+ if (acl_user_obj_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWUSR;
+ if (acl_group_obj_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXGRP;
+ if (acl_group_obj_entry->ae_perm & ACL_READ)
+ mode |= S_IRGRP;
+ if (acl_group_obj_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWGRP;
+ if (acl_other_entry->ae_perm & ACL_EXECUTE)
+ mode |= S_IXOTH;
+ if (acl_other_entry->ae_perm & ACL_READ)
+ mode |= S_IROTH;
+ if (acl_other_entry->ae_perm & ACL_WRITE)
+ mode |= S_IWOTH;
+
+ return (mode);
+}
+
+/*
+ * Utility function to generate a file mode given a complete POSIX.1e access
+ * ACL. Note that if the ACL is improperly formed, this may result in a
+ * panic.
+ */
+mode_t
+acl_posix1e_acl_to_mode(struct acl *acl)
+{
+ struct acl_entry *acl_mask, *acl_user_obj, *acl_group_obj, *acl_other;
+ int i;
+
+ /*
+ * Find the ACL entries relevant to a POSIX permission mode.
+ */
+ acl_user_obj = acl_group_obj = acl_other = acl_mask = NULL;
+ for (i = 0; i < acl->acl_cnt; i++) {
+ switch (acl->acl_entry[i].ae_tag) {
+ case ACL_USER_OBJ:
+ acl_user_obj = &acl->acl_entry[i];
+ break;
+
+ case ACL_GROUP_OBJ:
+ acl_group_obj = &acl->acl_entry[i];
+ break;
+
+ case ACL_OTHER:
+ acl_other = &acl->acl_entry[i];
+ break;
+
+ case ACL_MASK:
+ acl_mask = &acl->acl_entry[i];
+ break;
+
+ case ACL_USER:
+ case ACL_GROUP:
+ break;
+
+ default:
+ panic("acl_posix1e_acl_to_mode: bad ae_tag");
+ }
+ }
+
+ if (acl_user_obj == NULL || acl_group_obj == NULL || acl_other == NULL)
+ panic("acl_posix1e_acl_to_mode: missing base ae_tags");
+
+ /*
+ * POSIX.1e specifies that if there is an ACL_MASK entry, we replace
+ * the mode "group" bits with its permissions. If there isn't, we
+ * use the ACL_GROUP_OBJ permissions.
+ */
+ if (acl_mask != NULL)
+ return (acl_posix1e_perms_to_mode(acl_user_obj, acl_mask,
+ acl_other));
+ else
+ return (acl_posix1e_perms_to_mode(acl_user_obj, acl_group_obj,
+ acl_other));
+}
+
+/*
+ * Perform a syntactic check of the ACL, sufficient to allow an implementing
+ * filesystem to determine if it should accept this and rely on the POSIX.1e
+ * ACL properties.
+ */
+int
+acl_posix1e_check(struct acl *acl)
+{
+ int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group;
+ int num_acl_mask, num_acl_other, i;
+
+ /*
+ * Verify that the number of entries does not exceed the maximum
+ * defined for acl_t.
+ *
+ * Verify that the correct number of various sorts of ae_tags are
+ * present:
+ * Exactly one ACL_USER_OBJ
+ * Exactly one ACL_GROUP_OBJ
+ * Exactly one ACL_OTHER
+ * If any ACL_USER or ACL_GROUP entries appear, then exactly one
+ * ACL_MASK entry must also appear.
+ *
+ * Verify that all ae_perm entries are in ACL_PERM_BITS.
+ *
+ * Verify all ae_tag entries are understood by this implementation.
+ *
+ * Note: Does not check for uniqueness of qualifier (ae_id) field.
+ */
+ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group =
+ num_acl_mask = num_acl_other = 0;
+ if (acl->acl_cnt > ACL_MAX_ENTRIES)
+ return (EINVAL);
+ for (i = 0; i < acl->acl_cnt; i++) {
+ /*
+ * Check for a valid tag.
+ */
+ switch(acl->acl_entry[i].ae_tag) {
+ case ACL_USER_OBJ:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_user_obj++;
+ break;
+ case ACL_GROUP_OBJ:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_group_obj++;
+ break;
+ case ACL_USER:
+ if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_user++;
+ break;
+ case ACL_GROUP:
+ if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_group++;
+ break;
+ case ACL_OTHER:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_other++;
+ break;
+ case ACL_MASK:
+ acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+ return (EINVAL);
+ num_acl_mask++;
+ break;
+ default:
+ return (EINVAL);
+ }
+ /*
+ * Check for valid perm entries.
+ */
+ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) !=
+ ACL_PERM_BITS)
+ return (EINVAL);
+ }
+ if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) ||
+ (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1))
+ return (EINVAL);
+ if (((num_acl_group != 0) || (num_acl_user != 0)) &&
+ (num_acl_mask != 1))
+ return (EINVAL);
+ return (0);
+}
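+
+/*
+ * Example (illustrative): the smallest access ACL this check accepts
+ * mirrors a plain permission mode and has exactly three entries, e.g.
+ * for 0640:
+ *
+ *	ACL_USER_OBJ	ACL_UNDEFINED_ID	ACL_READ | ACL_WRITE
+ *	ACL_GROUP_OBJ	ACL_UNDEFINED_ID	ACL_READ
+ *	ACL_OTHER	ACL_UNDEFINED_ID	(no permissions)
+ *
+ * Adding any ACL_USER or ACL_GROUP entry requires that exactly one
+ * ACL_MASK entry be present as well.
+ */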
+
+/*
+ * Given a requested mode for a new object, and a default ACL, combine the
+ * two to produce a new mode. Be careful not to clear any bits that aren't
+ * intended to be affected by the POSIX.1e ACL. Eventually, this might also
+ * take the cmask as an argument, if we push that down into
+ * per-filesystem code.
+ */
+mode_t
+acl_posix1e_newfilemode(mode_t cmode, struct acl *dacl)
+{
+ mode_t mode;
+
+ mode = cmode;
+ /*
+ * The current composition policy is that a permission bit must be
+ * set in *both* the ACL and the requested creation mode for it to
+ * appear in the resulting mode/ACL. First clear any possibly
+	 * affected bits, then reconstruct.
+ */
+ mode &= ACL_PRESERVE_MASK;
+ mode |= (ACL_OVERRIDE_MASK & cmode & acl_posix1e_acl_to_mode(dacl));
+
+ return (mode);
+}
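+
+/*
+ * Worked example (illustrative, assuming ACL_OVERRIDE_MASK covers the
+ * 0777 permission bits): a requested creation mode of 0666 combined
+ * with a default ACL whose _OBJ/ACL_OTHER entries correspond to 0750
+ * yields 0666 & 0750 == 0640 for the permission bits; bits outside
+ * ACL_OVERRIDE_MASK are carried over from the requested mode unchanged.
+ */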
+
+
+static int
+acl_posix1e_modload(module_t mod, int what, void *arg)
+{
+ int ret;
+
+ ret = 0;
+
+ switch (what) {
+ case MOD_LOAD:
+ case MOD_SHUTDOWN:
+ break;
+
+ case MOD_QUIESCE:
+ /* XXX TODO */
+ ret = 0;
+ break;
+
+ case MOD_UNLOAD:
+ /* XXX TODO */
+ ret = 0;
+ break;
+ default:
+ ret = EINVAL;
+ break;
+ }
+
+ return (ret);
+}
+
+static moduledata_t acl_posix1e_mod = {
+ "acl_posix1e",
+ acl_posix1e_modload,
+ NULL
+};
+
+DECLARE_MODULE(acl_posix1e, acl_posix1e_mod, SI_SUB_VFS, SI_ORDER_FIRST);
+MODULE_VERSION(acl_posix1e, 1);
diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c
new file mode 100644
index 0000000..6384056
--- /dev/null
+++ b/sys/kern/subr_autoconf.c
@@ -0,0 +1,230 @@
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_autoconf.c 8.1 (Berkeley) 6/10/93
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/linker.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+
+/*
+ * Autoconfiguration subroutines.
+ */
+
+/*
+ * "Interrupt driven config" functions.
+ */
+static TAILQ_HEAD(, intr_config_hook) intr_config_hook_list =
+ TAILQ_HEAD_INITIALIZER(intr_config_hook_list);
+static struct intr_config_hook *next_to_notify;
+static struct mtx intr_config_hook_lock;
+MTX_SYSINIT(intr_config_hook, &intr_config_hook_lock, "intr config", MTX_DEF);
+
+/* ARGSUSED */
+static void run_interrupt_driven_config_hooks(void);
+
+/*
+ * If we wait too long for an interrupt-driven config hook to return, print
+ * a diagnostic.
+ */
+#define WARNING_INTERVAL_SECS 60
+static void
+run_interrupt_driven_config_hooks_warning(int warned)
+{
+ struct intr_config_hook *hook_entry;
+ char namebuf[64];
+ long offset;
+
+ if (warned < 6) {
+ printf("run_interrupt_driven_hooks: still waiting after %d "
+ "seconds for", warned * WARNING_INTERVAL_SECS);
+ TAILQ_FOREACH(hook_entry, &intr_config_hook_list, ich_links) {
+ if (linker_search_symbol_name(
+ (caddr_t)hook_entry->ich_func, namebuf,
+ sizeof(namebuf), &offset) == 0)
+ printf(" %s", namebuf);
+ else
+ printf(" %p", hook_entry->ich_func);
+ }
+ printf("\n");
+ }
+ KASSERT(warned < 6,
+ ("run_interrupt_driven_config_hooks: waited too long"));
+}
+
+static void
+run_interrupt_driven_config_hooks()
+{
+ static int running;
+ struct intr_config_hook *hook_entry;
+
+ mtx_lock(&intr_config_hook_lock);
+
+ /*
+ * If hook processing is already active, any newly
+ * registered hooks will eventually be notified.
+ * Let the currently running session issue these
+ * notifications.
+ */
+ if (running != 0) {
+ mtx_unlock(&intr_config_hook_lock);
+ return;
+ }
+ running = 1;
+
+ while (next_to_notify != NULL) {
+ hook_entry = next_to_notify;
+ next_to_notify = TAILQ_NEXT(hook_entry, ich_links);
+ mtx_unlock(&intr_config_hook_lock);
+ (*hook_entry->ich_func)(hook_entry->ich_arg);
+ mtx_lock(&intr_config_hook_lock);
+ }
+
+ running = 0;
+ mtx_unlock(&intr_config_hook_lock);
+}
+
+static void
+boot_run_interrupt_driven_config_hooks(void *dummy)
+{
+ int warned;
+
+ run_interrupt_driven_config_hooks();
+
+ /* Block boot processing until all hooks are disestablished. */
+ mtx_lock(&intr_config_hook_lock);
+ warned = 0;
+ while (!TAILQ_EMPTY(&intr_config_hook_list)) {
+ if (msleep(&intr_config_hook_list, &intr_config_hook_lock,
+ 0, "conifhk", WARNING_INTERVAL_SECS * hz) ==
+ EWOULDBLOCK) {
+ mtx_unlock(&intr_config_hook_lock);
+ warned++;
+ run_interrupt_driven_config_hooks_warning(warned);
+ mtx_lock(&intr_config_hook_lock);
+ }
+ }
+ mtx_unlock(&intr_config_hook_lock);
+}
+
+SYSINIT(intr_config_hooks, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_FIRST,
+ boot_run_interrupt_driven_config_hooks, NULL);
+
+/*
+ * Register a hook that will be called after "cold"
+ * autoconfiguration is complete and interrupts can
+ * be used to complete initialization.
+ */
+int
+config_intrhook_establish(struct intr_config_hook *hook)
+{
+ struct intr_config_hook *hook_entry;
+
+ mtx_lock(&intr_config_hook_lock);
+ TAILQ_FOREACH(hook_entry, &intr_config_hook_list, ich_links)
+ if (hook_entry == hook)
+ break;
+ if (hook_entry != NULL) {
+ mtx_unlock(&intr_config_hook_lock);
+ printf("config_intrhook_establish: establishing an "
+ "already established hook.\n");
+ return (1);
+ }
+ TAILQ_INSERT_TAIL(&intr_config_hook_list, hook, ich_links);
+ if (next_to_notify == NULL)
+ next_to_notify = hook;
+ mtx_unlock(&intr_config_hook_lock);
+ if (cold == 0)
+ /*
+ * XXX Call from a task since not all drivers expect
+ * to be re-entered at the time a hook is established.
+ */
+ /* XXX Sufficient for modules loaded after initial config??? */
+ run_interrupt_driven_config_hooks();
+ return (0);
+}
+
+void
+config_intrhook_disestablish(struct intr_config_hook *hook)
+{
+ struct intr_config_hook *hook_entry;
+
+ mtx_lock(&intr_config_hook_lock);
+ TAILQ_FOREACH(hook_entry, &intr_config_hook_list, ich_links)
+ if (hook_entry == hook)
+ break;
+ if (hook_entry == NULL)
+ panic("config_intrhook_disestablish: disestablishing an "
+ "unestablished hook");
+
+ if (next_to_notify == hook)
+ next_to_notify = TAILQ_NEXT(hook, ich_links);
+ TAILQ_REMOVE(&intr_config_hook_list, hook, ich_links);
+
+ /* Wakeup anyone watching the list */
+ wakeup(&intr_config_hook_list);
+ mtx_unlock(&intr_config_hook_lock);
+}
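+
+/*
+ * Usage sketch (illustrative; the foo_* driver names are hypothetical):
+ * a driver whose initialization needs working interrupts defers that
+ * work to a config hook and disestablishes the hook when finished,
+ * which unblocks the boot-time wait above.
+ *
+ *	static struct intr_config_hook foo_hook;
+ *
+ *	static void
+ *	foo_delayed_attach(void *arg)
+ *	{
+ *		struct foo_softc *sc = arg;
+ *
+ *		foo_finish_init(sc);
+ *		config_intrhook_disestablish(&foo_hook);
+ *	}
+ *
+ *	In foo_attach():
+ *		foo_hook.ich_func = foo_delayed_attach;
+ *		foo_hook.ich_arg = sc;
+ *		if (config_intrhook_establish(&foo_hook) != 0)
+ *			return (ENXIO);
+ */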
+
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(conifhk, db_show_conifhk)
+{
+ struct intr_config_hook *hook_entry;
+ char namebuf[64];
+ long offset;
+
+ TAILQ_FOREACH(hook_entry, &intr_config_hook_list, ich_links) {
+ if (linker_ddb_search_symbol_name(
+ (caddr_t)hook_entry->ich_func, namebuf, sizeof(namebuf),
+ &offset) == 0) {
+ db_printf("hook: %p at %s+%#lx arg: %p\n",
+ hook_entry->ich_func, namebuf, offset,
+ hook_entry->ich_arg);
+ } else {
+ db_printf("hook: %p at ??+?? arg %p\n",
+ hook_entry->ich_func, hook_entry->ich_arg);
+ }
+ }
+}
+#endif /* DDB */
diff --git a/sys/kern/subr_blist.c b/sys/kern/subr_blist.c
new file mode 100644
index 0000000..5c45b81
--- /dev/null
+++ b/sys/kern/subr_blist.c
@@ -0,0 +1,1095 @@
+/*-
+ * Copyright (c) 1998 Matthew Dillon. All Rights Reserved.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+ * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/*
+ * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
+ *
+ * This module implements a general bitmap allocator/deallocator. The
+ * allocator eats around 2 bits per 'block'. The module does not
+ * try to interpret the meaning of a 'block' other than to return
+ * SWAPBLK_NONE on an allocation failure.
+ *
+ * A radix tree is used to maintain the bitmap. Two radix constants are
+ * involved: One for the bitmaps contained in the leaf nodes (typically
+ * 32), and one for the meta nodes (typically 16). Both meta and leaf
+ * nodes have a hint field. This field gives us a hint as to the largest
+ * free contiguous range of blocks under the node. It may contain a
+ * value that is too high, but will never contain a value that is too
+ * low. When the radix tree is searched, allocation failures in subtrees
+ * update the hint.
+ *
+ * The radix tree also implements two collapsed states for meta nodes:
+ * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is
+ * in either of these two states, all information contained underneath
+ * the node is considered stale. These states are used to optimize
+ * allocation and freeing operations.
+ *
+ * The hinting greatly increases code efficiency for allocations while
+ * the general radix structure optimizes both allocations and frees. The
+ * radix tree should be able to operate well no matter how much
+ * fragmentation there is and no matter how large a bitmap is used.
+ *
+ * The blist code wires all necessary memory at creation time. Neither
+ * allocations nor frees require interaction with the memory subsystem.
+ * The non-blocking features of the blist code are used in the swap code
+ * (vm/swap_pager.c).
+ *
+ * LAYOUT: The radix tree is laid out recursively using a
+ * linear array.  Each meta node is immediately followed (laid out
+ * sequentially in memory) by BLIST_META_RADIX lower level nodes. This
+ * is a recursive structure but one that can be easily scanned through
+ * a very simple 'skip' calculation. In order to support large radixes,
+ * portions of the tree may reside outside our memory allocation. We
+ * handle this with an early-termination optimization (when bighint is
+ * set to -1) on the scan. The memory allocation is only large enough
+ * to cover the number of blocks requested at creation time even if it
+ * must be encompassed in a larger root-node radix.
+ *
+ * NOTE: the allocator cannot currently allocate more than
+ * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too
+ * large' if you try. This is an area that could use improvement. The
+ * radix is large enough that this restriction does not affect the swap
+ * system, though.  Currently only the allocation code is affected by
+ * this algorithmic unfeature. The freeing code can handle arbitrary
+ * ranges.
+ *
+ * This code can be compiled stand-alone for debugging.
+ */
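+
+/*
+ * Worked example (illustrative, using the typical radix values noted
+ * above, leaf 32 and meta 16): for a blist covering 10000 blocks,
+ * blist_create()'s sizing loop steps radix through 32, 512, 8192,
+ * 131072 and skip through 0, 16, 272, 4368, stopping at the first
+ * radix >= 10000.  The root therefore spans a 131072-block radix even
+ * though only enough memory to describe 10000 blocks is wired; the
+ * unused portions are marked with bighint == -1 terminators by
+ * blst_radix_init().
+ */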
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef _KERNEL
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/blist.h>
+#include <sys/malloc.h>
+#include <sys/proc.h>
+#include <sys/mutex.h>
+
+#else
+
+#ifndef BLIST_NO_DEBUG
+#define BLIST_DEBUG
+#endif
+
+#define SWAPBLK_NONE ((daddr_t)-1)
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#define malloc(a,b,c) calloc(a, 1)
+#define free(a,b) free(a)
+
+typedef unsigned int u_daddr_t;
+
+#include <sys/blist.h>
+
+void panic(const char *ctl, ...);
+
+#endif
+
+/*
+ * static support functions
+ */
+
+static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
+static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk,
+ daddr_t count, daddr_t radix, int skip);
+static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
+static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
+ daddr_t radix, int skip, daddr_t blk);
+static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
+ daddr_t skip, blist_t dest, daddr_t count);
+static int blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
+static int blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
+ daddr_t radix, int skip, daddr_t blk);
+static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix,
+ int skip, daddr_t count);
+#ifndef _KERNEL
+static void blst_radix_print(blmeta_t *scan, daddr_t blk,
+ daddr_t radix, int skip, int tab);
+#endif
+
+#ifdef _KERNEL
+static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
+#endif
+
+/*
+ * blist_create() - create a blist capable of handling up to the specified
+ * number of blocks
+ *
+ * blocks - must be greater than 0
+ * flags - malloc flags
+ *
+ * The smallest blist consists of a single leaf node capable of
+ * managing BLIST_BMAP_RADIX blocks.
+ */
+
+blist_t
+blist_create(daddr_t blocks, int flags)
+{
+ blist_t bl;
+ int radix;
+ int skip = 0;
+
+ /*
+ * Calculate radix and skip field used for scanning.
+ */
+ radix = BLIST_BMAP_RADIX;
+
+ while (radix < blocks) {
+ radix *= BLIST_META_RADIX;
+ skip = (skip + 1) * BLIST_META_RADIX;
+ }
+
+ bl = malloc(sizeof(struct blist), M_SWAP, flags | M_ZERO);
+
+ bl->bl_blocks = blocks;
+ bl->bl_radix = radix;
+ bl->bl_skip = skip;
+ bl->bl_rootblks = 1 +
+ blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks);
+ bl->bl_root = malloc(sizeof(blmeta_t) * bl->bl_rootblks, M_SWAP, flags);
+
+#if defined(BLIST_DEBUG)
+ printf(
+ "BLIST representing %lld blocks (%lld MB of swap)"
+ ", requiring %lldK of ram\n",
+ (long long)bl->bl_blocks,
+ (long long)bl->bl_blocks * 4 / 1024,
+ (long long)(bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024
+ );
+ printf("BLIST raw radix tree contains %lld records\n",
+ (long long)bl->bl_rootblks);
+#endif
+ blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks);
+
+ return(bl);
+}
+
+void
+blist_destroy(blist_t bl)
+{
+ free(bl->bl_root, M_SWAP);
+ free(bl, M_SWAP);
+}
+
+/*
+ * blist_alloc() - reserve space in the block bitmap. Return the base
+ * of a contiguous region or SWAPBLK_NONE if space could
+ * not be allocated.
+ */
+
+daddr_t
+blist_alloc(blist_t bl, daddr_t count)
+{
+ daddr_t blk = SWAPBLK_NONE;
+
+ if (bl) {
+ if (bl->bl_radix == BLIST_BMAP_RADIX)
+ blk = blst_leaf_alloc(bl->bl_root, 0, count);
+ else
+ blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip);
+ if (blk != SWAPBLK_NONE)
+ bl->bl_free -= count;
+ }
+ return(blk);
+}
+
+/*
+ * blist_free() -	free up space in the block bitmap.  Panic if an
+ *			inconsistency is found.
+ */
+
+void
+blist_free(blist_t bl, daddr_t blkno, daddr_t count)
+{
+ if (bl) {
+ if (bl->bl_radix == BLIST_BMAP_RADIX)
+ blst_leaf_free(bl->bl_root, blkno, count);
+ else
+ blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
+ bl->bl_free += count;
+ }
+}
+
+/*
+ * blist_fill() - mark a region in the block bitmap as off-limits
+ * to the allocator (i.e. allocate it), ignoring any
+ * existing allocations. Return the number of blocks
+ * actually filled that were free before the call.
+ */
+
+int
+blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
+{
+ int filled;
+
+ if (bl) {
+ if (bl->bl_radix == BLIST_BMAP_RADIX)
+ filled = blst_leaf_fill(bl->bl_root, blkno, count);
+ else
+ filled = blst_meta_fill(bl->bl_root, blkno, count,
+ bl->bl_radix, bl->bl_skip, 0);
+ bl->bl_free -= filled;
+ return filled;
+ } else
+ return 0;
+}
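+
+/*
+ * Usage sketch (illustrative): a typical consumer creates a blist,
+ * frees the whole range (a fresh blist starts fully allocated), and
+ * then reserves and releases contiguous runs:
+ *
+ *	blist_t bl = blist_create(nblocks, M_WAITOK);
+ *	daddr_t blk;
+ *
+ *	blist_free(bl, 0, nblocks);
+ *	blk = blist_alloc(bl, 16);
+ *	if (blk != SWAPBLK_NONE)
+ *		blist_free(bl, blk, 16);
+ *	blist_destroy(bl);
+ */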
+
+/*
+ * blist_resize() - resize an existing radix tree to handle the
+ * specified number of blocks. This will reallocate
+ * the tree and transfer the previous bitmap to the new
+ * one. When extending the tree you can specify whether
+ *			the new blocks are to be left allocated or freed.
+ */
+
+void
+blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
+{
+ blist_t newbl = blist_create(count, flags);
+ blist_t save = *pbl;
+
+ *pbl = newbl;
+ if (count > save->bl_blocks)
+ count = save->bl_blocks;
+ blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count);
+
+ /*
+ * If resizing upwards, should we free the new space or not?
+ */
+ if (freenew && count < newbl->bl_blocks) {
+ blist_free(newbl, count, newbl->bl_blocks - count);
+ }
+ blist_destroy(save);
+}
+
+#ifdef BLIST_DEBUG
+
+/*
+ * blist_print() - dump radix tree
+ */
+
+void
+blist_print(blist_t bl)
+{
+ printf("BLIST {\n");
+ blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4);
+ printf("}\n");
+}
+
+#endif
+
+/************************************************************************
+ * ALLOCATION SUPPORT FUNCTIONS *
+ ************************************************************************
+ *
+ * These support functions do all the actual work. They may seem
+ * rather longish, but that's because I've commented them up. The
+ *	actual code is straightforward.
+ *
+ */
+
+/*
+ * blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap).
+ *
+ * This is the core of the allocator and is optimized for the 1 block
+ * and the BLIST_BMAP_RADIX block allocation cases. Other cases are
+ * somewhat slower. The 1 block allocation case is log2 and extremely
+ * quick.
+ */
+
+static daddr_t
+blst_leaf_alloc(
+ blmeta_t *scan,
+ daddr_t blk,
+ int count
+) {
+ u_daddr_t orig = scan->u.bmu_bitmap;
+
+ if (orig == 0) {
+ /*
+ * Optimize bitmap all-allocated case. Also, count = 1
+ * case assumes at least 1 bit is free in the bitmap, so
+ * we have to take care of this case here.
+ */
+ scan->bm_bighint = 0;
+ return(SWAPBLK_NONE);
+ }
+ if (count == 1) {
+ /*
+ * Optimized code to allocate one bit out of the bitmap
+ */
+ u_daddr_t mask;
+ int j = BLIST_BMAP_RADIX/2;
+ int r = 0;
+
+ mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2);
+
+ while (j) {
+ if ((orig & mask) == 0) {
+ r += j;
+ orig >>= j;
+ }
+ j >>= 1;
+ mask >>= j;
+ }
+ scan->u.bmu_bitmap &= ~(1 << r);
+ return(blk + r);
+ }
+ if (count <= BLIST_BMAP_RADIX) {
+ /*
+ * non-optimized code to allocate N bits out of the bitmap.
+ * The more bits, the faster the code runs. It will run
+ * the slowest allocating 2 bits, but since there aren't any
+ * memory ops in the core loop (or shouldn't be, anyway),
+ * you probably won't notice the difference.
+ */
+ int j;
+ int n = BLIST_BMAP_RADIX - count;
+ u_daddr_t mask;
+
+ mask = (u_daddr_t)-1 >> n;
+
+ for (j = 0; j <= n; ++j) {
+ if ((orig & mask) == mask) {
+ scan->u.bmu_bitmap &= ~mask;
+ return(blk + j);
+ }
+ mask = (mask << 1);
+ }
+ }
+ /*
+ * We couldn't allocate count in this subtree, update bighint.
+ */
+ scan->bm_bighint = count - 1;
+ return(SWAPBLK_NONE);
+}
+
+/*
+ * blist_meta_alloc() - allocate at a meta in the radix tree.
+ *
+ * Attempt to allocate at a meta node. If we can't, we update
+ *	bighint and return a failure.  Updating bighint optimizes future
+ * calls that hit this node. We have to check for our collapse cases
+ * and we have a few optimizations strewn in as well.
+ */
+
+static daddr_t
+blst_meta_alloc(
+ blmeta_t *scan,
+ daddr_t blk,
+ daddr_t count,
+ daddr_t radix,
+ int skip
+) {
+ int i;
+ int next_skip = ((u_int)skip / BLIST_META_RADIX);
+
+ if (scan->u.bmu_avail == 0) {
+ /*
+ * ALL-ALLOCATED special case
+ */
+ scan->bm_bighint = count;
+ return(SWAPBLK_NONE);
+ }
+
+ if (scan->u.bmu_avail == radix) {
+ radix /= BLIST_META_RADIX;
+
+ /*
+		 * ALL-FREE special case, initialize the uninitialized
+ * sublevel.
+ */
+ for (i = 1; i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1)
+ break;
+ if (next_skip == 1) {
+ scan[i].u.bmu_bitmap = (u_daddr_t)-1;
+ scan[i].bm_bighint = BLIST_BMAP_RADIX;
+ } else {
+ scan[i].bm_bighint = radix;
+ scan[i].u.bmu_avail = radix;
+ }
+ }
+ } else {
+ radix /= BLIST_META_RADIX;
+ }
+
+ for (i = 1; i <= skip; i += next_skip) {
+ if (count <= scan[i].bm_bighint) {
+ /*
+ * count fits in object
+ */
+ daddr_t r;
+ if (next_skip == 1) {
+ r = blst_leaf_alloc(&scan[i], blk, count);
+ } else {
+ r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1);
+ }
+ if (r != SWAPBLK_NONE) {
+ scan->u.bmu_avail -= count;
+ if (scan->bm_bighint > scan->u.bmu_avail)
+ scan->bm_bighint = scan->u.bmu_avail;
+ return(r);
+ }
+ } else if (scan[i].bm_bighint == (daddr_t)-1) {
+ /*
+ * Terminator
+ */
+ break;
+ } else if (count > radix) {
+ /*
+ * count does not fit in object even if it were
+			 * completely free.
+ */
+ panic("blist_meta_alloc: allocation too large");
+ }
+ blk += radix;
+ }
+
+ /*
+ * We couldn't allocate count in this subtree, update bighint.
+ */
+ if (scan->bm_bighint >= count)
+ scan->bm_bighint = count - 1;
+ return(SWAPBLK_NONE);
+}
+
+/*
+ * BLST_LEAF_FREE() - free allocated block from leaf bitmap
+ *
+ */
+
+static void
+blst_leaf_free(
+ blmeta_t *scan,
+ daddr_t blk,
+ int count
+) {
+ /*
+ * free some data in this bitmap
+ *
+ * e.g.
+ * 0000111111111110000
+ * \_________/\__/
+ * v n
+ */
+ int n = blk & (BLIST_BMAP_RADIX - 1);
+ u_daddr_t mask;
+
+ mask = ((u_daddr_t)-1 << n) &
+ ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
+
+ if (scan->u.bmu_bitmap & mask)
+ panic("blst_radix_free: freeing free block");
+ scan->u.bmu_bitmap |= mask;
+
+ /*
+ * We could probably do a better job here. We are required to make
+ * bighint at least as large as the biggest contiguous block of
+ * data. If we just shoehorn it, a little extra overhead will
+	 * be incurred on the next allocation (but only that one typically).
+ */
+ scan->bm_bighint = BLIST_BMAP_RADIX;
+}
+
+/*
+ * BLST_META_FREE() - free allocated blocks from radix tree meta info
+ *
+ * This support routine frees a range of blocks from the bitmap.
+ * The range must be entirely enclosed by this radix node. If a
+ * meta node, we break the range down recursively to free blocks
+ * in subnodes (which means that this code can free an arbitrary
+ * range whereas the allocation code cannot allocate an arbitrary
+ * range).
+ */
+
+static void
+blst_meta_free(
+ blmeta_t *scan,
+ daddr_t freeBlk,
+ daddr_t count,
+ daddr_t radix,
+ int skip,
+ daddr_t blk
+) {
+ int i;
+ int next_skip = ((u_int)skip / BLIST_META_RADIX);
+
+#if 0
+ printf("free (%llx,%lld) FROM (%llx,%lld)\n",
+ (long long)freeBlk, (long long)count,
+ (long long)blk, (long long)radix
+ );
+#endif
+
+ if (scan->u.bmu_avail == 0) {
+ /*
+ * ALL-ALLOCATED special case, with possible
+ * shortcut to ALL-FREE special case.
+ */
+ scan->u.bmu_avail = count;
+ scan->bm_bighint = count;
+
+ if (count != radix) {
+ for (i = 1; i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1)
+ break;
+ scan[i].bm_bighint = 0;
+ if (next_skip == 1) {
+ scan[i].u.bmu_bitmap = 0;
+ } else {
+ scan[i].u.bmu_avail = 0;
+ }
+ }
+ /* fall through */
+ }
+ } else {
+ scan->u.bmu_avail += count;
+ /* scan->bm_bighint = radix; */
+ }
+
+ /*
+ * ALL-FREE special case.
+ */
+
+ if (scan->u.bmu_avail == radix)
+ return;
+ if (scan->u.bmu_avail > radix)
+ panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
+ (long long)count, (long long)scan->u.bmu_avail,
+ (long long)radix);
+
+ /*
+ * Break the free down into its components
+ */
+
+ radix /= BLIST_META_RADIX;
+
+ i = (freeBlk - blk) / radix;
+ blk += i * radix;
+ i = i * next_skip + 1;
+
+ while (i <= skip && blk < freeBlk + count) {
+ daddr_t v;
+
+ v = blk + radix - freeBlk;
+ if (v > count)
+ v = count;
+
+ if (scan->bm_bighint == (daddr_t)-1)
+ panic("blst_meta_free: freeing unexpected range");
+
+ if (next_skip == 1) {
+ blst_leaf_free(&scan[i], freeBlk, v);
+ } else {
+ blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk);
+ }
+ if (scan->bm_bighint < scan[i].bm_bighint)
+ scan->bm_bighint = scan[i].bm_bighint;
+ count -= v;
+ freeBlk += v;
+ blk += radix;
+ i += next_skip;
+ }
+}
+
+/*
+ * BLIST_RADIX_COPY() - copy one radix tree to another
+ *
+ * Locates free space in the source tree and frees it in the destination
+ * tree. The space may not already be free in the destination.
+ */
+
+static void blst_copy(
+ blmeta_t *scan,
+ daddr_t blk,
+ daddr_t radix,
+ daddr_t skip,
+ blist_t dest,
+ daddr_t count
+) {
+ int next_skip;
+ int i;
+
+ /*
+ * Leaf node
+ */
+
+ if (radix == BLIST_BMAP_RADIX) {
+ u_daddr_t v = scan->u.bmu_bitmap;
+
+ if (v == (u_daddr_t)-1) {
+ blist_free(dest, blk, count);
+ } else if (v != 0) {
+ int i;
+
+ for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
+ if (v & (1 << i))
+ blist_free(dest, blk + i, 1);
+ }
+ }
+ return;
+ }
+
+ /*
+ * Meta node
+ */
+
+ if (scan->u.bmu_avail == 0) {
+ /*
+ * Source all allocated, leave dest allocated
+ */
+ return;
+ }
+ if (scan->u.bmu_avail == radix) {
+ /*
+ * Source all free, free entire dest
+ */
+ if (count < radix)
+ blist_free(dest, blk, count);
+ else
+ blist_free(dest, blk, radix);
+ return;
+ }
+
+
+ radix /= BLIST_META_RADIX;
+ next_skip = ((u_int)skip / BLIST_META_RADIX);
+
+ for (i = 1; count && i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1)
+ break;
+
+ if (count >= radix) {
+ blst_copy(
+ &scan[i],
+ blk,
+ radix,
+ next_skip - 1,
+ dest,
+ radix
+ );
+ count -= radix;
+ } else {
+ if (count) {
+ blst_copy(
+ &scan[i],
+ blk,
+ radix,
+ next_skip - 1,
+ dest,
+ count
+ );
+ }
+ count = 0;
+ }
+ blk += radix;
+ }
+}
+
+/*
+ * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap
+ *
+ * This routine allocates all blocks in the specified range
+ * regardless of any existing allocations in that range. Returns
+ * the number of blocks allocated by the call.
+ */
+
+static int
+blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
+{
+ int n = blk & (BLIST_BMAP_RADIX - 1);
+ int nblks;
+ u_daddr_t mask, bitmap;
+
+ mask = ((u_daddr_t)-1 << n) &
+ ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
+
+ /* Count the number of blocks we're about to allocate */
+ bitmap = scan->u.bmu_bitmap & mask;
+ for (nblks = 0; bitmap != 0; nblks++)
+ bitmap &= bitmap - 1;
+
+ scan->u.bmu_bitmap &= ~mask;
+ return nblks;
+}
+
+/*
+ * BLIST_META_FILL() - allocate specific blocks at a meta node
+ *
+ * This routine allocates the specified range of blocks,
+ * regardless of any existing allocations in the range. The
+ * range must be within the extent of this node. Returns the
+ * number of blocks allocated by the call.
+ */
+static int
+blst_meta_fill(
+ blmeta_t *scan,
+ daddr_t allocBlk,
+ daddr_t count,
+ daddr_t radix,
+ int skip,
+ daddr_t blk
+) {
+ int i;
+ int next_skip = ((u_int)skip / BLIST_META_RADIX);
+ int nblks = 0;
+
+ if (count == radix || scan->u.bmu_avail == 0) {
+ /*
+ * ALL-ALLOCATED special case
+ */
+ nblks = scan->u.bmu_avail;
+ scan->u.bmu_avail = 0;
+ scan->bm_bighint = count;
+ return nblks;
+ }
+
+ if (scan->u.bmu_avail == radix) {
+ radix /= BLIST_META_RADIX;
+
+ /*
+ * ALL-FREE special case, initialize sublevel
+ */
+ for (i = 1; i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1)
+ break;
+ if (next_skip == 1) {
+ scan[i].u.bmu_bitmap = (u_daddr_t)-1;
+ scan[i].bm_bighint = BLIST_BMAP_RADIX;
+ } else {
+ scan[i].bm_bighint = radix;
+ scan[i].u.bmu_avail = radix;
+ }
+ }
+ } else {
+ radix /= BLIST_META_RADIX;
+ }
+
+ if (count > radix)
+ panic("blist_meta_fill: allocation too large");
+
+ i = (allocBlk - blk) / radix;
+ blk += i * radix;
+ i = i * next_skip + 1;
+
+ while (i <= skip && blk < allocBlk + count) {
+ daddr_t v;
+
+ v = blk + radix - allocBlk;
+ if (v > count)
+ v = count;
+
+ if (scan->bm_bighint == (daddr_t)-1)
+ panic("blst_meta_fill: filling unexpected range");
+
+ if (next_skip == 1) {
+ nblks += blst_leaf_fill(&scan[i], allocBlk, v);
+ } else {
+ nblks += blst_meta_fill(&scan[i], allocBlk, v,
+ radix, next_skip - 1, blk);
+ }
+ count -= v;
+ allocBlk += v;
+ blk += radix;
+ i += next_skip;
+ }
+ scan->u.bmu_avail -= nblks;
+ return nblks;
+}
+
+/*
+ * BLST_RADIX_INIT() - initialize radix tree
+ *
+ * Initialize our meta structures and bitmaps and calculate the exact
+ * amount of space required to manage 'count' blocks - this space may
+ * be considerably less than the calculated radix due to the large
+ * RADIX values we use.
+ */
+
+static daddr_t
+blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
+{
+ int i;
+ int next_skip;
+ daddr_t memindex = 0;
+
+ /*
+ * Leaf node
+ */
+
+ if (radix == BLIST_BMAP_RADIX) {
+ if (scan) {
+ scan->bm_bighint = 0;
+ scan->u.bmu_bitmap = 0;
+ }
+ return(memindex);
+ }
+
+ /*
+ * Meta node. If allocating the entire object we can special
+ * case it. However, we need to figure out how much memory
+ * is required to manage 'count' blocks, so we continue on anyway.
+ */
+
+ if (scan) {
+ scan->bm_bighint = 0;
+ scan->u.bmu_avail = 0;
+ }
+
+ radix /= BLIST_META_RADIX;
+ next_skip = ((u_int)skip / BLIST_META_RADIX);
+
+ for (i = 1; i <= skip; i += next_skip) {
+ if (count >= radix) {
+ /*
+ * Allocate the entire object
+ */
+ memindex = i + blst_radix_init(
+ ((scan) ? &scan[i] : NULL),
+ radix,
+ next_skip - 1,
+ radix
+ );
+ count -= radix;
+ } else if (count > 0) {
+ /*
+ * Allocate a partial object
+ */
+ memindex = i + blst_radix_init(
+ ((scan) ? &scan[i] : NULL),
+ radix,
+ next_skip - 1,
+ count
+ );
+ count = 0;
+ } else {
+ /*
+ * Add terminator and break out
+ */
+ if (scan)
+ scan[i].bm_bighint = (daddr_t)-1;
+ break;
+ }
+ }
+ if (memindex < i)
+ memindex = i;
+ return(memindex);
+}
+
+#ifdef BLIST_DEBUG
+
+static void
+blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
+{
+ int i;
+ int next_skip;
+ int lastState = 0;
+
+ if (radix == BLIST_BMAP_RADIX) {
+ printf(
+ "%*.*s(%08llx,%lld): bitmap %08llx big=%lld\n",
+ tab, tab, "",
+ (long long)blk, (long long)radix,
+ (long long)scan->u.bmu_bitmap,
+ (long long)scan->bm_bighint
+ );
+ return;
+ }
+
+ if (scan->u.bmu_avail == 0) {
+ printf(
+ "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
+ tab, tab, "",
+ (long long)blk,
+ (long long)radix
+ );
+ return;
+ }
+ if (scan->u.bmu_avail == radix) {
+ printf(
+ "%*.*s(%08llx,%lld) ALL FREE\n",
+ tab, tab, "",
+ (long long)blk,
+ (long long)radix
+ );
+ return;
+ }
+
+ printf(
+ "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
+ tab, tab, "",
+ (long long)blk, (long long)radix,
+ (long long)scan->u.bmu_avail,
+ (long long)radix,
+ (long long)scan->bm_bighint
+ );
+
+ radix /= BLIST_META_RADIX;
+ next_skip = ((u_int)skip / BLIST_META_RADIX);
+ tab += 4;
+
+ for (i = 1; i <= skip; i += next_skip) {
+ if (scan[i].bm_bighint == (daddr_t)-1) {
+ printf(
+ "%*.*s(%08llx,%lld): Terminator\n",
+ tab, tab, "",
+ (long long)blk, (long long)radix
+ );
+ lastState = 0;
+ break;
+ }
+ blst_radix_print(
+ &scan[i],
+ blk,
+ radix,
+ next_skip - 1,
+ tab
+ );
+ blk += radix;
+ }
+ tab -= 4;
+
+ printf(
+ "%*.*s}\n",
+ tab, tab, ""
+ );
+}
+
+#endif
+
+#ifdef BLIST_DEBUG
+
+int
+main(int ac, char **av)
+{
+ int size = 1024;
+ int i;
+ blist_t bl;
+
+ for (i = 1; i < ac; ++i) {
+ const char *ptr = av[i];
+ if (*ptr != '-') {
+ size = strtol(ptr, NULL, 0);
+ continue;
+ }
+ ptr += 2;
+ fprintf(stderr, "Bad option: %s\n", ptr - 2);
+ exit(1);
+ }
+ bl = blist_create(size, M_WAITOK);
+ blist_free(bl, 0, size);
+
+ for (;;) {
+ char buf[1024];
+ daddr_t da = 0;
+ daddr_t count = 0;
+
+
+ printf("%lld/%lld/%lld> ", (long long)bl->bl_free,
+ (long long)size, (long long)bl->bl_radix);
+ fflush(stdout);
+ if (fgets(buf, sizeof(buf), stdin) == NULL)
+ break;
+ switch(buf[0]) {
+ case 'r':
+ if (sscanf(buf + 1, "%lld", &count) == 1) {
+			blist_resize(&bl, count, 1, M_WAITOK);
+ } else {
+ printf("?\n");
+ }
+ case 'p':
+ blist_print(bl);
+ break;
+ case 'a':
+ if (sscanf(buf + 1, "%lld", &count) == 1) {
+ daddr_t blk = blist_alloc(bl, count);
+ printf(" R=%08llx\n", (long long)blk);
+ } else {
+ printf("?\n");
+ }
+ break;
+ case 'f':
+ if (sscanf(buf + 1, "%llx %lld",
+ (long long *)&da, (long long *)&count) == 2) {
+ blist_free(bl, da, count);
+ } else {
+ printf("?\n");
+ }
+ break;
+ case 'l':
+ if (sscanf(buf + 1, "%llx %lld",
+ (long long *)&da, (long long *)&count) == 2) {
+ printf(" n=%d\n",
+ blist_fill(bl, da, count));
+ } else {
+ printf("?\n");
+ }
+ break;
+ case '?':
+ case 'h':
+ puts(
+ "p -print\n"
+ "a %d -allocate\n"
+ "f %x %d -free\n"
+ "l %x %d -fill\n"
+ "r %d -resize\n"
+ "h/? -help"
+ );
+ break;
+ default:
+ printf("?\n");
+ break;
+ }
+ }
+ return(0);
+}
+
+void
+panic(const char *ctl, ...)
+{
+ va_list va;
+
+ va_start(va, ctl);
+ vfprintf(stderr, ctl, va);
+ fprintf(stderr, "\n");
+ va_end(va);
+ exit(1);
+}
+
+#endif
+
diff --git a/sys/kern/subr_bufring.c b/sys/kern/subr_bufring.c
new file mode 100644
index 0000000..4cd3929
--- /dev/null
+++ b/sys/kern/subr_bufring.c
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2007, 2008 Kip Macy <kmacy@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/ktr.h>
+#include <sys/buf_ring.h>
+
+
+struct buf_ring *
+buf_ring_alloc(int count, struct malloc_type *type, int flags, struct mtx *lock)
+{
+ struct buf_ring *br;
+
+ KASSERT(powerof2(count), ("buf ring must be size power of 2"));
+
+ br = malloc(sizeof(struct buf_ring) + count*sizeof(caddr_t),
+ type, flags|M_ZERO);
+ if (br == NULL)
+ return (NULL);
+#ifdef DEBUG_BUFRING
+ br->br_lock = lock;
+#endif
+ br->br_prod_size = br->br_cons_size = count;
+ br->br_prod_mask = br->br_cons_mask = count-1;
+ br->br_prod_head = br->br_cons_head = 0;
+ br->br_prod_tail = br->br_cons_tail = 0;
+
+ return (br);
+}
+
+void
+buf_ring_free(struct buf_ring *br, struct malloc_type *type)
+{
+ free(br, type);
+}
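+
+/*
+ * Usage sketch (illustrative; sc, tx_mtx and foo_xmit() are
+ * hypothetical, and buf_ring_enqueue()/buf_ring_dequeue_sc() are the
+ * inline helpers declared in <sys/buf_ring.h>, with enqueue returning
+ * non-zero when the ring is full):
+ *
+ *	br = buf_ring_alloc(256, M_DEVBUF, M_WAITOK, &sc->tx_mtx);
+ *	if (buf_ring_enqueue(br, m) != 0)
+ *		m_freem(m);
+ *	while ((m = buf_ring_dequeue_sc(br)) != NULL)
+ *		foo_xmit(sc, m);
+ *	buf_ring_free(br, M_DEVBUF);
+ */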
diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c
new file mode 100644
index 0000000..b3b1852
--- /dev/null
+++ b/sys/kern/subr_bus.c
@@ -0,0 +1,4885 @@
+/*-
+ * Copyright (c) 1997,1998,2003 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_bus.h"
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/filio.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/kobj.h>
+#include <sys/limits.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/condvar.h>
+#include <sys/queue.h>
+#include <machine/bus.h>
+#include <sys/rman.h>
+#include <sys/selinfo.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/uio.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+
+#include <net/vnet.h>
+
+#include <machine/stdarg.h>
+
+#include <vm/uma.h>
+
+SYSCTL_NODE(_hw, OID_AUTO, bus, CTLFLAG_RW, NULL, NULL);
+SYSCTL_NODE(, OID_AUTO, dev, CTLFLAG_RW, NULL, NULL);
+
+/*
+ * Used to attach drivers to devclasses.
+ */
+typedef struct driverlink *driverlink_t;
+struct driverlink {
+ kobj_class_t driver;
+ TAILQ_ENTRY(driverlink) link; /* list of drivers in devclass */
+ int pass;
+ TAILQ_ENTRY(driverlink) passlink;
+};
+
+/*
+ * Forward declarations
+ */
+typedef TAILQ_HEAD(devclass_list, devclass) devclass_list_t;
+typedef TAILQ_HEAD(driver_list, driverlink) driver_list_t;
+typedef TAILQ_HEAD(device_list, device) device_list_t;
+
+struct devclass {
+ TAILQ_ENTRY(devclass) link;
+ devclass_t parent; /* parent in devclass hierarchy */
+ driver_list_t drivers; /* bus devclasses store drivers for bus */
+ char *name;
+ device_t *devices; /* array of devices indexed by unit */
+ int maxunit; /* size of devices array */
+ int flags;
+#define DC_HAS_CHILDREN 1
+
+ struct sysctl_ctx_list sysctl_ctx;
+ struct sysctl_oid *sysctl_tree;
+};
+
+/**
+ * @brief Implementation of device.
+ */
+struct device {
+ /*
+ * A device is a kernel object. The first field must be the
+ * current ops table for the object.
+ */
+ KOBJ_FIELDS;
+
+ /*
+ * Device hierarchy.
+ */
+ TAILQ_ENTRY(device) link; /**< list of devices in parent */
+ TAILQ_ENTRY(device) devlink; /**< global device list membership */
+ device_t parent; /**< parent of this device */
+ device_list_t children; /**< list of child devices */
+
+ /*
+ * Details of this device.
+ */
+ driver_t *driver; /**< current driver */
+ devclass_t devclass; /**< current device class */
+ int unit; /**< current unit number */
+ char* nameunit; /**< name+unit e.g. foodev0 */
+ char* desc; /**< driver specific description */
+ int busy; /**< count of calls to device_busy() */
+ device_state_t state; /**< current device state */
+ uint32_t devflags; /**< api level flags for device_get_flags() */
+ u_int flags; /**< internal device flags */
+#define DF_ENABLED 0x01 /* device should be probed/attached */
+#define DF_FIXEDCLASS 0x02 /* devclass specified at create time */
+#define DF_WILDCARD 0x04 /* unit was originally wildcard */
+#define DF_DESCMALLOCED 0x08 /* description was malloced */
+#define DF_QUIET 0x10 /* don't print verbose attach message */
+#define DF_DONENOMATCH 0x20 /* don't execute DEVICE_NOMATCH again */
+#define DF_EXTERNALSOFTC 0x40 /* softc not allocated by us */
+#define DF_REBID 0x80 /* Can rebid after attach */
+ u_int order; /**< order from device_add_child_ordered() */
+ void *ivars; /**< instance variables */
+ void *softc; /**< current driver's variables */
+
+ struct sysctl_ctx_list sysctl_ctx; /**< state for sysctl variables */
+ struct sysctl_oid *sysctl_tree; /**< state for sysctl variables */
+};
+
+static MALLOC_DEFINE(M_BUS, "bus", "Bus data structures");
+static MALLOC_DEFINE(M_BUS_SC, "bus-sc", "Bus data structures, softc");
+
+#ifdef BUS_DEBUG
+
+static int bus_debug = 1;
+TUNABLE_INT("bus.debug", &bus_debug);
+SYSCTL_INT(_debug, OID_AUTO, bus_debug, CTLFLAG_RW, &bus_debug, 0,
+ "Debug bus code");
+
+#define PDEBUG(a) if (bus_debug) {printf("%s:%d: ", __func__, __LINE__), printf a; printf("\n");}
+#define DEVICENAME(d) ((d)? device_get_name(d): "no device")
+#define DRIVERNAME(d) ((d)? d->name : "no driver")
+#define DEVCLANAME(d) ((d)? d->name : "no devclass")
+
+/**
+ * Produce the indenting, indent*2 spaces plus a '.' ahead of that to
+ * prevent syslog from deleting initial spaces
+ */
+#define indentprintf(p) do { int iJ; printf("."); for (iJ=0; iJ<indent; iJ++) printf(" "); printf p ; } while (0)
+
+static void print_device_short(device_t dev, int indent);
+static void print_device(device_t dev, int indent);
+void print_device_tree_short(device_t dev, int indent);
+void print_device_tree(device_t dev, int indent);
+static void print_driver_short(driver_t *driver, int indent);
+static void print_driver(driver_t *driver, int indent);
+static void print_driver_list(driver_list_t drivers, int indent);
+static void print_devclass_short(devclass_t dc, int indent);
+static void print_devclass(devclass_t dc, int indent);
+void print_devclass_list_short(void);
+void print_devclass_list(void);
+
+#else
+/* Make the compiler ignore the function calls */
+#define PDEBUG(a) /* nop */
+#define DEVICENAME(d) /* nop */
+#define DRIVERNAME(d) /* nop */
+#define DEVCLANAME(d) /* nop */
+
+#define print_device_short(d,i) /* nop */
+#define print_device(d,i) /* nop */
+#define print_device_tree_short(d,i) /* nop */
+#define print_device_tree(d,i) /* nop */
+#define print_driver_short(d,i) /* nop */
+#define print_driver(d,i) /* nop */
+#define print_driver_list(d,i) /* nop */
+#define print_devclass_short(d,i) /* nop */
+#define print_devclass(d,i) /* nop */
+#define print_devclass_list_short() /* nop */
+#define print_devclass_list() /* nop */
+#endif
+
+/*
+ * dev sysctl tree
+ */
+
+enum {
+ DEVCLASS_SYSCTL_PARENT,
+};
+
+static int
+devclass_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+ devclass_t dc = (devclass_t)arg1;
+ const char *value;
+
+ switch (arg2) {
+ case DEVCLASS_SYSCTL_PARENT:
+ value = dc->parent ? dc->parent->name : "";
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (SYSCTL_OUT(req, value, strlen(value)));
+}
+
+static void
+devclass_sysctl_init(devclass_t dc)
+{
+
+ if (dc->sysctl_tree != NULL)
+ return;
+ sysctl_ctx_init(&dc->sysctl_ctx);
+ dc->sysctl_tree = SYSCTL_ADD_NODE(&dc->sysctl_ctx,
+ SYSCTL_STATIC_CHILDREN(_dev), OID_AUTO, dc->name,
+ CTLFLAG_RD, NULL, "");
+ SYSCTL_ADD_PROC(&dc->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree),
+ OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD,
+ dc, DEVCLASS_SYSCTL_PARENT, devclass_sysctl_handler, "A",
+ "parent class");
+}
+
+enum {
+ DEVICE_SYSCTL_DESC,
+ DEVICE_SYSCTL_DRIVER,
+ DEVICE_SYSCTL_LOCATION,
+ DEVICE_SYSCTL_PNPINFO,
+ DEVICE_SYSCTL_PARENT,
+};
+
+static int
+device_sysctl_handler(SYSCTL_HANDLER_ARGS)
+{
+ device_t dev = (device_t)arg1;
+ const char *value;
+ char *buf;
+ int error;
+
+ buf = NULL;
+ switch (arg2) {
+ case DEVICE_SYSCTL_DESC:
+ value = dev->desc ? dev->desc : "";
+ break;
+ case DEVICE_SYSCTL_DRIVER:
+ value = dev->driver ? dev->driver->name : "";
+ break;
+ case DEVICE_SYSCTL_LOCATION:
+ value = buf = malloc(1024, M_BUS, M_WAITOK | M_ZERO);
+ bus_child_location_str(dev, buf, 1024);
+ break;
+ case DEVICE_SYSCTL_PNPINFO:
+ value = buf = malloc(1024, M_BUS, M_WAITOK | M_ZERO);
+ bus_child_pnpinfo_str(dev, buf, 1024);
+ break;
+ case DEVICE_SYSCTL_PARENT:
+ value = dev->parent ? dev->parent->nameunit : "";
+ break;
+ default:
+ return (EINVAL);
+ }
+ error = SYSCTL_OUT(req, value, strlen(value));
+ if (buf != NULL)
+ free(buf, M_BUS);
+ return (error);
+}
+
+static void
+device_sysctl_init(device_t dev)
+{
+ devclass_t dc = dev->devclass;
+
+ if (dev->sysctl_tree != NULL)
+ return;
+ devclass_sysctl_init(dc);
+ sysctl_ctx_init(&dev->sysctl_ctx);
+ dev->sysctl_tree = SYSCTL_ADD_NODE(&dev->sysctl_ctx,
+ SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO,
+ dev->nameunit + strlen(dc->name),
+ CTLFLAG_RD, NULL, "");
+ SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
+ OID_AUTO, "%desc", CTLTYPE_STRING | CTLFLAG_RD,
+ dev, DEVICE_SYSCTL_DESC, device_sysctl_handler, "A",
+ "device description");
+ SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
+ OID_AUTO, "%driver", CTLTYPE_STRING | CTLFLAG_RD,
+ dev, DEVICE_SYSCTL_DRIVER, device_sysctl_handler, "A",
+ "device driver name");
+ SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
+ OID_AUTO, "%location", CTLTYPE_STRING | CTLFLAG_RD,
+ dev, DEVICE_SYSCTL_LOCATION, device_sysctl_handler, "A",
+ "device location relative to parent");
+ SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
+ OID_AUTO, "%pnpinfo", CTLTYPE_STRING | CTLFLAG_RD,
+ dev, DEVICE_SYSCTL_PNPINFO, device_sysctl_handler, "A",
+ "device identification");
+ SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
+ OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD,
+ dev, DEVICE_SYSCTL_PARENT, device_sysctl_handler, "A",
+ "parent device");
+}
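+
+/*
+ * Example (illustrative): for a device "foo0" in devclass "foo", the
+ * code above publishes read-only string nodes such as dev.foo.0.%desc,
+ * dev.foo.0.%driver, dev.foo.0.%location, dev.foo.0.%pnpinfo and
+ * dev.foo.0.%parent, while devclass_sysctl_init() provides
+ * dev.foo.%parent for the class itself.
+ */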
+
+static void
+device_sysctl_update(device_t dev)
+{
+ devclass_t dc = dev->devclass;
+
+ if (dev->sysctl_tree == NULL)
+ return;
+ sysctl_rename_oid(dev->sysctl_tree, dev->nameunit + strlen(dc->name));
+}
+
+static void
+device_sysctl_fini(device_t dev)
+{
+ if (dev->sysctl_tree == NULL)
+ return;
+ sysctl_ctx_free(&dev->sysctl_ctx);
+ dev->sysctl_tree = NULL;
+}
+
+/*
+ * /dev/devctl implementation
+ */
+
+/*
+ * This design allows only one reader for /dev/devctl. This is not desirable
+ * in the long run, but will get a lot of hair out of this implementation.
+ * Maybe we should make this device a clonable device.
+ *
+ * Also note: we specifically do not attach a device to the device_t tree
+ * to avoid potential chicken and egg problems. One could argue that all
+ * of this belongs to the root node. One could also further argue that the
+ * sysctl interface that we have now might more properly be an ioctl
+ * interface, but at this stage of the game, I'm not inclined to rock that
+ * boat.
+ *
+ * I'm also not sure that the SIGIO support is done correctly or not, as
+ * I copied it from a driver that had SIGIO support that likely hasn't been
+ * tested since 3.4 or 2.2.8!
+ */
+
+/* Deprecated way to adjust queue length */
+static int sysctl_devctl_disable(SYSCTL_HANDLER_ARGS);
+/* XXX Need to support old-style tunable hw.bus.devctl_disable */
+SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_disable, CTLTYPE_INT | CTLFLAG_RW, NULL,
+ 0, sysctl_devctl_disable, "I", "devctl disable -- deprecated");
+
+#define DEVCTL_DEFAULT_QUEUE_LEN 1000
+static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS);
+static int devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN;
+TUNABLE_INT("hw.bus.devctl_queue", &devctl_queue_length);
+SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_queue, CTLTYPE_INT | CTLFLAG_RW, NULL,
+ 0, sysctl_devctl_queue, "I", "devctl queue length");
+
+static d_open_t devopen;
+static d_close_t devclose;
+static d_read_t devread;
+static d_ioctl_t devioctl;
+static d_poll_t devpoll;
+
+static struct cdevsw dev_cdevsw = {
+ .d_version = D_VERSION,
+ .d_flags = D_NEEDGIANT,
+ .d_open = devopen,
+ .d_close = devclose,
+ .d_read = devread,
+ .d_ioctl = devioctl,
+ .d_poll = devpoll,
+ .d_name = "devctl",
+};
+
+struct dev_event_info
+{
+ char *dei_data;
+ TAILQ_ENTRY(dev_event_info) dei_link;
+};
+
+TAILQ_HEAD(devq, dev_event_info);
+
+static struct dev_softc
+{
+ int inuse;
+ int nonblock;
+ int queued;
+ struct mtx mtx;
+ struct cv cv;
+ struct selinfo sel;
+ struct devq devq;
+ struct proc *async_proc;
+} devsoftc;
+
+static struct cdev *devctl_dev;
+
+static void
+devinit(void)
+{
+ devctl_dev = make_dev_credf(MAKEDEV_ETERNAL, &dev_cdevsw, 0, NULL,
+ UID_ROOT, GID_WHEEL, 0600, "devctl");
+ mtx_init(&devsoftc.mtx, "dev mtx", "devd", MTX_DEF);
+ cv_init(&devsoftc.cv, "dev cv");
+ TAILQ_INIT(&devsoftc.devq);
+}
+
+static int
+devopen(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ if (devsoftc.inuse)
+ return (EBUSY);
+ /* move to init */
+ devsoftc.inuse = 1;
+ devsoftc.nonblock = 0;
+ devsoftc.async_proc = NULL;
+ return (0);
+}
+
+static int
+devclose(struct cdev *dev, int fflag, int devtype, struct thread *td)
+{
+ devsoftc.inuse = 0;
+ mtx_lock(&devsoftc.mtx);
+ cv_broadcast(&devsoftc.cv);
+ mtx_unlock(&devsoftc.mtx);
+ devsoftc.async_proc = NULL;
+ return (0);
+}
+
+/*
+ * The read channel for this device is used to report changes to
+ * userland in realtime. We are required to free the data as well as
+ * the n1 object because we allocate them separately. Also note that
+ * we return one record at a time. If you try to read this device a
+ * character at a time, you will lose the rest of the data. Listening
+ * programs are expected to cope.
+ */
+static int
+devread(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ struct dev_event_info *n1;
+ int rv;
+
+ mtx_lock(&devsoftc.mtx);
+ while (TAILQ_EMPTY(&devsoftc.devq)) {
+ if (devsoftc.nonblock) {
+ mtx_unlock(&devsoftc.mtx);
+ return (EAGAIN);
+ }
+ rv = cv_wait_sig(&devsoftc.cv, &devsoftc.mtx);
+ if (rv) {
+ /*
+ * Need to translate ERESTART to EINTR here? -- jake
+ */
+ mtx_unlock(&devsoftc.mtx);
+ return (rv);
+ }
+ }
+ n1 = TAILQ_FIRST(&devsoftc.devq);
+ TAILQ_REMOVE(&devsoftc.devq, n1, dei_link);
+ devsoftc.queued--;
+ mtx_unlock(&devsoftc.mtx);
+ rv = uiomove(n1->dei_data, strlen(n1->dei_data), uio);
+ free(n1->dei_data, M_BUS);
+ free(n1, M_BUS);
+ return (rv);
+}
+
+static int
+devioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
+{
+ switch (cmd) {
+
+ case FIONBIO:
+ if (*(int*)data)
+ devsoftc.nonblock = 1;
+ else
+ devsoftc.nonblock = 0;
+ return (0);
+ case FIOASYNC:
+ if (*(int*)data)
+ devsoftc.async_proc = td->td_proc;
+ else
+ devsoftc.async_proc = NULL;
+ return (0);
+
+ /* (un)Support for other fcntl() calls. */
+ case FIOCLEX:
+ case FIONCLEX:
+ case FIONREAD:
+ case FIOSETOWN:
+ case FIOGETOWN:
+ default:
+ break;
+ }
+ return (ENOTTY);
+}
+
+static int
+devpoll(struct cdev *dev, int events, struct thread *td)
+{
+ int revents = 0;
+
+ mtx_lock(&devsoftc.mtx);
+ if (events & (POLLIN | POLLRDNORM)) {
+ if (!TAILQ_EMPTY(&devsoftc.devq))
+ revents = events & (POLLIN | POLLRDNORM);
+ else
+ selrecord(td, &devsoftc.sel);
+ }
+ mtx_unlock(&devsoftc.mtx);
+
+ return (revents);
+}
+
+/**
+ * @brief Return whether the userland process is running
+ */
+boolean_t
+devctl_process_running(void)
+{
+ return (devsoftc.inuse == 1);
+}
+
+/**
+ * @brief Queue data to be read from the devctl device
+ *
+ * Generic interface to queue data to the devctl device. It is
+ * assumed that @p data is properly formatted. It is further assumed
+ * that @p data is allocated using the M_BUS malloc type.
+ */
+void
+devctl_queue_data_f(char *data, int flags)
+{
+ struct dev_event_info *n1 = NULL, *n2 = NULL;
+ struct proc *p;
+
+ if (strlen(data) == 0)
+ goto out;
+ if (devctl_queue_length == 0)
+ goto out;
+ n1 = malloc(sizeof(*n1), M_BUS, flags);
+ if (n1 == NULL)
+ goto out;
+ n1->dei_data = data;
+ mtx_lock(&devsoftc.mtx);
+ if (devctl_queue_length == 0) {
+ mtx_unlock(&devsoftc.mtx);
+ free(n1->dei_data, M_BUS);
+ free(n1, M_BUS);
+ return;
+ }
+ /* Leave at least one spot in the queue... */
+ while (devsoftc.queued > devctl_queue_length - 1) {
+ n2 = TAILQ_FIRST(&devsoftc.devq);
+ TAILQ_REMOVE(&devsoftc.devq, n2, dei_link);
+ free(n2->dei_data, M_BUS);
+ free(n2, M_BUS);
+ devsoftc.queued--;
+ }
+ TAILQ_INSERT_TAIL(&devsoftc.devq, n1, dei_link);
+ devsoftc.queued++;
+ cv_broadcast(&devsoftc.cv);
+ mtx_unlock(&devsoftc.mtx);
+ selwakeup(&devsoftc.sel);
+ p = devsoftc.async_proc;
+ if (p != NULL) {
+ PROC_LOCK(p);
+ kern_psignal(p, SIGIO);
+ PROC_UNLOCK(p);
+ }
+ return;
+out:
+ /*
+ * We have to free data on all error paths since the caller
+ * assumes it will be free'd when this item is dequeued.
+ */
+ free(data, M_BUS);
+ return;
+}
+
+void
+devctl_queue_data(char *data)
+{
+
+ devctl_queue_data_f(data, M_NOWAIT);
+}
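+
+/*
+ * Example (a sketch) of queueing a custom, pre-formatted message.  The
+ * buffer must come from the M_BUS malloc type because the dequeue path
+ * frees it with free(..., M_BUS); the FOO/BAR/BAZ values are placeholders:
+ *
+ *	char *msg;
+ *
+ *	msg = malloc(64, M_BUS, M_NOWAIT);
+ *	if (msg != NULL) {
+ *		snprintf(msg, 64, "!system=FOO subsystem=BAR type=BAZ\n");
+ *		devctl_queue_data(msg);
+ *	}
+ */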
+
+/**
+ * @brief Send a 'notification' to userland, using standard ways
+ */
+void
+devctl_notify_f(const char *system, const char *subsystem, const char *type,
+ const char *data, int flags)
+{
+ int len = 0;
+ char *msg;
+
+ if (system == NULL)
+ return; /* BOGUS! Must specify system. */
+ if (subsystem == NULL)
+ return; /* BOGUS! Must specify subsystem. */
+ if (type == NULL)
+ return; /* BOGUS! Must specify type. */
+ len += strlen(" system=") + strlen(system);
+ len += strlen(" subsystem=") + strlen(subsystem);
+ len += strlen(" type=") + strlen(type);
+ /* add in the data message plus newline. */
+ if (data != NULL)
+ len += strlen(data);
+ len += 3; /* '!', '\n', and NUL */
+ msg = malloc(len, M_BUS, flags);
+ if (msg == NULL)
+ return; /* Drop it on the floor */
+ if (data != NULL)
+ snprintf(msg, len, "!system=%s subsystem=%s type=%s %s\n",
+ system, subsystem, type, data);
+ else
+ snprintf(msg, len, "!system=%s subsystem=%s type=%s\n",
+ system, subsystem, type);
+ devctl_queue_data_f(msg, flags);
+}
+
+void
+devctl_notify(const char *system, const char *subsystem, const char *type,
+ const char *data)
+{
+
+ devctl_notify_f(system, subsystem, type, data, M_NOWAIT);
+}
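+
+/*
+ * Example (illustrative only): a subsystem can announce an event to the
+ * /dev/devctl reader with a single call such as
+ *
+ *	devctl_notify("IFNET", "em0", "LINK_UP", NULL);
+ *
+ * which, given the format strings above, queues the line
+ *
+ *	!system=IFNET subsystem=em0 type=LINK_UP
+ *
+ * The system/subsystem/type values here are examples; callers define
+ * their own namespace.
+ */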
+
+/*
+ * Common routine that tries to make sending messages as easy as possible.
+ * We allocate memory for the data, copy strings into that, but do not
+ * free it unless there's an error. The dequeue part of the driver should
+ * free the data. We don't send data when the device is disabled. We do
+ * send data, even when we have no listeners, because we wish to avoid
+ * races relating to startup and restart of listening applications.
+ *
+ * devaddq is designed to string together the type of event, with the
+ * object of that event, plus the plug and play info and location info
+ * for that event. This is likely most useful for devices, but less
+ * useful for other consumers of this interface. Those should use
+ * the devctl_queue_data() interface instead.
+ */
+static void
+devaddq(const char *type, const char *what, device_t dev)
+{
+ char *data = NULL;
+ char *loc = NULL;
+ char *pnp = NULL;
+ const char *parstr;
+
+ if (!devctl_queue_length)	/* Rare race; a lost event is safely discarded */
+ return;
+ data = malloc(1024, M_BUS, M_NOWAIT);
+ if (data == NULL)
+ goto bad;
+
+ /* get the bus specific location of this device */
+ loc = malloc(1024, M_BUS, M_NOWAIT);
+ if (loc == NULL)
+ goto bad;
+ *loc = '\0';
+ bus_child_location_str(dev, loc, 1024);
+
+ /* Get the bus specific pnp info of this device */
+ pnp = malloc(1024, M_BUS, M_NOWAIT);
+ if (pnp == NULL)
+ goto bad;
+ *pnp = '\0';
+ bus_child_pnpinfo_str(dev, pnp, 1024);
+
+ /* Get the parent of this device, or / if high enough in the tree. */
+ if (device_get_parent(dev) == NULL)
+ parstr = "."; /* Or '/' ? */
+ else
+ parstr = device_get_nameunit(device_get_parent(dev));
+ /* String it all together. */
+ snprintf(data, 1024, "%s%s at %s %s on %s\n", type, what, loc, pnp,
+ parstr);
+ free(loc, M_BUS);
+ free(pnp, M_BUS);
+ devctl_queue_data(data);
+ return;
+bad:
+ free(pnp, M_BUS);
+ free(loc, M_BUS);
+ free(data, M_BUS);
+ return;
+}
+
+/*
+ * A device was added to the tree. We are called just after it successfully
+ * attaches (that is, probe and attach success for this device). No call
+ * is made if a device is merely parented into the tree. See devnomatch
+ * if probe fails. If attach fails, no notification is sent (but maybe
+ * we should have a different message for this).
+ */
+static void
+devadded(device_t dev)
+{
+ devaddq("+", device_get_nameunit(dev), dev);
+}
+
+/*
+ * A device was removed from the tree. We are called just before this
+ * happens.
+ */
+static void
+devremoved(device_t dev)
+{
+ devaddq("-", device_get_nameunit(dev), dev);
+}
+
+/*
+ * Called when there's no match for this device. This is only called
+ * the first time that no match happens, so we don't keep getting this
+ * message. Should that prove to be undesirable, we can change it.
+ * This is called when all drivers that can attach to a given bus
+ * decline to accept this device. Other errors may not be detected.
+ */
+static void
+devnomatch(device_t dev)
+{
+ devaddq("?", "", dev);
+}
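+
+/*
+ * Illustrative examples of the resulting messages (device and bus names,
+ * location and pnpinfo strings are hypothetical placeholders):
+ *
+ *	+foo0 at <location> <pnpinfo> on bus0		(devadded)
+ *	-foo0 at <location> <pnpinfo> on bus0		(devremoved)
+ *	? at <location> <pnpinfo> on bus0		(devnomatch)
+ */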
+
+static int
+sysctl_devctl_disable(SYSCTL_HANDLER_ARGS)
+{
+ struct dev_event_info *n1;
+ int dis, error;
+
+ dis = devctl_queue_length == 0;
+ error = sysctl_handle_int(oidp, &dis, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ mtx_lock(&devsoftc.mtx);
+ if (dis) {
+ while (!TAILQ_EMPTY(&devsoftc.devq)) {
+ n1 = TAILQ_FIRST(&devsoftc.devq);
+ TAILQ_REMOVE(&devsoftc.devq, n1, dei_link);
+ free(n1->dei_data, M_BUS);
+ free(n1, M_BUS);
+ }
+ devsoftc.queued = 0;
+ devctl_queue_length = 0;
+ } else {
+ devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN;
+ }
+ mtx_unlock(&devsoftc.mtx);
+ return (0);
+}
+
+static int
+sysctl_devctl_queue(SYSCTL_HANDLER_ARGS)
+{
+ struct dev_event_info *n1;
+ int q, error;
+
+ q = devctl_queue_length;
+ error = sysctl_handle_int(oidp, &q, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (q < 0)
+ return (EINVAL);
+ mtx_lock(&devsoftc.mtx);
+ devctl_queue_length = q;
+ while (devsoftc.queued > devctl_queue_length) {
+ n1 = TAILQ_FIRST(&devsoftc.devq);
+ TAILQ_REMOVE(&devsoftc.devq, n1, dei_link);
+ free(n1->dei_data, M_BUS);
+ free(n1, M_BUS);
+ devsoftc.queued--;
+ }
+ mtx_unlock(&devsoftc.mtx);
+ return (0);
+}
+
+/* End of /dev/devctl code */
+
+static TAILQ_HEAD(,device) bus_data_devices;
+static int bus_data_generation = 1;
+
+static kobj_method_t null_methods[] = {
+ KOBJMETHOD_END
+};
+
+DEFINE_CLASS(null, null_methods, 0);
+
+/*
+ * Bus pass implementation
+ */
+
+static driver_list_t passes = TAILQ_HEAD_INITIALIZER(passes);
+int bus_current_pass = BUS_PASS_ROOT;
+
+/**
+ * @internal
+ * @brief Register the pass level of a new driver attachment
+ *
+ * Register a new driver attachment's pass level. If no driver
+ * attachment with the same pass level has been added, then @p new
+ * will be added to the global passes list.
+ *
+ * @param new the new driver attachment
+ */
+static void
+driver_register_pass(struct driverlink *new)
+{
+ struct driverlink *dl;
+
+ /* We only consider pass numbers during boot. */
+ if (bus_current_pass == BUS_PASS_DEFAULT)
+ return;
+
+ /*
+ * Walk the passes list. If we already know about this pass
+ * then there is nothing to do. If we don't, then insert this
+ * driver link into the list.
+ */
+ TAILQ_FOREACH(dl, &passes, passlink) {
+ if (dl->pass < new->pass)
+ continue;
+ if (dl->pass == new->pass)
+ return;
+ TAILQ_INSERT_BEFORE(dl, new, passlink);
+ return;
+ }
+ TAILQ_INSERT_TAIL(&passes, new, passlink);
+}
+
+/**
+ * @brief Raise the current bus pass
+ *
+ * Raise the current bus pass level to @p pass. Call the BUS_NEW_PASS()
+ * method on the root bus to kick off a new device tree scan for each
+ * new pass level that has at least one driver.
+ */
+void
+bus_set_pass(int pass)
+{
+ struct driverlink *dl;
+
+ if (bus_current_pass > pass)
+ panic("Attempt to lower bus pass level");
+
+ TAILQ_FOREACH(dl, &passes, passlink) {
+ /* Skip pass values below the current pass level. */
+ if (dl->pass <= bus_current_pass)
+ continue;
+
+ /*
+ * Bail once we hit a driver with a pass level that is
+ * too high.
+ */
+ if (dl->pass > pass)
+ break;
+
+ /*
+ * Raise the pass level to the next level and rescan
+ * the tree.
+ */
+ bus_current_pass = dl->pass;
+ BUS_NEW_PASS(root_bus);
+ }
+
+ /*
+ * If there isn't a driver registered for the requested pass,
+ * then bus_current_pass might still be less than 'pass'. Set
+ * it to 'pass' in that case.
+ */
+ if (bus_current_pass < pass)
+ bus_current_pass = pass;
+ KASSERT(bus_current_pass == pass, ("Failed to update bus pass level"));
+}
+
+/*
+ * Devclass implementation
+ */
+
+static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses);
+
+/**
+ * @internal
+ * @brief Find or create a device class
+ *
+ * If a device class with the name @p classname exists, return it,
+ * otherwise if @p create is non-zero create and return a new device
+ * class.
+ *
+ * If @p parentname is non-NULL, the parent of the devclass is set to
+ * the devclass of that name.
+ *
+ * @param classname the devclass name to find or create
+ * @param parentname the parent devclass name or @c NULL
+ * @param create non-zero to create a devclass
+ */
+static devclass_t
+devclass_find_internal(const char *classname, const char *parentname,
+ int create)
+{
+ devclass_t dc;
+
+ PDEBUG(("looking for %s", classname));
+ if (!classname)
+ return (NULL);
+
+ TAILQ_FOREACH(dc, &devclasses, link) {
+ if (!strcmp(dc->name, classname))
+ break;
+ }
+
+ if (create && !dc) {
+ PDEBUG(("creating %s", classname));
+ dc = malloc(sizeof(struct devclass) + strlen(classname) + 1,
+ M_BUS, M_NOWAIT | M_ZERO);
+ if (!dc)
+ return (NULL);
+ dc->parent = NULL;
+ dc->name = (char*) (dc + 1);
+ strcpy(dc->name, classname);
+ TAILQ_INIT(&dc->drivers);
+ TAILQ_INSERT_TAIL(&devclasses, dc, link);
+
+ bus_data_generation_update();
+ }
+
+ /*
+ * If a parent class is specified, then set that as our parent so
+ * that this devclass will support drivers for the parent class as
+ * well. If the parent class has the same name don't do this though
+ * as it creates a cycle that can trigger an infinite loop in
+ * device_probe_child() if a device exists for which there is no
+ * suitable driver.
+ */
+ if (parentname && dc && !dc->parent &&
+ strcmp(classname, parentname) != 0) {
+ dc->parent = devclass_find_internal(parentname, NULL, TRUE);
+ dc->parent->flags |= DC_HAS_CHILDREN;
+ }
+
+ return (dc);
+}
+
+/**
+ * @brief Create a device class
+ *
+ * If a device class with the name @p classname exists, return it,
+ * otherwise create and return a new device class.
+ *
+ * @param classname the devclass name to find or create
+ */
+devclass_t
+devclass_create(const char *classname)
+{
+ return (devclass_find_internal(classname, NULL, TRUE));
+}
+
+/**
+ * @brief Find a device class
+ *
+ * If a device class with the name @p classname exists, return it,
+ * otherwise return @c NULL.
+ *
+ * @param classname the devclass name to find
+ */
+devclass_t
+devclass_find(const char *classname)
+{
+ return (devclass_find_internal(classname, NULL, FALSE));
+}
+
+/**
+ * @brief Register that a device driver has been added to a devclass
+ *
+ * Register that a device driver has been added to a devclass. This
+ * is called by devclass_add_driver to accomplish the recursive
+ * notification of all the children classes of dc, as well as dc.
+ * Each layer will have BUS_DRIVER_ADDED() called for all instances of
+ * the devclass.
+ *
+ * We do a full search here of the devclass list at each iteration
+ * level to save storing children-lists in the devclass structure. If
+ * we ever move beyond a few dozen devices doing this, we may need to
+ * reevaluate...
+ *
+ * @param dc the devclass to edit
+ * @param driver the driver that was just added
+ */
+static void
+devclass_driver_added(devclass_t dc, driver_t *driver)
+{
+ devclass_t parent;
+ int i;
+
+ /*
+ * Call BUS_DRIVER_ADDED for any existing busses in this class.
+ */
+ for (i = 0; i < dc->maxunit; i++)
+ if (dc->devices[i] && device_is_attached(dc->devices[i]))
+ BUS_DRIVER_ADDED(dc->devices[i], driver);
+
+ /*
+ * Walk through the children classes. Since we only keep a
+ * single parent pointer around, we walk the entire list of
+ * devclasses looking for children. We set the
+ * DC_HAS_CHILDREN flag when a child devclass is created on
+ * the parent, so we only walk the list for those devclasses
+ * that have children.
+ */
+ if (!(dc->flags & DC_HAS_CHILDREN))
+ return;
+ parent = dc;
+ TAILQ_FOREACH(dc, &devclasses, link) {
+ if (dc->parent == parent)
+ devclass_driver_added(dc, driver);
+ }
+}
+
+/**
+ * @brief Add a device driver to a device class
+ *
+ * Add a device driver to a devclass. This is normally called
+ * automatically by DRIVER_MODULE(). The BUS_DRIVER_ADDED() method of
+ * all devices in the devclass will be called to allow them to attempt
+ * to re-probe any unmatched children.
+ *
+ * @param dc the devclass to edit
+ * @param driver the driver to register
+ */
+int
+devclass_add_driver(devclass_t dc, driver_t *driver, int pass, devclass_t *dcp)
+{
+ driverlink_t dl;
+ const char *parentname;
+
+ PDEBUG(("%s", DRIVERNAME(driver)));
+
+ /* Don't allow invalid pass values. */
+ if (pass <= BUS_PASS_ROOT)
+ return (EINVAL);
+
+ dl = malloc(sizeof *dl, M_BUS, M_NOWAIT|M_ZERO);
+ if (!dl)
+ return (ENOMEM);
+
+ /*
+ * Compile the driver's methods. Also increase the reference count
+ * so that the class doesn't get freed when the last instance
+ * goes. This means we can safely use static methods, and it avoids a
+ * double-free in devclass_delete_driver.
+ */
+ kobj_class_compile((kobj_class_t) driver);
+
+ /*
+ * If the driver has any base classes, make the
+ * devclass inherit from the devclass of the driver's
+ * first base class. This will allow the system to
+ * search for drivers in both devclasses for children
+ * of a device using this driver.
+ */
+ if (driver->baseclasses)
+ parentname = driver->baseclasses[0]->name;
+ else
+ parentname = NULL;
+ *dcp = devclass_find_internal(driver->name, parentname, TRUE);
+
+ dl->driver = driver;
+ TAILQ_INSERT_TAIL(&dc->drivers, dl, link);
+ driver->refs++; /* XXX: kobj_mtx */
+ dl->pass = pass;
+ driver_register_pass(dl);
+
+ devclass_driver_added(dc, driver);
+ bus_data_generation_update();
+ return (0);
+}
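+
+/*
+ * Drivers normally reach devclass_add_driver() through the DRIVER_MODULE()
+ * glue rather than by calling it directly.  A typical declaration looks
+ * something like the following sketch (the "foo" names are hypothetical):
+ *
+ *	static devclass_t foo_devclass;
+ *	DRIVER_MODULE(foo, pci, foo_driver, foo_devclass, NULL, NULL);
+ *
+ * which registers foo_driver with the devclass of the "pci" bus at the
+ * default pass.
+ */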
+
+/**
+ * @brief Register that a device driver has been deleted from a devclass
+ *
+ * Register that a device driver has been removed from a devclass.
+ * This is called by devclass_delete_driver to accomplish the
+ * recursive notification of all the children classes of busclass, as
+ * well as busclass. Each layer will attempt to detach the driver
+ * from any devices that are children of the bus's devclass. The function
+ * will return an error if a device fails to detach.
+ *
+ * We do a full search here of the devclass list at each iteration
+ * level to save storing children-lists in the devclass structure. If
+ * we ever move beyond a few dozen devices doing this, we may need to
+ * reevaluate...
+ *
+ * @param busclass the devclass of the parent bus
+ * @param dc the devclass of the driver being deleted
+ * @param driver the driver being deleted
+ */
+static int
+devclass_driver_deleted(devclass_t busclass, devclass_t dc, driver_t *driver)
+{
+ devclass_t parent;
+ device_t dev;
+ int error, i;
+
+ /*
+ * Disassociate from any devices. We iterate through all the
+ * devices in the devclass of the driver and detach any which are
+ * using the driver and which have a parent in the devclass which
+ * we are deleting from.
+ *
+ * Note that since a driver can be in multiple devclasses, we
+ * should not detach devices which are not children of devices in
+ * the affected devclass.
+ */
+ for (i = 0; i < dc->maxunit; i++) {
+ if (dc->devices[i]) {
+ dev = dc->devices[i];
+ if (dev->driver == driver && dev->parent &&
+ dev->parent->devclass == busclass) {
+ if ((error = device_detach(dev)) != 0)
+ return (error);
+ BUS_PROBE_NOMATCH(dev->parent, dev);
+ devnomatch(dev);
+ dev->flags |= DF_DONENOMATCH;
+ }
+ }
+ }
+
+ /*
+ * Walk through the children classes. Since we only keep a
+ * single parent pointer around, we walk the entire list of
+ * devclasses looking for children. We set the
+ * DC_HAS_CHILDREN flag when a child devclass is created on
+ * the parent, so we only walk the list for those devclasses
+ * that have children.
+ */
+ if (!(busclass->flags & DC_HAS_CHILDREN))
+ return (0);
+ parent = busclass;
+ TAILQ_FOREACH(busclass, &devclasses, link) {
+ if (busclass->parent == parent) {
+ error = devclass_driver_deleted(busclass, dc, driver);
+ if (error)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+/**
+ * @brief Delete a device driver from a device class
+ *
+ * Delete a device driver from a devclass. This is normally called
+ * automatically by DRIVER_MODULE().
+ *
+ * If the driver is currently attached to any devices,
+ * devclass_delete_driver() will first attempt to detach from each
+ * device. If one of the detach calls fails, the driver will not be
+ * deleted.
+ *
+ * @param busclass the devclass of the parent bus
+ * @param driver the driver to unregister
+ */
+int
+devclass_delete_driver(devclass_t busclass, driver_t *driver)
+{
+ devclass_t dc = devclass_find(driver->name);
+ driverlink_t dl;
+ int error;
+
+ PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass)));
+
+ if (!dc)
+ return (0);
+
+ /*
+ * Find the link structure in the bus' list of drivers.
+ */
+ TAILQ_FOREACH(dl, &busclass->drivers, link) {
+ if (dl->driver == driver)
+ break;
+ }
+
+ if (!dl) {
+ PDEBUG(("%s not found in %s list", driver->name,
+ busclass->name));
+ return (ENOENT);
+ }
+
+ error = devclass_driver_deleted(busclass, dc, driver);
+ if (error != 0)
+ return (error);
+
+ TAILQ_REMOVE(&busclass->drivers, dl, link);
+ free(dl, M_BUS);
+
+ /* XXX: kobj_mtx */
+ driver->refs--;
+ if (driver->refs == 0)
+ kobj_class_free((kobj_class_t) driver);
+
+ bus_data_generation_update();
+ return (0);
+}
+
+/**
+ * @brief Quiesces a set of device drivers from a device class
+ *
+ * Quiesce a device driver from a devclass. This is normally called
+ * automatically by DRIVER_MODULE().
+ *
+ * If the driver is currently attached to any devices,
+ * devclass_quiesce_driver() will first attempt to quiesce each
+ * device.
+ *
+ * @param busclass the devclass of the parent bus
+ * @param driver the driver to unregister
+ */
+static int
+devclass_quiesce_driver(devclass_t busclass, driver_t *driver)
+{
+ devclass_t dc = devclass_find(driver->name);
+ driverlink_t dl;
+ device_t dev;
+ int i;
+ int error;
+
+ PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass)));
+
+ if (!dc)
+ return (0);
+
+ /*
+ * Find the link structure in the bus' list of drivers.
+ */
+ TAILQ_FOREACH(dl, &busclass->drivers, link) {
+ if (dl->driver == driver)
+ break;
+ }
+
+ if (!dl) {
+ PDEBUG(("%s not found in %s list", driver->name,
+ busclass->name));
+ return (ENOENT);
+ }
+
+ /*
+ * Quiesce all devices. We iterate through all the devices in
+ * the devclass of the driver and quiesce any which are using
+ * the driver and which have a parent in the devclass which we
+ * are quiescing.
+ *
+ * Note that since a driver can be in multiple devclasses, we
+ * should not quiesce devices which are not children of
+ * devices in the affected devclass.
+ */
+ for (i = 0; i < dc->maxunit; i++) {
+ if (dc->devices[i]) {
+ dev = dc->devices[i];
+ if (dev->driver == driver && dev->parent &&
+ dev->parent->devclass == busclass) {
+ if ((error = device_quiesce(dev)) != 0)
+ return (error);
+ }
+ }
+ }
+
+ return (0);
+}
+
+/**
+ * @internal
+ */
+static driverlink_t
+devclass_find_driver_internal(devclass_t dc, const char *classname)
+{
+ driverlink_t dl;
+
+ PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc)));
+
+ TAILQ_FOREACH(dl, &dc->drivers, link) {
+ if (!strcmp(dl->driver->name, classname))
+ return (dl);
+ }
+
+ PDEBUG(("not found"));
+ return (NULL);
+}
+
+/**
+ * @brief Return the name of the devclass
+ */
+const char *
+devclass_get_name(devclass_t dc)
+{
+ return (dc->name);
+}
+
+/**
+ * @brief Find a device given a unit number
+ *
+ * @param dc the devclass to search
+ * @param unit the unit number to search for
+ *
+ * @returns the device with the given unit number or @c
+ * NULL if there is no such device
+ */
+device_t
+devclass_get_device(devclass_t dc, int unit)
+{
+ if (dc == NULL || unit < 0 || unit >= dc->maxunit)
+ return (NULL);
+ return (dc->devices[unit]);
+}
+
+/**
+ * @brief Find the softc field of a device given a unit number
+ *
+ * @param dc the devclass to search
+ * @param unit the unit number to search for
+ *
+ * @returns the softc field of the device with the given
+ * unit number or @c NULL if there is no such
+ * device
+ */
+void *
+devclass_get_softc(devclass_t dc, int unit)
+{
+ device_t dev;
+
+ dev = devclass_get_device(dc, unit);
+ if (!dev)
+ return (NULL);
+
+ return (device_get_softc(dev));
+}
+
+/**
+ * @brief Get a list of devices in the devclass
+ *
+ * An array containing a list of all the devices in the given devclass
+ * is allocated and returned in @p *devlistp. The number of devices
+ * in the array is returned in @p *devcountp. The caller should free
+ * the array using @c free(p, M_TEMP), even if @p *devcountp is 0.
+ *
+ * @param dc the devclass to examine
+ * @param devlistp points at location for array pointer return
+ * value
+ * @param devcountp points at location for array size return value
+ *
+ * @retval 0 success
+ * @retval ENOMEM the array allocation failed
+ */
+int
+devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp)
+{
+ int count, i;
+ device_t *list;
+
+ count = devclass_get_count(dc);
+ list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO);
+ if (!list)
+ return (ENOMEM);
+
+ count = 0;
+ for (i = 0; i < dc->maxunit; i++) {
+ if (dc->devices[i]) {
+ list[count] = dc->devices[i];
+ count++;
+ }
+ }
+
+ *devlistp = list;
+ *devcountp = count;
+
+ return (0);
+}
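+
+/*
+ * Example usage (a sketch; "foo" is a hypothetical devclass name):
+ *
+ *	devclass_t dc = devclass_find("foo");
+ *	device_t *devs;
+ *	int i, ndevs;
+ *
+ *	if (dc != NULL && devclass_get_devices(dc, &devs, &ndevs) == 0) {
+ *		for (i = 0; i < ndevs; i++)
+ *			device_printf(devs[i], "present\n");
+ *		free(devs, M_TEMP);
+ *	}
+ */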
+
+/**
+ * @brief Get a list of drivers in the devclass
+ *
+ * An array containing a list of pointers to all the drivers in the
+ * given devclass is allocated and returned in @p *listp. The number
+ * of drivers in the array is returned in @p *countp. The caller should
+ * free the array using @c free(p, M_TEMP).
+ *
+ * @param dc the devclass to examine
+ * @param listp gives location for array pointer return value
+ * @param countp gives location for number of array elements
+ * return value
+ *
+ * @retval 0 success
+ * @retval ENOMEM the array allocation failed
+ */
+int
+devclass_get_drivers(devclass_t dc, driver_t ***listp, int *countp)
+{
+ driverlink_t dl;
+ driver_t **list;
+ int count;
+
+ count = 0;
+ TAILQ_FOREACH(dl, &dc->drivers, link)
+ count++;
+ list = malloc(count * sizeof(driver_t *), M_TEMP, M_NOWAIT);
+ if (list == NULL)
+ return (ENOMEM);
+
+ count = 0;
+ TAILQ_FOREACH(dl, &dc->drivers, link) {
+ list[count] = dl->driver;
+ count++;
+ }
+ *listp = list;
+ *countp = count;
+
+ return (0);
+}
+
+/**
+ * @brief Get the number of devices in a devclass
+ *
+ * @param dc the devclass to examine
+ */
+int
+devclass_get_count(devclass_t dc)
+{
+ int count, i;
+
+ count = 0;
+ for (i = 0; i < dc->maxunit; i++)
+ if (dc->devices[i])
+ count++;
+ return (count);
+}
+
+/**
+ * @brief Get the maximum unit number used in a devclass
+ *
+ * Note that this is one greater than the highest currently-allocated
+ * unit. If a null devclass_t is passed in, -1 is returned to indicate
+ * that not even the devclass has been allocated yet.
+ *
+ * @param dc the devclass to examine
+ */
+int
+devclass_get_maxunit(devclass_t dc)
+{
+ if (dc == NULL)
+ return (-1);
+ return (dc->maxunit);
+}
+
+/**
+ * @brief Find a free unit number in a devclass
+ *
+ * This function searches for the first unused unit number greater
+ * than or equal to @p unit.
+ *
+ * @param dc the devclass to examine
+ * @param unit the first unit number to check
+ */
+int
+devclass_find_free_unit(devclass_t dc, int unit)
+{
+ if (dc == NULL)
+ return (unit);
+ while (unit < dc->maxunit && dc->devices[unit] != NULL)
+ unit++;
+ return (unit);
+}
+
+/**
+ * @brief Set the parent of a devclass
+ *
+ * The parent class is normally initialised automatically by
+ * DRIVER_MODULE().
+ *
+ * @param dc the devclass to edit
+ * @param pdc the new parent devclass
+ */
+void
+devclass_set_parent(devclass_t dc, devclass_t pdc)
+{
+ dc->parent = pdc;
+}
+
+/**
+ * @brief Get the parent of a devclass
+ *
+ * @param dc the devclass to examine
+ */
+devclass_t
+devclass_get_parent(devclass_t dc)
+{
+ return (dc->parent);
+}
+
+struct sysctl_ctx_list *
+devclass_get_sysctl_ctx(devclass_t dc)
+{
+ return (&dc->sysctl_ctx);
+}
+
+struct sysctl_oid *
+devclass_get_sysctl_tree(devclass_t dc)
+{
+ return (dc->sysctl_tree);
+}
+
+/**
+ * @internal
+ * @brief Allocate a unit number
+ *
+ * On entry, @p *unitp is the desired unit number (or @c -1 if any
+ * will do). The allocated unit number is returned in @p *unitp.
+ *
+ * @param dc the devclass to allocate from
+ * @param unitp points at the location for the allocated unit
+ * number
+ *
+ * @retval 0 success
+ * @retval EEXIST the requested unit number is already allocated
+ * @retval ENOMEM memory allocation failure
+ */
+static int
+devclass_alloc_unit(devclass_t dc, device_t dev, int *unitp)
+{
+ const char *s;
+ int unit = *unitp;
+
+ PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc)));
+
+ /* Ask the parent bus if it wants to wire this device. */
+ if (unit == -1)
+ BUS_HINT_DEVICE_UNIT(device_get_parent(dev), dev, dc->name,
+ &unit);
+
+ /* If we were given a wired unit number, check for existing device */
+ /* XXX imp XXX */
+ if (unit != -1) {
+ if (unit >= 0 && unit < dc->maxunit &&
+ dc->devices[unit] != NULL) {
+ if (bootverbose)
+ printf("%s: %s%d already exists; skipping it\n",
+ dc->name, dc->name, unit);
+ return (EEXIST);
+ }
+ } else {
+ /* Unwired device, find the next available slot for it */
+ unit = 0;
+ for (unit = 0;; unit++) {
+ /* If there is an "at" hint for a unit then skip it. */
+ if (resource_string_value(dc->name, unit, "at", &s) ==
+ 0)
+ continue;
+
+ /* If this device slot is already in use, skip it. */
+ if (unit < dc->maxunit && dc->devices[unit] != NULL)
+ continue;
+
+ break;
+ }
+ }
+
+ /*
+ * We've selected a unit beyond the length of the table, so let's
+ * extend the table to make room for all units up to and including
+ * this one.
+ */
+ if (unit >= dc->maxunit) {
+ device_t *newlist, *oldlist;
+ int newsize;
+
+ oldlist = dc->devices;
+ newsize = roundup((unit + 1), MINALLOCSIZE / sizeof(device_t));
+ newlist = malloc(sizeof(device_t) * newsize, M_BUS, M_NOWAIT);
+ if (!newlist)
+ return (ENOMEM);
+ if (oldlist != NULL)
+ bcopy(oldlist, newlist, sizeof(device_t) * dc->maxunit);
+ bzero(newlist + dc->maxunit,
+ sizeof(device_t) * (newsize - dc->maxunit));
+ dc->devices = newlist;
+ dc->maxunit = newsize;
+ if (oldlist != NULL)
+ free(oldlist, M_BUS);
+ }
+ PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc)));
+
+ *unitp = unit;
+ return (0);
+}
+
+/**
+ * @internal
+ * @brief Add a device to a devclass
+ *
+ * A unit number is allocated for the device (using the device's
+ * preferred unit number if any) and the device is registered in the
+ * devclass. This allows the device to be looked up by its unit
+ * number, e.g. by decoding a dev_t minor number.
+ *
+ * @param dc the devclass to add to
+ * @param dev the device to add
+ *
+ * @retval 0 success
+ * @retval EEXIST the requested unit number is already allocated
+ * @retval ENOMEM memory allocation failure
+ */
+static int
+devclass_add_device(devclass_t dc, device_t dev)
+{
+ int buflen, error;
+
+ PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc)));
+
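+ /*
+ * The '$' placeholder makes snprintf() report one extra character,
+ * reserving room for the terminating NUL of the final "%s%d" name.
+ */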
+ buflen = snprintf(NULL, 0, "%s%d$", dc->name, INT_MAX);
+ if (buflen < 0)
+ return (ENOMEM);
+ dev->nameunit = malloc(buflen, M_BUS, M_NOWAIT|M_ZERO);
+ if (!dev->nameunit)
+ return (ENOMEM);
+
+ if ((error = devclass_alloc_unit(dc, dev, &dev->unit)) != 0) {
+ free(dev->nameunit, M_BUS);
+ dev->nameunit = NULL;
+ return (error);
+ }
+ dc->devices[dev->unit] = dev;
+ dev->devclass = dc;
+ snprintf(dev->nameunit, buflen, "%s%d", dc->name, dev->unit);
+
+ return (0);
+}
+
+/**
+ * @internal
+ * @brief Delete a device from a devclass
+ *
+ * The device is removed from the devclass's device list and its unit
+ * number is freed.
+ *
+ * @param dc the devclass to delete from
+ * @param dev the device to delete
+ *
+ * @retval 0 success
+ */
+static int
+devclass_delete_device(devclass_t dc, device_t dev)
+{
+ if (!dc || !dev)
+ return (0);
+
+ PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc)));
+
+ if (dev->devclass != dc || dc->devices[dev->unit] != dev)
+ panic("devclass_delete_device: inconsistent device class");
+ dc->devices[dev->unit] = NULL;
+ if (dev->flags & DF_WILDCARD)
+ dev->unit = -1;
+ dev->devclass = NULL;
+ free(dev->nameunit, M_BUS);
+ dev->nameunit = NULL;
+
+ return (0);
+}
+
+/**
+ * @internal
+ * @brief Make a new device and add it as a child of @p parent
+ *
+ * @param parent the parent of the new device
+ * @param name the devclass name of the new device or @c NULL
+ * to leave the devclass unspecified
+ * @param unit the unit number of the new device or @c -1 to
+ * leave the unit number unspecified
+ *
+ * @returns the new device
+ */
+static device_t
+make_device(device_t parent, const char *name, int unit)
+{
+ device_t dev;
+ devclass_t dc;
+
+ PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit));
+
+ if (name) {
+ dc = devclass_find_internal(name, NULL, TRUE);
+ if (!dc) {
+ printf("make_device: can't find device class %s\n",
+ name);
+ return (NULL);
+ }
+ } else {
+ dc = NULL;
+ }
+
+ dev = malloc(sizeof(struct device), M_BUS, M_NOWAIT|M_ZERO);
+ if (!dev)
+ return (NULL);
+
+ dev->parent = parent;
+ TAILQ_INIT(&dev->children);
+ kobj_init((kobj_t) dev, &null_class);
+ dev->driver = NULL;
+ dev->devclass = NULL;
+ dev->unit = unit;
+ dev->nameunit = NULL;
+ dev->desc = NULL;
+ dev->busy = 0;
+ dev->devflags = 0;
+ dev->flags = DF_ENABLED;
+ dev->order = 0;
+ if (unit == -1)
+ dev->flags |= DF_WILDCARD;
+ if (name) {
+ dev->flags |= DF_FIXEDCLASS;
+ if (devclass_add_device(dc, dev)) {
+ kobj_delete((kobj_t) dev, M_BUS);
+ return (NULL);
+ }
+ }
+ dev->ivars = NULL;
+ dev->softc = NULL;
+
+ dev->state = DS_NOTPRESENT;
+
+ TAILQ_INSERT_TAIL(&bus_data_devices, dev, devlink);
+ bus_data_generation_update();
+
+ return (dev);
+}
+
+/**
+ * @internal
+ * @brief Print a description of a device.
+ */
+static int
+device_print_child(device_t dev, device_t child)
+{
+ int retval = 0;
+
+ if (device_is_alive(child))
+ retval += BUS_PRINT_CHILD(dev, child);
+ else
+ retval += device_printf(child, " not found\n");
+
+ return (retval);
+}
+
+/**
+ * @brief Create a new device
+ *
+ * This creates a new device and adds it as a child of an existing
+ * parent device. The new device will be added after the last existing
+ * child with order zero.
+ *
+ * @param dev the device which will be the parent of the
+ * new child device
+ * @param name devclass name for new device or @c NULL if not
+ * specified
+ * @param unit unit number for new device or @c -1 if not
+ * specified
+ *
+ * @returns the new device
+ */
+device_t
+device_add_child(device_t dev, const char *name, int unit)
+{
+ return (device_add_child_ordered(dev, 0, name, unit));
+}
+
+/**
+ * @brief Create a new device
+ *
+ * This creates a new device and adds it as a child of an existing
+ * parent device. The new device will be added after the last existing
+ * child with the same order.
+ *
+ * @param dev the device which will be the parent of the
+ * new child device
+ * @param order a value which is used to partially sort the
+ * children of @p dev - devices created using
+ * lower values of @p order appear first in @p
+ * dev's list of children
+ * @param name devclass name for new device or @c NULL if not
+ * specified
+ * @param unit unit number for new device or @c -1 if not
+ * specified
+ *
+ * @returns the new device
+ */
+device_t
+device_add_child_ordered(device_t dev, u_int order, const char *name, int unit)
+{
+ device_t child;
+ device_t place;
+
+ PDEBUG(("%s at %s with order %u as unit %d",
+ name, DEVICENAME(dev), order, unit));
+ KASSERT(name != NULL || unit == -1,
+ ("child device with wildcard name and specific unit number"));
+
+ child = make_device(dev, name, unit);
+ if (child == NULL)
+ return (child);
+ child->order = order;
+
+ TAILQ_FOREACH(place, &dev->children, link) {
+ if (place->order > order)
+ break;
+ }
+
+ if (place) {
+ /*
+ * The device 'place' is the first device whose order is
+ * greater than the new child.
+ */
+ TAILQ_INSERT_BEFORE(place, child, link);
+ } else {
+ /*
+ * The new child's order is greater or equal to the order of
+ * any existing device. Add the child to the tail of the list.
+ */
+ TAILQ_INSERT_TAIL(&dev->children, child, link);
+ }
+
+ bus_data_generation_update();
+ return (child);
+}
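+
+/*
+ * Example (a sketch): a bus driver typically adds children from its
+ * attach or identify routine with something like
+ *
+ *	child = device_add_child(dev, "foo", -1);
+ *	if (child == NULL)
+ *		device_printf(dev, "could not add foo child\n");
+ *
+ * where "foo" is a hypothetical devclass name and -1 asks for the next
+ * free unit number.
+ */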
+
+/**
+ * @brief Delete a device
+ *
+ * This function deletes a device along with all of its children. If
+ * the device currently has a driver attached to it, the device is
+ * detached first using device_detach().
+ *
+ * @param dev the parent device
+ * @param child the device to delete
+ *
+ * @retval 0 success
+ * @retval non-zero a unix error code describing the error
+ */
+int
+device_delete_child(device_t dev, device_t child)
+{
+ int error;
+ device_t grandchild;
+
+ PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev)));
+
+ /* remove children first */
+ while ((grandchild = TAILQ_FIRST(&child->children)) != NULL) {
+ error = device_delete_child(child, grandchild);
+ if (error)
+ return (error);
+ }
+
+ if ((error = device_detach(child)) != 0)
+ return (error);
+ if (child->devclass)
+ devclass_delete_device(child->devclass, child);
+ if (child->parent)
+ BUS_CHILD_DELETED(dev, child);
+ TAILQ_REMOVE(&dev->children, child, link);
+ TAILQ_REMOVE(&bus_data_devices, child, devlink);
+ kobj_delete((kobj_t) child, M_BUS);
+
+ bus_data_generation_update();
+ return (0);
+}
+
+/**
+ * @brief Delete all children devices of the given device, if any.
+ *
+ * This function deletes all children devices of the given device, if
+ * any, using the device_delete_child() function for each device it
+ * finds. If a child device cannot be deleted, this function will
+ * return an error code.
+ *
+ * @param dev the parent device
+ *
+ * @retval 0 success
+ * @retval non-zero a device would not detach
+ */
+int
+device_delete_children(device_t dev)
+{
+ device_t child;
+ int error;
+
+ PDEBUG(("Deleting all children of %s", DEVICENAME(dev)));
+
+ error = 0;
+
+ while ((child = TAILQ_FIRST(&dev->children)) != NULL) {
+ error = device_delete_child(dev, child);
+ if (error) {
+ PDEBUG(("Failed deleting %s", DEVICENAME(child)));
+ break;
+ }
+ }
+ return (error);
+}
+
+/**
+ * @brief Find a device given a unit number
+ *
+ * This is similar to devclass_get_devices() but only searches for
+ * devices which have @p dev as a parent.
+ *
+ * @param dev the parent device to search
+ * @param unit the unit number to search for. If the unit is -1,
+ * return the first child of @p dev which has name
+ * @p classname (that is, the one with the lowest unit.)
+ *
+ * @returns the device with the given unit number or @c
+ * NULL if there is no such device
+ */
+device_t
+device_find_child(device_t dev, const char *classname, int unit)
+{
+ devclass_t dc;
+ device_t child;
+
+ dc = devclass_find(classname);
+ if (!dc)
+ return (NULL);
+
+ if (unit != -1) {
+ child = devclass_get_device(dc, unit);
+ if (child && child->parent == dev)
+ return (child);
+ } else {
+ for (unit = 0; unit < devclass_get_maxunit(dc); unit++) {
+ child = devclass_get_device(dc, unit);
+ if (child && child->parent == dev)
+ return (child);
+ }
+ }
+ return (NULL);
+}
+
+/**
+ * @internal
+ */
+static driverlink_t
+first_matching_driver(devclass_t dc, device_t dev)
+{
+ if (dev->devclass)
+ return (devclass_find_driver_internal(dc, dev->devclass->name));
+ return (TAILQ_FIRST(&dc->drivers));
+}
+
+/**
+ * @internal
+ */
+static driverlink_t
+next_matching_driver(devclass_t dc, device_t dev, driverlink_t last)
+{
+ if (dev->devclass) {
+ driverlink_t dl;
+ for (dl = TAILQ_NEXT(last, link); dl; dl = TAILQ_NEXT(dl, link))
+ if (!strcmp(dev->devclass->name, dl->driver->name))
+ return (dl);
+ return (NULL);
+ }
+ return (TAILQ_NEXT(last, link));
+}
+
+/**
+ * @internal
+ */
+int
+device_probe_child(device_t dev, device_t child)
+{
+ devclass_t dc;
+ driverlink_t best = NULL;
+ driverlink_t dl;
+ int result, pri = 0;
+ int hasclass = (child->devclass != NULL);
+
+ GIANT_REQUIRED;
+
+ dc = dev->devclass;
+ if (!dc)
+ panic("device_probe_child: parent device has no devclass");
+
+ /*
+ * If the state is already probed, then return. However, don't
+ * return if we can rebid this object.
+ */
+ if (child->state == DS_ALIVE && (child->flags & DF_REBID) == 0)
+ return (0);
+
+ for (; dc; dc = dc->parent) {
+ for (dl = first_matching_driver(dc, child);
+ dl;
+ dl = next_matching_driver(dc, child, dl)) {
+ /* If this driver's pass is too high, then ignore it. */
+ if (dl->pass > bus_current_pass)
+ continue;
+
+ PDEBUG(("Trying %s", DRIVERNAME(dl->driver)));
+ result = device_set_driver(child, dl->driver);
+ if (result == ENOMEM)
+ return (result);
+ else if (result != 0)
+ continue;
+ if (!hasclass) {
+ if (device_set_devclass(child,
+ dl->driver->name) != 0) {
+ char const * devname =
+ device_get_name(child);
+ if (devname == NULL)
+ devname = "(unknown)";
+ printf("driver bug: Unable to set "
+ "devclass (class: %s "
+ "devname: %s)\n",
+ dl->driver->name,
+ devname);
+ (void)device_set_driver(child, NULL);
+ continue;
+ }
+ }
+
+ /* Fetch any flags for the device before probing. */
+ resource_int_value(dl->driver->name, child->unit,
+ "flags", &child->devflags);
+
+ result = DEVICE_PROBE(child);
+
+ /* Reset flags and devclass before the next probe. */
+ child->devflags = 0;
+ if (!hasclass)
+ (void)device_set_devclass(child, NULL);
+
+ /*
+ * If the driver returns SUCCESS, there can be
+ * no higher match for this device.
+ */
+ if (result == 0) {
+ best = dl;
+ pri = 0;
+ break;
+ }
+
+ /*
+ * The driver returned an error so it
+ * certainly doesn't match.
+ */
+ if (result > 0) {
+ (void)device_set_driver(child, NULL);
+ continue;
+ }
+
+ /*
+ * The driver returned a priority lower than
+ * SUCCESS; remember the best matching driver.
+ * Initialise pri on the first match.
+ */
+ if (best == NULL || result > pri) {
+ /*
+ * Probes that return BUS_PROBE_NOWILDCARD
+ * or lower only match on devices whose
+ * driver was explicitly specified.
+ */
+ if (result <= BUS_PROBE_NOWILDCARD &&
+ !(child->flags & DF_FIXEDCLASS))
+ continue;
+ best = dl;
+ pri = result;
+ continue;
+ }
+ }
+ /*
+ * If we have an unambiguous match in this devclass,
+ * don't look in the parent.
+ */
+ if (best && pri == 0)
+ break;
+ }
+
+ /*
+ * If we found a driver, change state and initialise the devclass.
+ */
+ /* XXX What happens if we rebid and got no best? */
+ if (best) {
+ /*
+ * If this device was attached, and we were asked to
+ * rescan, and it is a different driver, then we have
+ * to detach the old driver and reattach this new one.
+ * Note, we don't have to check for DF_REBID here
+ * because if the state is > DS_ALIVE, we know it must
+ * be.
+ *
+ * This assumes that all DF_REBID drivers can have
+ * their probe routine called at any time and that
+ * they are idempotent as well as completely benign in
+ * normal operations.
+ *
+ * We also have to make sure that the detach
+ * succeeded, otherwise we fail the operation (or
+ * maybe it should just fail silently? I'm torn).
+ */
+ if (child->state > DS_ALIVE && best->driver != child->driver)
+ if ((result = device_detach(child)) != 0)
+ return (result);
+
+ /* Set the winning driver, devclass, and flags. */
+ if (!child->devclass) {
+ result = device_set_devclass(child, best->driver->name);
+ if (result != 0)
+ return (result);
+ }
+ result = device_set_driver(child, best->driver);
+ if (result != 0)
+ return (result);
+ resource_int_value(best->driver->name, child->unit,
+ "flags", &child->devflags);
+
+ if (pri < 0) {
+ /*
+ * A bit bogus. Call the probe method again to make
+ * sure that we have the right description.
+ */
+ DEVICE_PROBE(child);
+#if 0
+ child->flags |= DF_REBID;
+#endif
+ } else
+ child->flags &= ~DF_REBID;
+ child->state = DS_ALIVE;
+
+ bus_data_generation_update();
+ return (0);
+ }
+
+ return (ENXIO);
+}
+
+/**
+ * @brief Return the parent of a device
+ */
+device_t
+device_get_parent(device_t dev)
+{
+ return (dev->parent);
+}
+
+/**
+ * @brief Get a list of children of a device
+ *
+ * An array containing a list of all the children of the given device
+ * is allocated and returned in @p *devlistp. The number of devices
+ * in the array is returned in @p *devcountp. The caller should free
+ * the array using @c free(p, M_TEMP).
+ *
+ * @param dev the device to examine
+ * @param devlistp points at location for array pointer return
+ * value
+ * @param devcountp points at location for array size return value
+ *
+ * @retval 0 success
+ * @retval ENOMEM the array allocation failed
+ */
+int
+device_get_children(device_t dev, device_t **devlistp, int *devcountp)
+{
+ int count;
+ device_t child;
+ device_t *list;
+
+ count = 0;
+ TAILQ_FOREACH(child, &dev->children, link) {
+ count++;
+ }
+ if (count == 0) {
+ *devlistp = NULL;
+ *devcountp = 0;
+ return (0);
+ }
+
+ list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO);
+ if (!list)
+ return (ENOMEM);
+
+ count = 0;
+ TAILQ_FOREACH(child, &dev->children, link) {
+ list[count] = child;
+ count++;
+ }
+
+ *devlistp = list;
+ *devcountp = count;
+
+ return (0);
+}
+
+/**
+ * @brief Return the current driver for the device or @c NULL if there
+ * is no driver currently attached
+ */
+driver_t *
+device_get_driver(device_t dev)
+{
+ return (dev->driver);
+}
+
+/**
+ * @brief Return the current devclass for the device or @c NULL if
+ * there is none.
+ */
+devclass_t
+device_get_devclass(device_t dev)
+{
+ return (dev->devclass);
+}
+
+/**
+ * @brief Return the name of the device's devclass or @c NULL if there
+ * is none.
+ */
+const char *
+device_get_name(device_t dev)
+{
+ if (dev != NULL && dev->devclass)
+ return (devclass_get_name(dev->devclass));
+ return (NULL);
+}
+
+/**
+ * @brief Return a string containing the device's devclass name
+ * followed by an ascii representation of the device's unit number
+ * (e.g. @c "foo2").
+ */
+const char *
+device_get_nameunit(device_t dev)
+{
+ return (dev->nameunit);
+}
+
+/**
+ * @brief Return the device's unit number.
+ */
+int
+device_get_unit(device_t dev)
+{
+ return (dev->unit);
+}
+
+/**
+ * @brief Return the device's description string
+ */
+const char *
+device_get_desc(device_t dev)
+{
+ return (dev->desc);
+}
+
+/**
+ * @brief Return the device's flags
+ */
+uint32_t
+device_get_flags(device_t dev)
+{
+ return (dev->devflags);
+}
+
+struct sysctl_ctx_list *
+device_get_sysctl_ctx(device_t dev)
+{
+ return (&dev->sysctl_ctx);
+}
+
+struct sysctl_oid *
+device_get_sysctl_tree(device_t dev)
+{
+ return (dev->sysctl_tree);
+}
+
+/**
+ * @brief Print the name of the device followed by a colon and a space
+ *
+ * @returns the number of characters printed
+ */
+int
+device_print_prettyname(device_t dev)
+{
+ const char *name = device_get_name(dev);
+
+ if (name == NULL)
+ return (printf("unknown: "));
+ return (printf("%s%d: ", name, device_get_unit(dev)));
+}
+
+/**
+ * @brief Print the name of the device followed by a colon, a space
+ * and the result of calling vprintf() with the value of @p fmt and
+ * the following arguments.
+ *
+ * @returns the number of characters printed
+ */
+int
+device_printf(device_t dev, const char * fmt, ...)
+{
+ va_list ap;
+ int retval;
+
+ retval = device_print_prettyname(dev);
+ va_start(ap, fmt);
+ retval += vprintf(fmt, ap);
+ va_end(ap);
+ return (retval);
+}
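+
+/*
+ * Example: device_printf(dev, "%d bytes transferred\n", count) prints a
+ * line such as "foo0: 512 bytes transferred" (names and values here are
+ * illustrative).
+ */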
+
+/**
+ * @internal
+ */
+static void
+device_set_desc_internal(device_t dev, const char* desc, int copy)
+{
+ if (dev->desc && (dev->flags & DF_DESCMALLOCED)) {
+ free(dev->desc, M_BUS);
+ dev->flags &= ~DF_DESCMALLOCED;
+ dev->desc = NULL;
+ }
+
+ if (copy && desc) {
+ dev->desc = malloc(strlen(desc) + 1, M_BUS, M_NOWAIT);
+ if (dev->desc) {
+ strcpy(dev->desc, desc);
+ dev->flags |= DF_DESCMALLOCED;
+ }
+ } else {
+ /* Avoid a -Wcast-qual warning */
+ dev->desc = (char *)(uintptr_t) desc;
+ }
+
+ bus_data_generation_update();
+}
+
+/**
+ * @brief Set the device's description
+ *
+ * The value of @c desc should be a string constant that will not
+ * change (at least until the description is changed in a subsequent
+ * call to device_set_desc() or device_set_desc_copy()).
+ */
+void
+device_set_desc(device_t dev, const char* desc)
+{
+ device_set_desc_internal(dev, desc, FALSE);
+}
+
+/**
+ * @brief Set the device's description
+ *
+ * The string pointed to by @c desc is copied. Use this function if
+ * the device description is generated, (e.g. with sprintf()).
+ */
+void
+device_set_desc_copy(device_t dev, const char* desc)
+{
+ device_set_desc_internal(dev, desc, TRUE);
+}
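+
+/*
+ * Example (a sketch): build a generated description in a stack buffer and
+ * let the bus code keep its own copy:
+ *
+ *	char desc[64];
+ *
+ *	snprintf(desc, sizeof(desc), "Frobnicator model %d", model);
+ *	device_set_desc_copy(dev, desc);
+ */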
+
+/**
+ * @brief Set the device's flags
+ */
+void
+device_set_flags(device_t dev, uint32_t flags)
+{
+ dev->devflags = flags;
+}
+
+/**
+ * @brief Return the device's softc field
+ *
+ * The softc is allocated and zeroed when a driver is attached, based
+ * on the size field of the driver.
+ */
+void *
+device_get_softc(device_t dev)
+{
+ return (dev->softc);
+}
+
+/**
+ * @brief Set the device's softc field
+ *
+ * Most drivers do not need to use this since the softc is allocated
+ * automatically when the driver is attached.
+ */
+void
+device_set_softc(device_t dev, void *softc)
+{
+ if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC))
+ free(dev->softc, M_BUS_SC);
+ dev->softc = softc;
+ if (dev->softc)
+ dev->flags |= DF_EXTERNALSOFTC;
+ else
+ dev->flags &= ~DF_EXTERNALSOFTC;
+}
+
+/**
+ * @brief Free claimed softc
+ *
+ * Most drivers do not need to use this since the softc is freed
+ * automatically when the driver is detached.
+ */
+void
+device_free_softc(void *softc)
+{
+ free(softc, M_BUS_SC);
+}
+
+/**
+ * @brief Claim softc
+ *
+ * This function can be used to let the driver free the automatically
+ * allocated softc using "device_free_softc()". This function is
+ * useful when the driver is refcounting the softc and the softc
+ * cannot be freed when the "device_detach" method is called.
+ */
+void
+device_claim_softc(device_t dev)
+{
+ if (dev->softc)
+ dev->flags |= DF_EXTERNALSOFTC;
+ else
+ dev->flags &= ~DF_EXTERNALSOFTC;
+}
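+
+/*
+ * Sketch of the refcounted-softc pattern these two functions support (the
+ * "foo" names are hypothetical).  The detach method claims the softc so
+ * the bus code will not free it, and the driver frees it itself with
+ * device_free_softc() once the last reference goes away:
+ *
+ *	static int
+ *	foo_detach(device_t dev)
+ *	{
+ *		struct foo_softc *sc = device_get_softc(dev);
+ *
+ *		device_claim_softc(dev);
+ *		foo_release(sc);
+ *		return (0);
+ *	}
+ *
+ * where foo_release() calls device_free_softc(sc) when the reference
+ * count drops to zero.
+ */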
+
+/**
+ * @brief Get the device's ivars field
+ *
+ * The ivars field is used by the parent device to store per-device
+ * state (e.g. the physical location of the device or a list of
+ * resources).
+ */
+void *
+device_get_ivars(device_t dev)
+{
+
+ KASSERT(dev != NULL, ("device_get_ivars(NULL, ...)"));
+ return (dev->ivars);
+}
+
+/**
+ * @brief Set the device's ivars field
+ */
+void
+device_set_ivars(device_t dev, void * ivars)
+{
+
+ KASSERT(dev != NULL, ("device_set_ivars(NULL, ...)"));
+ dev->ivars = ivars;
+}
+
+/**
+ * @brief Return the device's state
+ */
+device_state_t
+device_get_state(device_t dev)
+{
+ return (dev->state);
+}
+
+/**
+ * @brief Set the DF_ENABLED flag for the device
+ */
+void
+device_enable(device_t dev)
+{
+ dev->flags |= DF_ENABLED;
+}
+
+/**
+ * @brief Clear the DF_ENABLED flag for the device
+ */
+void
+device_disable(device_t dev)
+{
+ dev->flags &= ~DF_ENABLED;
+}
+
+/**
+ * @brief Increment the busy counter for the device
+ */
+void
+device_busy(device_t dev)
+{
+ if (dev->state < DS_ATTACHING)
+ panic("device_busy: called for unattached device");
+ if (dev->busy == 0 && dev->parent)
+ device_busy(dev->parent);
+ dev->busy++;
+ if (dev->state == DS_ATTACHED)
+ dev->state = DS_BUSY;
+}
+
+/**
+ * @brief Decrement the busy counter for the device
+ */
+void
+device_unbusy(device_t dev)
+{
+ if (dev->busy != 0 && dev->state != DS_BUSY &&
+ dev->state != DS_ATTACHING)
+ panic("device_unbusy: called for non-busy device %s",
+ device_get_nameunit(dev));
+ dev->busy--;
+ if (dev->busy == 0) {
+ if (dev->parent)
+ device_unbusy(dev->parent);
+ if (dev->state == DS_BUSY)
+ dev->state = DS_ATTACHED;
+ }
+}
+
+/**
+ * @brief Set the DF_QUIET flag for the device
+ */
+void
+device_quiet(device_t dev)
+{
+ dev->flags |= DF_QUIET;
+}
+
+/**
+ * @brief Clear the DF_QUIET flag for the device
+ */
+void
+device_verbose(device_t dev)
+{
+ dev->flags &= ~DF_QUIET;
+}
+
+/**
+ * @brief Return non-zero if the DF_QUIET flag is set on the device
+ */
+int
+device_is_quiet(device_t dev)
+{
+ return ((dev->flags & DF_QUIET) != 0);
+}
+
+/**
+ * @brief Return non-zero if the DF_ENABLED flag is set on the device
+ */
+int
+device_is_enabled(device_t dev)
+{
+ return ((dev->flags & DF_ENABLED) != 0);
+}
+
+/**
+ * @brief Return non-zero if the device was successfully probed
+ */
+int
+device_is_alive(device_t dev)
+{
+ return (dev->state >= DS_ALIVE);
+}
+
+/**
+ * @brief Return non-zero if the device currently has a driver
+ * attached to it
+ */
+int
+device_is_attached(device_t dev)
+{
+ return (dev->state >= DS_ATTACHED);
+}
+
+/**
+ * @brief Set the devclass of a device
+ * @see devclass_add_device().
+ */
+int
+device_set_devclass(device_t dev, const char *classname)
+{
+ devclass_t dc;
+ int error;
+
+ if (!classname) {
+ if (dev->devclass)
+ devclass_delete_device(dev->devclass, dev);
+ return (0);
+ }
+
+ if (dev->devclass) {
+ printf("device_set_devclass: device class already set\n");
+ return (EINVAL);
+ }
+
+ dc = devclass_find_internal(classname, NULL, TRUE);
+ if (!dc)
+ return (ENOMEM);
+
+ error = devclass_add_device(dc, dev);
+
+ bus_data_generation_update();
+ return (error);
+}
+
+/**
+ * @brief Set the driver of a device
+ *
+ * @retval 0 success
+ * @retval EBUSY the device already has a driver attached
+ * @retval ENOMEM a memory allocation failure occurred
+ */
+int
+device_set_driver(device_t dev, driver_t *driver)
+{
+ if (dev->state >= DS_ATTACHED)
+ return (EBUSY);
+
+ if (dev->driver == driver)
+ return (0);
+
+ if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) {
+ free(dev->softc, M_BUS_SC);
+ dev->softc = NULL;
+ }
+ device_set_desc(dev, NULL);
+ kobj_delete((kobj_t) dev, NULL);
+ dev->driver = driver;
+ if (driver) {
+ kobj_init((kobj_t) dev, (kobj_class_t) driver);
+ if (!(dev->flags & DF_EXTERNALSOFTC) && driver->size > 0) {
+ dev->softc = malloc(driver->size, M_BUS_SC,
+ M_NOWAIT | M_ZERO);
+ if (!dev->softc) {
+ kobj_delete((kobj_t) dev, NULL);
+ kobj_init((kobj_t) dev, &null_class);
+ dev->driver = NULL;
+ return (ENOMEM);
+ }
+ }
+ } else {
+ kobj_init((kobj_t) dev, &null_class);
+ }
+
+ bus_data_generation_update();
+ return (0);
+}
+
+/**
+ * @brief Probe a device and return its status.
+ *
+ * This function is the core of the device autoconfiguration
+ * system. Its purpose is to select a suitable driver for a device and
+ * then call that driver to initialise the hardware appropriately. The
+ * driver is selected by calling the DEVICE_PROBE() method of a set of
+ * candidate drivers and then choosing the driver which returned the
+ * best value. This driver is then attached to the device using
+ * device_attach().
+ *
+ * The set of suitable drivers is taken from the list of drivers in
+ * the parent device's devclass. If the device was originally created
+ * with a specific class name (see device_add_child()), only drivers
+ * with that name are probed, otherwise all drivers in the devclass
+ * are probed. If no drivers return successful probe values in the
+ * parent devclass, the search continues in the parent of that
+ * devclass (see devclass_get_parent()) if any.
+ *
+ * @param dev the device to initialise
+ *
+ * @retval 0 success
+ * @retval ENXIO no driver was found
+ * @retval ENOMEM memory allocation failure
+ * @retval non-zero some other unix error code
+ * @retval -1 Device already attached
+ */
+int
+device_probe(device_t dev)
+{
+ int error;
+
+ GIANT_REQUIRED;
+
+ if (dev->state >= DS_ALIVE && (dev->flags & DF_REBID) == 0)
+ return (-1);
+
+ if (!(dev->flags & DF_ENABLED)) {
+ if (bootverbose && device_get_name(dev) != NULL) {
+ device_print_prettyname(dev);
+ printf("not probed (disabled)\n");
+ }
+ return (-1);
+ }
+ if ((error = device_probe_child(dev->parent, dev)) != 0) {
+ if (bus_current_pass == BUS_PASS_DEFAULT &&
+ !(dev->flags & DF_DONENOMATCH)) {
+ BUS_PROBE_NOMATCH(dev->parent, dev);
+ devnomatch(dev);
+ dev->flags |= DF_DONENOMATCH;
+ }
+ return (error);
+ }
+ return (0);
+}
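+
+/*
+ * Illustrative sketch (not part of this file's logic): the DEVICE_PROBE()
+ * methods that device_probe() selects among are supplied by individual
+ * drivers. A PCI-style driver's probe method typically looks roughly like
+ * the following; the "foo" names and FOO_VENDOR_ID are hypothetical.
+ *
+ *    static int
+ *    foo_probe(device_t dev)
+ *    {
+ *            if (pci_get_vendor(dev) != FOO_VENDOR_ID)
+ *                    return (ENXIO);
+ *            device_set_desc(dev, "Foo example controller");
+ *            return (BUS_PROBE_DEFAULT);
+ *    }
+ */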
+
+/**
+ * @brief Probe a device and attach a driver if possible
+ *
+ * This function calls device_probe() and, if the probe was
+ * successful, attaches the device with device_attach().
+ */
+int
+device_probe_and_attach(device_t dev)
+{
+ int error;
+
+ GIANT_REQUIRED;
+
+ error = device_probe(dev);
+ if (error == -1)
+ return (0);
+ else if (error != 0)
+ return (error);
+
+ CURVNET_SET_QUIET(vnet0);
+ error = device_attach(dev);
+ CURVNET_RESTORE();
+	return (error);
+}
+
+/**
+ * @brief Attach a device driver to a device
+ *
+ * This function is a wrapper around the DEVICE_ATTACH() driver
+ * method. In addition to calling DEVICE_ATTACH(), it initialises the
+ * device's sysctl tree, optionally prints a description of the device
+ * and queues a notification event for user-based device management
+ * services.
+ *
+ * Normally this function is only called internally from
+ * device_probe_and_attach().
+ *
+ * @param dev the device to initialise
+ *
+ * @retval 0 success
+ * @retval ENXIO no driver was found
+ * @retval ENOMEM memory allocation failure
+ * @retval non-zero some other unix error code
+ */
+int
+device_attach(device_t dev)
+{
+ int error;
+
+ if (resource_disabled(dev->driver->name, dev->unit)) {
+ device_disable(dev);
+ if (bootverbose)
+ device_printf(dev, "disabled via hints entry\n");
+ return (ENXIO);
+ }
+
+ device_sysctl_init(dev);
+ if (!device_is_quiet(dev))
+ device_print_child(dev->parent, dev);
+ dev->state = DS_ATTACHING;
+ if ((error = DEVICE_ATTACH(dev)) != 0) {
+ printf("device_attach: %s%d attach returned %d\n",
+ dev->driver->name, dev->unit, error);
+ if (!(dev->flags & DF_FIXEDCLASS))
+ devclass_delete_device(dev->devclass, dev);
+ (void)device_set_driver(dev, NULL);
+ device_sysctl_fini(dev);
+ KASSERT(dev->busy == 0, ("attach failed but busy"));
+ dev->state = DS_NOTPRESENT;
+ return (error);
+ }
+ device_sysctl_update(dev);
+ if (dev->busy)
+ dev->state = DS_BUSY;
+ else
+ dev->state = DS_ATTACHED;
+ dev->flags &= ~DF_DONENOMATCH;
+ devadded(dev);
+ return (0);
+}
+
+/**
+ * @brief Detach a driver from a device
+ *
+ * This function is a wrapper around the DEVICE_DETACH() driver
+ * method. If the call to DEVICE_DETACH() succeeds, it calls
+ * BUS_CHILD_DETACHED() for the parent of @p dev, queues a
+ * notification event for user-based device management services and
+ * cleans up the device's sysctl tree.
+ *
+ * @param dev the device to un-initialise
+ *
+ * @retval 0 success
+ * @retval ENXIO no driver was found
+ * @retval ENOMEM memory allocation failure
+ * @retval non-zero some other unix error code
+ */
+int
+device_detach(device_t dev)
+{
+ int error;
+
+ GIANT_REQUIRED;
+
+ PDEBUG(("%s", DEVICENAME(dev)));
+ if (dev->state == DS_BUSY)
+ return (EBUSY);
+ if (dev->state != DS_ATTACHED)
+ return (0);
+
+ if ((error = DEVICE_DETACH(dev)) != 0)
+ return (error);
+ devremoved(dev);
+ if (!device_is_quiet(dev))
+ device_printf(dev, "detached\n");
+ if (dev->parent)
+ BUS_CHILD_DETACHED(dev->parent, dev);
+
+ if (!(dev->flags & DF_FIXEDCLASS))
+ devclass_delete_device(dev->devclass, dev);
+
+ dev->state = DS_NOTPRESENT;
+ (void)device_set_driver(dev, NULL);
+ device_sysctl_fini(dev);
+
+ return (0);
+}
+
+/**
+ * @brief Tells a driver to quiesce itself.
+ *
+ * This function is a wrapper around the DEVICE_QUIESCE() driver
+ * method. The method is only invoked if the device is currently
+ * attached and not busy.
+ *
+ * @param dev the device to quiesce
+ *
+ * @retval 0 success
+ * @retval ENXIO no driver was found
+ * @retval ENOMEM memory allocation failure
+ * @retval non-zero some other unix error code
+ */
+int
+device_quiesce(device_t dev)
+{
+
+ PDEBUG(("%s", DEVICENAME(dev)));
+ if (dev->state == DS_BUSY)
+ return (EBUSY);
+ if (dev->state != DS_ATTACHED)
+ return (0);
+
+ return (DEVICE_QUIESCE(dev));
+}
+
+/**
+ * @brief Notify a device of system shutdown
+ *
+ * This function calls the DEVICE_SHUTDOWN() driver method if the
+ * device currently has an attached driver.
+ *
+ * @returns the value returned by DEVICE_SHUTDOWN()
+ */
+int
+device_shutdown(device_t dev)
+{
+ if (dev->state < DS_ATTACHED)
+ return (0);
+ return (DEVICE_SHUTDOWN(dev));
+}
+
+/**
+ * @brief Set the unit number of a device
+ *
+ * This function can be used to override the unit number used for a
+ * device (e.g. to wire a device to a pre-configured unit number).
+ */
+int
+device_set_unit(device_t dev, int unit)
+{
+ devclass_t dc;
+ int err;
+
+ dc = device_get_devclass(dev);
+ if (unit < dc->maxunit && dc->devices[unit])
+ return (EBUSY);
+ err = devclass_delete_device(dc, dev);
+ if (err)
+ return (err);
+ dev->unit = unit;
+ err = devclass_add_device(dc, dev);
+ if (err)
+ return (err);
+
+ bus_data_generation_update();
+ return (0);
+}
+
+/*======================================*/
+/*
+ * Some useful method implementations to make life easier for bus drivers.
+ */
+
+/**
+ * @brief Initialise a resource list.
+ *
+ * @param rl the resource list to initialise
+ */
+void
+resource_list_init(struct resource_list *rl)
+{
+ STAILQ_INIT(rl);
+}
+
+/**
+ * @brief Reclaim memory used by a resource list.
+ *
+ * This function frees the memory for all resource entries on the list
+ * (if any).
+ *
+ * @param rl the resource list to free
+ */
+void
+resource_list_free(struct resource_list *rl)
+{
+ struct resource_list_entry *rle;
+
+ while ((rle = STAILQ_FIRST(rl)) != NULL) {
+ if (rle->res)
+ panic("resource_list_free: resource entry is busy");
+ STAILQ_REMOVE_HEAD(rl, link);
+ free(rle, M_BUS);
+ }
+}
+
+/**
+ * @brief Add a resource entry.
+ *
+ * This function adds a resource entry using the given @p type, @p
+ * start, @p end and @p count values. A rid value is chosen by
+ * searching sequentially for the first unused rid starting at zero.
+ *
+ * @param rl the resource list to edit
+ * @param type the resource entry type (e.g. SYS_RES_MEMORY)
+ * @param start the start address of the resource
+ * @param end the end address of the resource
+ * @param count	the size of the range, i.e. end - start + 1
+ */
+int
+resource_list_add_next(struct resource_list *rl, int type, u_long start,
+ u_long end, u_long count)
+{
+ int rid;
+
+ rid = 0;
+ while (resource_list_find(rl, type, rid) != NULL)
+ rid++;
+ resource_list_add(rl, type, rid, start, end, count);
+ return (rid);
+}
+
+/**
+ * @brief Add or modify a resource entry.
+ *
+ * If an existing entry exists with the same type and rid, it will be
+ * modified using the given values of @p start, @p end and @p
+ * count. If no entry exists, a new one will be created using the
+ * given values. The resource list entry that matches is then returned.
+ *
+ * @param rl the resource list to edit
+ * @param type the resource entry type (e.g. SYS_RES_MEMORY)
+ * @param rid the resource identifier
+ * @param start the start address of the resource
+ * @param end the end address of the resource
+ * @param count	the size of the range, i.e. end - start + 1
+ */
+struct resource_list_entry *
+resource_list_add(struct resource_list *rl, int type, int rid,
+ u_long start, u_long end, u_long count)
+{
+ struct resource_list_entry *rle;
+
+ rle = resource_list_find(rl, type, rid);
+ if (!rle) {
+ rle = malloc(sizeof(struct resource_list_entry), M_BUS,
+ M_NOWAIT);
+ if (!rle)
+ panic("resource_list_add: can't record entry");
+ STAILQ_INSERT_TAIL(rl, rle, link);
+ rle->type = type;
+ rle->rid = rid;
+ rle->res = NULL;
+ rle->flags = 0;
+ }
+
+ if (rle->res)
+ panic("resource_list_add: resource entry is busy");
+
+ rle->start = start;
+ rle->end = end;
+ rle->count = count;
+ return (rle);
+}
+
+/**
+ * @brief Determine if a resource entry is busy.
+ *
+ * Returns true if a resource entry is busy, meaning that it has an
+ * associated resource that is not an unallocated "reserved" resource.
+ *
+ * @param rl the resource list to search
+ * @param type the resource entry type (e.g. SYS_RES_MEMORY)
+ * @param rid the resource identifier
+ *
+ * @returns Non-zero if the entry is busy, zero otherwise.
+ */
+int
+resource_list_busy(struct resource_list *rl, int type, int rid)
+{
+ struct resource_list_entry *rle;
+
+ rle = resource_list_find(rl, type, rid);
+ if (rle == NULL || rle->res == NULL)
+ return (0);
+ if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) == RLE_RESERVED) {
+ KASSERT(!(rman_get_flags(rle->res) & RF_ACTIVE),
+ ("reserved resource is active"));
+ return (0);
+ }
+ return (1);
+}
+
+/**
+ * @brief Determine if a resource entry is reserved.
+ *
+ * Returns true if a resource entry is reserved, meaning that it has an
+ * associated "reserved" resource. The resource can either be
+ * allocated or unallocated.
+ *
+ * @param rl the resource list to search
+ * @param type the resource entry type (e.g. SYS_RES_MEMORY)
+ * @param rid the resource identifier
+ *
+ * @returns Non-zero if the entry is reserved, zero otherwise.
+ */
+int
+resource_list_reserved(struct resource_list *rl, int type, int rid)
+{
+ struct resource_list_entry *rle;
+
+ rle = resource_list_find(rl, type, rid);
+ if (rle != NULL && rle->flags & RLE_RESERVED)
+ return (1);
+ return (0);
+}
+
+/**
+ * @brief Find a resource entry by type and rid.
+ *
+ * @param rl the resource list to search
+ * @param type the resource entry type (e.g. SYS_RES_MEMORY)
+ * @param rid the resource identifier
+ *
+ * @returns the resource entry pointer or NULL if there is no such
+ * entry.
+ */
+struct resource_list_entry *
+resource_list_find(struct resource_list *rl, int type, int rid)
+{
+ struct resource_list_entry *rle;
+
+ STAILQ_FOREACH(rle, rl, link) {
+ if (rle->type == type && rle->rid == rid)
+ return (rle);
+ }
+ return (NULL);
+}
+
+/**
+ * @brief Delete a resource entry.
+ *
+ * @param rl the resource list to edit
+ * @param type the resource entry type (e.g. SYS_RES_MEMORY)
+ * @param rid the resource identifier
+ */
+void
+resource_list_delete(struct resource_list *rl, int type, int rid)
+{
+ struct resource_list_entry *rle = resource_list_find(rl, type, rid);
+
+ if (rle) {
+ if (rle->res != NULL)
+ panic("resource_list_delete: resource has not been released");
+ STAILQ_REMOVE(rl, rle, resource_list_entry, link);
+ free(rle, M_BUS);
+ }
+}
+
+/**
+ * @brief Allocate a reserved resource
+ *
+ * This can be used by busses to force the allocation of resources
+ * that are always active in the system even if they are not allocated
+ * by a driver (e.g. PCI BARs). This function is usually called when
+ * adding a new child to the bus. The resource is allocated from the
+ * parent bus when it is reserved. The resource list entry is marked
+ * with RLE_RESERVED to note that it is a reserved resource.
+ *
+ * Subsequent attempts to allocate the resource with
+ * resource_list_alloc() will succeed the first time and will set
+ * RLE_ALLOCATED to note that it has been allocated. When a reserved
+ * resource that has been allocated is released with
+ * resource_list_release(), the RLE_ALLOCATED flag is cleared, but
+ * the actual resource remains allocated. The resource can be released to
+ * the parent bus by calling resource_list_unreserve().
+ *
+ * @param rl the resource list to allocate from
+ * @param bus the parent device of @p child
+ * @param child the device for which the resource is being reserved
+ * @param type the type of resource to allocate
+ * @param rid a pointer to the resource identifier
+ * @param start hint at the start of the resource range - pass
+ * @c 0UL for any start address
+ * @param end hint at the end of the resource range - pass
+ * @c ~0UL for any end address
+ * @param count hint at the size of range required - pass @c 1
+ * for any size
+ * @param flags any extra flags to control the resource
+ * allocation - see @c RF_XXX flags in
+ * <sys/rman.h> for details
+ *
+ * @returns the resource which was allocated or @c NULL if no
+ * resource could be allocated
+ */
+struct resource *
+resource_list_reserve(struct resource_list *rl, device_t bus, device_t child,
+ int type, int *rid, u_long start, u_long end, u_long count, u_int flags)
+{
+ struct resource_list_entry *rle = NULL;
+ int passthrough = (device_get_parent(child) != bus);
+ struct resource *r;
+
+ if (passthrough)
+ panic(
+ "resource_list_reserve() should only be called for direct children");
+ if (flags & RF_ACTIVE)
+ panic(
+ "resource_list_reserve() should only reserve inactive resources");
+
+ r = resource_list_alloc(rl, bus, child, type, rid, start, end, count,
+ flags);
+ if (r != NULL) {
+ rle = resource_list_find(rl, type, *rid);
+ rle->flags |= RLE_RESERVED;
+ }
+ return (r);
+}
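+
+/*
+ * Illustrative sketch: a bus that wires down a child's memory window when
+ * the child is added might pair resource_list_add() with
+ * resource_list_reserve() as below. "rl" is the child's resource list
+ * (typically kept in its ivars); the address range is hypothetical.
+ *
+ *    int rid = 0;
+ *
+ *    resource_list_add(rl, SYS_RES_MEMORY, rid, 0xd0000000UL,
+ *        0xd000ffffUL, 0x10000UL);
+ *    resource_list_reserve(rl, bus, child, SYS_RES_MEMORY, &rid,
+ *        0xd0000000UL, 0xd000ffffUL, 0x10000UL, 0);
+ */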
+
+/**
+ * @brief Helper function for implementing BUS_ALLOC_RESOURCE()
+ *
+ * Implement BUS_ALLOC_RESOURCE() by looking up a resource from the list
+ * and passing the allocation up to the parent of @p bus. This assumes
+ * that the first entry of @c device_get_ivars(child) is a struct
+ * resource_list. This also handles 'passthrough' allocations where a
+ * child is a remote descendant of bus by passing the allocation up to
+ * the parent of bus.
+ *
+ * Typically, a bus driver would store a list of child resources
+ * somewhere in the child device's ivars (see device_get_ivars()) and
+ * its implementation of BUS_ALLOC_RESOURCE() would find that list and
+ * then call resource_list_alloc() to perform the allocation.
+ *
+ * @param rl the resource list to allocate from
+ * @param bus the parent device of @p child
+ * @param child the device which is requesting an allocation
+ * @param type the type of resource to allocate
+ * @param rid a pointer to the resource identifier
+ * @param start hint at the start of the resource range - pass
+ * @c 0UL for any start address
+ * @param end hint at the end of the resource range - pass
+ * @c ~0UL for any end address
+ * @param count hint at the size of range required - pass @c 1
+ * for any size
+ * @param flags any extra flags to control the resource
+ * allocation - see @c RF_XXX flags in
+ * <sys/rman.h> for details
+ *
+ * @returns the resource which was allocated or @c NULL if no
+ * resource could be allocated
+ */
+struct resource *
+resource_list_alloc(struct resource_list *rl, device_t bus, device_t child,
+ int type, int *rid, u_long start, u_long end, u_long count, u_int flags)
+{
+ struct resource_list_entry *rle = NULL;
+ int passthrough = (device_get_parent(child) != bus);
+ int isdefault = (start == 0UL && end == ~0UL);
+
+ if (passthrough) {
+ return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
+ type, rid, start, end, count, flags));
+ }
+
+ rle = resource_list_find(rl, type, *rid);
+
+ if (!rle)
+ return (NULL); /* no resource of that type/rid */
+
+ if (rle->res) {
+ if (rle->flags & RLE_RESERVED) {
+ if (rle->flags & RLE_ALLOCATED)
+ return (NULL);
+ if ((flags & RF_ACTIVE) &&
+ bus_activate_resource(child, type, *rid,
+ rle->res) != 0)
+ return (NULL);
+ rle->flags |= RLE_ALLOCATED;
+ return (rle->res);
+ }
+ panic("resource_list_alloc: resource entry is busy");
+ }
+
+ if (isdefault) {
+ start = rle->start;
+ count = ulmax(count, rle->count);
+ end = ulmax(rle->end, start + count - 1);
+ }
+
+ rle->res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
+ type, rid, start, end, count, flags);
+
+ /*
+ * Record the new range.
+ */
+ if (rle->res) {
+ rle->start = rman_get_start(rle->res);
+ rle->end = rman_get_end(rle->res);
+ rle->count = count;
+ }
+
+ return (rle->res);
+}
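+
+/*
+ * Illustrative sketch of the usage described above: a bus that keeps a
+ * struct resource_list in each child's ivars can implement
+ * BUS_ALLOC_RESOURCE() by delegating to resource_list_alloc(). The
+ * "foobus_devinfo" structure and method name are hypothetical.
+ *
+ *    static struct resource *
+ *    foobus_alloc_resource(device_t bus, device_t child, int type,
+ *        int *rid, u_long start, u_long end, u_long count, u_int flags)
+ *    {
+ *            struct foobus_devinfo *dinfo = device_get_ivars(child);
+ *
+ *            return (resource_list_alloc(&dinfo->resources, bus, child,
+ *                type, rid, start, end, count, flags));
+ *    }
+ */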
+
+/**
+ * @brief Helper function for implementing BUS_RELEASE_RESOURCE()
+ *
+ * Implement BUS_RELEASE_RESOURCE() using a resource list. Normally
+ * used with resource_list_alloc().
+ *
+ * @param rl the resource list which was allocated from
+ * @param bus the parent device of @p child
+ * @param child the device which is requesting a release
+ * @param type the type of resource to release
+ * @param rid the resource identifier
+ * @param res the resource to release
+ *
+ * @retval 0 success
+ * @retval non-zero a standard unix error code indicating what
+ * error condition prevented the operation
+ */
+int
+resource_list_release(struct resource_list *rl, device_t bus, device_t child,
+ int type, int rid, struct resource *res)
+{
+ struct resource_list_entry *rle = NULL;
+ int passthrough = (device_get_parent(child) != bus);
+ int error;
+
+ if (passthrough) {
+ return (BUS_RELEASE_RESOURCE(device_get_parent(bus), child,
+ type, rid, res));
+ }
+
+ rle = resource_list_find(rl, type, rid);
+
+ if (!rle)
+ panic("resource_list_release: can't find resource");
+ if (!rle->res)
+ panic("resource_list_release: resource entry is not busy");
+ if (rle->flags & RLE_RESERVED) {
+ if (rle->flags & RLE_ALLOCATED) {
+ if (rman_get_flags(res) & RF_ACTIVE) {
+ error = bus_deactivate_resource(child, type,
+ rid, res);
+ if (error)
+ return (error);
+ }
+ rle->flags &= ~RLE_ALLOCATED;
+ return (0);
+ }
+ return (EINVAL);
+ }
+
+ error = BUS_RELEASE_RESOURCE(device_get_parent(bus), child,
+ type, rid, res);
+ if (error)
+ return (error);
+
+ rle->res = NULL;
+ return (0);
+}
+
+/**
+ * @brief Release all active resources of a given type
+ *
+ * Release all active resources of a specified type. This is intended
+ * to be used to cleanup resources leaked by a driver after detach or
+ * a failed attach.
+ *
+ * @param rl the resource list which was allocated from
+ * @param bus the parent device of @p child
+ * @param child the device whose active resources are being released
+ * @param type the type of resources to release
+ *
+ * @retval 0 success
+ * @retval EBUSY at least one resource was active
+ */
+int
+resource_list_release_active(struct resource_list *rl, device_t bus,
+ device_t child, int type)
+{
+ struct resource_list_entry *rle;
+ int error, retval;
+
+ retval = 0;
+ STAILQ_FOREACH(rle, rl, link) {
+ if (rle->type != type)
+ continue;
+ if (rle->res == NULL)
+ continue;
+ if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) ==
+ RLE_RESERVED)
+ continue;
+ retval = EBUSY;
+ error = resource_list_release(rl, bus, child, type,
+ rman_get_rid(rle->res), rle->res);
+ if (error != 0)
+ device_printf(bus,
+ "Failed to release active resource: %d\n", error);
+ }
+ return (retval);
+}
+
+/**
+ * @brief Fully release a reserved resource
+ *
+ * Fully releases a resource reserved via resource_list_reserve().
+ *
+ * @param rl the resource list which was allocated from
+ * @param bus the parent device of @p child
+ * @param child the device whose reserved resource is being released
+ * @param type the type of resource to release
+ * @param rid the resource identifier
+ *
+ * @retval 0 success
+ * @retval non-zero a standard unix error code indicating what
+ * error condition prevented the operation
+ */
+int
+resource_list_unreserve(struct resource_list *rl, device_t bus, device_t child,
+ int type, int rid)
+{
+ struct resource_list_entry *rle = NULL;
+ int passthrough = (device_get_parent(child) != bus);
+
+ if (passthrough)
+ panic(
+ "resource_list_unreserve() should only be called for direct children");
+
+ rle = resource_list_find(rl, type, rid);
+
+ if (!rle)
+ panic("resource_list_unreserve: can't find resource");
+ if (!(rle->flags & RLE_RESERVED))
+ return (EINVAL);
+ if (rle->flags & RLE_ALLOCATED)
+ return (EBUSY);
+ rle->flags &= ~RLE_RESERVED;
+ return (resource_list_release(rl, bus, child, type, rid, rle->res));
+}
+
+/**
+ * @brief Print a description of resources in a resource list
+ *
+ * Print all resources of a specified type, for use in BUS_PRINT_CHILD().
+ * The name is printed if at least one resource of the given type is available.
+ * The format is used to print resource start and end.
+ *
+ * @param rl the resource list to print
+ * @param name the name of @p type, e.g. @c "memory"
+ * @param type	the type of resource entry to print
+ * @param format printf(9) format string to print resource
+ * start and end values
+ *
+ * @returns the number of characters printed
+ */
+int
+resource_list_print_type(struct resource_list *rl, const char *name, int type,
+ const char *format)
+{
+ struct resource_list_entry *rle;
+ int printed, retval;
+
+ printed = 0;
+ retval = 0;
+ /* Yes, this is kinda cheating */
+ STAILQ_FOREACH(rle, rl, link) {
+ if (rle->type == type) {
+ if (printed == 0)
+ retval += printf(" %s ", name);
+ else
+ retval += printf(",");
+ printed++;
+ retval += printf(format, rle->start);
+ if (rle->count > 1) {
+ retval += printf("-");
+ retval += printf(format, rle->start +
+ rle->count - 1);
+ }
+ }
+ }
+ return (retval);
+}
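+
+/*
+ * Illustrative sketch: a bus's BUS_PRINT_CHILD() method commonly brackets
+ * resource_list_print_type() calls with the print_child helpers defined
+ * later in this file. The "foobus" names are hypothetical.
+ *
+ *    static int
+ *    foobus_print_child(device_t dev, device_t child)
+ *    {
+ *            struct foobus_devinfo *dinfo = device_get_ivars(child);
+ *            int retval = 0;
+ *
+ *            retval += bus_print_child_header(dev, child);
+ *            retval += resource_list_print_type(&dinfo->resources,
+ *                "mem", SYS_RES_MEMORY, "%#lx");
+ *            retval += resource_list_print_type(&dinfo->resources,
+ *                "irq", SYS_RES_IRQ, "%ld");
+ *            retval += bus_print_child_footer(dev, child);
+ *            return (retval);
+ *    }
+ */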
+
+/**
+ * @brief Releases all the resources in a list.
+ *
+ * @param rl The resource list to purge.
+ *
+ * @returns nothing
+ */
+void
+resource_list_purge(struct resource_list *rl)
+{
+ struct resource_list_entry *rle;
+
+ while ((rle = STAILQ_FIRST(rl)) != NULL) {
+ if (rle->res)
+ bus_release_resource(rman_get_device(rle->res),
+ rle->type, rle->rid, rle->res);
+ STAILQ_REMOVE_HEAD(rl, link);
+ free(rle, M_BUS);
+ }
+}
+
+device_t
+bus_generic_add_child(device_t dev, u_int order, const char *name, int unit)
+{
+
+ return (device_add_child_ordered(dev, order, name, unit));
+}
+
+/**
+ * @brief Helper function for implementing DEVICE_PROBE()
+ *
+ * This function can be used to help implement the DEVICE_PROBE() for
+ * a bus (i.e. a device which has other devices attached to it). It
+ * calls the DEVICE_IDENTIFY() method of each driver in the device's
+ * devclass.
+ */
+int
+bus_generic_probe(device_t dev)
+{
+ devclass_t dc = dev->devclass;
+ driverlink_t dl;
+
+ TAILQ_FOREACH(dl, &dc->drivers, link) {
+ /*
+ * If this driver's pass is too high, then ignore it.
+ * For most drivers in the default pass, this will
+ * never be true. For early-pass drivers they will
+ * only call the identify routines of eligible drivers
+ * when this routine is called. Drivers for later
+ * passes should have their identify routines called
+ * on early-pass busses during BUS_NEW_PASS().
+ */
+ if (dl->pass > bus_current_pass)
+ continue;
+ DEVICE_IDENTIFY(dl->driver, dev);
+ }
+
+ return (0);
+}
+
+/**
+ * @brief Helper function for implementing DEVICE_ATTACH()
+ *
+ * This function can be used to help implement the DEVICE_ATTACH() for
+ * a bus. It calls device_probe_and_attach() for each of the device's
+ * children.
+ */
+int
+bus_generic_attach(device_t dev)
+{
+ device_t child;
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ device_probe_and_attach(child);
+ }
+
+ return (0);
+}
+
+/**
+ * @brief Helper function for implementing DEVICE_DETACH()
+ *
+ * This function can be used to help implement the DEVICE_DETACH() for
+ * a bus. It calls device_detach() for each of the device's
+ * children.
+ */
+int
+bus_generic_detach(device_t dev)
+{
+ device_t child;
+ int error;
+
+ if (dev->state != DS_ATTACHED)
+ return (EBUSY);
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ if ((error = device_detach(child)) != 0)
+ return (error);
+ }
+
+ return (0);
+}
+
+/**
+ * @brief Helper function for implementing DEVICE_SHUTDOWN()
+ *
+ * This function can be used to help implement the DEVICE_SHUTDOWN()
+ * for a bus. It calls device_shutdown() for each of the device's
+ * children.
+ */
+int
+bus_generic_shutdown(device_t dev)
+{
+ device_t child;
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ device_shutdown(child);
+ }
+
+ return (0);
+}
+
+/**
+ * @brief Helper function for implementing DEVICE_SUSPEND()
+ *
+ * This function can be used to help implement the DEVICE_SUSPEND()
+ * for a bus. It calls DEVICE_SUSPEND() for each of the device's
+ * children. If any call to DEVICE_SUSPEND() fails, the suspend
+ * operation is aborted and any devices which were suspended are
+ * resumed immediately by calling their DEVICE_RESUME() methods.
+ */
+int
+bus_generic_suspend(device_t dev)
+{
+ int error;
+ device_t child, child2;
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ error = DEVICE_SUSPEND(child);
+ if (error) {
+ for (child2 = TAILQ_FIRST(&dev->children);
+ child2 && child2 != child;
+ child2 = TAILQ_NEXT(child2, link))
+ DEVICE_RESUME(child2);
+ return (error);
+ }
+ }
+ return (0);
+}
+
+/**
+ * @brief Helper function for implementing DEVICE_RESUME()
+ *
+ * This function can be used to help implement the DEVICE_RESUME() for
+ * a bus. It calls DEVICE_RESUME() on each of the device's children.
+ */
+int
+bus_generic_resume(device_t dev)
+{
+ device_t child;
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ DEVICE_RESUME(child);
+ /* if resume fails, there's nothing we can usefully do... */
+ }
+ return (0);
+}
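+
+/*
+ * Illustrative sketch: a simple bridge driver that needs nothing beyond
+ * probing and attaching its children can often point its device interface
+ * methods directly at the generic helpers above. The "foobus" names are
+ * hypothetical.
+ *
+ *    static device_method_t foobus_methods[] = {
+ *            DEVMETHOD(device_probe,        foobus_probe),
+ *            DEVMETHOD(device_attach,       bus_generic_attach),
+ *            DEVMETHOD(device_detach,       bus_generic_detach),
+ *            DEVMETHOD(device_shutdown,     bus_generic_shutdown),
+ *            DEVMETHOD(device_suspend,      bus_generic_suspend),
+ *            DEVMETHOD(device_resume,       bus_generic_resume),
+ *            DEVMETHOD_END
+ *    };
+ */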
+
+/**
+ * @brief Helper function for implementing BUS_PRINT_CHILD().
+ *
+ * This function prints the first part of the ascii representation of
+ * @p child, including its name, unit and description (if any - see
+ * device_set_desc()).
+ *
+ * @returns the number of characters printed
+ */
+int
+bus_print_child_header(device_t dev, device_t child)
+{
+ int retval = 0;
+
+ if (device_get_desc(child)) {
+ retval += device_printf(child, "<%s>", device_get_desc(child));
+ } else {
+ retval += printf("%s", device_get_nameunit(child));
+ }
+
+ return (retval);
+}
+
+/**
+ * @brief Helper function for implementing BUS_PRINT_CHILD().
+ *
+ * This function prints the last part of the ascii representation of
+ * @p child, which consists of the string @c " on " followed by the
+ * name and unit of the @p dev.
+ *
+ * @returns the number of characters printed
+ */
+int
+bus_print_child_footer(device_t dev, device_t child)
+{
+ return (printf(" on %s\n", device_get_nameunit(dev)));
+}
+
+/**
+ * @brief Helper function for implementing BUS_PRINT_CHILD().
+ *
+ * This function simply calls bus_print_child_header() followed by
+ * bus_print_child_footer().
+ *
+ * @returns the number of characters printed
+ */
+int
+bus_generic_print_child(device_t dev, device_t child)
+{
+ int retval = 0;
+
+ retval += bus_print_child_header(dev, child);
+ retval += bus_print_child_footer(dev, child);
+
+ return (retval);
+}
+
+/**
+ * @brief Stub function for implementing BUS_READ_IVAR().
+ *
+ * @returns ENOENT
+ */
+int
+bus_generic_read_ivar(device_t dev, device_t child, int index,
+ uintptr_t * result)
+{
+ return (ENOENT);
+}
+
+/**
+ * @brief Stub function for implementing BUS_WRITE_IVAR().
+ *
+ * @returns ENOENT
+ */
+int
+bus_generic_write_ivar(device_t dev, device_t child, int index,
+ uintptr_t value)
+{
+ return (ENOENT);
+}
+
+/**
+ * @brief Stub function for implementing BUS_GET_RESOURCE_LIST().
+ *
+ * @returns NULL
+ */
+struct resource_list *
+bus_generic_get_resource_list(device_t dev, device_t child)
+{
+ return (NULL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_DRIVER_ADDED().
+ *
+ * This implementation of BUS_DRIVER_ADDED() simply calls the driver's
+ * DEVICE_IDENTIFY() method to allow it to add new children to the bus
+ * and then calls device_probe_and_attach() for each unattached child.
+ */
+void
+bus_generic_driver_added(device_t dev, driver_t *driver)
+{
+ device_t child;
+
+ DEVICE_IDENTIFY(driver, dev);
+ TAILQ_FOREACH(child, &dev->children, link) {
+ if (child->state == DS_NOTPRESENT ||
+ (child->flags & DF_REBID))
+ device_probe_and_attach(child);
+ }
+}
+
+/**
+ * @brief Helper function for implementing BUS_NEW_PASS().
+ *
+ * This implementation of BUS_NEW_PASS() first calls the identify
+ * routines for any drivers that probe at the current pass. Then it
+ * walks the list of devices for this bus. If a device is already
+ * attached, then it calls BUS_NEW_PASS() on that device. If the
+ * device is not already attached, it attempts to attach a driver to
+ * it.
+ */
+void
+bus_generic_new_pass(device_t dev)
+{
+ driverlink_t dl;
+ devclass_t dc;
+ device_t child;
+
+ dc = dev->devclass;
+ TAILQ_FOREACH(dl, &dc->drivers, link) {
+ if (dl->pass == bus_current_pass)
+ DEVICE_IDENTIFY(dl->driver, dev);
+ }
+ TAILQ_FOREACH(child, &dev->children, link) {
+ if (child->state >= DS_ATTACHED)
+ BUS_NEW_PASS(child);
+ else if (child->state == DS_NOTPRESENT)
+ device_probe_and_attach(child);
+ }
+}
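+
+/*
+ * Illustrative sketch: a driver that must attach before BUS_PASS_DEFAULT
+ * registers with an explicit pass so that the pass handling above picks
+ * it up at the right time. The "foo" names are hypothetical.
+ *
+ *    EARLY_DRIVER_MODULE(foo, foobus, foo_driver, foo_devclass,
+ *        NULL, NULL, BUS_PASS_BUS);
+ */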
+
+/**
+ * @brief Helper function for implementing BUS_SETUP_INTR().
+ *
+ * This simple implementation of BUS_SETUP_INTR() simply calls the
+ * BUS_SETUP_INTR() method of the parent of @p dev.
+ */
+int
+bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq,
+ int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg,
+ void **cookiep)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_SETUP_INTR(dev->parent, child, irq, flags,
+ filter, intr, arg, cookiep));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_TEARDOWN_INTR().
+ *
+ * This simple implementation of BUS_TEARDOWN_INTR() simply calls the
+ * BUS_TEARDOWN_INTR() method of the parent of @p dev.
+ */
+int
+bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq,
+ void *cookie)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_ADJUST_RESOURCE().
+ *
+ * This simple implementation of BUS_ADJUST_RESOURCE() simply calls the
+ * BUS_ADJUST_RESOURCE() method of the parent of @p dev.
+ */
+int
+bus_generic_adjust_resource(device_t dev, device_t child, int type,
+ struct resource *r, u_long start, u_long end)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_ADJUST_RESOURCE(dev->parent, child, type, r, start,
+ end));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_ALLOC_RESOURCE().
+ *
+ * This simple implementation of BUS_ALLOC_RESOURCE() simply calls the
+ * BUS_ALLOC_RESOURCE() method of the parent of @p dev.
+ */
+struct resource *
+bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid,
+ u_long start, u_long end, u_long count, u_int flags)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid,
+ start, end, count, flags));
+ return (NULL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_RELEASE_RESOURCE().
+ *
+ * This simple implementation of BUS_RELEASE_RESOURCE() simply calls the
+ * BUS_RELEASE_RESOURCE() method of the parent of @p dev.
+ */
+int
+bus_generic_release_resource(device_t dev, device_t child, int type, int rid,
+ struct resource *r)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid,
+ r));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_ACTIVATE_RESOURCE().
+ *
+ * This simple implementation of BUS_ACTIVATE_RESOURCE() simply calls the
+ * BUS_ACTIVATE_RESOURCE() method of the parent of @p dev.
+ */
+int
+bus_generic_activate_resource(device_t dev, device_t child, int type, int rid,
+ struct resource *r)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid,
+ r));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_DEACTIVATE_RESOURCE().
+ *
+ * This simple implementation of BUS_DEACTIVATE_RESOURCE() simply calls the
+ * BUS_DEACTIVATE_RESOURCE() method of the parent of @p dev.
+ */
+int
+bus_generic_deactivate_resource(device_t dev, device_t child, int type,
+ int rid, struct resource *r)
+{
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid,
+ r));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_BIND_INTR().
+ *
+ * This simple implementation of BUS_BIND_INTR() simply calls the
+ * BUS_BIND_INTR() method of the parent of @p dev.
+ */
+int
+bus_generic_bind_intr(device_t dev, device_t child, struct resource *irq,
+ int cpu)
+{
+
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_BIND_INTR(dev->parent, child, irq, cpu));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_CONFIG_INTR().
+ *
+ * This simple implementation of BUS_CONFIG_INTR() simply calls the
+ * BUS_CONFIG_INTR() method of the parent of @p dev.
+ */
+int
+bus_generic_config_intr(device_t dev, int irq, enum intr_trigger trig,
+ enum intr_polarity pol)
+{
+
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_CONFIG_INTR(dev->parent, irq, trig, pol));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_DESCRIBE_INTR().
+ *
+ * This simple implementation of BUS_DESCRIBE_INTR() simply calls the
+ * BUS_DESCRIBE_INTR() method of the parent of @p dev.
+ */
+int
+bus_generic_describe_intr(device_t dev, device_t child, struct resource *irq,
+ void *cookie, const char *descr)
+{
+
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent)
+ return (BUS_DESCRIBE_INTR(dev->parent, child, irq, cookie,
+ descr));
+ return (EINVAL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_GET_DMA_TAG().
+ *
+ * This simple implementation of BUS_GET_DMA_TAG() simply calls the
+ * BUS_GET_DMA_TAG() method of the parent of @p dev.
+ */
+bus_dma_tag_t
+bus_generic_get_dma_tag(device_t dev, device_t child)
+{
+
+ /* Propagate up the bus hierarchy until someone handles it. */
+ if (dev->parent != NULL)
+ return (BUS_GET_DMA_TAG(dev->parent, child));
+ return (NULL);
+}
+
+/**
+ * @brief Helper function for implementing BUS_GET_RESOURCE().
+ *
+ * This implementation of BUS_GET_RESOURCE() uses the
+ * resource_list_find() function to do most of the work. It calls
+ * BUS_GET_RESOURCE_LIST() to find a suitable resource list to
+ * search.
+ */
+int
+bus_generic_rl_get_resource(device_t dev, device_t child, int type, int rid,
+ u_long *startp, u_long *countp)
+{
+ struct resource_list * rl = NULL;
+ struct resource_list_entry * rle = NULL;
+
+ rl = BUS_GET_RESOURCE_LIST(dev, child);
+ if (!rl)
+ return (EINVAL);
+
+ rle = resource_list_find(rl, type, rid);
+ if (!rle)
+ return (ENOENT);
+
+ if (startp)
+ *startp = rle->start;
+ if (countp)
+ *countp = rle->count;
+
+ return (0);
+}
+
+/**
+ * @brief Helper function for implementing BUS_SET_RESOURCE().
+ *
+ * This implementation of BUS_SET_RESOURCE() uses the
+ * resource_list_add() function to do most of the work. It calls
+ * BUS_GET_RESOURCE_LIST() to find a suitable resource list to
+ * edit.
+ */
+int
+bus_generic_rl_set_resource(device_t dev, device_t child, int type, int rid,
+ u_long start, u_long count)
+{
+ struct resource_list * rl = NULL;
+
+ rl = BUS_GET_RESOURCE_LIST(dev, child);
+ if (!rl)
+ return (EINVAL);
+
+ resource_list_add(rl, type, rid, start, (start + count - 1), count);
+
+ return (0);
+}
+
+/**
+ * @brief Helper function for implementing BUS_DELETE_RESOURCE().
+ *
+ * This implementation of BUS_DELETE_RESOURCE() uses the
+ * resource_list_delete() function to do most of the work. It calls
+ * BUS_GET_RESOURCE_LIST() to find a suitable resource list to
+ * edit.
+ */
+void
+bus_generic_rl_delete_resource(device_t dev, device_t child, int type, int rid)
+{
+ struct resource_list * rl = NULL;
+
+ rl = BUS_GET_RESOURCE_LIST(dev, child);
+ if (!rl)
+ return;
+
+ resource_list_delete(rl, type, rid);
+}
+
+/**
+ * @brief Helper function for implementing BUS_RELEASE_RESOURCE().
+ *
+ * This implementation of BUS_RELEASE_RESOURCE() uses the
+ * resource_list_release() function to do most of the work. It calls
+ * BUS_GET_RESOURCE_LIST() to find a suitable resource list.
+ */
+int
+bus_generic_rl_release_resource(device_t dev, device_t child, int type,
+ int rid, struct resource *r)
+{
+ struct resource_list * rl = NULL;
+
+ if (device_get_parent(child) != dev)
+ return (BUS_RELEASE_RESOURCE(device_get_parent(dev), child,
+ type, rid, r));
+
+ rl = BUS_GET_RESOURCE_LIST(dev, child);
+ if (!rl)
+ return (EINVAL);
+
+ return (resource_list_release(rl, dev, child, type, rid, r));
+}
+
+/**
+ * @brief Helper function for implementing BUS_ALLOC_RESOURCE().
+ *
+ * This implementation of BUS_ALLOC_RESOURCE() uses the
+ * resource_list_alloc() function to do most of the work. It calls
+ * BUS_GET_RESOURCE_LIST() to find a suitable resource list.
+ */
+struct resource *
+bus_generic_rl_alloc_resource(device_t dev, device_t child, int type,
+ int *rid, u_long start, u_long end, u_long count, u_int flags)
+{
+ struct resource_list * rl = NULL;
+
+ if (device_get_parent(child) != dev)
+ return (BUS_ALLOC_RESOURCE(device_get_parent(dev), child,
+ type, rid, start, end, count, flags));
+
+ rl = BUS_GET_RESOURCE_LIST(dev, child);
+ if (!rl)
+ return (NULL);
+
+ return (resource_list_alloc(rl, dev, child, type, rid,
+ start, end, count, flags));
+}
+
+/**
+ * @brief Helper function for implementing BUS_CHILD_PRESENT().
+ *
+ * This simple implementation of BUS_CHILD_PRESENT() simply calls the
+ * BUS_CHILD_PRESENT() method of the parent of @p dev.
+ */
+int
+bus_generic_child_present(device_t dev, device_t child)
+{
+ return (BUS_CHILD_PRESENT(device_get_parent(dev), dev));
+}
+
+/*
+ * Some convenience functions to make it easier for drivers to use the
+ * resource-management functions. All these really do is hide the
+ * indirection through the parent's method table, making for slightly
+ * less-wordy code. In the future, it might make sense for this code
+ * to maintain some sort of a list of resources allocated by each device.
+ */
+
+int
+bus_alloc_resources(device_t dev, struct resource_spec *rs,
+ struct resource **res)
+{
+ int i;
+
+ for (i = 0; rs[i].type != -1; i++)
+ res[i] = NULL;
+ for (i = 0; rs[i].type != -1; i++) {
+ res[i] = bus_alloc_resource_any(dev,
+ rs[i].type, &rs[i].rid, rs[i].flags);
+ if (res[i] == NULL && !(rs[i].flags & RF_OPTIONAL)) {
+ bus_release_resources(dev, rs, res);
+ return (ENXIO);
+ }
+ }
+ return (0);
+}
+
+void
+bus_release_resources(device_t dev, const struct resource_spec *rs,
+ struct resource **res)
+{
+ int i;
+
+ for (i = 0; rs[i].type != -1; i++)
+ if (res[i] != NULL) {
+ bus_release_resource(
+ dev, rs[i].type, rs[i].rid, res[i]);
+ res[i] = NULL;
+ }
+}
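+
+/*
+ * Illustrative sketch: the usual consumer of the two helpers above is a
+ * driver that describes its resources in a resource_spec array and
+ * allocates them in one shot during attach. The "foo" names and the
+ * softc "res" array are hypothetical.
+ *
+ *    static struct resource_spec foo_res_spec[] = {
+ *            { SYS_RES_MEMORY, 0, RF_ACTIVE },
+ *            { SYS_RES_IRQ,    0, RF_ACTIVE | RF_SHAREABLE },
+ *            { -1, 0, 0 }
+ *    };
+ *
+ *    In foo_attach():
+ *            if (bus_alloc_resources(dev, foo_res_spec, sc->res) != 0)
+ *                    return (ENXIO);
+ *    In foo_detach():
+ *            bus_release_resources(dev, foo_res_spec, sc->res);
+ */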
+
+/**
+ * @brief Wrapper function for BUS_ALLOC_RESOURCE().
+ *
+ * This function simply calls the BUS_ALLOC_RESOURCE() method of the
+ * parent of @p dev.
+ */
+struct resource *
+bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end,
+ u_long count, u_int flags)
+{
+ if (dev->parent == NULL)
+ return (NULL);
+ return (BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end,
+ count, flags));
+}
+
+/**
+ * @brief Wrapper function for BUS_ADJUST_RESOURCE().
+ *
+ * This function simply calls the BUS_ADJUST_RESOURCE() method of the
+ * parent of @p dev.
+ */
+int
+bus_adjust_resource(device_t dev, int type, struct resource *r, u_long start,
+ u_long end)
+{
+ if (dev->parent == NULL)
+ return (EINVAL);
+ return (BUS_ADJUST_RESOURCE(dev->parent, dev, type, r, start, end));
+}
+
+/**
+ * @brief Wrapper function for BUS_ACTIVATE_RESOURCE().
+ *
+ * This function simply calls the BUS_ACTIVATE_RESOURCE() method of the
+ * parent of @p dev.
+ */
+int
+bus_activate_resource(device_t dev, int type, int rid, struct resource *r)
+{
+ if (dev->parent == NULL)
+ return (EINVAL);
+ return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r));
+}
+
+/**
+ * @brief Wrapper function for BUS_DEACTIVATE_RESOURCE().
+ *
+ * This function simply calls the BUS_DEACTIVATE_RESOURCE() method of the
+ * parent of @p dev.
+ */
+int
+bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r)
+{
+ if (dev->parent == NULL)
+ return (EINVAL);
+ return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r));
+}
+
+/**
+ * @brief Wrapper function for BUS_RELEASE_RESOURCE().
+ *
+ * This function simply calls the BUS_RELEASE_RESOURCE() method of the
+ * parent of @p dev.
+ */
+int
+bus_release_resource(device_t dev, int type, int rid, struct resource *r)
+{
+ if (dev->parent == NULL)
+ return (EINVAL);
+ return (BUS_RELEASE_RESOURCE(dev->parent, dev, type, rid, r));
+}
+
+/**
+ * @brief Wrapper function for BUS_SETUP_INTR().
+ *
+ * This function simply calls the BUS_SETUP_INTR() method of the
+ * parent of @p dev.
+ */
+int
+bus_setup_intr(device_t dev, struct resource *r, int flags,
+ driver_filter_t filter, driver_intr_t handler, void *arg, void **cookiep)
+{
+ int error;
+
+ if (dev->parent == NULL)
+ return (EINVAL);
+ error = BUS_SETUP_INTR(dev->parent, dev, r, flags, filter, handler,
+ arg, cookiep);
+ if (error != 0)
+ return (error);
+ if (handler != NULL && !(flags & INTR_MPSAFE))
+ device_printf(dev, "[GIANT-LOCKED]\n");
+ return (0);
+}
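+
+/*
+ * Illustrative sketch: a driver typically allocates its IRQ resource and
+ * then hands an MPSAFE handler to bus_setup_intr(); the matching teardown
+ * goes through bus_teardown_intr() below. The "foo" names and softc
+ * fields are hypothetical.
+ *
+ *    sc->irq_rid = 0;
+ *    sc->irq_res = bus_alloc_resource_any(dev, SYS_RES_IRQ,
+ *        &sc->irq_rid, RF_ACTIVE | RF_SHAREABLE);
+ *    if (sc->irq_res == NULL)
+ *            return (ENXIO);
+ *    error = bus_setup_intr(dev, sc->irq_res,
+ *        INTR_TYPE_MISC | INTR_MPSAFE, NULL, foo_intr, sc,
+ *        &sc->intr_cookie);
+ */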
+
+/**
+ * @brief Wrapper function for BUS_TEARDOWN_INTR().
+ *
+ * This function simply calls the BUS_TEARDOWN_INTR() method of the
+ * parent of @p dev.
+ */
+int
+bus_teardown_intr(device_t dev, struct resource *r, void *cookie)
+{
+ if (dev->parent == NULL)
+ return (EINVAL);
+ return (BUS_TEARDOWN_INTR(dev->parent, dev, r, cookie));
+}
+
+/**
+ * @brief Wrapper function for BUS_BIND_INTR().
+ *
+ * This function simply calls the BUS_BIND_INTR() method of the
+ * parent of @p dev.
+ */
+int
+bus_bind_intr(device_t dev, struct resource *r, int cpu)
+{
+ if (dev->parent == NULL)
+ return (EINVAL);
+ return (BUS_BIND_INTR(dev->parent, dev, r, cpu));
+}
+
+/**
+ * @brief Wrapper function for BUS_DESCRIBE_INTR().
+ *
+ * This function first formats the requested description into a
+ * temporary buffer and then calls the BUS_DESCRIBE_INTR() method of
+ * the parent of @p dev.
+ */
+int
+bus_describe_intr(device_t dev, struct resource *irq, void *cookie,
+ const char *fmt, ...)
+{
+ va_list ap;
+ char descr[MAXCOMLEN + 1];
+
+ if (dev->parent == NULL)
+ return (EINVAL);
+ va_start(ap, fmt);
+ vsnprintf(descr, sizeof(descr), fmt, ap);
+ va_end(ap);
+ return (BUS_DESCRIBE_INTR(dev->parent, dev, irq, cookie, descr));
+}
+
+/**
+ * @brief Wrapper function for BUS_SET_RESOURCE().
+ *
+ * This function simply calls the BUS_SET_RESOURCE() method of the
+ * parent of @p dev.
+ */
+int
+bus_set_resource(device_t dev, int type, int rid,
+ u_long start, u_long count)
+{
+ return (BUS_SET_RESOURCE(device_get_parent(dev), dev, type, rid,
+ start, count));
+}
+
+/**
+ * @brief Wrapper function for BUS_GET_RESOURCE().
+ *
+ * This function simply calls the BUS_GET_RESOURCE() method of the
+ * parent of @p dev.
+ */
+int
+bus_get_resource(device_t dev, int type, int rid,
+ u_long *startp, u_long *countp)
+{
+ return (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
+ startp, countp));
+}
+
+/**
+ * @brief Wrapper function for BUS_GET_RESOURCE().
+ *
+ * This function simply calls the BUS_GET_RESOURCE() method of the
+ * parent of @p dev and returns the start value.
+ */
+u_long
+bus_get_resource_start(device_t dev, int type, int rid)
+{
+ u_long start, count;
+ int error;
+
+ error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
+ &start, &count);
+ if (error)
+ return (0);
+ return (start);
+}
+
+/**
+ * @brief Wrapper function for BUS_GET_RESOURCE().
+ *
+ * This function simply calls the BUS_GET_RESOURCE() method of the
+ * parent of @p dev and returns the count value.
+ */
+u_long
+bus_get_resource_count(device_t dev, int type, int rid)
+{
+ u_long start, count;
+ int error;
+
+ error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
+ &start, &count);
+ if (error)
+ return (0);
+ return (count);
+}
+
+/**
+ * @brief Wrapper function for BUS_DELETE_RESOURCE().
+ *
+ * This function simply calls the BUS_DELETE_RESOURCE() method of the
+ * parent of @p dev.
+ */
+void
+bus_delete_resource(device_t dev, int type, int rid)
+{
+ BUS_DELETE_RESOURCE(device_get_parent(dev), dev, type, rid);
+}
+
+/**
+ * @brief Wrapper function for BUS_CHILD_PRESENT().
+ *
+ * This function simply calls the BUS_CHILD_PRESENT() method of the
+ * parent of @p dev.
+ */
+int
+bus_child_present(device_t child)
+{
+ return (BUS_CHILD_PRESENT(device_get_parent(child), child));
+}
+
+/**
+ * @brief Wrapper function for BUS_CHILD_PNPINFO_STR().
+ *
+ * This function simply calls the BUS_CHILD_PNPINFO_STR() method of the
+ * parent of @p dev.
+ */
+int
+bus_child_pnpinfo_str(device_t child, char *buf, size_t buflen)
+{
+ device_t parent;
+
+ parent = device_get_parent(child);
+ if (parent == NULL) {
+ *buf = '\0';
+ return (0);
+ }
+ return (BUS_CHILD_PNPINFO_STR(parent, child, buf, buflen));
+}
+
+/**
+ * @brief Wrapper function for BUS_CHILD_LOCATION_STR().
+ *
+ * This function simply calls the BUS_CHILD_LOCATION_STR() method of the
+ * parent of @p dev.
+ */
+int
+bus_child_location_str(device_t child, char *buf, size_t buflen)
+{
+ device_t parent;
+
+ parent = device_get_parent(child);
+ if (parent == NULL) {
+ *buf = '\0';
+ return (0);
+ }
+ return (BUS_CHILD_LOCATION_STR(parent, child, buf, buflen));
+}
+
+/**
+ * @brief Wrapper function for BUS_GET_DMA_TAG().
+ *
+ * This function simply calls the BUS_GET_DMA_TAG() method of the
+ * parent of @p dev.
+ */
+bus_dma_tag_t
+bus_get_dma_tag(device_t dev)
+{
+ device_t parent;
+
+ parent = device_get_parent(dev);
+ if (parent == NULL)
+ return (NULL);
+ return (BUS_GET_DMA_TAG(parent, dev));
+}
+
+/* Resume all devices and then notify userland that we're up again. */
+static int
+root_resume(device_t dev)
+{
+ int error;
+
+ error = bus_generic_resume(dev);
+ if (error == 0)
+ devctl_notify("kern", "power", "resume", NULL);
+ return (error);
+}
+
+static int
+root_print_child(device_t dev, device_t child)
+{
+ int retval = 0;
+
+ retval += bus_print_child_header(dev, child);
+ retval += printf("\n");
+
+ return (retval);
+}
+
+static int
+root_setup_intr(device_t dev, device_t child, struct resource *irq, int flags,
+ driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep)
+{
+ /*
+ * If an interrupt mapping gets to here something bad has happened.
+ */
+ panic("root_setup_intr");
+}
+
+/*
+ * If we get here, assume that the device is permanent and really is
+ * present in the system. Removable bus drivers are expected to intercept
+ * this call long before it gets here. We return -1 so that drivers that
+ * really care can check against -1 or some errno returned higher in the
+ * food chain.
+ */
+static int
+root_child_present(device_t dev, device_t child)
+{
+ return (-1);
+}
+
+static kobj_method_t root_methods[] = {
+ /* Device interface */
+ KOBJMETHOD(device_shutdown, bus_generic_shutdown),
+ KOBJMETHOD(device_suspend, bus_generic_suspend),
+ KOBJMETHOD(device_resume, root_resume),
+
+ /* Bus interface */
+ KOBJMETHOD(bus_print_child, root_print_child),
+ KOBJMETHOD(bus_read_ivar, bus_generic_read_ivar),
+ KOBJMETHOD(bus_write_ivar, bus_generic_write_ivar),
+ KOBJMETHOD(bus_setup_intr, root_setup_intr),
+ KOBJMETHOD(bus_child_present, root_child_present),
+
+ KOBJMETHOD_END
+};
+
+static driver_t root_driver = {
+ "root",
+ root_methods,
+ 1, /* no softc */
+};
+
+device_t root_bus;
+devclass_t root_devclass;
+
+static int
+root_bus_module_handler(module_t mod, int what, void* arg)
+{
+ switch (what) {
+ case MOD_LOAD:
+ TAILQ_INIT(&bus_data_devices);
+ kobj_class_compile((kobj_class_t) &root_driver);
+ root_bus = make_device(NULL, "root", 0);
+ root_bus->desc = "System root bus";
+ kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver);
+ root_bus->driver = &root_driver;
+ root_bus->state = DS_ATTACHED;
+ root_devclass = devclass_find_internal("root", NULL, FALSE);
+ devinit();
+ return (0);
+
+ case MOD_SHUTDOWN:
+ device_shutdown(root_bus);
+ return (0);
+ default:
+ return (EOPNOTSUPP);
+ }
+
+ return (0);
+}
+
+static moduledata_t root_bus_mod = {
+ "rootbus",
+ root_bus_module_handler,
+ NULL
+};
+DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+
+/**
+ * @brief Automatically configure devices
+ *
+ * This function begins the autoconfiguration process by calling
+ * device_probe_and_attach() for each child of the @c root0 device.
+ */
+void
+root_bus_configure(void)
+{
+
+ PDEBUG(("."));
+
+ /* Eventually this will be split up, but this is sufficient for now. */
+ bus_set_pass(BUS_PASS_DEFAULT);
+}
+
+/**
+ * @brief Module handler for registering device drivers
+ *
+ * This module handler is used to automatically register device
+ * drivers when modules are loaded. If @p what is MOD_LOAD, it calls
+ * devclass_add_driver() for the driver described by the
+ * driver_module_data structure pointed to by @p arg
+ */
+int
+driver_module_handler(module_t mod, int what, void *arg)
+{
+ struct driver_module_data *dmd;
+ devclass_t bus_devclass;
+ kobj_class_t driver;
+ int error, pass;
+
+ dmd = (struct driver_module_data *)arg;
+ bus_devclass = devclass_find_internal(dmd->dmd_busname, NULL, TRUE);
+ error = 0;
+
+ switch (what) {
+ case MOD_LOAD:
+ if (dmd->dmd_chainevh)
+ error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
+
+ pass = dmd->dmd_pass;
+ driver = dmd->dmd_driver;
+ PDEBUG(("Loading module: driver %s on bus %s (pass %d)",
+ DRIVERNAME(driver), dmd->dmd_busname, pass));
+ error = devclass_add_driver(bus_devclass, driver, pass,
+ dmd->dmd_devclass);
+ break;
+
+ case MOD_UNLOAD:
+ PDEBUG(("Unloading module: driver %s from bus %s",
+ DRIVERNAME(dmd->dmd_driver),
+ dmd->dmd_busname));
+ error = devclass_delete_driver(bus_devclass,
+ dmd->dmd_driver);
+
+ if (!error && dmd->dmd_chainevh)
+ error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
+ break;
+ case MOD_QUIESCE:
+ PDEBUG(("Quiesce module: driver %s from bus %s",
+ DRIVERNAME(dmd->dmd_driver),
+ dmd->dmd_busname));
+ error = devclass_quiesce_driver(bus_devclass,
+ dmd->dmd_driver);
+
+ if (!error && dmd->dmd_chainevh)
+ error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return (error);
+}
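+
+/*
+ * Illustrative sketch: drivers rarely call driver_module_handler()
+ * directly; the DRIVER_MODULE() macro emits the driver_module_data and
+ * moduledata_t that route through it. The "foo" names are hypothetical.
+ *
+ *    static driver_t foo_driver = {
+ *            "foo",
+ *            foo_methods,
+ *            sizeof(struct foo_softc)
+ *    };
+ *    static devclass_t foo_devclass;
+ *
+ *    DRIVER_MODULE(foo, foobus, foo_driver, foo_devclass, NULL, NULL);
+ */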
+
+/**
+ * @brief Enumerate all hinted devices for this bus.
+ *
+ * Walks through the hints for this bus and calls the bus_hinted_child
+ * routine for each one it finds. It searches first for the specific
+ * bus that is being probed for hinted children (e.g. isa0), and then for
+ * generic children (e.g. isa).
+ *
+ * @param	bus	the bus device to enumerate
+ */
+void
+bus_enumerate_hinted_children(device_t bus)
+{
+ int i;
+ const char *dname, *busname;
+ int dunit;
+
+ /*
+ * enumerate all devices on the specific bus
+ */
+ busname = device_get_nameunit(bus);
+ i = 0;
+ while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0)
+ BUS_HINTED_CHILD(bus, dname, dunit);
+
+ /*
+ * and all the generic ones.
+ */
+ busname = device_get_name(bus);
+ i = 0;
+ while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0)
+ BUS_HINTED_CHILD(bus, dname, dunit);
+}
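+
+/*
+ * Illustrative sketch: a BUS_HINTED_CHILD() implementation usually adds
+ * the named child and pulls its wiring from the same hints, e.g. via
+ * resource_long_value() and resource_int_value(). The "foobus" name and
+ * the hint keys are hypothetical; error handling is omitted.
+ *
+ *    static void
+ *    foobus_hinted_child(device_t bus, const char *dname, int dunit)
+ *    {
+ *            device_t child;
+ *            long maddr, msize;
+ *            int irq;
+ *
+ *            child = BUS_ADD_CHILD(bus, 0, dname, dunit);
+ *            if (resource_long_value(dname, dunit, "maddr", &maddr) == 0 &&
+ *                resource_long_value(dname, dunit, "msize", &msize) == 0)
+ *                    bus_set_resource(child, SYS_RES_MEMORY, 0, maddr,
+ *                        msize);
+ *            if (resource_int_value(dname, dunit, "irq", &irq) == 0)
+ *                    bus_set_resource(child, SYS_RES_IRQ, 0, irq, 1);
+ *    }
+ */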
+
+#ifdef BUS_DEBUG
+
+/* the _short versions avoid iteration by not calling anything that prints
+ * more than oneliners. I love oneliners.
+ */
+
+static void
+print_device_short(device_t dev, int indent)
+{
+ if (!dev)
+ return;
+
+ indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%s%s,%sivars,%ssoftc,busy=%d\n",
+ dev->unit, dev->desc,
+ (dev->parent? "":"no "),
+ (TAILQ_EMPTY(&dev->children)? "no ":""),
+ (dev->flags&DF_ENABLED? "enabled,":"disabled,"),
+ (dev->flags&DF_FIXEDCLASS? "fixed,":""),
+ (dev->flags&DF_WILDCARD? "wildcard,":""),
+ (dev->flags&DF_DESCMALLOCED? "descmalloced,":""),
+ (dev->flags&DF_REBID? "rebiddable,":""),
+ (dev->ivars? "":"no "),
+ (dev->softc? "":"no "),
+ dev->busy));
+}
+
+static void
+print_device(device_t dev, int indent)
+{
+ if (!dev)
+ return;
+
+ print_device_short(dev, indent);
+
+ indentprintf(("Parent:\n"));
+ print_device_short(dev->parent, indent+1);
+ indentprintf(("Driver:\n"));
+ print_driver_short(dev->driver, indent+1);
+ indentprintf(("Devclass:\n"));
+ print_devclass_short(dev->devclass, indent+1);
+}
+
+void
+print_device_tree_short(device_t dev, int indent)
+/* print the device and all its children (indented) */
+{
+ device_t child;
+
+ if (!dev)
+ return;
+
+ print_device_short(dev, indent);
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ print_device_tree_short(child, indent+1);
+ }
+}
+
+void
+print_device_tree(device_t dev, int indent)
+/* print the device and all its children (indented) */
+{
+ device_t child;
+
+ if (!dev)
+ return;
+
+ print_device(dev, indent);
+
+ TAILQ_FOREACH(child, &dev->children, link) {
+ print_device_tree(child, indent+1);
+ }
+}
+
+static void
+print_driver_short(driver_t *driver, int indent)
+{
+ if (!driver)
+ return;
+
+ indentprintf(("driver %s: softc size = %zd\n",
+ driver->name, driver->size));
+}
+
+static void
+print_driver(driver_t *driver, int indent)
+{
+ if (!driver)
+ return;
+
+ print_driver_short(driver, indent);
+}
+
+static void
+print_driver_list(driver_list_t drivers, int indent)
+{
+ driverlink_t driver;
+
+ TAILQ_FOREACH(driver, &drivers, link) {
+ print_driver(driver->driver, indent);
+ }
+}
+
+static void
+print_devclass_short(devclass_t dc, int indent)
+{
+	if (!dc)
+ return;
+
+ indentprintf(("devclass %s: max units = %d\n", dc->name, dc->maxunit));
+}
+
+static void
+print_devclass(devclass_t dc, int indent)
+{
+ int i;
+
+	if (!dc)
+ return;
+
+ print_devclass_short(dc, indent);
+ indentprintf(("Drivers:\n"));
+ print_driver_list(dc->drivers, indent+1);
+
+ indentprintf(("Devices:\n"));
+ for (i = 0; i < dc->maxunit; i++)
+ if (dc->devices[i])
+ print_device(dc->devices[i], indent+1);
+}
+
+void
+print_devclass_list_short(void)
+{
+ devclass_t dc;
+
+ printf("Short listing of devclasses, drivers & devices:\n");
+ TAILQ_FOREACH(dc, &devclasses, link) {
+ print_devclass_short(dc, 0);
+ }
+}
+
+void
+print_devclass_list(void)
+{
+ devclass_t dc;
+
+ printf("Full listing of devclasses, drivers & devices:\n");
+ TAILQ_FOREACH(dc, &devclasses, link) {
+ print_devclass(dc, 0);
+ }
+}
+
+#endif
+
+/*
+ * User-space access to the device tree.
+ *
+ * We implement a small set of nodes:
+ *
+ * hw.bus Single integer read method to obtain the
+ * current generation count.
+ * hw.bus.devices Reads the entire device tree in flat space.
+ * hw.bus.rman Resource manager interface
+ *
+ * We might like to add the ability to scan devclasses and/or drivers to
+ * determine what else is currently loaded/available.
+ */
+
+static int
+sysctl_bus(SYSCTL_HANDLER_ARGS)
+{
+ struct u_businfo ubus;
+
+ ubus.ub_version = BUS_USER_VERSION;
+ ubus.ub_generation = bus_data_generation;
+
+ return (SYSCTL_OUT(req, &ubus, sizeof(ubus)));
+}
+SYSCTL_NODE(_hw_bus, OID_AUTO, info, CTLFLAG_RW, sysctl_bus,
+ "bus-related data");
+
+static int
+sysctl_devices(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ int index;
+ struct device *dev;
+ struct u_device udev; /* XXX this is a bit big */
+ int error;
+
+ if (namelen != 2)
+ return (EINVAL);
+
+ if (bus_data_generation_check(name[0]))
+ return (EINVAL);
+
+ index = name[1];
+
+ /*
+ * Scan the list of devices, looking for the requested index.
+ */
+ TAILQ_FOREACH(dev, &bus_data_devices, devlink) {
+ if (index-- == 0)
+ break;
+ }
+ if (dev == NULL)
+ return (ENOENT);
+
+ /*
+ * Populate the return array.
+ */
+ bzero(&udev, sizeof(udev));
+ udev.dv_handle = (uintptr_t)dev;
+ udev.dv_parent = (uintptr_t)dev->parent;
+ if (dev->nameunit != NULL)
+ strlcpy(udev.dv_name, dev->nameunit, sizeof(udev.dv_name));
+ if (dev->desc != NULL)
+ strlcpy(udev.dv_desc, dev->desc, sizeof(udev.dv_desc));
+ if (dev->driver != NULL && dev->driver->name != NULL)
+ strlcpy(udev.dv_drivername, dev->driver->name,
+ sizeof(udev.dv_drivername));
+ bus_child_pnpinfo_str(dev, udev.dv_pnpinfo, sizeof(udev.dv_pnpinfo));
+ bus_child_location_str(dev, udev.dv_location, sizeof(udev.dv_location));
+ udev.dv_devflags = dev->devflags;
+ udev.dv_flags = dev->flags;
+ udev.dv_state = dev->state;
+ error = SYSCTL_OUT(req, &udev, sizeof(udev));
+ return (error);
+}
+
+SYSCTL_NODE(_hw_bus, OID_AUTO, devices, CTLFLAG_RD, sysctl_devices,
+ "system device tree");
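
For context on how this pair of sysctls is meant to be consumed, here is a minimal userland sketch in the spirit of devinfo(8)/libdevinfo. It relies only on what the handlers above implement: hw.bus.info returns a struct u_businfo, and hw.bus.devices takes {generation, index} as the two extra name components, failing with EINVAL on a stale generation and ENOENT past the last device. The walk_devices() wrapper itself is illustrative, not part of this change.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/bus.h>

#include <errno.h>
#include <stdio.h>

static int
walk_devices(void)
{
	struct u_businfo ubus;
	struct u_device udev;
	int mib[CTL_MAXNAME + 2];
	size_t len, miblen;
	int idx;

	len = sizeof(ubus);
	if (sysctlbyname("hw.bus.info", &ubus, &len, NULL, 0) != 0)
		return (-1);

	miblen = CTL_MAXNAME;
	if (sysctlnametomib("hw.bus.devices", mib, &miblen) != 0)
		return (-1);
	mib[miblen] = ubus.ub_generation;	/* name[0]: generation check */

	for (idx = 0; ; idx++) {
		mib[miblen + 1] = idx;		/* name[1]: device index */
		len = sizeof(udev);
		if (sysctl(mib, (u_int)miblen + 2, &udev, &len, NULL, 0) != 0) {
			if (errno == ENOENT)
				break;		/* walked past the last device */
			return (-1);		/* EINVAL: tree changed, retry */
		}
		printf("%s: %s\n", udev.dv_name, udev.dv_desc);
	}
	return (0);
}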
+
+int
+bus_data_generation_check(int generation)
+{
+ if (generation != bus_data_generation)
+ return (1);
+
+ /* XXX generate optimised lists here? */
+ return (0);
+}
+
+void
+bus_data_generation_update(void)
+{
+ bus_data_generation++;
+}
+
+int
+bus_free_resource(device_t dev, int type, struct resource *r)
+{
+ if (r == NULL)
+ return (0);
+ return (bus_release_resource(dev, type, rman_get_rid(r), r));
+}
diff --git a/sys/kern/subr_bus_dma.c b/sys/kern/subr_bus_dma.c
new file mode 100644
index 0000000..999de3f
--- /dev/null
+++ b/sys/kern/subr_bus_dma.c
@@ -0,0 +1,533 @@
+/*-
+ * Copyright (c) 2012 EMC Corp.
+ * All rights reserved.
+ *
+ * Copyright (c) 1997, 1998 Justin T. Gibbs.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_bus.h"
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/bus.h>
+#include <sys/callout.h>
+#include <sys/mbuf.h>
+#include <sys/memdesc.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/pmap.h>
+
+#include <cam/cam.h>
+#include <cam/cam_ccb.h>
+
+#include <machine/bus.h>
+
+/*
+ * Load a list of virtual addresses.
+ */
+static int
+_bus_dmamap_load_vlist(bus_dma_tag_t dmat, bus_dmamap_t map,
+ bus_dma_segment_t *list, int sglist_cnt, struct pmap *pmap, int *nsegs,
+ int flags)
+{
+ int error;
+
+ error = 0;
+ for (; sglist_cnt > 0; sglist_cnt--, list++) {
+ error = _bus_dmamap_load_buffer(dmat, map,
+ (void *)(uintptr_t)list->ds_addr, list->ds_len, pmap,
+ flags, NULL, nsegs);
+ if (error)
+ break;
+ }
+ return (error);
+}
+
+/*
+ * Load a list of physical addresses.
+ */
+static int
+_bus_dmamap_load_plist(bus_dma_tag_t dmat, bus_dmamap_t map,
+ bus_dma_segment_t *list, int sglist_cnt, int *nsegs, int flags)
+{
+ int error;
+
+ error = 0;
+ for (; sglist_cnt > 0; sglist_cnt--, list++) {
+ error = _bus_dmamap_load_phys(dmat, map,
+ (vm_paddr_t)list->ds_addr, list->ds_len, flags, NULL,
+ nsegs);
+ if (error)
+ break;
+ }
+ return (error);
+}
+
+/*
+ * Load an mbuf chain.
+ */
+static int
+_bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
+ struct mbuf *m0, bus_dma_segment_t *segs, int *nsegs, int flags)
+{
+ struct mbuf *m;
+ int error;
+
+ error = 0;
+ for (m = m0; m != NULL && error == 0; m = m->m_next) {
+ if (m->m_len > 0) {
+ error = _bus_dmamap_load_buffer(dmat, map, m->m_data,
+ m->m_len, kernel_pmap, flags | BUS_DMA_LOAD_MBUF,
+ segs, nsegs);
+ }
+ }
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, flags, error, *nsegs);
+ return (error);
+}
+
+/*
+ * Load from block io.
+ */
+static int
+_bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio,
+ int *nsegs, int flags)
+{
+ vm_paddr_t paddr;
+ bus_size_t len, tlen;
+ int error, i, ma_offs;
+
+ if ((bio->bio_flags & BIO_UNMAPPED) == 0) {
+ error = _bus_dmamap_load_buffer(dmat, map, bio->bio_data,
+ bio->bio_bcount, kernel_pmap, flags, NULL, nsegs);
+ return (error);
+ }
+
+ error = 0;
+ tlen = bio->bio_bcount;
+ ma_offs = bio->bio_ma_offset;
+ for (i = 0; tlen > 0; i++, tlen -= len) {
+ len = min(PAGE_SIZE - ma_offs, tlen);
+ paddr = VM_PAGE_TO_PHYS(bio->bio_ma[i]) + ma_offs;
+ error = _bus_dmamap_load_phys(dmat, map, paddr, len,
+ flags, NULL, nsegs);
+ if (error != 0)
+ break;
+ ma_offs = 0;
+ }
+ return (error);
+}
+
+/*
+ * Load a cam control block.
+ */
+static int
+_bus_dmamap_load_ccb(bus_dma_tag_t dmat, bus_dmamap_t map, union ccb *ccb,
+ int *nsegs, int flags)
+{
+ struct ccb_hdr *ccb_h;
+ void *data_ptr;
+ int error;
+ uint32_t dxfer_len;
+ uint16_t sglist_cnt;
+
+ error = 0;
+ ccb_h = &ccb->ccb_h;
+ switch (ccb_h->func_code) {
+ case XPT_SCSI_IO: {
+ struct ccb_scsiio *csio;
+
+ csio = &ccb->csio;
+ data_ptr = csio->data_ptr;
+ dxfer_len = csio->dxfer_len;
+ sglist_cnt = csio->sglist_cnt;
+ break;
+ }
+ case XPT_CONT_TARGET_IO: {
+ struct ccb_scsiio *ctio;
+
+ ctio = &ccb->ctio;
+ data_ptr = ctio->data_ptr;
+ dxfer_len = ctio->dxfer_len;
+ sglist_cnt = ctio->sglist_cnt;
+ break;
+ }
+ case XPT_ATA_IO: {
+ struct ccb_ataio *ataio;
+
+ ataio = &ccb->ataio;
+ data_ptr = ataio->data_ptr;
+ dxfer_len = ataio->dxfer_len;
+ sglist_cnt = 0;
+ break;
+ }
+ default:
+ panic("_bus_dmamap_load_ccb: Unsupported func code %d",
+ ccb_h->func_code);
+ }
+
+ switch ((ccb_h->flags & CAM_DATA_MASK)) {
+ case CAM_DATA_VADDR:
+ error = _bus_dmamap_load_buffer(dmat, map, data_ptr, dxfer_len,
+ kernel_pmap, flags, NULL, nsegs);
+ break;
+ case CAM_DATA_PADDR:
+ error = _bus_dmamap_load_phys(dmat, map,
+ (vm_paddr_t)(uintptr_t)data_ptr, dxfer_len, flags, NULL,
+ nsegs);
+ break;
+ case CAM_DATA_SG:
+ error = _bus_dmamap_load_vlist(dmat, map,
+ (bus_dma_segment_t *)data_ptr, sglist_cnt, kernel_pmap,
+ nsegs, flags);
+ break;
+ case CAM_DATA_SG_PADDR:
+ error = _bus_dmamap_load_plist(dmat, map,
+ (bus_dma_segment_t *)data_ptr, sglist_cnt, nsegs, flags);
+ break;
+ case CAM_DATA_BIO:
+ error = _bus_dmamap_load_bio(dmat, map, (struct bio *)data_ptr,
+ nsegs, flags);
+ break;
+ default:
+ panic("_bus_dmamap_load_ccb: flags 0x%X unimplemented",
+ ccb_h->flags);
+ }
+ return (error);
+}
+
+/*
+ * Load a uio.
+ */
+static int
+_bus_dmamap_load_uio(bus_dma_tag_t dmat, bus_dmamap_t map, struct uio *uio,
+ int *nsegs, int flags)
+{
+ bus_size_t resid;
+ bus_size_t minlen;
+ struct iovec *iov;
+ pmap_t pmap;
+ caddr_t addr;
+ int error, i;
+
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ KASSERT(uio->uio_td != NULL,
+ ("bus_dmamap_load_uio: USERSPACE but no proc"));
+ pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
+ } else
+ pmap = kernel_pmap;
+ resid = uio->uio_resid;
+ iov = uio->uio_iov;
+ error = 0;
+
+ for (i = 0; i < uio->uio_iovcnt && resid != 0 && !error; i++) {
+ /*
+ * Now at the first iovec to load. Load each iovec
+ * until we have exhausted the residual count.
+ */
+
+ addr = (caddr_t) iov[i].iov_base;
+ minlen = resid < iov[i].iov_len ? resid : iov[i].iov_len;
+ if (minlen > 0) {
+ error = _bus_dmamap_load_buffer(dmat, map, addr,
+ minlen, pmap, flags, NULL, nsegs);
+ resid -= minlen;
+ }
+ }
+
+ return (error);
+}
+
+/*
+ * Map the buffer buf into bus space using the dmamap map.
+ */
+int
+bus_dmamap_load(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf,
+ bus_size_t buflen, bus_dmamap_callback_t *callback,
+ void *callback_arg, int flags)
+{
+ bus_dma_segment_t *segs;
+ struct memdesc mem;
+ int error;
+ int nsegs;
+
+ if ((flags & BUS_DMA_NOWAIT) == 0) {
+ mem = memdesc_vaddr(buf, buflen);
+ _bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg);
+ }
+
+ nsegs = -1;
+ error = _bus_dmamap_load_buffer(dmat, map, buf, buflen, kernel_pmap,
+ flags, NULL, &nsegs);
+ nsegs++;
+
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, flags, error, nsegs);
+
+ if (error == EINPROGRESS)
+ return (error);
+
+ segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+ if (error)
+ (*callback)(callback_arg, segs, 0, error);
+ else
+ (*callback)(callback_arg, segs, nsegs, 0);
+
+ /*
+ * Return ENOMEM to the caller so that it can pass it up the stack.
+ * This error only happens when NOWAIT is set, so deferral is disabled.
+ */
+ if (error == ENOMEM)
+ return (error);
+
+ return (0);
+}
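
As a usage sketch only (the foo_* tag, map, and softc fields are invented for illustration): a driver with a wired-down descriptor ring typically loads it once with BUS_DMA_NOWAIT, so the callback runs synchronously and ENOMEM is the only failure it has to handle, as the comments above describe.

#include <sys/param.h>
#include <machine/bus.h>

#define	FOO_RING_SIZE	4096		/* hypothetical ring size */

struct foo_softc {			/* hypothetical driver state */
	bus_dma_tag_t	foo_dtag;
	bus_dmamap_t	foo_dmap;
	void		*foo_ring;
	bus_addr_t	foo_ring_paddr;
};

/* Matches bus_dmamap_callback_t: remember the single segment's bus address. */
static void
foo_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
	bus_addr_t *addrp = arg;

	if (error != 0 || nseg != 1)
		return;
	*addrp = segs[0].ds_addr;
}

static int
foo_load_ring(struct foo_softc *sc)
{

	/* With BUS_DMA_NOWAIT the callback has run by the time this returns. */
	return (bus_dmamap_load(sc->foo_dtag, sc->foo_dmap, sc->foo_ring,
	    FOO_RING_SIZE, foo_dmamap_cb, &sc->foo_ring_paddr, BUS_DMA_NOWAIT));
}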
+
+int
+bus_dmamap_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0,
+ bus_dmamap_callback2_t *callback, void *callback_arg, int flags)
+{
+ bus_dma_segment_t *segs;
+ int nsegs, error;
+
+ M_ASSERTPKTHDR(m0);
+
+ flags |= BUS_DMA_NOWAIT;
+ nsegs = -1;
+ error = _bus_dmamap_load_mbuf_sg(dmat, map, m0, NULL, &nsegs, flags);
+ ++nsegs;
+
+ segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+ if (error)
+ (*callback)(callback_arg, segs, 0, 0, error);
+ else
+ (*callback)(callback_arg, segs, nsegs, m0->m_pkthdr.len, error);
+
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, flags, error, nsegs);
+ return (error);
+}
+
+int
+bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0,
+ bus_dma_segment_t *segs, int *nsegs, int flags)
+{
+ int error;
+
+ flags |= BUS_DMA_NOWAIT;
+ *nsegs = -1;
+ error = _bus_dmamap_load_mbuf_sg(dmat, map, m0, segs, nsegs, flags);
+ ++*nsegs;
+ _bus_dmamap_complete(dmat, map, segs, *nsegs, error);
+ return (error);
+}
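
A sketch of the transmit-path pattern this wrapper is built for (FOO_MAXTXSEGS and foo_encap() are assumptions): the caller supplies its own segment array, and an EFBIG return, which the MD backends use when the chain exceeds the tag's segment limit, is handled by coalescing the mbufs and retrying once.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <machine/bus.h>

#define	FOO_MAXTXSEGS	32		/* hypothetical per-packet segment limit */

static int
foo_encap(bus_dma_tag_t tag, bus_dmamap_t map, struct mbuf **m_head,
    bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m;
	int error;

	error = bus_dmamap_load_mbuf_sg(tag, map, *m_head, segs, nsegs,
	    BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		/* Too many segments for the tag; coalesce the chain, retry. */
		m = m_collapse(*m_head, M_NOWAIT, FOO_MAXTXSEGS);
		if (m == NULL) {
			m_freem(*m_head);
			*m_head = NULL;
			return (ENOBUFS);
		}
		*m_head = m;
		error = bus_dmamap_load_mbuf_sg(tag, map, *m_head, segs,
		    nsegs, BUS_DMA_NOWAIT);
	}
	return (error);
}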
+
+int
+bus_dmamap_load_uio(bus_dma_tag_t dmat, bus_dmamap_t map, struct uio *uio,
+ bus_dmamap_callback2_t *callback, void *callback_arg, int flags)
+{
+ bus_dma_segment_t *segs;
+ int nsegs, error;
+
+ flags |= BUS_DMA_NOWAIT;
+ nsegs = -1;
+ error = _bus_dmamap_load_uio(dmat, map, uio, &nsegs, flags);
+ nsegs++;
+
+ segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+ if (error)
+ (*callback)(callback_arg, segs, 0, 0, error);
+ else
+ (*callback)(callback_arg, segs, nsegs, uio->uio_resid, error);
+
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, flags, error, nsegs);
+ return (error);
+}
+
+int
+bus_dmamap_load_ccb(bus_dma_tag_t dmat, bus_dmamap_t map, union ccb *ccb,
+ bus_dmamap_callback_t *callback, void *callback_arg,
+ int flags)
+{
+ bus_dma_segment_t *segs;
+ struct ccb_hdr *ccb_h;
+ struct memdesc mem;
+ int error;
+ int nsegs;
+
+ ccb_h = &ccb->ccb_h;
+ if ((ccb_h->flags & CAM_DIR_MASK) == CAM_DIR_NONE) {
+ callback(callback_arg, NULL, 0, 0);
+ return (0);
+ }
+ if ((flags & BUS_DMA_NOWAIT) == 0) {
+ mem = memdesc_ccb(ccb);
+ _bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg);
+ }
+ nsegs = -1;
+ error = _bus_dmamap_load_ccb(dmat, map, ccb, &nsegs, flags);
+ nsegs++;
+
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, flags, error, nsegs);
+
+ if (error == EINPROGRESS)
+ return (error);
+
+ segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+ if (error)
+ (*callback)(callback_arg, segs, 0, error);
+ else
+ (*callback)(callback_arg, segs, nsegs, error);
+ /*
+ * Return ENOMEM to the caller so that it can pass it up the stack.
+ * This error only happens when NOWAIT is set, so deferral is disabled.
+ */
+ if (error == ENOMEM)
+ return (error);
+
+ return (0);
+}
+
+int
+bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio,
+ bus_dmamap_callback_t *callback, void *callback_arg,
+ int flags)
+{
+ bus_dma_segment_t *segs;
+ struct memdesc mem;
+ int error;
+ int nsegs;
+
+ if ((flags & BUS_DMA_NOWAIT) == 0) {
+ mem = memdesc_bio(bio);
+ _bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg);
+ }
+ nsegs = -1;
+ error = _bus_dmamap_load_bio(dmat, map, bio, &nsegs, flags);
+ nsegs++;
+
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, flags, error, nsegs);
+
+ if (error == EINPROGRESS)
+ return (error);
+
+ segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+ if (error)
+ (*callback)(callback_arg, segs, 0, error);
+ else
+ (*callback)(callback_arg, segs, nsegs, error);
+ /*
+ * Return ENOMEM to the caller so that it can pass it up the stack.
+ * This error only happens when NOWAIT is set, so deferral is disabled.
+ */
+ if (error == ENOMEM)
+ return (error);
+
+ return (0);
+}
+
+int
+bus_dmamap_load_mem(bus_dma_tag_t dmat, bus_dmamap_t map,
+ struct memdesc *mem, bus_dmamap_callback_t *callback,
+ void *callback_arg, int flags)
+{
+ bus_dma_segment_t *segs;
+ int error;
+ int nsegs;
+
+ if ((flags & BUS_DMA_NOWAIT) == 0)
+ _bus_dmamap_waitok(dmat, map, mem, callback, callback_arg);
+
+ nsegs = -1;
+ error = 0;
+ switch (mem->md_type) {
+ case MEMDESC_VADDR:
+ error = _bus_dmamap_load_buffer(dmat, map, mem->u.md_vaddr,
+ mem->md_opaque, kernel_pmap, flags, NULL, &nsegs);
+ break;
+ case MEMDESC_PADDR:
+ error = _bus_dmamap_load_phys(dmat, map, mem->u.md_paddr,
+ mem->md_opaque, flags, NULL, &nsegs);
+ break;
+ case MEMDESC_VLIST:
+ error = _bus_dmamap_load_vlist(dmat, map, mem->u.md_list,
+ mem->md_opaque, kernel_pmap, &nsegs, flags);
+ break;
+ case MEMDESC_PLIST:
+ error = _bus_dmamap_load_plist(dmat, map, mem->u.md_list,
+ mem->md_opaque, &nsegs, flags);
+ break;
+ case MEMDESC_BIO:
+ error = _bus_dmamap_load_bio(dmat, map, mem->u.md_bio,
+ &nsegs, flags);
+ break;
+ case MEMDESC_UIO:
+ error = _bus_dmamap_load_uio(dmat, map, mem->u.md_uio,
+ &nsegs, flags);
+ break;
+ case MEMDESC_MBUF:
+ error = _bus_dmamap_load_mbuf_sg(dmat, map, mem->u.md_mbuf,
+ NULL, &nsegs, flags);
+ break;
+ case MEMDESC_CCB:
+ error = _bus_dmamap_load_ccb(dmat, map, mem->u.md_ccb, &nsegs,
+ flags);
+ break;
+ }
+ nsegs++;
+
+ CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+ __func__, dmat, flags, error, nsegs);
+
+ if (error == EINPROGRESS)
+ return (error);
+
+ segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+ if (error)
+ (*callback)(callback_arg, segs, 0, error);
+ else
+ (*callback)(callback_arg, segs, nsegs, 0);
+
+ /*
+ * Return ENOMEM to the caller so that it can pass it up the stack.
+ * This error only happens when NOWAIT is set, so deferral is disabled.
+ */
+ if (error == ENOMEM)
+ return (error);
+
+ return (0);
+}
diff --git a/sys/kern/subr_busdma_bufalloc.c b/sys/kern/subr_busdma_bufalloc.c
new file mode 100644
index 0000000..a80a233
--- /dev/null
+++ b/sys/kern/subr_busdma_bufalloc.c
@@ -0,0 +1,174 @@
+/*-
+ * Copyright (c) 2012 Ian Lepore
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * Buffer allocation support routines for bus_dmamem_alloc implementations.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/busdma_bufalloc.h>
+#include <sys/malloc.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/uma.h>
+
+/*
+ * We manage buffer zones up to a page in size. Buffers larger than a page can
+ * be managed by one of the kernel's page-oriented memory allocation routines as
+ * efficiently as what we can do here. Also, a page is the largest size for
+ * which we can guarantee contiguity when using uma, and contiguity is one of
+ * requirements we have to fulfill.
+ */
+#define MIN_ZONE_BUFSIZE 32
+#define MAX_ZONE_BUFSIZE PAGE_SIZE
+
+/*
+ * The static array of 12 bufzones is big enough to handle all the zones for the
+ * smallest supported allocation size of 32 through the largest supported page
+ * size of 64K. If you up the biggest page size number, up the array size too.
+ * Basically the size of the array needs to be log2(maxsize)-log2(minsize)+1,
+ * but I don't know of an easy way to express that as a compile-time constant.
+ */
+#if PAGE_SIZE > 65536
+#error Unsupported page size
+#endif
+
+struct busdma_bufalloc {
+ bus_size_t min_size;
+ size_t num_zones;
+ struct busdma_bufzone buf_zones[12];
+};
+
+busdma_bufalloc_t
+busdma_bufalloc_create(const char *name, bus_size_t minimum_alignment,
+ uma_alloc alloc_func, uma_free free_func, u_int32_t zcreate_flags)
+{
+ struct busdma_bufalloc *ba;
+ struct busdma_bufzone *bz;
+ int i;
+ bus_size_t cursize;
+
+ ba = malloc(sizeof(struct busdma_bufalloc), M_DEVBUF,
+ M_ZERO | M_WAITOK);
+
+ ba->min_size = MAX(MIN_ZONE_BUFSIZE, minimum_alignment);
+
+ /*
+ * Each uma zone is created with an alignment of size-1, meaning that
+ * the alignment is equal to the size (i.e., 64-byte buffers are aligned
+ * to 64-byte boundaries, etc.). This allows for a fast, efficient test
+ * when deciding whether a pool buffer meets the constraints of a given
+ * tag used for allocation: the buffer is usable if tag->alignment <=
+ * bufzone->size.
+ */
+ for (i = 0, bz = ba->buf_zones, cursize = ba->min_size;
+ i < nitems(ba->buf_zones) && cursize <= MAX_ZONE_BUFSIZE;
+ ++i, ++bz, cursize <<= 1) {
+ snprintf(bz->name, sizeof(bz->name), "dma %.10s %lu",
+ name, cursize);
+ bz->size = cursize;
+ bz->umazone = uma_zcreate(bz->name, bz->size,
+ NULL, NULL, NULL, NULL, bz->size - 1, zcreate_flags);
+ if (bz->umazone == NULL) {
+ busdma_bufalloc_destroy(ba);
+ return (NULL);
+ }
+ if (alloc_func != NULL)
+ uma_zone_set_allocf(bz->umazone, alloc_func);
+ if (free_func != NULL)
+ uma_zone_set_freef(bz->umazone, free_func);
+ ++ba->num_zones;
+ }
+
+ return (ba);
+}
+
+void
+busdma_bufalloc_destroy(busdma_bufalloc_t ba)
+{
+ struct busdma_bufzone *bz;
+ int i;
+
+ if (ba == NULL)
+ return;
+
+ for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) {
+ uma_zdestroy(bz->umazone);
+ }
+
+ free(ba, M_DEVBUF);
+}
+
+struct busdma_bufzone *
+busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_size_t size)
+{
+ struct busdma_bufzone *bz;
+ int i;
+
+ if (size > MAX_ZONE_BUFSIZE)
+ return (NULL);
+
+ for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) {
+ if (bz->size >= size)
+ return (bz);
+ }
+
+ panic("Didn't find a buffer zone of the right size");
+}
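
To show where these helpers sit, here is a sketch of how a machine-dependent bus_dmamem_alloc() might use them; coherent_allocator, the foo_ names, and the SYSINIT placement are assumptions for illustration. Buffers up to MAX_ZONE_BUFSIZE come out of the matching uma zone; anything bigger is left to a page-level allocator.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/busdma_bufalloc.h>
#include <vm/vm.h>
#include <vm/uma.h>
#include <machine/bus.h>

static busdma_bufalloc_t coherent_allocator;	/* hypothetical, set up once */

static void
foo_busdma_init(void *dummy __unused)
{

	coherent_allocator = busdma_bufalloc_create("coherent",
	    sizeof(void *),			/* minimum alignment */
	    busdma_bufalloc_alloc_uncacheable,
	    busdma_bufalloc_free_uncacheable, 0);
}
SYSINIT(foo_busdma, SI_SUB_KMEM, SI_ORDER_ANY, foo_busdma_init, NULL);

static void *
foo_dmamem_alloc(bus_size_t size, int mflags)
{
	struct busdma_bufzone *bufzone;

	bufzone = busdma_bufalloc_findzone(coherent_allocator, size);
	if (bufzone != NULL)
		return (uma_zalloc(bufzone->umazone, mflags));
	/* Larger than MAX_ZONE_BUFSIZE: fall back to a page-level allocator. */
	return (NULL);
}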
+
+void *
+busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, int size, u_int8_t *pflag,
+ int wait)
+{
+#ifdef VM_MEMATTR_UNCACHEABLE
+
+ /* Inform UMA that this allocator uses kernel_arena/object. */
+ *pflag = UMA_SLAB_KERNEL;
+
+ return ((void *)kmem_alloc_attr(kernel_arena, size, wait, 0,
+ BUS_SPACE_MAXADDR, VM_MEMATTR_UNCACHEABLE));
+
+#else
+
+ panic("VM_MEMATTR_UNCACHEABLE unavailable");
+
+#endif /* VM_MEMATTR_UNCACHEABLE */
+}
+
+void
+busdma_bufalloc_free_uncacheable(void *item, int size, u_int8_t pflag)
+{
+
+ kmem_free(kernel_arena, (vm_offset_t)item, size);
+}
+
diff --git a/sys/kern/subr_capability.c b/sys/kern/subr_capability.c
new file mode 100644
index 0000000..61ace5a
--- /dev/null
+++ b/sys/kern/subr_capability.c
@@ -0,0 +1,298 @@
+/*-
+ * Copyright (c) 2013 FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/capability.h>
+#include <sys/systm.h>
+
+#include <machine/stdarg.h>
+#else /* !_KERNEL */
+#include <sys/types.h>
+#include <sys/capability.h>
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#endif
+
+#ifdef _KERNEL
+#define assert(exp) KASSERT((exp), ("%s:%u", __func__, __LINE__))
+#endif
+
+#define CAPARSIZE_MIN (CAP_RIGHTS_VERSION_00 + 2)
+#define CAPARSIZE_MAX (CAP_RIGHTS_VERSION + 2)
+
+static __inline int
+right_to_index(uint64_t right)
+{
+ static const int bit2idx[] = {
+ -1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1,
+ 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+ };
+ int idx;
+
+ idx = CAPIDXBIT(right);
+ assert(idx >= 0 && idx < sizeof(bit2idx) / sizeof(bit2idx[0]));
+ return (bit2idx[idx]);
+}
+
+static void
+cap_rights_vset(cap_rights_t *rights, va_list ap)
+{
+ uint64_t right;
+ int i, n;
+
+ assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00);
+
+ n = CAPARSIZE(rights);
+ assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX);
+
+ for (;;) {
+ right = (uint64_t)va_arg(ap, unsigned long long);
+ if (right == 0)
+ break;
+ assert(CAPRVER(right) == 0);
+ i = right_to_index(right);
+ assert(i >= 0);
+ assert(i < n);
+ assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right));
+ rights->cr_rights[i] |= right;
+ assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right));
+ }
+}
+
+static void
+cap_rights_vclear(cap_rights_t *rights, va_list ap)
+{
+ uint64_t right;
+ int i, n;
+
+ assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00);
+
+ n = CAPARSIZE(rights);
+ assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX);
+
+ for (;;) {
+ right = (uint64_t)va_arg(ap, unsigned long long);
+ if (right == 0)
+ break;
+ assert(CAPRVER(right) == 0);
+ i = right_to_index(right);
+ assert(i >= 0);
+ assert(i < n);
+ assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right));
+ rights->cr_rights[i] &= ~(right & 0x01FFFFFFFFFFFFFFULL);
+ assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right));
+ }
+}
+
+static bool
+cap_rights_is_vset(const cap_rights_t *rights, va_list ap)
+{
+ uint64_t right;
+ int i, n;
+
+ assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00);
+
+ n = CAPARSIZE(rights);
+ assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX);
+
+ for (;;) {
+ right = (uint64_t)va_arg(ap, unsigned long long);
+ if (right == 0)
+ break;
+ assert(CAPRVER(right) == 0);
+ i = right_to_index(right);
+ assert(i >= 0);
+ assert(i < n);
+ assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right));
+ if ((rights->cr_rights[i] & right) != right)
+ return (false);
+ }
+
+ return (true);
+}
+
+cap_rights_t *
+__cap_rights_init(int version, cap_rights_t *rights, ...)
+{
+ unsigned int n;
+ va_list ap;
+
+ assert(version == CAP_RIGHTS_VERSION_00);
+
+ n = version + 2;
+ assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX);
+ memset(rights->cr_rights, 0, sizeof(rights->cr_rights[0]) * n);
+ CAP_NONE(rights);
+ va_start(ap, rights);
+ cap_rights_vset(rights, ap);
+ va_end(ap);
+
+ return (rights);
+}
+
+void
+__cap_rights_set(cap_rights_t *rights, ...)
+{
+ va_list ap;
+
+ assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00);
+
+ va_start(ap, rights);
+ cap_rights_vset(rights, ap);
+ va_end(ap);
+}
+
+void
+__cap_rights_clear(cap_rights_t *rights, ...)
+{
+ va_list ap;
+
+ assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00);
+
+ va_start(ap, rights);
+ cap_rights_vclear(rights, ap);
+ va_end(ap);
+}
+
+bool
+__cap_rights_is_set(const cap_rights_t *rights, ...)
+{
+ va_list ap;
+ bool ret;
+
+ assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00);
+
+ va_start(ap, rights);
+ ret = cap_rights_is_vset(rights, ap);
+ va_end(ap);
+
+ return (ret);
+}
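
For orientation, the public cap_rights_init()/cap_rights_set()/cap_rights_clear()/cap_rights_is_set() macros wrap the functions above and append the terminating 0 themselves; a minimal consumer might look like the sketch below, with cap_rights_limit(2) shown as the usual companion step. The limit_to_read() wrapper is hypothetical.

#include <sys/capability.h>

#include <stdbool.h>

static int
limit_to_read(int fd)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_READ, CAP_FSTAT, CAP_SEEK);
	cap_rights_clear(&rights, CAP_SEEK);	/* keep only CAP_READ, CAP_FSTAT */
	if (cap_rights_is_set(&rights, CAP_WRITE))
		return (-1);			/* never granted above */
	return (cap_rights_limit(fd, &rights));
}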
+
+bool
+cap_rights_is_valid(const cap_rights_t *rights)
+{
+ cap_rights_t allrights;
+ int i, j;
+
+ if (CAPVER(rights) != CAP_RIGHTS_VERSION_00)
+ return (false);
+ if (CAPARSIZE(rights) < CAPARSIZE_MIN ||
+ CAPARSIZE(rights) > CAPARSIZE_MAX) {
+ return (false);
+ }
+ CAP_ALL(&allrights);
+ if (!cap_rights_contains(&allrights, rights))
+ return (false);
+ for (i = 0; i < CAPARSIZE(rights); i++) {
+ j = right_to_index(rights->cr_rights[i]);
+ if (i != j)
+ return (false);
+ if (i > 0) {
+ if (CAPRVER(rights->cr_rights[i]) != 0)
+ return (false);
+ }
+ }
+
+ return (true);
+}
+
+void
+cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src)
+{
+ unsigned int i, n;
+
+ assert(CAPVER(dst) == CAP_RIGHTS_VERSION_00);
+ assert(CAPVER(src) == CAP_RIGHTS_VERSION_00);
+ assert(CAPVER(dst) == CAPVER(src));
+ assert(cap_rights_is_valid(src));
+ assert(cap_rights_is_valid(dst));
+
+ n = CAPARSIZE(dst);
+ assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX);
+
+ for (i = 0; i < n; i++)
+ dst->cr_rights[i] |= src->cr_rights[i];
+
+ assert(cap_rights_is_valid(src));
+ assert(cap_rights_is_valid(dst));
+}
+
+void
+cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src)
+{
+ unsigned int i, n;
+
+ assert(CAPVER(dst) == CAP_RIGHTS_VERSION_00);
+ assert(CAPVER(src) == CAP_RIGHTS_VERSION_00);
+ assert(CAPVER(dst) == CAPVER(src));
+ assert(cap_rights_is_valid(src));
+ assert(cap_rights_is_valid(dst));
+
+ n = CAPARSIZE(dst);
+ assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX);
+
+ for (i = 0; i < n; i++) {
+ dst->cr_rights[i] &=
+ ~(src->cr_rights[i] & 0x01FFFFFFFFFFFFFFULL);
+ }
+
+ assert(cap_rights_is_valid(src));
+ assert(cap_rights_is_valid(dst));
+}
+
+bool
+cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little)
+{
+ unsigned int i, n;
+
+ assert(CAPVER(big) == CAP_RIGHTS_VERSION_00);
+ assert(CAPVER(little) == CAP_RIGHTS_VERSION_00);
+ assert(CAPVER(big) == CAPVER(little));
+
+ n = CAPARSIZE(big);
+ assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX);
+
+ for (i = 0; i < n; i++) {
+ if ((big->cr_rights[i] & little->cr_rights[i]) !=
+ little->cr_rights[i]) {
+ return (false);
+ }
+ }
+
+ return (true);
+}
diff --git a/sys/kern/subr_clock.c b/sys/kern/subr_clock.c
new file mode 100644
index 0000000..dbd74f7
--- /dev/null
+++ b/sys/kern/subr_clock.c
@@ -0,0 +1,225 @@
+/*-
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1982, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Utah $Hdr: clock.c 1.18 91/01/21$
+ * from: @(#)clock.c 8.2 (Berkeley) 1/12/94
+ * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp
+ * and
+ * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/bus.h>
+#include <sys/clock.h>
+#include <sys/sysctl.h>
+#include <sys/timetc.h>
+
+int tz_minuteswest;
+int tz_dsttime;
+
+/*
+ * The adjkerntz and wall_cmos_clock sysctls are in the "machdep" sysctl
+ * namespace because they were misplaced there originally.
+ */
+static int adjkerntz;
+static int
+sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
+ if (!error && req->newptr)
+ resettodr();
+ return (error);
+}
+SYSCTL_PROC(_machdep, OID_AUTO, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
+ &adjkerntz, 0, sysctl_machdep_adjkerntz, "I",
+ "Local offset from UTC in seconds");
+
+static int ct_debug;
+SYSCTL_INT(_debug, OID_AUTO, clocktime, CTLFLAG_RW,
+ &ct_debug, 0, "Enable printing of clocktime debugging");
+
+static int wall_cmos_clock;
+SYSCTL_INT(_machdep, OID_AUTO, wall_cmos_clock, CTLFLAG_RW,
+ &wall_cmos_clock, 0, "Enables application of machdep.adjkerntz");
+
+/*--------------------------------------------------------------------*
+ * Generic routines to convert between a POSIX date
+ * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec
+ * Derived from NetBSD arch/hp300/hp300/clock.c
+ */
+
+
+#define FEBRUARY 2
+#define days_in_year(y) (leapyear(y) ? 366 : 365)
+#define days_in_month(y, m) \
+ (month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0))
+/* Day of week. Days are counted from 1/1/1970, which was a Thursday */
+#define day_of_week(days) (((days) + 4) % 7)
+
+static const int month_days[12] = {
+ 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
+};
+
+
+/*
+ * This inline avoids some unnecessary modulo operations
+ * as compared with the usual macro:
+ * ( ((year % 4) == 0 &&
+ * (year % 100) != 0) ||
+ * ((year % 400) == 0) )
+ * It is otherwise equivalent.
+ */
+static int
+leapyear(int year)
+{
+ int rv = 0;
+
+ if ((year & 3) == 0) {
+ rv = 1;
+ if ((year % 100) == 0) {
+ rv = 0;
+ if ((year % 400) == 0)
+ rv = 1;
+ }
+ }
+ return (rv);
+}
+
+static void
+print_ct(struct clocktime *ct)
+{
+ printf("[%04d-%02d-%02d %02d:%02d:%02d]",
+ ct->year, ct->mon, ct->day,
+ ct->hour, ct->min, ct->sec);
+}
+
+int
+clock_ct_to_ts(struct clocktime *ct, struct timespec *ts)
+{
+ time_t secs;
+ int i, year, days;
+
+ year = ct->year;
+
+ if (ct_debug) {
+ printf("ct_to_ts(");
+ print_ct(ct);
+ printf(")");
+ }
+
+ /* Sanity checks. */
+ if (ct->mon < 1 || ct->mon > 12 || ct->day < 1 ||
+ ct->day > days_in_month(year, ct->mon) ||
+ ct->hour > 23 || ct->min > 59 || ct->sec > 59 ||
+ ct->year > 2037) { /* time_t overflow */
+ if (ct_debug)
+ printf(" = EINVAL\n");
+ return (EINVAL);
+ }
+
+ /*
+ * Compute days since start of time
+ * First from years, then from months.
+ */
+ days = 0;
+ for (i = POSIX_BASE_YEAR; i < year; i++)
+ days += days_in_year(i);
+
+ /* Months */
+ for (i = 1; i < ct->mon; i++)
+ days += days_in_month(year, i);
+ days += (ct->day - 1);
+
+ /* Add hours, minutes, seconds. */
+ secs = ((days * 24 + ct->hour) * 60 + ct->min) * 60 + ct->sec;
+
+ ts->tv_sec = secs;
+ ts->tv_nsec = ct->nsec;
+ if (ct_debug)
+ printf(" = %ld.%09ld\n", (long)ts->tv_sec, (long)ts->tv_nsec);
+ return (0);
+}
+
+void
+clock_ts_to_ct(struct timespec *ts, struct clocktime *ct)
+{
+ int i, year, days;
+ time_t rsec; /* remainder seconds */
+ time_t secs;
+
+ secs = ts->tv_sec;
+ days = secs / SECDAY;
+ rsec = secs % SECDAY;
+
+ ct->dow = day_of_week(days);
+
+	/* Subtract out whole years, counting them in year. */
+ for (year = POSIX_BASE_YEAR; days >= days_in_year(year); year++)
+ days -= days_in_year(year);
+ ct->year = year;
+
+ /* Subtract out whole months, counting them in i. */
+ for (i = 1; days >= days_in_month(year, i); i++)
+ days -= days_in_month(year, i);
+ ct->mon = i;
+
+ /* Days are what is left over (+1) from all that. */
+ ct->day = days + 1;
+
+ /* Hours, minutes, seconds are easy */
+ ct->hour = rsec / 3600;
+ rsec = rsec % 3600;
+ ct->min = rsec / 60;
+ rsec = rsec % 60;
+ ct->sec = rsec;
+ ct->nsec = ts->tv_nsec;
+ if (ct_debug) {
+ printf("ts_to_ct(%ld.%09ld) = ",
+ (long)ts->tv_sec, (long)ts->tv_nsec);
+ print_ct(ct);
+ printf("\n");
+ }
+}
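
These converters are aimed at RTC drivers; a hypothetical clock_gettime method (foo_read_reg() and the FOO_RTC_* register names are stand-ins) only has to fill in struct clocktime and let clock_ct_to_ts() do the validation and arithmetic.

#include <sys/param.h>
#include <sys/time.h>
#include <sys/bus.h>
#include <sys/clock.h>

static int
foo_rtc_gettime(device_t dev, struct timespec *ts)
{
	struct clocktime ct;

	/* foo_read_reg() stands in for whatever bus access the part needs. */
	ct.year = 2000 + foo_read_reg(dev, FOO_RTC_YEAR);
	ct.mon = foo_read_reg(dev, FOO_RTC_MONTH);	/* 1..12 */
	ct.day = foo_read_reg(dev, FOO_RTC_DAY);	/* 1..31 */
	ct.hour = foo_read_reg(dev, FOO_RTC_HOUR);	/* 0..23 */
	ct.min = foo_read_reg(dev, FOO_RTC_MIN);
	ct.sec = foo_read_reg(dev, FOO_RTC_SEC);
	ct.nsec = 0;

	/* Rejects impossible dates with EINVAL, as implemented above. */
	return (clock_ct_to_ts(&ct, ts));
}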
+
+int
+utc_offset(void)
+{
+
+ return (tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0));
+}
diff --git a/sys/kern/subr_counter.c b/sys/kern/subr_counter.c
new file mode 100644
index 0000000..b3ddc7a
--- /dev/null
+++ b/sys/kern/subr_counter.c
@@ -0,0 +1,107 @@
+/*-
+ * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <vm/uma.h>
+
+#define IN_SUBR_COUNTER_C
+#include <sys/counter.h>
+
+static uma_zone_t uint64_pcpu_zone;
+
+void
+counter_u64_zero(counter_u64_t c)
+{
+
+ counter_u64_zero_inline(c);
+}
+
+uint64_t
+counter_u64_fetch(counter_u64_t c)
+{
+
+ return (counter_u64_fetch_inline(c));
+}
+
+counter_u64_t
+counter_u64_alloc(int flags)
+{
+ counter_u64_t r;
+
+ r = uma_zalloc(uint64_pcpu_zone, flags);
+ if (r != NULL)
+ counter_u64_zero(r);
+
+ return (r);
+}
+
+void
+counter_u64_free(counter_u64_t c)
+{
+
+ uma_zfree(uint64_pcpu_zone, c);
+}
+
+int
+sysctl_handle_counter_u64(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t out;
+ int error;
+
+ out = counter_u64_fetch(*(counter_u64_t *)arg1);
+
+ error = SYSCTL_OUT(req, &out, sizeof(uint64_t));
+
+ if (error || !req->newptr)
+ return (error);
+
+ /*
+ * Any write attempt to a counter zeroes it.
+ */
+ counter_u64_zero(*(counter_u64_t *)arg1);
+
+ return (0);
+}
+
+static void
+counter_startup(void)
+{
+
+ uint64_pcpu_zone = uma_zcreate("uint64 pcpu", sizeof(uint64_t),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU);
+}
+SYSINIT(counter, SI_SUB_KMEM, SI_ORDER_ANY, counter_startup, NULL);
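
A sketch of the intended consumer pattern, with foo_pkts, the SYSINIT placement, and the sysctl location all invented for illustration: allocate once after the per-CPU zone above exists, bump from hot paths with counter_u64_add() (lock-free, per-CPU), and export through sysctl_handle_counter_u64() so that any write from userland zeroes the counter.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/counter.h>

static counter_u64_t foo_pkts;

static void
foo_counter_init(void *dummy __unused)
{

	foo_pkts = counter_u64_alloc(M_WAITOK);
}
/* SI_SUB_DRIVERS is simply "late enough" to run after counter_startup() above. */
SYSINIT(foo_counter, SI_SUB_DRIVERS, SI_ORDER_ANY, foo_counter_init, NULL);

/* Hot path: per-CPU increment, no locks, no shared cache line. */
static void
foo_count_packet(void)
{

	counter_u64_add(foo_pkts, 1);
}

SYSCTL_PROC(_debug, OID_AUTO, foo_pkts,
    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, &foo_pkts, 0,
    sysctl_handle_counter_u64, "QU", "Packets counted by foo");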
diff --git a/sys/kern/subr_devstat.c b/sys/kern/subr_devstat.c
new file mode 100644
index 0000000..c44ef27
--- /dev/null
+++ b/sys/kern/subr_devstat.c
@@ -0,0 +1,604 @@
+/*-
+ * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kdtrace.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/devicestat.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/conf.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/atomic.h>
+
+#ifdef KDTRACE_HOOKS
+#include <sys/dtrace_bsd.h>
+
+dtrace_io_start_probe_func_t dtrace_io_start_probe;
+dtrace_io_done_probe_func_t dtrace_io_done_probe;
+dtrace_io_wait_start_probe_func_t dtrace_io_wait_start_probe;
+dtrace_io_wait_done_probe_func_t dtrace_io_wait_done_probe;
+
+uint32_t dtio_start_id;
+uint32_t dtio_done_id;
+uint32_t dtio_wait_start_id;
+uint32_t dtio_wait_done_id;
+
+#define DTRACE_DEVSTAT_START() \
+ if (dtrace_io_start_probe != NULL) \
+ (*dtrace_io_start_probe)(dtio_start_id, NULL, ds);
+
+#define DTRACE_DEVSTAT_BIO_START() \
+ if (dtrace_io_start_probe != NULL) \
+ (*dtrace_io_start_probe)(dtio_start_id, bp, ds);
+
+#define DTRACE_DEVSTAT_DONE() \
+ if (dtrace_io_done_probe != NULL) \
+ (*dtrace_io_done_probe)(dtio_done_id, NULL, ds);
+
+#define DTRACE_DEVSTAT_BIO_DONE() \
+ if (dtrace_io_done_probe != NULL) \
+ (*dtrace_io_done_probe)(dtio_done_id, bp, ds);
+
+#define DTRACE_DEVSTAT_WAIT_START() \
+ if (dtrace_io_wait_start_probe != NULL) \
+ (*dtrace_io_wait_start_probe)(dtio_wait_start_id, NULL, ds);
+
+#define DTRACE_DEVSTAT_WAIT_DONE() \
+ if (dtrace_io_wait_done_probe != NULL) \
+ (*dtrace_io_wait_done_probe)(dtio_wait_done_id, NULL, ds);
+
+#else /* ! KDTRACE_HOOKS */
+
+#define DTRACE_DEVSTAT_START()
+
+#define DTRACE_DEVSTAT_BIO_START()
+
+#define DTRACE_DEVSTAT_DONE()
+
+#define DTRACE_DEVSTAT_BIO_DONE()
+
+#define DTRACE_DEVSTAT_WAIT_START()
+
+#define DTRACE_DEVSTAT_WAIT_DONE()
+#endif /* KDTRACE_HOOKS */
+
+static int devstat_num_devs;
+static long devstat_generation = 1;
+static int devstat_version = DEVSTAT_VERSION;
+static int devstat_current_devnumber;
+static struct mtx devstat_mutex;
+MTX_SYSINIT(devstat_mutex, &devstat_mutex, "devstat", MTX_DEF);
+
+static struct devstatlist device_statq = STAILQ_HEAD_INITIALIZER(device_statq);
+static struct devstat *devstat_alloc(void);
+static void devstat_free(struct devstat *);
+static void devstat_add_entry(struct devstat *ds, const void *dev_name,
+ int unit_number, uint32_t block_size,
+ devstat_support_flags flags,
+ devstat_type_flags device_type,
+ devstat_priority priority);
+
+/*
+ * Allocate a devstat and initialize it
+ */
+struct devstat *
+devstat_new_entry(const void *dev_name,
+ int unit_number, uint32_t block_size,
+ devstat_support_flags flags,
+ devstat_type_flags device_type,
+ devstat_priority priority)
+{
+ struct devstat *ds;
+
+ mtx_assert(&devstat_mutex, MA_NOTOWNED);
+
+ ds = devstat_alloc();
+ mtx_lock(&devstat_mutex);
+ if (unit_number == -1) {
+ ds->id = dev_name;
+ binuptime(&ds->creation_time);
+ devstat_generation++;
+ } else {
+ devstat_add_entry(ds, dev_name, unit_number, block_size,
+ flags, device_type, priority);
+ }
+ mtx_unlock(&devstat_mutex);
+ return (ds);
+}
+
+/*
+ * Take a malloced and zeroed devstat structure given to us, fill it in
+ * and add it to the queue of devices.
+ */
+static void
+devstat_add_entry(struct devstat *ds, const void *dev_name,
+ int unit_number, uint32_t block_size,
+ devstat_support_flags flags,
+ devstat_type_flags device_type,
+ devstat_priority priority)
+{
+ struct devstatlist *devstat_head;
+ struct devstat *ds_tmp;
+
+ mtx_assert(&devstat_mutex, MA_OWNED);
+ devstat_num_devs++;
+
+ devstat_head = &device_statq;
+
+ /*
+ * Priority sort. Each driver passes in its priority when it adds
+ * its devstat entry. Drivers are sorted first by priority, and
+ * then by probe order.
+ *
+ * For the first device, we just insert it, since the priority
+ * doesn't really matter yet. Subsequent devices are inserted into
+ * the list using the order outlined above.
+ */
+ if (devstat_num_devs == 1)
+ STAILQ_INSERT_TAIL(devstat_head, ds, dev_links);
+ else {
+ STAILQ_FOREACH(ds_tmp, devstat_head, dev_links) {
+ struct devstat *ds_next;
+
+ ds_next = STAILQ_NEXT(ds_tmp, dev_links);
+
+ /*
+ * If we find a break between higher and lower
+ * priority items, and if this item fits in the
+ * break, insert it. This also applies if the
+ * "lower priority item" is the end of the list.
+ */
+ if ((priority <= ds_tmp->priority)
+ && ((ds_next == NULL)
+ || (priority > ds_next->priority))) {
+ STAILQ_INSERT_AFTER(devstat_head, ds_tmp, ds,
+ dev_links);
+ break;
+ } else if (priority > ds_tmp->priority) {
+ /*
+ * If this is the case, we should be able
+ * to insert ourselves at the head of the
+ * list. If we can't, something is wrong.
+ */
+ if (ds_tmp == STAILQ_FIRST(devstat_head)) {
+ STAILQ_INSERT_HEAD(devstat_head,
+ ds, dev_links);
+ break;
+ } else {
+ STAILQ_INSERT_TAIL(devstat_head,
+ ds, dev_links);
+ printf("devstat_add_entry: HELP! "
+ "sorting problem detected "
+ "for name %p unit %d\n",
+ dev_name, unit_number);
+ break;
+ }
+ }
+ }
+ }
+
+ ds->device_number = devstat_current_devnumber++;
+ ds->unit_number = unit_number;
+ strlcpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN);
+ ds->block_size = block_size;
+ ds->flags = flags;
+ ds->device_type = device_type;
+ ds->priority = priority;
+ binuptime(&ds->creation_time);
+ devstat_generation++;
+}
+
+/*
+ * Remove a devstat structure from the list of devices.
+ */
+void
+devstat_remove_entry(struct devstat *ds)
+{
+ struct devstatlist *devstat_head;
+
+ mtx_assert(&devstat_mutex, MA_NOTOWNED);
+ if (ds == NULL)
+ return;
+
+ mtx_lock(&devstat_mutex);
+
+ devstat_head = &device_statq;
+
+ /* Remove this entry from the devstat queue */
+ atomic_add_acq_int(&ds->sequence1, 1);
+ if (ds->id == NULL) {
+ devstat_num_devs--;
+ STAILQ_REMOVE(devstat_head, ds, devstat, dev_links);
+ }
+ devstat_free(ds);
+ devstat_generation++;
+ mtx_unlock(&devstat_mutex);
+}
+
+/*
+ * Record a transaction start.
+ *
+ * See comments for devstat_end_transaction(). Ordering is very important
+ * here.
+ */
+void
+devstat_start_transaction(struct devstat *ds, struct bintime *now)
+{
+
+ mtx_assert(&devstat_mutex, MA_NOTOWNED);
+
+ /* sanity check */
+ if (ds == NULL)
+ return;
+
+ atomic_add_acq_int(&ds->sequence1, 1);
+ /*
+ * We only want to set the start time when we are going from idle
+ * to busy. The start time is really the start of the latest busy
+ * period.
+ */
+ if (ds->start_count == ds->end_count) {
+ if (now != NULL)
+ ds->busy_from = *now;
+ else
+ binuptime(&ds->busy_from);
+ }
+ ds->start_count++;
+ atomic_add_rel_int(&ds->sequence0, 1);
+ DTRACE_DEVSTAT_START();
+}
+
+void
+devstat_start_transaction_bio(struct devstat *ds, struct bio *bp)
+{
+
+ mtx_assert(&devstat_mutex, MA_NOTOWNED);
+
+ /* sanity check */
+ if (ds == NULL)
+ return;
+
+ binuptime(&bp->bio_t0);
+ devstat_start_transaction(ds, &bp->bio_t0);
+ DTRACE_DEVSTAT_BIO_START();
+}
+
+/*
+ * Record the ending of a transaction, and increment the various counters.
+ *
+ * Ordering in this function, and in devstat_start_transaction() is VERY
+ * important. The idea here is to run without locks, so we are very
+ * careful to only modify some fields on the way "down" (i.e. at
+ * transaction start) and some fields on the way "up" (i.e. at transaction
+ * completion). One exception is busy_from, which we only modify in
+ * devstat_start_transaction() when there are no outstanding transactions,
+ * and thus it can't be modified in devstat_end_transaction()
+ * simultaneously.
+ *
+ * The sequence0 and sequence1 fields are provided to enable an application
+ * spying on the structures with mmap(2) to tell when a structure is in a
+ * consistent state or not.
+ *
+ * For this to work 100% reliably, it is important that the two fields
+ * are at opposite ends of the structure and that they are incremented
+ * in the opposite order of how a memcpy(3) in userland would copy them.
+ * We assume that the copying happens front to back, but there is actually
+ * no way short of writing your own memcpy(3) replacement to guarantee
+ * this will be the case.
+ *
+ * In addition to this, being a kind of lock, they must be updated with
+ * atomic instructions using appropriate memory barriers.
+ */
+void
+devstat_end_transaction(struct devstat *ds, uint32_t bytes,
+ devstat_tag_type tag_type, devstat_trans_flags flags,
+ struct bintime *now, struct bintime *then)
+{
+ struct bintime dt, lnow;
+
+ /* sanity check */
+ if (ds == NULL)
+ return;
+
+ if (now == NULL) {
+ now = &lnow;
+ binuptime(now);
+ }
+
+ atomic_add_acq_int(&ds->sequence1, 1);
+ /* Update byte and operations counts */
+ ds->bytes[flags] += bytes;
+ ds->operations[flags]++;
+
+ /*
+ * Keep a count of the various tag types sent.
+ */
+ if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 &&
+ tag_type != DEVSTAT_TAG_NONE)
+ ds->tag_types[tag_type]++;
+
+ if (then != NULL) {
+ /* Update duration of operations */
+ dt = *now;
+ bintime_sub(&dt, then);
+ bintime_add(&ds->duration[flags], &dt);
+ }
+
+ /* Accumulate busy time */
+ dt = *now;
+ bintime_sub(&dt, &ds->busy_from);
+ bintime_add(&ds->busy_time, &dt);
+ ds->busy_from = *now;
+
+ ds->end_count++;
+ atomic_add_rel_int(&ds->sequence0, 1);
+ DTRACE_DEVSTAT_DONE();
+}
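
Put differently, a userland consumer that maps or copies these structures is expected to take snapshots like the sketch below (shared would point into the mapping served by the devstat device defined later in this file) and retry whenever the two sequence numbers disagree.

#include <sys/devicestat.h>

#include <string.h>

/* Copy one devstat until a self-consistent snapshot is observed. */
static void
devstat_snapshot(const struct devstat *shared, struct devstat *snap)
{

	do {
		memcpy(snap, shared, sizeof(*snap));	/* front-to-back copy */
	} while (snap->sequence0 != snap->sequence1);
}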
+
+void
+devstat_end_transaction_bio(struct devstat *ds, struct bio *bp)
+{
+ devstat_trans_flags flg;
+
+ /* sanity check */
+ if (ds == NULL)
+ return;
+
+ if (bp->bio_cmd == BIO_DELETE)
+ flg = DEVSTAT_FREE;
+ else if (bp->bio_cmd == BIO_READ)
+ flg = DEVSTAT_READ;
+ else if (bp->bio_cmd == BIO_WRITE)
+ flg = DEVSTAT_WRITE;
+ else
+ flg = DEVSTAT_NO_DATA;
+
+ devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid,
+ DEVSTAT_TAG_SIMPLE, flg, NULL, &bp->bio_t0);
+ DTRACE_DEVSTAT_BIO_DONE();
+}
+
+/*
+ * This is the sysctl handler for the devstat package. The data pushed out
+ * on the kern.devstat.all sysctl variable consists of the current devstat
+ * generation number, and then an array of devstat structures, one for each
+ * device in the system.
+ *
+ * This is more cryptic than obvious, but basically we neither can nor
+ * want to hold the devstat_mutex for any amount of time, so we grab it
+ * only when we need to and keep an eye on devstat_generation all the time.
+ */
+static int
+sysctl_devstat(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ long mygen;
+ struct devstat *nds;
+
+ mtx_assert(&devstat_mutex, MA_NOTOWNED);
+
+ /*
+ * XXX devstat_generation should really be "volatile" but that
+ * XXX freaks out the sysctl macro below. The places where we
+ * XXX change it and inspect it are bracketed in the mutex which
+ * XXX guarantees us proper write barriers. I don't believe the
+ * XXX compiler is allowed to optimize mygen away across calls
+ * XXX to other functions, so the following is believed to be safe.
+ */
+ mygen = devstat_generation;
+
+ error = SYSCTL_OUT(req, &mygen, sizeof(mygen));
+
+ if (devstat_num_devs == 0)
+		return (0);
+
+ if (error != 0)
+ return (error);
+
+ mtx_lock(&devstat_mutex);
+ nds = STAILQ_FIRST(&device_statq);
+ if (mygen != devstat_generation)
+ error = EBUSY;
+ mtx_unlock(&devstat_mutex);
+
+ if (error != 0)
+ return (error);
+
+ for (;nds != NULL;) {
+ error = SYSCTL_OUT(req, nds, sizeof(struct devstat));
+ if (error != 0)
+ return (error);
+ mtx_lock(&devstat_mutex);
+ if (mygen != devstat_generation)
+ error = EBUSY;
+ else
+ nds = STAILQ_NEXT(nds, dev_links);
+ mtx_unlock(&devstat_mutex);
+ if (error != 0)
+ return (error);
+ }
+	return (error);
+}
+
+/*
+ * Sysctl entries for devstat. The first one is a node that all the rest
+ * hang off of.
+ */
+static SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, NULL,
+ "Device Statistics");
+
+SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE,
+ NULL, 0, sysctl_devstat, "S,devstat", "All devices in the devstat list");
+/*
+ * Export the number of devices in the system so that userland utilities
+ * can determine how much memory to allocate to hold all the devices.
+ */
+SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD,
+ &devstat_num_devs, 0, "Number of devices in the devstat list");
+SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD,
+ &devstat_generation, 0, "Devstat list generation");
+SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD,
+ &devstat_version, 0, "Devstat list version number");
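
The matching userland read, sketched here with error handling trimmed down (the fetch_devstats() wrapper is hypothetical), mirrors what the handler above emits: size the buffer from kern.devstat.numdevs, fetch kern.devstat.all, treat EBUSY as "generation changed, start over", and find the devstat array right behind the leading generation long. The caller frees the underlying buffer at (char *)retval - sizeof(long).

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/devicestat.h>

#include <stdlib.h>
#include <string.h>

static struct devstat *
fetch_devstats(long *generation, int *numdevs)
{
	char *buf;
	size_t len;

	len = sizeof(*numdevs);
	if (sysctlbyname("kern.devstat.numdevs", numdevs, &len, NULL, 0) != 0)
		return (NULL);

	len = sizeof(long) + *numdevs * sizeof(struct devstat);
	if ((buf = malloc(len)) == NULL)
		return (NULL);
	if (sysctlbyname("kern.devstat.all", buf, &len, NULL, 0) != 0) {
		free(buf);		/* EBUSY: list changed mid-copy, retry */
		return (NULL);
	}
	memcpy(generation, buf, sizeof(*generation));
	return ((struct devstat *)(buf + sizeof(long)));
}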
+
+/*
+ * Allocator for struct devstat structures. We sub-allocate these from pages
+ * which we get from malloc. These pages are exported for mmap(2)'ing through
+ * a miniature device driver
+ */
+
+#define statsperpage (PAGE_SIZE / sizeof(struct devstat))
+
+static d_mmap_t devstat_mmap;
+
+static struct cdevsw devstat_cdevsw = {
+ .d_version = D_VERSION,
+ .d_flags = D_NEEDGIANT,
+ .d_mmap = devstat_mmap,
+ .d_name = "devstat",
+};
+
+struct statspage {
+ TAILQ_ENTRY(statspage) list;
+ struct devstat *stat;
+ u_int nfree;
+};
+
+static TAILQ_HEAD(, statspage) pagelist = TAILQ_HEAD_INITIALIZER(pagelist);
+static MALLOC_DEFINE(M_DEVSTAT, "devstat", "Device statistics");
+
+static int
+devstat_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
+ int nprot, vm_memattr_t *memattr)
+{
+ struct statspage *spp;
+
+ if (nprot != VM_PROT_READ)
+ return (-1);
+ TAILQ_FOREACH(spp, &pagelist, list) {
+ if (offset == 0) {
+ *paddr = vtophys(spp->stat);
+ return (0);
+ }
+ offset -= PAGE_SIZE;
+ }
+ return (-1);
+}
+
+static struct devstat *
+devstat_alloc(void)
+{
+ struct devstat *dsp;
+ struct statspage *spp, *spp2;
+ u_int u;
+ static int once;
+
+ mtx_assert(&devstat_mutex, MA_NOTOWNED);
+ if (!once) {
+ make_dev_credf(MAKEDEV_ETERNAL | MAKEDEV_CHECKNAME,
+ &devstat_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
+ DEVSTAT_DEVICE_NAME);
+ once = 1;
+ }
+ spp2 = NULL;
+ mtx_lock(&devstat_mutex);
+ for (;;) {
+ TAILQ_FOREACH(spp, &pagelist, list) {
+ if (spp->nfree > 0)
+ break;
+ }
+ if (spp != NULL)
+ break;
+ mtx_unlock(&devstat_mutex);
+ spp2 = malloc(sizeof *spp, M_DEVSTAT, M_ZERO | M_WAITOK);
+ spp2->stat = malloc(PAGE_SIZE, M_DEVSTAT, M_ZERO | M_WAITOK);
+ spp2->nfree = statsperpage;
+
+ /*
+ * If free statspages were added while the lock was released
+ * just reuse them.
+ */
+ mtx_lock(&devstat_mutex);
+ TAILQ_FOREACH(spp, &pagelist, list)
+ if (spp->nfree > 0)
+ break;
+ if (spp == NULL) {
+ spp = spp2;
+
+ /*
+ * It would make more sense to add the new page at the
+			 * head, but the order on the list determines the
+			 * sequence of the mapping, so we can't do that.
+ */
+ TAILQ_INSERT_TAIL(&pagelist, spp, list);
+ } else
+ break;
+ }
+ dsp = spp->stat;
+ for (u = 0; u < statsperpage; u++) {
+ if (dsp->allocated == 0)
+ break;
+ dsp++;
+ }
+ spp->nfree--;
+ dsp->allocated = 1;
+ mtx_unlock(&devstat_mutex);
+ if (spp2 != NULL && spp2 != spp) {
+ free(spp2->stat, M_DEVSTAT);
+ free(spp2, M_DEVSTAT);
+ }
+ return (dsp);
+}
+
+static void
+devstat_free(struct devstat *dsp)
+{
+ struct statspage *spp;
+
+ mtx_assert(&devstat_mutex, MA_OWNED);
+ bzero(dsp, sizeof *dsp);
+ TAILQ_FOREACH(spp, &pagelist, list) {
+ if (dsp >= spp->stat && dsp < (spp->stat + statsperpage)) {
+ spp->nfree++;
+ return;
+ }
+ }
+}
+
+SYSCTL_INT(_debug_sizeof, OID_AUTO, devstat, CTLFLAG_RD,
+ NULL, sizeof(struct devstat), "sizeof(struct devstat)");
diff --git a/sys/kern/subr_disk.c b/sys/kern/subr_disk.c
new file mode 100644
index 0000000..2391540
--- /dev/null
+++ b/sys/kern/subr_disk.c
@@ -0,0 +1,267 @@
+/*-
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ *
+ * The bioq_disksort() (and the specification of the bioq API)
+ * have been written by Luigi Rizzo and Fabio Checconi under the same
+ * license as above.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_geom.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/conf.h>
+#include <sys/disk.h>
+#include <geom/geom_disk.h>
+
+/*-
+ * Disk error is the preface to plaintive error messages
+ * about failing disk transfers. It prints messages of the form
+ * "hp0g: BLABLABLA cmd=read fsbn 12345 of 12344-12347"
+ * blkdone should be -1 if the position of the error is unknown.
+ * The message is printed with printf.
+ */
+void
+disk_err(struct bio *bp, const char *what, int blkdone, int nl)
+{
+ daddr_t sn;
+
+ if (bp->bio_dev != NULL)
+ printf("%s: %s ", devtoname(bp->bio_dev), what);
+ else if (bp->bio_disk != NULL)
+ printf("%s%d: %s ",
+ bp->bio_disk->d_name, bp->bio_disk->d_unit, what);
+ else
+ printf("disk??: %s ", what);
+ switch(bp->bio_cmd) {
+ case BIO_READ: printf("cmd=read "); break;
+ case BIO_WRITE: printf("cmd=write "); break;
+ case BIO_DELETE: printf("cmd=delete "); break;
+ case BIO_GETATTR: printf("cmd=getattr "); break;
+ case BIO_FLUSH: printf("cmd=flush "); break;
+ default: printf("cmd=%x ", bp->bio_cmd); break;
+ }
+ sn = bp->bio_pblkno;
+ if (bp->bio_bcount <= DEV_BSIZE) {
+ printf("fsbn %jd%s", (intmax_t)sn, nl ? "\n" : "");
+ return;
+ }
+ if (blkdone >= 0) {
+ sn += blkdone;
+ printf("fsbn %jd of ", (intmax_t)sn);
+ }
+ printf("%jd-%jd", (intmax_t)bp->bio_pblkno,
+ (intmax_t)(bp->bio_pblkno + (bp->bio_bcount - 1) / DEV_BSIZE));
+ if (nl)
+ printf("\n");
+}
+
+/*
+ * BIO queue implementation
+ *
+ * Please read carefully the description below before making any change
+ * to the code, or you might change the behaviour of the data structure
+ * in undesirable ways.
+ *
+ * A bioq stores disk I/O requests (bio), normally sorted according to
+ * the distance of the requested position (bio->bio_offset) from the
+ * current head position (bioq->last_offset) in the scan direction, i.e.
+ *
+ * (uoff_t)(bio_offset - last_offset)
+ *
+ * Note that the cast to unsigned (uoff_t) is fundamental to ensure
+ * that the distance is computed in the scan direction.
+ *
+ * The main methods for manipulating the bioq are:
+ *
+ * bioq_disksort() performs an ordered insertion;
+ *
+ *	bioq_first()	returns the head of the queue without removing it;
+ *
+ *	bioq_takefirst()	returns and removes the head of the queue,
+ * updating the 'current head position' as
+ * bioq->last_offset = bio->bio_offset + bio->bio_length;
+ *
+ * When updating the 'current head position', we assume that the result of
+ * bioq_takefirst() is dispatched to the device, so bioq->last_offset
+ * represents the head position once the request is complete.
+ *
+ * If the bioq is manipulated using only the above calls, it starts
+ * with a sorted sequence of requests with bio_offset >= last_offset,
+ * possibly followed by another sorted sequence of requests with
+ * 0 <= bio_offset < bioq->last_offset
+ *
+ * NOTE: historical behaviour was to ignore bio->bio_length in the
+ * update, but using it tracks the head position more accurately.
+ * Historical behaviour was also to update the head position when
+ * the request under service completed, rather than when the
+ * request is extracted from the queue. However, the current API
+ * has no method to update the head position at completion time;
+ * moreover, once a request has been submitted to the disk, we have
+ * no idea of the actual head position, so the final one is our best guess.
+ *
+ * --- Direct queue manipulation ---
+ *
+ * A bioq uses an underlying TAILQ to store requests, so we also
+ * export methods to manipulate the TAILQ, in particular:
+ *
+ *	bioq_insert_tail()	inserts an entry at the end.
+ *		It also creates a 'barrier' so all subsequent
+ *		insertions through bioq_disksort() will end up
+ *		after this entry;
+ *
+ *	bioq_insert_head()	inserts an entry at the head and updates
+ *		bioq->last_offset = bio->bio_offset so that
+ *		all subsequent insertions through bioq_disksort()
+ *		will end up after this entry;
+ *
+ *	bioq_remove()	removes a generic element from the queue; it acts
+ *		as bioq_takefirst() if invoked on the head of the queue.
+ *
+ * The semantics of these methods are the same as the operations
+ * on the underlying TAILQ, but with additional guarantees on
+ * subsequent bioq_disksort() calls. E.g. bioq_insert_tail()
+ * can be useful for making sure that all previous ops are flushed
+ * to disk before continuing.
+ *
+ * Updating bioq->last_offset on a bioq_insert_head() guarantees
+ * that the bio inserted with the last bioq_insert_head() will stay
+ * at the head of the queue even after subsequent bioq_disksort().
+ *
+ * Note that when the direct queue manipulation functions are used,
+ * the queue may contain multiple inversion points (i.e. more than
+ * two sorted sequences of requests).
+ *
+ */
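+
+/*
+ * Worked example of the sorting key (the offsets below are hypothetical,
+ * chosen only to illustrate the wrap-around behaviour described above):
+ * with last_offset = 1000, a request at offset 1500 gets key
+ * (uoff_t)(1500 - 1000) = 500, while a request at offset 400 wraps to
+ * the very large key (uoff_t)(400 - 1000), so it sorts after every
+ * request at or beyond the current head position, giving the one-way
+ * elevator order that bioq_disksort() relies on.
+ */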
+
+void
+bioq_init(struct bio_queue_head *head)
+{
+
+ TAILQ_INIT(&head->queue);
+ head->last_offset = 0;
+ head->insert_point = NULL;
+}
+
+void
+bioq_remove(struct bio_queue_head *head, struct bio *bp)
+{
+
+ if (head->insert_point == NULL) {
+ if (bp == TAILQ_FIRST(&head->queue))
+ head->last_offset = bp->bio_offset + bp->bio_length;
+ } else if (bp == head->insert_point)
+ head->insert_point = NULL;
+
+ TAILQ_REMOVE(&head->queue, bp, bio_queue);
+}
+
+void
+bioq_flush(struct bio_queue_head *head, struct devstat *stp, int error)
+{
+ struct bio *bp;
+
+ while ((bp = bioq_takefirst(head)) != NULL)
+ biofinish(bp, stp, error);
+}
+
+void
+bioq_insert_head(struct bio_queue_head *head, struct bio *bp)
+{
+
+ if (head->insert_point == NULL)
+ head->last_offset = bp->bio_offset;
+ TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue);
+}
+
+void
+bioq_insert_tail(struct bio_queue_head *head, struct bio *bp)
+{
+
+ TAILQ_INSERT_TAIL(&head->queue, bp, bio_queue);
+ head->insert_point = bp;
+ head->last_offset = bp->bio_offset;
+}
+
+struct bio *
+bioq_first(struct bio_queue_head *head)
+{
+
+ return (TAILQ_FIRST(&head->queue));
+}
+
+struct bio *
+bioq_takefirst(struct bio_queue_head *head)
+{
+ struct bio *bp;
+
+ bp = TAILQ_FIRST(&head->queue);
+ if (bp != NULL)
+ bioq_remove(head, bp);
+ return (bp);
+}
+
+/*
+ * Compute the sorting key. The cast to unsigned is
+ * fundamental for correctness, see the description
+ * near the beginning of the file.
+ */
+static inline uoff_t
+bioq_bio_key(struct bio_queue_head *head, struct bio *bp)
+{
+
+ return ((uoff_t)(bp->bio_offset - head->last_offset));
+}
+
+/*
+ * Seek sort for disks.
+ *
+ * Sort all requests in a single queue while keeping
+ * track of the current position of the disk with last_offset.
+ * See above for details.
+ */
+void
+bioq_disksort(struct bio_queue_head *head, struct bio *bp)
+{
+ struct bio *cur, *prev;
+ uoff_t key;
+
+ if ((bp->bio_flags & BIO_ORDERED) != 0) {
+ /*
+ * Ordered transactions can only be dispatched
+ * after any currently queued transactions. They
+ * also have barrier semantics - no transactions
+ * queued in the future can pass them.
+ */
+ bioq_insert_tail(head, bp);
+ return;
+ }
+
+ prev = NULL;
+ key = bioq_bio_key(head, bp);
+ cur = TAILQ_FIRST(&head->queue);
+
+ if (head->insert_point) {
+ prev = head->insert_point;
+ cur = TAILQ_NEXT(head->insert_point, bio_queue);
+ }
+
+ while (cur != NULL && key >= bioq_bio_key(head, cur)) {
+ prev = cur;
+ cur = TAILQ_NEXT(cur, bio_queue);
+ }
+
+ if (prev == NULL)
+ TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue);
+ else
+ TAILQ_INSERT_AFTER(&head->queue, prev, bp, bio_queue);
+}
diff --git a/sys/kern/subr_dummy_vdso_tc.c b/sys/kern/subr_dummy_vdso_tc.c
new file mode 100644
index 0000000..9c84501
--- /dev/null
+++ b/sys/kern/subr_dummy_vdso_tc.c
@@ -0,0 +1,49 @@
+/*-
+ * Copyright 2012 Konstantin Belousov <kib@FreeBSD.ORG>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/vdso.h>
+
+uint32_t
+cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th)
+{
+
+ return (0);
+}
+
+#ifdef COMPAT_FREEBSD32
+uint32_t
+cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32)
+{
+
+ return (0);
+}
+#endif
diff --git a/sys/kern/subr_eventhandler.c b/sys/kern/subr_eventhandler.c
new file mode 100644
index 0000000..5894099
--- /dev/null
+++ b/sys/kern/subr_eventhandler.c
@@ -0,0 +1,280 @@
+/*-
+ * Copyright (c) 1999 Michael Smith <msmith@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+
+static MALLOC_DEFINE(M_EVENTHANDLER, "eventhandler", "Event handler records");
+
+/* List of 'slow' lists */
+static TAILQ_HEAD(, eventhandler_list) eventhandler_lists;
+static int eventhandler_lists_initted = 0;
+static struct mtx eventhandler_mutex;
+
+struct eventhandler_entry_generic
+{
+ struct eventhandler_entry ee;
+ void (* func)(void);
+};
+
+static struct eventhandler_list *_eventhandler_find_list(const char *name);
+
+/*
+ * Initialize the eventhandler mutex and list.
+ */
+static void
+eventhandler_init(void *dummy __unused)
+{
+ TAILQ_INIT(&eventhandler_lists);
+ mtx_init(&eventhandler_mutex, "eventhandler", NULL, MTX_DEF);
+ atomic_store_rel_int(&eventhandler_lists_initted, 1);
+}
+SYSINIT(eventhandlers, SI_SUB_EVENTHANDLER, SI_ORDER_FIRST, eventhandler_init,
+ NULL);
+
+/*
+ * Insertion is O(n) due to the priority scan, but optimises to O(1)
+ * if all priorities are identical.
+ */
+static eventhandler_tag
+eventhandler_register_internal(struct eventhandler_list *list,
+ const char *name, eventhandler_tag epn)
+{
+ struct eventhandler_list *new_list;
+ struct eventhandler_entry *ep;
+
+ KASSERT(eventhandler_lists_initted, ("eventhandler registered too early"));
+ KASSERT(epn != NULL, ("%s: cannot register NULL event", __func__));
+
+ /* lock the eventhandler lists */
+ mtx_lock(&eventhandler_mutex);
+
+ /* Do we need to find/create the (slow) list? */
+ if (list == NULL) {
+ /* look for a matching, existing list */
+ list = _eventhandler_find_list(name);
+
+ /* Do we need to create the list? */
+ if (list == NULL) {
+ mtx_unlock(&eventhandler_mutex);
+
+ new_list = malloc(sizeof(struct eventhandler_list) +
+ strlen(name) + 1, M_EVENTHANDLER, M_WAITOK);
+
+ /* If someone else created it already, then use that one. */
+ mtx_lock(&eventhandler_mutex);
+ list = _eventhandler_find_list(name);
+ if (list != NULL) {
+ free(new_list, M_EVENTHANDLER);
+ } else {
+ CTR2(KTR_EVH, "%s: creating list \"%s\"", __func__, name);
+ list = new_list;
+ list->el_flags = 0;
+ list->el_runcount = 0;
+ bzero(&list->el_lock, sizeof(list->el_lock));
+ list->el_name = (char *)list + sizeof(struct eventhandler_list);
+ strcpy(list->el_name, name);
+ TAILQ_INSERT_HEAD(&eventhandler_lists, list, el_link);
+ }
+ }
+ }
+ if (!(list->el_flags & EHL_INITTED)) {
+ TAILQ_INIT(&list->el_entries);
+ mtx_init(&list->el_lock, name, "eventhandler list", MTX_DEF);
+ atomic_store_rel_int(&list->el_flags, EHL_INITTED);
+ }
+ mtx_unlock(&eventhandler_mutex);
+
+ KASSERT(epn->ee_priority != EHE_DEAD_PRIORITY,
+ ("%s: handler for %s registered with dead priority", __func__, name));
+
+ /* sort it into the list */
+ CTR4(KTR_EVH, "%s: adding item %p (function %p) to \"%s\"", __func__, epn,
+ ((struct eventhandler_entry_generic *)epn)->func, name);
+ EHL_LOCK(list);
+ TAILQ_FOREACH(ep, &list->el_entries, ee_link) {
+ if (ep->ee_priority != EHE_DEAD_PRIORITY &&
+ epn->ee_priority < ep->ee_priority) {
+ TAILQ_INSERT_BEFORE(ep, epn, ee_link);
+ break;
+ }
+ }
+ if (ep == NULL)
+ TAILQ_INSERT_TAIL(&list->el_entries, epn, ee_link);
+ EHL_UNLOCK(list);
+ return(epn);
+}
+
+eventhandler_tag
+eventhandler_register(struct eventhandler_list *list, const char *name,
+ void *func, void *arg, int priority)
+{
+ struct eventhandler_entry_generic *eg;
+
+ /* allocate an entry for this handler, populate it */
+ eg = malloc(sizeof(struct eventhandler_entry_generic), M_EVENTHANDLER,
+ M_WAITOK | M_ZERO);
+ eg->func = func;
+ eg->ee.ee_arg = arg;
+ eg->ee.ee_priority = priority;
+
+ return (eventhandler_register_internal(list, name, &eg->ee));
+}
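+
+/*
+ * Illustrative usage sketch (not part of this file): consumers normally
+ * go through the macros in <sys/eventhandler.h> rather than calling
+ * eventhandler_register() directly.  The handler name foo_shutdown and
+ * its softc argument sc below are hypothetical:
+ *
+ *	eventhandler_tag tag;
+ *
+ *	tag = EVENTHANDLER_REGISTER(shutdown_final, foo_shutdown, sc,
+ *	    SHUTDOWN_PRI_DEFAULT);
+ *	...
+ *	EVENTHANDLER_DEREGISTER(shutdown_final, tag);
+ */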
+
+#ifdef VIMAGE
+struct eventhandler_entry_generic_vimage
+{
+ struct eventhandler_entry ee;
+ vimage_iterator_func_t func; /* Vimage iterator function. */
+ struct eventhandler_entry_vimage v_ee; /* Original func, arg. */
+};
+
+eventhandler_tag
+vimage_eventhandler_register(struct eventhandler_list *list, const char *name,
+ void *func, void *arg, int priority, vimage_iterator_func_t iterfunc)
+{
+ struct eventhandler_entry_generic_vimage *eg;
+
+ /* allocate an entry for this handler, populate it */
+ eg = malloc(sizeof(struct eventhandler_entry_generic_vimage),
+ M_EVENTHANDLER, M_WAITOK | M_ZERO);
+ eg->func = iterfunc;
+ eg->v_ee.func = func;
+ eg->v_ee.ee_arg = arg;
+ eg->ee.ee_arg = &eg->v_ee;
+ eg->ee.ee_priority = priority;
+
+ return (eventhandler_register_internal(list, name, &eg->ee));
+}
+#endif
+
+void
+eventhandler_deregister(struct eventhandler_list *list, eventhandler_tag tag)
+{
+ struct eventhandler_entry *ep = tag;
+
+ EHL_LOCK_ASSERT(list, MA_OWNED);
+ if (ep != NULL) {
+ /* remove just this entry */
+ if (list->el_runcount == 0) {
+ CTR3(KTR_EVH, "%s: removing item %p from \"%s\"", __func__, ep,
+ list->el_name);
+ TAILQ_REMOVE(&list->el_entries, ep, ee_link);
+ free(ep, M_EVENTHANDLER);
+ } else {
+ CTR3(KTR_EVH, "%s: marking item %p from \"%s\" as dead", __func__,
+ ep, list->el_name);
+ ep->ee_priority = EHE_DEAD_PRIORITY;
+ }
+ } else {
+ /* remove entire list */
+ if (list->el_runcount == 0) {
+ CTR2(KTR_EVH, "%s: removing all items from \"%s\"", __func__,
+ list->el_name);
+ while (!TAILQ_EMPTY(&list->el_entries)) {
+ ep = TAILQ_FIRST(&list->el_entries);
+ TAILQ_REMOVE(&list->el_entries, ep, ee_link);
+ free(ep, M_EVENTHANDLER);
+ }
+ } else {
+ CTR2(KTR_EVH, "%s: marking all items from \"%s\" as dead",
+ __func__, list->el_name);
+ TAILQ_FOREACH(ep, &list->el_entries, ee_link)
+ ep->ee_priority = EHE_DEAD_PRIORITY;
+ }
+ }
+ while (list->el_runcount > 0)
+ mtx_sleep(list, &list->el_lock, 0, "evhrm", 0);
+ EHL_UNLOCK(list);
+}
+
+/*
+ * Internal version for use when eventhandler list is already locked.
+ */
+static struct eventhandler_list *
+_eventhandler_find_list(const char *name)
+{
+ struct eventhandler_list *list;
+
+ mtx_assert(&eventhandler_mutex, MA_OWNED);
+ TAILQ_FOREACH(list, &eventhandler_lists, el_link) {
+ if (!strcmp(name, list->el_name))
+ break;
+ }
+ return (list);
+}
+
+/*
+ * Lookup a "slow" list by name. Returns with the list locked.
+ */
+struct eventhandler_list *
+eventhandler_find_list(const char *name)
+{
+ struct eventhandler_list *list;
+
+ if (!eventhandler_lists_initted)
+ return(NULL);
+
+ /* scan looking for the requested list */
+ mtx_lock(&eventhandler_mutex);
+ list = _eventhandler_find_list(name);
+ if (list != NULL)
+ EHL_LOCK(list);
+ mtx_unlock(&eventhandler_mutex);
+
+ return(list);
+}
+
+/*
+ * Prune "dead" entries from an eventhandler list.
+ */
+void
+eventhandler_prune_list(struct eventhandler_list *list)
+{
+ struct eventhandler_entry *ep, *en;
+ int pruned = 0;
+
+ CTR2(KTR_EVH, "%s: pruning list \"%s\"", __func__, list->el_name);
+ EHL_LOCK_ASSERT(list, MA_OWNED);
+ TAILQ_FOREACH_SAFE(ep, &list->el_entries, ee_link, en) {
+ if (ep->ee_priority == EHE_DEAD_PRIORITY) {
+ TAILQ_REMOVE(&list->el_entries, ep, ee_link);
+ free(ep, M_EVENTHANDLER);
+ pruned++;
+ }
+ }
+ if (pruned > 0)
+ wakeup(list);
+}
diff --git a/sys/kern/subr_fattime.c b/sys/kern/subr_fattime.c
new file mode 100644
index 0000000..1fb207e
--- /dev/null
+++ b/sys/kern/subr_fattime.c
@@ -0,0 +1,307 @@
+/*-
+ * Copyright (c) 2006 Poul-Henning Kamp
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ * Convert MS-DOS FAT format timestamps to and from unix timespecs
+ *
+ * FAT file timestamps originally consisted of two 16-bit integers, encoded like
+ * this:
+ *
+ * yyyyyyymmmmddddd (year - 1980, month, day)
+ *
+ * hhhhhmmmmmmsssss (hour, minutes, seconds divided by two)
+ *
+ * Subsequently even Microsoft realized that files could be accessed in less
+ * than two seconds and a byte was added containing:
+ *
+ * sfffffff (second mod two, 100ths of second)
+ *
+ * FAT timestamps are in the local timezone, with no indication of which
+ * timezone, much less whether daylight saving time applies.
+ *
+ * Later, in Windows NT, timestamps were defined relative to GMT.
+ *
+ * Purists will point out that UTC replaced GMT for such uses around
+ * a century ago, already then. Ironically "NT" was an abbreviation of
+ * "New Technology". Anyway...
+ *
+ * The 'utc' argument determines if the resulting FATTIME timestamp
+ * should be on the UTC or local timezone calendar.
+ *
+ * The conversion functions below cut time into four-year leap-year
+ * cycles rather than single years and use table lookups inside those
+ * cycles to get the months and years sorted out.
+ *
+ * Obviously we cannot calculate the correct table index going from
+ * a posix seconds count to Y/M/D, but we can get pretty close by
+ * dividing the daycount by 32 (which gives an index that is too low),
+ * and then adjusting upwards a couple of steps if necessary.
+ *
+ * FAT timestamps have 7 bits for the year and start at 1980, so
+ * they can represent dates up to 2107, which means that the non-leap
+ * year 2100 must be handled.
+ *
+ * XXX: As long as time_t is 32 bits this is not relevant or easily
+ * XXX: testable. Revisit when time_t grows bigger.
+ * XXX: grepfodder: 64 bit time_t, y2100, y2.1k, 2100, leap year
+ *
+ */
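+
+/*
+ * Worked example of the encoding (the date below is hypothetical and
+ * used only for illustration): 2012-03-15 12:34:56 is stored as
+ *
+ *	dd = ((2012 - 1980) << 9) | (3 << 5) | 15 = 0x406f
+ *	dt = (12 << 11) | (34 << 5) | (56 / 2)    = 0x645c
+ *	dh = (56 % 2) * 100 + hundredths          = hundredths
+ */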
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/clock.h>
+
+#define DAY (24 * 60 * 60) /* Length of day in seconds */
+#define YEAR 365 /* Length of normal year */
+#define LYC (4 * YEAR + 1) /* Length of 4 year leap-year cycle */
+#define T1980 (10 * 365 + 2) /* Days from 1970 to 1980 */
+
+/* End of month is N days from start of (normal) year */
+#define JAN 31
+#define FEB (JAN + 28)
+#define MAR (FEB + 31)
+#define APR (MAR + 30)
+#define MAY (APR + 31)
+#define JUN (MAY + 30)
+#define JUL (JUN + 31)
+#define AUG (JUL + 31)
+#define SEP (AUG + 30)
+#define OCT (SEP + 31)
+#define NOV (OCT + 30)
+#define DEC (NOV + 31)
+
+/* Table of months in a 4 year leap-year cycle */
+
+#define ENC(y,m) (((y) << 9) | ((m) << 5))
+
+static const struct {
+ uint16_t days; /* month start in days relative to cycle */
+ uint16_t coded; /* encoded year + month information */
+} mtab[48] = {
+ { 0 + 0 * YEAR, ENC(0, 1) },
+
+ { JAN + 0 * YEAR, ENC(0, 2) }, { FEB + 0 * YEAR + 1, ENC(0, 3) },
+ { MAR + 0 * YEAR + 1, ENC(0, 4) }, { APR + 0 * YEAR + 1, ENC(0, 5) },
+ { MAY + 0 * YEAR + 1, ENC(0, 6) }, { JUN + 0 * YEAR + 1, ENC(0, 7) },
+ { JUL + 0 * YEAR + 1, ENC(0, 8) }, { AUG + 0 * YEAR + 1, ENC(0, 9) },
+ { SEP + 0 * YEAR + 1, ENC(0, 10) }, { OCT + 0 * YEAR + 1, ENC(0, 11) },
+ { NOV + 0 * YEAR + 1, ENC(0, 12) }, { DEC + 0 * YEAR + 1, ENC(1, 1) },
+
+ { JAN + 1 * YEAR + 1, ENC(1, 2) }, { FEB + 1 * YEAR + 1, ENC(1, 3) },
+ { MAR + 1 * YEAR + 1, ENC(1, 4) }, { APR + 1 * YEAR + 1, ENC(1, 5) },
+ { MAY + 1 * YEAR + 1, ENC(1, 6) }, { JUN + 1 * YEAR + 1, ENC(1, 7) },
+ { JUL + 1 * YEAR + 1, ENC(1, 8) }, { AUG + 1 * YEAR + 1, ENC(1, 9) },
+ { SEP + 1 * YEAR + 1, ENC(1, 10) }, { OCT + 1 * YEAR + 1, ENC(1, 11) },
+ { NOV + 1 * YEAR + 1, ENC(1, 12) }, { DEC + 1 * YEAR + 1, ENC(2, 1) },
+
+ { JAN + 2 * YEAR + 1, ENC(2, 2) }, { FEB + 2 * YEAR + 1, ENC(2, 3) },
+ { MAR + 2 * YEAR + 1, ENC(2, 4) }, { APR + 2 * YEAR + 1, ENC(2, 5) },
+ { MAY + 2 * YEAR + 1, ENC(2, 6) }, { JUN + 2 * YEAR + 1, ENC(2, 7) },
+ { JUL + 2 * YEAR + 1, ENC(2, 8) }, { AUG + 2 * YEAR + 1, ENC(2, 9) },
+ { SEP + 2 * YEAR + 1, ENC(2, 10) }, { OCT + 2 * YEAR + 1, ENC(2, 11) },
+ { NOV + 2 * YEAR + 1, ENC(2, 12) }, { DEC + 2 * YEAR + 1, ENC(3, 1) },
+
+ { JAN + 3 * YEAR + 1, ENC(3, 2) }, { FEB + 3 * YEAR + 1, ENC(3, 3) },
+ { MAR + 3 * YEAR + 1, ENC(3, 4) }, { APR + 3 * YEAR + 1, ENC(3, 5) },
+ { MAY + 3 * YEAR + 1, ENC(3, 6) }, { JUN + 3 * YEAR + 1, ENC(3, 7) },
+ { JUL + 3 * YEAR + 1, ENC(3, 8) }, { AUG + 3 * YEAR + 1, ENC(3, 9) },
+ { SEP + 3 * YEAR + 1, ENC(3, 10) }, { OCT + 3 * YEAR + 1, ENC(3, 11) },
+ { NOV + 3 * YEAR + 1, ENC(3, 12) }
+};
+
+
+void
+timespec2fattime(struct timespec *tsp, int utc, uint16_t *ddp, uint16_t *dtp, uint8_t *dhp)
+{
+ time_t t1;
+ unsigned t2, l, m;
+
+ t1 = tsp->tv_sec;
+ if (!utc)
+ t1 -= utc_offset();
+
+ if (dhp != NULL)
+ *dhp = (tsp->tv_sec & 1) * 100 + tsp->tv_nsec / 10000000;
+ if (dtp != NULL) {
+ *dtp = (t1 / 2) % 30;
+ *dtp |= ((t1 / 60) % 60) << 5;
+ *dtp |= ((t1 / 3600) % 24) << 11;
+ }
+ if (ddp != NULL) {
+ t2 = t1 / DAY;
+ if (t2 < T1980) {
+ /* Impossible date, truncate to 1980-01-01 */
+ *ddp = 0x0021;
+ } else {
+ t2 -= T1980;
+
+ /*
+ * 2100 is not a leap year.
+ * XXX: a 32 bit time_t can not get us here.
+ */
+ if (t2 >= ((2100 - 1980) / 4 * LYC + FEB))
+ t2++;
+
+ /* Account for full leapyear cycles */
+ l = t2 / LYC;
+ *ddp = (l * 4) << 9;
+ t2 -= l * LYC;
+
+ /* Find approximate table entry */
+ m = t2 / 32;
+
+ /* Find correct table entry */
+ while (m < 47 && mtab[m + 1].days <= t2)
+ m++;
+
+ /* Get year + month from the table */
+ *ddp += mtab[m].coded;
+
+ /* And apply the day in the month */
+ t2 -= mtab[m].days - 1;
+ *ddp |= t2;
+ }
+ }
+}
+
+/*
+ * Table indexed by the bottom two bits of year + four bits of the month
+ * from the FAT timestamp, returning number of days into 4 year long
+ * leap-year cycle
+ */
+
+#define DCOD(m, y, l) ((m) + YEAR * (y) + (l))
+static const uint16_t daytab[64] = {
+ 0, DCOD( 0, 0, 0), DCOD(JAN, 0, 0), DCOD(FEB, 0, 1),
+ DCOD(MAR, 0, 1), DCOD(APR, 0, 1), DCOD(MAY, 0, 1), DCOD(JUN, 0, 1),
+ DCOD(JUL, 0, 1), DCOD(AUG, 0, 1), DCOD(SEP, 0, 1), DCOD(OCT, 0, 1),
+ DCOD(NOV, 0, 1), DCOD(DEC, 0, 1), 0, 0,
+ 0, DCOD( 0, 1, 1), DCOD(JAN, 1, 1), DCOD(FEB, 1, 1),
+ DCOD(MAR, 1, 1), DCOD(APR, 1, 1), DCOD(MAY, 1, 1), DCOD(JUN, 1, 1),
+ DCOD(JUL, 1, 1), DCOD(AUG, 1, 1), DCOD(SEP, 1, 1), DCOD(OCT, 1, 1),
+ DCOD(NOV, 1, 1), DCOD(DEC, 1, 1), 0, 0,
+ 0, DCOD( 0, 2, 1), DCOD(JAN, 2, 1), DCOD(FEB, 2, 1),
+ DCOD(MAR, 2, 1), DCOD(APR, 2, 1), DCOD(MAY, 2, 1), DCOD(JUN, 2, 1),
+ DCOD(JUL, 2, 1), DCOD(AUG, 2, 1), DCOD(SEP, 2, 1), DCOD(OCT, 2, 1),
+ DCOD(NOV, 2, 1), DCOD(DEC, 2, 1), 0, 0,
+ 0, DCOD( 0, 3, 1), DCOD(JAN, 3, 1), DCOD(FEB, 3, 1),
+ DCOD(MAR, 3, 1), DCOD(APR, 3, 1), DCOD(MAY, 3, 1), DCOD(JUN, 3, 1),
+ DCOD(JUL, 3, 1), DCOD(AUG, 3, 1), DCOD(SEP, 3, 1), DCOD(OCT, 3, 1),
+ DCOD(NOV, 3, 1), DCOD(DEC, 3, 1), 0, 0
+};
+
+void
+fattime2timespec(unsigned dd, unsigned dt, unsigned dh, int utc, struct timespec *tsp)
+{
+ unsigned day;
+
+ /* Unpack time fields */
+ tsp->tv_sec = (dt & 0x1f) << 1;
+ tsp->tv_sec += ((dt & 0x7e0) >> 5) * 60;
+ tsp->tv_sec += ((dt & 0xf800) >> 11) * 3600;
+ tsp->tv_sec += dh / 100;
+ tsp->tv_nsec = (dh % 100) * 10000000;
+
+ /* Day of month */
+ day = (dd & 0x1f) - 1;
+
+ /* Full leap-year cycles */
+ day += LYC * ((dd >> 11) & 0x1f);
+
+ /* Month offset from leap-year cycle */
+ day += daytab[(dd >> 5) & 0x3f];
+
+ /*
+ * 2100 is not a leap year.
+ * XXX: a 32 bit time_t can not get us here.
+ */
+ if (day >= ((2100 - 1980) / 4 * LYC + FEB))
+ day--;
+
+ /* Align with time_t epoch */
+ day += T1980;
+
+ tsp->tv_sec += DAY * day;
+ if (!utc)
+ tsp->tv_sec += utc_offset();
+}
+
+#ifdef TEST_DRIVER
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+int
+main(int argc __unused, char **argv __unused)
+{
+ int i;
+ struct timespec ts;
+ struct tm tm;
+ double a;
+ uint16_t d, t;
+ uint8_t p;
+ char buf[100];
+
+ for (i = 0; i < 10000; i++) {
+ do {
+ ts.tv_sec = random();
+ } while (ts.tv_sec < T1980 * 86400);
+ ts.tv_nsec = random() % 1000000000;
+
+ printf("%10d.%03ld -- ", ts.tv_sec, ts.tv_nsec / 1000000);
+
+ gmtime_r(&ts.tv_sec, &tm);
+ strftime(buf, sizeof buf, "%Y %m %d %H %M %S", &tm);
+ printf("%s -- ", buf);
+
+ a = ts.tv_sec + ts.tv_nsec * 1e-9;
+ d = t = p = 0;
+		timespec2fattime(&ts, 1, &d, &t, &p);
+ printf("%04x %04x %02x -- ", d, t, p);
+ printf("%3d %02d %02d %02d %02d %02d -- ",
+ ((d >> 9) & 0x7f) + 1980,
+ (d >> 5) & 0x0f,
+ (d >> 0) & 0x1f,
+ (t >> 11) & 0x1f,
+ (t >> 5) & 0x3f,
+ ((t >> 0) & 0x1f) * 2);
+
+ ts.tv_sec = ts.tv_nsec = 0;
+		fattime2timespec(d, t, p, 1, &ts);
+		printf("%10ld.%03ld == ", (long)ts.tv_sec, ts.tv_nsec / 1000000);
+ gmtime_r(&ts.tv_sec, &tm);
+ strftime(buf, sizeof buf, "%Y %m %d %H %M %S", &tm);
+ printf("%s -- ", buf);
+ a -= ts.tv_sec + ts.tv_nsec * 1e-9;
+ printf("%.3f", a);
+ printf("\n");
+ }
+ return (0);
+}
+
+#endif /* TEST_DRIVER */
diff --git a/sys/kern/subr_firmware.c b/sys/kern/subr_firmware.c
new file mode 100644
index 0000000..20ab76e
--- /dev/null
+++ b/sys/kern/subr_firmware.c
@@ -0,0 +1,537 @@
+/*-
+ * Copyright (c) 2005-2008, Sam Leffler <sam@errno.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/queue.h>
+#include <sys/taskqueue.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/errno.h>
+#include <sys/linker.h>
+#include <sys/firmware.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/module.h>
+#include <sys/eventhandler.h>
+
+#include <sys/filedesc.h>
+#include <sys/vnode.h>
+
+/*
+ * Loadable firmware support. See sys/sys/firmware.h and firmware(9)
+ * for more details on the subsystem.
+ *
+ * 'struct firmware' is the user-visible part of the firmware table.
+ * Additional internal information is stored in a 'struct priv_fw'
+ * (currently a static array). A slot is in use if FW_INUSE is true:
+ */
+
+#define FW_INUSE(p) ((p)->file != NULL || (p)->fw.name != NULL)
+
+/*
+ * fw.name != NULL when an image is registered; file != NULL for
+ * autoloaded images whose handling has not been completed.
+ *
+ * The state of a slot evolves as follows:
+ * firmware_register --> fw.name = image_name
+ * (autoloaded image) --> file = module reference
+ * firmware_unregister --> fw.name = NULL
+ * (unloadentry complete) --> file = NULL
+ *
+ * In order for the above to work, the 'file' field must remain
+ * unchanged in firmware_unregister().
+ *
+ * Images residing in the same module are linked to each other
+ * through the 'parent' argument of firmware_register().
+ * One image (typically, one with the same name as the module to let
+ * the autoloading mechanism work) is considered the parent image for
+ * all other images in the same module. Children affect the refcount
+ * on the parent image preventing improper unloading of the image itself.
+ */
+
+struct priv_fw {
+ int refcnt; /* reference count */
+
+ /*
+ * parent entry, see above. Set on firmware_register(),
+ * cleared on firmware_unregister().
+ */
+ struct priv_fw *parent;
+
+ int flags; /* record FIRMWARE_UNLOAD requests */
+#define FW_UNLOAD 0x100
+
+ /*
+ * 'file' is private info managed by the autoload/unload code.
+ * Set at the end of firmware_get(), cleared only in the
+ * firmware_unload_task, so the latter can depend on its value even
+ * while the lock is not held.
+ */
+ linker_file_t file; /* module file, if autoloaded */
+
+ /*
+ * 'fw' is the externally visible image information.
+ * We do not make it the first field in priv_fw, to avoid the
+ * temptation of casting pointers to each other.
+ * Use PRIV_FW(fw) to get a pointer to the container of fw.
+ * Beware, PRIV_FW does not work for a NULL pointer.
+ */
+ struct firmware fw; /* externally visible information */
+};
+
+/*
+ * PRIV_FW returns the pointer to the container of struct firmware *x.
+ * Cast to intptr_t to override the 'const' attribute of x
+ */
+#define PRIV_FW(x) ((struct priv_fw *) \
+ ((intptr_t)(x) - offsetof(struct priv_fw, fw)) )
+
+/*
+ * At the moment we use a static array as backing store for the registry.
+ * Should we move to a dynamic structure, keep in mind that we cannot
+ * reallocate the array because pointers are held externally.
+ * A list may work, though.
+ */
+#define FIRMWARE_MAX 50
+static struct priv_fw firmware_table[FIRMWARE_MAX];
+
+/*
+ * Firmware module operations are handled in a separate task as they
+ * might sleep and they require directory context to do i/o.
+ */
+static struct taskqueue *firmware_tq;
+static struct task firmware_unload_task;
+
+/*
+ * This mutex protects accesses to the firmware table.
+ */
+static struct mtx firmware_mtx;
+MTX_SYSINIT(firmware, &firmware_mtx, "firmware table", MTX_DEF);
+
+/*
+ * Helper function to lookup a name.
+ * As a side effect, it sets the pointer to a free slot, if any.
+ * This way we can concentrate most of the registry scanning in
+ * this function, which makes it easier to replace the registry
+ * with some other data structure.
+ */
+static struct priv_fw *
+lookup(const char *name, struct priv_fw **empty_slot)
+{
+ struct priv_fw *fp = NULL;
+ struct priv_fw *dummy;
+ int i;
+
+ if (empty_slot == NULL)
+ empty_slot = &dummy;
+ *empty_slot = NULL;
+ for (i = 0; i < FIRMWARE_MAX; i++) {
+ fp = &firmware_table[i];
+ if (fp->fw.name != NULL && strcasecmp(name, fp->fw.name) == 0)
+ break;
+ else if (!FW_INUSE(fp))
+ *empty_slot = fp;
+ }
+ return (i < FIRMWARE_MAX ) ? fp : NULL;
+}
+
+/*
+ * Register a firmware image with the specified name. The
+ * image name must not already be registered. If this is a
+ * subimage then parent refers to a previously registered
+ * image that this should be associated with.
+ */
+const struct firmware *
+firmware_register(const char *imagename, const void *data, size_t datasize,
+ unsigned int version, const struct firmware *parent)
+{
+ struct priv_fw *match, *frp;
+ char *str;
+
+ str = strdup(imagename, M_TEMP);
+
+ mtx_lock(&firmware_mtx);
+ /*
+ * Do a lookup to make sure the name is unique or find a free slot.
+ */
+ match = lookup(imagename, &frp);
+ if (match != NULL) {
+ mtx_unlock(&firmware_mtx);
+ printf("%s: image %s already registered!\n",
+ __func__, imagename);
+ free(str, M_TEMP);
+ return NULL;
+ }
+ if (frp == NULL) {
+ mtx_unlock(&firmware_mtx);
+ printf("%s: cannot register image %s, firmware table full!\n",
+ __func__, imagename);
+ free(str, M_TEMP);
+ return NULL;
+ }
+ bzero(frp, sizeof(*frp)); /* start from a clean record */
+ frp->fw.name = str;
+ frp->fw.data = data;
+ frp->fw.datasize = datasize;
+ frp->fw.version = version;
+ if (parent != NULL)
+ frp->parent = PRIV_FW(parent);
+ mtx_unlock(&firmware_mtx);
+ if (bootverbose)
+ printf("firmware: '%s' version %u: %zu bytes loaded at %p\n",
+ imagename, version, datasize, data);
+ return &frp->fw;
+}
+
+/*
+ * Unregister/remove a firmware image. If there are outstanding
+ * references an error is returned and the image is not removed
+ * from the registry.
+ */
+int
+firmware_unregister(const char *imagename)
+{
+ struct priv_fw *fp;
+ int err;
+
+ mtx_lock(&firmware_mtx);
+ fp = lookup(imagename, NULL);
+ if (fp == NULL) {
+ /*
+ * It is ok for the lookup to fail; this can happen
+ * when a module is unloaded on last reference and the
+	 * module unload handler unregisters each of its
+ * firmware images.
+ */
+ err = 0;
+ } else if (fp->refcnt != 0) { /* cannot unregister */
+ err = EBUSY;
+ } else {
+ linker_file_t x = fp->file; /* save value */
+
+ /*
+ * Clear the whole entry with bzero to make sure we
+ * do not forget anything. Then restore 'file' which is
+ * non-null for autoloaded images.
+ */
+ free((void *) (uintptr_t) fp->fw.name, M_TEMP);
+ bzero(fp, sizeof(struct priv_fw));
+ fp->file = x;
+ err = 0;
+ }
+ mtx_unlock(&firmware_mtx);
+ return err;
+}
+
+static void
+loadimage(void *arg, int npending)
+{
+ struct thread *td = curthread;
+ char *imagename = arg;
+ struct priv_fw *fp;
+ linker_file_t result;
+ int error;
+
+ /* synchronize with the thread that dispatched us */
+ mtx_lock(&firmware_mtx);
+ mtx_unlock(&firmware_mtx);
+
+ if (td->td_proc->p_fd->fd_rdir == NULL) {
+ printf("%s: root not mounted yet, no way to load image\n",
+ imagename);
+ goto done;
+ }
+ error = linker_reference_module(imagename, NULL, &result);
+ if (error != 0) {
+ printf("%s: could not load firmware image, error %d\n",
+ imagename, error);
+ goto done;
+ }
+
+ mtx_lock(&firmware_mtx);
+ fp = lookup(imagename, NULL);
+ if (fp == NULL || fp->file != NULL) {
+ mtx_unlock(&firmware_mtx);
+ if (fp == NULL)
+ printf("%s: firmware image loaded, "
+ "but did not register\n", imagename);
+ (void) linker_release_module(imagename, NULL, NULL);
+ goto done;
+ }
+ fp->file = result; /* record the module identity */
+ mtx_unlock(&firmware_mtx);
+done:
+ wakeup_one(imagename); /* we're done */
+}
+
+/*
+ * Lookup and potentially load the specified firmware image.
+ * If the firmware is not found in the registry, try to load a kernel
+ * module named as the image name.
+ * If the firmware is located, a reference is returned. The caller must
+ * release this reference for the image to be eligible for removal/unload.
+ */
+const struct firmware *
+firmware_get(const char *imagename)
+{
+ struct task fwload_task;
+ struct thread *td;
+ struct priv_fw *fp;
+
+ mtx_lock(&firmware_mtx);
+ fp = lookup(imagename, NULL);
+ if (fp != NULL)
+ goto found;
+ /*
+ * Image not present, try to load the module holding it.
+ */
+ td = curthread;
+ if (priv_check(td, PRIV_FIRMWARE_LOAD) != 0 ||
+ securelevel_gt(td->td_ucred, 0) != 0) {
+ mtx_unlock(&firmware_mtx);
+ printf("%s: insufficient privileges to "
+ "load firmware image %s\n", __func__, imagename);
+ return NULL;
+ }
+ /*
+ * Defer load to a thread with known context. linker_reference_module
+ * may do filesystem i/o which requires root & current dirs, etc.
+ * Also we must not hold any mtx's over this call which is problematic.
+ */
+ if (!cold) {
+ TASK_INIT(&fwload_task, 0, loadimage, __DECONST(void *,
+ imagename));
+ taskqueue_enqueue(firmware_tq, &fwload_task);
+ msleep(__DECONST(void *, imagename), &firmware_mtx, 0,
+ "fwload", 0);
+ }
+ /*
+ * After attempting to load the module, see if the image is registered.
+ */
+ fp = lookup(imagename, NULL);
+ if (fp == NULL) {
+ mtx_unlock(&firmware_mtx);
+ return NULL;
+ }
+found: /* common exit point on success */
+ if (fp->refcnt == 0 && fp->parent != NULL)
+ fp->parent->refcnt++;
+ fp->refcnt++;
+ mtx_unlock(&firmware_mtx);
+ return &fp->fw;
+}
+
+/*
+ * Release a reference to a firmware image returned by firmware_get.
+ * The caller may specify, with the FIRMWARE_UNLOAD flag, its desire
+ * to release the resource, but the flag is only advisory.
+ *
+ * If this is the last reference to the firmware image, and this is an
+ * autoloaded module, wake up the firmware_unload_task to figure out
+ * what to do with the associated module.
+ */
+void
+firmware_put(const struct firmware *p, int flags)
+{
+ struct priv_fw *fp = PRIV_FW(p);
+
+ mtx_lock(&firmware_mtx);
+ fp->refcnt--;
+ if (fp->refcnt == 0) {
+ if (fp->parent != NULL)
+ fp->parent->refcnt--;
+ if (flags & FIRMWARE_UNLOAD)
+ fp->flags |= FW_UNLOAD;
+ if (fp->file)
+ taskqueue_enqueue(firmware_tq, &firmware_unload_task);
+ }
+ mtx_unlock(&firmware_mtx);
+}
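+
+/*
+ * Illustrative usage sketch (not part of this file; the image name
+ * "foo_fw" is hypothetical): a driver typically brackets its use of an
+ * image with firmware_get() and firmware_put():
+ *
+ *	const struct firmware *fp;
+ *
+ *	fp = firmware_get("foo_fw");
+ *	if (fp == NULL)
+ *		return (ENOENT);
+ *	... upload fp->data, fp->datasize bytes, to the device ...
+ *	firmware_put(fp, FIRMWARE_UNLOAD);
+ */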
+
+/*
+ * Set up directory state for the firmware_tq thread so we can do i/o.
+ */
+static void
+set_rootvnode(void *arg, int npending)
+{
+ struct thread *td = curthread;
+ struct proc *p = td->td_proc;
+
+ FILEDESC_XLOCK(p->p_fd);
+ if (p->p_fd->fd_cdir == NULL) {
+ p->p_fd->fd_cdir = rootvnode;
+ VREF(rootvnode);
+ }
+ if (p->p_fd->fd_rdir == NULL) {
+ p->p_fd->fd_rdir = rootvnode;
+ VREF(rootvnode);
+ }
+ FILEDESC_XUNLOCK(p->p_fd);
+
+ free(arg, M_TEMP);
+}
+
+/*
+ * Event handler called on mounting of /; bounce a task
+ * into the task queue thread to set up its directories.
+ */
+static void
+firmware_mountroot(void *arg)
+{
+ struct task *setroot_task;
+
+ setroot_task = malloc(sizeof(struct task), M_TEMP, M_NOWAIT);
+ if (setroot_task != NULL) {
+ TASK_INIT(setroot_task, 0, set_rootvnode, setroot_task);
+ taskqueue_enqueue(firmware_tq, setroot_task);
+ } else
+ printf("%s: no memory for task!\n", __func__);
+}
+EVENTHANDLER_DEFINE(mountroot, firmware_mountroot, NULL, 0);
+
+/*
+ * The body of the task in charge of unloading autoloaded modules
+ * that are not needed anymore.
+ * Images can be cross-linked so we may need to make multiple passes,
+ * but the time we spend in the loop is bounded because we clear entries
+ * as we touch them.
+ */
+static void
+unloadentry(void *unused1, int unused2)
+{
+ int limit = FIRMWARE_MAX;
+ int i; /* current cycle */
+
+ mtx_lock(&firmware_mtx);
+ /*
+ * Scan the table. limit is set to make sure we make another
+ * full sweep after matching an entry that requires unloading.
+ */
+ for (i = 0; i < limit; i++) {
+ struct priv_fw *fp;
+ int err;
+
+ fp = &firmware_table[i % FIRMWARE_MAX];
+ if (fp->fw.name == NULL || fp->file == NULL ||
+ fp->refcnt != 0 || (fp->flags & FW_UNLOAD) == 0)
+ continue;
+
+ /*
+ * Found an entry. Now:
+ * 1. bump up limit to make sure we make another full round;
+ * 2. clear FW_UNLOAD so we don't try this entry again.
+ * 3. release the lock while trying to unload the module.
+ * 'file' remains set so that the entry cannot be reused
+ * in the meantime (it also means that fp->file will
+ * not change while we release the lock).
+ */
+ limit = i + FIRMWARE_MAX; /* make another full round */
+ fp->flags &= ~FW_UNLOAD; /* do not try again */
+
+ mtx_unlock(&firmware_mtx);
+ err = linker_release_module(NULL, NULL, fp->file);
+ mtx_lock(&firmware_mtx);
+
+ /*
+ * We rely on the module to call firmware_unregister()
+ * on unload to actually release the entry.
+ * If err = 0 we can drop our reference as the system
+ * accepted it. Otherwise unloading failed (e.g. the
+ * module itself gave an error) so our reference is
+ * still valid.
+ */
+ if (err == 0)
+ fp->file = NULL;
+ }
+ mtx_unlock(&firmware_mtx);
+}
+
+/*
+ * Module glue.
+ */
+static int
+firmware_modevent(module_t mod, int type, void *unused)
+{
+ struct priv_fw *fp;
+ int i, err;
+
+ switch (type) {
+ case MOD_LOAD:
+ TASK_INIT(&firmware_unload_task, 0, unloadentry, NULL);
+ firmware_tq = taskqueue_create("taskqueue_firmware", M_WAITOK,
+ taskqueue_thread_enqueue, &firmware_tq);
+ /* NB: use our own loop routine that sets up context */
+ (void) taskqueue_start_threads(&firmware_tq, 1, PWAIT,
+ "firmware taskq");
+ if (rootvnode != NULL) {
+ /*
+ * Root is already mounted so we won't get an event;
+ * simulate one here.
+ */
+ firmware_mountroot(NULL);
+ }
+ return 0;
+
+ case MOD_UNLOAD:
+ /* request all autoloaded modules to be released */
+ mtx_lock(&firmware_mtx);
+ for (i = 0; i < FIRMWARE_MAX; i++) {
+ fp = &firmware_table[i];
+ fp->flags |= FW_UNLOAD;
+ }
+ mtx_unlock(&firmware_mtx);
+ taskqueue_enqueue(firmware_tq, &firmware_unload_task);
+ taskqueue_drain(firmware_tq, &firmware_unload_task);
+ err = 0;
+ for (i = 0; i < FIRMWARE_MAX; i++) {
+ fp = &firmware_table[i];
+ if (fp->fw.name != NULL) {
+ printf("%s: image %p ref %d still active slot %d\n",
+ __func__, fp->fw.name,
+ fp->refcnt, i);
+ err = EINVAL;
+ }
+ }
+ if (err == 0)
+ taskqueue_free(firmware_tq);
+ return err;
+ }
+ return EINVAL;
+}
+
+static moduledata_t firmware_mod = {
+ "firmware",
+ firmware_modevent,
+ NULL
+};
+DECLARE_MODULE(firmware, firmware_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+MODULE_VERSION(firmware, 1);
diff --git a/sys/kern/subr_hash.c b/sys/kern/subr_hash.c
new file mode 100644
index 0000000..5533882
--- /dev/null
+++ b/sys/kern/subr_hash.c
@@ -0,0 +1,128 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+/*
+ * General routine to allocate a hash table with control of memory flags.
+ */
+void *
+hashinit_flags(int elements, struct malloc_type *type, u_long *hashmask,
+ int flags)
+{
+ long hashsize;
+ LIST_HEAD(generic, generic) *hashtbl;
+ int i;
+
+ KASSERT(elements > 0, ("%s: bad elements", __func__));
+ /* Exactly one of HASH_WAITOK and HASH_NOWAIT must be set. */
+ KASSERT((flags & HASH_WAITOK) ^ (flags & HASH_NOWAIT),
+ ("Bad flags (0x%x) passed to hashinit_flags", flags));
+
+ for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
+ continue;
+ hashsize >>= 1;
+
+ if (flags & HASH_NOWAIT)
+ hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl),
+ type, M_NOWAIT);
+ else
+ hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl),
+ type, M_WAITOK);
+
+ if (hashtbl != NULL) {
+ for (i = 0; i < hashsize; i++)
+ LIST_INIT(&hashtbl[i]);
+ *hashmask = hashsize - 1;
+ }
+ return (hashtbl);
+}
+
+/*
+ * Allocate and initialize a hash table with default flag: may sleep.
+ */
+void *
+hashinit(int elements, struct malloc_type *type, u_long *hashmask)
+{
+
+ return (hashinit_flags(elements, type, hashmask, HASH_WAITOK));
+}
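+
+/*
+ * Illustrative usage sketch (not part of this file; the names foo,
+ * foo_link and the M_DEVBUF malloc type are just examples): the
+ * returned array is indexed by masking a hash value with *hashmask:
+ *
+ *	u_long mask;
+ *	LIST_HEAD(, foo) *table;
+ *
+ *	table = hashinit(nelements, M_DEVBUF, &mask);
+ *	LIST_INSERT_HEAD(&table[hash & mask], elem, foo_link);
+ */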
+
+void
+hashdestroy(void *vhashtbl, struct malloc_type *type, u_long hashmask)
+{
+ LIST_HEAD(generic, generic) *hashtbl, *hp;
+
+ hashtbl = vhashtbl;
+ for (hp = hashtbl; hp <= &hashtbl[hashmask]; hp++)
+ KASSERT(LIST_EMPTY(hp), ("%s: hash not empty", __func__));
+ free(hashtbl, type);
+}
+
+static const int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531,
+ 2039, 2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143,
+ 6653, 7159, 7673, 8191, 12281, 16381, 24571, 32749 };
+#define NPRIMES (sizeof(primes) / sizeof(primes[0]))
+
+/*
+ * General routine to allocate a prime number sized hash table.
+ */
+void *
+phashinit(int elements, struct malloc_type *type, u_long *nentries)
+{
+ long hashsize;
+ LIST_HEAD(generic, generic) *hashtbl;
+ int i;
+
+ KASSERT(elements > 0, ("%s: bad elements", __func__));
+ for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
+ i++;
+ if (i == NPRIMES)
+ break;
+ hashsize = primes[i];
+ }
+ hashsize = primes[i - 1];
+ hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
+ for (i = 0; i < hashsize; i++)
+ LIST_INIT(&hashtbl[i]);
+ *nentries = hashsize;
+ return (hashtbl);
+}
diff --git a/sys/kern/subr_hints.c b/sys/kern/subr_hints.c
new file mode 100644
index 0000000..db45fb8
--- /dev/null
+++ b/sys/kern/subr_hints.c
@@ -0,0 +1,463 @@
+/*-
+ * Copyright (c) 2000,2001 Peter Wemm <peter@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+
+/*
+ * Access functions for device resources.
+ */
+
+static int checkmethod = 1;
+static int use_kenv;
+static char *hintp;
+
+/*
+ * Define the kern.hintmode sysctl, which only accepts the value 2; setting
+ * it switches from the static KENV mode to the dynamic KENV, so systems with
+ * hints compiled into the kernel can then see/modify the KENV (and the hints).
+ */
+
+static int
+sysctl_hintmode(SYSCTL_HANDLER_ARGS)
+{
+ const char *cp;
+ char *line, *eq;
+ int eqidx, error, from_kenv, i, value;
+
+ from_kenv = 0;
+ cp = kern_envp;
+ value = hintmode;
+
+ /* Fetch candidate for new hintmode value */
+ error = sysctl_handle_int(oidp, &value, 0, req);
+ if (error || req->newptr == NULL)
+ return (error);
+
+ if (value != 2)
+		/* Only accept switching to hintmode 2 */
+ return (EINVAL);
+
+ /* Migrate from static to dynamic hints */
+ switch (hintmode) {
+ case 0:
+ if (dynamic_kenv) {
+ /*
+			 * Already there, but set hintmode to 2 so we do
+			 * not need to check it again in the future.
+ */
+ hintmode = 2;
+ return (0);
+ }
+ from_kenv = 1;
+ cp = kern_envp;
+ break;
+ case 1:
+ cp = static_hints;
+ break;
+ case 2:
+ /* Nothing to do, hintmode already 2 */
+ return (0);
+ }
+
+	while (cp) {
+		i = strlen(cp);
+		if (i == 0)
+			break;
+		if (from_kenv && strncmp(cp, "hint.", 5) != 0) {
+			/* The kenv can contain more than hints; skip it. */
+			cp += i + 1;
+			continue;
+		}
+		eq = strchr(cp, '=');
+		if (eq == NULL) {
+			/* Malformed hint value; skip it. */
+			cp += i + 1;
+			continue;
+		}
+		eqidx = eq - cp;
+
+		line = malloc(i + 1, M_TEMP, M_WAITOK);
+		strcpy(line, cp);
+		line[eqidx] = '\0';
+		setenv(line, line + eqidx + 1);
+		free(line, M_TEMP);
+		cp += i + 1;
+	}
+
+ hintmode = value;
+ use_kenv = 1;
+ return (0);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, hintmode, CTLTYPE_INT|CTLFLAG_RW,
+ &hintmode, 0, sysctl_hintmode, "I", "Get/set current hintmode");
+
+/*
+ * Evil wildcarding resource string lookup.
+ * This walks the supplied env string table and returns a match.
+ * The start point can be remembered for incremental searches.
+ */
+static int
+res_find(int *line, int *startln,
+ const char *name, int *unit, const char *resname, const char *value,
+ const char **ret_name, int *ret_namelen, int *ret_unit,
+ const char **ret_resname, int *ret_resnamelen, const char **ret_value)
+{
+ int n = 0, hit, i = 0;
+ char r_name[32];
+ int r_unit;
+ char r_resname[32];
+ char r_value[128];
+ const char *s, *cp;
+ char *p;
+
+ if (checkmethod) {
+ hintp = NULL;
+
+ switch (hintmode) {
+ case 0: /* loader hints in environment only */
+ break;
+ case 1: /* static hints only */
+ hintp = static_hints;
+ checkmethod = 0;
+ break;
+ case 2: /* fallback mode */
+ if (dynamic_kenv) {
+ mtx_lock(&kenv_lock);
+ cp = kenvp[0];
+ for (i = 0; cp != NULL; cp = kenvp[++i]) {
+ if (!strncmp(cp, "hint.", 5)) {
+ use_kenv = 1;
+ checkmethod = 0;
+ break;
+ }
+ }
+ mtx_unlock(&kenv_lock);
+ } else {
+ cp = kern_envp;
+ while (cp) {
+ if (strncmp(cp, "hint.", 5) == 0) {
+ cp = NULL;
+ hintp = kern_envp;
+ break;
+ }
+ while (*cp != '\0')
+ cp++;
+ cp++;
+ if (*cp == '\0') {
+ cp = NULL;
+ hintp = static_hints;
+ break;
+ }
+ }
+ }
+ break;
+ default:
+ break;
+ }
+ if (hintp == NULL) {
+ if (dynamic_kenv) {
+ use_kenv = 1;
+ checkmethod = 0;
+ } else
+ hintp = kern_envp;
+ }
+ }
+
+ if (use_kenv) {
+ mtx_lock(&kenv_lock);
+ i = 0;
+ cp = kenvp[0];
+ if (cp == NULL) {
+ mtx_unlock(&kenv_lock);
+ return (ENOENT);
+ }
+ } else
+ cp = hintp;
+ while (cp) {
+ hit = 1;
+ (*line)++;
+ if (strncmp(cp, "hint.", 5) != 0)
+ hit = 0;
+ else
+			n = sscanf(cp, "hint.%31[^.].%d.%31[^=]=%127s",
+			    r_name, &r_unit, r_resname, r_value);
+ if (hit && n != 4) {
+ printf("CONFIG: invalid hint '%s'\n", cp);
+ p = strchr(cp, 'h');
+ *p = 'H';
+ hit = 0;
+ }
+ if (hit && startln && *startln >= 0 && *line < *startln)
+ hit = 0;
+ if (hit && name && strcmp(name, r_name) != 0)
+ hit = 0;
+ if (hit && unit && *unit != r_unit)
+ hit = 0;
+ if (hit && resname && strcmp(resname, r_resname) != 0)
+ hit = 0;
+ if (hit && value && strcmp(value, r_value) != 0)
+ hit = 0;
+ if (hit)
+ break;
+ if (use_kenv) {
+ cp = kenvp[++i];
+ if (cp == NULL)
+ break;
+ } else {
+ while (*cp != '\0')
+ cp++;
+ cp++;
+ if (*cp == '\0') {
+ cp = NULL;
+ break;
+ }
+ }
+ }
+ if (use_kenv)
+ mtx_unlock(&kenv_lock);
+ if (cp == NULL)
+ return ENOENT;
+
+ s = cp;
+	/* This is a bit of a hack, but at least it is reentrant */
+ /* Note that it returns some !unterminated! strings. */
+ s = strchr(s, '.') + 1; /* start of device */
+ if (ret_name)
+ *ret_name = s;
+ s = strchr(s, '.') + 1; /* start of unit */
+ if (ret_namelen && ret_name)
+ *ret_namelen = s - *ret_name - 1; /* device length */
+ if (ret_unit)
+ *ret_unit = r_unit;
+ s = strchr(s, '.') + 1; /* start of resname */
+ if (ret_resname)
+ *ret_resname = s;
+ s = strchr(s, '=') + 1; /* start of value */
+ if (ret_resnamelen && ret_resname)
+ *ret_resnamelen = s - *ret_resname - 1; /* value len */
+ if (ret_value)
+ *ret_value = s;
+ if (startln) /* line number for anchor */
+ *startln = *line + 1;
+ return 0;
+}
+
+/*
+ * Search all the data sources for matches to our query. We look for
+ * dynamic hints first as overrides for static or fallback hints.
+ */
+static int
+resource_find(int *line, int *startln,
+ const char *name, int *unit, const char *resname, const char *value,
+ const char **ret_name, int *ret_namelen, int *ret_unit,
+ const char **ret_resname, int *ret_resnamelen, const char **ret_value)
+{
+ int i;
+ int un;
+
+ *line = 0;
+
+ /* Search for exact unit matches first */
+ i = res_find(line, startln, name, unit, resname, value,
+ ret_name, ret_namelen, ret_unit, ret_resname, ret_resnamelen,
+ ret_value);
+ if (i == 0)
+ return 0;
+ if (unit == NULL)
+ return ENOENT;
+ /* If we are still here, search for wildcard matches */
+ un = -1;
+ i = res_find(line, startln, name, &un, resname, value,
+ ret_name, ret_namelen, ret_unit, ret_resname, ret_resnamelen,
+ ret_value);
+ if (i == 0)
+ return 0;
+ return ENOENT;
+}
+
+int
+resource_int_value(const char *name, int unit, const char *resname, int *result)
+{
+ int error;
+ const char *str;
+ char *op;
+ unsigned long val;
+ int line;
+
+ line = 0;
+ error = resource_find(&line, NULL, name, &unit, resname, NULL,
+ NULL, NULL, NULL, NULL, NULL, &str);
+ if (error)
+ return error;
+ if (*str == '\0')
+ return EFTYPE;
+ val = strtoul(str, &op, 0);
+ if (*op != '\0')
+ return EFTYPE;
+ *result = val;
+ return 0;
+}
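+
+/*
+ * Illustrative example (hypothetical names): with the loader hint
+ * hint.foo.0.irq="5" present, resource_int_value("foo", 0, "irq", &val)
+ * returns 0 and sets val to 5.
+ */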
+
+int
+resource_long_value(const char *name, int unit, const char *resname,
+ long *result)
+{
+ int error;
+ const char *str;
+ char *op;
+ unsigned long val;
+ int line;
+
+ line = 0;
+ error = resource_find(&line, NULL, name, &unit, resname, NULL,
+ NULL, NULL, NULL, NULL, NULL, &str);
+ if (error)
+ return error;
+ if (*str == '\0')
+ return EFTYPE;
+ val = strtoul(str, &op, 0);
+ if (*op != '\0')
+ return EFTYPE;
+ *result = val;
+ return 0;
+}
+
+int
+resource_string_value(const char *name, int unit, const char *resname,
+ const char **result)
+{
+ int error;
+ const char *str;
+ int line;
+
+ line = 0;
+ error = resource_find(&line, NULL, name, &unit, resname, NULL,
+ NULL, NULL, NULL, NULL, NULL, &str);
+ if (error)
+ return error;
+ *result = str;
+ return 0;
+}
+
+/*
+ * This is a bit nasty, but allows us to not modify the env strings.
+ */
+static const char *
+resource_string_copy(const char *s, int len)
+{
+ static char stringbuf[256];
+ static int offset = 0;
+ const char *ret;
+
+ if (len == 0)
+ len = strlen(s);
+ if (len > 255)
+ return NULL;
+ if ((offset + len + 1) > 255)
+ offset = 0;
+ bcopy(s, &stringbuf[offset], len);
+ stringbuf[offset + len] = '\0';
+ ret = &stringbuf[offset];
+ offset += len + 1;
+ return ret;
+}
+
+/*
+ * err = resource_find_match(&anchor, &name, &unit, resname, value)
+ * Iteratively fetch a list of devices wired "at" something.
+ * resname and value are restrictions, e.g. "at", "scbus0".
+ * For practical purposes, resname is required and value is optional.
+ * *name and *unit are set on each match.
+ * Set *anchor to zero before starting.
+ */
+int
+resource_find_match(int *anchor, const char **name, int *unit,
+ const char *resname, const char *value)
+{
+ const char *found_name;
+ int found_namelen;
+ int found_unit;
+ int ret;
+ int newln;
+
+ newln = *anchor;
+ ret = resource_find(anchor, &newln, NULL, NULL, resname, value,
+ &found_name, &found_namelen, &found_unit, NULL, NULL, NULL);
+ if (ret == 0) {
+ *name = resource_string_copy(found_name, found_namelen);
+ *unit = found_unit;
+ }
+ *anchor = newln;
+ return ret;
+}
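A brief usage sketch of the iterator just defined, grounded only in the comment and code above (the hint names are hypothetical):

    /*
     * Sketch: enumerate every device wired "at" a hypothetical bus
     * instance, e.g. hint.da.0.at="scbus0" in device.hints.
     */
    const char *name;
    int anchor, unit;

    anchor = 0;
    while (resource_find_match(&anchor, &name, &unit, "at", "scbus0") == 0)
            printf("wired: %s%d\n", name, unit);

Note that the returned name points into resource_string_copy()'s small static buffer, so callers that keep it across further lookups should copy it out first (e.g. with strlcpy()).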
+
+
+/*
+ * err = resource_find_dev(&anchor, name, &unit, resname, value);
+ * Iterate through a list of devices, returning their unit numbers.
+ * resname and value are optional restrictions, e.g. "at", "scbus0".
+ * *unit is set to the unit number of each matching device.
+ * Set *anchor to zero before starting.
+ */
+int
+resource_find_dev(int *anchor, const char *name, int *unit,
+ const char *resname, const char *value)
+{
+ int found_unit;
+ int newln;
+ int ret;
+
+ newln = *anchor;
+ ret = resource_find(anchor, &newln, name, NULL, resname, value,
+ NULL, NULL, &found_unit, NULL, NULL, NULL);
+ if (ret == 0) {
+ *unit = found_unit;
+ }
+ *anchor = newln;
+ return ret;
+}
+
+/*
+ * Check to see if a device is disabled via a disabled hint.
+ */
+int
+resource_disabled(const char *name, int unit)
+{
+ int error, value;
+
+ error = resource_int_value(name, unit, "disabled", &value);
+ if (error)
+ return (0);
+ return (value);
+}
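As an illustration of the convention this helper supports (hedged sketch; "foo" is a hypothetical driver and the probe constants are the usual newbus ones), a hint such as hint.foo.0.disabled="1" is honored like this:

    /* Sketch: check the "disabled" hint at the top of a probe routine. */
    static int
    foo_probe(device_t dev)
    {

            if (resource_disabled("foo", device_get_unit(dev)))
                    return (ENXIO);         /* administratively disabled */
            /* ... normal probe logic ... */
            return (BUS_PROBE_DEFAULT);
    }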
diff --git a/sys/kern/subr_kdb.c b/sys/kern/subr_kdb.c
new file mode 100644
index 0000000..59d6258
--- /dev/null
+++ b/sys/kern/subr_kdb.c
@@ -0,0 +1,675 @@
+/*-
+ * Copyright (c) 2004 The FreeBSD Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kdb.h"
+#include "opt_stack.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cons.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/smp.h>
+#include <sys/stack.h>
+#include <sys/sysctl.h>
+
+#include <machine/kdb.h>
+#include <machine/pcb.h>
+
+#ifdef SMP
+#include <machine/smp.h>
+#endif
+
+int kdb_active = 0;
+static void *kdb_jmpbufp = NULL;
+struct kdb_dbbe *kdb_dbbe = NULL;
+static struct pcb kdb_pcb;
+struct pcb *kdb_thrctx = NULL;
+struct thread *kdb_thread = NULL;
+struct trapframe *kdb_frame = NULL;
+
+#ifdef BREAK_TO_DEBUGGER
+#define KDB_BREAK_TO_DEBUGGER 1
+#else
+#define KDB_BREAK_TO_DEBUGGER 0
+#endif
+
+#ifdef ALT_BREAK_TO_DEBUGGER
+#define KDB_ALT_BREAK_TO_DEBUGGER 1
+#else
+#define KDB_ALT_BREAK_TO_DEBUGGER 0
+#endif
+
+static int kdb_break_to_debugger = KDB_BREAK_TO_DEBUGGER;
+static int kdb_alt_break_to_debugger = KDB_ALT_BREAK_TO_DEBUGGER;
+
+KDB_BACKEND(null, NULL, NULL, NULL, NULL);
+SET_DECLARE(kdb_dbbe_set, struct kdb_dbbe);
+
+static int kdb_sysctl_available(SYSCTL_HANDLER_ARGS);
+static int kdb_sysctl_current(SYSCTL_HANDLER_ARGS);
+static int kdb_sysctl_enter(SYSCTL_HANDLER_ARGS);
+static int kdb_sysctl_panic(SYSCTL_HANDLER_ARGS);
+static int kdb_sysctl_trap(SYSCTL_HANDLER_ARGS);
+static int kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS);
+
+static SYSCTL_NODE(_debug, OID_AUTO, kdb, CTLFLAG_RW, NULL, "KDB nodes");
+
+SYSCTL_PROC(_debug_kdb, OID_AUTO, available, CTLTYPE_STRING | CTLFLAG_RD, NULL,
+ 0, kdb_sysctl_available, "A", "list of available KDB backends");
+
+SYSCTL_PROC(_debug_kdb, OID_AUTO, current, CTLTYPE_STRING | CTLFLAG_RW, NULL,
+ 0, kdb_sysctl_current, "A", "currently selected KDB backend");
+
+SYSCTL_PROC(_debug_kdb, OID_AUTO, enter,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
+ kdb_sysctl_enter, "I", "set to enter the debugger");
+
+SYSCTL_PROC(_debug_kdb, OID_AUTO, panic,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
+ kdb_sysctl_panic, "I", "set to panic the kernel");
+
+SYSCTL_PROC(_debug_kdb, OID_AUTO, trap,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
+ kdb_sysctl_trap, "I", "set to cause a page fault via data access");
+
+SYSCTL_PROC(_debug_kdb, OID_AUTO, trap_code,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
+ kdb_sysctl_trap_code, "I", "set to cause a page fault via code access");
+
+SYSCTL_INT(_debug_kdb, OID_AUTO, break_to_debugger,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_TUN | CTLFLAG_SECURE,
+ &kdb_break_to_debugger, 0, "Enable break to debugger");
+TUNABLE_INT("debug.kdb.break_to_debugger", &kdb_break_to_debugger);
+
+SYSCTL_INT(_debug_kdb, OID_AUTO, alt_break_to_debugger,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_TUN | CTLFLAG_SECURE,
+ &kdb_alt_break_to_debugger, 0, "Enable alternative break to debugger");
+TUNABLE_INT("debug.kdb.alt_break_to_debugger", &kdb_alt_break_to_debugger);
+
+/*
+ * Flag to indicate to debuggers why the debugger was entered.
+ */
+const char * volatile kdb_why = KDB_WHY_UNSET;
+
+static int
+kdb_sysctl_available(SYSCTL_HANDLER_ARGS)
+{
+ struct kdb_dbbe **iter;
+ struct sbuf sbuf;
+ int error;
+
+ sbuf_new_for_sysctl(&sbuf, NULL, 64, req);
+ SET_FOREACH(iter, kdb_dbbe_set) {
+ if ((*iter)->dbbe_active == 0)
+ sbuf_printf(&sbuf, "%s ", (*iter)->dbbe_name);
+ }
+ error = sbuf_finish(&sbuf);
+ sbuf_delete(&sbuf);
+ return (error);
+}
+
+static int
+kdb_sysctl_current(SYSCTL_HANDLER_ARGS)
+{
+ char buf[16];
+ int error;
+
+ if (kdb_dbbe != NULL)
+ strlcpy(buf, kdb_dbbe->dbbe_name, sizeof(buf));
+ else
+ *buf = '\0';
+ error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (kdb_active)
+ return (EBUSY);
+ return (kdb_dbbe_select(buf));
+}
+
+static int
+kdb_sysctl_enter(SYSCTL_HANDLER_ARGS)
+{
+ int error, i;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error == 0) {
+ i = 0;
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ }
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ if (kdb_active)
+ return (EBUSY);
+ kdb_enter(KDB_WHY_SYSCTL, "sysctl debug.kdb.enter");
+ return (0);
+}
+
+static int
+kdb_sysctl_panic(SYSCTL_HANDLER_ARGS)
+{
+ int error, i;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error == 0) {
+ i = 0;
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ }
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ panic("kdb_sysctl_panic");
+ return (0);
+}
+
+static int
+kdb_sysctl_trap(SYSCTL_HANDLER_ARGS)
+{
+ int error, i;
+ int *addr = (int *)0x10;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error == 0) {
+ i = 0;
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ }
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ return (*addr);
+}
+
+static int
+kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS)
+{
+ int error, i;
+ void (*fp)(u_int, u_int, u_int) = (void *)0xdeadc0de;
+
+ error = sysctl_wire_old_buffer(req, sizeof(int));
+ if (error == 0) {
+ i = 0;
+ error = sysctl_handle_int(oidp, &i, 0, req);
+ }
+ if (error != 0 || req->newptr == NULL)
+ return (error);
+ (*fp)(0x11111111, 0x22222222, 0x33333333);
+ return (0);
+}
+
+void
+kdb_panic(const char *msg)
+{
+
+ printf("KDB: panic\n");
+ panic("%s", msg);
+}
+
+void
+kdb_reboot(void)
+{
+
+ printf("KDB: reboot requested\n");
+ shutdown_nice(0);
+}
+
+/*
+ * Solaris implements a new BREAK that is initiated by the character sequence
+ * CR ~ ^b, similar to the familiar pattern used by the Remote Console on Sun
+ * servers.
+ *
+ * Note that this function may be called from almost anywhere, with interrupts
+ * disabled and with unknown locks held, so it must not access data other than
+ * its arguments. It's up to the caller to ensure that the state variable is
+ * consistent.
+ */
+
+#define KEY_CR 13 /* CR '\r' */
+#define KEY_TILDE 126 /* ~ */
+#define KEY_CRTLB 2 /* ^B */
+#define KEY_CRTLP 16 /* ^P */
+#define KEY_CRTLR 18 /* ^R */
+
+/* States of the KDB "alternate break sequence" detection state machine. */
+enum {
+ KDB_ALT_BREAK_SEEN_NONE,
+ KDB_ALT_BREAK_SEEN_CR,
+ KDB_ALT_BREAK_SEEN_CR_TILDE,
+};
+
+int
+kdb_break(void)
+{
+
+ if (!kdb_break_to_debugger)
+ return (0);
+ kdb_enter(KDB_WHY_BREAK, "Break to debugger");
+ return (KDB_REQ_DEBUGGER);
+}
+
+static int
+kdb_alt_break_state(int key, int *state)
+{
+ int brk;
+
+ /* All states transition to KDB_ALT_BREAK_SEEN_CR on a CR. */
+ if (key == KEY_CR) {
+ *state = KDB_ALT_BREAK_SEEN_CR;
+ return (0);
+ }
+
+ brk = 0;
+ switch (*state) {
+ case KDB_ALT_BREAK_SEEN_CR:
+ *state = KDB_ALT_BREAK_SEEN_NONE;
+ if (key == KEY_TILDE)
+ *state = KDB_ALT_BREAK_SEEN_CR_TILDE;
+ break;
+ case KDB_ALT_BREAK_SEEN_CR_TILDE:
+ *state = KDB_ALT_BREAK_SEEN_NONE;
+ if (key == KEY_CRTLB)
+ brk = KDB_REQ_DEBUGGER;
+ else if (key == KEY_CRTLP)
+ brk = KDB_REQ_PANIC;
+ else if (key == KEY_CRTLR)
+ brk = KDB_REQ_REBOOT;
+ break;
+ case KDB_ALT_BREAK_SEEN_NONE:
+ default:
+ *state = KDB_ALT_BREAK_SEEN_NONE;
+ break;
+ }
+ return (brk);
+}
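For illustration, a hedged sketch of how a console driver might feed received characters through this state machine; the getc routine and the softc field are hypothetical names, not part of this change:

    /*
     * Sketch: recognize CR ~ ^B (debugger), CR ~ ^P (panic) and CR ~ ^R
     * (reboot) on a console input path.  sc->alt_brk_state is a
     * per-device int, initialized to 0 and passed back on every call.
     */
    int c;

    while ((c = hypothetical_cons_getc(sc)) != -1) {
            kdb_alt_break(c, &sc->alt_brk_state);
            /* ... then deliver 'c' to the tty layer as usual ... */
    }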
+
+static int
+kdb_alt_break_internal(int key, int *state, int force_gdb)
+{
+ int brk;
+
+ if (!kdb_alt_break_to_debugger)
+ return (0);
+ brk = kdb_alt_break_state(key, state);
+ switch (brk) {
+ case KDB_REQ_DEBUGGER:
+ if (force_gdb)
+ kdb_dbbe_select("gdb");
+ kdb_enter(KDB_WHY_BREAK, "Break to debugger");
+ break;
+
+ case KDB_REQ_PANIC:
+ if (force_gdb)
+ kdb_dbbe_select("gdb");
+ kdb_panic("Panic sequence on console");
+ break;
+
+ case KDB_REQ_REBOOT:
+ kdb_reboot();
+ break;
+ }
+ return (0);
+}
+
+int
+kdb_alt_break(int key, int *state)
+{
+
+ return (kdb_alt_break_internal(key, state, 0));
+}
+
+/*
+ * This variation on kdb_alt_break() is used only by dcons, which has its own
+ * configuration flag to force GDB use regardless of the global KDB
+ * configuration.
+ */
+int
+kdb_alt_break_gdb(int key, int *state)
+{
+
+ return (kdb_alt_break_internal(key, state, 1));
+}
+
+/*
+ * Print a backtrace of the calling thread. The backtrace is generated by
+ * the selected debugger, provided it supports backtraces. If no debugger
+ * is selected or the current debugger does not support backtraces, this
+ * function silently returns.
+ */
+
+void
+kdb_backtrace(void)
+{
+
+ if (kdb_dbbe != NULL && kdb_dbbe->dbbe_trace != NULL) {
+ printf("KDB: stack backtrace:\n");
+ kdb_dbbe->dbbe_trace();
+ }
+#ifdef STACK
+ else {
+ struct stack st;
+
+ printf("KDB: stack backtrace:\n");
+ stack_zero(&st);
+ stack_save(&st);
+ stack_print_ddb(&st);
+ }
+#endif
+}
+
+/*
+ * Similar to kdb_backtrace() except that it prints a backtrace of an
+ * arbitrary thread rather than the calling thread.
+ */
+void
+kdb_backtrace_thread(struct thread *td)
+{
+
+ if (kdb_dbbe != NULL && kdb_dbbe->dbbe_trace_thread != NULL) {
+ printf("KDB: stack backtrace of thread %d:\n", td->td_tid);
+ kdb_dbbe->dbbe_trace_thread(td);
+ }
+#ifdef STACK
+ else {
+ struct stack st;
+
+ printf("KDB: stack backtrace of thread %d:\n", td->td_tid);
+ stack_zero(&st);
+ stack_save_td(&st, td);
+ stack_print_ddb(&st);
+ }
+#endif
+}
+
+/*
+ * Set/change the current backend.
+ */
+
+int
+kdb_dbbe_select(const char *name)
+{
+ struct kdb_dbbe *be, **iter;
+
+ SET_FOREACH(iter, kdb_dbbe_set) {
+ be = *iter;
+ if (be->dbbe_active == 0 && strcmp(be->dbbe_name, name) == 0) {
+ kdb_dbbe = be;
+ return (0);
+ }
+ }
+ return (EINVAL);
+}
+
+/*
+ * Enter the currently selected debugger. If a message has been provided,
+ * it is printed first. If the debugger does not support the enter method,
+ * it is entered by using breakpoint(), which enters the debugger through
+ * kdb_trap(). The 'why' argument will contain a more mechanically usable
+ * string than 'msg', and is relied upon by DDB scripting to identify the
+ * reason for entering the debugger so that the right script can be run.
+ */
+void
+kdb_enter(const char *why, const char *msg)
+{
+
+ if (kdb_dbbe != NULL && kdb_active == 0) {
+ if (msg != NULL)
+ printf("KDB: enter: %s\n", msg);
+ kdb_why = why;
+ breakpoint();
+ kdb_why = KDB_WHY_UNSET;
+ }
+}
+
+/*
+ * Initialize the kernel debugger interface.
+ */
+
+void
+kdb_init(void)
+{
+ struct kdb_dbbe *be, **iter;
+ int cur_pri, pri;
+
+ kdb_active = 0;
+ kdb_dbbe = NULL;
+ cur_pri = -1;
+ SET_FOREACH(iter, kdb_dbbe_set) {
+ be = *iter;
+ pri = (be->dbbe_init != NULL) ? be->dbbe_init() : -1;
+ be->dbbe_active = (pri >= 0) ? 0 : -1;
+ if (pri > cur_pri) {
+ cur_pri = pri;
+ kdb_dbbe = be;
+ }
+ }
+ if (kdb_dbbe != NULL) {
+ printf("KDB: debugger backends:");
+ SET_FOREACH(iter, kdb_dbbe_set) {
+ be = *iter;
+ if (be->dbbe_active == 0)
+ printf(" %s", be->dbbe_name);
+ }
+ printf("\n");
+ printf("KDB: current backend: %s\n",
+ kdb_dbbe->dbbe_name);
+ }
+}
+
+/*
+ * Handle contexts.
+ */
+
+void *
+kdb_jmpbuf(jmp_buf new)
+{
+ void *old;
+
+ old = kdb_jmpbufp;
+ kdb_jmpbufp = new;
+ return (old);
+}
+
+void
+kdb_reenter(void)
+{
+
+ if (!kdb_active || kdb_jmpbufp == NULL)
+ return;
+
+ longjmp(kdb_jmpbufp, 1);
+ /* NOTREACHED */
+}
+
+/*
+ * Thread related support functions.
+ */
+
+struct pcb *
+kdb_thr_ctx(struct thread *thr)
+{
+#if defined(SMP) && defined(KDB_STOPPEDPCB)
+ struct pcpu *pc;
+#endif
+
+ if (thr == curthread)
+ return (&kdb_pcb);
+
+#if defined(SMP) && defined(KDB_STOPPEDPCB)
+ STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
+ if (pc->pc_curthread == thr &&
+ CPU_ISSET(pc->pc_cpuid, &stopped_cpus))
+ return (KDB_STOPPEDPCB(pc));
+ }
+#endif
+ return (thr->td_pcb);
+}
+
+struct thread *
+kdb_thr_first(void)
+{
+ struct proc *p;
+ struct thread *thr;
+
+ p = LIST_FIRST(&allproc);
+ while (p != NULL) {
+ if (p->p_flag & P_INMEM) {
+ thr = FIRST_THREAD_IN_PROC(p);
+ if (thr != NULL)
+ return (thr);
+ }
+ p = LIST_NEXT(p, p_list);
+ }
+ return (NULL);
+}
+
+struct thread *
+kdb_thr_from_pid(pid_t pid)
+{
+ struct proc *p;
+
+ p = LIST_FIRST(&allproc);
+ while (p != NULL) {
+ if (p->p_flag & P_INMEM && p->p_pid == pid)
+ return (FIRST_THREAD_IN_PROC(p));
+ p = LIST_NEXT(p, p_list);
+ }
+ return (NULL);
+}
+
+struct thread *
+kdb_thr_lookup(lwpid_t tid)
+{
+ struct thread *thr;
+
+ thr = kdb_thr_first();
+ while (thr != NULL && thr->td_tid != tid)
+ thr = kdb_thr_next(thr);
+ return (thr);
+}
+
+struct thread *
+kdb_thr_next(struct thread *thr)
+{
+ struct proc *p;
+
+ p = thr->td_proc;
+ thr = TAILQ_NEXT(thr, td_plist);
+ do {
+ if (thr != NULL)
+ return (thr);
+ p = LIST_NEXT(p, p_list);
+ if (p != NULL && (p->p_flag & P_INMEM))
+ thr = FIRST_THREAD_IN_PROC(p);
+ } while (p != NULL);
+ return (NULL);
+}
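Together, kdb_thr_first() and kdb_thr_next() walk every thread of every in-core process; a minimal sketch of the iteration pattern a debugger backend might use:

    /* Sketch: visit all threads visible to the debugger. */
    struct thread *td;

    for (td = kdb_thr_first(); td != NULL; td = kdb_thr_next(td))
            printf("tid %d (pid %d)\n", td->td_tid, td->td_proc->p_pid);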
+
+int
+kdb_thr_select(struct thread *thr)
+{
+ if (thr == NULL)
+ return (EINVAL);
+ kdb_thread = thr;
+ kdb_thrctx = kdb_thr_ctx(thr);
+ return (0);
+}
+
+/*
+ * Enter the debugger due to a trap.
+ */
+
+int
+kdb_trap(int type, int code, struct trapframe *tf)
+{
+#ifdef SMP
+ cpuset_t other_cpus;
+#endif
+ struct kdb_dbbe *be;
+ register_t intr;
+ int handled;
+#ifdef SMP
+ int did_stop_cpus;
+#endif
+
+ be = kdb_dbbe;
+ if (be == NULL || be->dbbe_trap == NULL)
+ return (0);
+
+ /* We reenter the debugger through kdb_reenter(). */
+ if (kdb_active)
+ return (0);
+
+ intr = intr_disable();
+
+#ifdef SMP
+ if (!SCHEDULER_STOPPED()) {
+ other_cpus = all_cpus;
+ CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+ stop_cpus_hard(other_cpus);
+ did_stop_cpus = 1;
+ } else
+ did_stop_cpus = 0;
+#endif
+
+ kdb_active++;
+
+ kdb_frame = tf;
+
+ /* Let MD code do its thing first... */
+ kdb_cpu_trap(type, code);
+
+ makectx(tf, &kdb_pcb);
+ kdb_thr_select(curthread);
+
+ cngrab();
+
+ for (;;) {
+ handled = be->dbbe_trap(type, code);
+ if (be == kdb_dbbe)
+ break;
+ be = kdb_dbbe;
+ if (be == NULL || be->dbbe_trap == NULL)
+ break;
+ printf("Switching to %s back-end\n", be->dbbe_name);
+ }
+
+ cnungrab();
+
+ kdb_active--;
+
+#ifdef SMP
+ if (did_stop_cpus)
+ restart_cpus(stopped_cpus);
+#endif
+
+ intr_restore(intr);
+
+ return (handled);
+}
diff --git a/sys/kern/subr_kobj.c b/sys/kern/subr_kobj.c
new file mode 100644
index 0000000..5be746a
--- /dev/null
+++ b/sys/kern/subr_kobj.c
@@ -0,0 +1,348 @@
+/*-
+ * Copyright (c) 2000,2003 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/kobj.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/sysctl.h>
+#ifndef TEST
+#include <sys/systm.h>
+#endif
+
+#ifdef TEST
+#include "usertest.h"
+#endif
+
+static MALLOC_DEFINE(M_KOBJ, "kobj", "Kernel object structures");
+
+#ifdef KOBJ_STATS
+
+u_int kobj_lookup_hits;
+u_int kobj_lookup_misses;
+
+SYSCTL_UINT(_kern, OID_AUTO, kobj_hits, CTLFLAG_RD,
+ &kobj_lookup_hits, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, kobj_misses, CTLFLAG_RD,
+ &kobj_lookup_misses, 0, "");
+
+#endif
+
+static struct mtx kobj_mtx;
+static int kobj_mutex_inited;
+static int kobj_next_id = 1;
+
+#define KOBJ_LOCK() mtx_lock(&kobj_mtx)
+#define KOBJ_UNLOCK() mtx_unlock(&kobj_mtx)
+#define KOBJ_ASSERT(what) mtx_assert(&kobj_mtx, what);
+
+SYSCTL_INT(_kern, OID_AUTO, kobj_methodcount, CTLFLAG_RD,
+ &kobj_next_id, 0, "");
+
+static void
+kobj_init_mutex(void *arg)
+{
+ if (!kobj_mutex_inited) {
+ mtx_init(&kobj_mtx, "kobj", NULL, MTX_DEF);
+ kobj_mutex_inited = 1;
+ }
+}
+
+SYSINIT(kobj, SI_SUB_LOCK, SI_ORDER_ANY, kobj_init_mutex, NULL);
+
+/*
+ * This method structure is used to initialise new caches. Since the
+ * desc pointer is NULL, it is guaranteed never to match any read
+ * descriptors.
+ */
+static const struct kobj_method null_method = {
+ 0, 0,
+};
+
+int
+kobj_error_method(void)
+{
+
+ return ENXIO;
+}
+
+static void
+kobj_class_compile_common(kobj_class_t cls, kobj_ops_t ops)
+{
+ kobj_method_t *m;
+ int i;
+
+ /*
+ * Don't do anything if we are already compiled.
+ */
+ if (cls->ops)
+ return;
+
+ /*
+ * First register any methods which need it.
+ */
+ for (i = 0, m = cls->methods; m->desc; i++, m++) {
+ if (m->desc->id == 0)
+ m->desc->id = kobj_next_id++;
+ }
+
+ /*
+ * Then initialise the ops table.
+ */
+ for (i = 0; i < KOBJ_CACHE_SIZE; i++)
+ ops->cache[i] = &null_method;
+ ops->cls = cls;
+ cls->ops = ops;
+}
+
+void
+kobj_class_compile(kobj_class_t cls)
+{
+ kobj_ops_t ops;
+
+ KOBJ_ASSERT(MA_NOTOWNED);
+
+ /*
+ * Allocate space for the compiled ops table.
+ */
+ ops = malloc(sizeof(struct kobj_ops), M_KOBJ, M_NOWAIT);
+ if (!ops)
+ panic("%s: out of memory", __func__);
+
+ KOBJ_LOCK();
+
+ /*
+ * We may have lost a race for kobj_class_compile here - check
+ * to make sure someone else hasn't already compiled this
+ * class.
+ */
+ if (cls->ops) {
+ KOBJ_UNLOCK();
+ free(ops, M_KOBJ);
+ return;
+ }
+
+ kobj_class_compile_common(cls, ops);
+ KOBJ_UNLOCK();
+}
+
+void
+kobj_class_compile_static(kobj_class_t cls, kobj_ops_t ops)
+{
+
+ KASSERT(kobj_mutex_inited == 0,
+ ("%s: only supported during early cycles", __func__));
+
+ /*
+ * Increment refs to make sure that the ops table is not freed.
+ */
+ cls->refs++;
+ kobj_class_compile_common(cls, ops);
+}
+
+static kobj_method_t*
+kobj_lookup_method_class(kobj_class_t cls, kobjop_desc_t desc)
+{
+ kobj_method_t *methods = cls->methods;
+ kobj_method_t *ce;
+
+ for (ce = methods; ce && ce->desc; ce++) {
+ if (ce->desc == desc) {
+ return ce;
+ }
+ }
+
+ return NULL;
+}
+
+static kobj_method_t*
+kobj_lookup_method_mi(kobj_class_t cls,
+ kobjop_desc_t desc)
+{
+ kobj_method_t *ce;
+ kobj_class_t *basep;
+
+ ce = kobj_lookup_method_class(cls, desc);
+ if (ce)
+ return ce;
+
+ basep = cls->baseclasses;
+ if (basep) {
+ for (; *basep; basep++) {
+ ce = kobj_lookup_method_mi(*basep, desc);
+ if (ce)
+ return ce;
+ }
+ }
+
+ return NULL;
+}
+
+kobj_method_t*
+kobj_lookup_method(kobj_class_t cls,
+ kobj_method_t **cep,
+ kobjop_desc_t desc)
+{
+ kobj_method_t *ce;
+
+#ifdef KOBJ_STATS
+ /*
+ * Correct for the 'hit' assumption in KOBJOPLOOKUP and record
+ * a 'miss'.
+ */
+ kobj_lookup_hits--;
+ kobj_lookup_misses++;
+#endif
+
+ ce = kobj_lookup_method_mi(cls, desc);
+ if (!ce)
+ ce = &desc->deflt;
+ *cep = ce;
+ return ce;
+}
+
+void
+kobj_class_free(kobj_class_t cls)
+{
+ void* ops = NULL;
+
+ KOBJ_ASSERT(MA_NOTOWNED);
+ KOBJ_LOCK();
+
+ /*
+ * Protect against a race between kobj_create and
+ * kobj_delete.
+ */
+ if (cls->refs == 0) {
+ /*
+ * For now we don't do anything to unregister any methods
+ * which are no longer used.
+ */
+
+ /*
+ * Free memory and clean up.
+ */
+ ops = cls->ops;
+ cls->ops = NULL;
+ }
+
+ KOBJ_UNLOCK();
+
+ if (ops)
+ free(ops, M_KOBJ);
+}
+
+kobj_t
+kobj_create(kobj_class_t cls,
+ struct malloc_type *mtype,
+ int mflags)
+{
+ kobj_t obj;
+
+ /*
+ * Allocate and initialise the new object.
+ */
+ obj = malloc(cls->size, mtype, mflags | M_ZERO);
+ if (!obj)
+ return NULL;
+ kobj_init(obj, cls);
+
+ return obj;
+}
+
+static void
+kobj_init_common(kobj_t obj, kobj_class_t cls)
+{
+
+ obj->ops = cls->ops;
+ cls->refs++;
+}
+
+void
+kobj_init(kobj_t obj, kobj_class_t cls)
+{
+ KOBJ_ASSERT(MA_NOTOWNED);
+ retry:
+ KOBJ_LOCK();
+
+ /*
+ * Consider compiling the class' method table.
+ */
+ if (!cls->ops) {
+ /*
+ * kobj_class_compile doesn't want the lock held
+ * because of the call to malloc - we drop the lock
+ * and re-try.
+ */
+ KOBJ_UNLOCK();
+ kobj_class_compile(cls);
+ goto retry;
+ }
+
+ kobj_init_common(obj, cls);
+
+ KOBJ_UNLOCK();
+}
+
+void
+kobj_init_static(kobj_t obj, kobj_class_t cls)
+{
+
+ KASSERT(kobj_mutex_inited == 0,
+ ("%s: only supported during early cycles", __func__));
+
+ kobj_init_common(obj, cls);
+}
+
+void
+kobj_delete(kobj_t obj, struct malloc_type *mtype)
+{
+ kobj_class_t cls = obj->ops->cls;
+ int refs;
+
+ /*
+ * Consider freeing the compiled method table for the class
+ * after its last instance is deleted. As an optimisation, we
+ * should defer this for a short while to avoid thrashing.
+ */
+ KOBJ_ASSERT(MA_NOTOWNED);
+ KOBJ_LOCK();
+ cls->refs--;
+ refs = cls->refs;
+ KOBJ_UNLOCK();
+
+ if (!refs)
+ kobj_class_free(cls);
+
+ obj->ops = NULL;
+ if (mtype)
+ free(obj, mtype);
+}
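To tie the pieces together, a hedged sketch of an instance lifecycle for a hypothetical class foo_class (declared elsewhere, e.g. with the DEFINE_CLASS() macro from sys/kobj.h): the first kobj_create() or kobj_init() compiles the class, every instance holds a reference, and deleting the last instance lets kobj_class_free() release the compiled ops table.

    /* Sketch: create, use and destroy a kobj instance of foo_class. */
    kobj_t obj;

    obj = kobj_create(&foo_class, M_TEMP, M_WAITOK); /* compiles class on first use */
    /* ... invoke methods on obj through its generated dispatch stubs ... */
    kobj_delete(obj, M_TEMP);                        /* last delete frees the ops table */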
diff --git a/sys/kern/subr_lock.c b/sys/kern/subr_lock.c
new file mode 100644
index 0000000..94908ac
--- /dev/null
+++ b/sys/kern/subr_lock.c
@@ -0,0 +1,649 @@
+/*-
+ * Copyright (c) 2006 John Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This module holds the global variables and functions used to maintain
+ * lock_object structures.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_mprof.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/lock_profile.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <machine/cpufunc.h>
+
+CTASSERT(LOCK_CLASS_MAX == 15);
+
+struct lock_class *lock_classes[LOCK_CLASS_MAX + 1] = {
+ &lock_class_mtx_spin,
+ &lock_class_mtx_sleep,
+ &lock_class_sx,
+ &lock_class_rm,
+ &lock_class_rm_sleepable,
+ &lock_class_rw,
+ &lock_class_lockmgr,
+};
+
+void
+lock_init(struct lock_object *lock, struct lock_class *class, const char *name,
+ const char *type, int flags)
+{
+ int i;
+
+ /* Check for double-init and zero object. */
+ KASSERT(!lock_initalized(lock), ("lock \"%s\" %p already initialized",
+ name, lock));
+
+ /* Look up lock class to find its index. */
+ for (i = 0; i < LOCK_CLASS_MAX; i++)
+ if (lock_classes[i] == class) {
+ lock->lo_flags = i << LO_CLASSSHIFT;
+ break;
+ }
+ KASSERT(i < LOCK_CLASS_MAX, ("unknown lock class %p", class));
+
+ /* Initialize the lock object. */
+ lock->lo_name = name;
+ lock->lo_flags |= flags | LO_INITIALIZED;
+ LOCK_LOG_INIT(lock, 0);
+ WITNESS_INIT(lock, (type != NULL) ? type : name);
+}
+
+void
+lock_destroy(struct lock_object *lock)
+{
+
+ KASSERT(lock_initalized(lock), ("lock %p is not initialized", lock));
+ WITNESS_DESTROY(lock);
+ LOCK_LOG_DESTROY(lock, 0);
+ lock->lo_flags &= ~LO_INITIALIZED;
+}
+
+#ifdef DDB
+DB_SHOW_COMMAND(lock, db_show_lock)
+{
+ struct lock_object *lock;
+ struct lock_class *class;
+
+ if (!have_addr)
+ return;
+ lock = (struct lock_object *)addr;
+ if (LO_CLASSINDEX(lock) > LOCK_CLASS_MAX) {
+ db_printf("Unknown lock class: %d\n", LO_CLASSINDEX(lock));
+ return;
+ }
+ class = LOCK_CLASS(lock);
+ db_printf(" class: %s\n", class->lc_name);
+ db_printf(" name: %s\n", lock->lo_name);
+ class->lc_ddb_show(lock);
+}
+#endif
+
+#ifdef LOCK_PROFILING
+
+/*
+ * One object per-thread for each lock the thread owns. Tracks individual
+ * lock instances.
+ */
+struct lock_profile_object {
+ LIST_ENTRY(lock_profile_object) lpo_link;
+ struct lock_object *lpo_obj;
+ const char *lpo_file;
+ int lpo_line;
+ uint16_t lpo_ref;
+ uint16_t lpo_cnt;
+ uint64_t lpo_acqtime;
+ uint64_t lpo_waittime;
+ u_int lpo_contest_locking;
+};
+
+/*
+ * One lock_prof for each (file, line, lock object) triple.
+ */
+struct lock_prof {
+ SLIST_ENTRY(lock_prof) link;
+ struct lock_class *class;
+ const char *file;
+ const char *name;
+ int line;
+ int ticks;
+ uintmax_t cnt_wait_max;
+ uintmax_t cnt_max;
+ uintmax_t cnt_tot;
+ uintmax_t cnt_wait;
+ uintmax_t cnt_cur;
+ uintmax_t cnt_contest_locking;
+};
+
+SLIST_HEAD(lphead, lock_prof);
+
+#define LPROF_HASH_SIZE 4096
+#define LPROF_HASH_MASK (LPROF_HASH_SIZE - 1)
+#define LPROF_CACHE_SIZE 4096
+
+/*
+ * Array of objects and profs for each type of object for each cpu. Spinlocks
+ * are handled separately because a thread may be preempted and acquire a
+ * spinlock while in the lock profiling code of a non-spinlock. In this way
+ * we only need a critical section to protect the per-cpu lists.
+ */
+struct lock_prof_type {
+ struct lphead lpt_lpalloc;
+ struct lpohead lpt_lpoalloc;
+ struct lphead lpt_hash[LPROF_HASH_SIZE];
+ struct lock_prof lpt_prof[LPROF_CACHE_SIZE];
+ struct lock_profile_object lpt_objs[LPROF_CACHE_SIZE];
+};
+
+struct lock_prof_cpu {
+ struct lock_prof_type lpc_types[2]; /* One for spin one for other. */
+};
+
+struct lock_prof_cpu *lp_cpu[MAXCPU];
+
+volatile int lock_prof_enable = 0;
+static volatile int lock_prof_resetting;
+
+#define LPROF_SBUF_SIZE 256
+
+static int lock_prof_rejected;
+static int lock_prof_skipspin;
+static int lock_prof_skipcount;
+
+#ifndef USE_CPU_NANOSECONDS
+uint64_t
+nanoseconds(void)
+{
+ struct bintime bt;
+ uint64_t ns;
+
+ binuptime(&bt);
+ /* From bintime2timespec */
+ ns = bt.sec * (uint64_t)1000000000;
+ ns += ((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32;
+ return (ns);
+}
+#endif
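A short note on the fixed-point arithmetic above: bt.frac is a 64-bit binary fraction of one second, so the exact conversion is

    ns = bt.sec * 10^9 + bt.frac * 10^9 / 2^64

To keep the intermediate product within 64 bits, the code uses only the upper 32 bits of the fraction:

    ns ~= bt.sec * 10^9 + ((bt.frac >> 32) * 10^9) >> 32

which discards at most 2^32 / 2^64 s, roughly 0.23 ns, far below the microsecond granularity the profiler reports.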
+
+static void
+lock_prof_init_type(struct lock_prof_type *type)
+{
+ int i;
+
+ SLIST_INIT(&type->lpt_lpalloc);
+ LIST_INIT(&type->lpt_lpoalloc);
+ for (i = 0; i < LPROF_CACHE_SIZE; i++) {
+ SLIST_INSERT_HEAD(&type->lpt_lpalloc, &type->lpt_prof[i],
+ link);
+ LIST_INSERT_HEAD(&type->lpt_lpoalloc, &type->lpt_objs[i],
+ lpo_link);
+ }
+}
+
+static void
+lock_prof_init(void *arg)
+{
+ int cpu;
+
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ lp_cpu[cpu] = malloc(sizeof(*lp_cpu[cpu]), M_DEVBUF,
+ M_WAITOK | M_ZERO);
+ lock_prof_init_type(&lp_cpu[cpu]->lpc_types[0]);
+ lock_prof_init_type(&lp_cpu[cpu]->lpc_types[1]);
+ }
+}
+SYSINIT(lockprof, SI_SUB_SMP, SI_ORDER_ANY, lock_prof_init, NULL);
+
+static void
+lock_prof_reset_wait(void)
+{
+
+ /*
+ * Spin relinquishing our cpu so that quiesce_all_cpus may
+ * complete.
+ */
+ while (lock_prof_resetting)
+ sched_relinquish(curthread);
+}
+
+static void
+lock_prof_reset(void)
+{
+ struct lock_prof_cpu *lpc;
+ int enabled, i, cpu;
+
+ /*
+ * We race not only with lock acquisition and release but also with
+ * thread exit. To be certain that threads exit without valid head
+ * pointers, they must see resetting set before enabled is cleared;
+ * otherwise a lock could be left on a per-thread list because disabled
+ * was observed, without the thread waiting for reset() to remove it
+ * below.
+ */
+ atomic_store_rel_int(&lock_prof_resetting, 1);
+ enabled = lock_prof_enable;
+ lock_prof_enable = 0;
+ quiesce_all_cpus("profreset", 0);
+ /*
+ * Some objects may have migrated between CPUs. Clear all links
+ * before we zero the structures. Some items may still be linked
+ * into per-thread lists as well.
+ */
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ lpc = lp_cpu[cpu];
+ for (i = 0; i < LPROF_CACHE_SIZE; i++) {
+ LIST_REMOVE(&lpc->lpc_types[0].lpt_objs[i], lpo_link);
+ LIST_REMOVE(&lpc->lpc_types[1].lpt_objs[i], lpo_link);
+ }
+ }
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ lpc = lp_cpu[cpu];
+ bzero(lpc, sizeof(*lpc));
+ lock_prof_init_type(&lpc->lpc_types[0]);
+ lock_prof_init_type(&lpc->lpc_types[1]);
+ }
+ atomic_store_rel_int(&lock_prof_resetting, 0);
+ lock_prof_enable = enabled;
+}
+
+static void
+lock_prof_output(struct lock_prof *lp, struct sbuf *sb)
+{
+ const char *p;
+
+ for (p = lp->file; p != NULL && strncmp(p, "../", 3) == 0; p += 3);
+ sbuf_printf(sb,
+ "%8ju %9ju %11ju %11ju %11ju %6ju %6ju %2ju %6ju %s:%d (%s:%s)\n",
+ lp->cnt_max / 1000, lp->cnt_wait_max / 1000, lp->cnt_tot / 1000,
+ lp->cnt_wait / 1000, lp->cnt_cur,
+ lp->cnt_cur == 0 ? (uintmax_t)0 :
+ lp->cnt_tot / (lp->cnt_cur * 1000),
+ lp->cnt_cur == 0 ? (uintmax_t)0 :
+ lp->cnt_wait / (lp->cnt_cur * 1000),
+ (uintmax_t)0, lp->cnt_contest_locking,
+ p, lp->line, lp->class->lc_name, lp->name);
+}
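For reference when reading the sysctl output produced below: the time columns (max, wait_max, total, wait_total, avg, wait_avg) are printed in microseconds, i.e. the nanosecond counters divided by 1000, while count is the raw acquisition count. The two averages are derived per acquisition on the fly:

    avg      = cnt_tot  / (cnt_cur * 1000)
    wait_avg = cnt_wait / (cnt_cur * 1000)

The cnt_hold column is always emitted as 0 in this version, and cnt_lock reports cnt_contest_locking.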
+
+static void
+lock_prof_sum(struct lock_prof *match, struct lock_prof *dst, int hash,
+ int spin, int t)
+{
+ struct lock_prof_type *type;
+ struct lock_prof *l;
+ int cpu;
+
+ dst->file = match->file;
+ dst->line = match->line;
+ dst->class = match->class;
+ dst->name = match->name;
+
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ if (lp_cpu[cpu] == NULL)
+ continue;
+ type = &lp_cpu[cpu]->lpc_types[spin];
+ SLIST_FOREACH(l, &type->lpt_hash[hash], link) {
+ if (l->ticks == t)
+ continue;
+ if (l->file != match->file || l->line != match->line ||
+ l->name != match->name)
+ continue;
+ l->ticks = t;
+ if (l->cnt_max > dst->cnt_max)
+ dst->cnt_max = l->cnt_max;
+ if (l->cnt_wait_max > dst->cnt_wait_max)
+ dst->cnt_wait_max = l->cnt_wait_max;
+ dst->cnt_tot += l->cnt_tot;
+ dst->cnt_wait += l->cnt_wait;
+ dst->cnt_cur += l->cnt_cur;
+ dst->cnt_contest_locking += l->cnt_contest_locking;
+ }
+ }
+
+}
+
+static void
+lock_prof_type_stats(struct lock_prof_type *type, struct sbuf *sb, int spin,
+ int t)
+{
+ struct lock_prof *l;
+ int i;
+
+ for (i = 0; i < LPROF_HASH_SIZE; ++i) {
+ SLIST_FOREACH(l, &type->lpt_hash[i], link) {
+ struct lock_prof lp = {};
+
+ if (l->ticks == t)
+ continue;
+ lock_prof_sum(l, &lp, i, spin, t);
+ lock_prof_output(&lp, sb);
+ }
+ }
+}
+
+static int
+dump_lock_prof_stats(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf *sb;
+ int error, cpu, t;
+ int enabled;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sb = sbuf_new_for_sysctl(NULL, NULL, LPROF_SBUF_SIZE, req);
+ sbuf_printf(sb, "\n%8s %9s %11s %11s %11s %6s %6s %2s %6s %s\n",
+ "max", "wait_max", "total", "wait_total", "count", "avg", "wait_avg", "cnt_hold", "cnt_lock", "name");
+ enabled = lock_prof_enable;
+ lock_prof_enable = 0;
+ quiesce_all_cpus("profstat", 0);
+ t = ticks;
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ if (lp_cpu[cpu] == NULL)
+ continue;
+ lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[0], sb, 0, t);
+ lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[1], sb, 1, t);
+ }
+ lock_prof_enable = enabled;
+
+ error = sbuf_finish(sb);
+ /* Output a trailing NUL. */
+ if (error == 0)
+ error = SYSCTL_OUT(req, "", 1);
+ sbuf_delete(sb);
+ return (error);
+}
+
+static int
+enable_lock_prof(SYSCTL_HANDLER_ARGS)
+{
+ int error, v;
+
+ v = lock_prof_enable;
+ error = sysctl_handle_int(oidp, &v, v, req);
+ if (error)
+ return (error);
+ if (req->newptr == NULL)
+ return (error);
+ if (v == lock_prof_enable)
+ return (0);
+ if (v == 1)
+ lock_prof_reset();
+ lock_prof_enable = !!v;
+
+ return (0);
+}
+
+static int
+reset_lock_prof_stats(SYSCTL_HANDLER_ARGS)
+{
+ int error, v;
+
+ v = 0;
+ error = sysctl_handle_int(oidp, &v, 0, req);
+ if (error)
+ return (error);
+ if (req->newptr == NULL)
+ return (error);
+ if (v == 0)
+ return (0);
+ lock_prof_reset();
+
+ return (0);
+}
+
+static struct lock_prof *
+lock_profile_lookup(struct lock_object *lo, int spin, const char *file,
+ int line)
+{
+ const char *unknown = "(unknown)";
+ struct lock_prof_type *type;
+ struct lock_prof *lp;
+ struct lphead *head;
+ const char *p;
+ u_int hash;
+
+ p = file;
+ if (p == NULL || *p == '\0')
+ p = unknown;
+ hash = (uintptr_t)lo->lo_name * 31 + (uintptr_t)p * 31 + line;
+ hash &= LPROF_HASH_MASK;
+ type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin];
+ head = &type->lpt_hash[hash];
+ SLIST_FOREACH(lp, head, link) {
+ if (lp->line == line && lp->file == p &&
+ lp->name == lo->lo_name)
+ return (lp);
+
+ }
+ lp = SLIST_FIRST(&type->lpt_lpalloc);
+ if (lp == NULL) {
+ lock_prof_rejected++;
+ return (lp);
+ }
+ SLIST_REMOVE_HEAD(&type->lpt_lpalloc, link);
+ lp->file = p;
+ lp->line = line;
+ lp->class = LOCK_CLASS(lo);
+ lp->name = lo->lo_name;
+ SLIST_INSERT_HEAD(&type->lpt_hash[hash], lp, link);
+ return (lp);
+}
+
+static struct lock_profile_object *
+lock_profile_object_lookup(struct lock_object *lo, int spin, const char *file,
+ int line)
+{
+ struct lock_profile_object *l;
+ struct lock_prof_type *type;
+ struct lpohead *head;
+
+ head = &curthread->td_lprof[spin];
+ LIST_FOREACH(l, head, lpo_link)
+ if (l->lpo_obj == lo && l->lpo_file == file &&
+ l->lpo_line == line)
+ return (l);
+ type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin];
+ l = LIST_FIRST(&type->lpt_lpoalloc);
+ if (l == NULL) {
+ lock_prof_rejected++;
+ return (NULL);
+ }
+ LIST_REMOVE(l, lpo_link);
+ l->lpo_obj = lo;
+ l->lpo_file = file;
+ l->lpo_line = line;
+ l->lpo_cnt = 0;
+ LIST_INSERT_HEAD(head, l, lpo_link);
+
+ return (l);
+}
+
+void
+lock_profile_obtain_lock_success(struct lock_object *lo, int contested,
+ uint64_t waittime, const char *file, int line)
+{
+ static int lock_prof_count;
+ struct lock_profile_object *l;
+ int spin;
+
+ if (SCHEDULER_STOPPED())
+ return;
+
+ /* don't reset the timer when/if recursing */
+ if (!lock_prof_enable || (lo->lo_flags & LO_NOPROFILE))
+ return;
+ if (lock_prof_skipcount &&
+ (++lock_prof_count % lock_prof_skipcount) != 0)
+ return;
+ spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0;
+ if (spin && lock_prof_skipspin == 1)
+ return;
+ critical_enter();
+ /* Recheck enabled now that we're in a critical section. */
+ if (lock_prof_enable == 0)
+ goto out;
+ l = lock_profile_object_lookup(lo, spin, file, line);
+ if (l == NULL)
+ goto out;
+ l->lpo_cnt++;
+ if (++l->lpo_ref > 1)
+ goto out;
+ l->lpo_contest_locking = contested;
+ l->lpo_acqtime = nanoseconds();
+ if (waittime && (l->lpo_acqtime > waittime))
+ l->lpo_waittime = l->lpo_acqtime - waittime;
+ else
+ l->lpo_waittime = 0;
+out:
+ critical_exit();
+}
+
+void
+lock_profile_thread_exit(struct thread *td)
+{
+#ifdef INVARIANTS
+ struct lock_profile_object *l;
+
+ MPASS(curthread->td_critnest == 0);
+#endif
+ /*
+ * If lock profiling was disabled we have to wait for reset to
+ * clear our pointers before we can exit safely.
+ */
+ lock_prof_reset_wait();
+#ifdef INVARIANTS
+ LIST_FOREACH(l, &td->td_lprof[0], lpo_link)
+ printf("thread still holds lock acquired at %s:%d\n",
+ l->lpo_file, l->lpo_line);
+ LIST_FOREACH(l, &td->td_lprof[1], lpo_link)
+ printf("thread still holds lock acquired at %s:%d\n",
+ l->lpo_file, l->lpo_line);
+#endif
+ MPASS(LIST_FIRST(&td->td_lprof[0]) == NULL);
+ MPASS(LIST_FIRST(&td->td_lprof[1]) == NULL);
+}
+
+void
+lock_profile_release_lock(struct lock_object *lo)
+{
+ struct lock_profile_object *l;
+ struct lock_prof_type *type;
+ struct lock_prof *lp;
+ uint64_t curtime, holdtime;
+ struct lpohead *head;
+ int spin;
+
+ if (SCHEDULER_STOPPED())
+ return;
+ if (lo->lo_flags & LO_NOPROFILE)
+ return;
+ spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0;
+ head = &curthread->td_lprof[spin];
+ if (LIST_FIRST(head) == NULL)
+ return;
+ critical_enter();
+ /* Recheck enabled now that we're in a critical section. */
+ if (lock_prof_enable == 0 && lock_prof_resetting == 1)
+ goto out;
+ /*
+ * If lock profiling is not enabled we still want to remove the
+ * lpo from our queue.
+ */
+ LIST_FOREACH(l, head, lpo_link)
+ if (l->lpo_obj == lo)
+ break;
+ if (l == NULL)
+ goto out;
+ if (--l->lpo_ref > 0)
+ goto out;
+ lp = lock_profile_lookup(lo, spin, l->lpo_file, l->lpo_line);
+ if (lp == NULL)
+ goto release;
+ curtime = nanoseconds();
+ if (curtime < l->lpo_acqtime)
+ goto release;
+ holdtime = curtime - l->lpo_acqtime;
+
+ /*
+ * Record if the lock has been held longer now than ever
+ * before.
+ */
+ if (holdtime > lp->cnt_max)
+ lp->cnt_max = holdtime;
+ if (l->lpo_waittime > lp->cnt_wait_max)
+ lp->cnt_wait_max = l->lpo_waittime;
+ lp->cnt_tot += holdtime;
+ lp->cnt_wait += l->lpo_waittime;
+ lp->cnt_contest_locking += l->lpo_contest_locking;
+ lp->cnt_cur += l->lpo_cnt;
+release:
+ LIST_REMOVE(l, lpo_link);
+ type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin];
+ LIST_INSERT_HEAD(&type->lpt_lpoalloc, l, lpo_link);
+out:
+ critical_exit();
+}
+
+static SYSCTL_NODE(_debug, OID_AUTO, lock, CTLFLAG_RD, NULL, "lock debugging");
+static SYSCTL_NODE(_debug_lock, OID_AUTO, prof, CTLFLAG_RD, NULL,
+ "lock profiling");
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipspin, CTLFLAG_RW,
+ &lock_prof_skipspin, 0, "Skip profiling on spinlocks.");
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipcount, CTLFLAG_RW,
+ &lock_prof_skipcount, 0, "Sample approximately every N lock acquisitions.");
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, rejected, CTLFLAG_RD,
+ &lock_prof_rejected, 0, "Number of rejected profiling records");
+SYSCTL_PROC(_debug_lock_prof, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD,
+ NULL, 0, dump_lock_prof_stats, "A", "Lock profiling statistics");
+SYSCTL_PROC(_debug_lock_prof, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW,
+ NULL, 0, reset_lock_prof_stats, "I", "Reset lock profiling statistics");
+SYSCTL_PROC(_debug_lock_prof, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
+ NULL, 0, enable_lock_prof, "I", "Enable lock profiling");
+
+#endif
diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c
new file mode 100644
index 0000000..1e61274
--- /dev/null
+++ b/sys/kern/subr_log.c
@@ -0,0 +1,310 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_log.c 8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * Error log buffer for kernel printf's.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/filio.h>
+#include <sys/ttycom.h>
+#include <sys/msgbuf.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/poll.h>
+#include <sys/filedesc.h>
+#include <sys/sysctl.h>
+
+#define LOG_RDPRI (PZERO + 1)
+
+#define LOG_ASYNC 0x04
+
+static d_open_t logopen;
+static d_close_t logclose;
+static d_read_t logread;
+static d_ioctl_t logioctl;
+static d_poll_t logpoll;
+static d_kqfilter_t logkqfilter;
+
+static void logtimeout(void *arg);
+
+static struct cdevsw log_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = logopen,
+ .d_close = logclose,
+ .d_read = logread,
+ .d_ioctl = logioctl,
+ .d_poll = logpoll,
+ .d_kqfilter = logkqfilter,
+ .d_name = "log",
+};
+
+static int logkqread(struct knote *note, long hint);
+static void logkqdetach(struct knote *note);
+
+static struct filterops log_read_filterops = {
+ .f_isfd = 1,
+ .f_attach = NULL,
+ .f_detach = logkqdetach,
+ .f_event = logkqread,
+};
+
+static struct logsoftc {
+ int sc_state; /* see above for possibilities */
+ struct selinfo sc_selp; /* process waiting on select call */
+ struct sigio *sc_sigio; /* information for async I/O */
+ struct callout sc_callout; /* callout to wakeup syslog */
+} logsoftc;
+
+int log_open; /* also used in log() */
+static struct cv log_wakeup;
+struct mtx msgbuf_lock;
+MTX_SYSINIT(msgbuf_lock, &msgbuf_lock, "msgbuf lock", MTX_DEF);
+
+/* Times per second to check for a pending syslog wakeup. */
+static int log_wakeups_per_second = 5;
+SYSCTL_INT(_kern, OID_AUTO, log_wakeups_per_second, CTLFLAG_RW,
+ &log_wakeups_per_second, 0, "");
+
+/*ARGSUSED*/
+static int
+logopen(struct cdev *dev, int flags, int mode, struct thread *td)
+{
+
+ if (log_wakeups_per_second < 1) {
+ printf("syslog wakeup is less than one. Adjusting to 1.\n");
+ log_wakeups_per_second = 1;
+ }
+
+ mtx_lock(&msgbuf_lock);
+ if (log_open) {
+ mtx_unlock(&msgbuf_lock);
+ return (EBUSY);
+ }
+ log_open = 1;
+ callout_reset_sbt(&logsoftc.sc_callout,
+ SBT_1S / log_wakeups_per_second, 0, logtimeout, NULL, C_PREL(1));
+ mtx_unlock(&msgbuf_lock);
+
+ fsetown(td->td_proc->p_pid, &logsoftc.sc_sigio); /* signal process only */
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+logclose(struct cdev *dev, int flag, int mode, struct thread *td)
+{
+
+ funsetown(&logsoftc.sc_sigio);
+
+ mtx_lock(&msgbuf_lock);
+ callout_stop(&logsoftc.sc_callout);
+ logsoftc.sc_state = 0;
+ log_open = 0;
+ mtx_unlock(&msgbuf_lock);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+logread(struct cdev *dev, struct uio *uio, int flag)
+{
+ char buf[128];
+ struct msgbuf *mbp = msgbufp;
+ int error = 0, l;
+
+ mtx_lock(&msgbuf_lock);
+ while (msgbuf_getcount(mbp) == 0) {
+ if (flag & IO_NDELAY) {
+ mtx_unlock(&msgbuf_lock);
+ return (EWOULDBLOCK);
+ }
+ if ((error = cv_wait_sig(&log_wakeup, &msgbuf_lock)) != 0) {
+ mtx_unlock(&msgbuf_lock);
+ return (error);
+ }
+ }
+
+ while (uio->uio_resid > 0) {
+ l = imin(sizeof(buf), uio->uio_resid);
+ l = msgbuf_getbytes(mbp, buf, l);
+ if (l == 0)
+ break;
+ mtx_unlock(&msgbuf_lock);
+ error = uiomove(buf, l, uio);
+ if (error || uio->uio_resid == 0)
+ return (error);
+ mtx_lock(&msgbuf_lock);
+ }
+ mtx_unlock(&msgbuf_lock);
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+logpoll(struct cdev *dev, int events, struct thread *td)
+{
+ int revents = 0;
+
+ if (events & (POLLIN | POLLRDNORM)) {
+ mtx_lock(&msgbuf_lock);
+ if (msgbuf_getcount(msgbufp) > 0)
+ revents |= events & (POLLIN | POLLRDNORM);
+ else
+ selrecord(td, &logsoftc.sc_selp);
+ mtx_unlock(&msgbuf_lock);
+ }
+ return (revents);
+}
+
+static int
+logkqfilter(struct cdev *dev, struct knote *kn)
+{
+
+ if (kn->kn_filter != EVFILT_READ)
+ return (EINVAL);
+
+ kn->kn_fop = &log_read_filterops;
+ kn->kn_hook = NULL;
+
+ mtx_lock(&msgbuf_lock);
+ knlist_add(&logsoftc.sc_selp.si_note, kn, 1);
+ mtx_unlock(&msgbuf_lock);
+ return (0);
+}
+
+static int
+logkqread(struct knote *kn, long hint)
+{
+
+ mtx_assert(&msgbuf_lock, MA_OWNED);
+ kn->kn_data = msgbuf_getcount(msgbufp);
+ return (kn->kn_data != 0);
+}
+
+static void
+logkqdetach(struct knote *kn)
+{
+
+ mtx_lock(&msgbuf_lock);
+ knlist_remove(&logsoftc.sc_selp.si_note, kn, 1);
+ mtx_unlock(&msgbuf_lock);
+}
+
+static void
+logtimeout(void *arg)
+{
+
+ if (!log_open)
+ return;
+ if (msgbuftrigger == 0)
+ goto done;
+ msgbuftrigger = 0;
+ selwakeuppri(&logsoftc.sc_selp, LOG_RDPRI);
+ KNOTE_LOCKED(&logsoftc.sc_selp.si_note, 0);
+ if ((logsoftc.sc_state & LOG_ASYNC) && logsoftc.sc_sigio != NULL)
+ pgsigio(&logsoftc.sc_sigio, SIGIO, 0);
+ cv_broadcastpri(&log_wakeup, LOG_RDPRI);
+done:
+ if (log_wakeups_per_second < 1) {
+ printf("syslog wakeup is less than one. Adjusting to 1.\n");
+ log_wakeups_per_second = 1;
+ }
+ callout_reset_sbt(&logsoftc.sc_callout,
+ SBT_1S / log_wakeups_per_second, 0, logtimeout, NULL, C_PREL(1));
+}
+
+/*ARGSUSED*/
+static int
+logioctl(struct cdev *dev, u_long com, caddr_t data, int flag, struct thread *td)
+{
+
+ switch (com) {
+
+ /* return number of characters immediately available */
+ case FIONREAD:
+ *(int *)data = msgbuf_getcount(msgbufp);
+ break;
+
+ case FIONBIO:
+ break;
+
+ case FIOASYNC:
+ mtx_lock(&msgbuf_lock);
+ if (*(int *)data)
+ logsoftc.sc_state |= LOG_ASYNC;
+ else
+ logsoftc.sc_state &= ~LOG_ASYNC;
+ mtx_unlock(&msgbuf_lock);
+ break;
+
+ case FIOSETOWN:
+ return (fsetown(*(int *)data, &logsoftc.sc_sigio));
+
+ case FIOGETOWN:
+ *(int *)data = fgetown(&logsoftc.sc_sigio);
+ break;
+
+ /* This is deprecated, FIOSETOWN should be used instead. */
+ case TIOCSPGRP:
+ return (fsetown(-(*(int *)data), &logsoftc.sc_sigio));
+
+ /* This is deprecated, FIOGETOWN should be used instead */
+ case TIOCGPGRP:
+ *(int *)data = -fgetown(&logsoftc.sc_sigio);
+ break;
+
+ default:
+ return (ENOTTY);
+ }
+ return (0);
+}
+
+static void
+log_drvinit(void *unused)
+{
+
+ cv_init(&log_wakeup, "klog");
+ callout_init_mtx(&logsoftc.sc_callout, &msgbuf_lock, 0);
+ knlist_init_mtx(&logsoftc.sc_selp.si_note, &msgbuf_lock);
+ make_dev_credf(MAKEDEV_ETERNAL, &log_cdevsw, 0, NULL, UID_ROOT,
+ GID_WHEEL, 0600, "klog");
+}
+
+SYSINIT(logdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,log_drvinit,NULL);
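For context, a hedged userland sketch of the consumer side: the driver above creates /dev/klog (root-only, mode 0600), and a single reader, normally syslogd, drains it. The program below is a minimal stand-in for illustration, not how syslogd is actually implemented:

    /* Sketch: drain kernel log messages from /dev/klog (run as root). */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
            char buf[1024];
            ssize_t n;
            int fd;

            fd = open("/dev/klog", O_RDONLY); /* EBUSY if another reader holds it */
            if (fd == -1)
                    return (1);
            while ((n = read(fd, buf, sizeof(buf))) > 0)
                    fwrite(buf, 1, (size_t)n, stdout);
            close(fd);
            return (0);
    }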
diff --git a/sys/kern/subr_mbpool.c b/sys/kern/subr_mbpool.c
new file mode 100644
index 0000000..0b8cda6
--- /dev/null
+++ b/sys/kern/subr_mbpool.c
@@ -0,0 +1,402 @@
+/*-
+ * Copyright (c) 2003
+ * Fraunhofer Institute for Open Communication Systems (FhG Fokus).
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author: Hartmut Brandt <harti@freebsd.org>
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+
+#include <machine/bus.h>
+
+#include <sys/mbuf.h>
+#include <sys/mbpool.h>
+
+MODULE_VERSION(libmbpool, 1);
+
+/*
+ * Memory is allocated as DMA-able pages. Each page is divided into a number
+ * of equal chunks where the last 4 bytes of each chunk are occupied by
+ * the page number and the chunk number. The caller must take these four
+ * bytes into account when specifying the chunk size. Each page is mapped by
+ * its own DMA map using the user specified DMA tag.
+ *
+ * Each chunk has a used and a card bit in the high bits of its page number:
+ *	card used
+ *	  0    0	chunk is free and may be allocated
+ *	  1    1	chunk has been given to the interface
+ *	  0    1	chunk is traveling through the system
+ *	  1    0	illegal
+ */
+struct mbtrail {
+ uint16_t chunk;
+ uint16_t page;
+};
+#define MBP_CARD 0x8000
+#define MBP_USED 0x4000
+#define MBP_PMSK 0x3fff /* page number mask */
+#define MBP_CMSK 0x01ff /* chunk number mask */
+
+struct mbfree {
+ SLIST_ENTRY(mbfree) link; /* link on free list */
+};
+
+struct mbpage {
+ bus_dmamap_t map; /* map for this page */
+ bus_addr_t phy; /* physical address */
+ void *va; /* the memory */
+};
+
+struct mbpool {
+ const char *name; /* a name for this pool */
+ bus_dma_tag_t dmat; /* tag for mapping */
+ u_int max_pages; /* maximum number of pages */
+ size_t page_size; /* size of each allocation */
+ size_t chunk_size; /* size of each external mbuf */
+
+ struct mtx free_lock; /* lock of free list */
+ SLIST_HEAD(, mbfree) free_list; /* free list */
+ u_int npages; /* current number of pages */
+ u_int nchunks; /* chunks per page */
+ struct mbpage pages[]; /* pages */
+};
+
+static MALLOC_DEFINE(M_MBPOOL, "mbpools", "mbuf pools");
+
+/*
+ * Make a trail pointer from a chunk pointer
+ */
+#define C2T(P, C) ((struct mbtrail *)((char *)(C) + (P)->chunk_size - \
+ sizeof(struct mbtrail)))
+
+/*
+ * Make a free chunk pointer from a chunk number
+ */
+#define N2C(P, PG, C) ((struct mbfree *)((char *)(PG)->va + \
+ (C) * (P)->chunk_size))
+
+/*
+ * Make/parse handles
+ */
+#define HMAKE(P, C) ((((P) & MBP_PMSK) << 16) | ((C) << 7))
+#define HPAGE(H) (((H) >> 16) & MBP_PMSK)
+#define HCHUNK(H) (((H) >> 7) & MBP_CMSK)
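A worked example of the handle layout defined by the macros above (pure arithmetic, derivable from the definitions): HMAKE() places the 14-bit page number in bits 16-29 and the chunk number in bits 7-15, so for page 3, chunk 5:

    HMAKE(3, 5)     = (3 << 16) | (5 << 7)     = 0x30000 | 0x280 = 0x30280
    HPAGE(0x30280)  = (0x30280 >> 16) & 0x3fff = 3
    HCHUNK(0x30280) = (0x30280 >> 7)  & 0x01ff = 5

Bits 0-6 of a handle remain zero, and the MBP_CARD/MBP_USED flags never appear in one because HMAKE() masks the page number with MBP_PMSK.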
+
+/*
+ * initialize a pool
+ */
+int
+mbp_create(struct mbpool **pp, const char *name, bus_dma_tag_t dmat,
+ u_int max_pages, size_t page_size, size_t chunk_size)
+{
+ u_int nchunks;
+
+ if (max_pages > MBPOOL_MAX_MAXPAGES || chunk_size == 0)
+ return (EINVAL);
+ nchunks = page_size / chunk_size;
+ if (nchunks == 0 || nchunks > MBPOOL_MAX_CHUNKS)
+ return (EINVAL);
+
+ (*pp) = malloc(sizeof(struct mbpool) +
+ max_pages * sizeof(struct mbpage),
+ M_MBPOOL, M_WAITOK | M_ZERO);
+
+ (*pp)->name = name;
+ (*pp)->dmat = dmat;
+ (*pp)->max_pages = max_pages;
+ (*pp)->page_size = page_size;
+ (*pp)->chunk_size = chunk_size;
+ (*pp)->nchunks = nchunks;
+
+ SLIST_INIT(&(*pp)->free_list);
+ mtx_init(&(*pp)->free_lock, name, NULL, MTX_DEF);
+
+ return (0);
+}
+
+/*
+ * destroy a pool
+ */
+void
+mbp_destroy(struct mbpool *p)
+{
+ u_int i;
+ struct mbpage *pg;
+#ifdef DIAGNOSTIC
+ struct mbtrail *tr;
+ u_int b;
+#endif
+
+ for (i = 0; i < p->npages; i++) {
+ pg = &p->pages[i];
+#ifdef DIAGNOSTIC
+ for (b = 0; b < p->nchunks; b++) {
+ tr = C2T(p, N2C(p, pg, b));
+ if (tr->page & MBP_CARD)
+ printf("%s: (%s) buf still on card"
+ " %u/%u\n", __func__, p->name, i, b);
+ if (tr->page & MBP_USED)
+ printf("%s: (%s) sbuf still in use"
+ " %u/%u\n", __func__, p->name, i, b);
+ }
+#endif
+ bus_dmamap_unload(p->dmat, pg->map);
+ bus_dmamem_free(p->dmat, pg->va, pg->map);
+ }
+ mtx_destroy(&p->free_lock);
+
+ free(p, M_MBPOOL);
+}
+
+/*
+ * Helper function when loading a one segment DMA buffer.
+ */
+static void
+mbp_callback(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
+{
+ if (error == 0)
+ *(bus_addr_t *)arg = segs[0].ds_addr;
+}
+
+/*
+ * Allocate a new page
+ */
+static void
+mbp_alloc_page(struct mbpool *p)
+{
+ int error;
+ struct mbpage *pg;
+ u_int i;
+ struct mbfree *f;
+ struct mbtrail *t;
+
+ if (p->npages == p->max_pages) {
+#ifdef DIAGNOSTIC
+ printf("%s: (%s) page limit reached %u\n", __func__,
+ p->name, p->max_pages);
+#endif
+ return;
+ }
+ pg = &p->pages[p->npages];
+
+ error = bus_dmamem_alloc(p->dmat, &pg->va, BUS_DMA_NOWAIT, &pg->map);
+ if (error != 0) {
+ /* pg is embedded in the pool allocation itself; nothing to free. */
+ return;
+ }
+
+ error = bus_dmamap_load(p->dmat, pg->map, pg->va, p->page_size,
+ mbp_callback, &pg->phy, 0);
+ if (error != 0) {
+ bus_dmamem_free(p->dmat, pg->va, pg->map);
+ return;
+ }
+
+ for (i = 0; i < p->nchunks; i++) {
+ f = N2C(p, pg, i);
+ t = C2T(p, f);
+ t->page = p->npages;
+ t->chunk = i;
+ SLIST_INSERT_HEAD(&p->free_list, f, link);
+ }
+
+ p->npages++;
+}
+
+/*
+ * allocate a chunk
+ */
+void *
+mbp_alloc(struct mbpool *p, bus_addr_t *pap, uint32_t *hp)
+{
+ struct mbfree *cf;
+ struct mbtrail *t;
+
+ mtx_lock(&p->free_lock);
+ if ((cf = SLIST_FIRST(&p->free_list)) == NULL) {
+ mbp_alloc_page(p);
+ cf = SLIST_FIRST(&p->free_list);
+ }
+ if (cf == NULL) {
+ mtx_unlock(&p->free_lock);
+ return (NULL);
+ }
+ SLIST_REMOVE_HEAD(&p->free_list, link);
+ mtx_unlock(&p->free_lock);
+
+ t = C2T(p, cf);
+
+ *pap = p->pages[t->page].phy + t->chunk * p->chunk_size;
+ *hp = HMAKE(t->page, t->chunk);
+
+ t->page |= MBP_CARD | MBP_USED;
+
+ return (cf);
+}
+
+/*
+ * Free a chunk
+ */
+void
+mbp_free(struct mbpool *p, void *ptr)
+{
+ struct mbtrail *t;
+
+ mtx_lock(&p->free_lock);
+ t = C2T(p, ptr);
+ t->page &= ~(MBP_USED | MBP_CARD);
+ SLIST_INSERT_HEAD(&p->free_list, (struct mbfree *)ptr, link);
+ mtx_unlock(&p->free_lock);
+}
+
+/*
+ * Mbuf system external mbuf free routine
+ */
+int
+mbp_ext_free(struct mbuf *m, void *buf, void *arg)
+{
+ mbp_free(arg, buf);
+
+ return (EXT_FREE_OK);
+}
+
+/*
+ * Free all buffers that are marked as being on the card
+ */
+void
+mbp_card_free(struct mbpool *p)
+{
+ u_int i, b;
+ struct mbpage *pg;
+ struct mbtrail *tr;
+ struct mbfree *cf;
+
+ mtx_lock(&p->free_lock);
+ for (i = 0; i < p->npages; i++) {
+ pg = &p->pages[i];
+ for (b = 0; b < p->nchunks; b++) {
+ cf = N2C(p, pg, b);
+ tr = C2T(p, cf);
+ if (tr->page & MBP_CARD) {
+ tr->page &= MBP_PMSK;
+ SLIST_INSERT_HEAD(&p->free_list, cf, link);
+ }
+ }
+ }
+ mtx_unlock(&p->free_lock);
+}
+
+/*
+ * Count buffers
+ */
+void
+mbp_count(struct mbpool *p, u_int *used, u_int *card, u_int *free)
+{
+ u_int i, b;
+ struct mbpage *pg;
+ struct mbtrail *tr;
+ struct mbfree *cf;
+
+ *used = *card = *free = 0;
+ for (i = 0; i < p->npages; i++) {
+ pg = &p->pages[i];
+ for (b = 0; b < p->nchunks; b++) {
+ tr = C2T(p, N2C(p, pg, b));
+ if (tr->page & MBP_CARD)
+ (*card)++;
+ if (tr->page & MBP_USED)
+ (*used)++;
+ }
+ }
+ mtx_lock(&p->free_lock);
+ SLIST_FOREACH(cf, &p->free_list, link)
+ (*free)++;
+ mtx_unlock(&p->free_lock);
+}
+
+/*
+ * Get the buffer from a handle and clear the card flag.
+ */
+void *
+mbp_get(struct mbpool *p, uint32_t h)
+{
+ struct mbfree *cf;
+ struct mbtrail *tr;
+
+ cf = N2C(p, &p->pages[HPAGE(h)], HCHUNK(h));
+ tr = C2T(p, cf);
+
+#ifdef DIAGNOSTIC
+ if (!(tr->page & MBP_CARD))
+ printf("%s: (%s) chunk %u page %u not on card\n", __func__,
+ p->name, HCHUNK(h), HPAGE(h));
+#endif
+
+ tr->page &= ~MBP_CARD;
+ return (cf);
+}
+
+/*
+ * Get the buffer from a handle and keep the card flag.
+ */
+void *
+mbp_get_keep(struct mbpool *p, uint32_t h)
+{
+ struct mbfree *cf;
+ struct mbtrail *tr;
+
+ cf = N2C(p, &p->pages[HPAGE(h)], HCHUNK(h));
+ tr = C2T(p, cf);
+
+#ifdef DIAGNOSTIC
+ if (!(tr->page & MBP_CARD))
+ printf("%s: (%s) chunk %u page %u not on card\n", __func__,
+ p->name, HCHUNK(h), HPAGE(h));
+#endif
+
+ return (cf);
+}
+
+/*
+ * sync the chunk
+ */
+void
+mbp_sync(struct mbpool *p, uint32_t h, bus_addr_t off, bus_size_t len, u_int op)
+{
+
+#if 0
+ bus_dmamap_sync_size(p->dmat, p->pages[HPAGE(h)].map,
+ HCHUNK(h) * p->chunk_size + off, len, op);
+#endif
+}
diff --git a/sys/kern/subr_mchain.c b/sys/kern/subr_mchain.c
new file mode 100644
index 0000000..e9d7d22
--- /dev/null
+++ b/sys/kern/subr_mchain.c
@@ -0,0 +1,554 @@
+/*-
+ * Copyright (c) 2000, 2001 Boris Popov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/endian.h>
+#include <sys/errno.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/uio.h>
+
+#include <sys/mchain.h>
+
+FEATURE(libmchain, "mchain library");
+
+MODULE_VERSION(libmchain, 1);
+
+#define MBERROR(format, ...) printf("%s(%d): "format, __func__ , \
+ __LINE__ , ## __VA_ARGS__)
+
+#define MBPANIC(format, ...) printf("%s(%d): "format, __func__ , \
+ __LINE__ , ## __VA_ARGS__)
+
+/*
+ * Various helper functions
+ */
+int
+mb_init(struct mbchain *mbp)
+{
+ struct mbuf *m;
+
+ m = m_gethdr(M_WAITOK, MT_DATA);
+ m->m_len = 0;
+ mb_initm(mbp, m);
+ return (0);
+}
+
+void
+mb_initm(struct mbchain *mbp, struct mbuf *m)
+{
+ bzero(mbp, sizeof(*mbp));
+ mbp->mb_top = mbp->mb_cur = m;
+ mbp->mb_mleft = M_TRAILINGSPACE(m);
+}
+
+void
+mb_done(struct mbchain *mbp)
+{
+ if (mbp->mb_top) {
+ m_freem(mbp->mb_top);
+ mbp->mb_top = NULL;
+ }
+}
+
+struct mbuf *
+mb_detach(struct mbchain *mbp)
+{
+ struct mbuf *m;
+
+ m = mbp->mb_top;
+ mbp->mb_top = NULL;
+ return (m);
+}
+
+int
+mb_fixhdr(struct mbchain *mbp)
+{
+ return (mbp->mb_top->m_pkthdr.len = m_fixhdr(mbp->mb_top));
+}
+
+/*
+ * Check whether an object of size 'size' fits at the current position and
+ * allocate a new mbuf if not. Advance the pointers and increase the length
+ * of the mbuf(s). Return a pointer to the object placeholder, or NULL if an
+ * error occurred. Note: size must be <= MLEN.
+ */
+caddr_t
+mb_reserve(struct mbchain *mbp, int size)
+{
+ struct mbuf *m, *mn;
+ caddr_t bpos;
+
+ if (size > MLEN)
+ panic("mb_reserve: size = %d\n", size);
+ m = mbp->mb_cur;
+ if (mbp->mb_mleft < size) {
+ mn = m_get(M_WAITOK, MT_DATA);
+ mbp->mb_cur = m->m_next = mn;
+ m = mn;
+ m->m_len = 0;
+ mbp->mb_mleft = M_TRAILINGSPACE(m);
+ }
+ mbp->mb_mleft -= size;
+ mbp->mb_count += size;
+ bpos = mtod(m, caddr_t) + m->m_len;
+ m->m_len += size;
+ return (bpos);
+}
+
+int
+mb_put_padbyte(struct mbchain *mbp)
+{
+ caddr_t dst;
+ uint8_t x = 0;
+
+ dst = mtod(mbp->mb_cur, caddr_t) + mbp->mb_cur->m_len;
+
+ /* Only add padding if address is odd */
+ if ((unsigned long)dst & 1)
+ return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
+ else
+ return (0);
+}
+
+int
+mb_put_uint8(struct mbchain *mbp, uint8_t x)
+{
+ return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
+}
+
+int
+mb_put_uint16be(struct mbchain *mbp, uint16_t x)
+{
+ x = htobe16(x);
+ return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
+}
+
+int
+mb_put_uint16le(struct mbchain *mbp, uint16_t x)
+{
+ x = htole16(x);
+ return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
+}
+
+int
+mb_put_uint32be(struct mbchain *mbp, uint32_t x)
+{
+ x = htobe32(x);
+ return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
+}
+
+int
+mb_put_uint32le(struct mbchain *mbp, uint32_t x)
+{
+ x = htole32(x);
+ return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
+}
+
+int
+mb_put_int64be(struct mbchain *mbp, int64_t x)
+{
+ x = htobe64(x);
+ return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
+}
+
+int
+mb_put_int64le(struct mbchain *mbp, int64_t x)
+{
+ x = htole64(x);
+ return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
+}
+
+int
+mb_put_mem(struct mbchain *mbp, c_caddr_t source, int size, int type)
+{
+ struct mbuf *m;
+ caddr_t dst;
+ c_caddr_t src;
+ int cplen, error, mleft, count;
+ size_t srclen, dstlen;
+
+ m = mbp->mb_cur;
+ mleft = mbp->mb_mleft;
+
+ while (size > 0) {
+ if (mleft == 0) {
+ if (m->m_next == NULL)
+ m = m_getm(m, size, M_WAITOK, MT_DATA);
+ else
+ m = m->m_next;
+ mleft = M_TRAILINGSPACE(m);
+ continue;
+ }
+ cplen = mleft > size ? size : mleft;
+ srclen = dstlen = cplen;
+ dst = mtod(m, caddr_t) + m->m_len;
+ switch (type) {
+ case MB_MCUSTOM:
+ srclen = size;
+ dstlen = mleft;
+ error = mbp->mb_copy(mbp, source, dst, &srclen, &dstlen);
+ if (error)
+ return (error);
+ break;
+ case MB_MINLINE:
+ for (src = source, count = cplen; count; count--)
+ *dst++ = *src++;
+ break;
+ case MB_MSYSTEM:
+ bcopy(source, dst, cplen);
+ break;
+ case MB_MUSER:
+ error = copyin(source, dst, cplen);
+ if (error)
+ return (error);
+ break;
+ case MB_MZERO:
+ bzero(dst, cplen);
+ break;
+ }
+ size -= srclen;
+ source += srclen;
+ m->m_len += dstlen;
+ mleft -= dstlen;
+ mbp->mb_count += dstlen;
+ }
+ mbp->mb_cur = m;
+ mbp->mb_mleft = mleft;
+ return (0);
+}
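+
+/*
+ * Usage sketch (illustrative only; 'cmd', 'payload' and 'paylen' stand for
+ * hypothetical caller data): composing a little-endian request with the put
+ * routines above and detaching the finished chain for transmission:
+ *
+ *	struct mbchain mb;
+ *	struct mbuf *m;
+ *
+ *	mb_init(&mb);
+ *	mb_put_uint16le(&mb, cmd);
+ *	mb_put_mem(&mb, payload, paylen, MB_MSYSTEM);
+ *	mb_fixhdr(&mb);
+ *	m = mb_detach(&mb);
+ *	mb_done(&mb);
+ */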
+
+int
+mb_put_mbuf(struct mbchain *mbp, struct mbuf *m)
+{
+ mbp->mb_cur->m_next = m;
+ while (m) {
+ mbp->mb_count += m->m_len;
+ if (m->m_next == NULL)
+ break;
+ m = m->m_next;
+ }
+ mbp->mb_mleft = M_TRAILINGSPACE(m);
+ mbp->mb_cur = m;
+ return (0);
+}
+
+/*
+ * copies a uio scatter/gather list to an mbuf chain.
+ */
+int
+mb_put_uio(struct mbchain *mbp, struct uio *uiop, int size)
+{
+ long left;
+ int mtype, error;
+
+ mtype = (uiop->uio_segflg == UIO_SYSSPACE) ? MB_MSYSTEM : MB_MUSER;
+
+ while (size > 0 && uiop->uio_resid) {
+ if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL)
+ return (EFBIG);
+ left = uiop->uio_iov->iov_len;
+ if (left == 0) {
+ uiop->uio_iov++;
+ uiop->uio_iovcnt--;
+ continue;
+ }
+ if (left > size)
+ left = size;
+ error = mb_put_mem(mbp, uiop->uio_iov->iov_base, left, mtype);
+ if (error)
+ return (error);
+ uiop->uio_offset += left;
+ uiop->uio_resid -= left;
+ uiop->uio_iov->iov_base =
+ (char *)uiop->uio_iov->iov_base + left;
+ uiop->uio_iov->iov_len -= left;
+ size -= left;
+ }
+ return (0);
+}
+
+/*
+ * Routines for fetching data from an mbuf chain
+ */
+int
+md_init(struct mdchain *mdp)
+{
+ struct mbuf *m;
+
+ m = m_gethdr(M_WAITOK, MT_DATA);
+ m->m_len = 0;
+ md_initm(mdp, m);
+ return (0);
+}
+
+void
+md_initm(struct mdchain *mdp, struct mbuf *m)
+{
+ bzero(mdp, sizeof(*mdp));
+ mdp->md_top = mdp->md_cur = m;
+ mdp->md_pos = mtod(m, u_char*);
+}
+
+void
+md_done(struct mdchain *mdp)
+{
+ if (mdp->md_top) {
+ m_freem(mdp->md_top);
+ mdp->md_top = NULL;
+ }
+}
+
+/*
+ * Append a separate mbuf chain. It is the caller's responsibility to prevent
+ * multiple calls to fetch/record routines.
+ */
+void
+md_append_record(struct mdchain *mdp, struct mbuf *top)
+{
+ struct mbuf *m;
+
+ if (mdp->md_top == NULL) {
+ md_initm(mdp, top);
+ return;
+ }
+ m = mdp->md_top;
+ while (m->m_nextpkt)
+ m = m->m_nextpkt;
+ m->m_nextpkt = top;
+ top->m_nextpkt = NULL;
+ return;
+}
+
+/*
+ * Advance to the next record, discarding the current one
+ */
+int
+md_next_record(struct mdchain *mdp)
+{
+ struct mbuf *m;
+
+ if (mdp->md_top == NULL)
+ return (ENOENT);
+ m = mdp->md_top->m_nextpkt;
+ md_done(mdp);
+ if (m == NULL)
+ return (ENOENT);
+ md_initm(mdp, m);
+ return (0);
+}
+
+int
+md_get_uint8(struct mdchain *mdp, uint8_t *x)
+{
+ return (md_get_mem(mdp, x, 1, MB_MINLINE));
+}
+
+int
+md_get_uint16(struct mdchain *mdp, uint16_t *x)
+{
+ return (md_get_mem(mdp, (caddr_t)x, 2, MB_MINLINE));
+}
+
+int
+md_get_uint16le(struct mdchain *mdp, uint16_t *x)
+{
+ uint16_t v;
+ int error = md_get_uint16(mdp, &v);
+
+ if (x != NULL)
+ *x = le16toh(v);
+ return (error);
+}
+
+int
+md_get_uint16be(struct mdchain *mdp, uint16_t *x)
+{
+ uint16_t v;
+ int error = md_get_uint16(mdp, &v);
+
+ if (x != NULL)
+ *x = be16toh(v);
+ return (error);
+}
+
+int
+md_get_uint32(struct mdchain *mdp, uint32_t *x)
+{
+ return (md_get_mem(mdp, (caddr_t)x, 4, MB_MINLINE));
+}
+
+int
+md_get_uint32be(struct mdchain *mdp, uint32_t *x)
+{
+ uint32_t v;
+ int error;
+
+ error = md_get_uint32(mdp, &v);
+ if (x != NULL)
+ *x = be32toh(v);
+ return (error);
+}
+
+int
+md_get_uint32le(struct mdchain *mdp, uint32_t *x)
+{
+ uint32_t v;
+ int error;
+
+ error = md_get_uint32(mdp, &v);
+ if (x != NULL)
+ *x = le32toh(v);
+ return (error);
+}
+
+int
+md_get_int64(struct mdchain *mdp, int64_t *x)
+{
+ return (md_get_mem(mdp, (caddr_t)x, 8, MB_MINLINE));
+}
+
+int
+md_get_int64be(struct mdchain *mdp, int64_t *x)
+{
+ int64_t v;
+ int error;
+
+ error = md_get_int64(mdp, &v);
+ if (x != NULL)
+ *x = be64toh(v);
+ return (error);
+}
+
+int
+md_get_int64le(struct mdchain *mdp, int64_t *x)
+{
+ int64_t v;
+ int error;
+
+ error = md_get_int64(mdp, &v);
+ if (x != NULL)
+ *x = le64toh(v);
+ return (error);
+}
+
+int
+md_get_mem(struct mdchain *mdp, caddr_t target, int size, int type)
+{
+ struct mbuf *m = mdp->md_cur;
+ int error;
+ u_int count;
+ u_char *s;
+
+ while (size > 0) {
+ if (m == NULL) {
+ MBERROR("incomplete copy\n");
+ return (EBADRPC);
+ }
+ s = mdp->md_pos;
+ count = mtod(m, u_char*) + m->m_len - s;
+ if (count == 0) {
+ mdp->md_cur = m = m->m_next;
+ if (m)
+ s = mdp->md_pos = mtod(m, caddr_t);
+ continue;
+ }
+ if (count > size)
+ count = size;
+ size -= count;
+ mdp->md_pos += count;
+ if (target == NULL)
+ continue;
+ switch (type) {
+ case MB_MUSER:
+ error = copyout(s, target, count);
+ if (error)
+				return (error);
+ break;
+ case MB_MSYSTEM:
+ bcopy(s, target, count);
+ break;
+ case MB_MINLINE:
+ while (count--)
+ *target++ = *s++;
+ continue;
+ }
+ target += count;
+ }
+ return (0);
+}
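+
+/*
+ * Usage sketch (illustrative only; 'm' is a received chain and 'body' a
+ * hypothetical destination buffer): pulling fixed-size fields and a
+ * variable-length body out of an mbuf chain:
+ *
+ *	struct mdchain md;
+ *	uint16_t cmd;
+ *	uint32_t bodylen;
+ *
+ *	md_initm(&md, m);
+ *	md_get_uint16le(&md, &cmd);
+ *	md_get_uint32le(&md, &bodylen);
+ *	md_get_mem(&md, body, bodylen, MB_MSYSTEM);
+ *	md_done(&md);
+ */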
+
+int
+md_get_mbuf(struct mdchain *mdp, int size, struct mbuf **ret)
+{
+ struct mbuf *m = mdp->md_cur, *rm;
+
+ rm = m_copym(m, mdp->md_pos - mtod(m, u_char*), size, M_WAITOK);
+ md_get_mem(mdp, NULL, size, MB_MZERO);
+ *ret = rm;
+ return (0);
+}
+
+int
+md_get_uio(struct mdchain *mdp, struct uio *uiop, int size)
+{
+ char *uiocp;
+ long left;
+ int mtype, error;
+
+ mtype = (uiop->uio_segflg == UIO_SYSSPACE) ? MB_MSYSTEM : MB_MUSER;
+ while (size > 0 && uiop->uio_resid) {
+ if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL)
+ return (EFBIG);
+ left = uiop->uio_iov->iov_len;
+ if (left == 0) {
+ uiop->uio_iov++;
+ uiop->uio_iovcnt--;
+ continue;
+ }
+ uiocp = uiop->uio_iov->iov_base;
+ if (left > size)
+ left = size;
+ error = md_get_mem(mdp, uiocp, left, mtype);
+ if (error)
+ return (error);
+ uiop->uio_offset += left;
+ uiop->uio_resid -= left;
+ uiop->uio_iov->iov_base =
+ (char *)uiop->uio_iov->iov_base + left;
+ uiop->uio_iov->iov_len -= left;
+ size -= left;
+ }
+ return (0);
+}
diff --git a/sys/kern/subr_module.c b/sys/kern/subr_module.c
new file mode 100644
index 0000000..2485c94
--- /dev/null
+++ b/sys/kern/subr_module.c
@@ -0,0 +1,290 @@
+/*-
+ * Copyright (c) 1998 Michael Smith
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/linker.h>
+
+/*
+ * Preloaded module support
+ */
+
+vm_offset_t preload_addr_relocate = 0;
+caddr_t preload_metadata;
+
+/*
+ * Search for the preloaded module (name)
+ */
+caddr_t
+preload_search_by_name(const char *name)
+{
+ caddr_t curp;
+ uint32_t *hdr;
+ int next;
+
+ if (preload_metadata != NULL) {
+
+ curp = preload_metadata;
+ for (;;) {
+ hdr = (uint32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* Search for a MODINFO_NAME field */
+ if ((hdr[0] == MODINFO_NAME) &&
+ !strcmp(name, curp + sizeof(uint32_t) * 2))
+ return(curp);
+
+ /* skip to next field */
+ next = sizeof(uint32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+ return(NULL);
+}
+
+/*
+ * Search for the first preloaded module of (type)
+ */
+caddr_t
+preload_search_by_type(const char *type)
+{
+ caddr_t curp, lname;
+ uint32_t *hdr;
+ int next;
+
+ if (preload_metadata != NULL) {
+
+ curp = preload_metadata;
+ lname = NULL;
+ for (;;) {
+ hdr = (uint32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* remember the start of each record */
+ if (hdr[0] == MODINFO_NAME)
+ lname = curp;
+
+ /* Search for a MODINFO_TYPE field */
+ if ((hdr[0] == MODINFO_TYPE) &&
+ !strcmp(type, curp + sizeof(uint32_t) * 2))
+ return(lname);
+
+ /* skip to next field */
+ next = sizeof(uint32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+ return(NULL);
+}
+
+/*
+ * Walk through the preloaded module list
+ */
+caddr_t
+preload_search_next_name(caddr_t base)
+{
+ caddr_t curp;
+ uint32_t *hdr;
+ int next;
+
+ if (preload_metadata != NULL) {
+
+ /* Pick up where we left off last time */
+ if (base) {
+ /* skip to next field */
+ curp = base;
+ hdr = (uint32_t *)curp;
+ next = sizeof(uint32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ } else
+ curp = preload_metadata;
+
+ for (;;) {
+ hdr = (uint32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* Found a new record? */
+ if (hdr[0] == MODINFO_NAME)
+				return(curp);
+
+ /* skip to next field */
+ next = sizeof(uint32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+ return(NULL);
+}
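+
+/*
+ * Usage sketch (illustrative only): iterating over every preloaded module
+ * record, starting the walk by passing NULL:
+ *
+ *	caddr_t mod;
+ *
+ *	for (mod = preload_search_next_name(NULL); mod != NULL;
+ *	    mod = preload_search_next_name(mod))
+ *		... inspect the module record at 'mod' ...
+ */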
+
+/*
+ * Given a preloaded module handle (mod), return a pointer
+ * to the data for the attribute (inf).
+ */
+caddr_t
+preload_search_info(caddr_t mod, int inf)
+{
+ caddr_t curp;
+ uint32_t *hdr;
+ uint32_t type = 0;
+ int next;
+
+ curp = mod;
+ for (;;) {
+ hdr = (uint32_t *)curp;
+ /* end of module data? */
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+ /*
+ * We give up once we've looped back to what we were looking at
+ * first - this should normally be a MODINFO_NAME field.
+ */
+ if (type == 0) {
+ type = hdr[0];
+ } else {
+ if (hdr[0] == type)
+ break;
+ }
+
+ /*
+ * Attribute match? Return pointer to data.
+ * Consumer may safely assume that size value precedes
+ * data.
+ */
+ if (hdr[0] == inf)
+ return(curp + (sizeof(uint32_t) * 2));
+
+ /* skip to next field */
+ next = sizeof(uint32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ return(NULL);
+}
+
+/*
+ * Delete a preload record by name.
+ */
+void
+preload_delete_name(const char *name)
+{
+ caddr_t curp;
+ uint32_t *hdr;
+ int next;
+ int clearing;
+
+ if (preload_metadata != NULL) {
+
+ clearing = 0;
+ curp = preload_metadata;
+ for (;;) {
+ hdr = (uint32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* Search for a MODINFO_NAME field */
+ if (hdr[0] == MODINFO_NAME) {
+ if (!strcmp(name, curp + sizeof(uint32_t) * 2))
+ clearing = 1; /* got it, start clearing */
+ else if (clearing)
+					clearing = 0; /* at the next record now; stop clearing */
+ }
+ if (clearing)
+ hdr[0] = MODINFO_EMPTY;
+
+ /* skip to next field */
+ next = sizeof(uint32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+}
+
+void *
+preload_fetch_addr(caddr_t mod)
+{
+ caddr_t *mdp;
+
+ mdp = (caddr_t *)preload_search_info(mod, MODINFO_ADDR);
+ if (mdp == NULL)
+ return (NULL);
+ return (*mdp + preload_addr_relocate);
+}
+
+size_t
+preload_fetch_size(caddr_t mod)
+{
+ size_t *mdp;
+
+ mdp = (size_t *)preload_search_info(mod, MODINFO_SIZE);
+ if (mdp == NULL)
+ return (0);
+ return (*mdp);
+}
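+
+/*
+ * Usage sketch (illustrative only; the type string is just an example):
+ * locate the first preloaded module of a given type and fetch its load
+ * address and size with the helpers above:
+ *
+ *	caddr_t mod;
+ *	void *addr;
+ *	size_t size;
+ *
+ *	mod = preload_search_by_type("elf kernel");
+ *	if (mod != NULL) {
+ *		addr = preload_fetch_addr(mod);
+ *		size = preload_fetch_size(mod);
+ *	}
+ */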
+
+/* Called from locore on i386. Convert physical pointers to kvm. Sigh. */
+void
+preload_bootstrap_relocate(vm_offset_t offset)
+{
+ caddr_t curp;
+ uint32_t *hdr;
+ vm_offset_t *ptr;
+ int next;
+
+ if (preload_metadata != NULL) {
+
+ curp = preload_metadata;
+ for (;;) {
+ hdr = (uint32_t *)curp;
+ if (hdr[0] == 0 && hdr[1] == 0)
+ break;
+
+ /* Deal with the ones that we know we have to fix */
+ switch (hdr[0]) {
+ case MODINFO_ADDR:
+ case MODINFO_METADATA|MODINFOMD_SSYM:
+ case MODINFO_METADATA|MODINFOMD_ESYM:
+ ptr = (vm_offset_t *)(curp + (sizeof(uint32_t) * 2));
+ *ptr += offset;
+ break;
+ }
+ /* The rest is beyond us for now */
+
+ /* skip to next field */
+ next = sizeof(uint32_t) * 2 + hdr[1];
+ next = roundup(next, sizeof(u_long));
+ curp += next;
+ }
+ }
+}
diff --git a/sys/kern/subr_msgbuf.c b/sys/kern/subr_msgbuf.c
new file mode 100644
index 0000000..ecdbe72
--- /dev/null
+++ b/sys/kern/subr_msgbuf.c
@@ -0,0 +1,418 @@
+/*-
+ * Copyright (c) 2003 Ian Dowse. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Generic message buffer support routines.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/mutex.h>
+#include <sys/msgbuf.h>
+#include <sys/sysctl.h>
+
+/*
+ * Maximum number conversion buffer length: uintmax_t in base 2, plus <>
+ * around the priority, and a terminating NUL.
+ */
+#define MAXPRIBUF (sizeof(intmax_t) * NBBY + 3)
+
+/* Read/write sequence numbers are modulo a multiple of the buffer size. */
+#define SEQMOD(size) ((size) * 16)
+
+static u_int msgbuf_cksum(struct msgbuf *mbp);
+
+/*
+ * Timestamps in msgbuf are useful when trying to diagnose when core dumps
+ * or other actions occurred.
+ */
+static int msgbuf_show_timestamp = 0;
+SYSCTL_INT(_kern, OID_AUTO, msgbuf_show_timestamp, CTLFLAG_RW | CTLFLAG_TUN,
+ &msgbuf_show_timestamp, 0, "Show timestamp in msgbuf");
+TUNABLE_INT("kern.msgbuf_show_timestamp", &msgbuf_show_timestamp);
+
+/*
+ * Initialize a message buffer of the specified size at the specified
+ * location. This also zeros the buffer area.
+ */
+void
+msgbuf_init(struct msgbuf *mbp, void *ptr, int size)
+{
+
+ mbp->msg_ptr = ptr;
+ mbp->msg_size = size;
+ mbp->msg_seqmod = SEQMOD(size);
+ msgbuf_clear(mbp);
+ mbp->msg_magic = MSG_MAGIC;
+ mbp->msg_lastpri = -1;
+ mbp->msg_flags = 0;
+ bzero(&mbp->msg_lock, sizeof(mbp->msg_lock));
+ mtx_init(&mbp->msg_lock, "msgbuf", NULL, MTX_SPIN);
+}
+
+/*
+ * Reinitialize a message buffer, retaining its previous contents if
+ * the size and checksum are correct. If the old contents cannot be
+ * recovered, the message buffer is cleared.
+ */
+void
+msgbuf_reinit(struct msgbuf *mbp, void *ptr, int size)
+{
+ u_int cksum;
+
+ if (mbp->msg_magic != MSG_MAGIC || mbp->msg_size != size) {
+ msgbuf_init(mbp, ptr, size);
+ return;
+ }
+ mbp->msg_seqmod = SEQMOD(size);
+ mbp->msg_wseq = MSGBUF_SEQNORM(mbp, mbp->msg_wseq);
+ mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq);
+ mbp->msg_ptr = ptr;
+ cksum = msgbuf_cksum(mbp);
+ if (cksum != mbp->msg_cksum) {
+ if (bootverbose) {
+ printf("msgbuf cksum mismatch (read %x, calc %x)\n",
+ mbp->msg_cksum, cksum);
+ printf("Old msgbuf not recovered\n");
+ }
+ msgbuf_clear(mbp);
+ }
+
+ mbp->msg_lastpri = -1;
+ /* Assume that the old message buffer didn't end in a newline. */
+ mbp->msg_flags |= MSGBUF_NEEDNL;
+ bzero(&mbp->msg_lock, sizeof(mbp->msg_lock));
+ mtx_init(&mbp->msg_lock, "msgbuf", NULL, MTX_SPIN);
+}
+
+/*
+ * Clear the message buffer.
+ */
+void
+msgbuf_clear(struct msgbuf *mbp)
+{
+
+ bzero(mbp->msg_ptr, mbp->msg_size);
+ mbp->msg_wseq = 0;
+ mbp->msg_rseq = 0;
+ mbp->msg_cksum = 0;
+}
+
+/*
+ * Get a count of the number of unread characters in the message buffer.
+ */
+int
+msgbuf_getcount(struct msgbuf *mbp)
+{
+ u_int len;
+
+ len = MSGBUF_SEQSUB(mbp, mbp->msg_wseq, mbp->msg_rseq);
+ if (len > mbp->msg_size)
+ len = mbp->msg_size;
+ return (len);
+}
+
+/*
+ * Add a character into the message buffer, and update the checksum and
+ * sequence number.
+ *
+ * The caller should hold the message buffer spinlock.
+ */
+
+static void
+msgbuf_do_addchar(struct msgbuf * const mbp, u_int * const seq, const int c)
+{
+ u_int pos;
+
+ /* Make sure we properly wrap the sequence number. */
+ pos = MSGBUF_SEQ_TO_POS(mbp, *seq);
+ mbp->msg_cksum += (u_int)(u_char)c -
+ (u_int)(u_char)mbp->msg_ptr[pos];
+ mbp->msg_ptr[pos] = c;
+ *seq = MSGBUF_SEQNORM(mbp, *seq + 1);
+}
+
+/*
+ * Append a character to a message buffer.
+ */
+void
+msgbuf_addchar(struct msgbuf *mbp, int c)
+{
+ mtx_lock_spin(&mbp->msg_lock);
+
+ msgbuf_do_addchar(mbp, &mbp->msg_wseq, c);
+
+ mtx_unlock_spin(&mbp->msg_lock);
+}
+
+/*
+ * Append a NUL-terminated string with a priority to a message buffer.
+ * Filter carriage returns if the caller requests it.
+ *
+ * XXX The carriage return filtering behavior is present in the
+ * msglogchar() API; however, testing has shown that we don't seem to send
+ * carriage returns down this path. So do we still need it?
+ */
+void
+msgbuf_addstr(struct msgbuf *mbp, int pri, char *str, int filter_cr)
+{
+ u_int seq;
+ size_t len, prefix_len;
+ char prefix[MAXPRIBUF];
+ char buf[32];
+	int i, j, needtime;
+
+ len = strlen(str);
+ prefix_len = 0;
+
+ /* If we have a zero-length string, no need to do anything. */
+ if (len == 0)
+ return;
+
+ mtx_lock_spin(&mbp->msg_lock);
+
+ /*
+ * If this is true, we may need to insert a new priority sequence,
+ * so prepare the prefix.
+ */
+ if (pri != -1)
+ prefix_len = sprintf(prefix, "<%d>", pri);
+
+ /*
+ * Starting write sequence number.
+ */
+ seq = mbp->msg_wseq;
+
+ /*
+ * Whenever there is a change in priority, we have to insert a
+ * newline, and a priority prefix if the priority is not -1. Here
+ * we detect whether there was a priority change, and whether we
+ * did not end with a newline. If that is the case, we need to
+ * insert a newline before this string.
+ */
+ if (mbp->msg_lastpri != pri && (mbp->msg_flags & MSGBUF_NEEDNL) != 0) {
+
+ msgbuf_do_addchar(mbp, &seq, '\n');
+ mbp->msg_flags &= ~MSGBUF_NEEDNL;
+ }
+
+ needtime = 1;
+ for (i = 0; i < len; i++) {
+ /*
+ * If we just had a newline, and the priority is not -1
+ * (and therefore prefix_len != 0), then we need a priority
+ * prefix for this line.
+ */
+ if ((mbp->msg_flags & MSGBUF_NEEDNL) == 0 && prefix_len != 0) {
+			for (j = 0; j < prefix_len; j++)
+ msgbuf_do_addchar(mbp, &seq, prefix[j]);
+ }
+
+ if (msgbuf_show_timestamp && needtime == 1 &&
+ (mbp->msg_flags & MSGBUF_NEEDNL) == 0) {
+
+ snprintf(buf, sizeof(buf), "[%jd] ",
+ (intmax_t)time_uptime);
+ for (j = 0; buf[j] != '\0'; j++)
+ msgbuf_do_addchar(mbp, &seq, buf[j]);
+ needtime = 0;
+ }
+
+ /*
+ * Don't copy carriage returns if the caller requested
+ * filtering.
+ *
+ * XXX This matches the behavior of msglogchar(), but is it
+ * necessary? Testing has shown that we don't seem to get
+ * carriage returns here.
+ */
+ if ((filter_cr != 0) && (str[i] == '\r'))
+ continue;
+
+ /*
+ * Clear this flag if we see a newline. This affects whether
+ * we need to insert a new prefix or insert a newline later.
+ */
+ if (str[i] == '\n')
+ mbp->msg_flags &= ~MSGBUF_NEEDNL;
+ else
+ mbp->msg_flags |= MSGBUF_NEEDNL;
+
+ msgbuf_do_addchar(mbp, &seq, str[i]);
+ }
+ /*
+ * Update the write sequence number for the actual number of
+ * characters we put in the message buffer. (Depends on whether
+ * carriage returns are filtered.)
+ */
+ mbp->msg_wseq = seq;
+
+ /*
+ * Set the last priority.
+ */
+ mbp->msg_lastpri = pri;
+
+ mtx_unlock_spin(&mbp->msg_lock);
+
+}
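+
+/*
+ * Informal example: with a freshly cleared buffer, msgbuf_addstr(mbp, 6,
+ * "hello\n", 0) stores "<6>hello\n"; when kern.msgbuf_show_timestamp is
+ * set, an "[uptime] " stamp follows the priority prefix.
+ */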
+
+/*
+ * Read and mark as read a character from a message buffer.
+ * Returns the character, or -1 if no characters are available.
+ */
+int
+msgbuf_getchar(struct msgbuf *mbp)
+{
+ u_int len, wseq;
+ int c;
+
+ mtx_lock_spin(&mbp->msg_lock);
+
+ wseq = mbp->msg_wseq;
+ len = MSGBUF_SEQSUB(mbp, wseq, mbp->msg_rseq);
+ if (len == 0) {
+ mtx_unlock_spin(&mbp->msg_lock);
+ return (-1);
+ }
+ if (len > mbp->msg_size)
+ mbp->msg_rseq = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size);
+ c = (u_char)mbp->msg_ptr[MSGBUF_SEQ_TO_POS(mbp, mbp->msg_rseq)];
+ mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq + 1);
+
+ mtx_unlock_spin(&mbp->msg_lock);
+
+ return (c);
+}
+
+/*
+ * Read and mark as read a number of characters from a message buffer.
+ * Returns the number of characters that were placed in `buf'.
+ */
+int
+msgbuf_getbytes(struct msgbuf *mbp, char *buf, int buflen)
+{
+ u_int len, pos, wseq;
+
+ mtx_lock_spin(&mbp->msg_lock);
+
+ wseq = mbp->msg_wseq;
+ len = MSGBUF_SEQSUB(mbp, wseq, mbp->msg_rseq);
+ if (len == 0) {
+ mtx_unlock_spin(&mbp->msg_lock);
+ return (0);
+ }
+ if (len > mbp->msg_size) {
+ mbp->msg_rseq = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size);
+ len = mbp->msg_size;
+ }
+ pos = MSGBUF_SEQ_TO_POS(mbp, mbp->msg_rseq);
+ len = min(len, mbp->msg_size - pos);
+ len = min(len, (u_int)buflen);
+
+ bcopy(&mbp->msg_ptr[pos], buf, len);
+ mbp->msg_rseq = MSGBUF_SEQNORM(mbp, mbp->msg_rseq + len);
+
+ mtx_unlock_spin(&mbp->msg_lock);
+
+ return (len);
+}
+
+/*
+ * Peek at the full contents of a message buffer without marking any
+ * data as read. `seqp' should point to an unsigned integer that
+ * msgbuf_peekbytes() can use to retain state between calls so that
+ * the whole message buffer can be read in multiple short reads.
+ * To initialise this variable to the start of the message buffer,
+ * call msgbuf_peekbytes() with a NULL `buf' parameter.
+ *
+ * Returns the number of characters that were placed in `buf'.
+ */
+int
+msgbuf_peekbytes(struct msgbuf *mbp, char *buf, int buflen, u_int *seqp)
+{
+ u_int len, pos, wseq;
+
+ mtx_lock_spin(&mbp->msg_lock);
+
+ if (buf == NULL) {
+ /* Just initialise *seqp. */
+ *seqp = MSGBUF_SEQNORM(mbp, mbp->msg_wseq - mbp->msg_size);
+ mtx_unlock_spin(&mbp->msg_lock);
+ return (0);
+ }
+
+ wseq = mbp->msg_wseq;
+ len = MSGBUF_SEQSUB(mbp, wseq, *seqp);
+ if (len == 0) {
+ mtx_unlock_spin(&mbp->msg_lock);
+ return (0);
+ }
+ if (len > mbp->msg_size) {
+ *seqp = MSGBUF_SEQNORM(mbp, wseq - mbp->msg_size);
+ len = mbp->msg_size;
+ }
+ pos = MSGBUF_SEQ_TO_POS(mbp, *seqp);
+ len = min(len, mbp->msg_size - pos);
+ len = min(len, (u_int)buflen);
+	bcopy(&mbp->msg_ptr[pos], buf, len);
+ *seqp = MSGBUF_SEQNORM(mbp, *seqp + len);
+
+ mtx_unlock_spin(&mbp->msg_lock);
+
+ return (len);
+}
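+
+/*
+ * Usage sketch (illustrative only; 'mbp' is some struct msgbuf pointer):
+ * draining the whole buffer without consuming it, using a small
+ * caller-supplied chunk buffer:
+ *
+ *	u_int seq;
+ *	char chunk[128];
+ *	int n;
+ *
+ *	(void)msgbuf_peekbytes(mbp, NULL, 0, &seq);
+ *	while ((n = msgbuf_peekbytes(mbp, chunk, sizeof(chunk), &seq)) > 0)
+ *		... copy out 'n' bytes from 'chunk' ...
+ */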
+
+/*
+ * Compute the checksum for the complete message buffer contents.
+ */
+static u_int
+msgbuf_cksum(struct msgbuf *mbp)
+{
+ u_int i, sum;
+
+ sum = 0;
+ for (i = 0; i < mbp->msg_size; i++)
+ sum += (u_char)mbp->msg_ptr[i];
+ return (sum);
+}
+
+/*
+ * Copy from one message buffer to another.
+ */
+void
+msgbuf_copy(struct msgbuf *src, struct msgbuf *dst)
+{
+ int c;
+
+ while ((c = msgbuf_getchar(src)) >= 0)
+ msgbuf_addchar(dst, c);
+}
diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c
new file mode 100644
index 0000000..a2e822c
--- /dev/null
+++ b/sys/kern/subr_param.c
@@ -0,0 +1,354 @@
+/*-
+ * Copyright (c) 1980, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)param.c 8.3 (Berkeley) 8/20/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_param.h"
+#include "opt_msgbuf.h"
+#include "opt_maxusers.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/msgbuf.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+
+/*
+ * System parameter formulae.
+ */
+
+#ifndef HZ
+# if defined(__mips__) || defined(__arm__)
+# define HZ 100
+# else
+# define HZ 1000
+# endif
+# ifndef HZ_VM
+# define HZ_VM 100
+# endif
+#else
+# ifndef HZ_VM
+# define HZ_VM HZ
+# endif
+#endif
+#define NPROC (20 + 16 * maxusers)
+#ifndef NBUF
+#define NBUF 0
+#endif
+#ifndef MAXFILES
+#define MAXFILES (maxproc * 2)
+#endif
+
+static int sysctl_kern_vm_guest(SYSCTL_HANDLER_ARGS);
+
+int hz; /* system clock's frequency */
+int tick; /* usec per tick (1000000 / hz) */
+struct bintime tick_bt; /* bintime per tick (1s / hz) */
+sbintime_t tick_sbt;
+int maxusers; /* base tunable */
+int maxproc; /* maximum # of processes */
+int maxprocperuid; /* max # of procs per user */
+int maxfiles; /* sys. wide open files limit */
+int maxfilesperproc; /* per-proc open files limit */
+int msgbufsize; /* size of kernel message buffer */
+int nbuf;
+int bio_transient_maxcnt;
+int ngroups_max; /* max # groups per process */
+int nswbuf;
+pid_t pid_max = PID_MAX;
+long maxswzone; /* max swmeta KVA storage */
+long maxbcache; /* max buffer cache KVA storage */
+long maxpipekva; /* Limit on pipe KVA */
+int vm_guest; /* Running as virtual machine guest? */
+u_long maxtsiz; /* max text size */
+u_long dfldsiz; /* initial data size limit */
+u_long maxdsiz; /* max data size */
+u_long dflssiz; /* initial stack size limit */
+u_long maxssiz; /* max stack size */
+u_long sgrowsiz; /* amount to grow stack */
+
+SYSCTL_INT(_kern, OID_AUTO, hz, CTLFLAG_RDTUN, &hz, 0,
+ "Number of clock ticks per second");
+SYSCTL_INT(_kern, OID_AUTO, nbuf, CTLFLAG_RDTUN, &nbuf, 0,
+ "Number of buffers in the buffer cache");
+SYSCTL_INT(_kern, OID_AUTO, nswbuf, CTLFLAG_RDTUN, &nswbuf, 0,
+ "Number of swap buffers");
+SYSCTL_INT(_kern, OID_AUTO, msgbufsize, CTLFLAG_RDTUN, &msgbufsize, 0,
+ "Size of the kernel message buffer");
+SYSCTL_LONG(_kern, OID_AUTO, maxswzone, CTLFLAG_RDTUN, &maxswzone, 0,
+ "Maximum memory for swap metadata");
+SYSCTL_LONG(_kern, OID_AUTO, maxbcache, CTLFLAG_RDTUN, &maxbcache, 0,
+ "Maximum value of vfs.maxbufspace");
+SYSCTL_INT(_kern, OID_AUTO, bio_transient_maxcnt, CTLFLAG_RDTUN,
+ &bio_transient_maxcnt, 0,
+    "Maximum number of transient BIO mappings");
+SYSCTL_ULONG(_kern, OID_AUTO, maxtsiz, CTLFLAG_RW | CTLFLAG_TUN, &maxtsiz, 0,
+ "Maximum text size");
+SYSCTL_ULONG(_kern, OID_AUTO, dfldsiz, CTLFLAG_RW | CTLFLAG_TUN, &dfldsiz, 0,
+ "Initial data size limit");
+SYSCTL_ULONG(_kern, OID_AUTO, maxdsiz, CTLFLAG_RW | CTLFLAG_TUN, &maxdsiz, 0,
+ "Maximum data size");
+SYSCTL_ULONG(_kern, OID_AUTO, dflssiz, CTLFLAG_RW | CTLFLAG_TUN, &dflssiz, 0,
+ "Initial stack size limit");
+SYSCTL_ULONG(_kern, OID_AUTO, maxssiz, CTLFLAG_RW | CTLFLAG_TUN, &maxssiz, 0,
+ "Maximum stack size");
+SYSCTL_ULONG(_kern, OID_AUTO, sgrowsiz, CTLFLAG_RW | CTLFLAG_TUN, &sgrowsiz, 0,
+ "Amount to grow stack on a stack fault");
+SYSCTL_PROC(_kern, OID_AUTO, vm_guest, CTLFLAG_RD | CTLTYPE_STRING,
+ NULL, 0, sysctl_kern_vm_guest, "A",
+ "Virtual machine guest detected? (none|generic|xen)");
+
+/*
+ * These have to be allocated somewhere; allocating
+ * them here forces loader errors if this file is omitted
+ * (if they've been externed everywhere else; hah!).
+ */
+struct buf *swbuf;
+
+/*
+ * The elements of this array are ordered based upon the values of the
+ * corresponding enum VM_GUEST members.
+ */
+static const char *const vm_guest_sysctl_names[] = {
+ "none",
+ "generic",
+ "xen",
+ NULL
+};
+
+#ifndef XEN
+static const char *const vm_bnames[] = {
+ "QEMU", /* QEMU */
+ "Plex86", /* Plex86 */
+ "Bochs", /* Bochs */
+ "Xen", /* Xen */
+ "BHYVE", /* bhyve */
+ "Seabios", /* KVM */
+ NULL
+};
+
+static const char *const vm_pnames[] = {
+ "VMware Virtual Platform", /* VMWare VM */
+ "Virtual Machine", /* Microsoft VirtualPC */
+ "VirtualBox", /* Sun xVM VirtualBox */
+ "Parallels Virtual Platform", /* Parallels VM */
+ "KVM", /* KVM */
+ NULL
+};
+
+
+/*
+ * Detect known Virtual Machine hosts by inspecting the emulated BIOS.
+ */
+static enum VM_GUEST
+detect_virtual(void)
+{
+ char *sysenv;
+ int i;
+
+ sysenv = getenv("smbios.bios.vendor");
+ if (sysenv != NULL) {
+ for (i = 0; vm_bnames[i] != NULL; i++)
+ if (strcmp(sysenv, vm_bnames[i]) == 0) {
+ freeenv(sysenv);
+ return (VM_GUEST_VM);
+ }
+ freeenv(sysenv);
+ }
+ sysenv = getenv("smbios.system.product");
+ if (sysenv != NULL) {
+ for (i = 0; vm_pnames[i] != NULL; i++)
+ if (strcmp(sysenv, vm_pnames[i]) == 0) {
+ freeenv(sysenv);
+ return (VM_GUEST_VM);
+ }
+ freeenv(sysenv);
+ }
+ return (VM_GUEST_NO);
+}
+#endif
+
+/*
+ * Boot time overrides that are not scaled against main memory
+ */
+void
+init_param1(void)
+{
+#ifndef XEN
+ vm_guest = detect_virtual();
+#else
+ vm_guest = VM_GUEST_XEN;
+#endif
+ hz = -1;
+ TUNABLE_INT_FETCH("kern.hz", &hz);
+ if (hz == -1)
+ hz = vm_guest > VM_GUEST_NO ? HZ_VM : HZ;
+ tick = 1000000 / hz;
+ tick_sbt = SBT_1S / hz;
+ tick_bt = sbttobt(tick_sbt);
+
+#ifdef VM_SWZONE_SIZE_MAX
+ maxswzone = VM_SWZONE_SIZE_MAX;
+#endif
+ TUNABLE_LONG_FETCH("kern.maxswzone", &maxswzone);
+#ifdef VM_BCACHE_SIZE_MAX
+ maxbcache = VM_BCACHE_SIZE_MAX;
+#endif
+ TUNABLE_LONG_FETCH("kern.maxbcache", &maxbcache);
+ msgbufsize = MSGBUF_SIZE;
+ TUNABLE_INT_FETCH("kern.msgbufsize", &msgbufsize);
+
+ maxtsiz = MAXTSIZ;
+ TUNABLE_ULONG_FETCH("kern.maxtsiz", &maxtsiz);
+ dfldsiz = DFLDSIZ;
+ TUNABLE_ULONG_FETCH("kern.dfldsiz", &dfldsiz);
+ maxdsiz = MAXDSIZ;
+ TUNABLE_ULONG_FETCH("kern.maxdsiz", &maxdsiz);
+ dflssiz = DFLSSIZ;
+ TUNABLE_ULONG_FETCH("kern.dflssiz", &dflssiz);
+ maxssiz = MAXSSIZ;
+ TUNABLE_ULONG_FETCH("kern.maxssiz", &maxssiz);
+ sgrowsiz = SGROWSIZ;
+ TUNABLE_ULONG_FETCH("kern.sgrowsiz", &sgrowsiz);
+
+ /*
+ * Let the administrator set {NGROUPS_MAX}, but disallow values
+ * less than NGROUPS_MAX which would violate POSIX.1-2008 or
+ * greater than INT_MAX-1 which would result in overflow.
+ */
+ ngroups_max = NGROUPS_MAX;
+ TUNABLE_INT_FETCH("kern.ngroups", &ngroups_max);
+ if (ngroups_max < NGROUPS_MAX)
+ ngroups_max = NGROUPS_MAX;
+
+ /*
+	 * Only allow the maximum pid to be lowered.
+ * Prevent setting up a non-bootable system if pid_max is too low.
+ */
+ TUNABLE_INT_FETCH("kern.pid_max", &pid_max);
+ if (pid_max > PID_MAX)
+ pid_max = PID_MAX;
+ else if (pid_max < 300)
+ pid_max = 300;
+
+ TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed", &unmapped_buf_allowed);
+}
+
+/*
+ * Boot time overrides that are scaled against main memory
+ */
+void
+init_param2(long physpages)
+{
+
+ /* Base parameters */
+ maxusers = MAXUSERS;
+ TUNABLE_INT_FETCH("kern.maxusers", &maxusers);
+ if (maxusers == 0) {
+ maxusers = physpages / (2 * 1024 * 1024 / PAGE_SIZE);
+ if (maxusers < 32)
+ maxusers = 32;
+#ifdef VM_MAX_AUTOTUNE_MAXUSERS
+ if (maxusers > VM_MAX_AUTOTUNE_MAXUSERS)
+ maxusers = VM_MAX_AUTOTUNE_MAXUSERS;
+#endif
+ /*
+		 * Scale down the rate at which maxusers grows once it
+		 * exceeds 384 (see the worked example below).
+ */
+ if (maxusers > 384)
+ maxusers = 384 + ((maxusers - 384) / 8);
+ }
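+
+	/*
+	 * Worked example (assuming 4 KB pages and no
+	 * VM_MAX_AUTOTUNE_MAXUSERS cap): 8 GB of RAM is roughly 2M pages,
+	 * so the initial estimate is 2M / 512 = 4096, which the rule above
+	 * scales down to 384 + (4096 - 384) / 8 = 848.
+	 */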
+
+ /*
+ * The following can be overridden after boot via sysctl. Note:
+	 * unless overridden, these macros are ultimately based on maxusers.
+ * Limit maxproc so that kmap entries cannot be exhausted by
+ * processes.
+ */
+ maxproc = NPROC;
+ TUNABLE_INT_FETCH("kern.maxproc", &maxproc);
+ if (maxproc > (physpages / 12))
+ maxproc = physpages / 12;
+ maxprocperuid = (maxproc * 9) / 10;
+
+ /*
+	 * The default limit for maxfiles is 1/8 of the number of
+	 * physical pages, but never less than MAXFILES.
+	 * At most it can be 1/4 of the number of physical pages.
+ */
+ maxfiles = imax(MAXFILES, physpages / 8);
+ TUNABLE_INT_FETCH("kern.maxfiles", &maxfiles);
+ if (maxfiles > (physpages / 4))
+ maxfiles = physpages / 4;
+ maxfilesperproc = (maxfiles / 10) * 9;
+
+ /*
+ * Cannot be changed after boot.
+ */
+ nbuf = NBUF;
+ TUNABLE_INT_FETCH("kern.nbuf", &nbuf);
+ TUNABLE_INT_FETCH("kern.bio_transient_maxcnt", &bio_transient_maxcnt);
+
+ /*
+ * The default for maxpipekva is min(1/64 of the kernel address space,
+ * max(1/64 of main memory, 512KB)). See sys_pipe.c for more details.
+ */
+ maxpipekva = (physpages / 64) * PAGE_SIZE;
+ TUNABLE_LONG_FETCH("kern.ipc.maxpipekva", &maxpipekva);
+ if (maxpipekva < 512 * 1024)
+ maxpipekva = 512 * 1024;
+ if (maxpipekva > (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 64)
+ maxpipekva = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
+ 64;
+}
+
+/*
+ * Sysctl stringifying handler for kern.vm_guest.
+ */
+static int
+sysctl_kern_vm_guest(SYSCTL_HANDLER_ARGS)
+{
+ return (SYSCTL_OUT(req, vm_guest_sysctl_names[vm_guest],
+ strlen(vm_guest_sysctl_names[vm_guest])));
+}
diff --git a/sys/kern/subr_pcpu.c b/sys/kern/subr_pcpu.c
new file mode 100644
index 0000000..505a4df
--- /dev/null
+++ b/sys/kern/subr_pcpu.c
@@ -0,0 +1,394 @@
+/*-
+ * Copyright (c) 2001 Wind River Systems, Inc.
+ * All rights reserved.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ *
+ * Copyright (c) 2009 Jeffrey Roberson <jeff@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This module provides MI support for per-cpu data.
+ *
+ * Each architecture determines the mapping of logical CPU IDs to physical
+ * CPUs. The requirements of this mapping are as follows:
+ * - Logical CPU IDs must reside in the range 0 ... MAXCPU - 1.
+ * - The mapping is not required to be dense. That is, there may be
+ * gaps in the mappings.
+ * - The platform sets the value of MAXCPU in <machine/param.h>.
+ * - It is suggested, but not required, that in the non-SMP case, the
+ * platform define MAXCPU to be 1 and define the logical ID of the
+ * sole CPU as 0.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/smp.h>
+#include <sys/sx.h>
+#include <ddb/ddb.h>
+
+static MALLOC_DEFINE(M_PCPU, "Per-cpu", "Per-cpu resource accounting.");
+
+struct dpcpu_free {
+ uintptr_t df_start;
+ int df_len;
+ TAILQ_ENTRY(dpcpu_free) df_link;
+};
+
+static DPCPU_DEFINE(char, modspace[DPCPU_MODMIN]);
+static TAILQ_HEAD(, dpcpu_free) dpcpu_head = TAILQ_HEAD_INITIALIZER(dpcpu_head);
+static struct sx dpcpu_lock;
+uintptr_t dpcpu_off[MAXCPU];
+struct pcpu *cpuid_to_pcpu[MAXCPU];
+struct cpuhead cpuhead = STAILQ_HEAD_INITIALIZER(cpuhead);
+
+/*
+ * Initialize the MI portions of a struct pcpu.
+ */
+void
+pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
+{
+
+ bzero(pcpu, size);
+ KASSERT(cpuid >= 0 && cpuid < MAXCPU,
+ ("pcpu_init: invalid cpuid %d", cpuid));
+ pcpu->pc_cpuid = cpuid;
+ cpuid_to_pcpu[cpuid] = pcpu;
+ STAILQ_INSERT_TAIL(&cpuhead, pcpu, pc_allcpu);
+ cpu_pcpu_init(pcpu, cpuid, size);
+ pcpu->pc_rm_queue.rmq_next = &pcpu->pc_rm_queue;
+ pcpu->pc_rm_queue.rmq_prev = &pcpu->pc_rm_queue;
+}
+
+void
+dpcpu_init(void *dpcpu, int cpuid)
+{
+ struct pcpu *pcpu;
+
+ pcpu = pcpu_find(cpuid);
+ pcpu->pc_dynamic = (uintptr_t)dpcpu - DPCPU_START;
+
+ /*
+ * Initialize defaults from our linker section.
+ */
+ memcpy(dpcpu, (void *)DPCPU_START, DPCPU_BYTES);
+
+ /*
+ * Place it in the global pcpu offset array.
+ */
+ dpcpu_off[cpuid] = pcpu->pc_dynamic;
+}
+
+static void
+dpcpu_startup(void *dummy __unused)
+{
+ struct dpcpu_free *df;
+
+ df = malloc(sizeof(*df), M_PCPU, M_WAITOK | M_ZERO);
+ df->df_start = (uintptr_t)&DPCPU_NAME(modspace);
+ df->df_len = DPCPU_MODMIN;
+ TAILQ_INSERT_HEAD(&dpcpu_head, df, df_link);
+ sx_init(&dpcpu_lock, "dpcpu alloc lock");
+}
+SYSINIT(dpcpu, SI_SUB_KLD, SI_ORDER_FIRST, dpcpu_startup, 0);
+
+/*
+ * First-fit extent based allocator for allocating space in the per-cpu
+ * region reserved for modules. This is only intended for use by the
+ * kernel linkers to place module linker sets.
+ */
+void *
+dpcpu_alloc(int size)
+{
+ struct dpcpu_free *df;
+ void *s;
+
+ s = NULL;
+ size = roundup2(size, sizeof(void *));
+ sx_xlock(&dpcpu_lock);
+ TAILQ_FOREACH(df, &dpcpu_head, df_link) {
+ if (df->df_len < size)
+ continue;
+ if (df->df_len == size) {
+ s = (void *)df->df_start;
+ TAILQ_REMOVE(&dpcpu_head, df, df_link);
+ free(df, M_PCPU);
+ break;
+ }
+ s = (void *)df->df_start;
+ df->df_len -= size;
+ df->df_start = df->df_start + size;
+ break;
+ }
+ sx_xunlock(&dpcpu_lock);
+
+ return (s);
+}
+
+/*
+ * Free dynamic per-cpu space at module unload time.
+ */
+void
+dpcpu_free(void *s, int size)
+{
+ struct dpcpu_free *df;
+ struct dpcpu_free *dn;
+ uintptr_t start;
+ uintptr_t end;
+
+ size = roundup2(size, sizeof(void *));
+ start = (uintptr_t)s;
+ end = start + size;
+ /*
+ * Free a region of space and merge it with as many neighbors as
+ * possible. Keeping the list sorted simplifies this operation.
+ */
+ sx_xlock(&dpcpu_lock);
+ TAILQ_FOREACH(df, &dpcpu_head, df_link) {
+ if (df->df_start > end)
+ break;
+ /*
+ * If we expand at the end of an entry we may have to
+ * merge it with the one following it as well.
+ */
+ if (df->df_start + df->df_len == start) {
+ df->df_len += size;
+ dn = TAILQ_NEXT(df, df_link);
+			if (dn != NULL &&
+			    df->df_start + df->df_len == dn->df_start) {
+ df->df_len += dn->df_len;
+ TAILQ_REMOVE(&dpcpu_head, dn, df_link);
+ free(dn, M_PCPU);
+ }
+ sx_xunlock(&dpcpu_lock);
+ return;
+ }
+ if (df->df_start == end) {
+ df->df_start = start;
+ df->df_len += size;
+ sx_xunlock(&dpcpu_lock);
+ return;
+ }
+ }
+ dn = malloc(sizeof(*df), M_PCPU, M_WAITOK | M_ZERO);
+ dn->df_start = start;
+ dn->df_len = size;
+ if (df)
+ TAILQ_INSERT_BEFORE(df, dn, df_link);
+ else
+ TAILQ_INSERT_TAIL(&dpcpu_head, dn, df_link);
+ sx_xunlock(&dpcpu_lock);
+}
+
+/*
+ * Initialize the per-cpu storage from an updated linker-set region.
+ */
+void
+dpcpu_copy(void *s, int size)
+{
+#ifdef SMP
+ uintptr_t dpcpu;
+ int i;
+
+ for (i = 0; i < mp_ncpus; ++i) {
+ dpcpu = dpcpu_off[i];
+ if (dpcpu == 0)
+ continue;
+ memcpy((void *)(dpcpu + (uintptr_t)s), s, size);
+ }
+#else
+ memcpy((void *)(dpcpu_off[0] + (uintptr_t)s), s, size);
+#endif
+}
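+
+/*
+ * Usage sketch (illustrative only; 'image' and 'size' stand for a module's
+ * DPCPU linker-set data): a kernel linker back-end reserves modspace with
+ * dpcpu_alloc(), fills the master copy, and then replicates it to every
+ * CPU with dpcpu_copy():
+ *
+ *	void *base;
+ *
+ *	base = dpcpu_alloc(size);
+ *	if (base != NULL) {
+ *		memcpy(base, image, size);
+ *		dpcpu_copy(base, size);
+ *	}
+ */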
+
+/*
+ * Destroy a struct pcpu.
+ */
+void
+pcpu_destroy(struct pcpu *pcpu)
+{
+
+ STAILQ_REMOVE(&cpuhead, pcpu, pcpu, pc_allcpu);
+ cpuid_to_pcpu[pcpu->pc_cpuid] = NULL;
+ dpcpu_off[pcpu->pc_cpuid] = 0;
+}
+
+/*
+ * Locate a struct pcpu by cpu id.
+ */
+struct pcpu *
+pcpu_find(u_int cpuid)
+{
+
+ return (cpuid_to_pcpu[cpuid]);
+}
+
+int
+sysctl_dpcpu_quad(SYSCTL_HANDLER_ARGS)
+{
+ uintptr_t dpcpu;
+ int64_t count;
+ int i;
+
+ count = 0;
+ for (i = 0; i < mp_ncpus; ++i) {
+ dpcpu = dpcpu_off[i];
+ if (dpcpu == 0)
+ continue;
+ count += *(int64_t *)(dpcpu + (uintptr_t)arg1);
+ }
+ return (SYSCTL_OUT(req, &count, sizeof(count)));
+}
+
+int
+sysctl_dpcpu_long(SYSCTL_HANDLER_ARGS)
+{
+ uintptr_t dpcpu;
+ long count;
+ int i;
+
+ count = 0;
+ for (i = 0; i < mp_ncpus; ++i) {
+ dpcpu = dpcpu_off[i];
+ if (dpcpu == 0)
+ continue;
+ count += *(long *)(dpcpu + (uintptr_t)arg1);
+ }
+ return (SYSCTL_OUT(req, &count, sizeof(count)));
+}
+
+int
+sysctl_dpcpu_int(SYSCTL_HANDLER_ARGS)
+{
+ uintptr_t dpcpu;
+ int count;
+ int i;
+
+ count = 0;
+ for (i = 0; i < mp_ncpus; ++i) {
+ dpcpu = dpcpu_off[i];
+ if (dpcpu == 0)
+ continue;
+ count += *(int *)(dpcpu + (uintptr_t)arg1);
+ }
+ return (SYSCTL_OUT(req, &count, sizeof(count)));
+}
+
+#ifdef DDB
+DB_SHOW_COMMAND(dpcpu_off, db_show_dpcpu_off)
+{
+ int id;
+
+ CPU_FOREACH(id) {
+ db_printf("dpcpu_off[%2d] = 0x%jx (+ DPCPU_START = %p)\n",
+ id, (uintmax_t)dpcpu_off[id],
+ (void *)(uintptr_t)(dpcpu_off[id] + DPCPU_START));
+ }
+}
+
+static void
+show_pcpu(struct pcpu *pc)
+{
+ struct thread *td;
+
+ db_printf("cpuid = %d\n", pc->pc_cpuid);
+ db_printf("dynamic pcpu = %p\n", (void *)pc->pc_dynamic);
+ db_printf("curthread = ");
+ td = pc->pc_curthread;
+ if (td != NULL)
+ db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid,
+ td->td_name);
+ else
+ db_printf("none\n");
+ db_printf("curpcb = %p\n", pc->pc_curpcb);
+ db_printf("fpcurthread = ");
+ td = pc->pc_fpcurthread;
+ if (td != NULL)
+ db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid,
+ td->td_name);
+ else
+ db_printf("none\n");
+ db_printf("idlethread = ");
+ td = pc->pc_idlethread;
+ if (td != NULL)
+ db_printf("%p: tid %d \"%s\"\n", td, td->td_tid, td->td_name);
+ else
+ db_printf("none\n");
+ db_show_mdpcpu(pc);
+
+#ifdef VIMAGE
+ db_printf("curvnet = %p\n", pc->pc_curthread->td_vnet);
+#endif
+
+#ifdef WITNESS
+ db_printf("spin locks held:\n");
+ witness_list_locks(&pc->pc_spinlocks, db_printf);
+#endif
+}
+
+DB_SHOW_COMMAND(pcpu, db_show_pcpu)
+{
+ struct pcpu *pc;
+ int id;
+
+ if (have_addr)
+ id = ((addr >> 4) % 16) * 10 + (addr % 16);
+ else
+ id = PCPU_GET(cpuid);
+ pc = pcpu_find(id);
+ if (pc == NULL) {
+ db_printf("CPU %d not found\n", id);
+ return;
+ }
+ show_pcpu(pc);
+}
+
+DB_SHOW_ALL_COMMAND(pcpu, db_show_cpu_all)
+{
+ struct pcpu *pc;
+ int id;
+
+ db_printf("Current CPU: %d\n\n", PCPU_GET(cpuid));
+ for (id = 0; id <= mp_maxid; id++) {
+ pc = pcpu_find(id);
+ if (pc != NULL) {
+ show_pcpu(pc);
+ db_printf("\n");
+ }
+ }
+}
+DB_SHOW_ALIAS(allpcpu, db_show_cpu_all);
+#endif
diff --git a/sys/kern/subr_pctrie.c b/sys/kern/subr_pctrie.c
new file mode 100644
index 0000000..2bbd16d
--- /dev/null
+++ b/sys/kern/subr_pctrie.c
@@ -0,0 +1,705 @@
+/*
+ * Copyright (c) 2013 EMC Corp.
+ * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
+ * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Path-compressed radix trie implementation.
+ *
+ * The implementation takes into account the following rationale:
+ * - Size of the nodes should be as small as possible but still big enough
+ * to avoid a large maximum depth for the trie. This is a balance
+ * between the necessity to not wire too much physical memory for the nodes
+ * and the necessity to avoid too much cache pollution during the trie
+ * operations.
+ * - There is not a huge bias toward the number of lookup operations over
+ *   the number of insert and remove operations. This implies that
+ *   optimizations which help one operation but hurt the others must be
+ *   carefully evaluated.
+ * - On average not many nodes are expected to be fully populated, hence
+ * level compression may just complicate things.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/pctrie.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/*
+ * These widths should allow the pointers to a node's children to fit within
+ * a single cache line. The extra levels from a narrow width should not be
+ * a problem thanks to path compression.
+ */
+#ifdef __LP64__
+#define PCTRIE_WIDTH 4
+#else
+#define PCTRIE_WIDTH 3
+#endif
+
+#define PCTRIE_COUNT (1 << PCTRIE_WIDTH)
+#define PCTRIE_MASK (PCTRIE_COUNT - 1)
+#define PCTRIE_LIMIT (howmany((sizeof(uint64_t) * NBBY), PCTRIE_WIDTH) - 1)
+
+/* Flag bits stored in node pointers. */
+#define PCTRIE_ISLEAF 0x1
+#define PCTRIE_FLAGS 0x1
+#define PCTRIE_PAD PCTRIE_FLAGS
+
+/* Returns one unit associated with specified level. */
+#define PCTRIE_UNITLEVEL(lev) \
+ ((uint64_t)1 << ((lev) * PCTRIE_WIDTH))
+
+struct pctrie_node {
+ uint64_t pn_owner; /* Owner of record. */
+ uint16_t pn_count; /* Valid children. */
+ uint16_t pn_clev; /* Current level. */
+ void *pn_child[PCTRIE_COUNT]; /* Child nodes. */
+};
+
+/*
+ * Allocate a node. Pre-allocation should ensure that the request
+ * will always be satisfied.
+ */
+static __inline struct pctrie_node *
+pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner,
+ uint16_t count, uint16_t clevel)
+{
+ struct pctrie_node *node;
+
+ node = allocfn(ptree);
+ if (node == NULL)
+ return (NULL);
+ node->pn_owner = owner;
+ node->pn_count = count;
+ node->pn_clev = clevel;
+
+ return (node);
+}
+
+/*
+ * Free radix node.
+ */
+static __inline void
+pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node,
+ pctrie_free_t freefn)
+{
+#ifdef INVARIANTS
+ int slot;
+
+ KASSERT(node->pn_count == 0,
+ ("pctrie_node_put: node %p has %d children", node,
+ node->pn_count));
+ for (slot = 0; slot < PCTRIE_COUNT; slot++)
+ KASSERT(node->pn_child[slot] == NULL,
+ ("pctrie_node_put: node %p has a child", node));
+#endif
+ freefn(ptree, node);
+}
+
+/*
+ * Return the position in the array for a given level.
+ */
+static __inline int
+pctrie_slot(uint64_t index, uint16_t level)
+{
+
+ return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK);
+}
+
+/* Trims the key after the specified level. */
+static __inline uint64_t
+pctrie_trimkey(uint64_t index, uint16_t level)
+{
+ uint64_t ret;
+
+ ret = index;
+ if (level > 0) {
+ ret >>= level * PCTRIE_WIDTH;
+ ret <<= level * PCTRIE_WIDTH;
+ }
+ return (ret);
+}
+
+/*
+ * Get the root node for a tree.
+ */
+static __inline struct pctrie_node *
+pctrie_getroot(struct pctrie *ptree)
+{
+
+ return ((struct pctrie_node *)ptree->pt_root);
+}
+
+/*
+ * Set the root node for a tree.
+ */
+static __inline void
+pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node)
+{
+
+ ptree->pt_root = (uintptr_t)node;
+}
+
+/*
+ * Returns TRUE if the specified node is a leaf and FALSE otherwise.
+ */
+static __inline boolean_t
+pctrie_isleaf(struct pctrie_node *node)
+{
+
+ return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
+}
+
+/*
+ * Returns the value pointer extracted from the given leaf node.
+ */
+static __inline uint64_t *
+pctrie_toval(struct pctrie_node *node)
+{
+
+ return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
+}
+
+/*
+ * Adds the val as a child of the provided node.
+ */
+static __inline void
+pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev,
+ uint64_t *val)
+{
+ int slot;
+
+ slot = pctrie_slot(index, clev);
+ node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF);
+}
+
+/*
+ * Returns the level at which two keys differ.
+ * The two keys must not be equal.
+ */
+static __inline uint16_t
+pctrie_keydiff(uint64_t index1, uint64_t index2)
+{
+ uint16_t clev;
+
+ KASSERT(index1 != index2, ("%s: passing the same key value %jx",
+ __func__, (uintmax_t)index1));
+
+ index1 ^= index2;
+ for (clev = PCTRIE_LIMIT;; clev--)
+ if (pctrie_slot(index1, clev) != 0)
+ return (clev);
+}
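For example, with the LP64 width of four bits per level, the keys 0x1200 and 0x1300 first differ in the nibble at level 2 (0x1200 ^ 0x1300 == 0x0100), so pctrie_keydiff() returns 2, and the intermediate node that pctrie_insert() builds for those two leaves bisects them at that level.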
+
+/*
+ * Returns TRUE if it can be determined that key does not belong to the
+ * specified node. Otherwise, returns FALSE.
+ */
+static __inline boolean_t
+pctrie_keybarr(struct pctrie_node *node, uint64_t idx)
+{
+
+ if (node->pn_clev < PCTRIE_LIMIT) {
+ idx = pctrie_trimkey(idx, node->pn_clev + 1);
+ return (idx != node->pn_owner);
+ }
+ return (FALSE);
+}
+
+/*
+ * Internal helper for pctrie_reclaim_allnodes().
+ * This function is recursive.
+ */
+static void
+pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node,
+ pctrie_free_t freefn)
+{
+ int slot;
+
+ KASSERT(node->pn_count <= PCTRIE_COUNT,
+ ("pctrie_reclaim_allnodes_int: bad count in node %p", node));
+ for (slot = 0; node->pn_count != 0; slot++) {
+ if (node->pn_child[slot] == NULL)
+ continue;
+ if (!pctrie_isleaf(node->pn_child[slot]))
+ pctrie_reclaim_allnodes_int(ptree,
+ node->pn_child[slot], freefn);
+ node->pn_child[slot] = NULL;
+ node->pn_count--;
+ }
+ pctrie_node_put(ptree, node, freefn);
+}
+
+/*
+ * pctrie node zone initializer.
+ */
+int
+pctrie_zone_init(void *mem, int size __unused, int flags __unused)
+{
+ struct pctrie_node *node;
+
+ node = mem;
+ memset(node->pn_child, 0, sizeof(node->pn_child));
+ return (0);
+}
+
+size_t
+pctrie_node_size(void)
+{
+
+ return (sizeof(struct pctrie_node));
+}
+
+/*
+ * Inserts the key-value pair into the trie.
+ * Panics if the key already exists.
+ */
+int
+pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn)
+{
+ uint64_t index, newind;
+ void **parentp;
+ struct pctrie_node *node, *tmp;
+ uint64_t *m;
+ int slot;
+ uint16_t clev;
+
+ index = *val;
+
+ /*
+ * The owner of record for root is not really important because it
+ * will never be used.
+ */
+ node = pctrie_getroot(ptree);
+ if (node == NULL) {
+ ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF;
+ return (0);
+ }
+ parentp = (void **)&ptree->pt_root;
+ for (;;) {
+ if (pctrie_isleaf(node)) {
+ m = pctrie_toval(node);
+ if (*m == index)
+ panic("%s: key %jx is already present",
+ __func__, (uintmax_t)index);
+ clev = pctrie_keydiff(*m, index);
+ tmp = pctrie_node_get(ptree, allocfn,
+ pctrie_trimkey(index, clev + 1), 2, clev);
+ if (tmp == NULL)
+ return (ENOMEM);
+ *parentp = tmp;
+ pctrie_addval(tmp, index, clev, val);
+ pctrie_addval(tmp, *m, clev, m);
+ return (0);
+ } else if (pctrie_keybarr(node, index))
+ break;
+ slot = pctrie_slot(index, node->pn_clev);
+ if (node->pn_child[slot] == NULL) {
+ node->pn_count++;
+ pctrie_addval(node, index, node->pn_clev, val);
+ return (0);
+ }
+ parentp = &node->pn_child[slot];
+ node = node->pn_child[slot];
+ }
+
+ /*
+ * A new node is needed because the right insertion level is reached.
+ * Set up the new intermediate node and add the two children: the
+ * new value and the older edge.
+ */
+ newind = node->pn_owner;
+ clev = pctrie_keydiff(newind, index);
+ tmp = pctrie_node_get(ptree, allocfn,
+ pctrie_trimkey(index, clev + 1), 2, clev);
+ if (tmp == NULL)
+ return (ENOMEM);
+ *parentp = tmp;
+ pctrie_addval(tmp, index, clev, val);
+ slot = pctrie_slot(newind, clev);
+ tmp->pn_child[slot] = node;
+
+ return (0);
+}
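A hedged usage sketch, not part of this patch: pctrie_insert() reads the key through the uint64_t pointer it is handed, so a consumer typically embeds the key at the start of its record and backs the node callbacks with a UMA zone sized via pctrie_node_size() and initialized with pctrie_zone_init(). All names below (myrec, my_node_zone, rec, tree) are invented for illustration; the callback typedefs live in sys/pctrie.h, which is not part of this hunk.

    struct myrec {
            uint64_t        mr_key;         /* lookup index; kept first so the
                                               returned uint64_t * can be cast
                                               back to the record */
            /* ... consumer data ... */
    };

    static uma_zone_t my_node_zone;         /* created elsewhere with
                                               pctrie_node_size() and
                                               pctrie_zone_init() */

    static struct pctrie_node *
    my_node_alloc(struct pctrie *ptree)
    {

            return (uma_zalloc(my_node_zone, M_NOWAIT));
    }

    static void
    my_node_free(struct pctrie *ptree, struct pctrie_node *node)
    {

            uma_zfree(my_node_zone, node);
    }

    /* In some consumer function: insert, look up again by key, then drop. */
            rec->mr_key = key;
            if (pctrie_insert(&tree, &rec->mr_key, my_node_alloc) != 0)
                    return (ENOMEM);
            rec = (struct myrec *)pctrie_lookup(&tree, key);
            pctrie_remove(&tree, key, my_node_free);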
+
+/*
+ * Returns the value stored at the index. If the index is not present,
+ * NULL is returned.
+ */
+uint64_t *
+pctrie_lookup(struct pctrie *ptree, uint64_t index)
+{
+ struct pctrie_node *node;
+ uint64_t *m;
+ int slot;
+
+ node = pctrie_getroot(ptree);
+ while (node != NULL) {
+ if (pctrie_isleaf(node)) {
+ m = pctrie_toval(node);
+ if (*m == index)
+ return (m);
+ else
+ break;
+ } else if (pctrie_keybarr(node, index))
+ break;
+ slot = pctrie_slot(index, node->pn_clev);
+ node = node->pn_child[slot];
+ }
+ return (NULL);
+}
+
+/*
+ * Look up the nearest entry at a position bigger than or equal to index.
+ */
+uint64_t *
+pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
+{
+ struct pctrie_node *stack[PCTRIE_LIMIT];
+ uint64_t inc;
+ uint64_t *m;
+ struct pctrie_node *child, *node;
+#ifdef INVARIANTS
+ int loops = 0;
+#endif
+ int slot, tos;
+
+ node = pctrie_getroot(ptree);
+ if (node == NULL)
+ return (NULL);
+ else if (pctrie_isleaf(node)) {
+ m = pctrie_toval(node);
+ if (*m >= index)
+ return (m);
+ else
+ return (NULL);
+ }
+ tos = 0;
+ for (;;) {
+ /*
+ * If the keys differ before the current bisection node,
+ * then the search key might roll back to the earliest
+ * available bisection node or to the smallest key
+ * in the current node (if the owner is bigger than the
+ * search key).
+ */
+ if (pctrie_keybarr(node, index)) {
+ if (index > node->pn_owner) {
+ascend:
+ KASSERT(++loops < 1000,
+ ("pctrie_lookup_ge: too many loops"));
+
+ /*
+ * Pop nodes from the stack until either the
+ * stack is empty or a node that could have a
+ * matching descendant is found.
+ */
+ do {
+ if (tos == 0)
+ return (NULL);
+ node = stack[--tos];
+ } while (pctrie_slot(index,
+ node->pn_clev) == (PCTRIE_COUNT - 1));
+
+ /*
+ * The following computation cannot overflow
+ * because index's slot at the current level
+ * is less than PCTRIE_COUNT - 1.
+ */
+ index = pctrie_trimkey(index,
+ node->pn_clev);
+ index += PCTRIE_UNITLEVEL(node->pn_clev);
+ } else
+ index = node->pn_owner;
+ KASSERT(!pctrie_keybarr(node, index),
+ ("pctrie_lookup_ge: keybarr failed"));
+ }
+ slot = pctrie_slot(index, node->pn_clev);
+ child = node->pn_child[slot];
+ if (pctrie_isleaf(child)) {
+ m = pctrie_toval(child);
+ if (*m >= index)
+ return (m);
+ } else if (child != NULL)
+ goto descend;
+
+ /*
+ * Look for an available edge or val within the current
+ * bisection node.
+ */
+ if (slot < (PCTRIE_COUNT - 1)) {
+ inc = PCTRIE_UNITLEVEL(node->pn_clev);
+ index = pctrie_trimkey(index, node->pn_clev);
+ do {
+ index += inc;
+ slot++;
+ child = node->pn_child[slot];
+ if (pctrie_isleaf(child)) {
+ m = pctrie_toval(child);
+ if (*m >= index)
+ return (m);
+ } else if (child != NULL)
+ goto descend;
+ } while (slot < (PCTRIE_COUNT - 1));
+ }
+ KASSERT(child == NULL || pctrie_isleaf(child),
+ ("pctrie_lookup_ge: child is radix node"));
+
+ /*
+ * If a value or edge bigger than the search slot is not found
+ * in the current node, ascend to the next higher-level node.
+ */
+ goto ascend;
+descend:
+ KASSERT(node->pn_clev > 0,
+ ("pctrie_lookup_ge: pushing leaf's parent"));
+ KASSERT(tos < PCTRIE_LIMIT,
+ ("pctrie_lookup_ge: stack overflow"));
+ stack[tos++] = node;
+ node = child;
+ }
+}
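As an illustrative example of the semantics: if the trie currently holds values keyed 10, 20 and 40, pctrie_lookup_ge(ptree, 15) returns the value keyed 20, pctrie_lookup_ge(ptree, 40) returns the value keyed 40, and pctrie_lookup_ge(ptree, 41) returns NULL; pctrie_lookup_le() below is the mirror image.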
+
+/*
+ * Look up the nearest entry at a position less than or equal to index.
+ */
+uint64_t *
+pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
+{
+ struct pctrie_node *stack[PCTRIE_LIMIT];
+ uint64_t inc;
+ uint64_t *m;
+ struct pctrie_node *child, *node;
+#ifdef INVARIANTS
+ int loops = 0;
+#endif
+ int slot, tos;
+
+ node = pctrie_getroot(ptree);
+ if (node == NULL)
+ return (NULL);
+ else if (pctrie_isleaf(node)) {
+ m = pctrie_toval(node);
+ if (*m <= index)
+ return (m);
+ else
+ return (NULL);
+ }
+ tos = 0;
+ for (;;) {
+ /*
+ * If the keys differ before the current bisection node,
+ * then the search key might roll back to the earliest
+ * available bisection node or to the largest key
+ * in the current node (if the owner is smaller than the
+ * search key).
+ */
+ if (pctrie_keybarr(node, index)) {
+ if (index > node->pn_owner) {
+ index = node->pn_owner + PCTRIE_COUNT *
+ PCTRIE_UNITLEVEL(node->pn_clev);
+ } else {
+ascend:
+ KASSERT(++loops < 1000,
+ ("pctrie_lookup_le: too many loops"));
+
+ /*
+ * Pop nodes from the stack until either the
+ * stack is empty or a node that could have a
+ * matching descendant is found.
+ */
+ do {
+ if (tos == 0)
+ return (NULL);
+ node = stack[--tos];
+ } while (pctrie_slot(index,
+ node->pn_clev) == 0);
+
+ /*
+ * The following computation cannot overflow
+ * because index's slot at the current level
+ * is greater than 0.
+ */
+ index = pctrie_trimkey(index,
+ node->pn_clev);
+ }
+ index--;
+ KASSERT(!pctrie_keybarr(node, index),
+ ("pctrie_lookup_le: keybarr failed"));
+ }
+ slot = pctrie_slot(index, node->pn_clev);
+ child = node->pn_child[slot];
+ if (pctrie_isleaf(child)) {
+ m = pctrie_toval(child);
+ if (*m <= index)
+ return (m);
+ } else if (child != NULL)
+ goto descend;
+
+ /*
+ * Look for an available edge or value within the current
+ * bisection node.
+ */
+ if (slot > 0) {
+ inc = PCTRIE_UNITLEVEL(node->pn_clev);
+ index |= inc - 1;
+ do {
+ index -= inc;
+ slot--;
+ child = node->pn_child[slot];
+ if (pctrie_isleaf(child)) {
+ m = pctrie_toval(child);
+ if (*m <= index)
+ return (m);
+ } else if (child != NULL)
+ goto descend;
+ } while (slot > 0);
+ }
+ KASSERT(child == NULL || pctrie_isleaf(child),
+ ("pctrie_lookup_le: child is radix node"));
+
+ /*
+ * If a value or edge smaller than the search slot is not found
+ * in the current node, ascend to the next higher-level node.
+ */
+ goto ascend;
+descend:
+ KASSERT(node->pn_clev > 0,
+ ("pctrie_lookup_le: pushing leaf's parent"));
+ KASSERT(tos < PCTRIE_LIMIT,
+ ("pctrie_lookup_le: stack overflow"));
+ stack[tos++] = node;
+ node = child;
+ }
+}
+
+/*
+ * Remove the specified index from the tree.
+ * Panics if the key is not present.
+ */
+void
+pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn)
+{
+ struct pctrie_node *node, *parent;
+ uint64_t *m;
+ int i, slot;
+
+ node = pctrie_getroot(ptree);
+ if (pctrie_isleaf(node)) {
+ m = pctrie_toval(node);
+ if (*m != index)
+ panic("%s: invalid key found", __func__);
+ pctrie_setroot(ptree, NULL);
+ return;
+ }
+ parent = NULL;
+ for (;;) {
+ if (node == NULL)
+ panic("pctrie_remove: impossible to locate the key");
+ slot = pctrie_slot(index, node->pn_clev);
+ if (pctrie_isleaf(node->pn_child[slot])) {
+ m = pctrie_toval(node->pn_child[slot]);
+ if (*m != index)
+ panic("%s: invalid key found", __func__);
+ node->pn_child[slot] = NULL;
+ node->pn_count--;
+ if (node->pn_count > 1)
+ break;
+ for (i = 0; i < PCTRIE_COUNT; i++)
+ if (node->pn_child[i] != NULL)
+ break;
+ KASSERT(i != PCTRIE_COUNT,
+ ("%s: invalid node configuration", __func__));
+ if (parent == NULL)
+ pctrie_setroot(ptree, node->pn_child[i]);
+ else {
+ slot = pctrie_slot(index, parent->pn_clev);
+ KASSERT(parent->pn_child[slot] == node,
+ ("%s: invalid child value", __func__));
+ parent->pn_child[slot] = node->pn_child[i];
+ }
+ node->pn_count--;
+ node->pn_child[i] = NULL;
+ pctrie_node_put(ptree, node, freefn);
+ break;
+ }
+ parent = node;
+ node = node->pn_child[slot];
+ }
+}
+
+/*
+ * Remove and free all the nodes from the tree.
+ * This function is recursive, but the recursion depth is tightly bounded
+ * because the maximum depth of the tree is fixed.
+ */
+void
+pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn)
+{
+ struct pctrie_node *root;
+
+ root = pctrie_getroot(ptree);
+ if (root == NULL)
+ return;
+ pctrie_setroot(ptree, NULL);
+ if (!pctrie_isleaf(root))
+ pctrie_reclaim_allnodes_int(ptree, root, freefn);
+}
+
+#ifdef DDB
+/*
+ * Show details about the given node.
+ */
+DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
+{
+ struct pctrie_node *node;
+ int i;
+
+ if (!have_addr)
+ return;
+ node = (struct pctrie_node *)addr;
+ db_printf("node %p, owner %jx, children count %u, level %u:\n",
+ (void *)node, (uintmax_t)node->pn_owner, node->pn_count,
+ node->pn_clev);
+ for (i = 0; i < PCTRIE_COUNT; i++)
+ if (node->pn_child[i] != NULL)
+ db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
+ i, (void *)node->pn_child[i],
+ pctrie_isleaf(node->pn_child[i]) ?
+ pctrie_toval(node->pn_child[i]) : NULL,
+ node->pn_clev);
+}
+#endif /* DDB */
diff --git a/sys/kern/subr_power.c b/sys/kern/subr_power.c
new file mode 100644
index 0000000..ac6cd71
--- /dev/null
+++ b/sys/kern/subr_power.c
@@ -0,0 +1,122 @@
+/*-
+ * Copyright (c) 2001 Mitsuru IWASAKI
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+
+#include <sys/power.h>
+#include <sys/taskqueue.h>
+
+static u_int power_pm_type = POWER_PM_TYPE_NONE;
+static power_pm_fn_t power_pm_fn = NULL;
+static void *power_pm_arg = NULL;
+static struct task power_pm_task;
+
+static void
+power_pm_deferred_fn(void *arg, int pending)
+{
+ int state = (intptr_t)arg;
+
+ power_pm_fn(POWER_CMD_SUSPEND, power_pm_arg, state);
+}
+
+int
+power_pm_register(u_int pm_type, power_pm_fn_t pm_fn, void *pm_arg)
+{
+ int error;
+
+ if (power_pm_type == POWER_PM_TYPE_NONE ||
+ power_pm_type == pm_type) {
+ power_pm_type = pm_type;
+ power_pm_fn = pm_fn;
+ power_pm_arg = pm_arg;
+ error = 0;
+ TASK_INIT(&power_pm_task, 0, power_pm_deferred_fn, NULL);
+ } else {
+ error = ENXIO;
+ }
+
+ return (error);
+}
+
+u_int
+power_pm_get_type(void)
+{
+
+ return (power_pm_type);
+}
+
+void
+power_pm_suspend(int state)
+{
+ if (power_pm_fn == NULL)
+ return;
+
+ if (state != POWER_SLEEP_STATE_STANDBY &&
+ state != POWER_SLEEP_STATE_SUSPEND &&
+ state != POWER_SLEEP_STATE_HIBERNATE)
+ return;
+ power_pm_task.ta_context = (void *)(intptr_t)state;
+ taskqueue_enqueue(taskqueue_thread, &power_pm_task);
+}
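A hedged sketch, not part of this patch, of how a platform power-management provider would hook in. The function, softc and device names are invented, the parameter types of the callback are only inferred from the power_pm_fn() call above, and the POWER_* constants come from sys/power.h.

    static void
    mypm_fn(u_long cmd, void *arg, int state)
    {

            if (cmd == POWER_CMD_SUSPEND)
                    ; /* put the platform into sleep state 'state' */
    }

            /* During attach: */
            if (power_pm_register(POWER_PM_TYPE_ACPI, mypm_fn, sc) != 0)
                    device_printf(dev, "another PM provider is registered\n");

            /* Later, from a suspend request: */
            power_pm_suspend(POWER_SLEEP_STATE_SUSPEND);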
+
+/*
+ * Power profile.
+ */
+
+static int power_profile_state = POWER_PROFILE_PERFORMANCE;
+
+int
+power_profile_get_state(void)
+{
+ return (power_profile_state);
+}
+
+void
+power_profile_set_state(int state)
+{
+ int changed;
+
+ if (state != power_profile_state) {
+ power_profile_state = state;
+ changed = 1;
+ if (bootverbose) {
+ printf("system power profile changed to '%s'\n",
+ (state == POWER_PROFILE_PERFORMANCE) ?
+ "performance" : "economy");
+ }
+ } else {
+ changed = 0;
+ }
+
+ if (changed)
+ EVENTHANDLER_INVOKE(power_profile_change, 0);
+}
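Another hedged sketch: a component that wants to react to the power_profile_change event invoked above could register a handler like the one below. The callback name and 'sc' are invented, and the handler's second argument simply mirrors the 0 passed by EVENTHANDLER_INVOKE().

    static void
    mydrv_profile_change(void *arg, int unused)
    {

            if (power_profile_get_state() == POWER_PROFILE_ECONOMY)
                    ; /* e.g. lower device clocks */
    }

            /* During attach: */
            EVENTHANDLER_REGISTER(power_profile_change, mydrv_profile_change,
                sc, EVENTHANDLER_PRI_ANY);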
+
diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c
new file mode 100644
index 0000000..042afa3
--- /dev/null
+++ b/sys/kern/subr_prf.c
@@ -0,0 +1,1140 @@
+/*-
+ * Copyright (c) 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_printf.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/kdb.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/kernel.h>
+#include <sys/msgbuf.h>
+#include <sys/malloc.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/stddef.h>
+#include <sys/sysctl.h>
+#include <sys/tty.h>
+#include <sys/syslog.h>
+#include <sys/cons.h>
+#include <sys/uio.h>
+#include <sys/ctype.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/*
+ * Note that stdarg.h and the ANSI style va_start macro are used for both
+ * ANSI and traditional C compilers.
+ */
+#include <machine/stdarg.h>
+
+#define TOCONS 0x01
+#define TOTTY 0x02
+#define TOLOG 0x04
+
+/* Max number conversion buffer length: an intmax_t in base 2, plus NUL byte. */
+#define MAXNBUF (sizeof(intmax_t) * NBBY + 1)
+
+struct putchar_arg {
+ int flags;
+ int pri;
+ struct tty *tty;
+ char *p_bufr;
+ size_t n_bufr;
+ char *p_next;
+ size_t remain;
+};
+
+struct snprintf_arg {
+ char *str;
+ size_t remain;
+};
+
+extern int log_open;
+
+static void msglogchar(int c, int pri);
+static void msglogstr(char *str, int pri, int filter_cr);
+static void putchar(int ch, void *arg);
+static char *ksprintn(char *nbuf, uintmax_t num, int base, int *len, int upper);
+static void snprintf_func(int ch, void *arg);
+
+static int msgbufmapped; /* Set when safe to use msgbuf */
+int msgbuftrigger;
+
+static int log_console_output = 1;
+TUNABLE_INT("kern.log_console_output", &log_console_output);
+SYSCTL_INT(_kern, OID_AUTO, log_console_output, CTLFLAG_RW,
+ &log_console_output, 0, "Duplicate console output to the syslog.");
+
+/*
+ * See the comment in log_console() below for more explanation of this.
+ */
+static int log_console_add_linefeed = 0;
+TUNABLE_INT("kern.log_console_add_linefeed", &log_console_add_linefeed);
+SYSCTL_INT(_kern, OID_AUTO, log_console_add_linefeed, CTLFLAG_RW,
+ &log_console_add_linefeed, 0, "log_console() adds extra newlines.");
+
+static int always_console_output = 0;
+TUNABLE_INT("kern.always_console_output", &always_console_output);
+SYSCTL_INT(_kern, OID_AUTO, always_console_output, CTLFLAG_RW,
+ &always_console_output, 0, "Always output to console despite TIOCCONS.");
+
+/*
+ * Warn that a system table is full.
+ */
+void
+tablefull(const char *tab)
+{
+
+ log(LOG_ERR, "%s: table is full\n", tab);
+}
+
+/*
+ * Uprintf prints to the controlling terminal for the current process.
+ */
+int
+uprintf(const char *fmt, ...)
+{
+ va_list ap;
+ struct putchar_arg pca;
+ struct proc *p;
+ struct thread *td;
+ int retval;
+
+ td = curthread;
+ if (TD_IS_IDLETHREAD(td))
+ return (0);
+
+ sx_slock(&proctree_lock);
+ p = td->td_proc;
+ PROC_LOCK(p);
+ if ((p->p_flag & P_CONTROLT) == 0) {
+ PROC_UNLOCK(p);
+ retval = 0;
+ goto out;
+ }
+ SESS_LOCK(p->p_session);
+ pca.tty = p->p_session->s_ttyp;
+ SESS_UNLOCK(p->p_session);
+ PROC_UNLOCK(p);
+ if (pca.tty == NULL) {
+ retval = 0;
+ goto out;
+ }
+ pca.flags = TOTTY;
+ pca.p_bufr = NULL;
+ va_start(ap, fmt);
+ tty_lock(pca.tty);
+ retval = kvprintf(fmt, putchar, &pca, 10, ap);
+ tty_unlock(pca.tty);
+ va_end(ap);
+out:
+ sx_sunlock(&proctree_lock);
+ return (retval);
+}
+
+/*
+ * tprintf and vtprintf print on the controlling terminal associated with the
+ * given session, possibly to the log as well.
+ */
+void
+tprintf(struct proc *p, int pri, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vtprintf(p, pri, fmt, ap);
+ va_end(ap);
+}
+
+void
+vtprintf(struct proc *p, int pri, const char *fmt, va_list ap)
+{
+ struct tty *tp = NULL;
+ int flags = 0;
+ struct putchar_arg pca;
+ struct session *sess = NULL;
+
+ sx_slock(&proctree_lock);
+ if (pri != -1)
+ flags |= TOLOG;
+ if (p != NULL) {
+ PROC_LOCK(p);
+ if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) {
+ sess = p->p_session;
+ sess_hold(sess);
+ PROC_UNLOCK(p);
+ tp = sess->s_ttyp;
+ if (tp != NULL && tty_checkoutq(tp))
+ flags |= TOTTY;
+ else
+ tp = NULL;
+ } else
+ PROC_UNLOCK(p);
+ }
+ pca.pri = pri;
+ pca.tty = tp;
+ pca.flags = flags;
+ pca.p_bufr = NULL;
+ if (pca.tty != NULL)
+ tty_lock(pca.tty);
+ kvprintf(fmt, putchar, &pca, 10, ap);
+ if (pca.tty != NULL)
+ tty_unlock(pca.tty);
+ if (sess != NULL)
+ sess_release(sess);
+ msgbuftrigger = 1;
+ sx_sunlock(&proctree_lock);
+}
+
+/*
+ * Ttyprintf displays a message on a tty; it should be used only by
+ * the tty driver, or anything that knows the underlying tty will not
+ * be revoke(2)'d away. Other callers should use tprintf.
+ */
+int
+ttyprintf(struct tty *tp, const char *fmt, ...)
+{
+ va_list ap;
+ struct putchar_arg pca;
+ int retval;
+
+ va_start(ap, fmt);
+ pca.tty = tp;
+ pca.flags = TOTTY;
+ pca.p_bufr = NULL;
+ retval = kvprintf(fmt, putchar, &pca, 10, ap);
+ va_end(ap);
+ return (retval);
+}
+
+/*
+ * Log writes to the log buffer and guarantees not to sleep (so it can be
+ * called by interrupt routines). If there is no process reading the
+ * log yet, it writes to the console also.
+ */
+void
+log(int level, const char *fmt, ...)
+{
+ va_list ap;
+ struct putchar_arg pca;
+#ifdef PRINTF_BUFR_SIZE
+ char bufr[PRINTF_BUFR_SIZE];
+#endif
+
+ pca.tty = NULL;
+ pca.pri = level;
+ pca.flags = log_open ? TOLOG : TOCONS;
+#ifdef PRINTF_BUFR_SIZE
+ pca.p_bufr = bufr;
+ pca.p_next = pca.p_bufr;
+ pca.n_bufr = sizeof(bufr);
+ pca.remain = sizeof(bufr);
+ *pca.p_next = '\0';
+#else
+ pca.p_bufr = NULL;
+#endif
+
+ va_start(ap, fmt);
+ kvprintf(fmt, putchar, &pca, 10, ap);
+ va_end(ap);
+
+#ifdef PRINTF_BUFR_SIZE
+ /* Write any buffered console/log output: */
+ if (*pca.p_bufr != '\0') {
+ if (pca.flags & TOLOG)
+ msglogstr(pca.p_bufr, level, /*filter_cr*/1);
+
+ if (pca.flags & TOCONS)
+ cnputs(pca.p_bufr);
+ }
+#endif
+ msgbuftrigger = 1;
+}
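For illustration only (the device and counter below are invented), a driver records a non-fatal event with a syslog priority like this; the string lands in the message buffer and, while nothing is reading the log, on the console as well.

    log(LOG_WARNING, "%s: watchdog timeout, resetting (%d so far)\n",
        device_get_nameunit(dev), sc->reset_count);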
+
+#define CONSCHUNK 128
+
+void
+log_console(struct uio *uio)
+{
+ int c, error, nl;
+ char *consbuffer;
+ int pri;
+
+ if (!log_console_output)
+ return;
+
+ pri = LOG_INFO | LOG_CONSOLE;
+ uio = cloneuio(uio);
+ consbuffer = malloc(CONSCHUNK, M_TEMP, M_WAITOK);
+
+ nl = 0;
+ while (uio->uio_resid > 0) {
+ c = imin(uio->uio_resid, CONSCHUNK - 1);
+ error = uiomove(consbuffer, c, uio);
+ if (error != 0)
+ break;
+ /* Make sure we're NUL-terminated */
+ consbuffer[c] = '\0';
+ if (consbuffer[c - 1] == '\n')
+ nl = 1;
+ else
+ nl = 0;
+ msglogstr(consbuffer, pri, /*filter_cr*/ 1);
+ }
+ /*
+ * The previous behavior in log_console() is preserved when
+ * log_console_add_linefeed is non-zero. For that behavior, if an
+ * individual console write came in that was not terminated with a
+ * line feed, it would add a line feed.
+ *
+ * This results in different data in the message buffer than
+ * appears on the system console (which doesn't add extra line feed
+ * characters).
+ *
+ * A number of programs and rc scripts write a line feed, or a period
+ * and a line feed when they have completed their operation. On
+ * the console, this looks seamless, but when displayed with
+ * 'dmesg -a', you wind up with output that looks like this:
+ *
+ * Updating motd:
+ * .
+ *
+ * On the console, it looks like this:
+ * Updating motd:.
+ *
+ * We could add logic to detect that situation, or just not insert
+ * the extra newlines. Set the kern.log_console_add_linefeed
+ * sysctl/tunable variable to get the old behavior.
+ */
+ if (!nl && log_console_add_linefeed) {
+ consbuffer[0] = '\n';
+ consbuffer[1] = '\0';
+ msglogstr(consbuffer, pri, /*filter_cr*/ 1);
+ }
+ msgbuftrigger = 1;
+ free(uio, M_IOV);
+ free(consbuffer, M_TEMP);
+ return;
+}
+
+int
+printf(const char *fmt, ...)
+{
+ va_list ap;
+ int retval;
+
+ va_start(ap, fmt);
+ retval = vprintf(fmt, ap);
+ va_end(ap);
+
+ return (retval);
+}
+
+int
+vprintf(const char *fmt, va_list ap)
+{
+ struct putchar_arg pca;
+ int retval;
+#ifdef PRINTF_BUFR_SIZE
+ char bufr[PRINTF_BUFR_SIZE];
+#endif
+
+ pca.tty = NULL;
+ pca.flags = TOCONS | TOLOG;
+ pca.pri = -1;
+#ifdef PRINTF_BUFR_SIZE
+ pca.p_bufr = bufr;
+ pca.p_next = pca.p_bufr;
+ pca.n_bufr = sizeof(bufr);
+ pca.remain = sizeof(bufr);
+ *pca.p_next = '\0';
+#else
+ /* Don't buffer console output. */
+ pca.p_bufr = NULL;
+#endif
+
+ retval = kvprintf(fmt, putchar, &pca, 10, ap);
+
+#ifdef PRINTF_BUFR_SIZE
+ /* Write any buffered console/log output: */
+ if (*pca.p_bufr != '\0') {
+ cnputs(pca.p_bufr);
+ msglogstr(pca.p_bufr, pca.pri, /*filter_cr*/ 1);
+ }
+#endif
+
+ if (!panicstr)
+ msgbuftrigger = 1;
+
+ return (retval);
+}
+
+static void
+putbuf(int c, struct putchar_arg *ap)
+{
+ /* Check if no console output buffer was provided. */
+ if (ap->p_bufr == NULL) {
+ /* Output direct to the console. */
+ if (ap->flags & TOCONS)
+ cnputc(c);
+
+ if (ap->flags & TOLOG)
+ msglogchar(c, ap->pri);
+ } else {
+ /* Buffer the character: */
+ *ap->p_next++ = c;
+ ap->remain--;
+
+ /* Always leave the buffer zero terminated. */
+ *ap->p_next = '\0';
+
+ /* Check if the buffer needs to be flushed. */
+ if (ap->remain == 2 || c == '\n') {
+
+ if (ap->flags & TOLOG)
+ msglogstr(ap->p_bufr, ap->pri, /*filter_cr*/1);
+
+ if (ap->flags & TOCONS) {
+ if ((panicstr == NULL) && (constty != NULL))
+ msgbuf_addstr(&consmsgbuf, -1,
+ ap->p_bufr, /*filter_cr*/ 0);
+
+ if ((constty == NULL) || (always_console_output))
+ cnputs(ap->p_bufr);
+ }
+
+ ap->p_next = ap->p_bufr;
+ ap->remain = ap->n_bufr;
+ *ap->p_next = '\0';
+ }
+
+ /*
+ * Since we fill the buffer up one character at a time,
+ * this should not happen. We should always catch it when
+ * ap->remain == 2 (if not sooner due to a newline), flush
+ * the buffer and move on. One way this could happen is
+ * if someone sets PRINTF_BUFR_SIZE to 1 or something
+ * similarly silly.
+ */
+ KASSERT(ap->remain > 2, ("Bad buffer logic, remain = %zd",
+ ap->remain));
+ }
+}
+
+/*
+ * Print a character on the console or on a user's terminal. If the
+ * destination is the console, the most recent characters are also saved
+ * in msgbuf for later inspection.
+ */
+static void
+putchar(int c, void *arg)
+{
+ struct putchar_arg *ap = (struct putchar_arg*) arg;
+ struct tty *tp = ap->tty;
+ int flags = ap->flags;
+
+ /* Don't use the tty code after a panic or while in ddb. */
+ if (kdb_active) {
+ if (c != '\0')
+ cnputc(c);
+ return;
+ }
+
+ if ((flags & TOTTY) && tp != NULL && panicstr == NULL)
+ tty_putchar(tp, c);
+
+ if ((flags & (TOCONS | TOLOG)) && c != '\0')
+ putbuf(c, ap);
+}
+
+/*
+ * Scaled down version of sprintf(3).
+ */
+int
+sprintf(char *buf, const char *cfmt, ...)
+{
+ int retval;
+ va_list ap;
+
+ va_start(ap, cfmt);
+ retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap);
+ buf[retval] = '\0';
+ va_end(ap);
+ return (retval);
+}
+
+/*
+ * Scaled down version of vsprintf(3).
+ */
+int
+vsprintf(char *buf, const char *cfmt, va_list ap)
+{
+ int retval;
+
+ retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap);
+ buf[retval] = '\0';
+ return (retval);
+}
+
+/*
+ * Scaled down version of snprintf(3).
+ */
+int
+snprintf(char *str, size_t size, const char *format, ...)
+{
+ int retval;
+ va_list ap;
+
+ va_start(ap, format);
+ retval = vsnprintf(str, size, format, ap);
+ va_end(ap);
+ return(retval);
+}
+
+/*
+ * Scaled down version of vsnprintf(3).
+ */
+int
+vsnprintf(char *str, size_t size, const char *format, va_list ap)
+{
+ struct snprintf_arg info;
+ int retval;
+
+ info.str = str;
+ info.remain = size;
+ retval = kvprintf(format, snprintf_func, &info, 10, ap);
+ if (info.remain >= 1)
+ *info.str++ = '\0';
+ return (retval);
+}
+
+/*
+ * Kernel version of vsnprintf(3) that takes a radix argument.
+ */
+int
+vsnrprintf(char *str, size_t size, int radix, const char *format, va_list ap)
+{
+ struct snprintf_arg info;
+ int retval;
+
+ info.str = str;
+ info.remain = size;
+ retval = kvprintf(format, snprintf_func, &info, radix, ap);
+ if (info.remain >= 1)
+ *info.str++ = '\0';
+ return (retval);
+}
+
+static void
+snprintf_func(int ch, void *arg)
+{
+ struct snprintf_arg *const info = arg;
+
+ if (info->remain >= 2) {
+ *info->str++ = ch;
+ info->remain--;
+ }
+}
+
+/*
+ * Put a NUL-terminated ASCII number (base <= 36) in a buffer in reverse
+ * order; return an optional length and a pointer to the last character
+ * written in the buffer (i.e., the first character of the string).
+ * The buffer pointed to by `nbuf' must have length >= MAXNBUF.
+ */
+static char *
+ksprintn(char *nbuf, uintmax_t num, int base, int *lenp, int upper)
+{
+ char *p, c;
+
+ p = nbuf;
+ *p = '\0';
+ do {
+ c = hex2ascii(num % base);
+ *++p = upper ? toupper(c) : c;
+ } while (num /= base);
+ if (lenp)
+ *lenp = p - nbuf;
+ return (p);
+}
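A hedged trace of this helper: ksprintn(nbuf, 255, 16, &len, 0) stores '\0', 'f', 'f' into nbuf, sets len to 2, and returns a pointer to the final 'f', so callers emit the digits by walking backwards, exactly as kvprintf() does below.

    char nbuf[MAXNBUF];
    char *q;
    int len;

    for (q = ksprintn(nbuf, 255, 16, &len, 0); *q != '\0'; q--)
            cnputc(*q);             /* prints "ff"; len == 2 */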
+
+/*
+ * Scaled down version of printf(3).
+ *
+ * Two additional formats:
+ *
+ * The format %b is supported to decode error registers.
+ * Its usage is:
+ *
+ * printf("reg=%b\n", regval, "<base><arg>*");
+ *
+ * where <base> is the output base expressed as a control character, e.g.
+ * \10 gives octal; \20 gives hex. Each arg is a sequence of characters,
+ * the first of which gives the bit number to be inspected (origin 1), and
+ * the next characters (up to a control character, i.e. a character <= 32),
+ * give the name of that bit. Thus:
+ *
+ * kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n");
+ *
+ * would produce output:
+ *
+ * reg=3<BITTWO,BITONE>
+ *
+ * XXX: %D -- Hexdump, takes pointer and separator string:
+ * ("%6D", ptr, ":") -> XX:XX:XX:XX:XX:XX
+ * ("%*D", len, ptr, " " -> XX XX XX XX ...
+ */
+int
+kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap)
+{
+#define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; }
+ char nbuf[MAXNBUF];
+ char *d;
+ const char *p, *percent, *q;
+ u_char *up;
+ int ch, n;
+ uintmax_t num;
+ int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot;
+ int cflag, hflag, jflag, tflag, zflag;
+ int dwidth, upper;
+ char padc;
+ int stop = 0, retval = 0;
+
+ num = 0;
+ if (!func)
+ d = (char *) arg;
+ else
+ d = NULL;
+
+ if (fmt == NULL)
+ fmt = "(fmt null)\n";
+
+ if (radix < 2 || radix > 36)
+ radix = 10;
+
+ for (;;) {
+ padc = ' ';
+ width = 0;
+ while ((ch = (u_char)*fmt++) != '%' || stop) {
+ if (ch == '\0')
+ return (retval);
+ PCHAR(ch);
+ }
+ percent = fmt - 1;
+ qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0;
+ sign = 0; dot = 0; dwidth = 0; upper = 0;
+ cflag = 0; hflag = 0; jflag = 0; tflag = 0; zflag = 0;
+reswitch: switch (ch = (u_char)*fmt++) {
+ case '.':
+ dot = 1;
+ goto reswitch;
+ case '#':
+ sharpflag = 1;
+ goto reswitch;
+ case '+':
+ sign = 1;
+ goto reswitch;
+ case '-':
+ ladjust = 1;
+ goto reswitch;
+ case '%':
+ PCHAR(ch);
+ break;
+ case '*':
+ if (!dot) {
+ width = va_arg(ap, int);
+ if (width < 0) {
+ ladjust = !ladjust;
+ width = -width;
+ }
+ } else {
+ dwidth = va_arg(ap, int);
+ }
+ goto reswitch;
+ case '0':
+ if (!dot) {
+ padc = '0';
+ goto reswitch;
+ }
+ case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ for (n = 0;; ++fmt) {
+ n = n * 10 + ch - '0';
+ ch = *fmt;
+ if (ch < '0' || ch > '9')
+ break;
+ }
+ if (dot)
+ dwidth = n;
+ else
+ width = n;
+ goto reswitch;
+ case 'b':
+ num = (u_int)va_arg(ap, int);
+ p = va_arg(ap, char *);
+ for (q = ksprintn(nbuf, num, *p++, NULL, 0); *q;)
+ PCHAR(*q--);
+
+ if (num == 0)
+ break;
+
+ for (tmp = 0; *p;) {
+ n = *p++;
+ if (num & (1 << (n - 1))) {
+ PCHAR(tmp ? ',' : '<');
+ for (; (n = *p) > ' '; ++p)
+ PCHAR(n);
+ tmp = 1;
+ } else
+ for (; *p > ' '; ++p)
+ continue;
+ }
+ if (tmp)
+ PCHAR('>');
+ break;
+ case 'c':
+ PCHAR(va_arg(ap, int));
+ break;
+ case 'D':
+ up = va_arg(ap, u_char *);
+ p = va_arg(ap, char *);
+ if (!width)
+ width = 16;
+ while(width--) {
+ PCHAR(hex2ascii(*up >> 4));
+ PCHAR(hex2ascii(*up & 0x0f));
+ up++;
+ if (width)
+ for (q = p; *q; q++)
+ PCHAR(*q);
+ }
+ break;
+ case 'd':
+ case 'i':
+ base = 10;
+ sign = 1;
+ goto handle_sign;
+ case 'h':
+ if (hflag) {
+ hflag = 0;
+ cflag = 1;
+ } else
+ hflag = 1;
+ goto reswitch;
+ case 'j':
+ jflag = 1;
+ goto reswitch;
+ case 'l':
+ if (lflag) {
+ lflag = 0;
+ qflag = 1;
+ } else
+ lflag = 1;
+ goto reswitch;
+ case 'n':
+ if (jflag)
+ *(va_arg(ap, intmax_t *)) = retval;
+ else if (qflag)
+ *(va_arg(ap, quad_t *)) = retval;
+ else if (lflag)
+ *(va_arg(ap, long *)) = retval;
+ else if (zflag)
+ *(va_arg(ap, size_t *)) = retval;
+ else if (hflag)
+ *(va_arg(ap, short *)) = retval;
+ else if (cflag)
+ *(va_arg(ap, char *)) = retval;
+ else
+ *(va_arg(ap, int *)) = retval;
+ break;
+ case 'o':
+ base = 8;
+ goto handle_nosign;
+ case 'p':
+ base = 16;
+ sharpflag = (width == 0);
+ sign = 0;
+ num = (uintptr_t)va_arg(ap, void *);
+ goto number;
+ case 'q':
+ qflag = 1;
+ goto reswitch;
+ case 'r':
+ base = radix;
+ if (sign)
+ goto handle_sign;
+ goto handle_nosign;
+ case 's':
+ p = va_arg(ap, char *);
+ if (p == NULL)
+ p = "(null)";
+ if (!dot)
+ n = strlen (p);
+ else
+ for (n = 0; n < dwidth && p[n]; n++)
+ continue;
+
+ width -= n;
+
+ if (!ladjust && width > 0)
+ while (width--)
+ PCHAR(padc);
+ while (n--)
+ PCHAR(*p++);
+ if (ladjust && width > 0)
+ while (width--)
+ PCHAR(padc);
+ break;
+ case 't':
+ tflag = 1;
+ goto reswitch;
+ case 'u':
+ base = 10;
+ goto handle_nosign;
+ case 'X':
+ upper = 1;
+ case 'x':
+ base = 16;
+ goto handle_nosign;
+ case 'y':
+ base = 16;
+ sign = 1;
+ goto handle_sign;
+ case 'z':
+ zflag = 1;
+ goto reswitch;
+handle_nosign:
+ sign = 0;
+ if (jflag)
+ num = va_arg(ap, uintmax_t);
+ else if (qflag)
+ num = va_arg(ap, u_quad_t);
+ else if (tflag)
+ num = va_arg(ap, ptrdiff_t);
+ else if (lflag)
+ num = va_arg(ap, u_long);
+ else if (zflag)
+ num = va_arg(ap, size_t);
+ else if (hflag)
+ num = (u_short)va_arg(ap, int);
+ else if (cflag)
+ num = (u_char)va_arg(ap, int);
+ else
+ num = va_arg(ap, u_int);
+ goto number;
+handle_sign:
+ if (jflag)
+ num = va_arg(ap, intmax_t);
+ else if (qflag)
+ num = va_arg(ap, quad_t);
+ else if (tflag)
+ num = va_arg(ap, ptrdiff_t);
+ else if (lflag)
+ num = va_arg(ap, long);
+ else if (zflag)
+ num = va_arg(ap, ssize_t);
+ else if (hflag)
+ num = (short)va_arg(ap, int);
+ else if (cflag)
+ num = (char)va_arg(ap, int);
+ else
+ num = va_arg(ap, int);
+number:
+ if (sign && (intmax_t)num < 0) {
+ neg = 1;
+ num = -(intmax_t)num;
+ }
+ p = ksprintn(nbuf, num, base, &n, upper);
+ tmp = 0;
+ if (sharpflag && num != 0) {
+ if (base == 8)
+ tmp++;
+ else if (base == 16)
+ tmp += 2;
+ }
+ if (neg)
+ tmp++;
+
+ if (!ladjust && padc == '0')
+ dwidth = width - tmp;
+ width -= tmp + imax(dwidth, n);
+ dwidth -= n;
+ if (!ladjust)
+ while (width-- > 0)
+ PCHAR(' ');
+ if (neg)
+ PCHAR('-');
+ if (sharpflag && num != 0) {
+ if (base == 8) {
+ PCHAR('0');
+ } else if (base == 16) {
+ PCHAR('0');
+ PCHAR('x');
+ }
+ }
+ while (dwidth-- > 0)
+ PCHAR('0');
+
+ while (*p)
+ PCHAR(*p--);
+
+ if (ladjust)
+ while (width-- > 0)
+ PCHAR(' ');
+
+ break;
+ default:
+ while (percent < fmt)
+ PCHAR(*percent++);
+ /*
+ * Since we ignore a formatting argument it is no
+ * longer safe to obey the remaining formatting
+ * arguments as the arguments will no longer match
+ * the format specs.
+ */
+ stop = 1;
+ break;
+ }
+ }
+#undef PCHAR
+}
+
+/*
+ * Put character in log buffer with a particular priority.
+ */
+static void
+msglogchar(int c, int pri)
+{
+ static int lastpri = -1;
+ static int dangling;
+ char nbuf[MAXNBUF];
+ char *p;
+
+ if (!msgbufmapped)
+ return;
+ if (c == '\0' || c == '\r')
+ return;
+ if (pri != -1 && pri != lastpri) {
+ if (dangling) {
+ msgbuf_addchar(msgbufp, '\n');
+ dangling = 0;
+ }
+ msgbuf_addchar(msgbufp, '<');
+ for (p = ksprintn(nbuf, (uintmax_t)pri, 10, NULL, 0); *p;)
+ msgbuf_addchar(msgbufp, *p--);
+ msgbuf_addchar(msgbufp, '>');
+ lastpri = pri;
+ }
+ msgbuf_addchar(msgbufp, c);
+ if (c == '\n') {
+ dangling = 0;
+ lastpri = -1;
+ } else {
+ dangling = 1;
+ }
+}
+
+static void
+msglogstr(char *str, int pri, int filter_cr)
+{
+ if (!msgbufmapped)
+ return;
+
+ msgbuf_addstr(msgbufp, pri, str, filter_cr);
+}
+
+void
+msgbufinit(void *ptr, int size)
+{
+ char *cp;
+ static struct msgbuf *oldp = NULL;
+
+ size -= sizeof(*msgbufp);
+ cp = (char *)ptr;
+ msgbufp = (struct msgbuf *)(cp + size);
+ msgbuf_reinit(msgbufp, cp, size);
+ if (msgbufmapped && oldp != msgbufp)
+ msgbuf_copy(oldp, msgbufp);
+ msgbufmapped = 1;
+ oldp = msgbufp;
+}
+
+static int unprivileged_read_msgbuf = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_read_msgbuf,
+ CTLFLAG_RW, &unprivileged_read_msgbuf, 0,
+ "Unprivileged processes may read the kernel message buffer");
+
+/* Sysctls for accessing/clearing the msgbuf */
+static int
+sysctl_kern_msgbuf(SYSCTL_HANDLER_ARGS)
+{
+ char buf[128];
+ u_int seq;
+ int error, len;
+
+ if (!unprivileged_read_msgbuf) {
+ error = priv_check(req->td, PRIV_MSGBUF);
+ if (error)
+ return (error);
+ }
+
+ /* Read the whole buffer, one chunk at a time. */
+ mtx_lock(&msgbuf_lock);
+ msgbuf_peekbytes(msgbufp, NULL, 0, &seq);
+ for (;;) {
+ len = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq);
+ mtx_unlock(&msgbuf_lock);
+ if (len == 0)
+ return (0);
+
+ error = sysctl_handle_opaque(oidp, buf, len, req);
+ if (error)
+ return (error);
+
+ mtx_lock(&msgbuf_lock);
+ }
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, msgbuf,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ NULL, 0, sysctl_kern_msgbuf, "A", "Contents of kernel message buffer");
+
+static int msgbuf_clearflag;
+
+static int
+sysctl_kern_msgbuf_clear(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
+ if (!error && req->newptr) {
+ mtx_lock(&msgbuf_lock);
+ msgbuf_clear(msgbufp);
+ mtx_unlock(&msgbuf_lock);
+ msgbuf_clearflag = 0;
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, msgbuf_clear,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_MPSAFE,
+ &msgbuf_clearflag, 0, sysctl_kern_msgbuf_clear, "I",
+ "Clear kernel message buffer");
+
+#ifdef DDB
+
+DB_SHOW_COMMAND(msgbuf, db_show_msgbuf)
+{
+ int i, j;
+
+ if (!msgbufmapped) {
+ db_printf("msgbuf not mapped yet\n");
+ return;
+ }
+ db_printf("msgbufp = %p\n", msgbufp);
+ db_printf("magic = %x, size = %d, r= %u, w = %u, ptr = %p, cksum= %u\n",
+ msgbufp->msg_magic, msgbufp->msg_size, msgbufp->msg_rseq,
+ msgbufp->msg_wseq, msgbufp->msg_ptr, msgbufp->msg_cksum);
+ for (i = 0; i < msgbufp->msg_size && !db_pager_quit; i++) {
+ j = MSGBUF_SEQ_TO_POS(msgbufp, i + msgbufp->msg_rseq);
+ db_printf("%c", msgbufp->msg_ptr[j]);
+ }
+ db_printf("\n");
+}
+
+#endif /* DDB */
+
+void
+hexdump(const void *ptr, int length, const char *hdr, int flags)
+{
+ int i, j, k;
+ int cols;
+ const unsigned char *cp;
+ char delim;
+
+ if ((flags & HD_DELIM_MASK) != 0)
+ delim = (flags & HD_DELIM_MASK) >> 8;
+ else
+ delim = ' ';
+
+ if ((flags & HD_COLUMN_MASK) != 0)
+ cols = flags & HD_COLUMN_MASK;
+ else
+ cols = 16;
+
+ cp = ptr;
+ for (i = 0; i < length; i += cols) {
+ if (hdr != NULL)
+ printf("%s", hdr);
+
+ if ((flags & HD_OMIT_COUNT) == 0)
+ printf("%04x ", i);
+
+ if ((flags & HD_OMIT_HEX) == 0) {
+ for (j = 0; j < cols; j++) {
+ k = i + j;
+ if (k < length)
+ printf("%c%02x", delim, cp[k]);
+ else
+ printf(" ");
+ }
+ }
+
+ if ((flags & HD_OMIT_CHARS) == 0) {
+ printf(" |");
+ for (j = 0; j < cols; j++) {
+ k = i + j;
+ if (k >= length)
+ printf(" ");
+ else if (cp[k] >= ' ' && cp[k] <= '~')
+ printf("%c", cp[k]);
+ else
+ printf(".");
+ }
+ printf("|");
+ }
+ printf("\n");
+ }
+}
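A hedged usage note: each output row is the byte offset, the hex bytes (cols per row, 16 by default, delimiter taken from HD_DELIM_MASK), and a |printable characters| column; the HD_OMIT_* flags drop individual columns. For example (buffer contents invented):

    static const char sample[] = "hello, world";

    hexdump(sample, sizeof(sample), NULL, 0);
    /* One row: offset 0000, thirteen hex bytes, then |hello, world.| */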
+
diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c
new file mode 100644
index 0000000..c5b6b08
--- /dev/null
+++ b/sys/kern/subr_prof.c
@@ -0,0 +1,589 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_prof.c 8.3 (Berkeley) 9/23/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sysctl.h>
+
+#include <machine/cpu.h>
+
+#ifdef GPROF
+#include <sys/malloc.h>
+#include <sys/gmon.h>
+#undef MCOUNT
+
+static MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer");
+
+static void kmstartup(void *);
+SYSINIT(kmem, SI_SUB_KPROF, SI_ORDER_FIRST, kmstartup, NULL);
+
+struct gmonparam _gmonparam = { GMON_PROF_OFF };
+
+#ifdef GUPROF
+void
+nullfunc_loop_profiled()
+{
+ int i;
+
+ for (i = 0; i < CALIB_SCALE; i++)
+ nullfunc_profiled();
+}
+
+#define nullfunc_loop_profiled_end nullfunc_profiled /* XXX */
+
+void
+nullfunc_profiled()
+{
+}
+#endif /* GUPROF */
+
+/*
+ * Update the histograms to support extending the text region arbitrarily.
+ * This is done slightly naively (no sparse regions), so it wastes a small
+ * amount of memory, but overall it works well enough to allow profiling
+ * of KLDs.
+ */
+void
+kmupetext(uintfptr_t nhighpc)
+{
+ struct gmonparam np; /* slightly large */
+ struct gmonparam *p = &_gmonparam;
+ char *cp;
+
+ GIANT_REQUIRED;
+ bcopy(p, &np, sizeof(*p));
+ np.highpc = ROUNDUP(nhighpc, HISTFRACTION * sizeof(HISTCOUNTER));
+ if (np.highpc <= p->highpc)
+ return;
+ np.textsize = np.highpc - p->lowpc;
+ np.kcountsize = np.textsize / HISTFRACTION;
+ np.hashfraction = HASHFRACTION;
+ np.fromssize = np.textsize / HASHFRACTION;
+ np.tolimit = np.textsize * ARCDENSITY / 100;
+ if (np.tolimit < MINARCS)
+ np.tolimit = MINARCS;
+ else if (np.tolimit > MAXARCS)
+ np.tolimit = MAXARCS;
+ np.tossize = np.tolimit * sizeof(struct tostruct);
+ cp = malloc(np.kcountsize + np.fromssize + np.tossize,
+ M_GPROF, M_WAITOK);
+ /*
+ * Check for something else extending highpc while we slept.
+ */
+ if (np.highpc <= p->highpc) {
+ free(cp, M_GPROF);
+ return;
+ }
+ np.tos = (struct tostruct *)cp;
+ cp += np.tossize;
+ np.kcount = (HISTCOUNTER *)cp;
+ cp += np.kcountsize;
+ np.froms = (u_short *)cp;
+#ifdef GUPROF
+ /* Reinitialize pointers to overhead counters. */
+ np.cputime_count = &KCOUNT(&np, PC_TO_I(&np, cputime));
+ np.mcount_count = &KCOUNT(&np, PC_TO_I(&np, mcount));
+ np.mexitcount_count = &KCOUNT(&np, PC_TO_I(&np, mexitcount));
+#endif
+ critical_enter();
+ bcopy(p->tos, np.tos, p->tossize);
+ bzero((char *)np.tos + p->tossize, np.tossize - p->tossize);
+ bcopy(p->kcount, np.kcount, p->kcountsize);
+ bzero((char *)np.kcount + p->kcountsize, np.kcountsize -
+ p->kcountsize);
+ bcopy(p->froms, np.froms, p->fromssize);
+ bzero((char *)np.froms + p->fromssize, np.fromssize - p->fromssize);
+ cp = (char *)p->tos;
+ bcopy(&np, p, sizeof(*p));
+ critical_exit();
+ free(cp, M_GPROF);
+}
+
+static void
+kmstartup(dummy)
+ void *dummy;
+{
+ char *cp;
+ struct gmonparam *p = &_gmonparam;
+#ifdef GUPROF
+ int cputime_overhead;
+ int empty_loop_time;
+ int i;
+ int mcount_overhead;
+ int mexitcount_overhead;
+ int nullfunc_loop_overhead;
+ int nullfunc_loop_profiled_time;
+ uintfptr_t tmp_addr;
+#endif
+
+ /*
+ * Round lowpc and highpc to multiples of the density we're using
+ * so the rest of the scaling (here and in gprof) stays in ints.
+ */
+ p->lowpc = ROUNDDOWN((u_long)btext, HISTFRACTION * sizeof(HISTCOUNTER));
+ p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER));
+ p->textsize = p->highpc - p->lowpc;
+ printf("Profiling kernel, textsize=%lu [%jx..%jx]\n",
+ p->textsize, (uintmax_t)p->lowpc, (uintmax_t)p->highpc);
+ p->kcountsize = p->textsize / HISTFRACTION;
+ p->hashfraction = HASHFRACTION;
+ p->fromssize = p->textsize / HASHFRACTION;
+ p->tolimit = p->textsize * ARCDENSITY / 100;
+ if (p->tolimit < MINARCS)
+ p->tolimit = MINARCS;
+ else if (p->tolimit > MAXARCS)
+ p->tolimit = MAXARCS;
+ p->tossize = p->tolimit * sizeof(struct tostruct);
+ cp = (char *)malloc(p->kcountsize + p->fromssize + p->tossize,
+ M_GPROF, M_WAITOK | M_ZERO);
+ p->tos = (struct tostruct *)cp;
+ cp += p->tossize;
+ p->kcount = (HISTCOUNTER *)cp;
+ cp += p->kcountsize;
+ p->froms = (u_short *)cp;
+ p->histcounter_type = FUNCTION_ALIGNMENT / HISTFRACTION * NBBY;
+
+#ifdef GUPROF
+ /* Signed counters. */
+ p->histcounter_type = -p->histcounter_type;
+
+ /* Initialize pointers to overhead counters. */
+ p->cputime_count = &KCOUNT(p, PC_TO_I(p, cputime));
+ p->mcount_count = &KCOUNT(p, PC_TO_I(p, mcount));
+ p->mexitcount_count = &KCOUNT(p, PC_TO_I(p, mexitcount));
+
+ /*
+ * Disable interrupts to avoid interference while we calibrate
+ * things.
+ */
+ critical_enter();
+
+ /*
+ * Determine overheads.
+ * XXX this needs to be repeated for each useful timer/counter.
+ */
+ cputime_overhead = 0;
+ startguprof(p);
+ for (i = 0; i < CALIB_SCALE; i++)
+ cputime_overhead += cputime();
+
+ empty_loop();
+ startguprof(p);
+ empty_loop();
+ empty_loop_time = cputime();
+
+ nullfunc_loop_profiled();
+
+ /*
+ * Start profiling. There won't be any normal function calls since
+ * interrupts are disabled, but we will call the profiling routines
+ * directly to determine their overheads.
+ */
+ p->state = GMON_PROF_HIRES;
+
+ startguprof(p);
+ nullfunc_loop_profiled();
+
+ startguprof(p);
+ for (i = 0; i < CALIB_SCALE; i++)
+ MCOUNT_OVERHEAD(sys_profil);
+ mcount_overhead = KCOUNT(p, PC_TO_I(p, sys_profil));
+
+ startguprof(p);
+ for (i = 0; i < CALIB_SCALE; i++)
+ MEXITCOUNT_OVERHEAD();
+ MEXITCOUNT_OVERHEAD_GETLABEL(tmp_addr);
+ mexitcount_overhead = KCOUNT(p, PC_TO_I(p, tmp_addr));
+
+ p->state = GMON_PROF_OFF;
+ stopguprof(p);
+
+ critical_exit();
+
+ nullfunc_loop_profiled_time = 0;
+ for (tmp_addr = (uintfptr_t)nullfunc_loop_profiled;
+ tmp_addr < (uintfptr_t)nullfunc_loop_profiled_end;
+ tmp_addr += HISTFRACTION * sizeof(HISTCOUNTER))
+ nullfunc_loop_profiled_time += KCOUNT(p, PC_TO_I(p, tmp_addr));
+#define CALIB_DOSCALE(count) (((count) + CALIB_SCALE / 3) / CALIB_SCALE)
+#define c2n(count, freq) ((int)((count) * 1000000000LL / freq))
+ printf("cputime %d, empty_loop %d, nullfunc_loop_profiled %d, mcount %d, mexitcount %d\n",
+ CALIB_DOSCALE(c2n(cputime_overhead, p->profrate)),
+ CALIB_DOSCALE(c2n(empty_loop_time, p->profrate)),
+ CALIB_DOSCALE(c2n(nullfunc_loop_profiled_time, p->profrate)),
+ CALIB_DOSCALE(c2n(mcount_overhead, p->profrate)),
+ CALIB_DOSCALE(c2n(mexitcount_overhead, p->profrate)));
+ cputime_overhead -= empty_loop_time;
+ mcount_overhead -= empty_loop_time;
+ mexitcount_overhead -= empty_loop_time;
+
+ /*-
+ * Profiling overheads are determined by the times between the
+ * following events:
+ * MC1: mcount() is called
+ * MC2: cputime() (called from mcount()) latches the timer
+ * MC3: mcount() completes
+ * ME1: mexitcount() is called
+ * ME2: cputime() (called from mexitcount()) latches the timer
+ * ME3: mexitcount() completes.
+ * The times between the events vary slightly depending on instruction
+ * combination and cache misses, etc. Attempt to determine the
+ * minimum times. These can be subtracted from the profiling times
+ * without much risk of reducing the profiling times below what they
+ * would be when profiling is not configured. Abbreviate:
+ * ab = minimum time between MC1 and MC3
+ * a = minimum time between MC1 and MC2
+ * b = minimum time between MC2 and MC3
+ * cd = minimum time between ME1 and ME3
+ * c = minimum time between ME1 and ME2
+ * d = minimum time between ME2 and ME3.
+ * These satisfy the relations:
+ * ab <= mcount_overhead (just measured)
+ * a + b <= ab
+ * cd <= mexitcount_overhead (just measured)
+ * c + d <= cd
+ * a + d <= nullfunc_loop_profiled_time (just measured)
+ * a >= 0, b >= 0, c >= 0, d >= 0.
+ * Assume that ab and cd are equal to the minimums.
+ */
+ p->cputime_overhead = CALIB_DOSCALE(cputime_overhead);
+ p->mcount_overhead = CALIB_DOSCALE(mcount_overhead - cputime_overhead);
+ p->mexitcount_overhead = CALIB_DOSCALE(mexitcount_overhead
+ - cputime_overhead);
+ nullfunc_loop_overhead = nullfunc_loop_profiled_time - empty_loop_time;
+ p->mexitcount_post_overhead = CALIB_DOSCALE((mcount_overhead
+ - nullfunc_loop_overhead)
+ / 4);
+ p->mexitcount_pre_overhead = p->mexitcount_overhead
+ + p->cputime_overhead
+ - p->mexitcount_post_overhead;
+ p->mcount_pre_overhead = CALIB_DOSCALE(nullfunc_loop_overhead)
+ - p->mexitcount_post_overhead;
+ p->mcount_post_overhead = p->mcount_overhead
+ + p->cputime_overhead
+ - p->mcount_pre_overhead;
+ printf(
+"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d nsec\n",
+ c2n(p->cputime_overhead, p->profrate),
+ c2n(p->mcount_overhead, p->profrate),
+ c2n(p->mcount_pre_overhead, p->profrate),
+ c2n(p->mcount_post_overhead, p->profrate),
+ c2n(p->cputime_overhead, p->profrate),
+ c2n(p->mexitcount_overhead, p->profrate),
+ c2n(p->mexitcount_pre_overhead, p->profrate),
+ c2n(p->mexitcount_post_overhead, p->profrate));
+ printf(
+"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d cycles\n",
+ p->cputime_overhead, p->mcount_overhead,
+ p->mcount_pre_overhead, p->mcount_post_overhead,
+ p->cputime_overhead, p->mexitcount_overhead,
+ p->mexitcount_pre_overhead, p->mexitcount_post_overhead);
+#endif /* GUPROF */
+}
+
+/*
+ * Return kernel profiling information.
+ */
+static int
+sysctl_kern_prof(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
+ struct gmonparam *gp = &_gmonparam;
+ int error;
+ int state;
+
+ /* all sysctl names at this level are terminal */
+ if (namelen != 1)
+ return (ENOTDIR); /* overloaded */
+
+ switch (name[0]) {
+ case GPROF_STATE:
+ state = gp->state;
+ error = sysctl_handle_int(oidp, &state, 0, req);
+ if (error)
+ return (error);
+ if (!req->newptr)
+ return (0);
+ if (state == GMON_PROF_OFF) {
+ gp->state = state;
+ PROC_LOCK(&proc0);
+ stopprofclock(&proc0);
+ PROC_UNLOCK(&proc0);
+ stopguprof(gp);
+ } else if (state == GMON_PROF_ON) {
+ gp->state = GMON_PROF_OFF;
+ stopguprof(gp);
+ gp->profrate = profhz;
+ PROC_LOCK(&proc0);
+ startprofclock(&proc0);
+ PROC_UNLOCK(&proc0);
+ gp->state = state;
+#ifdef GUPROF
+ } else if (state == GMON_PROF_HIRES) {
+ gp->state = GMON_PROF_OFF;
+ PROC_LOCK(&proc0);
+ stopprofclock(&proc0);
+ PROC_UNLOCK(&proc0);
+ startguprof(gp);
+ gp->state = state;
+#endif
+ } else if (state != gp->state)
+ return (EINVAL);
+ return (0);
+ case GPROF_COUNT:
+ return (sysctl_handle_opaque(oidp,
+ gp->kcount, gp->kcountsize, req));
+ case GPROF_FROMS:
+ return (sysctl_handle_opaque(oidp,
+ gp->froms, gp->fromssize, req));
+ case GPROF_TOS:
+ return (sysctl_handle_opaque(oidp,
+ gp->tos, gp->tossize, req));
+ case GPROF_GMONPARAM:
+ return (sysctl_handle_opaque(oidp, gp, sizeof *gp, req));
+ default:
+ return (EOPNOTSUPP);
+ }
+ /* NOTREACHED */
+}
+
+static SYSCTL_NODE(_kern, KERN_PROF, prof, CTLFLAG_RW, sysctl_kern_prof, "");
+#endif /* GPROF */
+
+/*
+ * Profiling system call.
+ *
+ * The scale factor is a fixed point number with 16 bits of fraction, so that
+ * 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct profil_args {
+ caddr_t samples;
+ size_t size;
+ size_t offset;
+ u_int scale;
+};
+#endif
+/* ARGSUSED */
+int
+sys_profil(struct thread *td, struct profil_args *uap)
+{
+ struct uprof *upp;
+ struct proc *p;
+
+ if (uap->scale > (1 << 16))
+ return (EINVAL);
+
+ p = td->td_proc;
+ if (uap->scale == 0) {
+ PROC_LOCK(p);
+ stopprofclock(p);
+ PROC_UNLOCK(p);
+ return (0);
+ }
+ PROC_LOCK(p);
+ upp = &td->td_proc->p_stats->p_prof;
+ PROC_SLOCK(p);
+ upp->pr_off = uap->offset;
+ upp->pr_scale = uap->scale;
+ upp->pr_base = uap->samples;
+ upp->pr_size = uap->size;
+ PROC_SUNLOCK(p);
+ startprofclock(p);
+ PROC_UNLOCK(p);
+
+ return (0);
+}
+
+/*
+ * Scale is a fixed-point number with the binary point 16 bits
+ * into the value, and is <= 1.0. pc is at most 32 bits, so the
+ * intermediate result is at most 48 bits.
+ */
+#define PC_TO_INDEX(pc, prof) \
+ ((int)(((u_quad_t)((pc) - (prof)->pr_off) * \
+ (u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
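A hedged worked example of the macro: with pr_scale 0x10000 (1.0) the index is simply (pc - pr_off) rounded down to an even value, so the sample buffer covers the profiled region byte for byte; halving the scale halves the buffer. Concretely:

    u_quad_t off = 0x104;                   /* pc - pr_off */
    u_quad_t scale = 0x8000;                /* 0.5 in 16.16 fixed point */
    int idx = (int)((off * scale) >> 16) & ~1;      /* == 0x82 */
    /* addupc_*() below bumps the u_short counter at pr_base + idx. */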
+
+/*
+ * Collect user-level profiling statistics; called on a profiling tick,
+ * when a process is running in user-mode. This routine may be called
+ * from an interrupt context. We try to update the user profiling buffers
+ * cheaply with fuswintr() and suswintr(). If that fails, we revert to
+ * an AST that will vector us to trap() with a context in which copyin
+ * and copyout will work. Trap will then call addupc_task().
+ *
+ * Note that we may (rarely) not get around to the AST soon enough, and
+ * lose profile ticks when the next tick overwrites this one, but in this
+ * case the system is overloaded and the profile is probably already
+ * inaccurate.
+ */
+void
+addupc_intr(struct thread *td, uintfptr_t pc, u_int ticks)
+{
+ struct uprof *prof;
+ caddr_t addr;
+ u_int i;
+ int v;
+
+ if (ticks == 0)
+ return;
+ prof = &td->td_proc->p_stats->p_prof;
+ PROC_SLOCK(td->td_proc);
+ if (pc < prof->pr_off ||
+ (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) {
+ PROC_SUNLOCK(td->td_proc);
+ return; /* out of range; ignore */
+ }
+
+ addr = prof->pr_base + i;
+ PROC_SUNLOCK(td->td_proc);
+ if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) {
+ td->td_profil_addr = pc;
+ td->td_profil_ticks = ticks;
+ td->td_pflags |= TDP_OWEUPC;
+ thread_lock(td);
+ td->td_flags |= TDF_ASTPENDING;
+ thread_unlock(td);
+ }
+}
+
+/*
+ * Much like before, but we can afford to take faults here. If the
+ * update fails, we simply turn off profiling.
+ */
+void
+addupc_task(struct thread *td, uintfptr_t pc, u_int ticks)
+{
+ struct proc *p = td->td_proc;
+ struct uprof *prof;
+ caddr_t addr;
+ u_int i;
+ u_short v;
+ int stop = 0;
+
+ if (ticks == 0)
+ return;
+
+ PROC_LOCK(p);
+ if (!(p->p_flag & P_PROFIL)) {
+ PROC_UNLOCK(p);
+ return;
+ }
+ p->p_profthreads++;
+ prof = &p->p_stats->p_prof;
+ PROC_SLOCK(p);
+ if (pc < prof->pr_off ||
+ (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) {
+ PROC_SUNLOCK(p);
+ goto out;
+ }
+
+ addr = prof->pr_base + i;
+ PROC_SUNLOCK(p);
+ PROC_UNLOCK(p);
+ if (copyin(addr, &v, sizeof(v)) == 0) {
+ v += ticks;
+ if (copyout(&v, addr, sizeof(v)) == 0) {
+ PROC_LOCK(p);
+ goto out;
+ }
+ }
+ stop = 1;
+ PROC_LOCK(p);
+
+out:
+ if (--p->p_profthreads == 0) {
+ if (p->p_flag & P_STOPPROF) {
+ wakeup(&p->p_profthreads);
+ stop = 0;
+ }
+ }
+ if (stop)
+ stopprofclock(p);
+ PROC_UNLOCK(p);
+}
+
+#if (defined(__amd64__) || defined(__i386__)) && \
+ defined(__GNUCLIKE_CTOR_SECTION_HANDLING)
+/*
+ * Support for "--test-coverage --profile-arcs" in GCC.
+ *
+ * We need to call all the functions in the .ctor section, in order
+ * to get all the counter-arrays strung into a list.
+ *
+ * XXX: the .ctors call __bb_init_func which is located over in
+ * XXX: i386/i386/support.s for historical reasons. There is probably
+ * XXX: no reason for that to be assembler anymore, but doing it right
+ * XXX: in MI C code requires one to reverse-engineer the type-selection
+ * XXX: inside GCC. Have fun.
+ *
+ * XXX: Worrisome perspective: Calling the .ctors may make C++ in the
+ * XXX: kernel feasible. Don't.
+ */
+typedef void (*ctor_t)(void);
+extern ctor_t _start_ctors, _stop_ctors;
+
+static void
+tcov_init(void *foo __unused)
+{
+ ctor_t *p, q;
+
+ for (p = &_start_ctors; p < &_stop_ctors; p++) {
+ q = *p;
+ q();
+ }
+}
+
+SYSINIT(tcov_init, SI_SUB_KPROF, SI_ORDER_SECOND, tcov_init, NULL);
+
+/*
+ * GCC contains magic to recognize calls to, for instance, execve() and
+ * inserts calls to this function to preserve the profile counters.
+ * XXX: Put zinging punchline here.
+ */
+void __bb_fork_func(void);
+void
+__bb_fork_func(void)
+{
+}
+
+#endif
+
diff --git a/sys/kern/subr_rman.c b/sys/kern/subr_rman.c
new file mode 100644
index 0000000..e43dfcf
--- /dev/null
+++ b/sys/kern/subr_rman.c
@@ -0,0 +1,1160 @@
+/*-
+ * Copyright 1998 Massachusetts Institute of Technology
+ *
+ * Permission to use, copy, modify, and distribute this software and
+ * its documentation for any purpose and without fee is hereby
+ * granted, provided that both the above copyright notice and this
+ * permission notice appear in all copies, that both the above
+ * copyright notice and this permission notice appear in all
+ * supporting documentation, and that the name of M.I.T. not be used
+ * in advertising or publicity pertaining to distribution of the
+ * software without specific, written prior permission. M.I.T. makes
+ * no representations about the suitability of this software for any
+ * purpose. It is provided "as is" without express or implied
+ * warranty.
+ *
+ * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS
+ * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
+ * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The kernel resource manager. This code is responsible for keeping track
+ * of hardware resources which are apportioned out to various drivers.
+ * It does not actually assign those resources, and it is not expected
+ * that end-device drivers will call into this code directly. Rather,
+ * the code which implements the buses that those devices are attached to,
+ * and the code which manages CPU resources, will call this code, and the
+ * end-device drivers will make upcalls to that code to actually perform
+ * the allocation.
+ *
+ * There are two sorts of resources managed by this code. The first is
+ * the more familiar array (RMAN_ARRAY) type; resources in this class
+ * consist of a sequence of individually-allocatable objects which have
+ * been numbered in some well-defined order. Most of the resources
+ * are of this type, as it is the most familiar. The second type is
+ * called a gauge (RMAN_GAUGE), and models fungible resources (i.e.,
+ * resources in which each instance is indistinguishable from every
+ * other instance). The principal anticipated application of gauges
+ * is in the context of power consumption, where a bus may have a specific
+ * power budget which all attached devices share. RMAN_GAUGE is not
+ * implemented yet.
+ *
+ * For array resources, we make one simplifying assumption: two clients
+ * sharing the same resource must use the same range of indices. That
+ * is to say, sharing of overlapping-but-not-identical regions is not
+ * permitted.
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/bus.h> /* XXX debugging */
+#include <machine/bus.h>
+#include <sys/rman.h>
+#include <sys/sysctl.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/*
+ * We use a linked list rather than a bitmap because we need to be able to
+ * represent potentially huge objects (like all of a processor's physical
+ * address space). That is also why the indices are defined to have type
+ * `unsigned long' -- that being the largest integral type in ISO C (1990).
+ * The 1999 version of C allows `long long'; we may need to switch to that
+ * at some point in the future, particularly if we want to support 36-bit
+ * addresses on IA32 hardware.
+ */
+struct resource_i {
+ struct resource r_r;
+ TAILQ_ENTRY(resource_i) r_link;
+ LIST_ENTRY(resource_i) r_sharelink;
+ LIST_HEAD(, resource_i) *r_sharehead;
+ u_long r_start; /* index of the first entry in this resource */
+ u_long r_end; /* index of the last entry (inclusive) */
+ u_int r_flags;
+ void *r_virtual; /* virtual address of this resource */
+ struct device *r_dev; /* device which has allocated this resource */
+ struct rman *r_rm; /* resource manager from whence this came */
+ int r_rid; /* optional rid for this resource. */
+};
+
+static int rman_debug = 0;
+TUNABLE_INT("debug.rman_debug", &rman_debug);
+SYSCTL_INT(_debug, OID_AUTO, rman_debug, CTLFLAG_RW,
+ &rman_debug, 0, "rman debug");
+
+#define DPRINTF(params) if (rman_debug) printf params
+
+static MALLOC_DEFINE(M_RMAN, "rman", "Resource manager");
+
+struct rman_head rman_head;
+static struct mtx rman_mtx; /* mutex to protect rman_head */
+static int int_rman_activate_resource(struct rman *rm, struct resource_i *r,
+ struct resource_i **whohas);
+static int int_rman_deactivate_resource(struct resource_i *r);
+static int int_rman_release_resource(struct rman *rm, struct resource_i *r);
+
+static __inline struct resource_i *
+int_alloc_resource(int malloc_flag)
+{
+ struct resource_i *r;
+
+ r = malloc(sizeof *r, M_RMAN, malloc_flag | M_ZERO);
+ if (r != NULL) {
+ r->r_r.__r_i = r;
+ }
+ return (r);
+}
+
+int
+rman_init(struct rman *rm)
+{
+ static int once = 0;
+
+ if (once == 0) {
+ once = 1;
+ TAILQ_INIT(&rman_head);
+ mtx_init(&rman_mtx, "rman head", NULL, MTX_DEF);
+ }
+
+ if (rm->rm_start == 0 && rm->rm_end == 0)
+ rm->rm_end = ~0ul;
+ if (rm->rm_type == RMAN_UNINIT)
+ panic("rman_init");
+ if (rm->rm_type == RMAN_GAUGE)
+ panic("implement RMAN_GAUGE");
+
+ TAILQ_INIT(&rm->rm_list);
+ rm->rm_mtx = malloc(sizeof *rm->rm_mtx, M_RMAN, M_NOWAIT | M_ZERO);
+ if (rm->rm_mtx == NULL)
+ return ENOMEM;
+ mtx_init(rm->rm_mtx, "rman", NULL, MTX_DEF);
+
+ mtx_lock(&rman_mtx);
+ TAILQ_INSERT_TAIL(&rman_head, rm, rm_link);
+ mtx_unlock(&rman_mtx);
+ return 0;
+}
+
+int
+rman_manage_region(struct rman *rm, u_long start, u_long end)
+{
+ struct resource_i *r, *s, *t;
+ int rv = 0;
+
+ DPRINTF(("rman_manage_region: <%s> request: start %#lx, end %#lx\n",
+ rm->rm_descr, start, end));
+ if (start < rm->rm_start || end > rm->rm_end)
+ return EINVAL;
+ r = int_alloc_resource(M_NOWAIT);
+ if (r == NULL)
+ return ENOMEM;
+ r->r_start = start;
+ r->r_end = end;
+ r->r_rm = rm;
+
+ mtx_lock(rm->rm_mtx);
+
+ /* Skip entries before us. */
+ TAILQ_FOREACH(s, &rm->rm_list, r_link) {
+ if (s->r_end == ULONG_MAX)
+ break;
+ if (s->r_end + 1 >= r->r_start)
+ break;
+ }
+
+ /* If we ran off the end of the list, insert at the tail. */
+ if (s == NULL) {
+ TAILQ_INSERT_TAIL(&rm->rm_list, r, r_link);
+ } else {
+ /* Check for any overlap with the current region. */
+ if (r->r_start <= s->r_end && r->r_end >= s->r_start) {
+ rv = EBUSY;
+ goto out;
+ }
+
+ /* Check for any overlap with the next region. */
+ t = TAILQ_NEXT(s, r_link);
+ if (t && r->r_start <= t->r_end && r->r_end >= t->r_start) {
+ rv = EBUSY;
+ goto out;
+ }
+
+ /*
+ * See if this region can be merged with the next region. If
+ * not, clear the pointer.
+ */
+ if (t && (r->r_end + 1 != t->r_start || t->r_flags != 0))
+ t = NULL;
+
+ /* See if we can merge with the current region. */
+ if (s->r_end + 1 == r->r_start && s->r_flags == 0) {
+ /* Can we merge all 3 regions? */
+ if (t != NULL) {
+ s->r_end = t->r_end;
+ TAILQ_REMOVE(&rm->rm_list, t, r_link);
+ free(r, M_RMAN);
+ free(t, M_RMAN);
+ } else {
+ s->r_end = r->r_end;
+ free(r, M_RMAN);
+ }
+ } else if (t != NULL) {
+ /* Can we merge with just the next region? */
+ t->r_start = r->r_start;
+ free(r, M_RMAN);
+ } else if (s->r_end < r->r_start) {
+ TAILQ_INSERT_AFTER(&rm->rm_list, s, r, r_link);
+ } else {
+ TAILQ_INSERT_BEFORE(s, r, r_link);
+ }
+ }
+out:
+ mtx_unlock(rm->rm_mtx);
+ return rv;
+}
+
+int
+rman_init_from_resource(struct rman *rm, struct resource *r)
+{
+ int rv;
+
+ if ((rv = rman_init(rm)) != 0)
+ return (rv);
+ return (rman_manage_region(rm, r->__r_i->r_start, r->__r_i->r_end));
+}
+
+int
+rman_fini(struct rman *rm)
+{
+ struct resource_i *r;
+
+ mtx_lock(rm->rm_mtx);
+ TAILQ_FOREACH(r, &rm->rm_list, r_link) {
+ if (r->r_flags & RF_ALLOCATED) {
+ mtx_unlock(rm->rm_mtx);
+ return EBUSY;
+ }
+ }
+
+ /*
+ * There really should only be one of these if we are in this
+ * state and the code is working properly, but it can't hurt.
+ */
+ while (!TAILQ_EMPTY(&rm->rm_list)) {
+ r = TAILQ_FIRST(&rm->rm_list);
+ TAILQ_REMOVE(&rm->rm_list, r, r_link);
+ free(r, M_RMAN);
+ }
+ mtx_unlock(rm->rm_mtx);
+ mtx_lock(&rman_mtx);
+ TAILQ_REMOVE(&rman_head, rm, rm_link);
+ mtx_unlock(&rman_mtx);
+ mtx_destroy(rm->rm_mtx);
+ free(rm->rm_mtx, M_RMAN);
+
+ return 0;
+}
+
+int
+rman_first_free_region(struct rman *rm, u_long *start, u_long *end)
+{
+ struct resource_i *r;
+
+ mtx_lock(rm->rm_mtx);
+ TAILQ_FOREACH(r, &rm->rm_list, r_link) {
+ if (!(r->r_flags & RF_ALLOCATED)) {
+ *start = r->r_start;
+ *end = r->r_end;
+ mtx_unlock(rm->rm_mtx);
+ return (0);
+ }
+ }
+ mtx_unlock(rm->rm_mtx);
+ return (ENOENT);
+}
+
+int
+rman_last_free_region(struct rman *rm, u_long *start, u_long *end)
+{
+ struct resource_i *r;
+
+ mtx_lock(rm->rm_mtx);
+ TAILQ_FOREACH_REVERSE(r, &rm->rm_list, resource_head, r_link) {
+ if (!(r->r_flags & RF_ALLOCATED)) {
+ *start = r->r_start;
+ *end = r->r_end;
+ mtx_unlock(rm->rm_mtx);
+ return (0);
+ }
+ }
+ mtx_unlock(rm->rm_mtx);
+ return (ENOENT);
+}
+
+/* Shrink or extend one or both ends of an allocated resource. */
+int
+rman_adjust_resource(struct resource *rr, u_long start, u_long end)
+{
+ struct resource_i *r, *s, *t, *new;
+ struct rman *rm;
+
+ /* Not supported for shared resources. */
+ r = rr->__r_i;
+ if (r->r_flags & (RF_TIMESHARE | RF_SHAREABLE))
+ return (EINVAL);
+
+ /*
+ * This does not support wholesale moving of a resource. At
+ * least part of the desired new range must overlap with the
+ * existing resource.
+ */
+ if (end < r->r_start || r->r_end < start)
+ return (EINVAL);
+
+ /*
+ * Find the two resource regions immediately adjacent to the
+ * allocated resource.
+ */
+ rm = r->r_rm;
+ mtx_lock(rm->rm_mtx);
+#ifdef INVARIANTS
+ TAILQ_FOREACH(s, &rm->rm_list, r_link) {
+ if (s == r)
+ break;
+ }
+ if (s == NULL)
+ panic("resource not in list");
+#endif
+ s = TAILQ_PREV(r, resource_head, r_link);
+ t = TAILQ_NEXT(r, r_link);
+ KASSERT(s == NULL || s->r_end + 1 == r->r_start,
+ ("prev resource mismatch"));
+ KASSERT(t == NULL || r->r_end + 1 == t->r_start,
+ ("next resource mismatch"));
+
+ /*
+ * See if the changes are permitted. Shrinking is always allowed,
+ * but growing requires sufficient room in the adjacent region.
+ */
+ if (start < r->r_start && (s == NULL || (s->r_flags & RF_ALLOCATED) ||
+ s->r_start > start)) {
+ mtx_unlock(rm->rm_mtx);
+ return (EBUSY);
+ }
+ if (end > r->r_end && (t == NULL || (t->r_flags & RF_ALLOCATED) ||
+ t->r_end < end)) {
+ mtx_unlock(rm->rm_mtx);
+ return (EBUSY);
+ }
+
+ /*
+ * While holding the lock, grow either end of the resource as
+ * needed and shrink either end if the shrinking does not require
+ * allocating a new resource. We can safely drop the lock and then
+ * insert a new range to handle the shrinking case afterwards.
+ */
+ if (start < r->r_start ||
+ (start > r->r_start && s != NULL && !(s->r_flags & RF_ALLOCATED))) {
+ KASSERT(s->r_flags == 0, ("prev is busy"));
+ r->r_start = start;
+ if (s->r_start == start) {
+ TAILQ_REMOVE(&rm->rm_list, s, r_link);
+ free(s, M_RMAN);
+ } else
+ s->r_end = start - 1;
+ }
+ if (end > r->r_end ||
+ (end < r->r_end && t != NULL && !(t->r_flags & RF_ALLOCATED))) {
+ KASSERT(t->r_flags == 0, ("next is busy"));
+ r->r_end = end;
+ if (t->r_end == end) {
+ TAILQ_REMOVE(&rm->rm_list, t, r_link);
+ free(t, M_RMAN);
+ } else
+ t->r_start = end + 1;
+ }
+ mtx_unlock(rm->rm_mtx);
+
+ /*
+ * Handle the shrinking cases that require allocating a new
+ * resource to hold the newly-free region. We have to recheck
+ * if we still need this new region after acquiring the lock.
+ */
+ if (start > r->r_start) {
+ new = int_alloc_resource(M_WAITOK);
+ new->r_start = r->r_start;
+ new->r_end = start - 1;
+ new->r_rm = rm;
+ mtx_lock(rm->rm_mtx);
+ r->r_start = start;
+ s = TAILQ_PREV(r, resource_head, r_link);
+ if (s != NULL && !(s->r_flags & RF_ALLOCATED)) {
+ s->r_end = start - 1;
+ free(new, M_RMAN);
+ } else
+ TAILQ_INSERT_BEFORE(r, new, r_link);
+ mtx_unlock(rm->rm_mtx);
+ }
+ if (end < r->r_end) {
+ new = int_alloc_resource(M_WAITOK);
+ new->r_start = end + 1;
+ new->r_end = r->r_end;
+ new->r_rm = rm;
+ mtx_lock(rm->rm_mtx);
+ r->r_end = end;
+ t = TAILQ_NEXT(r, r_link);
+ if (t != NULL && !(t->r_flags & RF_ALLOCATED)) {
+ t->r_start = end + 1;
+ free(new, M_RMAN);
+ } else
+ TAILQ_INSERT_AFTER(&rm->rm_list, r, new, r_link);
+ mtx_unlock(rm->rm_mtx);
+ }
+ return (0);
+}
+
+struct resource *
+rman_reserve_resource_bound(struct rman *rm, u_long start, u_long end,
+ u_long count, u_long bound, u_int flags,
+ struct device *dev)
+{
+ u_int want_activate;
+ struct resource_i *r, *s, *rv;
+ u_long rstart, rend, amask, bmask;
+
+ rv = NULL;
+
+ DPRINTF(("rman_reserve_resource_bound: <%s> request: [%#lx, %#lx], "
+ "length %#lx, flags %u, device %s\n", rm->rm_descr, start, end,
+ count, flags,
+ dev == NULL ? "<null>" : device_get_nameunit(dev)));
+ want_activate = (flags & RF_ACTIVE);
+ flags &= ~RF_ACTIVE;
+
+ mtx_lock(rm->rm_mtx);
+
+ for (r = TAILQ_FIRST(&rm->rm_list);
+ r && r->r_end < start;
+ r = TAILQ_NEXT(r, r_link))
+ ;
+
+ if (r == NULL) {
+ DPRINTF(("could not find a region\n"));
+ goto out;
+ }
+
+ amask = (1ul << RF_ALIGNMENT(flags)) - 1;
+ /* If bound is 0, bmask will also be 0 */
+ bmask = ~(bound - 1);
+ /*
+ * First try to find an acceptable totally-unshared region.
+ */
+ for (s = r; s; s = TAILQ_NEXT(s, r_link)) {
+ DPRINTF(("considering [%#lx, %#lx]\n", s->r_start, s->r_end));
+ if (s->r_start + count - 1 > end) {
+			DPRINTF(("s->r_start (%#lx) + count - 1 > end (%#lx)\n",
+ s->r_start, end));
+ break;
+ }
+ if (s->r_flags & RF_ALLOCATED) {
+ DPRINTF(("region is allocated\n"));
+ continue;
+ }
+ rstart = ulmax(s->r_start, start);
+ /*
+ * Try to find a region by adjusting to boundary and alignment
+ * until both conditions are satisfied. This is not an optimal
+ * algorithm, but in most cases it isn't really bad, either.
+ */
+ do {
+ rstart = (rstart + amask) & ~amask;
+ if (((rstart ^ (rstart + count - 1)) & bmask) != 0)
+ rstart += bound - (rstart & ~bmask);
+ } while ((rstart & amask) != 0 && rstart < end &&
+ rstart < s->r_end);
+ rend = ulmin(s->r_end, ulmax(rstart + count - 1, end));
+ if (rstart > rend) {
+ DPRINTF(("adjusted start exceeds end\n"));
+ continue;
+ }
+ DPRINTF(("truncated region: [%#lx, %#lx]; size %#lx (requested %#lx)\n",
+ rstart, rend, (rend - rstart + 1), count));
+
+ if ((rend - rstart + 1) >= count) {
+ DPRINTF(("candidate region: [%#lx, %#lx], size %#lx\n",
+ rstart, rend, (rend - rstart + 1)));
+ if ((s->r_end - s->r_start + 1) == count) {
+ DPRINTF(("candidate region is entire chunk\n"));
+ rv = s;
+ rv->r_flags |= RF_ALLOCATED | flags;
+ rv->r_dev = dev;
+ goto out;
+ }
+
+ /*
+ * If s->r_start < rstart and
+ * s->r_end > rstart + count - 1, then
+ * we need to split the region into three pieces
+ * (the middle one will get returned to the user).
+ * Otherwise, we are allocating at either the
+ * beginning or the end of s, so we only need to
+ * split it in two. The first case requires
+ * two new allocations; the second requires but one.
+ */
+ rv = int_alloc_resource(M_NOWAIT);
+ if (rv == NULL)
+ goto out;
+ rv->r_start = rstart;
+ rv->r_end = rstart + count - 1;
+ rv->r_flags = flags | RF_ALLOCATED;
+ rv->r_dev = dev;
+ rv->r_rm = rm;
+
+ if (s->r_start < rv->r_start && s->r_end > rv->r_end) {
+ DPRINTF(("splitting region in three parts: "
+ "[%#lx, %#lx]; [%#lx, %#lx]; [%#lx, %#lx]\n",
+ s->r_start, rv->r_start - 1,
+ rv->r_start, rv->r_end,
+ rv->r_end + 1, s->r_end));
+ /*
+ * We are allocating in the middle.
+ */
+ r = int_alloc_resource(M_NOWAIT);
+ if (r == NULL) {
+ free(rv, M_RMAN);
+ rv = NULL;
+ goto out;
+ }
+ r->r_start = rv->r_end + 1;
+ r->r_end = s->r_end;
+ r->r_flags = s->r_flags;
+ r->r_rm = rm;
+ s->r_end = rv->r_start - 1;
+ TAILQ_INSERT_AFTER(&rm->rm_list, s, rv,
+ r_link);
+ TAILQ_INSERT_AFTER(&rm->rm_list, rv, r,
+ r_link);
+ } else if (s->r_start == rv->r_start) {
+ DPRINTF(("allocating from the beginning\n"));
+ /*
+ * We are allocating at the beginning.
+ */
+ s->r_start = rv->r_end + 1;
+ TAILQ_INSERT_BEFORE(s, rv, r_link);
+ } else {
+ DPRINTF(("allocating at the end\n"));
+ /*
+ * We are allocating at the end.
+ */
+ s->r_end = rv->r_start - 1;
+ TAILQ_INSERT_AFTER(&rm->rm_list, s, rv,
+ r_link);
+ }
+ goto out;
+ }
+ }
+
+ /*
+ * Now find an acceptable shared region, if the client's requirements
+ * allow sharing. By our implementation restriction, a candidate
+ * region must match exactly by both size and sharing type in order
+ * to be considered compatible with the client's request. (The
+ * former restriction could probably be lifted without too much
+ * additional work, but this does not seem warranted.)
+ */
+ DPRINTF(("no unshared regions found\n"));
+ if ((flags & (RF_SHAREABLE | RF_TIMESHARE)) == 0)
+ goto out;
+
+ for (s = r; s; s = TAILQ_NEXT(s, r_link)) {
+ if (s->r_start > end)
+ break;
+ if ((s->r_flags & flags) != flags)
+ continue;
+ rstart = ulmax(s->r_start, start);
+ rend = ulmin(s->r_end, ulmax(start + count - 1, end));
+ if (s->r_start >= start && s->r_end <= end
+ && (s->r_end - s->r_start + 1) == count &&
+ (s->r_start & amask) == 0 &&
+ ((s->r_start ^ s->r_end) & bmask) == 0) {
+ rv = int_alloc_resource(M_NOWAIT);
+ if (rv == NULL)
+ goto out;
+ rv->r_start = s->r_start;
+ rv->r_end = s->r_end;
+ rv->r_flags = s->r_flags &
+ (RF_ALLOCATED | RF_SHAREABLE | RF_TIMESHARE);
+ rv->r_dev = dev;
+ rv->r_rm = rm;
+ if (s->r_sharehead == NULL) {
+ s->r_sharehead = malloc(sizeof *s->r_sharehead,
+ M_RMAN, M_NOWAIT | M_ZERO);
+ if (s->r_sharehead == NULL) {
+ free(rv, M_RMAN);
+ rv = NULL;
+ goto out;
+ }
+ LIST_INIT(s->r_sharehead);
+ LIST_INSERT_HEAD(s->r_sharehead, s,
+ r_sharelink);
+ s->r_flags |= RF_FIRSTSHARE;
+ }
+ rv->r_sharehead = s->r_sharehead;
+ LIST_INSERT_HEAD(s->r_sharehead, rv, r_sharelink);
+ goto out;
+ }
+ }
+
+ /*
+ * We couldn't find anything.
+ */
+out:
+ /*
+ * If the user specified RF_ACTIVE in the initial flags,
+ * which is reflected in `want_activate', we attempt to atomically
+ * activate the resource. If this fails, we release the resource
+ * and indicate overall failure. (This behavior probably doesn't
+ * make sense for RF_TIMESHARE-type resources.)
+ */
+ if (rv && want_activate) {
+ struct resource_i *whohas;
+ if (int_rman_activate_resource(rm, rv, &whohas)) {
+ int_rman_release_resource(rm, rv);
+ rv = NULL;
+ }
+ }
+
+ mtx_unlock(rm->rm_mtx);
+ return (rv == NULL ? NULL : &rv->r_r);
+}
+
+struct resource *
+rman_reserve_resource(struct rman *rm, u_long start, u_long end, u_long count,
+ u_int flags, struct device *dev)
+{
+
+ return (rman_reserve_resource_bound(rm, start, end, count, 0, flags,
+ dev));
+}
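+
+/*
+ * Illustrative sketch only, kept disabled: a hypothetical bus driver ties
+ * the pieces above together by initializing an rman, handing it the index
+ * range it owns, and then carving allocations out of that range.  The "foo"
+ * names and the 0x100-0x1ff port range are made up for the example.
+ */
+#if 0
+static struct rman foo_port_rman;
+
+static int
+foo_port_rman_example(void)
+{
+	struct resource *res;
+
+	foo_port_rman.rm_type = RMAN_ARRAY;
+	foo_port_rman.rm_descr = "foo I/O ports";
+	if (rman_init(&foo_port_rman) != 0 ||
+	    rman_manage_region(&foo_port_rman, 0x100, 0x1ff) != 0)
+		return (ENXIO);
+	/* Reserve 16 ports anywhere in the managed range, active at once. */
+	res = rman_reserve_resource(&foo_port_rman, 0x100, 0x1ff, 0x10,
+	    RF_ACTIVE, NULL);
+	if (res == NULL)
+		return (EBUSY);
+	/* ... use the range, then hand it back ... */
+	rman_release_resource(res);
+	return (0);
+}
+#endif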
+
+static int
+int_rman_activate_resource(struct rman *rm, struct resource_i *r,
+ struct resource_i **whohas)
+{
+ struct resource_i *s;
+ int ok;
+
+ /*
+ * If we are not timesharing, then there is nothing much to do.
+ * If we already have the resource, then there is nothing at all to do.
+ * If we are not on a sharing list with anybody else, then there is
+ * little to do.
+ */
+ if ((r->r_flags & RF_TIMESHARE) == 0
+ || (r->r_flags & RF_ACTIVE) != 0
+ || r->r_sharehead == NULL) {
+ r->r_flags |= RF_ACTIVE;
+ return 0;
+ }
+
+ ok = 1;
+ for (s = LIST_FIRST(r->r_sharehead); s && ok;
+ s = LIST_NEXT(s, r_sharelink)) {
+ if ((s->r_flags & RF_ACTIVE) != 0) {
+ ok = 0;
+ *whohas = s;
+ }
+ }
+ if (ok) {
+ r->r_flags |= RF_ACTIVE;
+ return 0;
+ }
+ return EBUSY;
+}
+
+int
+rman_activate_resource(struct resource *re)
+{
+ int rv;
+ struct resource_i *r, *whohas;
+ struct rman *rm;
+
+ r = re->__r_i;
+ rm = r->r_rm;
+ mtx_lock(rm->rm_mtx);
+ rv = int_rman_activate_resource(rm, r, &whohas);
+ mtx_unlock(rm->rm_mtx);
+ return rv;
+}
+
+int
+rman_await_resource(struct resource *re, int pri, int timo)
+{
+ int rv;
+ struct resource_i *r, *whohas;
+ struct rman *rm;
+
+ r = re->__r_i;
+ rm = r->r_rm;
+ mtx_lock(rm->rm_mtx);
+ for (;;) {
+ rv = int_rman_activate_resource(rm, r, &whohas);
+ if (rv != EBUSY)
+ return (rv); /* returns with mutex held */
+
+ if (r->r_sharehead == NULL)
+ panic("rman_await_resource");
+ whohas->r_flags |= RF_WANTED;
+ rv = msleep(r->r_sharehead, rm->rm_mtx, pri, "rmwait", timo);
+ if (rv) {
+ mtx_unlock(rm->rm_mtx);
+ return (rv);
+ }
+ }
+}
+
+static int
+int_rman_deactivate_resource(struct resource_i *r)
+{
+
+ r->r_flags &= ~RF_ACTIVE;
+ if (r->r_flags & RF_WANTED) {
+ r->r_flags &= ~RF_WANTED;
+ wakeup(r->r_sharehead);
+ }
+ return 0;
+}
+
+int
+rman_deactivate_resource(struct resource *r)
+{
+ struct rman *rm;
+
+ rm = r->__r_i->r_rm;
+ mtx_lock(rm->rm_mtx);
+ int_rman_deactivate_resource(r->__r_i);
+ mtx_unlock(rm->rm_mtx);
+ return 0;
+}
+
+static int
+int_rman_release_resource(struct rman *rm, struct resource_i *r)
+{
+ struct resource_i *s, *t;
+
+ if (r->r_flags & RF_ACTIVE)
+ int_rman_deactivate_resource(r);
+
+ /*
+ * Check for a sharing list first. If there is one, then we don't
+ * have to think as hard.
+ */
+ if (r->r_sharehead) {
+ /*
+ * If a sharing list exists, then we know there are at
+ * least two sharers.
+ *
+		 * If we are on the main resource list, appoint someone else.
+ */
+ LIST_REMOVE(r, r_sharelink);
+ s = LIST_FIRST(r->r_sharehead);
+ if (r->r_flags & RF_FIRSTSHARE) {
+ s->r_flags |= RF_FIRSTSHARE;
+ TAILQ_INSERT_BEFORE(r, s, r_link);
+ TAILQ_REMOVE(&rm->rm_list, r, r_link);
+ }
+
+ /*
+ * Make sure that the sharing list goes away completely
+ * if the resource is no longer being shared at all.
+ */
+ if (LIST_NEXT(s, r_sharelink) == NULL) {
+ free(s->r_sharehead, M_RMAN);
+ s->r_sharehead = NULL;
+ s->r_flags &= ~RF_FIRSTSHARE;
+ }
+ goto out;
+ }
+
+ /*
+ * Look at the adjacent resources in the list and see if our
+ * segment can be merged with any of them. If either of the
+ * resources is allocated or is not exactly adjacent then they
+ * cannot be merged with our segment.
+ */
+ s = TAILQ_PREV(r, resource_head, r_link);
+ if (s != NULL && ((s->r_flags & RF_ALLOCATED) != 0 ||
+ s->r_end + 1 != r->r_start))
+ s = NULL;
+ t = TAILQ_NEXT(r, r_link);
+ if (t != NULL && ((t->r_flags & RF_ALLOCATED) != 0 ||
+ r->r_end + 1 != t->r_start))
+ t = NULL;
+
+ if (s != NULL && t != NULL) {
+ /*
+ * Merge all three segments.
+ */
+ s->r_end = t->r_end;
+ TAILQ_REMOVE(&rm->rm_list, r, r_link);
+ TAILQ_REMOVE(&rm->rm_list, t, r_link);
+ free(t, M_RMAN);
+ } else if (s != NULL) {
+ /*
+ * Merge previous segment with ours.
+ */
+ s->r_end = r->r_end;
+ TAILQ_REMOVE(&rm->rm_list, r, r_link);
+ } else if (t != NULL) {
+ /*
+ * Merge next segment with ours.
+ */
+ t->r_start = r->r_start;
+ TAILQ_REMOVE(&rm->rm_list, r, r_link);
+ } else {
+ /*
+ * At this point, we know there is nothing we
+ * can potentially merge with, because on each
+ * side, there is either nothing there or what is
+ * there is still allocated. In that case, we don't
+ * want to remove r from the list; we simply want to
+ * change it to an unallocated region and return
+ * without freeing anything.
+ */
+ r->r_flags &= ~RF_ALLOCATED;
+ r->r_dev = NULL;
+ return 0;
+ }
+
+out:
+ free(r, M_RMAN);
+ return 0;
+}
+
+int
+rman_release_resource(struct resource *re)
+{
+ int rv;
+ struct resource_i *r;
+ struct rman *rm;
+
+ r = re->__r_i;
+ rm = r->r_rm;
+ mtx_lock(rm->rm_mtx);
+ rv = int_rman_release_resource(rm, r);
+ mtx_unlock(rm->rm_mtx);
+ return (rv);
+}
+
+uint32_t
+rman_make_alignment_flags(uint32_t size)
+{
+ int i;
+
+ /*
+	 * Find the highest bit set, and add one if more than one bit is
+	 * set.  We're effectively computing ceil(log2(size)) here.
+ */
+ for (i = 31; i > 0; i--)
+ if ((1 << i) & size)
+ break;
+ if (~(1 << i) & size)
+ i++;
+
+ return(RF_ALIGNMENT_LOG2(i));
+}
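+
+/*
+ * Worked example (illustrative only): a size of 0x1000 has a single bit set
+ * at position 12, so the result is RF_ALIGNMENT_LOG2(12); a size of 0x1800
+ * sets bits 11 and 12, so the result is bumped to RF_ALIGNMENT_LOG2(13),
+ * i.e. the next power of two large enough to contain the request.
+ */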
+
+void
+rman_set_start(struct resource *r, u_long start)
+{
+ r->__r_i->r_start = start;
+}
+
+u_long
+rman_get_start(struct resource *r)
+{
+ return (r->__r_i->r_start);
+}
+
+void
+rman_set_end(struct resource *r, u_long end)
+{
+ r->__r_i->r_end = end;
+}
+
+u_long
+rman_get_end(struct resource *r)
+{
+ return (r->__r_i->r_end);
+}
+
+u_long
+rman_get_size(struct resource *r)
+{
+ return (r->__r_i->r_end - r->__r_i->r_start + 1);
+}
+
+u_int
+rman_get_flags(struct resource *r)
+{
+ return (r->__r_i->r_flags);
+}
+
+void
+rman_set_virtual(struct resource *r, void *v)
+{
+ r->__r_i->r_virtual = v;
+}
+
+void *
+rman_get_virtual(struct resource *r)
+{
+ return (r->__r_i->r_virtual);
+}
+
+void
+rman_set_bustag(struct resource *r, bus_space_tag_t t)
+{
+ r->r_bustag = t;
+}
+
+bus_space_tag_t
+rman_get_bustag(struct resource *r)
+{
+ return (r->r_bustag);
+}
+
+void
+rman_set_bushandle(struct resource *r, bus_space_handle_t h)
+{
+ r->r_bushandle = h;
+}
+
+bus_space_handle_t
+rman_get_bushandle(struct resource *r)
+{
+ return (r->r_bushandle);
+}
+
+void
+rman_set_rid(struct resource *r, int rid)
+{
+ r->__r_i->r_rid = rid;
+}
+
+int
+rman_get_rid(struct resource *r)
+{
+ return (r->__r_i->r_rid);
+}
+
+void
+rman_set_device(struct resource *r, struct device *dev)
+{
+ r->__r_i->r_dev = dev;
+}
+
+struct device *
+rman_get_device(struct resource *r)
+{
+ return (r->__r_i->r_dev);
+}
+
+int
+rman_is_region_manager(struct resource *r, struct rman *rm)
+{
+
+ return (r->__r_i->r_rm == rm);
+}
+
+/*
+ * Sysctl interface for scanning the resource lists.
+ *
+ * We take two input parameters; the index into the list of resource
+ * managers, and the resource offset into the list.
+ */
+static int
+sysctl_rman(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1;
+ u_int namelen = arg2;
+ int rman_idx, res_idx;
+ struct rman *rm;
+ struct resource_i *res;
+ struct resource_i *sres;
+ struct u_rman urm;
+ struct u_resource ures;
+ int error;
+
+ if (namelen != 3)
+ return (EINVAL);
+
+ if (bus_data_generation_check(name[0]))
+ return (EINVAL);
+ rman_idx = name[1];
+ res_idx = name[2];
+
+ /*
+ * Find the indexed resource manager
+ */
+ mtx_lock(&rman_mtx);
+ TAILQ_FOREACH(rm, &rman_head, rm_link) {
+ if (rman_idx-- == 0)
+ break;
+ }
+ mtx_unlock(&rman_mtx);
+ if (rm == NULL)
+ return (ENOENT);
+
+ /*
+ * If the resource index is -1, we want details on the
+ * resource manager.
+ */
+ if (res_idx == -1) {
+ bzero(&urm, sizeof(urm));
+ urm.rm_handle = (uintptr_t)rm;
+ if (rm->rm_descr != NULL)
+ strlcpy(urm.rm_descr, rm->rm_descr, RM_TEXTLEN);
+ urm.rm_start = rm->rm_start;
+ urm.rm_size = rm->rm_end - rm->rm_start + 1;
+ urm.rm_type = rm->rm_type;
+
+ error = SYSCTL_OUT(req, &urm, sizeof(urm));
+ return (error);
+ }
+
+ /*
+ * Find the indexed resource and return it.
+ */
+ mtx_lock(rm->rm_mtx);
+ TAILQ_FOREACH(res, &rm->rm_list, r_link) {
+ if (res->r_sharehead != NULL) {
+ LIST_FOREACH(sres, res->r_sharehead, r_sharelink)
+ if (res_idx-- == 0) {
+ res = sres;
+ goto found;
+ }
+ }
+ else if (res_idx-- == 0)
+ goto found;
+ }
+ mtx_unlock(rm->rm_mtx);
+ return (ENOENT);
+
+found:
+ bzero(&ures, sizeof(ures));
+ ures.r_handle = (uintptr_t)res;
+ ures.r_parent = (uintptr_t)res->r_rm;
+ ures.r_device = (uintptr_t)res->r_dev;
+ if (res->r_dev != NULL) {
+ if (device_get_name(res->r_dev) != NULL) {
+ snprintf(ures.r_devname, RM_TEXTLEN,
+ "%s%d",
+ device_get_name(res->r_dev),
+ device_get_unit(res->r_dev));
+ } else {
+ strlcpy(ures.r_devname, "nomatch",
+ RM_TEXTLEN);
+ }
+ } else {
+ ures.r_devname[0] = '\0';
+ }
+ ures.r_start = res->r_start;
+ ures.r_size = res->r_end - res->r_start + 1;
+ ures.r_flags = res->r_flags;
+
+ mtx_unlock(rm->rm_mtx);
+ error = SYSCTL_OUT(req, &ures, sizeof(ures));
+ return (error);
+}
+
+static SYSCTL_NODE(_hw_bus, OID_AUTO, rman, CTLFLAG_RD, sysctl_rman,
+ "kernel resource manager");
+
+#ifdef DDB
+static void
+dump_rman_header(struct rman *rm)
+{
+
+ if (db_pager_quit)
+ return;
+ db_printf("rman %p: %s (0x%lx-0x%lx full range)\n",
+ rm, rm->rm_descr, rm->rm_start, rm->rm_end);
+}
+
+static void
+dump_rman(struct rman *rm)
+{
+ struct resource_i *r;
+ const char *devname;
+
+ if (db_pager_quit)
+ return;
+ TAILQ_FOREACH(r, &rm->rm_list, r_link) {
+ if (r->r_dev != NULL) {
+ devname = device_get_nameunit(r->r_dev);
+ if (devname == NULL)
+ devname = "nomatch";
+ } else
+ devname = NULL;
+ db_printf(" 0x%lx-0x%lx ", r->r_start, r->r_end);
+ if (devname != NULL)
+ db_printf("(%s)\n", devname);
+ else
+ db_printf("----\n");
+ if (db_pager_quit)
+ return;
+ }
+}
+
+DB_SHOW_COMMAND(rman, db_show_rman)
+{
+
+ if (have_addr) {
+ dump_rman_header((struct rman *)addr);
+ dump_rman((struct rman *)addr);
+ }
+}
+
+DB_SHOW_COMMAND(rmans, db_show_rmans)
+{
+ struct rman *rm;
+
+ TAILQ_FOREACH(rm, &rman_head, rm_link) {
+ dump_rman_header(rm);
+ }
+}
+
+DB_SHOW_ALL_COMMAND(rman, db_show_all_rman)
+{
+ struct rman *rm;
+
+ TAILQ_FOREACH(rm, &rman_head, rm_link) {
+ dump_rman_header(rm);
+ dump_rman(rm);
+ }
+}
+DB_SHOW_ALIAS(allrman, db_show_all_rman);
+#endif
diff --git a/sys/kern/subr_rtc.c b/sys/kern/subr_rtc.c
new file mode 100644
index 0000000..ed2befc
--- /dev/null
+++ b/sys/kern/subr_rtc.c
@@ -0,0 +1,178 @@
+/*-
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1982, 1990, 1993
+ * The Regents of the University of California.
+ * Copyright (c) 2011 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Portions of this software were developed by Julien Ridoux at the University
+ * of Melbourne under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Utah $Hdr: clock.c 1.18 91/01/21$
+ * from: @(#)clock.c 8.2 (Berkeley) 1/12/94
+ * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp
+ * and
+ * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04
+ */
+
+/*
+ * Helpers for time-of-day clocks. This is useful for architectures that need
+ * to support multiple models of such clocks, and generally serves to make the
+ * code more machine-independent.
+ * If the clock in question can also be used as a time counter, the driver
+ * needs to set that up itself.
+ * This code is not yet used by all architectures.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ffclock.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/bus.h>
+#include <sys/clock.h>
+#include <sys/sysctl.h>
+#ifdef FFCLOCK
+#include <sys/timeffc.h>
+#endif
+#include <sys/timetc.h>
+
+#include "clock_if.h"
+
+static device_t clock_dev = NULL;
+static long clock_res;
+static struct timespec clock_adj;
+
+/* XXX: this should live under "kern." now; it's no longer machdep. */
+static int disable_rtc_set;
+SYSCTL_INT(_machdep, OID_AUTO, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set,
+ 0, "Disallow adjusting time-of-day clock");
+
+void
+clock_register(device_t dev, long res) /* res has units of microseconds */
+{
+
+ if (clock_dev != NULL) {
+ if (clock_res > res) {
+ if (bootverbose)
+ device_printf(dev, "not installed as "
+ "time-of-day clock: clock %s has higher "
+ "resolution\n", device_get_name(clock_dev));
+ return;
+ }
+ if (bootverbose)
+ device_printf(clock_dev, "removed as "
+ "time-of-day clock: clock %s has higher "
+ "resolution\n", device_get_name(dev));
+ }
+ clock_dev = dev;
+ clock_res = res;
+ clock_adj.tv_sec = res / 2 / 1000000;
+ clock_adj.tv_nsec = res / 2 % 1000000 * 1000;
+ if (bootverbose)
+ device_printf(dev, "registered as a time-of-day clock "
+ "(resolution %ldus, adjustment %jd.%09jds)\n", res,
+ (intmax_t)clock_adj.tv_sec, (intmax_t)clock_adj.tv_nsec);
+}
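+
+/*
+ * Illustrative sketch only, kept disabled (no such driver exists here): a
+ * hypothetical RTC driver with one-second resolution would register itself
+ * from its attach routine roughly as follows.
+ */
+#if 0
+static int
+foo_rtc_attach(device_t dev)
+{
+	/* ... program the hardware, hook up the clock_if methods ... */
+	clock_register(dev, 1000000);	/* resolution in microseconds */
+	return (0);
+}
+#endif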
+
+/*
+ * inittodr and resettodr derived from the i386 versions written
+ * by Christoph Robitschko <chmr@edvz.tu-graz.ac.at>, reintroduced and
+ * updated by Chris Stenton <chris@gnome.co.uk> 8/10/94
+ */
+
+/*
+ * Initialize the system time from the time-of-day clock, falling back to the
+ * given time base (e.g. a timestamp obtained from a filesystem).
+ */
+void
+inittodr(time_t base)
+{
+ struct timespec ts;
+ int error;
+
+ if (clock_dev == NULL) {
+ printf("warning: no time-of-day clock registered, system time "
+ "will not be set accurately\n");
+ goto wrong_time;
+ }
+ /* XXX: We should poll all registered RTCs in case of failure */
+ error = CLOCK_GETTIME(clock_dev, &ts);
+ if (error != 0 && error != EINVAL) {
+ printf("warning: clock_gettime failed (%d), the system time "
+ "will not be set accurately\n", error);
+ goto wrong_time;
+ }
+ if (error == EINVAL || ts.tv_sec < 0) {
+ printf("Invalid time in real time clock.\n"
+ "Check and reset the date immediately!\n");
+ goto wrong_time;
+ }
+
+ ts.tv_sec += utc_offset();
+ timespecadd(&ts, &clock_adj);
+ tc_setclock(&ts);
+#ifdef FFCLOCK
+ ffclock_reset_clock(&ts);
+#endif
+ return;
+
+wrong_time:
+ if (base > 0) {
+ ts.tv_sec = base;
+ ts.tv_nsec = 0;
+ tc_setclock(&ts);
+ }
+}
+
+/*
+ * Write system time back to RTC
+ */
+void
+resettodr(void)
+{
+ struct timespec ts;
+ int error;
+
+ if (disable_rtc_set || clock_dev == NULL)
+ return;
+
+ getnanotime(&ts);
+ timespecadd(&ts, &clock_adj);
+ ts.tv_sec -= utc_offset();
+ /* XXX: We should really set all registered RTCs */
+ if ((error = CLOCK_SETTIME(clock_dev, &ts)) != 0)
+ printf("warning: clock_settime failed (%d), time-of-day clock "
+ "not adjusted to system time\n", error);
+}
diff --git a/sys/kern/subr_sbuf.c b/sys/kern/subr_sbuf.c
new file mode 100644
index 0000000..68a7b15
--- /dev/null
+++ b/sys/kern/subr_sbuf.c
@@ -0,0 +1,831 @@
+/*-
+ * Copyright (c) 2000-2008 Poul-Henning Kamp
+ * Copyright (c) 2000-2008 Dag-Erling Coïdan Smørgrav
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+
+#ifdef _KERNEL
+#include <sys/ctype.h>
+#include <sys/errno.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/uio.h>
+#include <machine/stdarg.h>
+#else /* _KERNEL */
+#include <ctype.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#endif /* _KERNEL */
+
+#include <sys/sbuf.h>
+
+#ifdef _KERNEL
+static MALLOC_DEFINE(M_SBUF, "sbuf", "string buffers");
+#define SBMALLOC(size) malloc(size, M_SBUF, M_WAITOK)
+#define SBFREE(buf) free(buf, M_SBUF)
+#else /* _KERNEL */
+#define KASSERT(e, m)
+#define SBMALLOC(size) malloc(size)
+#define SBFREE(buf) free(buf)
+#endif /* _KERNEL */
+
+/*
+ * Predicates
+ */
+#define SBUF_ISDYNAMIC(s) ((s)->s_flags & SBUF_DYNAMIC)
+#define SBUF_ISDYNSTRUCT(s) ((s)->s_flags & SBUF_DYNSTRUCT)
+#define SBUF_ISFINISHED(s) ((s)->s_flags & SBUF_FINISHED)
+#define SBUF_HASROOM(s) ((s)->s_len < (s)->s_size - 1)
+#define SBUF_FREESPACE(s) ((s)->s_size - ((s)->s_len + 1))
+#define SBUF_CANEXTEND(s) ((s)->s_flags & SBUF_AUTOEXTEND)
+#define SBUF_ISSECTION(s) ((s)->s_flags & SBUF_INSECTION)
+
+/*
+ * Set / clear flags
+ */
+#define SBUF_SETFLAG(s, f) do { (s)->s_flags |= (f); } while (0)
+#define SBUF_CLEARFLAG(s, f) do { (s)->s_flags &= ~(f); } while (0)
+
+#define SBUF_MINEXTENDSIZE 16 /* Should be power of 2. */
+
+#ifdef PAGE_SIZE
+#define SBUF_MAXEXTENDSIZE PAGE_SIZE
+#define SBUF_MAXEXTENDINCR PAGE_SIZE
+#else
+#define SBUF_MAXEXTENDSIZE 4096
+#define SBUF_MAXEXTENDINCR 4096
+#endif
+
+/*
+ * Debugging support
+ */
+#if defined(_KERNEL) && defined(INVARIANTS)
+
+static void
+_assert_sbuf_integrity(const char *fun, struct sbuf *s)
+{
+
+ KASSERT(s != NULL,
+ ("%s called with a NULL sbuf pointer", fun));
+ KASSERT(s->s_buf != NULL,
+ ("%s called with uninitialized or corrupt sbuf", fun));
+ KASSERT(s->s_len < s->s_size,
+ ("wrote past end of sbuf (%jd >= %jd)",
+ (intmax_t)s->s_len, (intmax_t)s->s_size));
+}
+
+static void
+_assert_sbuf_state(const char *fun, struct sbuf *s, int state)
+{
+
+ KASSERT((s->s_flags & SBUF_FINISHED) == state,
+ ("%s called with %sfinished or corrupt sbuf", fun,
+ (state ? "un" : "")));
+}
+
+#define assert_sbuf_integrity(s) _assert_sbuf_integrity(__func__, (s))
+#define assert_sbuf_state(s, i) _assert_sbuf_state(__func__, (s), (i))
+
+#else /* _KERNEL && INVARIANTS */
+
+#define assert_sbuf_integrity(s) do { } while (0)
+#define assert_sbuf_state(s, i) do { } while (0)
+
+#endif /* _KERNEL && INVARIANTS */
+
+#ifdef CTASSERT
+CTASSERT(powerof2(SBUF_MAXEXTENDSIZE));
+CTASSERT(powerof2(SBUF_MAXEXTENDINCR));
+#endif
+
+static int
+sbuf_extendsize(int size)
+{
+ int newsize;
+
+ if (size < (int)SBUF_MAXEXTENDSIZE) {
+ newsize = SBUF_MINEXTENDSIZE;
+ while (newsize < size)
+ newsize *= 2;
+ } else {
+ newsize = roundup2(size, SBUF_MAXEXTENDINCR);
+ }
+ KASSERT(newsize >= size, ("%s: %d < %d\n", __func__, newsize, size));
+ return (newsize);
+}
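+
+/*
+ * Worked example (illustrative only, assuming PAGE_SIZE is 4096): a request
+ * for 20 bytes doubles up from SBUF_MINEXTENDSIZE to 32, while a request for
+ * 5000 bytes is rounded up to the next multiple of SBUF_MAXEXTENDINCR, 8192.
+ */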
+
+/*
+ * Extend an sbuf.
+ */
+static int
+sbuf_extend(struct sbuf *s, int addlen)
+{
+ char *newbuf;
+ int newsize;
+
+ if (!SBUF_CANEXTEND(s))
+ return (-1);
+ newsize = sbuf_extendsize(s->s_size + addlen);
+ newbuf = SBMALLOC(newsize);
+ if (newbuf == NULL)
+ return (-1);
+ memcpy(newbuf, s->s_buf, s->s_size);
+ if (SBUF_ISDYNAMIC(s))
+ SBFREE(s->s_buf);
+ else
+ SBUF_SETFLAG(s, SBUF_DYNAMIC);
+ s->s_buf = newbuf;
+ s->s_size = newsize;
+ return (0);
+}
+
+/*
+ * Initialize the internals of an sbuf.
+ * If buf is non-NULL, it points to a static or already-allocated string
+ * big enough to hold at least length characters.
+ */
+static struct sbuf *
+sbuf_newbuf(struct sbuf *s, char *buf, int length, int flags)
+{
+
+ memset(s, 0, sizeof(*s));
+ s->s_flags = flags;
+ s->s_size = length;
+ s->s_buf = buf;
+
+ if ((s->s_flags & SBUF_AUTOEXTEND) == 0) {
+ KASSERT(s->s_size >= 0,
+ ("attempt to create a too small sbuf"));
+ }
+
+ if (s->s_buf != NULL)
+ return (s);
+
+ if ((flags & SBUF_AUTOEXTEND) != 0)
+ s->s_size = sbuf_extendsize(s->s_size);
+
+ s->s_buf = SBMALLOC(s->s_size);
+ if (s->s_buf == NULL)
+ return (NULL);
+ SBUF_SETFLAG(s, SBUF_DYNAMIC);
+ return (s);
+}
+
+/*
+ * Initialize an sbuf.
+ * If buf is non-NULL, it points to a static or already-allocated string
+ * big enough to hold at least length characters.
+ */
+struct sbuf *
+sbuf_new(struct sbuf *s, char *buf, int length, int flags)
+{
+
+ KASSERT(length >= 0,
+ ("attempt to create an sbuf of negative length (%d)", length));
+ KASSERT((flags & ~SBUF_USRFLAGMSK) == 0,
+ ("%s called with invalid flags", __func__));
+
+ flags &= SBUF_USRFLAGMSK;
+ if (s != NULL)
+ return (sbuf_newbuf(s, buf, length, flags));
+
+ s = SBMALLOC(sizeof(*s));
+ if (s == NULL)
+ return (NULL);
+ if (sbuf_newbuf(s, buf, length, flags) == NULL) {
+ SBFREE(s);
+ return (NULL);
+ }
+ SBUF_SETFLAG(s, SBUF_DYNSTRUCT);
+ return (s);
+}
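+
+/*
+ * Illustrative sketch only, kept disabled: a typical consumer builds a
+ * string into an automatically extending sbuf, finishes it, and then reads
+ * the result back with sbuf_data().
+ */
+#if 0
+static void
+sbuf_usage_example(void)
+{
+	struct sbuf *sb;
+
+	sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND);
+	if (sb == NULL)
+		return;
+	sbuf_cat(sb, "hello");
+	sbuf_printf(sb, ", %s #%d", "world", 1);
+	if (sbuf_finish(sb) == 0)
+		printf("%s\n", sbuf_data(sb));
+	sbuf_delete(sb);
+}
+#endif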
+
+#ifdef _KERNEL
+/*
+ * Create an sbuf with uio data
+ */
+struct sbuf *
+sbuf_uionew(struct sbuf *s, struct uio *uio, int *error)
+{
+
+ KASSERT(uio != NULL,
+ ("%s called with NULL uio pointer", __func__));
+ KASSERT(error != NULL,
+ ("%s called with NULL error pointer", __func__));
+
+ s = sbuf_new(s, NULL, uio->uio_resid + 1, 0);
+ if (s == NULL) {
+ *error = ENOMEM;
+ return (NULL);
+ }
+ *error = uiomove(s->s_buf, uio->uio_resid, uio);
+ if (*error != 0) {
+ sbuf_delete(s);
+ return (NULL);
+ }
+ s->s_len = s->s_size - 1;
+ if (SBUF_ISSECTION(s))
+ s->s_sect_len = s->s_size - 1;
+ *error = 0;
+ return (s);
+}
+#endif
+
+/*
+ * Clear an sbuf and reset its position.
+ */
+void
+sbuf_clear(struct sbuf *s)
+{
+
+ assert_sbuf_integrity(s);
+ /* don't care if it's finished or not */
+
+ SBUF_CLEARFLAG(s, SBUF_FINISHED);
+ s->s_error = 0;
+ s->s_len = 0;
+ s->s_sect_len = 0;
+}
+
+/*
+ * Set the sbuf's end position to an arbitrary value.
+ * Effectively truncates the sbuf at the new position.
+ */
+int
+sbuf_setpos(struct sbuf *s, ssize_t pos)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ KASSERT(pos >= 0,
+ ("attempt to seek to a negative position (%jd)", (intmax_t)pos));
+ KASSERT(pos < s->s_size,
+ ("attempt to seek past end of sbuf (%jd >= %jd)",
+ (intmax_t)pos, (intmax_t)s->s_size));
+ KASSERT(!SBUF_ISSECTION(s),
+ ("attempt to seek when in a section"));
+
+ if (pos < 0 || pos > s->s_len)
+ return (-1);
+ s->s_len = pos;
+ return (0);
+}
+
+/*
+ * Set up a drain function and argument on an sbuf, to which data is flushed
+ * when the sbuf buffer overflows.
+ */
+void
+sbuf_set_drain(struct sbuf *s, sbuf_drain_func *func, void *ctx)
+{
+
+ assert_sbuf_state(s, 0);
+ assert_sbuf_integrity(s);
+ KASSERT(func == s->s_drain_func || s->s_len == 0,
+ ("Cannot change drain to %p on non-empty sbuf %p", func, s));
+ s->s_drain_func = func;
+ s->s_drain_arg = ctx;
+}
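+
+/*
+ * Illustrative sketch only, kept disabled (hypothetical callback): a drain
+ * function is handed the buffered data and returns how many bytes it
+ * consumed, or a negative errno on failure.  This one merely counts what
+ * flows through, via a size_t passed as the drain argument; a consumer
+ * would install it with sbuf_set_drain(sb, sbuf_count_drain, &total).
+ */
+#if 0
+static int
+sbuf_count_drain(void *arg, const char *data, int len)
+{
+	size_t *total;
+
+	total = arg;
+	*total += len;		/* pretend we consumed everything */
+	return (len);
+}
+#endif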
+
+/*
+ * Call the drain and process the return.
+ */
+static int
+sbuf_drain(struct sbuf *s)
+{
+ int len;
+
+ KASSERT(s->s_len > 0, ("Shouldn't drain empty sbuf %p", s));
+ KASSERT(s->s_error == 0, ("Called %s with error on %p", __func__, s));
+ len = s->s_drain_func(s->s_drain_arg, s->s_buf, s->s_len);
+ if (len < 0) {
+ s->s_error = -len;
+ return (s->s_error);
+ }
+ KASSERT(len > 0 && len <= s->s_len,
+ ("Bad drain amount %d for sbuf %p", len, s));
+ s->s_len -= len;
+ /*
+ * Fast path for the expected case where all the data was
+ * drained.
+ */
+ if (s->s_len == 0)
+ return (0);
+ /*
+ * Move the remaining characters to the beginning of the
+ * string.
+ */
+ memmove(s->s_buf, s->s_buf + len, s->s_len);
+ return (0);
+}
+
+/*
+ * Append a byte to an sbuf. This is the core function for appending
+ * to an sbuf and is the main place that deals with extending the
+ * buffer and marking overflow.
+ */
+static void
+sbuf_put_byte(struct sbuf *s, int c)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ if (s->s_error != 0)
+ return;
+ if (SBUF_FREESPACE(s) <= 0) {
+ /*
+ * If there is a drain, use it, otherwise extend the
+ * buffer.
+ */
+ if (s->s_drain_func != NULL)
+ (void)sbuf_drain(s);
+ else if (sbuf_extend(s, 1) < 0)
+ s->s_error = ENOMEM;
+ if (s->s_error != 0)
+ return;
+ }
+ s->s_buf[s->s_len++] = c;
+ if (SBUF_ISSECTION(s))
+ s->s_sect_len++;
+}
+
+/*
+ * Append a byte string to an sbuf.
+ */
+int
+sbuf_bcat(struct sbuf *s, const void *buf, size_t len)
+{
+ const char *str = buf;
+ const char *end = str + len;
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ if (s->s_error != 0)
+ return (-1);
+ for (; str < end; str++) {
+ sbuf_put_byte(s, *str);
+ if (s->s_error != 0)
+ return (-1);
+ }
+ return (0);
+}
+
+#ifdef _KERNEL
+/*
+ * Copy a byte string from userland into an sbuf.
+ */
+int
+sbuf_bcopyin(struct sbuf *s, const void *uaddr, size_t len)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+ KASSERT(s->s_drain_func == NULL,
+ ("Nonsensical copyin to sbuf %p with a drain", s));
+
+ if (s->s_error != 0)
+ return (-1);
+ if (len == 0)
+ return (0);
+ if (len > SBUF_FREESPACE(s)) {
+ sbuf_extend(s, len - SBUF_FREESPACE(s));
+ if (SBUF_FREESPACE(s) < len)
+ len = SBUF_FREESPACE(s);
+ }
+ if (copyin(uaddr, s->s_buf + s->s_len, len) != 0)
+ return (-1);
+ s->s_len += len;
+
+ return (0);
+}
+#endif
+
+/*
+ * Copy a byte string into an sbuf.
+ */
+int
+sbuf_bcpy(struct sbuf *s, const void *buf, size_t len)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ sbuf_clear(s);
+ return (sbuf_bcat(s, buf, len));
+}
+
+/*
+ * Append a string to an sbuf.
+ */
+int
+sbuf_cat(struct sbuf *s, const char *str)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ if (s->s_error != 0)
+ return (-1);
+
+ while (*str != '\0') {
+ sbuf_put_byte(s, *str++);
+ if (s->s_error != 0)
+ return (-1);
+ }
+ return (0);
+}
+
+#ifdef _KERNEL
+/*
+ * Append a string from userland to an sbuf.
+ */
+int
+sbuf_copyin(struct sbuf *s, const void *uaddr, size_t len)
+{
+ size_t done;
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+ KASSERT(s->s_drain_func == NULL,
+ ("Nonsensical copyin to sbuf %p with a drain", s));
+
+ if (s->s_error != 0)
+ return (-1);
+
+ if (len == 0)
+ len = SBUF_FREESPACE(s); /* XXX return 0? */
+ if (len > SBUF_FREESPACE(s)) {
+ sbuf_extend(s, len);
+ if (SBUF_FREESPACE(s) < len)
+ len = SBUF_FREESPACE(s);
+ }
+ switch (copyinstr(uaddr, s->s_buf + s->s_len, len + 1, &done)) {
+ case ENAMETOOLONG:
+ s->s_error = ENOMEM;
+ /* fall through */
+ case 0:
+ s->s_len += done - 1;
+ if (SBUF_ISSECTION(s))
+ s->s_sect_len += done - 1;
+ break;
+ default:
+ return (-1); /* XXX */
+ }
+
+ return (done);
+}
+#endif
+
+/*
+ * Copy a string into an sbuf.
+ */
+int
+sbuf_cpy(struct sbuf *s, const char *str)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ sbuf_clear(s);
+ return (sbuf_cat(s, str));
+}
+
+/*
+ * Format the given argument list and append the resulting string to an sbuf.
+ */
+#ifdef _KERNEL
+
+/*
+ * Append a non-NUL character to an sbuf. This prototype signature is
+ * suitable for use with kvprintf(9).
+ */
+static void
+sbuf_putc_func(int c, void *arg)
+{
+
+ if (c != '\0')
+ sbuf_put_byte(arg, c);
+}
+
+int
+sbuf_vprintf(struct sbuf *s, const char *fmt, va_list ap)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ KASSERT(fmt != NULL,
+ ("%s called with a NULL format string", __func__));
+
+ (void)kvprintf(fmt, sbuf_putc_func, s, 10, ap);
+ if (s->s_error != 0)
+ return (-1);
+ return (0);
+}
+#else /* !_KERNEL */
+int
+sbuf_vprintf(struct sbuf *s, const char *fmt, va_list ap)
+{
+ va_list ap_copy;
+ int error, len;
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ KASSERT(fmt != NULL,
+ ("%s called with a NULL format string", __func__));
+
+ if (s->s_error != 0)
+ return (-1);
+
+ /*
+ * For the moment, there is no way to get vsnprintf(3) to hand
+ * back a character at a time, to push everything into
+ * sbuf_putc_func() as was done for the kernel.
+ *
+	 * In userspace, while drains are useful, there is generally no
+	 * problem calling malloc(3) when we run out of space.  So expand
+	 * a userland sbuf if there is not enough room for the data
+	 * produced by sbuf_[v]printf(3).
+ */
+
+ error = 0;
+ do {
+ va_copy(ap_copy, ap);
+ len = vsnprintf(&s->s_buf[s->s_len], SBUF_FREESPACE(s) + 1,
+ fmt, ap_copy);
+ va_end(ap_copy);
+
+ if (SBUF_FREESPACE(s) >= len)
+ break;
+ /* Cannot print with the current available space. */
+ if (s->s_drain_func != NULL && s->s_len > 0)
+ error = sbuf_drain(s);
+ else
+ error = sbuf_extend(s, len - SBUF_FREESPACE(s));
+ } while (error == 0);
+
+ /*
+ * s->s_len is the length of the string, without the terminating nul.
+ * When updating s->s_len, we must subtract 1 from the length that
+ * we passed into vsnprintf() because that length includes the
+ * terminating nul.
+ *
+ * vsnprintf() returns the amount that would have been copied,
+ * given sufficient space, so don't over-increment s_len.
+ */
+ if (SBUF_FREESPACE(s) < len)
+ len = SBUF_FREESPACE(s);
+ s->s_len += len;
+ if (SBUF_ISSECTION(s))
+ s->s_sect_len += len;
+ if (!SBUF_HASROOM(s) && !SBUF_CANEXTEND(s))
+ s->s_error = ENOMEM;
+
+ KASSERT(s->s_len < s->s_size,
+ ("wrote past end of sbuf (%d >= %d)", s->s_len, s->s_size));
+
+ if (s->s_error != 0)
+ return (-1);
+ return (0);
+}
+#endif /* _KERNEL */
+
+/*
+ * Format the given arguments and append the resulting string to an sbuf.
+ */
+int
+sbuf_printf(struct sbuf *s, const char *fmt, ...)
+{
+ va_list ap;
+ int result;
+
+ va_start(ap, fmt);
+ result = sbuf_vprintf(s, fmt, ap);
+ va_end(ap);
+ return (result);
+}
+
+/*
+ * Append a character to an sbuf.
+ */
+int
+sbuf_putc(struct sbuf *s, int c)
+{
+
+ sbuf_put_byte(s, c);
+ if (s->s_error != 0)
+ return (-1);
+ return (0);
+}
+
+/*
+ * Trim whitespace characters from end of an sbuf.
+ */
+int
+sbuf_trim(struct sbuf *s)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+ KASSERT(s->s_drain_func == NULL,
+ ("%s makes no sense on sbuf %p with drain", __func__, s));
+
+ if (s->s_error != 0)
+ return (-1);
+
+ while (s->s_len > 0 && isspace(s->s_buf[s->s_len-1])) {
+ --s->s_len;
+ if (SBUF_ISSECTION(s))
+ s->s_sect_len--;
+ }
+
+ return (0);
+}
+
+/*
+ * Check if an sbuf has an error.
+ */
+int
+sbuf_error(const struct sbuf *s)
+{
+
+ return (s->s_error);
+}
+
+/*
+ * Finish off an sbuf.
+ */
+int
+sbuf_finish(struct sbuf *s)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ if (s->s_drain_func != NULL) {
+ while (s->s_len > 0 && s->s_error == 0)
+ s->s_error = sbuf_drain(s);
+ }
+ s->s_buf[s->s_len] = '\0';
+ SBUF_SETFLAG(s, SBUF_FINISHED);
+#ifdef _KERNEL
+ return (s->s_error);
+#else
+ if (s->s_error != 0) {
+ errno = s->s_error;
+ return (-1);
+ }
+ return (0);
+#endif
+}
+
+/*
+ * Return a pointer to the sbuf data.
+ */
+char *
+sbuf_data(struct sbuf *s)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, SBUF_FINISHED);
+ KASSERT(s->s_drain_func == NULL,
+ ("%s makes no sense on sbuf %p with drain", __func__, s));
+
+ return (s->s_buf);
+}
+
+/*
+ * Return the length of the sbuf data.
+ */
+ssize_t
+sbuf_len(struct sbuf *s)
+{
+
+ assert_sbuf_integrity(s);
+ /* don't care if it's finished or not */
+ KASSERT(s->s_drain_func == NULL,
+ ("%s makes no sense on sbuf %p with drain", __func__, s));
+
+ if (s->s_error != 0)
+ return (-1);
+ return (s->s_len);
+}
+
+/*
+ * Clear an sbuf, free its buffer if necessary.
+ */
+void
+sbuf_delete(struct sbuf *s)
+{
+ int isdyn;
+
+ assert_sbuf_integrity(s);
+ /* don't care if it's finished or not */
+
+ if (SBUF_ISDYNAMIC(s))
+ SBFREE(s->s_buf);
+ isdyn = SBUF_ISDYNSTRUCT(s);
+ memset(s, 0, sizeof(*s));
+ if (isdyn)
+ SBFREE(s);
+}
+
+/*
+ * Check if an sbuf has been finished.
+ */
+int
+sbuf_done(const struct sbuf *s)
+{
+
+ return (SBUF_ISFINISHED(s));
+}
+
+/*
+ * Start a section.
+ */
+void
+sbuf_start_section(struct sbuf *s, ssize_t *old_lenp)
+{
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+
+ if (!SBUF_ISSECTION(s)) {
+ KASSERT(s->s_sect_len == 0,
+ ("s_sect_len != 0 when starting a section"));
+ if (old_lenp != NULL)
+ *old_lenp = -1;
+ SBUF_SETFLAG(s, SBUF_INSECTION);
+ } else {
+ KASSERT(old_lenp != NULL,
+ ("s_sect_len should be saved when starting a subsection"));
+ *old_lenp = s->s_sect_len;
+ s->s_sect_len = 0;
+ }
+}
+
+/*
+ * End the section padding to the specified length with the specified
+ * character.
+ */
+ssize_t
+sbuf_end_section(struct sbuf *s, ssize_t old_len, size_t pad, int c)
+{
+ ssize_t len;
+
+ assert_sbuf_integrity(s);
+ assert_sbuf_state(s, 0);
+ KASSERT(SBUF_ISSECTION(s),
+ ("attempt to end a section when not in a section"));
+
+ if (pad > 1) {
+ len = roundup(s->s_sect_len, pad) - s->s_sect_len;
+ for (; s->s_error == 0 && len > 0; len--)
+ sbuf_put_byte(s, c);
+ }
+ len = s->s_sect_len;
+ if (old_len == -1) {
+ s->s_sect_len = 0;
+ SBUF_CLEARFLAG(s, SBUF_INSECTION);
+ } else {
+ s->s_sect_len += old_len;
+ }
+ if (s->s_error != 0)
+ return (-1);
+ return (len);
+}
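+
+/*
+ * Editor's note: an illustrative sketch, not part of this change, of the
+ * section API above; 'name' is a hypothetical string and the section is
+ * NUL-padded to a multiple of 8 bytes.
+ *
+ *	ssize_t old_len, sect_len;
+ *
+ *	sbuf_start_section(s, &old_len);
+ *	sbuf_printf(s, "%s", name);
+ *	sect_len = sbuf_end_section(s, old_len, 8, '\0');
+ */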
diff --git a/sys/kern/subr_scanf.c b/sys/kern/subr_scanf.c
new file mode 100644
index 0000000..824e392
--- /dev/null
+++ b/sys/kern/subr_scanf.c
@@ -0,0 +1,641 @@
+/*-
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Chris Torek.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * From: Id: vfscanf.c,v 1.13 1998/09/25 12:20:27 obrien Exp
+ * From: static char sccsid[] = "@(#)strtol.c 8.1 (Berkeley) 6/4/93";
+ * From: static char sccsid[] = "@(#)strtoul.c 8.1 (Berkeley) 6/4/93";
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ctype.h>
+#include <sys/limits.h>
+
+/*
+ * Note that stdarg.h and the ANSI style va_start macro are used for both
+ * ANSI and traditional C compilers.
+ */
+#include <machine/stdarg.h>
+
+#define BUF 32 /* Maximum length of numeric string. */
+
+/*
+ * Flags used during conversion.
+ */
+#define LONG 0x01 /* l: long or double */
+#define SHORT 0x04 /* h: short */
+#define SUPPRESS 0x08 /* suppress assignment */
+#define POINTER 0x10 /* weird %p pointer (`fake hex') */
+#define NOSKIP 0x20 /* do not skip blanks */
+#define QUAD 0x400
+#define SHORTSHORT 0x4000 /* hh: char */
+
+/*
+ * The following are used in numeric conversions only:
+ * SIGNOK, NDIGITS, DPTOK, and EXPOK are for floating point;
+ * SIGNOK, NDIGITS, PFXOK, and NZDIGITS are for integral.
+ */
+#define SIGNOK 0x40 /* +/- is (still) legal */
+#define NDIGITS 0x80 /* no digits detected */
+
+#define DPTOK 0x100 /* (float) decimal point is still legal */
+#define EXPOK 0x200 /* (float) exponent (e+3, etc) still legal */
+
+#define PFXOK 0x100 /* 0x prefix is (still) legal */
+#define NZDIGITS 0x200 /* no zero digits detected */
+
+/*
+ * Conversion types.
+ */
+#define CT_CHAR 0 /* %c conversion */
+#define CT_CCL 1 /* %[...] conversion */
+#define CT_STRING 2 /* %s conversion */
+#define CT_INT 3 /* integer, i.e., strtoq or strtouq */
+typedef u_quad_t (*ccfntype)(const char *, char **, int);
+
+static const u_char *__sccl(char *, const u_char *);
+
+int
+sscanf(const char *ibuf, const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = vsscanf(ibuf, fmt, ap);
+ va_end(ap);
+ return(ret);
+}
+
+int
+vsscanf(const char *inp, char const *fmt0, va_list ap)
+{
+ int inr;
+ const u_char *fmt = (const u_char *)fmt0;
+ int c; /* character from format, or conversion */
+ size_t width; /* field width, or 0 */
+ char *p; /* points into all kinds of strings */
+ int n; /* handy integer */
+ int flags; /* flags as defined above */
+ char *p0; /* saves original value of p when necessary */
+ int nassigned; /* number of fields assigned */
+ int nconversions; /* number of conversions */
+ int nread; /* number of characters consumed from fp */
+ int base; /* base argument to strtoq/strtouq */
+ ccfntype ccfn; /* conversion function (strtoq/strtouq) */
+ char ccltab[256]; /* character class table for %[...] */
+ char buf[BUF]; /* buffer for numeric conversions */
+
+ /* `basefix' is used to avoid `if' tests in the integer scanner */
+ static short basefix[17] =
+ { 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
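+	/*
+	 * For example, basefix[0] == 10: a base that is still undetermined
+	 * (%i with no leading '0' seen yet) is promoted to decimal as soon
+	 * as an ordinary digit is accepted, while basefix[n] == n leaves an
+	 * already known base alone.
+	 */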
+
+ inr = strlen(inp);
+
+ nassigned = 0;
+ nconversions = 0;
+ nread = 0;
+ base = 0; /* XXX just to keep gcc happy */
+ ccfn = NULL; /* XXX just to keep gcc happy */
+ for (;;) {
+ c = *fmt++;
+ if (c == 0)
+ return (nassigned);
+ if (isspace(c)) {
+ while (inr > 0 && isspace(*inp))
+ nread++, inr--, inp++;
+ continue;
+ }
+ if (c != '%')
+ goto literal;
+ width = 0;
+ flags = 0;
+ /*
+ * switch on the format. continue if done;
+ * break once format type is derived.
+ */
+again: c = *fmt++;
+ switch (c) {
+ case '%':
+literal:
+ if (inr <= 0)
+ goto input_failure;
+ if (*inp != c)
+ goto match_failure;
+ inr--, inp++;
+ nread++;
+ continue;
+
+ case '*':
+ flags |= SUPPRESS;
+ goto again;
+ case 'l':
+ if (flags & LONG){
+ flags &= ~LONG;
+ flags |= QUAD;
+ } else {
+ flags |= LONG;
+ }
+ goto again;
+ case 'q':
+ flags |= QUAD;
+ goto again;
+ case 'h':
+ if (flags & SHORT){
+ flags &= ~SHORT;
+ flags |= SHORTSHORT;
+ } else {
+ flags |= SHORT;
+ }
+ goto again;
+
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ width = width * 10 + c - '0';
+ goto again;
+
+ /*
+ * Conversions.
+ *
+ */
+ case 'd':
+ c = CT_INT;
+ ccfn = (ccfntype)strtoq;
+ base = 10;
+ break;
+
+ case 'i':
+ c = CT_INT;
+ ccfn = (ccfntype)strtoq;
+ base = 0;
+ break;
+
+ case 'o':
+ c = CT_INT;
+ ccfn = strtouq;
+ base = 8;
+ break;
+
+ case 'u':
+ c = CT_INT;
+ ccfn = strtouq;
+ base = 10;
+ break;
+
+ case 'x':
+ flags |= PFXOK; /* enable 0x prefixing */
+ c = CT_INT;
+ ccfn = strtouq;
+ base = 16;
+ break;
+
+ case 's':
+ c = CT_STRING;
+ break;
+
+ case '[':
+ fmt = __sccl(ccltab, fmt);
+ flags |= NOSKIP;
+ c = CT_CCL;
+ break;
+
+ case 'c':
+ flags |= NOSKIP;
+ c = CT_CHAR;
+ break;
+
+ case 'p': /* pointer format is like hex */
+ flags |= POINTER | PFXOK;
+ c = CT_INT;
+ ccfn = strtouq;
+ base = 16;
+ break;
+
+ case 'n':
+ nconversions++;
+ if (flags & SUPPRESS) /* ??? */
+ continue;
+ if (flags & SHORTSHORT)
+ *va_arg(ap, char *) = nread;
+ else if (flags & SHORT)
+ *va_arg(ap, short *) = nread;
+ else if (flags & LONG)
+ *va_arg(ap, long *) = nread;
+ else if (flags & QUAD)
+ *va_arg(ap, quad_t *) = nread;
+ else
+ *va_arg(ap, int *) = nread;
+ continue;
+ }
+
+ /*
+ * We have a conversion that requires input.
+ */
+ if (inr <= 0)
+ goto input_failure;
+
+ /*
+ * Consume leading white space, except for formats
+ * that suppress this.
+ */
+ if ((flags & NOSKIP) == 0) {
+ while (isspace(*inp)) {
+ nread++;
+ if (--inr > 0)
+ inp++;
+ else
+ goto input_failure;
+ }
+ /*
+ * Note that there is at least one character in
+ * the buffer, so conversions that do not set NOSKIP
+ * can no longer result in an input failure.
+ */
+ }
+
+ /*
+ * Do the conversion.
+ */
+ switch (c) {
+
+ case CT_CHAR:
+ /* scan arbitrary characters (sets NOSKIP) */
+ if (width == 0)
+ width = 1;
+ if (flags & SUPPRESS) {
+ size_t sum = 0;
+ for (;;) {
+ if ((n = inr) < width) {
+ sum += n;
+ width -= n;
+ inp += n;
+ if (sum == 0)
+ goto input_failure;
+ break;
+ } else {
+ sum += width;
+ inr -= width;
+ inp += width;
+ break;
+ }
+ }
+ nread += sum;
+ } else {
+ bcopy(inp, va_arg(ap, char *), width);
+ inr -= width;
+ inp += width;
+ nread += width;
+ nassigned++;
+ }
+ nconversions++;
+ break;
+
+ case CT_CCL:
+ /* scan a (nonempty) character class (sets NOSKIP) */
+ if (width == 0)
+ width = (size_t)~0; /* `infinity' */
+ /* take only those things in the class */
+ if (flags & SUPPRESS) {
+ n = 0;
+ while (ccltab[(unsigned char)*inp]) {
+ n++, inr--, inp++;
+ if (--width == 0)
+ break;
+ if (inr <= 0) {
+ if (n == 0)
+ goto input_failure;
+ break;
+ }
+ }
+ if (n == 0)
+ goto match_failure;
+ } else {
+ p0 = p = va_arg(ap, char *);
+ while (ccltab[(unsigned char)*inp]) {
+ inr--;
+ *p++ = *inp++;
+ if (--width == 0)
+ break;
+ if (inr <= 0) {
+ if (p == p0)
+ goto input_failure;
+ break;
+ }
+ }
+ n = p - p0;
+ if (n == 0)
+ goto match_failure;
+ *p = 0;
+ nassigned++;
+ }
+ nread += n;
+ nconversions++;
+ break;
+
+ case CT_STRING:
+ /* like CCL, but zero-length string OK, & no NOSKIP */
+ if (width == 0)
+ width = (size_t)~0;
+ if (flags & SUPPRESS) {
+ n = 0;
+ while (!isspace(*inp)) {
+ n++, inr--, inp++;
+ if (--width == 0)
+ break;
+ if (inr <= 0)
+ break;
+ }
+ nread += n;
+ } else {
+ p0 = p = va_arg(ap, char *);
+ while (!isspace(*inp)) {
+ inr--;
+ *p++ = *inp++;
+ if (--width == 0)
+ break;
+ if (inr <= 0)
+ break;
+ }
+ *p = 0;
+ nread += p - p0;
+ nassigned++;
+ }
+ nconversions++;
+ continue;
+
+ case CT_INT:
+ /* scan an integer as if by strtoq/strtouq */
+#ifdef hardway
+ if (width == 0 || width > sizeof(buf) - 1)
+ width = sizeof(buf) - 1;
+#else
+ /* size_t is unsigned, hence this optimisation */
+ if (--width > sizeof(buf) - 2)
+ width = sizeof(buf) - 2;
+ width++;
+#endif
+ flags |= SIGNOK | NDIGITS | NZDIGITS;
+ for (p = buf; width; width--) {
+ c = *inp;
+ /*
+ * Switch on the character; `goto ok'
+ * if we accept it as a part of number.
+ */
+ switch (c) {
+
+ /*
+ * The digit 0 is always legal, but is
+ * special. For %i conversions, if no
+ * digits (zero or nonzero) have been
+ * scanned (only signs), we will have
+ * base==0. In that case, we should set
+ * it to 8 and enable 0x prefixing.
+ * Also, if we have not scanned zero digits
+ * before this, do not turn off prefixing
+ * (someone else will turn it off if we
+ * have scanned any nonzero digits).
+ */
+ case '0':
+ if (base == 0) {
+ base = 8;
+ flags |= PFXOK;
+ }
+ if (flags & NZDIGITS)
+ flags &= ~(SIGNOK|NZDIGITS|NDIGITS);
+ else
+ flags &= ~(SIGNOK|PFXOK|NDIGITS);
+ goto ok;
+
+ /* 1 through 7 always legal */
+ case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ base = basefix[base];
+ flags &= ~(SIGNOK | PFXOK | NDIGITS);
+ goto ok;
+
+ /* digits 8 and 9 ok iff decimal or hex */
+ case '8': case '9':
+ base = basefix[base];
+ if (base <= 8)
+ break; /* not legal here */
+ flags &= ~(SIGNOK | PFXOK | NDIGITS);
+ goto ok;
+
+ /* letters ok iff hex */
+ case 'A': case 'B': case 'C':
+ case 'D': case 'E': case 'F':
+ case 'a': case 'b': case 'c':
+ case 'd': case 'e': case 'f':
+ /* no need to fix base here */
+ if (base <= 10)
+ break; /* not legal here */
+ flags &= ~(SIGNOK | PFXOK | NDIGITS);
+ goto ok;
+
+ /* sign ok only as first character */
+ case '+': case '-':
+ if (flags & SIGNOK) {
+ flags &= ~SIGNOK;
+ goto ok;
+ }
+ break;
+
+ /* x ok iff flag still set & 2nd char */
+ case 'x': case 'X':
+ if (flags & PFXOK && p == buf + 1) {
+ base = 16; /* if %i */
+ flags &= ~PFXOK;
+ goto ok;
+ }
+ break;
+ }
+
+ /*
+ * If we got here, c is not a legal character
+ * for a number. Stop accumulating digits.
+ */
+ break;
+ ok:
+ /*
+ * c is legal: store it and look at the next.
+ */
+ *p++ = c;
+ if (--inr > 0)
+ inp++;
+ else
+ break; /* end of input */
+ }
+ /*
+ * If we had only a sign, it is no good; push
+ * back the sign. If the number ends in `x',
+ * it was [sign] '0' 'x', so push back the x
+ * and treat it as [sign] '0'.
+ */
+ if (flags & NDIGITS) {
+ if (p > buf) {
+ inp--;
+ inr++;
+ }
+ goto match_failure;
+ }
+ c = ((u_char *)p)[-1];
+ if (c == 'x' || c == 'X') {
+ --p;
+ inp--;
+ inr++;
+ }
+ if ((flags & SUPPRESS) == 0) {
+ u_quad_t res;
+
+ *p = 0;
+ res = (*ccfn)(buf, (char **)NULL, base);
+ if (flags & POINTER)
+ *va_arg(ap, void **) =
+ (void *)(uintptr_t)res;
+ else if (flags & SHORTSHORT)
+ *va_arg(ap, char *) = res;
+ else if (flags & SHORT)
+ *va_arg(ap, short *) = res;
+ else if (flags & LONG)
+ *va_arg(ap, long *) = res;
+ else if (flags & QUAD)
+ *va_arg(ap, quad_t *) = res;
+ else
+ *va_arg(ap, int *) = res;
+ nassigned++;
+ }
+ nread += p - buf;
+ nconversions++;
+ break;
+
+ }
+ }
+input_failure:
+ return (nconversions != 0 ? nassigned : -1);
+match_failure:
+ return (nassigned);
+}
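+
+/*
+ * Editor's note: an illustrative sketch, not part of this change.  The
+ * kernel sscanf() above supports the integer, string and character-class
+ * conversions (no floating point), e.g.:
+ *
+ *	char name[16];
+ *	int unit;
+ *
+ *	if (sscanf("em0", "%15[a-z]%d", name, &unit) == 2)
+ *		printf("driver %s, unit %d\n", name, unit);
+ */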
+
+/*
+ * Fill in the given table from the scanset at the given format
+ * (just after `['). Return a pointer to the character past the
+ * closing `]'. The table has a 1 wherever characters should be
+ * considered part of the scanset.
+ */
+static const u_char *
+__sccl(char *tab, const u_char *fmt)
+{
+ int c, n, v;
+
+ /* first `clear' the whole table */
+ c = *fmt++; /* first char hat => negated scanset */
+ if (c == '^') {
+ v = 1; /* default => accept */
+ c = *fmt++; /* get new first char */
+ } else
+ v = 0; /* default => reject */
+
+ /* XXX: Will not work if sizeof(tab*) > sizeof(char) */
+ for (n = 0; n < 256; n++)
+ tab[n] = v; /* memset(tab, v, 256) */
+
+ if (c == 0)
+ return (fmt - 1);/* format ended before closing ] */
+
+ /*
+ * Now set the entries corresponding to the actual scanset
+ * to the opposite of the above.
+ *
+ * The first character may be ']' (or '-') without being special;
+ * the last character may be '-'.
+ */
+ v = 1 - v;
+ for (;;) {
+ tab[c] = v; /* take character c */
+doswitch:
+ n = *fmt++; /* and examine the next */
+ switch (n) {
+
+ case 0: /* format ended too soon */
+ return (fmt - 1);
+
+ case '-':
+ /*
+ * A scanset of the form
+ * [01+-]
+ * is defined as `the digit 0, the digit 1,
+ * the character +, the character -', but
+ * the effect of a scanset such as
+ * [a-zA-Z0-9]
+ * is implementation defined. The V7 Unix
+ * scanf treats `a-z' as `the letters a through
+ * z', but treats `a-a' as `the letter a, the
+ * character -, and the letter a'.
+ *
+ * For compatibility, the `-' is not considered
+ * to define a range if the character following
+ * it is either a close bracket (required by ANSI)
+ * or is not numerically greater than the character
+ * we just stored in the table (c).
+ */
+ n = *fmt;
+ if (n == ']' || n < c) {
+ c = '-';
+ break; /* resume the for(;;) */
+ }
+ fmt++;
+ /* fill in the range */
+ do {
+ tab[++c] = v;
+ } while (c < n);
+ c = n;
+ /*
+ * Alas, the V7 Unix scanf also treats formats
+ * such as [a-c-e] as `the letters a through e'.
+ * This too is permitted by the standard....
+ */
+ goto doswitch;
+ break;
+
+ case ']': /* end of scanset */
+ return (fmt);
+
+ default: /* just another character */
+ c = n;
+ break;
+ }
+ }
+ /* NOTREACHED */
+}
+
diff --git a/sys/kern/subr_sglist.c b/sys/kern/subr_sglist.c
new file mode 100644
index 0000000..ea77161
--- /dev/null
+++ b/sys/kern/subr_sglist.c
@@ -0,0 +1,714 @@
+/*-
+ * Copyright (c) 2008 Yahoo!, Inc.
+ * All rights reserved.
+ * Written by: John Baldwin <jhb@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>
+#include <sys/sglist.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+#include <sys/ktr.h>
+
+static MALLOC_DEFINE(M_SGLIST, "sglist", "scatter/gather lists");
+
+/*
+ * Convenience macros to save the state of an sglist so it can be restored
+ * if an append attempt fails. Since sglists only grow, we only need to
+ * save the current count of segments and the length of the ending segment.
+ * Earlier segments will not be changed by an append, and the only change
+ * that can occur to the ending segment is that it can be extended.
+ */
+struct sgsave {
+ u_short sg_nseg;
+ size_t ss_len;
+};
+
+#define SGLIST_SAVE(sg, sgsave) do { \
+ (sgsave).sg_nseg = (sg)->sg_nseg; \
+ if ((sgsave).sg_nseg > 0) \
+ (sgsave).ss_len = (sg)->sg_segs[(sgsave).sg_nseg - 1].ss_len; \
+ else \
+ (sgsave).ss_len = 0; \
+} while (0)
+
+#define SGLIST_RESTORE(sg, sgsave) do { \
+ (sg)->sg_nseg = (sgsave).sg_nseg; \
+ if ((sgsave).sg_nseg > 0) \
+ (sg)->sg_segs[(sgsave).sg_nseg - 1].ss_len = (sgsave).ss_len; \
+} while (0)
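+
+/*
+ * Editor's note: an illustrative sketch, not part of this change, of the
+ * pattern the append routines below follow: snapshot the list, attempt the
+ * append, and roll back on failure so a partial append is never visible.
+ *
+ *	struct sgsave save;
+ *
+ *	SGLIST_SAVE(sg, save);
+ *	error = _sglist_append_buf(sg, buf, len, NULL, NULL);
+ *	if (error)
+ *		SGLIST_RESTORE(sg, save);
+ */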
+
+/*
+ * Append a single (paddr, len) to a sglist. sg is the list and ss is
+ * the current segment in the list. If we run out of segments then
+ * EFBIG will be returned.
+ */
+static __inline int
+_sglist_append_range(struct sglist *sg, struct sglist_seg **ssp,
+ vm_paddr_t paddr, size_t len)
+{
+ struct sglist_seg *ss;
+
+ ss = *ssp;
+ if (ss->ss_paddr + ss->ss_len == paddr)
+ ss->ss_len += len;
+ else {
+ if (sg->sg_nseg == sg->sg_maxseg)
+ return (EFBIG);
+ ss++;
+ ss->ss_paddr = paddr;
+ ss->ss_len = len;
+ sg->sg_nseg++;
+ *ssp = ss;
+ }
+ return (0);
+}
+
+/*
+ * Worker routine to append a virtual address range (either kernel or
+ * user) to a scatter/gather list.
+ */
+static __inline int
+_sglist_append_buf(struct sglist *sg, void *buf, size_t len, pmap_t pmap,
+ size_t *donep)
+{
+ struct sglist_seg *ss;
+ vm_offset_t vaddr, offset;
+ vm_paddr_t paddr;
+ size_t seglen;
+ int error;
+
+ if (donep)
+ *donep = 0;
+ if (len == 0)
+ return (0);
+
+ /* Do the first page. It may have an offset. */
+ vaddr = (vm_offset_t)buf;
+ offset = vaddr & PAGE_MASK;
+ if (pmap != NULL)
+ paddr = pmap_extract(pmap, vaddr);
+ else
+ paddr = pmap_kextract(vaddr);
+ seglen = MIN(len, PAGE_SIZE - offset);
+ if (sg->sg_nseg == 0) {
+ ss = sg->sg_segs;
+ ss->ss_paddr = paddr;
+ ss->ss_len = seglen;
+ sg->sg_nseg = 1;
+ } else {
+ ss = &sg->sg_segs[sg->sg_nseg - 1];
+ error = _sglist_append_range(sg, &ss, paddr, seglen);
+ if (error)
+ return (error);
+ }
+ vaddr += seglen;
+ len -= seglen;
+ if (donep)
+ *donep += seglen;
+
+ while (len > 0) {
+ seglen = MIN(len, PAGE_SIZE);
+ if (pmap != NULL)
+ paddr = pmap_extract(pmap, vaddr);
+ else
+ paddr = pmap_kextract(vaddr);
+ error = _sglist_append_range(sg, &ss, paddr, seglen);
+ if (error)
+ return (error);
+ vaddr += seglen;
+ len -= seglen;
+ if (donep)
+ *donep += seglen;
+ }
+
+ return (0);
+}
+
+/*
+ * Determine the number of scatter/gather list elements needed to
+ * describe a kernel virtual address range.
+ */
+int
+sglist_count(void *buf, size_t len)
+{
+ vm_offset_t vaddr, vendaddr;
+ vm_paddr_t lastaddr, paddr;
+ int nsegs;
+
+ if (len == 0)
+ return (0);
+
+ vaddr = trunc_page((vm_offset_t)buf);
+ vendaddr = (vm_offset_t)buf + len;
+ nsegs = 1;
+ lastaddr = pmap_kextract(vaddr);
+ vaddr += PAGE_SIZE;
+ while (vaddr < vendaddr) {
+ paddr = pmap_kextract(vaddr);
+ if (lastaddr + PAGE_SIZE != paddr)
+ nsegs++;
+ lastaddr = paddr;
+ vaddr += PAGE_SIZE;
+ }
+ return (nsegs);
+}
+
+/*
+ * Allocate a scatter/gather list along with 'nsegs' segments. The
+ * 'mflags' parameter is the same as that passed to malloc(9). The caller
+ * should use sglist_free() to free this list.
+ */
+struct sglist *
+sglist_alloc(int nsegs, int mflags)
+{
+ struct sglist *sg;
+
+ sg = malloc(sizeof(struct sglist) + nsegs * sizeof(struct sglist_seg),
+ M_SGLIST, mflags);
+ if (sg == NULL)
+ return (NULL);
+ sglist_init(sg, nsegs, (struct sglist_seg *)(sg + 1));
+ return (sg);
+}
+
+/*
+ * Free a scatter/gather list allocated via sglist_alloc().
+ */
+void
+sglist_free(struct sglist *sg)
+{
+
+ if (refcount_release(&sg->sg_refs))
+ free(sg, M_SGLIST);
+}
+
+/*
+ * Append the segments to describe a single kernel virtual address
+ * range to a scatter/gather list. If there are insufficient
+ * segments, then this fails with EFBIG.
+ */
+int
+sglist_append(struct sglist *sg, void *buf, size_t len)
+{
+ struct sgsave save;
+ int error;
+
+ if (sg->sg_maxseg == 0)
+ return (EINVAL);
+ SGLIST_SAVE(sg, save);
+ error = _sglist_append_buf(sg, buf, len, NULL, NULL);
+ if (error)
+ SGLIST_RESTORE(sg, save);
+ return (error);
+}
+
+/*
+ * Append a single physical address range to a scatter/gather list.
+ * If there are insufficient segments, then this fails with EFBIG.
+ */
+int
+sglist_append_phys(struct sglist *sg, vm_paddr_t paddr, size_t len)
+{
+ struct sglist_seg *ss;
+ struct sgsave save;
+ int error;
+
+ if (sg->sg_maxseg == 0)
+ return (EINVAL);
+ if (len == 0)
+ return (0);
+
+ if (sg->sg_nseg == 0) {
+ sg->sg_segs[0].ss_paddr = paddr;
+ sg->sg_segs[0].ss_len = len;
+ sg->sg_nseg = 1;
+ return (0);
+ }
+ ss = &sg->sg_segs[sg->sg_nseg - 1];
+ SGLIST_SAVE(sg, save);
+ error = _sglist_append_range(sg, &ss, paddr, len);
+ if (error)
+ SGLIST_RESTORE(sg, save);
+ return (error);
+}
+
+/*
+ * Append the segments that describe a single mbuf chain to a
+ * scatter/gather list. If there are insufficient segments, then this
+ * fails with EFBIG.
+ */
+int
+sglist_append_mbuf(struct sglist *sg, struct mbuf *m0)
+{
+ struct sgsave save;
+ struct mbuf *m;
+ int error;
+
+ if (sg->sg_maxseg == 0)
+ return (EINVAL);
+
+ error = 0;
+ SGLIST_SAVE(sg, save);
+ for (m = m0; m != NULL; m = m->m_next) {
+ if (m->m_len > 0) {
+ error = sglist_append(sg, m->m_data, m->m_len);
+ if (error) {
+ SGLIST_RESTORE(sg, save);
+ return (error);
+ }
+ }
+ }
+ return (0);
+}
+
+/*
+ * Append the segments that describe a single user address range to a
+ * scatter/gather list. If there are insufficient segments, then this
+ * fails with EFBIG.
+ */
+int
+sglist_append_user(struct sglist *sg, void *buf, size_t len, struct thread *td)
+{
+ struct sgsave save;
+ int error;
+
+ if (sg->sg_maxseg == 0)
+ return (EINVAL);
+ SGLIST_SAVE(sg, save);
+ error = _sglist_append_buf(sg, buf, len,
+ vmspace_pmap(td->td_proc->p_vmspace), NULL);
+ if (error)
+ SGLIST_RESTORE(sg, save);
+ return (error);
+}
+
+/*
+ * Append the segments that describe a single uio to a scatter/gather
+ * list. If there are insufficient segments, then this fails with
+ * EFBIG.
+ */
+int
+sglist_append_uio(struct sglist *sg, struct uio *uio)
+{
+ struct iovec *iov;
+ struct sgsave save;
+ size_t resid, minlen;
+ pmap_t pmap;
+ int error, i;
+
+ if (sg->sg_maxseg == 0)
+ return (EINVAL);
+
+ resid = uio->uio_resid;
+ iov = uio->uio_iov;
+
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ KASSERT(uio->uio_td != NULL,
+ ("sglist_append_uio: USERSPACE but no thread"));
+ pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
+ } else
+ pmap = NULL;
+
+ error = 0;
+ SGLIST_SAVE(sg, save);
+ for (i = 0; i < uio->uio_iovcnt && resid != 0; i++) {
+ /*
+ * Now at the first iovec to load. Load each iovec
+ * until we have exhausted the residual count.
+ */
+ minlen = MIN(resid, iov[i].iov_len);
+ if (minlen > 0) {
+ error = _sglist_append_buf(sg, iov[i].iov_base, minlen,
+ pmap, NULL);
+ if (error) {
+ SGLIST_RESTORE(sg, save);
+ return (error);
+ }
+ resid -= minlen;
+ }
+ }
+ return (0);
+}
+
+/*
+ * Append the segments that describe at most 'resid' bytes from a
+ * single uio to a scatter/gather list. If there are insufficient
+ * segments, then only the amount that fits is appended.
+ */
+int
+sglist_consume_uio(struct sglist *sg, struct uio *uio, size_t resid)
+{
+ struct iovec *iov;
+ size_t done;
+ pmap_t pmap;
+ int error, len;
+
+ if (sg->sg_maxseg == 0)
+ return (EINVAL);
+
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ KASSERT(uio->uio_td != NULL,
+ ("sglist_consume_uio: USERSPACE but no thread"));
+ pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
+ } else
+ pmap = NULL;
+
+ error = 0;
+ while (resid > 0 && uio->uio_resid) {
+ iov = uio->uio_iov;
+ len = iov->iov_len;
+ if (len == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ continue;
+ }
+ if (len > resid)
+ len = resid;
+
+ /*
+ * Try to append this iovec. If we run out of room,
+ * then break out of the loop.
+ */
+ error = _sglist_append_buf(sg, iov->iov_base, len, pmap, &done);
+ iov->iov_base = (char *)iov->iov_base + done;
+ iov->iov_len -= done;
+ uio->uio_resid -= done;
+ uio->uio_offset += done;
+ resid -= done;
+ if (error)
+ break;
+ }
+ return (0);
+}
+
+/*
+ * Allocate and populate a scatter/gather list to describe a single
+ * kernel virtual address range.
+ */
+struct sglist *
+sglist_build(void *buf, size_t len, int mflags)
+{
+ struct sglist *sg;
+ int nsegs;
+
+ if (len == 0)
+ return (NULL);
+
+ nsegs = sglist_count(buf, len);
+ sg = sglist_alloc(nsegs, mflags);
+ if (sg == NULL)
+ return (NULL);
+ if (sglist_append(sg, buf, len) != 0) {
+ sglist_free(sg);
+ return (NULL);
+ }
+ return (sg);
+}
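+
+/*
+ * Editor's note: an illustrative sketch, not part of this change; 'buf' and
+ * 'len' are a hypothetical kernel buffer.
+ *
+ *	struct sglist *sg;
+ *
+ *	sg = sglist_build(buf, len, M_WAITOK);
+ *	KASSERT(sglist_length(sg) == len, ("short sglist"));
+ *	... hand sg->sg_segs[0 .. sg->sg_nseg - 1] to the device ...
+ *	sglist_free(sg);
+ */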
+
+/*
+ * Clone a new copy of a scatter/gather list.
+ */
+struct sglist *
+sglist_clone(struct sglist *sg, int mflags)
+{
+ struct sglist *new;
+
+ if (sg == NULL)
+ return (NULL);
+ new = sglist_alloc(sg->sg_maxseg, mflags);
+ if (new == NULL)
+ return (NULL);
+ new->sg_nseg = sg->sg_nseg;
+ bcopy(sg->sg_segs, new->sg_segs, sizeof(struct sglist_seg) *
+ sg->sg_nseg);
+ return (new);
+}
+
+/*
+ * Calculate the total length of the segments described in a
+ * scatter/gather list.
+ */
+size_t
+sglist_length(struct sglist *sg)
+{
+ size_t space;
+ int i;
+
+ space = 0;
+ for (i = 0; i < sg->sg_nseg; i++)
+ space += sg->sg_segs[i].ss_len;
+ return (space);
+}
+
+/*
+ * Split a scatter/gather list into two lists. The scatter/gather
+ * entries for the first 'length' bytes of the 'original' list are
+ * stored in the '*head' list and are removed from 'original'.
+ *
+ * If '*head' is NULL, then a new list will be allocated using
+ * 'mflags'. If M_NOWAIT is specified and the allocation fails,
+ * ENOMEM will be returned.
+ *
+ * If '*head' is not NULL, it should point to an empty sglist. If it
+ * does not have enough room for the remaining space, then EFBIG will
+ * be returned. If '*head' is not empty, then EINVAL will be
+ * returned.
+ *
+ * If 'original' is shared (refcount > 1), then EDOOFUS will be
+ * returned.
+ */
+int
+sglist_split(struct sglist *original, struct sglist **head, size_t length,
+ int mflags)
+{
+ struct sglist *sg;
+ size_t space, split;
+ int count, i;
+
+ if (original->sg_refs > 1)
+ return (EDOOFUS);
+
+ /* Figure out how big of a sglist '*head' has to hold. */
+ count = 0;
+ space = 0;
+ split = 0;
+ for (i = 0; i < original->sg_nseg; i++) {
+ space += original->sg_segs[i].ss_len;
+ count++;
+ if (space >= length) {
+ /*
+ * If 'length' falls in the middle of a
+ * scatter/gather list entry, then 'split'
+ * holds how much of that entry will remain in
+ * 'original'.
+ */
+ split = space - length;
+ break;
+ }
+ }
+
+ /* Nothing to do, so leave head empty. */
+ if (count == 0)
+ return (0);
+
+ if (*head == NULL) {
+ sg = sglist_alloc(count, mflags);
+ if (sg == NULL)
+ return (ENOMEM);
+ *head = sg;
+ } else {
+ sg = *head;
+ if (sg->sg_maxseg < count)
+ return (EFBIG);
+ if (sg->sg_nseg != 0)
+ return (EINVAL);
+ }
+
+ /* Copy 'count' entries to 'sg' from 'original'. */
+ bcopy(original->sg_segs, sg->sg_segs, count *
+ sizeof(struct sglist_seg));
+ sg->sg_nseg = count;
+
+ /*
+ * If we had to split a list entry, fixup the last entry in
+ * 'sg' and the new first entry in 'original'. We also
+ * decrement 'count' by 1 since we will only be removing
+ * 'count - 1' segments from 'original' now.
+ */
+ if (split != 0) {
+ count--;
+ sg->sg_segs[count].ss_len -= split;
+ original->sg_segs[count].ss_paddr =
+ sg->sg_segs[count].ss_paddr + split;
+ original->sg_segs[count].ss_len = split;
+ }
+
+ /* Trim 'count' entries from the front of 'original'. */
+ original->sg_nseg -= count;
+ bcopy(original->sg_segs + count, original->sg_segs, count *
+ sizeof(struct sglist_seg));
+ return (0);
+}
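+
+/*
+ * Editor's note: a worked example, not part of this change.  If 'original'
+ * holds two 4KB segments and 'length' is 6KB, '*head' receives both
+ * segments with the second one trimmed to 2KB, while 'original' is left
+ * with a single 2KB segment covering the tail of its old second segment.
+ */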
+
+/*
+ * Append the scatter/gather list elements in 'second' to the
+ * scatter/gather list 'first'. If there is not enough space in
+ * 'first', EFBIG is returned.
+ */
+int
+sglist_join(struct sglist *first, struct sglist *second)
+{
+ struct sglist_seg *flast, *sfirst;
+ int append;
+
+ /* If 'second' is empty, there is nothing to do. */
+ if (second->sg_nseg == 0)
+ return (0);
+
+ /*
+ * If the first entry in 'second' can be appended to the last entry
+ * in 'first' then set append to '1'.
+ */
+ append = 0;
+ flast = &first->sg_segs[first->sg_nseg - 1];
+ sfirst = &second->sg_segs[0];
+ if (first->sg_nseg != 0 &&
+ flast->ss_paddr + flast->ss_len == sfirst->ss_paddr)
+ append = 1;
+
+ /* Make sure 'first' has enough room. */
+ if (first->sg_nseg + second->sg_nseg - append > first->sg_maxseg)
+ return (EFBIG);
+
+ /* Merge last in 'first' and first in 'second' if needed. */
+ if (append)
+ flast->ss_len += sfirst->ss_len;
+
+ /* Append new segments from 'second' to 'first'. */
+	bcopy(second->sg_segs + append, first->sg_segs + first->sg_nseg,
+	    (second->sg_nseg - append) * sizeof(struct sglist_seg));
+ first->sg_nseg += second->sg_nseg - append;
+ sglist_reset(second);
+ return (0);
+}
+
+/*
+ * Generate a new scatter/gather list from a range of an existing
+ * scatter/gather list. The 'offset' and 'length' parameters specify
+ * the logical range of the 'original' list to extract. If that range
+ * is not a subset of the length of 'original', then EINVAL is
+ * returned. The new scatter/gather list is stored in '*slice'.
+ *
+ * If '*slice' is NULL, then a new list will be allocated using
+ * 'mflags'. If M_NOWAIT is specified and the allocation fails,
+ * ENOMEM will be returned.
+ *
+ * If '*slice' is not NULL, it should point to an empty sglist. If it
+ * does not have enough room for the remaining space, then EFBIG will
+ * be returned. If '*slice' is not empty, then EINVAL will be
+ * returned.
+ */
+int
+sglist_slice(struct sglist *original, struct sglist **slice, size_t offset,
+ size_t length, int mflags)
+{
+ struct sglist *sg;
+ size_t space, end, foffs, loffs;
+ int count, i, fseg;
+
+ /* Nothing to do. */
+ if (length == 0)
+ return (0);
+
+ /* Figure out how many segments '*slice' needs to have. */
+ end = offset + length;
+ space = 0;
+ count = 0;
+ fseg = 0;
+ foffs = loffs = 0;
+ for (i = 0; i < original->sg_nseg; i++) {
+ space += original->sg_segs[i].ss_len;
+ if (space > offset) {
+ /*
+ * When we hit the first segment, store its index
+ * in 'fseg' and the offset into the first segment
+ * of 'offset' in 'foffs'.
+ */
+ if (count == 0) {
+ fseg = i;
+ foffs = offset - (space -
+ original->sg_segs[i].ss_len);
+ CTR1(KTR_DEV, "sglist_slice: foffs = %08lx",
+ foffs);
+ }
+ count++;
+
+ /*
+ * When we hit the last segment, break out of
+ * the loop. Store the amount of extra space
+ * at the end of this segment in 'loffs'.
+ */
+ if (space >= end) {
+ loffs = space - end;
+ CTR1(KTR_DEV, "sglist_slice: loffs = %08lx",
+ loffs);
+ break;
+ }
+ }
+ }
+
+ /* If we never hit 'end', then 'length' ran off the end, so fail. */
+ if (space < end)
+ return (EINVAL);
+
+ if (*slice == NULL) {
+ sg = sglist_alloc(count, mflags);
+ if (sg == NULL)
+ return (ENOMEM);
+ *slice = sg;
+ } else {
+ sg = *slice;
+ if (sg->sg_maxseg < count)
+ return (EFBIG);
+ if (sg->sg_nseg != 0)
+ return (EINVAL);
+ }
+
+ /*
+ * Copy over 'count' segments from 'original' starting at
+ * 'fseg' to 'sg'.
+ */
+ bcopy(original->sg_segs + fseg, sg->sg_segs,
+ count * sizeof(struct sglist_seg));
+ sg->sg_nseg = count;
+
+ /* Fixup first and last segments if needed. */
+ if (foffs != 0) {
+ sg->sg_segs[0].ss_paddr += foffs;
+ sg->sg_segs[0].ss_len -= foffs;
+ CTR2(KTR_DEV, "sglist_slice seg[0]: %08lx:%08lx",
+ (long)sg->sg_segs[0].ss_paddr, sg->sg_segs[0].ss_len);
+ }
+ if (loffs != 0) {
+ sg->sg_segs[count - 1].ss_len -= loffs;
+ CTR2(KTR_DEV, "sglist_slice seg[%d]: len %08x", count - 1,
+ sg->sg_segs[count - 1].ss_len);
+ }
+ return (0);
+}
diff --git a/sys/kern/subr_sleepqueue.c b/sys/kern/subr_sleepqueue.c
new file mode 100644
index 0000000..92b5147
--- /dev/null
+++ b/sys/kern/subr_sleepqueue.c
@@ -0,0 +1,1236 @@
+/*-
+ * Copyright (c) 2004 John Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Implementation of sleep queues used to hold queue of threads blocked on
+ * a wait channel. Sleep queues differ from turnstiles in that wait
+ * channels are not owned by anyone, so there is no priority propagation.
+ * Sleep queues can also provide a timeout and can be interrupted by
+ * signals. That said, there are several similarities between the turnstile
+ * and sleep queue implementations. (Note: turnstiles were implemented
+ * first.) For example, both use a hash table of the same size where each
+ * bucket is referred to as a "chain" that contains both a spin lock and
+ * a linked list of queues. An individual queue is located by using a hash
+ * to pick a chain, locking the chain, and then walking the chain searching
+ * for the queue. This means that a wait channel object does not need to
+ * embed its queue head just as locks do not embed their turnstile queue
+ * head. Threads also carry around a sleep queue that they lend to the
+ * wait channel when blocking. Just as in turnstiles, the queue includes
+ * a free list of the sleep queues of other threads blocked on the same
+ * wait channel in the case of multiple waiters.
+ *
+ * Some additional functionality provided by sleep queues include the
+ * ability to set a timeout. The timeout is managed using a per-thread
+ * callout that resumes a thread if it is asleep. A thread may also
+ * catch signals while it is asleep (aka an interruptible sleep). The
+ * signal code uses sleepq_abort() to interrupt a sleeping thread. Finally,
+ * sleep queues also provide some extra assertions. One is not allowed to
+ * mix the sleep/wakeup and cv APIs for a given wait channel. Also, one
+ * must consistently use the same lock to synchronize with a wait channel,
+ * though this check is currently only a warning for sleep/wakeup due to
+ * pre-existing abuse of that API. The same lock must also be held when
+ * awakening threads, though that is currently only enforced for condition
+ * variables.
+ */
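+
+/*
+ * Editor's note: an illustrative sketch, not part of this change, of how a
+ * higher-level primitive typically uses this API.  'lock', 'pri' and the
+ * SLEEPQ_* type flag in 'flags' are assumed to come from the caller and
+ * from sys/sleepqueue.h.
+ *
+ *	Going to sleep (the chain lock is released as part of the switch):
+ *		sleepq_lock(wchan);
+ *		sleepq_add(wchan, lock, "wmesg", flags, 0);
+ *		sleepq_wait(wchan, pri);
+ *
+ *	Waking everyone up:
+ *		sleepq_lock(wchan);
+ *		wakeup_swapper = sleepq_broadcast(wchan, flags, pri, 0);
+ *		sleepq_release(wchan);
+ *		if (wakeup_swapper)
+ *			kick_proc0();
+ */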
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_sleepqueue_profiling.h"
+#include "opt_ddb.h"
+#include "opt_kdtrace.h"
+#include "opt_sched.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/signalvar.h>
+#include <sys/sleepqueue.h>
+#include <sys/sysctl.h>
+
+#include <vm/uma.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/*
+ * Constants for the hash table of sleep queue chains.
+ * SC_TABLESIZE must be a power of two for SC_MASK to work properly.
+ */
+#define SC_TABLESIZE 256 /* Must be power of 2. */
+#define SC_MASK (SC_TABLESIZE - 1)
+#define SC_SHIFT 8
+#define SC_HASH(wc) ((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \
+ SC_MASK)
+#define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)]
+#define NR_SLEEPQS 2
+/*
+ * There are two different lists of sleep queues. Both lists are connected
+ * via the sq_hash entries. The first list is the sleep queue chain list
+ * that a sleep queue is on when it is attached to a wait channel. The
+ * second list is the free list hung off of a sleep queue that is attached
+ * to a wait channel.
+ *
+ * Each sleep queue also contains the wait channel it is attached to, the
+ * list of threads blocked on that wait channel, flags specific to the
+ * wait channel, and the lock used to synchronize with a wait channel.
+ * The flags are used to catch mismatches between the various consumers
+ * of the sleep queue API (e.g. sleep/wakeup and condition variables).
+ * The lock pointer is only used when invariants are enabled for various
+ * debugging checks.
+ *
+ * Locking key:
+ * c - sleep queue chain lock
+ */
+struct sleepqueue {
+ TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */
+ u_int sq_blockedcnt[NR_SLEEPQS]; /* (c) N. of blocked threads. */
+ LIST_ENTRY(sleepqueue) sq_hash; /* (c) Chain and free list. */
+ LIST_HEAD(, sleepqueue) sq_free; /* (c) Free queues. */
+ void *sq_wchan; /* (c) Wait channel. */
+ int sq_type; /* (c) Queue type. */
+#ifdef INVARIANTS
+ struct lock_object *sq_lock; /* (c) Associated lock. */
+#endif
+};
+
+struct sleepqueue_chain {
+ LIST_HEAD(, sleepqueue) sc_queues; /* List of sleep queues. */
+ struct mtx sc_lock; /* Spin lock for this chain. */
+#ifdef SLEEPQUEUE_PROFILING
+ u_int sc_depth; /* Length of sc_queues. */
+ u_int sc_max_depth; /* Max length of sc_queues. */
+#endif
+};
+
+#ifdef SLEEPQUEUE_PROFILING
+u_int sleepq_max_depth;
+static SYSCTL_NODE(_debug, OID_AUTO, sleepq, CTLFLAG_RD, 0, "sleepq profiling");
+static SYSCTL_NODE(_debug_sleepq, OID_AUTO, chains, CTLFLAG_RD, 0,
+ "sleepq chain stats");
+SYSCTL_UINT(_debug_sleepq, OID_AUTO, max_depth, CTLFLAG_RD, &sleepq_max_depth,
+    0, "maximum depth achieved of a single chain");
+
+static void sleepq_profile(const char *wmesg);
+static int prof_enabled;
+#endif
+static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE];
+static uma_zone_t sleepq_zone;
+
+/*
+ * Prototypes for non-exported routines.
+ */
+static int sleepq_catch_signals(void *wchan, int pri);
+static int sleepq_check_signals(void);
+static int sleepq_check_timeout(void);
+#ifdef INVARIANTS
+static void sleepq_dtor(void *mem, int size, void *arg);
+#endif
+static int sleepq_init(void *mem, int size, int flags);
+static int sleepq_resume_thread(struct sleepqueue *sq, struct thread *td,
+ int pri);
+static void sleepq_switch(void *wchan, int pri);
+static void sleepq_timeout(void *arg);
+
+SDT_PROBE_DECLARE(sched, , , sleep);
+SDT_PROBE_DECLARE(sched, , , wakeup);
+
+/*
+ * Early initialization of sleep queues that is called from the sleepinit()
+ * SYSINIT.
+ */
+void
+init_sleepqueues(void)
+{
+#ifdef SLEEPQUEUE_PROFILING
+ struct sysctl_oid *chain_oid;
+ char chain_name[10];
+#endif
+ int i;
+
+ for (i = 0; i < SC_TABLESIZE; i++) {
+ LIST_INIT(&sleepq_chains[i].sc_queues);
+ mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL,
+ MTX_SPIN | MTX_RECURSE);
+#ifdef SLEEPQUEUE_PROFILING
+ snprintf(chain_name, sizeof(chain_name), "%d", i);
+ chain_oid = SYSCTL_ADD_NODE(NULL,
+ SYSCTL_STATIC_CHILDREN(_debug_sleepq_chains), OID_AUTO,
+ chain_name, CTLFLAG_RD, NULL, "sleepq chain stats");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
+ "depth", CTLFLAG_RD, &sleepq_chains[i].sc_depth, 0, NULL);
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
+ "max_depth", CTLFLAG_RD, &sleepq_chains[i].sc_max_depth, 0,
+ NULL);
+#endif
+ }
+ sleepq_zone = uma_zcreate("SLEEPQUEUE", sizeof(struct sleepqueue),
+#ifdef INVARIANTS
+ NULL, sleepq_dtor, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
+#else
+ NULL, NULL, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
+#endif
+
+ thread0.td_sleepqueue = sleepq_alloc();
+}
+
+/*
+ * Get a sleep queue for a new thread.
+ */
+struct sleepqueue *
+sleepq_alloc(void)
+{
+
+ return (uma_zalloc(sleepq_zone, M_WAITOK));
+}
+
+/*
+ * Free a sleep queue when a thread is destroyed.
+ */
+void
+sleepq_free(struct sleepqueue *sq)
+{
+
+ uma_zfree(sleepq_zone, sq);
+}
+
+/*
+ * Lock the sleep queue chain associated with the specified wait channel.
+ */
+void
+sleepq_lock(void *wchan)
+{
+ struct sleepqueue_chain *sc;
+
+ sc = SC_LOOKUP(wchan);
+ mtx_lock_spin(&sc->sc_lock);
+}
+
+/*
+ * Look up the sleep queue associated with a given wait channel in the hash
+ * table.  The associated sleep queue chain must already be locked by the
+ * caller.  If no queue is found in the table, NULL is returned.
+ */
+struct sleepqueue *
+sleepq_lookup(void *wchan)
+{
+ struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
+
+ KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
+ sc = SC_LOOKUP(wchan);
+ mtx_assert(&sc->sc_lock, MA_OWNED);
+ LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
+ if (sq->sq_wchan == wchan)
+ return (sq);
+ return (NULL);
+}
+
+/*
+ * Unlock the sleep queue chain associated with a given wait channel.
+ */
+void
+sleepq_release(void *wchan)
+{
+ struct sleepqueue_chain *sc;
+
+ sc = SC_LOOKUP(wchan);
+ mtx_unlock_spin(&sc->sc_lock);
+}
+
+/*
+ * Places the current thread on the sleep queue for the specified wait
+ * channel. If INVARIANTS is enabled, then it associates the passed in
+ * lock with the sleepq to make sure it is held when that sleep queue is
+ * woken up.
+ */
+void
+sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg, int flags,
+ int queue)
+{
+ struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
+ struct thread *td;
+
+ td = curthread;
+ sc = SC_LOOKUP(wchan);
+ mtx_assert(&sc->sc_lock, MA_OWNED);
+ MPASS(td->td_sleepqueue != NULL);
+ MPASS(wchan != NULL);
+ MPASS((queue >= 0) && (queue < NR_SLEEPQS));
+
+ /* If this thread is not allowed to sleep, die a horrible death. */
+ KASSERT(td->td_no_sleeping == 0,
+ ("%s: td %p to sleep on wchan %p with sleeping prohibited",
+ __func__, td, wchan));
+
+ /* Look up the sleep queue associated with the wait channel 'wchan'. */
+ sq = sleepq_lookup(wchan);
+
+ /*
+ * If the wait channel does not already have a sleep queue, use
+ * this thread's sleep queue. Otherwise, insert the current thread
+ * into the sleep queue already in use by this wait channel.
+ */
+ if (sq == NULL) {
+#ifdef INVARIANTS
+ int i;
+
+ sq = td->td_sleepqueue;
+ for (i = 0; i < NR_SLEEPQS; i++) {
+ KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]),
+ ("thread's sleep queue %d is not empty", i));
+ KASSERT(sq->sq_blockedcnt[i] == 0,
+ ("thread's sleep queue %d count mismatches", i));
+ }
+ KASSERT(LIST_EMPTY(&sq->sq_free),
+ ("thread's sleep queue has a non-empty free list"));
+ KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer"));
+ sq->sq_lock = lock;
+#endif
+#ifdef SLEEPQUEUE_PROFILING
+ sc->sc_depth++;
+ if (sc->sc_depth > sc->sc_max_depth) {
+ sc->sc_max_depth = sc->sc_depth;
+ if (sc->sc_max_depth > sleepq_max_depth)
+ sleepq_max_depth = sc->sc_max_depth;
+ }
+#endif
+ sq = td->td_sleepqueue;
+ LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash);
+ sq->sq_wchan = wchan;
+ sq->sq_type = flags & SLEEPQ_TYPE;
+ } else {
+ MPASS(wchan == sq->sq_wchan);
+ MPASS(lock == sq->sq_lock);
+ MPASS((flags & SLEEPQ_TYPE) == sq->sq_type);
+ LIST_INSERT_HEAD(&sq->sq_free, td->td_sleepqueue, sq_hash);
+ }
+ thread_lock(td);
+ TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq);
+ sq->sq_blockedcnt[queue]++;
+ td->td_sleepqueue = NULL;
+ td->td_sqqueue = queue;
+ td->td_wchan = wchan;
+ td->td_wmesg = wmesg;
+ if (flags & SLEEPQ_INTERRUPTIBLE) {
+ td->td_flags |= TDF_SINTR;
+ td->td_flags &= ~TDF_SLEEPABORT;
+ }
+ thread_unlock(td);
+}
+
+/*
+ * Sets a timeout that will remove the current thread from the specified
+ * sleep queue after timo ticks if the thread has not already been awakened.
+ */
+void
+sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr,
+ int flags)
+{
+ struct sleepqueue_chain *sc;
+ struct thread *td;
+
+ td = curthread;
+ sc = SC_LOOKUP(wchan);
+ mtx_assert(&sc->sc_lock, MA_OWNED);
+ MPASS(TD_ON_SLEEPQ(td));
+ MPASS(td->td_sleepqueue == NULL);
+ MPASS(wchan != NULL);
+ callout_reset_sbt_on(&td->td_slpcallout, sbt, pr,
+ sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC);
+}
+
+/*
+ * Return the number of actual sleepers for the specified queue.
+ */
+u_int
+sleepq_sleepcnt(void *wchan, int queue)
+{
+ struct sleepqueue *sq;
+
+ KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
+ MPASS((queue >= 0) && (queue < NR_SLEEPQS));
+ sq = sleepq_lookup(wchan);
+ if (sq == NULL)
+ return (0);
+ return (sq->sq_blockedcnt[queue]);
+}
+
+/*
+ * Marks the pending sleep of the current thread as interruptible and
+ * makes an initial check for pending signals before putting a thread
+ * to sleep. Enters and exits with the thread lock held. Thread lock
+ * may have transitioned from the sleepq lock to a run lock.
+ */
+static int
+sleepq_catch_signals(void *wchan, int pri)
+{
+ struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
+ struct thread *td;
+ struct proc *p;
+ struct sigacts *ps;
+ int sig, ret;
+
+ td = curthread;
+ p = curproc;
+ sc = SC_LOOKUP(wchan);
+ mtx_assert(&sc->sc_lock, MA_OWNED);
+ MPASS(wchan != NULL);
+ if ((td->td_pflags & TDP_WAKEUP) != 0) {
+ td->td_pflags &= ~TDP_WAKEUP;
+ ret = EINTR;
+ thread_lock(td);
+ goto out;
+ }
+
+ /*
+ * See if there are any pending signals for this thread. If not
+ * we can switch immediately. Otherwise do the signal processing
+ * directly.
+ */
+ thread_lock(td);
+ if ((td->td_flags & (TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK)) == 0) {
+ sleepq_switch(wchan, pri);
+ return (0);
+ }
+ thread_unlock(td);
+ mtx_unlock_spin(&sc->sc_lock);
+ CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)",
+ (void *)td, (long)p->p_pid, td->td_name);
+ PROC_LOCK(p);
+ ps = p->p_sigacts;
+ mtx_lock(&ps->ps_mtx);
+ sig = cursig(td);
+ if (sig == 0) {
+ mtx_unlock(&ps->ps_mtx);
+ ret = thread_suspend_check(1);
+ MPASS(ret == 0 || ret == EINTR || ret == ERESTART);
+ } else {
+ if (SIGISMEMBER(ps->ps_sigintr, sig))
+ ret = EINTR;
+ else
+ ret = ERESTART;
+ mtx_unlock(&ps->ps_mtx);
+ }
+ /*
+ * Lock the per-process spinlock prior to dropping the PROC_LOCK
+ * to avoid a signal delivery race. PROC_LOCK, PROC_SLOCK, and
+ * thread_lock() are currently held in tdsendsignal().
+ */
+ PROC_SLOCK(p);
+ mtx_lock_spin(&sc->sc_lock);
+ PROC_UNLOCK(p);
+ thread_lock(td);
+ PROC_SUNLOCK(p);
+ if (ret == 0) {
+ sleepq_switch(wchan, pri);
+ return (0);
+ }
+out:
+ /*
+ * There were pending signals and this thread is still
+ * on the sleep queue, remove it from the sleep queue.
+ */
+ if (TD_ON_SLEEPQ(td)) {
+ sq = sleepq_lookup(wchan);
+ if (sleepq_resume_thread(sq, td, 0)) {
+#ifdef INVARIANTS
+ /*
+ * This thread hasn't gone to sleep yet, so it
+ * should not be swapped out.
+ */
+ panic("not waking up swapper");
+#endif
+ }
+ }
+ mtx_unlock_spin(&sc->sc_lock);
+ MPASS(td->td_lock != &sc->sc_lock);
+ return (ret);
+}
+
+/*
+ * Switches to another thread if we are still asleep on a sleep queue.
+ * Returns with the thread lock held.
+ */
+static void
+sleepq_switch(void *wchan, int pri)
+{
+ struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
+ struct thread *td;
+
+ td = curthread;
+ sc = SC_LOOKUP(wchan);
+ mtx_assert(&sc->sc_lock, MA_OWNED);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ /*
+ * If we have a sleep queue, then we've already been woken up, so
+ * just return.
+ */
+ if (td->td_sleepqueue != NULL) {
+ mtx_unlock_spin(&sc->sc_lock);
+ return;
+ }
+
+ /*
+ * If TDF_TIMEOUT is set, then our sleep has been timed out
+ * already but we are still on the sleep queue, so dequeue the
+ * thread and return.
+ */
+ if (td->td_flags & TDF_TIMEOUT) {
+ MPASS(TD_ON_SLEEPQ(td));
+ sq = sleepq_lookup(wchan);
+ if (sleepq_resume_thread(sq, td, 0)) {
+#ifdef INVARIANTS
+ /*
+ * This thread hasn't gone to sleep yet, so it
+ * should not be swapped out.
+ */
+ panic("not waking up swapper");
+#endif
+ }
+ mtx_unlock_spin(&sc->sc_lock);
+ return;
+ }
+#ifdef SLEEPQUEUE_PROFILING
+ if (prof_enabled)
+ sleepq_profile(td->td_wmesg);
+#endif
+ MPASS(td->td_sleepqueue == NULL);
+ sched_sleep(td, pri);
+ thread_lock_set(td, &sc->sc_lock);
+ SDT_PROBE0(sched, , , sleep);
+ TD_SET_SLEEPING(td);
+ mi_switch(SW_VOL | SWT_SLEEPQ, NULL);
+ KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));
+ CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)",
+ (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
+}
+
+/*
+ * Check to see if we timed out.
+ */
+static int
+sleepq_check_timeout(void)
+{
+ struct thread *td;
+
+ td = curthread;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ /*
+ * If TDF_TIMEOUT is set, we timed out.
+ */
+ if (td->td_flags & TDF_TIMEOUT) {
+ td->td_flags &= ~TDF_TIMEOUT;
+ return (EWOULDBLOCK);
+ }
+
+ /*
+ * If TDF_TIMOFAIL is set, the timeout ran after we had
+ * already been woken up.
+ */
+ if (td->td_flags & TDF_TIMOFAIL)
+ td->td_flags &= ~TDF_TIMOFAIL;
+
+ /*
+ * If callout_stop() fails, then the timeout is running on
+ * another CPU, so synchronize with it to avoid having it
+ * accidentally wake up a subsequent sleep.
+ */
+ else if (callout_stop(&td->td_slpcallout) == 0) {
+ td->td_flags |= TDF_TIMEOUT;
+ TD_SET_SLEEPING(td);
+ mi_switch(SW_INVOL | SWT_SLEEPQTIMO, NULL);
+ }
+ return (0);
+}
+
+/*
+ * Check to see if we were awoken by a signal.
+ */
+static int
+sleepq_check_signals(void)
+{
+ struct thread *td;
+
+ td = curthread;
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+ /* We are no longer in an interruptible sleep. */
+ if (td->td_flags & TDF_SINTR)
+ td->td_flags &= ~TDF_SINTR;
+
+ if (td->td_flags & TDF_SLEEPABORT) {
+ td->td_flags &= ~TDF_SLEEPABORT;
+ return (td->td_intrval);
+ }
+
+ return (0);
+}
+
+/*
+ * Block the current thread until it is awakened from its sleep queue.
+ */
+void
+sleepq_wait(void *wchan, int pri)
+{
+ struct thread *td;
+
+ td = curthread;
+ MPASS(!(td->td_flags & TDF_SINTR));
+ thread_lock(td);
+ sleepq_switch(wchan, pri);
+ thread_unlock(td);
+}
+
+/*
+ * Block the current thread until it is awakened from its sleep queue
+ * or it is interrupted by a signal.
+ */
+int
+sleepq_wait_sig(void *wchan, int pri)
+{
+ int rcatch;
+ int rval;
+
+ rcatch = sleepq_catch_signals(wchan, pri);
+ rval = sleepq_check_signals();
+ thread_unlock(curthread);
+ if (rcatch)
+ return (rcatch);
+ return (rval);
+}
+
+/*
+ * Block the current thread until it is awakened from its sleep queue
+ * or it times out while waiting.
+ */
+int
+sleepq_timedwait(void *wchan, int pri)
+{
+ struct thread *td;
+ int rval;
+
+ td = curthread;
+ MPASS(!(td->td_flags & TDF_SINTR));
+ thread_lock(td);
+ sleepq_switch(wchan, pri);
+ rval = sleepq_check_timeout();
+ thread_unlock(td);
+
+ return (rval);
+}
+
+/*
+ * Block the current thread until it is awakened from its sleep queue,
+ * it is interrupted by a signal, or it times out waiting to be awakened.
+ */
+int
+sleepq_timedwait_sig(void *wchan, int pri)
+{
+ int rcatch, rvalt, rvals;
+
+ rcatch = sleepq_catch_signals(wchan, pri);
+ rvalt = sleepq_check_timeout();
+ rvals = sleepq_check_signals();
+ thread_unlock(curthread);
+ if (rcatch)
+ return (rcatch);
+ if (rvals)
+ return (rvals);
+ return (rvalt);
+}
+
+/*
+ * Returns the type of sleep queue given a wait channel.
+ */
+int
+sleepq_type(void *wchan)
+{
+ struct sleepqueue *sq;
+ int type;
+
+ MPASS(wchan != NULL);
+
+ sleepq_lock(wchan);
+ sq = sleepq_lookup(wchan);
+ if (sq == NULL) {
+ sleepq_release(wchan);
+ return (-1);
+ }
+ type = sq->sq_type;
+ sleepq_release(wchan);
+ return (type);
+}
+
+/*
+ * Removes a thread from a sleep queue and makes it
+ * runnable.
+ */
+static int
+sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri)
+{
+ struct sleepqueue_chain *sc;
+
+ MPASS(td != NULL);
+ MPASS(sq->sq_wchan != NULL);
+ MPASS(td->td_wchan == sq->sq_wchan);
+ MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0);
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ sc = SC_LOOKUP(sq->sq_wchan);
+ mtx_assert(&sc->sc_lock, MA_OWNED);
+
+ SDT_PROBE2(sched, , , wakeup, td, td->td_proc);
+
+ /* Remove the thread from the queue. */
+ sq->sq_blockedcnt[td->td_sqqueue]--;
+ TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq);
+
+ /*
+ * Get a sleep queue for this thread. If this is the last waiter,
+ * use the queue itself and take it out of the chain, otherwise,
+ * remove a queue from the free list.
+ */
+ if (LIST_EMPTY(&sq->sq_free)) {
+ td->td_sleepqueue = sq;
+#ifdef INVARIANTS
+ sq->sq_wchan = NULL;
+#endif
+#ifdef SLEEPQUEUE_PROFILING
+ sc->sc_depth--;
+#endif
+ } else
+ td->td_sleepqueue = LIST_FIRST(&sq->sq_free);
+ LIST_REMOVE(td->td_sleepqueue, sq_hash);
+
+ td->td_wmesg = NULL;
+ td->td_wchan = NULL;
+ td->td_flags &= ~TDF_SINTR;
+
+ CTR3(KTR_PROC, "sleepq_wakeup: thread %p (pid %ld, %s)",
+ (void *)td, (long)td->td_proc->p_pid, td->td_name);
+
+ /* Adjust priority if requested. */
+ MPASS(pri == 0 || (pri >= PRI_MIN && pri <= PRI_MAX));
+ if (pri != 0 && td->td_priority > pri &&
+ PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
+ sched_prio(td, pri);
+
+ /*
+ * Note that thread td might not be sleeping if it is running
+ * sleepq_catch_signals() on another CPU or is blocked on its
+ * proc lock to check signals. There's no need to mark the
+ * thread runnable in that case.
+ */
+ if (TD_IS_SLEEPING(td)) {
+ TD_CLR_SLEEPING(td);
+ return (setrunnable(td));
+ }
+ return (0);
+}
+
+#ifdef INVARIANTS
+/*
+ * UMA zone item deallocator.
+ */
+static void
+sleepq_dtor(void *mem, int size, void *arg)
+{
+ struct sleepqueue *sq;
+ int i;
+
+ sq = mem;
+ for (i = 0; i < NR_SLEEPQS; i++) {
+ MPASS(TAILQ_EMPTY(&sq->sq_blocked[i]));
+ MPASS(sq->sq_blockedcnt[i] == 0);
+ }
+}
+#endif
+
+/*
+ * UMA zone item initializer.
+ */
+static int
+sleepq_init(void *mem, int size, int flags)
+{
+ struct sleepqueue *sq;
+ int i;
+
+ bzero(mem, size);
+ sq = mem;
+ for (i = 0; i < NR_SLEEPQS; i++) {
+ TAILQ_INIT(&sq->sq_blocked[i]);
+ sq->sq_blockedcnt[i] = 0;
+ }
+ LIST_INIT(&sq->sq_free);
+ return (0);
+}
+
+/*
+ * Find the highest priority thread sleeping on a wait channel and resume it.
+ */
+int
+sleepq_signal(void *wchan, int flags, int pri, int queue)
+{
+ struct sleepqueue *sq;
+ struct thread *td, *besttd;
+ int wakeup_swapper;
+
+ CTR2(KTR_PROC, "sleepq_signal(%p, %d)", wchan, flags);
+ KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
+ MPASS((queue >= 0) && (queue < NR_SLEEPQS));
+ sq = sleepq_lookup(wchan);
+ if (sq == NULL)
+ return (0);
+ KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
+ ("%s: mismatch between sleep/wakeup and cv_*", __func__));
+
+ /*
+ * Find the highest priority thread on the queue. If there is a
+ * tie, use the thread that first appears in the queue as it has
+ * been sleeping the longest since threads are always added to
+ * the tail of sleep queues.
+ */
+ besttd = NULL;
+ TAILQ_FOREACH(td, &sq->sq_blocked[queue], td_slpq) {
+ if (besttd == NULL || td->td_priority < besttd->td_priority)
+ besttd = td;
+ }
+ MPASS(besttd != NULL);
+ thread_lock(besttd);
+ wakeup_swapper = sleepq_resume_thread(sq, besttd, pri);
+ thread_unlock(besttd);
+ return (wakeup_swapper);
+}
+
+/*
+ * Resume all threads sleeping on a specified wait channel.
+ */
+int
+sleepq_broadcast(void *wchan, int flags, int pri, int queue)
+{
+ struct sleepqueue *sq;
+ struct thread *td, *tdn;
+ int wakeup_swapper;
+
+ CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags);
+ KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
+ MPASS((queue >= 0) && (queue < NR_SLEEPQS));
+ sq = sleepq_lookup(wchan);
+ if (sq == NULL)
+ return (0);
+ KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
+ ("%s: mismatch between sleep/wakeup and cv_*", __func__));
+
+ /* Resume all blocked threads on the sleep queue. */
+ wakeup_swapper = 0;
+ TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, tdn) {
+ thread_lock(td);
+ if (sleepq_resume_thread(sq, td, pri))
+ wakeup_swapper = 1;
+ thread_unlock(td);
+ }
+ return (wakeup_swapper);
+}
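+
+/*
+ * Illustrative sketch (not part of the original file): a typical
+ * wakeup-side consumer of this API locks the wait channel, signals or
+ * broadcasts, releases the chain lock and only then kicks the swapper,
+ * roughly:
+ *
+ *	static void
+ *	example_wakeup(void *ident)
+ *	{
+ *		int wakeup_swapper;
+ *
+ *		sleepq_lock(ident);
+ *		wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0);
+ *		sleepq_release(ident);
+ *		if (wakeup_swapper)
+ *			kick_proc0();
+ *	}
+ *
+ * The function name above is hypothetical; SLEEPQ_SLEEP and queue 0 are
+ * the values normally used by the sleep(9)-style wrappers.
+ */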
+
+/*
+ * Time sleeping threads out. When the timeout expires, the thread is
+ * removed from the sleep queue and made runnable if it is still asleep.
+ */
+static void
+sleepq_timeout(void *arg)
+{
+ struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
+ struct thread *td;
+ void *wchan;
+ int wakeup_swapper;
+
+ td = arg;
+ wakeup_swapper = 0;
+ CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)",
+ (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
+
+ /*
+ * First, see if the thread is asleep and get the wait channel if
+ * it is.
+ */
+ thread_lock(td);
+ if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) {
+ wchan = td->td_wchan;
+ sc = SC_LOOKUP(wchan);
+ THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock);
+ sq = sleepq_lookup(wchan);
+ MPASS(sq != NULL);
+ td->td_flags |= TDF_TIMEOUT;
+ wakeup_swapper = sleepq_resume_thread(sq, td, 0);
+ thread_unlock(td);
+ if (wakeup_swapper)
+ kick_proc0();
+ return;
+ }
+
+ /*
+ * If the thread is on the SLEEPQ but isn't sleeping yet, it
+ * can either be on another CPU in between sleepq_add() and
+ * one of the sleepq_*wait*() routines or it can be in
+ * sleepq_catch_signals().
+ */
+ if (TD_ON_SLEEPQ(td)) {
+ td->td_flags |= TDF_TIMEOUT;
+ thread_unlock(td);
+ return;
+ }
+
+ /*
+ * Now check for the edge cases. First, if TDF_TIMEOUT is set,
+ * then the other thread has already yielded to us, so clear
+ * the flag and resume it. If TDF_TIMEOUT is not set, then we
+ * know that the other thread is not on a sleep queue, but it
+ * hasn't resumed execution yet. In that case, set TDF_TIMOFAIL
+ * to let it know that the timeout has already run and doesn't
+ * need to be canceled.
+ */
+ if (td->td_flags & TDF_TIMEOUT) {
+ MPASS(TD_IS_SLEEPING(td));
+ td->td_flags &= ~TDF_TIMEOUT;
+ TD_CLR_SLEEPING(td);
+ wakeup_swapper = setrunnable(td);
+ } else
+ td->td_flags |= TDF_TIMOFAIL;
+ thread_unlock(td);
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+/*
+ * Resumes a specific thread from the sleep queue associated with a specific
+ * wait channel if it is on that queue.
+ */
+void
+sleepq_remove(struct thread *td, void *wchan)
+{
+ struct sleepqueue *sq;
+ int wakeup_swapper;
+
+ /*
+ * Look up the sleep queue for this wait channel, then re-check
+ * that the thread is asleep on that channel; if it is not,
+ * bail.
+ */
+ MPASS(wchan != NULL);
+ sleepq_lock(wchan);
+ sq = sleepq_lookup(wchan);
+ /*
+ * We cannot lock the thread here, as it may be sleeping on a
+ * different sleepq. However, holding the sleepq lock for this
+ * wchan can guarantee that we do not miss a wakeup for this
+ * channel. The asserts below will catch any false positives.
+ */
+ if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) {
+ sleepq_release(wchan);
+ return;
+ }
+ /* Thread is asleep on sleep queue sq, so wake it up. */
+ thread_lock(td);
+ MPASS(sq != NULL);
+ MPASS(td->td_wchan == wchan);
+ wakeup_swapper = sleepq_resume_thread(sq, td, 0);
+ thread_unlock(td);
+ sleepq_release(wchan);
+ if (wakeup_swapper)
+ kick_proc0();
+}
+
+/*
+ * Abort a thread as if an interrupt had occurred. Only abort
+ * interruptible waits (unfortunately it isn't safe to abort others).
+ */
+int
+sleepq_abort(struct thread *td, int intrval)
+{
+ struct sleepqueue *sq;
+ void *wchan;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ MPASS(TD_ON_SLEEPQ(td));
+ MPASS(td->td_flags & TDF_SINTR);
+ MPASS(intrval == EINTR || intrval == ERESTART);
+
+ /*
+ * If the TDF_TIMEOUT flag is set, just leave; a timeout
+ * is already scheduled for this thread anyhow.
+ */
+ if (td->td_flags & TDF_TIMEOUT)
+ return (0);
+
+ CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)",
+ (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
+ td->td_intrval = intrval;
+ td->td_flags |= TDF_SLEEPABORT;
+ /*
+ * If the thread has not slept yet, it will find the signal in
+ * sleepq_catch_signals() and call sleepq_resume_thread(). Otherwise,
+ * we have to do it here.
+ */
+ if (!TD_IS_SLEEPING(td))
+ return (0);
+ wchan = td->td_wchan;
+ MPASS(wchan != NULL);
+ sq = sleepq_lookup(wchan);
+ MPASS(sq != NULL);
+
+ /* Thread is asleep on sleep queue sq, so wake it up. */
+ return (sleepq_resume_thread(sq, td, 0));
+}
+
+#ifdef SLEEPQUEUE_PROFILING
+#define SLEEPQ_PROF_LOCATIONS 1024
+#define SLEEPQ_SBUFSIZE 512
+struct sleepq_prof {
+ LIST_ENTRY(sleepq_prof) sp_link;
+ const char *sp_wmesg;
+ long sp_count;
+};
+
+LIST_HEAD(sqphead, sleepq_prof);
+
+struct sqphead sleepq_prof_free;
+struct sqphead sleepq_hash[SC_TABLESIZE];
+static struct sleepq_prof sleepq_profent[SLEEPQ_PROF_LOCATIONS];
+static struct mtx sleepq_prof_lock;
+MTX_SYSINIT(sleepq_prof_lock, &sleepq_prof_lock, "sleepq_prof", MTX_SPIN);
+
+static void
+sleepq_profile(const char *wmesg)
+{
+ struct sleepq_prof *sp;
+
+ mtx_lock_spin(&sleepq_prof_lock);
+ if (prof_enabled == 0)
+ goto unlock;
+ LIST_FOREACH(sp, &sleepq_hash[SC_HASH(wmesg)], sp_link)
+ if (sp->sp_wmesg == wmesg)
+ goto done;
+ sp = LIST_FIRST(&sleepq_prof_free);
+ if (sp == NULL)
+ goto unlock;
+ sp->sp_wmesg = wmesg;
+ LIST_REMOVE(sp, sp_link);
+ LIST_INSERT_HEAD(&sleepq_hash[SC_HASH(wmesg)], sp, sp_link);
+done:
+ sp->sp_count++;
+unlock:
+ mtx_unlock_spin(&sleepq_prof_lock);
+ return;
+}
+
+static void
+sleepq_prof_reset(void)
+{
+ struct sleepq_prof *sp;
+ int enabled;
+ int i;
+
+ mtx_lock_spin(&sleepq_prof_lock);
+ enabled = prof_enabled;
+ prof_enabled = 0;
+ for (i = 0; i < SC_TABLESIZE; i++)
+ LIST_INIT(&sleepq_hash[i]);
+ LIST_INIT(&sleepq_prof_free);
+ for (i = 0; i < SLEEPQ_PROF_LOCATIONS; i++) {
+ sp = &sleepq_profent[i];
+ sp->sp_wmesg = NULL;
+ sp->sp_count = 0;
+ LIST_INSERT_HEAD(&sleepq_prof_free, sp, sp_link);
+ }
+ prof_enabled = enabled;
+ mtx_unlock_spin(&sleepq_prof_lock);
+}
+
+static int
+enable_sleepq_prof(SYSCTL_HANDLER_ARGS)
+{
+ int error, v;
+
+ v = prof_enabled;
+ error = sysctl_handle_int(oidp, &v, v, req);
+ if (error)
+ return (error);
+ if (req->newptr == NULL)
+ return (error);
+ if (v == prof_enabled)
+ return (0);
+ if (v == 1)
+ sleepq_prof_reset();
+ mtx_lock_spin(&sleepq_prof_lock);
+ prof_enabled = !!v;
+ mtx_unlock_spin(&sleepq_prof_lock);
+
+ return (0);
+}
+
+static int
+reset_sleepq_prof_stats(SYSCTL_HANDLER_ARGS)
+{
+ int error, v;
+
+ v = 0;
+ error = sysctl_handle_int(oidp, &v, 0, req);
+ if (error)
+ return (error);
+ if (req->newptr == NULL)
+ return (error);
+ if (v == 0)
+ return (0);
+ sleepq_prof_reset();
+
+ return (0);
+}
+
+static int
+dump_sleepq_prof_stats(SYSCTL_HANDLER_ARGS)
+{
+ struct sleepq_prof *sp;
+ struct sbuf *sb;
+ int enabled;
+ int error;
+ int i;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sb = sbuf_new_for_sysctl(NULL, NULL, SLEEPQ_SBUFSIZE, req);
+ sbuf_printf(sb, "\nwmesg\tcount\n");
+ enabled = prof_enabled;
+ mtx_lock_spin(&sleepq_prof_lock);
+ prof_enabled = 0;
+ mtx_unlock_spin(&sleepq_prof_lock);
+ for (i = 0; i < SC_TABLESIZE; i++) {
+ LIST_FOREACH(sp, &sleepq_hash[i], sp_link) {
+ sbuf_printf(sb, "%s\t%ld\n",
+ sp->sp_wmesg, sp->sp_count);
+ }
+ }
+ mtx_lock_spin(&sleepq_prof_lock);
+ prof_enabled = enabled;
+ mtx_unlock_spin(&sleepq_prof_lock);
+
+ error = sbuf_finish(sb);
+ sbuf_delete(sb);
+ return (error);
+}
+
+SYSCTL_PROC(_debug_sleepq, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD,
+ NULL, 0, dump_sleepq_prof_stats, "A", "Sleepqueue profiling statistics");
+SYSCTL_PROC(_debug_sleepq, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW,
+ NULL, 0, reset_sleepq_prof_stats, "I",
+ "Reset sleepqueue profiling statistics");
+SYSCTL_PROC(_debug_sleepq, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
+ NULL, 0, enable_sleepq_prof, "I", "Enable sleepqueue profiling");
+#endif
+
+#ifdef DDB
+DB_SHOW_COMMAND(sleepq, db_show_sleepqueue)
+{
+ struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
+#ifdef INVARIANTS
+ struct lock_object *lock;
+#endif
+ struct thread *td;
+ void *wchan;
+ int i;
+
+ if (!have_addr)
+ return;
+
+ /*
+ * First, see if there is an active sleep queue for the wait channel
+ * indicated by the address.
+ */
+ wchan = (void *)addr;
+ sc = SC_LOOKUP(wchan);
+ LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
+ if (sq->sq_wchan == wchan)
+ goto found;
+
+ /*
+ * Second, see if there is an active sleep queue at the address
+ * indicated.
+ */
+ for (i = 0; i < SC_TABLESIZE; i++)
+ LIST_FOREACH(sq, &sleepq_chains[i].sc_queues, sq_hash) {
+ if (sq == (struct sleepqueue *)addr)
+ goto found;
+ }
+
+ db_printf("Unable to locate a sleep queue via %p\n", (void *)addr);
+ return;
+found:
+ db_printf("Wait channel: %p\n", sq->sq_wchan);
+ db_printf("Queue type: %d\n", sq->sq_type);
+#ifdef INVARIANTS
+ if (sq->sq_lock) {
+ lock = sq->sq_lock;
+ db_printf("Associated Interlock: %p - (%s) %s\n", lock,
+ LOCK_CLASS(lock)->lc_name, lock->lo_name);
+ }
+#endif
+ db_printf("Blocked threads:\n");
+ for (i = 0; i < NR_SLEEPQS; i++) {
+ db_printf("\nQueue[%d]:\n", i);
+ if (TAILQ_EMPTY(&sq->sq_blocked[i]))
+ db_printf("\tempty\n");
+ else
+ TAILQ_FOREACH(td, &sq->sq_blocked[i],
+ td_slpq) {
+ db_printf("\t%p (tid %d, pid %d, \"%s\")\n", td,
+ td->td_tid, td->td_proc->p_pid,
+ td->td_name);
+ }
+ db_printf("(expected: %u)\n", sq->sq_blockedcnt[i]);
+ }
+}
+
+/* Alias 'show sleepqueue' to 'show sleepq'. */
+DB_SHOW_ALIAS(sleepqueue, db_show_sleepqueue);
+#endif
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
new file mode 100644
index 0000000..3614798
--- /dev/null
+++ b/sys/kern/subr_smp.c
@@ -0,0 +1,787 @@
+/*-
+ * Copyright (c) 2001, John Baldwin <jhb@FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This module holds the global variables and machine independent functions
+ * used for the kernel SMP support.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/proc.h>
+#include <sys/bus.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/pcpu.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+
+#include <machine/cpu.h>
+#include <machine/smp.h>
+
+#include "opt_sched.h"
+
+#ifdef SMP
+volatile cpuset_t stopped_cpus;
+volatile cpuset_t started_cpus;
+volatile cpuset_t suspended_cpus;
+cpuset_t hlt_cpus_mask;
+cpuset_t logical_cpus_mask;
+
+void (*cpustop_restartfunc)(void);
+#endif
+/* This is used in modules that need to work in both SMP and UP. */
+cpuset_t all_cpus;
+
+int mp_ncpus;
+/* export this for libkvm consumers. */
+int mp_maxcpus = MAXCPU;
+
+volatile int smp_started;
+u_int mp_maxid;
+
+static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL,
+ "Kernel SMP");
+
+SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0,
+ "Max CPU ID.");
+
+SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus,
+ 0, "Max number of CPUs that the system was compiled for.");
+
+int smp_active = 0; /* are the APs allowed to run? */
+SYSCTL_INT(_kern_smp, OID_AUTO, active, CTLFLAG_RW, &smp_active, 0,
+ "Number of Auxiliary Processors (APs) that were successfully started");
+
+int smp_disabled = 0; /* has smp been disabled? */
+SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
+ &smp_disabled, 0, "SMP has been disabled from the loader");
+TUNABLE_INT("kern.smp.disabled", &smp_disabled);
+
+int smp_cpus = 1; /* how many CPUs are running */
+SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0,
+ "Number of CPUs online");
+
+int smp_topology = 0; /* Which topology we're using. */
+SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RD, &smp_topology, 0,
+ "Topology override setting; 0 is default provided by hardware.");
+TUNABLE_INT("kern.smp.topology", &smp_topology);
+
+#ifdef SMP
+/* Enable forwarding of a signal to a process running on a different CPU */
+static int forward_signal_enabled = 1;
+SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
+ &forward_signal_enabled, 0,
+ "Forwarding of a signal to a process on a different CPU");
+
+/* Variables needed for SMP rendezvous. */
+static volatile int smp_rv_ncpus;
+static void (*volatile smp_rv_setup_func)(void *arg);
+static void (*volatile smp_rv_action_func)(void *arg);
+static void (*volatile smp_rv_teardown_func)(void *arg);
+static void *volatile smp_rv_func_arg;
+static volatile int smp_rv_waiters[4];
+
+/*
+ * Shared mutex to restrict busywaits between smp_rendezvous() and
+ * smp(_targeted)_tlb_shootdown(). A deadlock occurs if both of these
+ * functions trigger at once and cause multiple CPUs to busywait with
+ * interrupts disabled.
+ */
+struct mtx smp_ipi_mtx;
+
+/*
+ * Let the MD SMP code initialize mp_maxid very early if it can.
+ */
+static void
+mp_setmaxid(void *dummy)
+{
+ cpu_mp_setmaxid();
+}
+SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL);
+
+/*
+ * Call the MD SMP initialization code.
+ */
+static void
+mp_start(void *dummy)
+{
+
+ mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);
+
+ /* Probe for MP hardware. */
+ if (smp_disabled != 0 || cpu_mp_probe() == 0) {
+ mp_ncpus = 1;
+ CPU_SETOF(PCPU_GET(cpuid), &all_cpus);
+ return;
+ }
+
+ cpu_mp_start();
+ printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
+ mp_ncpus);
+ cpu_mp_announce();
+}
+SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL);
+
+void
+forward_signal(struct thread *td)
+{
+ int id;
+
+ /*
+ * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on
+ * this thread, so all we need to do is poke it if it is currently
+ * executing so that it executes ast().
+ */
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ KASSERT(TD_IS_RUNNING(td),
+ ("forward_signal: thread is not TDS_RUNNING"));
+
+ CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);
+
+ if (!smp_started || cold || panicstr)
+ return;
+ if (!forward_signal_enabled)
+ return;
+
+ /* No need to IPI ourself. */
+ if (td == curthread)
+ return;
+
+ id = td->td_oncpu;
+ if (id == NOCPU)
+ return;
+ ipi_cpu(id, IPI_AST);
+}
+
+/*
+ * When called, the executing CPU will send an IPI to all other CPUs
+ * requesting that they halt execution.
+ *
+ * Usually (but not necessarily) called with 'other_cpus' as its arg.
+ *
+ * - Signals all CPUs in map to stop.
+ * - Waits for each to stop.
+ *
+ * Returns:
+ * -1: error
+ * 0: NA
+ * 1: ok
+ *
+ */
+static int
+generic_stop_cpus(cpuset_t map, u_int type)
+{
+#ifdef KTR
+ char cpusetbuf[CPUSETBUFSIZ];
+#endif
+ static volatile u_int stopping_cpu = NOCPU;
+ int i;
+ volatile cpuset_t *cpus;
+
+ KASSERT(
+#if defined(__amd64__) || defined(__i386__)
+ type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND,
+#else
+ type == IPI_STOP || type == IPI_STOP_HARD,
+#endif
+ ("%s: invalid stop type", __func__));
+
+ if (!smp_started)
+ return (0);
+
+ CTR2(KTR_SMP, "stop_cpus(%s) with %u type",
+ cpusetobj_strprint(cpusetbuf, &map), type);
+
+ if (stopping_cpu != PCPU_GET(cpuid))
+ while (atomic_cmpset_int(&stopping_cpu, NOCPU,
+ PCPU_GET(cpuid)) == 0)
+ while (stopping_cpu != NOCPU)
+ cpu_spinwait(); /* spin */
+
+ /* send the stop IPI to all CPUs in map */
+ ipi_selected(map, type);
+
+#if defined(__amd64__) || defined(__i386__)
+ if (type == IPI_SUSPEND)
+ cpus = &suspended_cpus;
+ else
+#endif
+ cpus = &stopped_cpus;
+
+ i = 0;
+ while (!CPU_SUBSET(cpus, &map)) {
+ /* spin */
+ cpu_spinwait();
+ i++;
+ if (i == 100000000) {
+ printf("timeout stopping cpus\n");
+ break;
+ }
+ }
+
+ stopping_cpu = NOCPU;
+ return (1);
+}
+
+int
+stop_cpus(cpuset_t map)
+{
+
+ return (generic_stop_cpus(map, IPI_STOP));
+}
+
+int
+stop_cpus_hard(cpuset_t map)
+{
+
+ return (generic_stop_cpus(map, IPI_STOP_HARD));
+}
+
+#if defined(__amd64__) || defined(__i386__)
+int
+suspend_cpus(cpuset_t map)
+{
+
+ return (generic_stop_cpus(map, IPI_SUSPEND));
+}
+#endif
+
+/*
+ * Called by a CPU to restart stopped CPUs.
+ *
+ * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
+ *
+ * - Signals all CPUs in map to restart.
+ * - Waits for each to restart.
+ *
+ * Returns:
+ * -1: error
+ * 0: NA
+ * 1: ok
+ */
+int
+restart_cpus(cpuset_t map)
+{
+#ifdef KTR
+ char cpusetbuf[CPUSETBUFSIZ];
+#endif
+
+ if (!smp_started)
+ return (0);
+
+ CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));
+
+ /* signal other cpus to restart */
+ CPU_COPY_STORE_REL(&map, &started_cpus);
+
+ /* wait for each to clear its bit */
+ while (CPU_OVERLAP(&stopped_cpus, &map))
+ cpu_spinwait();
+
+ return (1);
+}
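+
+/*
+ * Illustrative sketch (not part of the original file): a caller that
+ * wants to halt every other CPU, inspect or modify shared state, and
+ * then let them continue might do something like the following
+ * (simplified, error handling omitted):
+ *
+ *	cpuset_t map;
+ *
+ *	map = all_cpus;
+ *	CPU_CLR(PCPU_GET(cpuid), &map);
+ *	if (stop_cpus(map) != 0) {
+ *		... examine or patch shared state ...
+ *		restart_cpus(stopped_cpus);
+ *	}
+ *
+ * As noted above, stopped_cpus records the CPUs actually stopped and is
+ * the usual argument to restart_cpus().
+ */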
+
+/*
+ * All-CPU rendezvous. CPUs are signalled, all execute the setup function
+ * (if specified), rendezvous, execute the action function (if specified),
+ * rendezvous again, execute the teardown function (if specified), and then
+ * resume.
+ *
+ * Note that the supplied external functions _must_ be reentrant and aware
+ * that they are running in parallel and in an unknown lock context.
+ */
+void
+smp_rendezvous_action(void)
+{
+ struct thread *td;
+ void *local_func_arg;
+ void (*local_setup_func)(void*);
+ void (*local_action_func)(void*);
+ void (*local_teardown_func)(void*);
+#ifdef INVARIANTS
+ int owepreempt;
+#endif
+
+ /* Ensure we have up-to-date values. */
+ atomic_add_acq_int(&smp_rv_waiters[0], 1);
+ while (smp_rv_waiters[0] < smp_rv_ncpus)
+ cpu_spinwait();
+
+ /* Fetch rendezvous parameters after acquire barrier. */
+ local_func_arg = smp_rv_func_arg;
+ local_setup_func = smp_rv_setup_func;
+ local_action_func = smp_rv_action_func;
+ local_teardown_func = smp_rv_teardown_func;
+
+ /*
+ * Use a nested critical section to prevent any preemptions
+ * from occurring during a rendezvous action routine.
+ * Specifically, if a rendezvous handler is invoked via an IPI
+ * and the interrupted thread was in the critical_exit()
+ * function after setting td_critnest to 0 but before
+ * performing a deferred preemption, this routine can be
+ * invoked with td_critnest set to 0 and td_owepreempt true.
+ * In that case, a critical_exit() during the rendezvous
+ * action would trigger a preemption which is not permitted in
+ * a rendezvous action. To fix this, wrap all of the
+ * rendezvous action handlers in a critical section. We
+ * cannot use a regular critical section however as having
+ * critical_exit() preempt from this routine would also be
+ * problematic (the preemption must not occur before the IPI
+ * has been acknowledged via an EOI). Instead, we
+ * intentionally ignore td_owepreempt when leaving the
+ * critical section. This should be harmless because we do
+ * not permit rendezvous action routines to schedule threads,
+ * and thus td_owepreempt should never transition from 0 to 1
+ * during this routine.
+ */
+ td = curthread;
+ td->td_critnest++;
+#ifdef INVARIANTS
+ owepreempt = td->td_owepreempt;
+#endif
+
+ /*
+ * If requested, run a setup function before the main action
+ * function. Ensure all CPUs have completed the setup
+ * function before moving on to the action function.
+ */
+ if (local_setup_func != smp_no_rendevous_barrier) {
+ if (local_setup_func != NULL)
+ local_setup_func(local_func_arg);
+ atomic_add_int(&smp_rv_waiters[1], 1);
+ while (smp_rv_waiters[1] < smp_rv_ncpus)
+ cpu_spinwait();
+ }
+
+ if (local_action_func != NULL)
+ local_action_func(local_func_arg);
+
+ if (local_teardown_func != smp_no_rendevous_barrier) {
+ /*
+ * Signal that the main action has been completed. If a
+ * full exit rendezvous is requested, then all CPUs will
+ * wait here until all CPUs have finished the main action.
+ */
+ atomic_add_int(&smp_rv_waiters[2], 1);
+ while (smp_rv_waiters[2] < smp_rv_ncpus)
+ cpu_spinwait();
+
+ if (local_teardown_func != NULL)
+ local_teardown_func(local_func_arg);
+ }
+
+ /*
+ * Signal that the rendezvous is fully completed by this CPU.
+ * This means that no member of the smp_rv_* pseudo-structure will be
+ * accessed by this target CPU after this point; in particular, the
+ * memory pointed to by smp_rv_func_arg.
+ */
+ atomic_add_int(&smp_rv_waiters[3], 1);
+
+ td->td_critnest--;
+ KASSERT(owepreempt == td->td_owepreempt,
+ ("rendezvous action changed td_owepreempt"));
+}
+
+void
+smp_rendezvous_cpus(cpuset_t map,
+ void (* setup_func)(void *),
+ void (* action_func)(void *),
+ void (* teardown_func)(void *),
+ void *arg)
+{
+ int curcpumap, i, ncpus = 0;
+
+ /* See the comments in the !SMP case. */
+ if (!smp_started) {
+ spinlock_enter();
+ if (setup_func != NULL)
+ setup_func(arg);
+ if (action_func != NULL)
+ action_func(arg);
+ if (teardown_func != NULL)
+ teardown_func(arg);
+ spinlock_exit();
+ return;
+ }
+
+ CPU_FOREACH(i) {
+ if (CPU_ISSET(i, &map))
+ ncpus++;
+ }
+ if (ncpus == 0)
+ panic("ncpus is 0 with non-zero map");
+
+ mtx_lock_spin(&smp_ipi_mtx);
+
+ /* Pass rendezvous parameters via global variables. */
+ smp_rv_ncpus = ncpus;
+ smp_rv_setup_func = setup_func;
+ smp_rv_action_func = action_func;
+ smp_rv_teardown_func = teardown_func;
+ smp_rv_func_arg = arg;
+ smp_rv_waiters[1] = 0;
+ smp_rv_waiters[2] = 0;
+ smp_rv_waiters[3] = 0;
+ atomic_store_rel_int(&smp_rv_waiters[0], 0);
+
+ /*
+ * Signal other processors, which will enter the IPI with
+ * interrupts off.
+ */
+ curcpumap = CPU_ISSET(curcpu, &map);
+ CPU_CLR(curcpu, &map);
+ ipi_selected(map, IPI_RENDEZVOUS);
+
+ /* Check if the current CPU is in the map */
+ if (curcpumap != 0)
+ smp_rendezvous_action();
+
+ /*
+ * Ensure that the master CPU waits for all the other
+ * CPUs to finish the rendezvous, so that the smp_rv_*
+ * pseudo-structure and the arg are guaranteed not to
+ * be in use.
+ */
+ while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
+ cpu_spinwait();
+
+ mtx_unlock_spin(&smp_ipi_mtx);
+}
+
+void
+smp_rendezvous(void (* setup_func)(void *),
+ void (* action_func)(void *),
+ void (* teardown_func)(void *),
+ void *arg)
+{
+ smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
+}
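+
+/*
+ * Illustrative sketch (not part of the original file): running a short
+ * action on every CPU with interrupts disabled might look roughly like
+ * this, with example_action being a hypothetical handler:
+ *
+ *	static void
+ *	example_action(void *arg)
+ *	{
+ *
+ *		... per-CPU work; interrupts are disabled here ...
+ *	}
+ *
+ *	smp_rendezvous(smp_no_rendevous_barrier, example_action,
+ *	    smp_no_rendevous_barrier, NULL);
+ *
+ * Passing smp_no_rendevous_barrier for the setup and teardown hooks skips
+ * the corresponding barriers, as handled in smp_rendezvous_action() above.
+ */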
+
+static struct cpu_group group[MAXCPU];
+
+struct cpu_group *
+smp_topo(void)
+{
+ char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
+ struct cpu_group *top;
+
+ /*
+ * Check for a fake topology request for debugging purposes.
+ */
+ switch (smp_topology) {
+ case 1:
+ /* Dual core with no sharing. */
+ top = smp_topo_1level(CG_SHARE_NONE, 2, 0);
+ break;
+ case 2:
+ /* No topology, all cpus are equal. */
+ top = smp_topo_none();
+ break;
+ case 3:
+ /* Dual core with shared L2. */
+ top = smp_topo_1level(CG_SHARE_L2, 2, 0);
+ break;
+ case 4:
+ /* quad core, shared l3 among each package, private l2. */
+ top = smp_topo_1level(CG_SHARE_L3, 4, 0);
+ break;
+ case 5:
+ /* quad core, 2 dualcore parts on each package share l2. */
+ top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0);
+ break;
+ case 6:
+ /* Single-core 2xHTT */
+ top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
+ break;
+ case 7:
+ /* quad core with a shared l3, 8 threads sharing L2. */
+ top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
+ CG_FLAG_SMT);
+ break;
+ default:
+ /* Default, ask the system what it wants. */
+ top = cpu_topo();
+ break;
+ }
+ /*
+ * Verify the returned topology.
+ */
+ if (top->cg_count != mp_ncpus)
+ panic("Built bad topology at %p. CPU count %d != %d",
+ top, top->cg_count, mp_ncpus);
+ if (CPU_CMP(&top->cg_mask, &all_cpus))
+ panic("Built bad topology at %p. CPU mask (%s) != (%s)",
+ top, cpusetobj_strprint(cpusetbuf, &top->cg_mask),
+ cpusetobj_strprint(cpusetbuf2, &all_cpus));
+ return (top);
+}
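+
+/*
+ * Illustrative note (not from the original source): the fake topologies
+ * above are selected with the kern.smp.topology loader tunable, e.g. a
+ * loader.conf line such as
+ *
+ *	kern.smp.topology=5
+ *
+ * would exercise the two-level case (two L2 groups of two cores per
+ * package) built by smp_topo_2level() below; the default of 0 asks the
+ * MD code via cpu_topo().
+ */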
+
+struct cpu_group *
+smp_topo_none(void)
+{
+ struct cpu_group *top;
+
+ top = &group[0];
+ top->cg_parent = NULL;
+ top->cg_child = NULL;
+ top->cg_mask = all_cpus;
+ top->cg_count = mp_ncpus;
+ top->cg_children = 0;
+ top->cg_level = CG_SHARE_NONE;
+ top->cg_flags = 0;
+
+ return (top);
+}
+
+static int
+smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share,
+ int count, int flags, int start)
+{
+ char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
+ cpuset_t mask;
+ int i;
+
+ CPU_ZERO(&mask);
+ for (i = 0; i < count; i++, start++)
+ CPU_SET(start, &mask);
+ child->cg_parent = parent;
+ child->cg_child = NULL;
+ child->cg_children = 0;
+ child->cg_level = share;
+ child->cg_count = count;
+ child->cg_flags = flags;
+ child->cg_mask = mask;
+ parent->cg_children++;
+ for (; parent != NULL; parent = parent->cg_parent) {
+ if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask))
+ panic("Duplicate children in %p. mask (%s) child (%s)",
+ parent,
+ cpusetobj_strprint(cpusetbuf, &parent->cg_mask),
+ cpusetobj_strprint(cpusetbuf2, &child->cg_mask));
+ CPU_OR(&parent->cg_mask, &child->cg_mask);
+ parent->cg_count += child->cg_count;
+ }
+
+ return (start);
+}
+
+struct cpu_group *
+smp_topo_1level(int share, int count, int flags)
+{
+ struct cpu_group *child;
+ struct cpu_group *top;
+ int packages;
+ int cpu;
+ int i;
+
+ cpu = 0;
+ top = &group[0];
+ packages = mp_ncpus / count;
+ top->cg_child = child = &group[1];
+ top->cg_level = CG_SHARE_NONE;
+ for (i = 0; i < packages; i++, child++)
+ cpu = smp_topo_addleaf(top, child, share, count, flags, cpu);
+ return (top);
+}
+
+struct cpu_group *
+smp_topo_2level(int l2share, int l2count, int l1share, int l1count,
+ int l1flags)
+{
+ struct cpu_group *top;
+ struct cpu_group *l1g;
+ struct cpu_group *l2g;
+ int cpu;
+ int i;
+ int j;
+
+ cpu = 0;
+ top = &group[0];
+ l2g = &group[1];
+ top->cg_child = l2g;
+ top->cg_level = CG_SHARE_NONE;
+ top->cg_children = mp_ncpus / (l2count * l1count);
+ l1g = l2g + top->cg_children;
+ for (i = 0; i < top->cg_children; i++, l2g++) {
+ l2g->cg_parent = top;
+ l2g->cg_child = l1g;
+ l2g->cg_level = l2share;
+ for (j = 0; j < l2count; j++, l1g++)
+ cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count,
+ l1flags, cpu);
+ }
+ return (top);
+}
+
+
+struct cpu_group *
+smp_topo_find(struct cpu_group *top, int cpu)
+{
+ struct cpu_group *cg;
+ cpuset_t mask;
+ int children;
+ int i;
+
+ CPU_SETOF(cpu, &mask);
+ cg = top;
+ for (;;) {
+ if (!CPU_OVERLAP(&cg->cg_mask, &mask))
+ return (NULL);
+ if (cg->cg_children == 0)
+ return (cg);
+ children = cg->cg_children;
+ for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
+ if (CPU_OVERLAP(&cg->cg_mask, &mask))
+ break;
+ }
+ return (NULL);
+}
+#else /* !SMP */
+
+void
+smp_rendezvous_cpus(cpuset_t map,
+ void (*setup_func)(void *),
+ void (*action_func)(void *),
+ void (*teardown_func)(void *),
+ void *arg)
+{
+ /*
+ * In the !SMP case we just need to ensure the same initial conditions
+ * as the SMP case.
+ */
+ spinlock_enter();
+ if (setup_func != NULL)
+ setup_func(arg);
+ if (action_func != NULL)
+ action_func(arg);
+ if (teardown_func != NULL)
+ teardown_func(arg);
+ spinlock_exit();
+}
+
+void
+smp_rendezvous(void (*setup_func)(void *),
+ void (*action_func)(void *),
+ void (*teardown_func)(void *),
+ void *arg)
+{
+
+ /* See the comments in the smp_rendezvous_cpus() case. */
+ spinlock_enter();
+ if (setup_func != NULL)
+ setup_func(arg);
+ if (action_func != NULL)
+ action_func(arg);
+ if (teardown_func != NULL)
+ teardown_func(arg);
+ spinlock_exit();
+}
+
+/*
+ * Provide dummy SMP support for UP kernels. Modules that need to use SMP
+ * APIs will still work using this dummy support.
+ */
+static void
+mp_setvariables_for_up(void *dummy)
+{
+ mp_ncpus = 1;
+ mp_maxid = PCPU_GET(cpuid);
+ CPU_SETOF(mp_maxid, &all_cpus);
+ KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero"));
+}
+SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST,
+ mp_setvariables_for_up, NULL);
+#endif /* SMP */
+
+void
+smp_no_rendevous_barrier(void *dummy)
+{
+#ifdef SMP
+ KASSERT((!smp_started),("smp_no_rendevous called and smp is started"));
+#endif
+}
+
+/*
+ * Wait for the specified idle threads to switch once. This ensures that
+ * even preempted threads have cycled through the switch function once,
+ * exiting their codepaths. This allows us to change global pointers
+ * with no other synchronization.
+ */
+int
+quiesce_cpus(cpuset_t map, const char *wmesg, int prio)
+{
+ struct pcpu *pcpu;
+ u_int gen[MAXCPU];
+ int error;
+ int cpu;
+
+ error = 0;
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
+ continue;
+ pcpu = pcpu_find(cpu);
+ gen[cpu] = pcpu->pc_idlethread->td_generation;
+ }
+ for (cpu = 0; cpu <= mp_maxid; cpu++) {
+ if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
+ continue;
+ pcpu = pcpu_find(cpu);
+ thread_lock(curthread);
+ sched_bind(curthread, cpu);
+ thread_unlock(curthread);
+ while (gen[cpu] == pcpu->pc_idlethread->td_generation) {
+ error = tsleep(quiesce_cpus, prio, wmesg, 1);
+ if (error != EWOULDBLOCK)
+ goto out;
+ error = 0;
+ }
+ }
+out:
+ thread_lock(curthread);
+ sched_unbind(curthread);
+ thread_unlock(curthread);
+
+ return (error);
+}
+
+int
+quiesce_all_cpus(const char *wmesg, int prio)
+{
+
+ return (quiesce_cpus(all_cpus, wmesg, prio));
+}
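+
+/*
+ * Illustrative sketch (not part of the original file): the quiesce
+ * primitives above are what make a lockless pointer swap safe to follow
+ * with a free, along the lines of (names assumed for the example):
+ *
+ *	old = example_global_hook;
+ *	example_global_hook = new;
+ *	quiesce_all_cpus("hookswap", 0);
+ *	free(old, M_TEMP);
+ *
+ * Once every CPU's idle thread has switched at least once, no preempted
+ * thread can still be running through the old pointer.
+ */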
diff --git a/sys/kern/subr_stack.c b/sys/kern/subr_stack.c
new file mode 100644
index 0000000..6408aec
--- /dev/null
+++ b/sys/kern/subr_stack.c
@@ -0,0 +1,277 @@
+/*-
+ * Copyright (c) 2005 Antoine Brodin
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#ifdef KTR
+#include <sys/ktr.h>
+#endif
+#include <sys/linker.h>
+#include <sys/malloc.h>
+#include <sys/sbuf.h>
+#include <sys/stack.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+
+FEATURE(stack, "Support for capturing kernel stack");
+
+static MALLOC_DEFINE(M_STACK, "stack", "Stack Traces");
+
+static int stack_symbol(vm_offset_t pc, char *namebuf, u_int buflen,
+ long *offset);
+static int stack_symbol_ddb(vm_offset_t pc, const char **name, long *offset);
+
+struct stack *
+stack_create(void)
+{
+ struct stack *st;
+
+ st = malloc(sizeof *st, M_STACK, M_WAITOK | M_ZERO);
+ return (st);
+}
+
+void
+stack_destroy(struct stack *st)
+{
+
+ free(st, M_STACK);
+}
+
+int
+stack_put(struct stack *st, vm_offset_t pc)
+{
+
+ if (st->depth < STACK_MAX) {
+ st->pcs[st->depth++] = pc;
+ return (0);
+ } else
+ return (-1);
+}
+
+void
+stack_copy(const struct stack *src, struct stack *dst)
+{
+
+ *dst = *src;
+}
+
+void
+stack_zero(struct stack *st)
+{
+
+ bzero(st, sizeof *st);
+}
+
+void
+stack_print(const struct stack *st)
+{
+ char namebuf[64];
+ long offset;
+ int i;
+
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
+ for (i = 0; i < st->depth; i++) {
+ (void)stack_symbol(st->pcs[i], namebuf, sizeof(namebuf),
+ &offset);
+ printf("#%d %p at %s+%#lx\n", i, (void *)st->pcs[i],
+ namebuf, offset);
+ }
+}
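+
+/*
+ * Illustrative sketch (not part of the original file): capturing and
+ * printing the current thread's kernel stack usually looks like this,
+ * where stack_save() is the MD capture routine declared in sys/stack.h:
+ *
+ *	struct stack st;
+ *
+ *	stack_zero(&st);
+ *	stack_save(&st);
+ *	stack_print(&st);
+ *
+ * A heap-allocated trace would use stack_create()/stack_destroy() instead
+ * of an on-stack structure.
+ */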
+
+void
+stack_print_short(const struct stack *st)
+{
+ char namebuf[64];
+ long offset;
+ int i;
+
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
+ for (i = 0; i < st->depth; i++) {
+ if (i > 0)
+ printf(" ");
+ if (stack_symbol(st->pcs[i], namebuf, sizeof(namebuf),
+ &offset) == 0)
+ printf("%s+%#lx", namebuf, offset);
+ else
+ printf("%p", (void *)st->pcs[i]);
+ }
+ printf("\n");
+}
+
+void
+stack_print_ddb(const struct stack *st)
+{
+ const char *name;
+ long offset;
+ int i;
+
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
+ for (i = 0; i < st->depth; i++) {
+ stack_symbol_ddb(st->pcs[i], &name, &offset);
+ printf("#%d %p at %s+%#lx\n", i, (void *)st->pcs[i],
+ name, offset);
+ }
+}
+
+#ifdef DDB
+void
+stack_print_short_ddb(const struct stack *st)
+{
+ const char *name;
+ long offset;
+ int i;
+
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
+ for (i = 0; i < st->depth; i++) {
+ if (i > 0)
+ printf(" ");
+ if (stack_symbol_ddb(st->pcs[i], &name, &offset) == 0)
+ printf("%s+%#lx", name, offset);
+ else
+ printf("%p", (void *)st->pcs[i]);
+ }
+ printf("\n");
+}
+#endif
+
+/*
+ * Two print routines -- one for use from DDB and DDB-like contexts, the
+ * other for use in the live kernel.
+ */
+void
+stack_sbuf_print(struct sbuf *sb, const struct stack *st)
+{
+ char namebuf[64];
+ long offset;
+ int i;
+
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
+ for (i = 0; i < st->depth; i++) {
+ (void)stack_symbol(st->pcs[i], namebuf, sizeof(namebuf),
+ &offset);
+ sbuf_printf(sb, "#%d %p at %s+%#lx\n", i, (void *)st->pcs[i],
+ namebuf, offset);
+ }
+}
+
+#ifdef DDB
+void
+stack_sbuf_print_ddb(struct sbuf *sb, const struct stack *st)
+{
+ const char *name;
+ long offset;
+ int i;
+
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
+ for (i = 0; i < st->depth; i++) {
+ (void)stack_symbol_ddb(st->pcs[i], &name, &offset);
+ sbuf_printf(sb, "#%d %p at %s+%#lx\n", i, (void *)st->pcs[i],
+ name, offset);
+ }
+}
+#endif
+
+#ifdef KTR
+void
+stack_ktr(u_int mask, const char *file, int line, const struct stack *st,
+ u_int depth, int cheap)
+{
+#ifdef DDB
+ const char *name;
+ long offset;
+ int i;
+#endif
+
+ KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
+ if (cheap) {
+ ktr_tracepoint(mask, file, line, "#0 %p %p %p %p %p %p",
+ st->pcs[0], st->pcs[1], st->pcs[2], st->pcs[3],
+ st->pcs[4], st->pcs[5]);
+ if (st->depth <= 6)
+ return;
+ ktr_tracepoint(mask, file, line, "#1 %p %p %p %p %p %p",
+ st->pcs[6], st->pcs[7], st->pcs[8], st->pcs[9],
+ st->pcs[10], st->pcs[11]);
+ if (st->depth <= 12)
+ return;
+ ktr_tracepoint(mask, file, line, "#2 %p %p %p %p %p %p",
+ st->pcs[12], st->pcs[13], st->pcs[14], st->pcs[15],
+ st->pcs[16], st->pcs[17]);
+#ifdef DDB
+ } else {
+ if (depth == 0 || st->depth < depth)
+ depth = st->depth;
+ for (i = 0; i < depth; i++) {
+ (void)stack_symbol_ddb(st->pcs[i], &name, &offset);
+ ktr_tracepoint(mask, file, line, "#%d %p at %s+%#lx",
+ i, st->pcs[i], (u_long)name, offset, 0, 0);
+ }
+#endif
+ }
+}
+#endif
+
+/*
+ * Two variants of stack symbol lookup -- one that uses the DDB interfaces
+ * and bypasses linker locking, and the other that doesn't.
+ */
+static int
+stack_symbol(vm_offset_t pc, char *namebuf, u_int buflen, long *offset)
+{
+
+ if (linker_search_symbol_name((caddr_t)pc, namebuf, buflen,
+ offset) != 0) {
+ *offset = 0;
+ strlcpy(namebuf, "??", buflen);
+ return (ENOENT);
+ } else
+ return (0);
+}
+
+static int
+stack_symbol_ddb(vm_offset_t pc, const char **name, long *offset)
+{
+ linker_symval_t symval;
+ c_linker_sym_t sym;
+
+ if (linker_ddb_search_symbol((caddr_t)pc, &sym, offset) != 0)
+ goto out;
+ if (linker_ddb_symbol_values(sym, &symval) != 0)
+ goto out;
+ if (symval.name != NULL) {
+ *name = symval.name;
+ return (0);
+ }
+ out:
+ *offset = 0;
+ *name = "??";
+ return (ENOENT);
+}
diff --git a/sys/kern/subr_syscall.c b/sys/kern/subr_syscall.c
new file mode 100644
index 0000000..3d6dc5a
--- /dev/null
+++ b/sys/kern/subr_syscall.c
@@ -0,0 +1,235 @@
+/*-
+ * Copyright (C) 1994, David Greenman
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ * Copyright (C) 2010 Konstantin Belousov <kib@freebsd.org>
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the University of Utah, and William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)trap.c 7.4 (Berkeley) 5/13/91
+ */
+
+#include "opt_capsicum.h"
+#include "opt_ktrace.h"
+#include "opt_kdtrace.h"
+
+__FBSDID("$FreeBSD$");
+
+#include <sys/capability.h>
+#include <sys/ktr.h>
+#ifdef KTRACE
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+#endif
+#include <security/audit/audit.h>
+
+static inline int
+syscallenter(struct thread *td, struct syscall_args *sa)
+{
+ struct proc *p;
+ int error, traced;
+
+ PCPU_INC(cnt.v_syscall);
+ p = td->td_proc;
+
+ td->td_pticks = 0;
+ if (td->td_ucred != p->p_ucred)
+ cred_update_thread(td);
+ if (p->p_flag & P_TRACED) {
+ traced = 1;
+ PROC_LOCK(p);
+ td->td_dbgflags &= ~TDB_USERWR;
+ td->td_dbgflags |= TDB_SCE;
+ PROC_UNLOCK(p);
+ } else
+ traced = 0;
+ error = (p->p_sysent->sv_fetch_syscall_args)(td, sa);
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_SYSCALL))
+ ktrsyscall(sa->code, sa->narg, sa->args);
+#endif
+ KTR_START4(KTR_SYSC, "syscall", syscallname(p, sa->code),
+ (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "arg0:%p", sa->args[0],
+ "arg1:%p", sa->args[1], "arg2:%p", sa->args[2]);
+
+ if (error == 0) {
+
+ STOPEVENT(p, S_SCE, sa->narg);
+ if (p->p_flag & P_TRACED && p->p_stops & S_PT_SCE) {
+ PROC_LOCK(p);
+ ptracestop((td), SIGTRAP);
+ PROC_UNLOCK(p);
+ }
+ if (td->td_dbgflags & TDB_USERWR) {
+ /*
+ * Reread syscall number and arguments if
+ * debugger modified registers or memory.
+ */
+ error = (p->p_sysent->sv_fetch_syscall_args)(td, sa);
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_SYSCALL))
+ ktrsyscall(sa->code, sa->narg, sa->args);
+#endif
+ if (error != 0)
+ goto retval;
+ }
+
+#ifdef CAPABILITY_MODE
+ /*
+ * In capability mode, we only allow access to system calls
+ * flagged with SYF_CAPENABLED.
+ */
+ if (IN_CAPABILITY_MODE(td) &&
+ !(sa->callp->sy_flags & SYF_CAPENABLED)) {
+ error = ECAPMODE;
+ goto retval;
+ }
+#endif
+
+ error = syscall_thread_enter(td, sa->callp);
+ if (error != 0)
+ goto retval;
+
+#ifdef KDTRACE_HOOKS
+ /*
+ * If the systrace module has registered its probe
+ * callback and if there is a probe active for the
+ * syscall 'entry', process the probe.
+ */
+ if (systrace_probe_func != NULL && sa->callp->sy_entry != 0)
+ (*systrace_probe_func)(sa->callp->sy_entry, sa->code,
+ sa->callp, sa->args, 0);
+#endif
+
+ AUDIT_SYSCALL_ENTER(sa->code, td);
+ error = (sa->callp->sy_call)(td, sa->args);
+ AUDIT_SYSCALL_EXIT(error, td);
+
+ /* Save the latest error return value. */
+ if ((td->td_pflags & TDP_NERRNO) == 0)
+ td->td_errno = error;
+
+#ifdef KDTRACE_HOOKS
+ /*
+ * If the systrace module has registered its probe
+ * callback and if there is a probe active for the
+ * syscall 'return', process the probe.
+ */
+ if (systrace_probe_func != NULL && sa->callp->sy_return != 0)
+ (*systrace_probe_func)(sa->callp->sy_return, sa->code,
+ sa->callp, NULL, (error) ? -1 : td->td_retval[0]);
+#endif
+ syscall_thread_exit(td, sa->callp);
+ }
+ retval:
+ KTR_STOP4(KTR_SYSC, "syscall", syscallname(p, sa->code),
+ (uintptr_t)td, "pid:%d", td->td_proc->p_pid, "error:%d", error,
+ "retval0:%#lx", td->td_retval[0], "retval1:%#lx",
+ td->td_retval[1]);
+ if (traced) {
+ PROC_LOCK(p);
+ td->td_dbgflags &= ~TDB_SCE;
+ PROC_UNLOCK(p);
+ }
+ (p->p_sysent->sv_set_syscall_retval)(td, error);
+ return (error);
+}
+
+static inline void
+syscallret(struct thread *td, int error, struct syscall_args *sa __unused)
+{
+ struct proc *p, *p2;
+ int traced;
+
+ p = td->td_proc;
+
+ /*
+ * Handle reschedule and other end-of-syscall issues
+ */
+ userret(td, td->td_frame);
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_SYSRET)) {
+ ktrsysret(sa->code, (td->td_pflags & TDP_NERRNO) == 0 ?
+ error : td->td_errno, td->td_retval[0]);
+ }
+#endif
+ td->td_pflags &= ~TDP_NERRNO;
+
+ if (p->p_flag & P_TRACED) {
+ traced = 1;
+ PROC_LOCK(p);
+ td->td_dbgflags |= TDB_SCX;
+ PROC_UNLOCK(p);
+ } else
+ traced = 0;
+ /*
+ * This works because errno is findable through the
+ * register set. If we ever support an emulation where this
+ * is not the case, this code will need to be revisited.
+ */
+ STOPEVENT(p, S_SCX, sa->code);
+ if (traced || (td->td_dbgflags & (TDB_EXEC | TDB_FORK)) != 0) {
+ PROC_LOCK(p);
+ /*
+ * If tracing the execed process, trap to the debugger
+ * so that breakpoints can be set before the program
+ * executes. If debugger requested tracing of syscall
+ * returns, do it now too.
+ */
+ if (traced &&
+ ((td->td_dbgflags & (TDB_FORK | TDB_EXEC)) != 0 ||
+ (p->p_stops & S_PT_SCX) != 0))
+ ptracestop(td, SIGTRAP);
+ td->td_dbgflags &= ~(TDB_SCX | TDB_EXEC | TDB_FORK);
+ PROC_UNLOCK(p);
+ }
+
+ if (td->td_pflags & TDP_RFPPWAIT) {
+ /*
+ * Preserve the synchronization semantics of vfork. If
+ * we are waiting for the child to exec or exit, fork set
+ * P_PPWAIT on the child, and here we sleep on the child's
+ * proc until that flag is cleared (on exec or exit).
+ *
+ * Do this after the ptracestop() above has finished, so
+ * that our debugger is not blocked until the child execs
+ * or exits and completes the vfork wait.
+ */
+ td->td_pflags &= ~TDP_RFPPWAIT;
+ p2 = td->td_rfppwait_p;
+ PROC_LOCK(p2);
+ while (p2->p_flag & P_PPWAIT)
+ cv_wait(&p2->p_pwait, &p2->p_mtx);
+ PROC_UNLOCK(p2);
+ }
+}
diff --git a/sys/kern/subr_taskqueue.c b/sys/kern/subr_taskqueue.c
new file mode 100644
index 0000000..9c7bf41
--- /dev/null
+++ b/sys/kern/subr_taskqueue.c
@@ -0,0 +1,634 @@
+/*-
+ * Copyright (c) 2000 Doug Rabson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/taskqueue.h>
+#include <sys/unistd.h>
+#include <machine/stdarg.h>
+
+static MALLOC_DEFINE(M_TASKQUEUE, "taskqueue", "Task Queues");
+static void *taskqueue_giant_ih;
+static void *taskqueue_ih;
+
+struct taskqueue_busy {
+ struct task *tb_running;
+ TAILQ_ENTRY(taskqueue_busy) tb_link;
+};
+
+struct taskqueue {
+ STAILQ_HEAD(, task) tq_queue;
+ taskqueue_enqueue_fn tq_enqueue;
+ void *tq_context;
+ TAILQ_HEAD(, taskqueue_busy) tq_active;
+ struct mtx tq_mutex;
+ struct thread **tq_threads;
+ int tq_tcount;
+ int tq_spin;
+ int tq_flags;
+ int tq_callouts;
+ taskqueue_callback_fn tq_callbacks[TASKQUEUE_NUM_CALLBACKS];
+ void *tq_cb_contexts[TASKQUEUE_NUM_CALLBACKS];
+};
+
+#define TQ_FLAGS_ACTIVE (1 << 0)
+#define TQ_FLAGS_BLOCKED (1 << 1)
+#define TQ_FLAGS_PENDING (1 << 2)
+
+#define DT_CALLOUT_ARMED (1 << 0)
+
+#define TQ_LOCK(tq) \
+ do { \
+ if ((tq)->tq_spin) \
+ mtx_lock_spin(&(tq)->tq_mutex); \
+ else \
+ mtx_lock(&(tq)->tq_mutex); \
+ } while (0)
+#define TQ_ASSERT_LOCKED(tq) mtx_assert(&(tq)->tq_mutex, MA_OWNED)
+
+#define TQ_UNLOCK(tq) \
+ do { \
+ if ((tq)->tq_spin) \
+ mtx_unlock_spin(&(tq)->tq_mutex); \
+ else \
+ mtx_unlock(&(tq)->tq_mutex); \
+ } while (0)
+#define TQ_ASSERT_UNLOCKED(tq) mtx_assert(&(tq)->tq_mutex, MA_NOTOWNED)
+
+void
+_timeout_task_init(struct taskqueue *queue, struct timeout_task *timeout_task,
+ int priority, task_fn_t func, void *context)
+{
+
+ TASK_INIT(&timeout_task->t, priority, func, context);
+ callout_init_mtx(&timeout_task->c, &queue->tq_mutex, 0);
+ timeout_task->q = queue;
+ timeout_task->f = 0;
+}
+
+static __inline int
+TQ_SLEEP(struct taskqueue *tq, void *p, struct mtx *m, int pri, const char *wm,
+ int t)
+{
+ if (tq->tq_spin)
+ return (msleep_spin(p, m, wm, t));
+ return (msleep(p, m, pri, wm, t));
+}
+
+static struct taskqueue *
+_taskqueue_create(const char *name __unused, int mflags,
+ taskqueue_enqueue_fn enqueue, void *context,
+ int mtxflags, const char *mtxname)
+{
+ struct taskqueue *queue;
+
+ queue = malloc(sizeof(struct taskqueue), M_TASKQUEUE, mflags | M_ZERO);
+ if (!queue)
+ return NULL;
+
+ STAILQ_INIT(&queue->tq_queue);
+ TAILQ_INIT(&queue->tq_active);
+ queue->tq_enqueue = enqueue;
+ queue->tq_context = context;
+ queue->tq_spin = (mtxflags & MTX_SPIN) != 0;
+ queue->tq_flags |= TQ_FLAGS_ACTIVE;
+ mtx_init(&queue->tq_mutex, mtxname, NULL, mtxflags);
+
+ return queue;
+}
+
+struct taskqueue *
+taskqueue_create(const char *name, int mflags,
+ taskqueue_enqueue_fn enqueue, void *context)
+{
+ return _taskqueue_create(name, mflags, enqueue, context,
+ MTX_DEF, "taskqueue");
+}
+
+void
+taskqueue_set_callback(struct taskqueue *queue,
+ enum taskqueue_callback_type cb_type, taskqueue_callback_fn callback,
+ void *context)
+{
+
+ KASSERT(((cb_type >= TASKQUEUE_CALLBACK_TYPE_MIN) &&
+ (cb_type <= TASKQUEUE_CALLBACK_TYPE_MAX)),
+ ("Callback type %d not valid, must be %d-%d", cb_type,
+ TASKQUEUE_CALLBACK_TYPE_MIN, TASKQUEUE_CALLBACK_TYPE_MAX));
+ KASSERT((queue->tq_callbacks[cb_type] == NULL),
+ ("Re-initialization of taskqueue callback?"));
+
+ queue->tq_callbacks[cb_type] = callback;
+ queue->tq_cb_contexts[cb_type] = context;
+}
+
+/*
+ * Signal a taskqueue thread to terminate.
+ */
+static void
+taskqueue_terminate(struct thread **pp, struct taskqueue *tq)
+{
+
+ while (tq->tq_tcount > 0 || tq->tq_callouts > 0) {
+ wakeup(tq);
+ TQ_SLEEP(tq, pp, &tq->tq_mutex, PWAIT, "taskqueue_destroy", 0);
+ }
+}
+
+void
+taskqueue_free(struct taskqueue *queue)
+{
+
+ TQ_LOCK(queue);
+ queue->tq_flags &= ~TQ_FLAGS_ACTIVE;
+ taskqueue_terminate(queue->tq_threads, queue);
+ KASSERT(TAILQ_EMPTY(&queue->tq_active), ("Tasks still running?"));
+ KASSERT(queue->tq_callouts == 0, ("Armed timeout tasks"));
+ mtx_destroy(&queue->tq_mutex);
+ free(queue->tq_threads, M_TASKQUEUE);
+ free(queue, M_TASKQUEUE);
+}
+
+static int
+taskqueue_enqueue_locked(struct taskqueue *queue, struct task *task)
+{
+ struct task *ins;
+ struct task *prev;
+
+ /*
+ * Count multiple enqueues.
+ */
+ if (task->ta_pending) {
+ if (task->ta_pending < USHRT_MAX)
+ task->ta_pending++;
+ return (0);
+ }
+
+ /*
+ * Optimise the case when all tasks have the same priority.
+ */
+ prev = STAILQ_LAST(&queue->tq_queue, task, ta_link);
+ if (!prev || prev->ta_priority >= task->ta_priority) {
+ STAILQ_INSERT_TAIL(&queue->tq_queue, task, ta_link);
+ } else {
+ prev = NULL;
+ for (ins = STAILQ_FIRST(&queue->tq_queue); ins;
+ prev = ins, ins = STAILQ_NEXT(ins, ta_link))
+ if (ins->ta_priority < task->ta_priority)
+ break;
+
+ if (prev)
+ STAILQ_INSERT_AFTER(&queue->tq_queue, prev, task, ta_link);
+ else
+ STAILQ_INSERT_HEAD(&queue->tq_queue, task, ta_link);
+ }
+
+ task->ta_pending = 1;
+ if ((queue->tq_flags & TQ_FLAGS_BLOCKED) == 0)
+ queue->tq_enqueue(queue->tq_context);
+ else
+ queue->tq_flags |= TQ_FLAGS_PENDING;
+
+ return (0);
+}
+
+int
+taskqueue_enqueue(struct taskqueue *queue, struct task *task)
+{
+ int res;
+
+ TQ_LOCK(queue);
+ res = taskqueue_enqueue_locked(queue, task);
+ TQ_UNLOCK(queue);
+
+ return (res);
+}
+
+static void
+taskqueue_timeout_func(void *arg)
+{
+ struct taskqueue *queue;
+ struct timeout_task *timeout_task;
+
+ timeout_task = arg;
+ queue = timeout_task->q;
+ KASSERT((timeout_task->f & DT_CALLOUT_ARMED) != 0, ("Stray timeout"));
+ timeout_task->f &= ~DT_CALLOUT_ARMED;
+ queue->tq_callouts--;
+ taskqueue_enqueue_locked(timeout_task->q, &timeout_task->t);
+}
+
+int
+taskqueue_enqueue_timeout(struct taskqueue *queue,
+ struct timeout_task *timeout_task, int ticks)
+{
+ int res;
+
+ TQ_LOCK(queue);
+ KASSERT(timeout_task->q == NULL || timeout_task->q == queue,
+ ("Migrated queue"));
+ KASSERT(!queue->tq_spin, ("Timeout for spin-queue"));
+ timeout_task->q = queue;
+ res = timeout_task->t.ta_pending;
+ if (ticks == 0) {
+ taskqueue_enqueue_locked(queue, &timeout_task->t);
+ } else {
+ if ((timeout_task->f & DT_CALLOUT_ARMED) != 0) {
+ res++;
+ } else {
+ queue->tq_callouts++;
+ timeout_task->f |= DT_CALLOUT_ARMED;
+ if (ticks < 0)
+ ticks = -ticks; /* Ignore overflow. */
+ }
+ if (ticks > 0) {
+ callout_reset(&timeout_task->c, ticks,
+ taskqueue_timeout_func, timeout_task);
+ }
+ }
+ TQ_UNLOCK(queue);
+ return (res);
+}
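+
+/*
+ * Illustrative sketch (not part of the original file): a deferred task is
+ * normally declared with TIMEOUT_TASK_INIT() from sys/taskqueue.h and then
+ * armed in ticks, e.g. (handler and softc names assumed):
+ *
+ *	TIMEOUT_TASK_INIT(taskqueue_thread, &sc->sc_ttask, 0,
+ *	    example_timeout_fn, sc);
+ *	taskqueue_enqueue_timeout(taskqueue_thread, &sc->sc_ttask, hz);
+ *
+ * A ticks value of zero enqueues the task immediately, as handled above.
+ */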
+
+void
+taskqueue_block(struct taskqueue *queue)
+{
+
+ TQ_LOCK(queue);
+ queue->tq_flags |= TQ_FLAGS_BLOCKED;
+ TQ_UNLOCK(queue);
+}
+
+void
+taskqueue_unblock(struct taskqueue *queue)
+{
+
+ TQ_LOCK(queue);
+ queue->tq_flags &= ~TQ_FLAGS_BLOCKED;
+ if (queue->tq_flags & TQ_FLAGS_PENDING) {
+ queue->tq_flags &= ~TQ_FLAGS_PENDING;
+ queue->tq_enqueue(queue->tq_context);
+ }
+ TQ_UNLOCK(queue);
+}
+
+static void
+taskqueue_run_locked(struct taskqueue *queue)
+{
+ struct taskqueue_busy tb;
+ struct task *task;
+ int pending;
+
+ TQ_ASSERT_LOCKED(queue);
+ tb.tb_running = NULL;
+ TAILQ_INSERT_TAIL(&queue->tq_active, &tb, tb_link);
+
+ while (STAILQ_FIRST(&queue->tq_queue)) {
+ /*
+ * Carefully remove the first task from the queue and
+ * zero its pending count.
+ */
+ task = STAILQ_FIRST(&queue->tq_queue);
+ STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link);
+ pending = task->ta_pending;
+ task->ta_pending = 0;
+ tb.tb_running = task;
+ TQ_UNLOCK(queue);
+
+ task->ta_func(task->ta_context, pending);
+
+ TQ_LOCK(queue);
+ tb.tb_running = NULL;
+ wakeup(task);
+ }
+ TAILQ_REMOVE(&queue->tq_active, &tb, tb_link);
+}
+
+void
+taskqueue_run(struct taskqueue *queue)
+{
+
+ TQ_LOCK(queue);
+ taskqueue_run_locked(queue);
+ TQ_UNLOCK(queue);
+}
+
+static int
+task_is_running(struct taskqueue *queue, struct task *task)
+{
+ struct taskqueue_busy *tb;
+
+ TQ_ASSERT_LOCKED(queue);
+ TAILQ_FOREACH(tb, &queue->tq_active, tb_link) {
+ if (tb->tb_running == task)
+ return (1);
+ }
+ return (0);
+}
+
+static int
+taskqueue_cancel_locked(struct taskqueue *queue, struct task *task,
+ u_int *pendp)
+{
+
+ if (task->ta_pending > 0)
+ STAILQ_REMOVE(&queue->tq_queue, task, task, ta_link);
+ if (pendp != NULL)
+ *pendp = task->ta_pending;
+ task->ta_pending = 0;
+ return (task_is_running(queue, task) ? EBUSY : 0);
+}
+
+int
+taskqueue_cancel(struct taskqueue *queue, struct task *task, u_int *pendp)
+{
+ u_int pending;
+ int error;
+
+ TQ_LOCK(queue);
+ pending = task->ta_pending;
+ error = taskqueue_cancel_locked(queue, task, pendp);
+ TQ_UNLOCK(queue);
+
+ return (error);
+}
+
+int
+taskqueue_cancel_timeout(struct taskqueue *queue,
+ struct timeout_task *timeout_task, u_int *pendp)
+{
+ u_int pending, pending1;
+ int error;
+
+ TQ_LOCK(queue);
+ pending = !!callout_stop(&timeout_task->c);
+ error = taskqueue_cancel_locked(queue, &timeout_task->t, &pending1);
+ if ((timeout_task->f & DT_CALLOUT_ARMED) != 0) {
+ timeout_task->f &= ~DT_CALLOUT_ARMED;
+ queue->tq_callouts--;
+ }
+ TQ_UNLOCK(queue);
+
+ if (pendp != NULL)
+ *pendp = pending + pending1;
+ return (error);
+}
+
+void
+taskqueue_drain(struct taskqueue *queue, struct task *task)
+{
+
+ if (!queue->tq_spin)
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
+
+ TQ_LOCK(queue);
+ while (task->ta_pending != 0 || task_is_running(queue, task))
+ TQ_SLEEP(queue, task, &queue->tq_mutex, PWAIT, "-", 0);
+ TQ_UNLOCK(queue);
+}
+
+void
+taskqueue_drain_timeout(struct taskqueue *queue,
+ struct timeout_task *timeout_task)
+{
+
+ callout_drain(&timeout_task->c);
+ taskqueue_drain(queue, &timeout_task->t);
+}
+
+static void
+taskqueue_swi_enqueue(void *context)
+{
+ swi_sched(taskqueue_ih, 0);
+}
+
+static void
+taskqueue_swi_run(void *dummy)
+{
+ taskqueue_run(taskqueue_swi);
+}
+
+static void
+taskqueue_swi_giant_enqueue(void *context)
+{
+ swi_sched(taskqueue_giant_ih, 0);
+}
+
+static void
+taskqueue_swi_giant_run(void *dummy)
+{
+ taskqueue_run(taskqueue_swi_giant);
+}
+
+int
+taskqueue_start_threads(struct taskqueue **tqp, int count, int pri,
+ const char *name, ...)
+{
+ va_list ap;
+ struct thread *td;
+ struct taskqueue *tq;
+ int i, error;
+ char ktname[MAXCOMLEN + 1];
+
+ if (count <= 0)
+ return (EINVAL);
+
+ tq = *tqp;
+
+ va_start(ap, name);
+ vsnprintf(ktname, sizeof(ktname), name, ap);
+ va_end(ap);
+
+ tq->tq_threads = malloc(sizeof(struct thread *) * count, M_TASKQUEUE,
+ M_NOWAIT | M_ZERO);
+ if (tq->tq_threads == NULL) {
+ printf("%s: no memory for %s threads\n", __func__, ktname);
+ return (ENOMEM);
+ }
+
+ for (i = 0; i < count; i++) {
+ if (count == 1)
+ error = kthread_add(taskqueue_thread_loop, tqp, NULL,
+ &tq->tq_threads[i], RFSTOPPED, 0, "%s", ktname);
+ else
+ error = kthread_add(taskqueue_thread_loop, tqp, NULL,
+ &tq->tq_threads[i], RFSTOPPED, 0,
+ "%s_%d", ktname, i);
+ if (error) {
+ /* Should be OK to continue; taskqueue_free() will do the right thing. */
+ printf("%s: kthread_add(%s): error %d\n", __func__,
+ ktname, error);
+ tq->tq_threads[i] = NULL; /* paranoid */
+ } else
+ tq->tq_tcount++;
+ }
+ for (i = 0; i < count; i++) {
+ if (tq->tq_threads[i] == NULL)
+ continue;
+ td = tq->tq_threads[i];
+ thread_lock(td);
+ sched_prio(td, pri);
+ sched_add(td, SRQ_BORING);
+ thread_unlock(td);
+ }
+
+ return (0);
+}
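+
+/*
+ * Sketch of the usual way a private threaded queue is created with the
+ * function above (hedged example; the "foo" names are hypothetical):
+ *
+ *	struct taskqueue *tq;
+ *
+ *	tq = taskqueue_create("foo_tq", M_WAITOK, taskqueue_thread_enqueue,
+ *	    &tq);
+ *	taskqueue_start_threads(&tq, 1, PWAIT, "foo taskq");
+ *
+ * Passing &tq as the enqueue context is what lets
+ * taskqueue_thread_enqueue() find the queue to wake up.
+ */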
+
+static inline void
+taskqueue_run_callback(struct taskqueue *tq,
+ enum taskqueue_callback_type cb_type)
+{
+ taskqueue_callback_fn tq_callback;
+
+ TQ_ASSERT_UNLOCKED(tq);
+ tq_callback = tq->tq_callbacks[cb_type];
+ if (tq_callback != NULL)
+ tq_callback(tq->tq_cb_contexts[cb_type]);
+}
+
+void
+taskqueue_thread_loop(void *arg)
+{
+ struct taskqueue **tqp, *tq;
+
+ tqp = arg;
+ tq = *tqp;
+ taskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_INIT);
+ TQ_LOCK(tq);
+ while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0) {
+ taskqueue_run_locked(tq);
+ /*
+ * Because taskqueue_run_locked() can drop tq_mutex, we
+ * need to check whether the TQ_FLAGS_ACTIVE flag was
+ * cleared in the meantime; if it was, we missed a wakeup
+ * and must not go back to sleep.
+ */
+ if ((tq->tq_flags & TQ_FLAGS_ACTIVE) == 0)
+ break;
+ TQ_SLEEP(tq, tq, &tq->tq_mutex, 0, "-", 0);
+ }
+ taskqueue_run_locked(tq);
+
+ /*
+ * This thread is on its way out, so just drop the lock temporarily
+ * in order to call the shutdown callback. This allows the callback
+ * to look at the taskqueue, even just before it dies.
+ */
+ TQ_UNLOCK(tq);
+ taskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN);
+ TQ_LOCK(tq);
+
+ /* rendezvous with thread that asked us to terminate */
+ tq->tq_tcount--;
+ wakeup_one(tq->tq_threads);
+ TQ_UNLOCK(tq);
+ kthread_exit();
+}
+
+void
+taskqueue_thread_enqueue(void *context)
+{
+ struct taskqueue **tqp, *tq;
+
+ tqp = context;
+ tq = *tqp;
+
+ TQ_ASSERT_LOCKED(tq);
+ wakeup_one(tq);
+}
+
+TASKQUEUE_DEFINE(swi, taskqueue_swi_enqueue, NULL,
+ swi_add(NULL, "task queue", taskqueue_swi_run, NULL, SWI_TQ,
+ INTR_MPSAFE, &taskqueue_ih));
+
+TASKQUEUE_DEFINE(swi_giant, taskqueue_swi_giant_enqueue, NULL,
+ swi_add(NULL, "Giant taskq", taskqueue_swi_giant_run,
+ NULL, SWI_TQ_GIANT, 0, &taskqueue_giant_ih));
+
+TASKQUEUE_DEFINE_THREAD(thread);
+
+struct taskqueue *
+taskqueue_create_fast(const char *name, int mflags,
+ taskqueue_enqueue_fn enqueue, void *context)
+{
+ return _taskqueue_create(name, mflags, enqueue, context,
+ MTX_SPIN, "fast_taskqueue");
+}
+
+/* NB: for backwards compatibility */
+int
+taskqueue_enqueue_fast(struct taskqueue *queue, struct task *task)
+{
+ return taskqueue_enqueue(queue, task);
+}
+
+static void *taskqueue_fast_ih;
+
+static void
+taskqueue_fast_enqueue(void *context)
+{
+ swi_sched(taskqueue_fast_ih, 0);
+}
+
+static void
+taskqueue_fast_run(void *dummy)
+{
+ taskqueue_run(taskqueue_fast);
+}
+
+TASKQUEUE_FAST_DEFINE(fast, taskqueue_fast_enqueue, NULL,
+ swi_add(NULL, "fast taskq", taskqueue_fast_run, NULL,
+ SWI_TQ_FAST, INTR_MPSAFE, &taskqueue_fast_ih));
+
+int
+taskqueue_member(struct taskqueue *queue, struct thread *td)
+{
+ int i, j, ret = 0;
+
+ for (i = 0, j = 0; ; i++) {
+ if (queue->tq_threads[i] == NULL)
+ continue;
+ if (queue->tq_threads[i] == td) {
+ ret = 1;
+ break;
+ }
+ if (++j >= queue->tq_tcount)
+ break;
+ }
+ return (ret);
+}
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
new file mode 100644
index 0000000..19729a4
--- /dev/null
+++ b/sys/kern/subr_trap.c
@@ -0,0 +1,303 @@
+/*-
+ * Copyright (C) 1994, David Greenman
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ * Copyright (c) 2007 The FreeBSD Foundation
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the University of Utah, and William Jolitz.
+ *
+ * Portions of this software were developed by A. Joseph Koshy under
+ * sponsorship from the FreeBSD Foundation and Google, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)trap.c 7.4 (Berkeley) 5/13/91
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_hwpmc_hooks.h"
+#include "opt_ktrace.h"
+#include "opt_kdtrace.h"
+#include "opt_sched.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/capability.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/pmckern.h>
+#include <sys/proc.h>
+#include <sys/ktr.h>
+#include <sys/pioctl.h>
+#include <sys/ptrace.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/signalvar.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/vmmeter.h>
+#ifdef KTRACE
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+#endif
+#include <security/audit/audit.h>
+
+#include <machine/cpu.h>
+
+#ifdef VIMAGE
+#include <net/vnet.h>
+#endif
+
+#ifdef XEN
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#endif
+
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
+#include <security/mac/mac_framework.h>
+
+/*
+ * Common code that must run before returning to user mode, for both traps
+ * and system calls.
+ */
+void
+userret(struct thread *td, struct trapframe *frame)
+{
+ struct proc *p = td->td_proc;
+
+ CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid,
+ td->td_name);
+ KASSERT((p->p_flag & P_WEXIT) == 0,
+ ("Exiting process returns to usermode"));
+#if 0
+#ifdef DIAGNOSTIC
+ /* Check that we called signotify() enough. */
+ PROC_LOCK(p);
+ thread_lock(td);
+ if (SIGPENDING(td) && ((td->td_flags & TDF_NEEDSIGCHK) == 0 ||
+ (td->td_flags & TDF_ASTPENDING) == 0))
+ printf("failed to set signal flags properly for ast()\n");
+ thread_unlock(td);
+ PROC_UNLOCK(p);
+#endif
+#endif
+#ifdef KTRACE
+ KTRUSERRET(td);
+#endif
+ /*
+ * If this thread tickled GEOM, we need to wait for the giggling to
+ * stop before we return to userland
+ */
+ if (td->td_pflags & TDP_GEOM)
+ g_waitidle();
+
+ /*
+ * Charge system time if profiling.
+ */
+ if (p->p_flag & P_PROFIL)
+ addupc_task(td, TRAPF_PC(frame), td->td_pticks * psratio);
+ /*
+ * Let the scheduler adjust our priority etc.
+ */
+ sched_userret(td);
+#ifdef XEN
+ PT_UPDATES_FLUSH();
+#endif
+
+ /*
+ * Check for misbehavior.
+ *
+ * If a callchain trace is in progress on behalf of hwpmc(4),
+ * skip the scheduler pinning check: the hwpmc(4) subsystem
+ * collects the callchain at the ast() checkpoint, which is
+ * past userret().
+ */
+ WITNESS_WARN(WARN_PANIC, NULL, "userret: returning");
+ KASSERT(td->td_critnest == 0,
+ ("userret: Returning in a critical section"));
+ KASSERT(td->td_locks == 0,
+ ("userret: Returning with %d locks held", td->td_locks));
+ KASSERT((td->td_pflags & TDP_NOFAULTING) == 0,
+ ("userret: Returning with pagefaults disabled"));
+ KASSERT(td->td_no_sleeping == 0,
+ ("userret: Returning with sleep disabled"));
+ KASSERT(td->td_pinned == 0 || (td->td_pflags & TDP_CALLCHAIN) != 0,
+ ("userret: Returning with with pinned thread"));
+ KASSERT(td->td_vp_reserv == 0,
+ ("userret: Returning while holding vnode reservation"));
+ KASSERT((td->td_flags & TDF_SBDRY) == 0,
+ ("userret: Returning with stop signals deferred"));
+#ifdef VIMAGE
+ /* Unfortunately td_vnet_lpush needs VNET_DEBUG. */
+ VNET_ASSERT(curvnet == NULL,
+ ("%s: Returning on td %p (pid %d, %s) with vnet %p set in %s",
+ __func__, td, p->p_pid, td->td_name, curvnet,
+ (td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A"));
+#endif
+#ifdef RACCT
+ PROC_LOCK(p);
+ while (p->p_throttled == 1)
+ msleep(p->p_racct, &p->p_mtx, 0, "racct", 0);
+ PROC_UNLOCK(p);
+#endif
+}
+
+/*
+ * Process an asynchronous software trap.
+ * This is relatively easy.
+ * This function will return with preemption disabled.
+ */
+void
+ast(struct trapframe *framep)
+{
+ struct thread *td;
+ struct proc *p;
+ int flags;
+ int sig;
+
+ td = curthread;
+ p = td->td_proc;
+
+ CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid,
+ p->p_comm);
+ KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode"));
+ WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode");
+ mtx_assert(&Giant, MA_NOTOWNED);
+ THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
+ td->td_frame = framep;
+ td->td_pticks = 0;
+
+ /*
+ * This reads td_flags for the checks below and clears the
+ * astpending flag in one "atomic" operation. If another AST
+ * is triggered while we are handling the ASTs saved in flags,
+ * the astpending flag will be set again and ast() will be
+ * called again.
+ */
+ thread_lock(td);
+ flags = td->td_flags;
+ td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK |
+ TDF_NEEDRESCHED | TDF_ALRMPEND | TDF_PROFPEND | TDF_MACPEND);
+ thread_unlock(td);
+ PCPU_INC(cnt.v_trap);
+
+ if (td->td_ucred != p->p_ucred)
+ cred_update_thread(td);
+ if (td->td_pflags & TDP_OWEUPC && p->p_flag & P_PROFIL) {
+ addupc_task(td, td->td_profil_addr, td->td_profil_ticks);
+ td->td_profil_ticks = 0;
+ td->td_pflags &= ~TDP_OWEUPC;
+ }
+#ifdef HWPMC_HOOKS
+ /* Handle Software PMC callchain capture. */
+ if (PMC_IS_PENDING_CALLCHAIN(td))
+ PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_USER_CALLCHAIN_SOFT, (void *) framep);
+#endif
+ if (flags & TDF_ALRMPEND) {
+ PROC_LOCK(p);
+ kern_psignal(p, SIGVTALRM);
+ PROC_UNLOCK(p);
+ }
+ if (flags & TDF_PROFPEND) {
+ PROC_LOCK(p);
+ kern_psignal(p, SIGPROF);
+ PROC_UNLOCK(p);
+ }
+#ifdef MAC
+ if (flags & TDF_MACPEND)
+ mac_thread_userret(td);
+#endif
+ if (flags & TDF_NEEDRESCHED) {
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(1, 1, __func__);
+#endif
+ thread_lock(td);
+ sched_prio(td, td->td_user_pri);
+ mi_switch(SW_INVOL | SWT_NEEDRESCHED, NULL);
+ thread_unlock(td);
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CSW))
+ ktrcsw(0, 1, __func__);
+#endif
+ }
+
+ /*
+ * Check for signals. Unlocked reads of p_pendingcnt or
+ * p_siglist might cause a process-directed signal to be handled
+ * later.
+ */
+ if (flags & TDF_NEEDSIGCHK || p->p_pendingcnt > 0 ||
+ !SIGISEMPTY(p->p_siglist)) {
+ PROC_LOCK(p);
+ mtx_lock(&p->p_sigacts->ps_mtx);
+ while ((sig = cursig(td)) != 0)
+ postsig(sig);
+ mtx_unlock(&p->p_sigacts->ps_mtx);
+ PROC_UNLOCK(p);
+ }
+ /*
+ * We need to check to see if we have to exit or wait due to a
+ * single threading requirement or some other STOP condition.
+ */
+ if (flags & TDF_NEEDSUSPCHK) {
+ PROC_LOCK(p);
+ thread_suspend_check(0);
+ PROC_UNLOCK(p);
+ }
+
+ if (td->td_pflags & TDP_OLDMASK) {
+ td->td_pflags &= ~TDP_OLDMASK;
+ kern_sigprocmask(td, SIG_SETMASK, &td->td_oldsigmask, NULL, 0);
+ }
+
+ userret(td, framep);
+}
+
+const char *
+syscallname(struct proc *p, u_int code)
+{
+ static const char unknown[] = "unknown";
+ struct sysentvec *sv;
+
+ sv = p->p_sysent;
+ if (sv->sv_syscallnames == NULL || code >= sv->sv_size)
+ return (unknown);
+ return (sv->sv_syscallnames[code]);
+}
diff --git a/sys/kern/subr_turnstile.c b/sys/kern/subr_turnstile.c
new file mode 100644
index 0000000..0a21ad9
--- /dev/null
+++ b/sys/kern/subr_turnstile.c
@@ -0,0 +1,1308 @@
+/*-
+ * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Berkeley Software Design Inc's name may not be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
+ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
+ */
+
+/*
+ * Implementation of turnstiles used to hold queue of threads blocked on
+ * non-sleepable locks. Sleepable locks use condition variables to
+ * implement their queues. Turnstiles differ from sleep queues in that
+ * turnstile queues are assigned to a lock held by an owning thread. Thus,
+ * when one thread is enqueued onto a turnstile, it can lend its priority
+ * to the owning thread.
+ *
+ * We wish to avoid bloating locks with an embedded turnstile and we do not
+ * want to use back-pointers in the locks for the same reason. Thus, we
+ * use a similar approach to that of Solaris 7 as described in Solaris
+ * Internals by Jim Mauro and Richard McDougall. Turnstiles are looked up
+ * in a hash table based on the address of the lock. Each entry in the
+ * hash table is a linked list of turnstiles and is called a turnstile
+ * chain. Each chain contains a spin mutex that protects all of the
+ * turnstiles in the chain.
+ *
+ * Each time a thread is created, a turnstile is allocated from a UMA zone
+ * and attached to that thread. When a thread blocks on a lock, if it is the
+ * first thread to block, it lends its turnstile to the lock. If the lock
+ * already has a turnstile, then it gives its turnstile to the free list of
+ * the lock's turnstile. When a thread is woken up, it takes a turnstile from
+ * the free list if there are any other waiters. If it is the only thread
+ * blocked on the lock, then it reclaims the turnstile associated with the lock
+ * and removes it from the hash table.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_kdtrace.h"
+#include "opt_turnstile_profiling.h"
+#include "opt_sched.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/sysctl.h>
+#include <sys/turnstile.h>
+
+#include <vm/uma.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#include <sys/lockmgr.h>
+#include <sys/sx.h>
+#endif
+
+/*
+ * Constants for the hash table of turnstile chains. TC_SHIFT is a magic
+ * number chosen because the sleep queues use the same value for the
+ * shift. Basically, we ignore the lower 8 bits of the address.
+ * TC_TABLESIZE must be a power of two for TC_MASK to work properly.
+ */
+#define TC_TABLESIZE 128 /* Must be power of 2. */
+#define TC_MASK (TC_TABLESIZE - 1)
+#define TC_SHIFT 8
+#define TC_HASH(lock) (((uintptr_t)(lock) >> TC_SHIFT) & TC_MASK)
+#define TC_LOOKUP(lock) &turnstile_chains[TC_HASH(lock)]
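+
+/*
+ * For example (purely illustrative address), a lock object at
+ * 0xc0ffee00 hashes to chain ((0xc0ffee00 >> 8) & 127) == 0x6e, so
+ * TC_LOOKUP() on it yields &turnstile_chains[0x6e].
+ */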
+
+/*
+ * There are three different lists of turnstiles as follows. The list
+ * connected by ts_link entries is a per-thread list of all the turnstiles
+ * attached to locks that we own. This is used to fix up our priority when
+ * a lock is released. The other two lists use the ts_hash entries. The
+ * first of these two is the turnstile chain list that a turnstile is on
+ * when it is attached to a lock. The second list to use ts_hash is the
+ * free list hung off of a turnstile that is attached to a lock.
+ *
+ * Each turnstile contains three lists of threads. The two ts_blocked lists
+ * are linked lists of threads blocked on the turnstile's lock. One list is
+ * for exclusive waiters, and the other is for shared waiters. The
+ * ts_pending list is a linked list of threads previously awakened by
+ * turnstile_signal() or turnstile_wait() that are waiting to be put on
+ * the run queue.
+ *
+ * Locking key:
+ * c - turnstile chain lock
+ * q - td_contested lock
+ */
+struct turnstile {
+ struct mtx ts_lock; /* Spin lock for self. */
+ struct threadqueue ts_blocked[2]; /* (c + q) Blocked threads. */
+ struct threadqueue ts_pending; /* (c) Pending threads. */
+ LIST_ENTRY(turnstile) ts_hash; /* (c) Chain and free list. */
+ LIST_ENTRY(turnstile) ts_link; /* (q) Contested locks. */
+ LIST_HEAD(, turnstile) ts_free; /* (c) Free turnstiles. */
+ struct lock_object *ts_lockobj; /* (c) Lock we reference. */
+ struct thread *ts_owner; /* (c + q) Who owns the lock. */
+};
+
+struct turnstile_chain {
+ LIST_HEAD(, turnstile) tc_turnstiles; /* List of turnstiles. */
+ struct mtx tc_lock; /* Spin lock for this chain. */
+#ifdef TURNSTILE_PROFILING
+ u_int tc_depth; /* Length of tc_queues. */
+ u_int tc_max_depth; /* Max length of tc_queues. */
+#endif
+};
+
+#ifdef TURNSTILE_PROFILING
+u_int turnstile_max_depth;
+static SYSCTL_NODE(_debug, OID_AUTO, turnstile, CTLFLAG_RD, 0,
+ "turnstile profiling");
+static SYSCTL_NODE(_debug_turnstile, OID_AUTO, chains, CTLFLAG_RD, 0,
+ "turnstile chain stats");
+SYSCTL_UINT(_debug_turnstile, OID_AUTO, max_depth, CTLFLAG_RD,
+ &turnstile_max_depth, 0, "maximum depth achieved of a single chain");
+#endif
+static struct mtx td_contested_lock;
+static struct turnstile_chain turnstile_chains[TC_TABLESIZE];
+static uma_zone_t turnstile_zone;
+
+/*
+ * Prototypes for non-exported routines.
+ */
+static void init_turnstile0(void *dummy);
+#ifdef TURNSTILE_PROFILING
+static void init_turnstile_profiling(void *arg);
+#endif
+static void propagate_priority(struct thread *td);
+static int turnstile_adjust_thread(struct turnstile *ts,
+ struct thread *td);
+static struct thread *turnstile_first_waiter(struct turnstile *ts);
+static void turnstile_setowner(struct turnstile *ts, struct thread *owner);
+#ifdef INVARIANTS
+static void turnstile_dtor(void *mem, int size, void *arg);
+#endif
+static int turnstile_init(void *mem, int size, int flags);
+static void turnstile_fini(void *mem, int size);
+
+SDT_PROVIDER_DECLARE(sched);
+SDT_PROBE_DEFINE(sched, , , sleep, sleep);
+SDT_PROBE_DEFINE2(sched, , , wakeup, wakeup, "struct thread *",
+ "struct proc *");
+
+/*
+ * Walks the chain of turnstiles and their owners to propagate the priority
+ * of the thread being blocked to all the threads holding locks that have to
+ * release their locks before this thread can run again.
+ */
+static void
+propagate_priority(struct thread *td)
+{
+ struct turnstile *ts;
+ int pri;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ pri = td->td_priority;
+ ts = td->td_blocked;
+ THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
+ /*
+ * Grab a recursive lock on this turnstile chain so it stays locked
+ * for the whole operation. The caller expects us to return with
+ * the original lock held. We only ever lock down the chain so
+ * the lock order is constant.
+ */
+ mtx_lock_spin(&ts->ts_lock);
+ for (;;) {
+ td = ts->ts_owner;
+
+ if (td == NULL) {
+ /*
+ * This might be a read lock with no owner. There's
+ * not much we can do, so just bail.
+ */
+ mtx_unlock_spin(&ts->ts_lock);
+ return;
+ }
+
+ thread_lock_flags(td, MTX_DUPOK);
+ mtx_unlock_spin(&ts->ts_lock);
+ MPASS(td->td_proc != NULL);
+ MPASS(td->td_proc->p_magic == P_MAGIC);
+
+ /*
+ * If the thread is asleep, then we are probably about
+ * to deadlock. To make debugging this easier, show
+ * a backtrace of the misbehaving thread and panic rather than
+ * leave the kernel deadlocked.
+ */
+ if (TD_IS_SLEEPING(td)) {
+ printf(
+ "Sleeping thread (tid %d, pid %d) owns a non-sleepable lock\n",
+ td->td_tid, td->td_proc->p_pid);
+ kdb_backtrace_thread(td);
+ panic("sleeping thread");
+ }
+
+ /*
+ * If this thread already has higher priority than the
+ * thread that is being blocked, we are finished.
+ */
+ if (td->td_priority <= pri) {
+ thread_unlock(td);
+ return;
+ }
+
+ /*
+ * Bump this thread's priority.
+ */
+ sched_lend_prio(td, pri);
+
+ /*
+ * If lock holder is actually running or on the run queue
+ * then we are done.
+ */
+ if (TD_IS_RUNNING(td) || TD_ON_RUNQ(td)) {
+ MPASS(td->td_blocked == NULL);
+ thread_unlock(td);
+ return;
+ }
+
+#ifndef SMP
+ /*
+ * For UP, we check to see if td is curthread (this should
+ * never happen, however, as it would mean we are deadlocked).
+ */
+ KASSERT(td != curthread, ("Deadlock detected"));
+#endif
+
+ /*
+ * If we aren't blocked on a lock, we should be.
+ */
+ KASSERT(TD_ON_LOCK(td), (
+ "thread %d(%s):%d holds %s but isn't blocked on a lock\n",
+ td->td_tid, td->td_name, td->td_state,
+ ts->ts_lockobj->lo_name));
+
+ /*
+ * Pick up the lock that td is blocked on.
+ */
+ ts = td->td_blocked;
+ MPASS(ts != NULL);
+ THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
+ /* Resort td on the list if needed. */
+ if (!turnstile_adjust_thread(ts, td)) {
+ mtx_unlock_spin(&ts->ts_lock);
+ return;
+ }
+ /* The thread lock is released as ts lock above. */
+ }
+}
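+
+/*
+ * Worked example (hypothetical threads and priorities; recall that lower
+ * numeric values mean higher priority): if thread A at priority 80 blocks
+ * on a mutex owned by thread B at priority 120, and B is itself blocked on
+ * a mutex owned by thread C at priority 140, propagate_priority() lends
+ * priority 80 first to B and then to C, so that C can run and eventually
+ * release the lock that B (and hence A) is waiting for.
+ */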
+
+/*
+ * Adjust the thread's position on a turnstile after its priority has been
+ * changed.
+ */
+static int
+turnstile_adjust_thread(struct turnstile *ts, struct thread *td)
+{
+ struct thread *td1, *td2;
+ int queue;
+
+ THREAD_LOCK_ASSERT(td, MA_OWNED);
+ MPASS(TD_ON_LOCK(td));
+
+ /*
+ * This thread may not be blocked on this turnstile anymore
+ * but instead might already be woken up on another CPU
+ * that is waiting on the thread lock in turnstile_unpend() to
+ * finish waking this thread up. We can detect this case
+ * by checking to see if this thread has been given a
+ * turnstile by either turnstile_signal() or
+ * turnstile_broadcast(). In this case, treat the thread as
+ * if it was already running.
+ */
+ if (td->td_turnstile != NULL)
+ return (0);
+
+ /*
+ * Check if the thread needs to be moved on the blocked chain.
+ * It needs to be moved if either its priority is lower than
+ * the previous thread or higher than the next thread.
+ */
+ THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
+ td1 = TAILQ_PREV(td, threadqueue, td_lockq);
+ td2 = TAILQ_NEXT(td, td_lockq);
+ if ((td1 != NULL && td->td_priority < td1->td_priority) ||
+ (td2 != NULL && td->td_priority > td2->td_priority)) {
+
+ /*
+ * Remove thread from blocked chain and determine where
+ * it should be moved to.
+ */
+ queue = td->td_tsqueue;
+ MPASS(queue == TS_EXCLUSIVE_QUEUE || queue == TS_SHARED_QUEUE);
+ mtx_lock_spin(&td_contested_lock);
+ TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq);
+ TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq) {
+ MPASS(td1->td_proc->p_magic == P_MAGIC);
+ if (td1->td_priority > td->td_priority)
+ break;
+ }
+
+ if (td1 == NULL)
+ TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
+ else
+ TAILQ_INSERT_BEFORE(td1, td, td_lockq);
+ mtx_unlock_spin(&td_contested_lock);
+ if (td1 == NULL)
+ CTR3(KTR_LOCK,
+ "turnstile_adjust_thread: td %d put at tail on [%p] %s",
+ td->td_tid, ts->ts_lockobj, ts->ts_lockobj->lo_name);
+ else
+ CTR4(KTR_LOCK,
+ "turnstile_adjust_thread: td %d moved before %d on [%p] %s",
+ td->td_tid, td1->td_tid, ts->ts_lockobj,
+ ts->ts_lockobj->lo_name);
+ }
+ return (1);
+}
+
+/*
+ * Early initialization of turnstiles. This is not done via a SYSINIT()
+ * since this needs to be initialized very early when mutexes are first
+ * initialized.
+ */
+void
+init_turnstiles(void)
+{
+ int i;
+
+ for (i = 0; i < TC_TABLESIZE; i++) {
+ LIST_INIT(&turnstile_chains[i].tc_turnstiles);
+ mtx_init(&turnstile_chains[i].tc_lock, "turnstile chain",
+ NULL, MTX_SPIN);
+ }
+ mtx_init(&td_contested_lock, "td_contested", NULL, MTX_SPIN);
+ LIST_INIT(&thread0.td_contested);
+ thread0.td_turnstile = NULL;
+}
+
+#ifdef TURNSTILE_PROFILING
+static void
+init_turnstile_profiling(void *arg)
+{
+ struct sysctl_oid *chain_oid;
+ char chain_name[10];
+ int i;
+
+ for (i = 0; i < TC_TABLESIZE; i++) {
+ snprintf(chain_name, sizeof(chain_name), "%d", i);
+ chain_oid = SYSCTL_ADD_NODE(NULL,
+ SYSCTL_STATIC_CHILDREN(_debug_turnstile_chains), OID_AUTO,
+ chain_name, CTLFLAG_RD, NULL, "turnstile chain stats");
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
+ "depth", CTLFLAG_RD, &turnstile_chains[i].tc_depth, 0,
+ NULL);
+ SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
+ "max_depth", CTLFLAG_RD, &turnstile_chains[i].tc_max_depth,
+ 0, NULL);
+ }
+}
+SYSINIT(turnstile_profiling, SI_SUB_LOCK, SI_ORDER_ANY,
+ init_turnstile_profiling, NULL);
+#endif
+
+static void
+init_turnstile0(void *dummy)
+{
+
+ turnstile_zone = uma_zcreate("TURNSTILE", sizeof(struct turnstile),
+ NULL,
+#ifdef INVARIANTS
+ turnstile_dtor,
+#else
+ NULL,
+#endif
+ turnstile_init, turnstile_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
+ thread0.td_turnstile = turnstile_alloc();
+}
+SYSINIT(turnstile0, SI_SUB_LOCK, SI_ORDER_ANY, init_turnstile0, NULL);
+
+/*
+ * Update a thread on the turnstile list after its priority has been changed.
+ * The old priority is passed in as an argument.
+ */
+void
+turnstile_adjust(struct thread *td, u_char oldpri)
+{
+ struct turnstile *ts;
+
+ MPASS(TD_ON_LOCK(td));
+
+ /*
+ * Pick up the lock that td is blocked on.
+ */
+ ts = td->td_blocked;
+ MPASS(ts != NULL);
+ THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+
+ /* Resort the turnstile on the list. */
+ if (!turnstile_adjust_thread(ts, td))
+ return;
+ /*
+ * If our priority was lowered and we are at the head of the
+ * turnstile, then propagate our new priority up the chain.
+ * Note that we currently don't try to revoke lent priorities
+ * when our priority goes up.
+ */
+ MPASS(td->td_tsqueue == TS_EXCLUSIVE_QUEUE ||
+ td->td_tsqueue == TS_SHARED_QUEUE);
+ if (td == TAILQ_FIRST(&ts->ts_blocked[td->td_tsqueue]) &&
+ td->td_priority < oldpri) {
+ propagate_priority(td);
+ }
+}
+
+/*
+ * Set the owner of the lock this turnstile is attached to.
+ */
+static void
+turnstile_setowner(struct turnstile *ts, struct thread *owner)
+{
+
+ mtx_assert(&td_contested_lock, MA_OWNED);
+ MPASS(ts->ts_owner == NULL);
+
+ /* A shared lock might not have an owner. */
+ if (owner == NULL)
+ return;
+
+ MPASS(owner->td_proc->p_magic == P_MAGIC);
+ ts->ts_owner = owner;
+ LIST_INSERT_HEAD(&owner->td_contested, ts, ts_link);
+}
+
+#ifdef INVARIANTS
+/*
+ * UMA zone item deallocator.
+ */
+static void
+turnstile_dtor(void *mem, int size, void *arg)
+{
+ struct turnstile *ts;
+
+ ts = mem;
+ MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]));
+ MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]));
+ MPASS(TAILQ_EMPTY(&ts->ts_pending));
+}
+#endif
+
+/*
+ * UMA zone item initializer.
+ */
+static int
+turnstile_init(void *mem, int size, int flags)
+{
+ struct turnstile *ts;
+
+ bzero(mem, size);
+ ts = mem;
+ TAILQ_INIT(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]);
+ TAILQ_INIT(&ts->ts_blocked[TS_SHARED_QUEUE]);
+ TAILQ_INIT(&ts->ts_pending);
+ LIST_INIT(&ts->ts_free);
+ mtx_init(&ts->ts_lock, "turnstile lock", NULL, MTX_SPIN | MTX_RECURSE);
+ return (0);
+}
+
+static void
+turnstile_fini(void *mem, int size)
+{
+ struct turnstile *ts;
+
+ ts = mem;
+ mtx_destroy(&ts->ts_lock);
+}
+
+/*
+ * Get a turnstile for a new thread.
+ */
+struct turnstile *
+turnstile_alloc(void)
+{
+
+ return (uma_zalloc(turnstile_zone, M_WAITOK));
+}
+
+/*
+ * Free a turnstile when a thread is destroyed.
+ */
+void
+turnstile_free(struct turnstile *ts)
+{
+
+ uma_zfree(turnstile_zone, ts);
+}
+
+/*
+ * Lock the turnstile chain associated with the specified lock.
+ */
+void
+turnstile_chain_lock(struct lock_object *lock)
+{
+ struct turnstile_chain *tc;
+
+ tc = TC_LOOKUP(lock);
+ mtx_lock_spin(&tc->tc_lock);
+}
+
+struct turnstile *
+turnstile_trywait(struct lock_object *lock)
+{
+ struct turnstile_chain *tc;
+ struct turnstile *ts;
+
+ tc = TC_LOOKUP(lock);
+ mtx_lock_spin(&tc->tc_lock);
+ LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
+ if (ts->ts_lockobj == lock) {
+ mtx_lock_spin(&ts->ts_lock);
+ return (ts);
+ }
+
+ ts = curthread->td_turnstile;
+ MPASS(ts != NULL);
+ mtx_lock_spin(&ts->ts_lock);
+ KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer"));
+ ts->ts_lockobj = lock;
+
+ return (ts);
+}
+
+void
+turnstile_cancel(struct turnstile *ts)
+{
+ struct turnstile_chain *tc;
+ struct lock_object *lock;
+
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+
+ mtx_unlock_spin(&ts->ts_lock);
+ lock = ts->ts_lockobj;
+ if (ts == curthread->td_turnstile)
+ ts->ts_lockobj = NULL;
+ tc = TC_LOOKUP(lock);
+ mtx_unlock_spin(&tc->tc_lock);
+}
+
+/*
+ * Look up the turnstile for a lock in the hash table, locking the associated
+ * turnstile chain along the way. If no turnstile is found in the hash
+ * table, NULL is returned.
+ */
+struct turnstile *
+turnstile_lookup(struct lock_object *lock)
+{
+ struct turnstile_chain *tc;
+ struct turnstile *ts;
+
+ tc = TC_LOOKUP(lock);
+ mtx_assert(&tc->tc_lock, MA_OWNED);
+ LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
+ if (ts->ts_lockobj == lock) {
+ mtx_lock_spin(&ts->ts_lock);
+ return (ts);
+ }
+ return (NULL);
+}
+
+/*
+ * Unlock the turnstile chain associated with a given lock.
+ */
+void
+turnstile_chain_unlock(struct lock_object *lock)
+{
+ struct turnstile_chain *tc;
+
+ tc = TC_LOOKUP(lock);
+ mtx_unlock_spin(&tc->tc_lock);
+}
+
+/*
+ * Return a pointer to the thread waiting on this turnstile with the
+ * highest priority, or NULL if the turnstile has no waiters.
+ */
+static struct thread *
+turnstile_first_waiter(struct turnstile *ts)
+{
+ struct thread *std, *xtd;
+
+ std = TAILQ_FIRST(&ts->ts_blocked[TS_SHARED_QUEUE]);
+ xtd = TAILQ_FIRST(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]);
+ if (xtd == NULL || (std != NULL && std->td_priority < xtd->td_priority))
+ return (std);
+ return (xtd);
+}
+
+/*
+ * Take ownership of a turnstile and adjust the priority of the new
+ * owner appropriately.
+ */
+void
+turnstile_claim(struct turnstile *ts)
+{
+ struct thread *td, *owner;
+ struct turnstile_chain *tc;
+
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+ MPASS(ts != curthread->td_turnstile);
+
+ owner = curthread;
+ mtx_lock_spin(&td_contested_lock);
+ turnstile_setowner(ts, owner);
+ mtx_unlock_spin(&td_contested_lock);
+
+ td = turnstile_first_waiter(ts);
+ MPASS(td != NULL);
+ MPASS(td->td_proc->p_magic == P_MAGIC);
+ THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
+
+ /*
+ * Update the priority of the new owner if needed.
+ */
+ thread_lock(owner);
+ if (td->td_priority < owner->td_priority)
+ sched_lend_prio(owner, td->td_priority);
+ thread_unlock(owner);
+ tc = TC_LOOKUP(ts->ts_lockobj);
+ mtx_unlock_spin(&ts->ts_lock);
+ mtx_unlock_spin(&tc->tc_lock);
+}
+
+/*
+ * Block the current thread on the turnstile associated with 'lock'. This
+ * function will context switch and not return until this thread has been
+ * woken back up. This function must be called with the appropriate
+ * turnstile chain locked and will return with it unlocked.
+ */
+void
+turnstile_wait(struct turnstile *ts, struct thread *owner, int queue)
+{
+ struct turnstile_chain *tc;
+ struct thread *td, *td1;
+ struct lock_object *lock;
+
+ td = curthread;
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+ if (owner)
+ MPASS(owner->td_proc->p_magic == P_MAGIC);
+ MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
+
+ /*
+ * If the lock does not already have a turnstile, use this thread's
+ * turnstile. Otherwise insert the current thread into the
+ * turnstile already in use by this lock.
+ */
+ tc = TC_LOOKUP(ts->ts_lockobj);
+ mtx_assert(&tc->tc_lock, MA_OWNED);
+ if (ts == td->td_turnstile) {
+#ifdef TURNSTILE_PROFILING
+ tc->tc_depth++;
+ if (tc->tc_depth > tc->tc_max_depth) {
+ tc->tc_max_depth = tc->tc_depth;
+ if (tc->tc_max_depth > turnstile_max_depth)
+ turnstile_max_depth = tc->tc_max_depth;
+ }
+#endif
+ LIST_INSERT_HEAD(&tc->tc_turnstiles, ts, ts_hash);
+ KASSERT(TAILQ_EMPTY(&ts->ts_pending),
+ ("thread's turnstile has pending threads"));
+ KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]),
+ ("thread's turnstile has exclusive waiters"));
+ KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]),
+ ("thread's turnstile has shared waiters"));
+ KASSERT(LIST_EMPTY(&ts->ts_free),
+ ("thread's turnstile has a non-empty free list"));
+ MPASS(ts->ts_lockobj != NULL);
+ mtx_lock_spin(&td_contested_lock);
+ TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
+ turnstile_setowner(ts, owner);
+ mtx_unlock_spin(&td_contested_lock);
+ } else {
+ TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq)
+ if (td1->td_priority > td->td_priority)
+ break;
+ mtx_lock_spin(&td_contested_lock);
+ if (td1 != NULL)
+ TAILQ_INSERT_BEFORE(td1, td, td_lockq);
+ else
+ TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
+ MPASS(owner == ts->ts_owner);
+ mtx_unlock_spin(&td_contested_lock);
+ MPASS(td->td_turnstile != NULL);
+ LIST_INSERT_HEAD(&ts->ts_free, td->td_turnstile, ts_hash);
+ }
+ thread_lock(td);
+ thread_lock_set(td, &ts->ts_lock);
+ td->td_turnstile = NULL;
+
+ /* Save who we are blocked on and switch. */
+ lock = ts->ts_lockobj;
+ td->td_tsqueue = queue;
+ td->td_blocked = ts;
+ td->td_lockname = lock->lo_name;
+ td->td_blktick = ticks;
+ TD_SET_LOCK(td);
+ mtx_unlock_spin(&tc->tc_lock);
+ propagate_priority(td);
+
+ if (LOCK_LOG_TEST(lock, 0))
+ CTR4(KTR_LOCK, "%s: td %d blocked on [%p] %s", __func__,
+ td->td_tid, lock, lock->lo_name);
+
+ SDT_PROBE0(sched, , , sleep);
+
+ THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
+ mi_switch(SW_VOL | SWT_TURNSTILE, NULL);
+
+ if (LOCK_LOG_TEST(lock, 0))
+ CTR4(KTR_LOCK, "%s: td %d free from blocked on [%p] %s",
+ __func__, td->td_tid, lock, lock->lo_name);
+ thread_unlock(td);
+}
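+
+/*
+ * Sketch of how a lock implementation typically drives the blocking side
+ * of this interface (hedged pseudo-code; "lk" and the owner lookup are the
+ * caller's, e.g. the mutex code):
+ *
+ *	ts = turnstile_trywait(&lk->lock_object);
+ *	if (the lock was released while we were getting here) {
+ *		turnstile_cancel(ts);
+ *		retry the fast path;
+ *	}
+ *	turnstile_wait(ts, owner, TS_EXCLUSIVE_QUEUE);
+ */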
+
+/*
+ * Pick the highest priority thread on this turnstile and put it on the
+ * pending list. This must be called with the turnstile chain locked.
+ */
+int
+turnstile_signal(struct turnstile *ts, int queue)
+{
+ struct turnstile_chain *tc;
+ struct thread *td;
+ int empty;
+
+ MPASS(ts != NULL);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+ MPASS(curthread->td_proc->p_magic == P_MAGIC);
+ MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL);
+ MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
+
+ /*
+ * Pick the highest priority thread blocked on this lock and
+ * move it to the pending list.
+ */
+ td = TAILQ_FIRST(&ts->ts_blocked[queue]);
+ MPASS(td->td_proc->p_magic == P_MAGIC);
+ mtx_lock_spin(&td_contested_lock);
+ TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq);
+ mtx_unlock_spin(&td_contested_lock);
+ TAILQ_INSERT_TAIL(&ts->ts_pending, td, td_lockq);
+
+ /*
+ * If the turnstile is now empty, remove it from its chain and
+ * give it to the about-to-be-woken thread. Otherwise take a
+ * turnstile from the free list and give it to the thread.
+ */
+ empty = TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) &&
+ TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]);
+ if (empty) {
+ tc = TC_LOOKUP(ts->ts_lockobj);
+ mtx_assert(&tc->tc_lock, MA_OWNED);
+ MPASS(LIST_EMPTY(&ts->ts_free));
+#ifdef TURNSTILE_PROFILING
+ tc->tc_depth--;
+#endif
+ } else
+ ts = LIST_FIRST(&ts->ts_free);
+ MPASS(ts != NULL);
+ LIST_REMOVE(ts, ts_hash);
+ td->td_turnstile = ts;
+
+ return (empty);
+}
+
+/*
+ * Put all blocked threads on the pending list. This must be called with
+ * the turnstile chain locked.
+ */
+void
+turnstile_broadcast(struct turnstile *ts, int queue)
+{
+ struct turnstile_chain *tc;
+ struct turnstile *ts1;
+ struct thread *td;
+
+ MPASS(ts != NULL);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+ MPASS(curthread->td_proc->p_magic == P_MAGIC);
+ MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL);
+ /*
+ * We must have the chain locked so that we can remove the empty
+ * turnstile from the hash queue.
+ */
+ tc = TC_LOOKUP(ts->ts_lockobj);
+ mtx_assert(&tc->tc_lock, MA_OWNED);
+ MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
+
+ /*
+ * Transfer the blocked list to the pending list.
+ */
+ mtx_lock_spin(&td_contested_lock);
+ TAILQ_CONCAT(&ts->ts_pending, &ts->ts_blocked[queue], td_lockq);
+ mtx_unlock_spin(&td_contested_lock);
+
+ /*
+ * Give a turnstile to each thread. The last thread gets
+ * this turnstile if the turnstile is empty.
+ */
+ TAILQ_FOREACH(td, &ts->ts_pending, td_lockq) {
+ if (LIST_EMPTY(&ts->ts_free)) {
+ MPASS(TAILQ_NEXT(td, td_lockq) == NULL);
+ ts1 = ts;
+#ifdef TURNSTILE_PROFILING
+ tc->tc_depth--;
+#endif
+ } else
+ ts1 = LIST_FIRST(&ts->ts_free);
+ MPASS(ts1 != NULL);
+ LIST_REMOVE(ts1, ts_hash);
+ td->td_turnstile = ts1;
+ }
+}
+
+/*
+ * Wakeup all threads on the pending list and adjust the priority of the
+ * current thread appropriately. This must be called with the turnstile
+ * chain locked.
+ */
+void
+turnstile_unpend(struct turnstile *ts, int owner_type)
+{
+ TAILQ_HEAD( ,thread) pending_threads;
+ struct turnstile *nts;
+ struct thread *td;
+ u_char cp, pri;
+
+ MPASS(ts != NULL);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+ MPASS(ts->ts_owner == curthread || ts->ts_owner == NULL);
+ MPASS(!TAILQ_EMPTY(&ts->ts_pending));
+
+ /*
+ * Move the list of pending threads out of the turnstile and
+ * into a local variable.
+ */
+ TAILQ_INIT(&pending_threads);
+ TAILQ_CONCAT(&pending_threads, &ts->ts_pending, td_lockq);
+#ifdef INVARIANTS
+ if (TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) &&
+ TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]))
+ ts->ts_lockobj = NULL;
+#endif
+ /*
+ * Adjust the priority of curthread based on other contested
+ * locks it owns. Don't lower the priority below the base
+ * priority however.
+ */
+ td = curthread;
+ pri = PRI_MAX;
+ thread_lock(td);
+ mtx_lock_spin(&td_contested_lock);
+ /*
+ * Remove the turnstile from this thread's list of contested locks
+ * since this thread doesn't own it anymore. New threads will
+ * not be blocking on the turnstile until it is claimed by a new
+ * owner. There might not be a current owner if this is a shared
+ * lock.
+ */
+ if (ts->ts_owner != NULL) {
+ ts->ts_owner = NULL;
+ LIST_REMOVE(ts, ts_link);
+ }
+ LIST_FOREACH(nts, &td->td_contested, ts_link) {
+ cp = turnstile_first_waiter(nts)->td_priority;
+ if (cp < pri)
+ pri = cp;
+ }
+ mtx_unlock_spin(&td_contested_lock);
+ sched_unlend_prio(td, pri);
+ thread_unlock(td);
+ /*
+ * Wake up all the pending threads. If a thread is not blocked
+ * on a lock, then it is currently executing on another CPU in
+ * turnstile_wait() or sitting on a run queue waiting to resume
+ * in turnstile_wait(). Set a flag to force it to try to acquire
+ * the lock again instead of blocking.
+ */
+ while (!TAILQ_EMPTY(&pending_threads)) {
+ td = TAILQ_FIRST(&pending_threads);
+ TAILQ_REMOVE(&pending_threads, td, td_lockq);
+ SDT_PROBE2(sched, , , wakeup, td, td->td_proc);
+ thread_lock(td);
+ THREAD_LOCKPTR_ASSERT(td, &ts->ts_lock);
+ MPASS(td->td_proc->p_magic == P_MAGIC);
+ MPASS(TD_ON_LOCK(td));
+ TD_CLR_LOCK(td);
+ MPASS(TD_CAN_RUN(td));
+ td->td_blocked = NULL;
+ td->td_lockname = NULL;
+ td->td_blktick = 0;
+#ifdef INVARIANTS
+ td->td_tsqueue = 0xff;
+#endif
+ sched_add(td, SRQ_BORING);
+ thread_unlock(td);
+ }
+ mtx_unlock_spin(&ts->ts_lock);
+}
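+
+/*
+ * Sketch of the corresponding release side (hedged pseudo-code mirroring
+ * what the mutex/rwlock code does; details vary per lock class):
+ *
+ *	turnstile_chain_lock(&lk->lock_object);
+ *	ts = turnstile_lookup(&lk->lock_object);
+ *	turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
+ *	turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
+ *	turnstile_chain_unlock(&lk->lock_object);
+ */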
+
+/*
+ * Give up ownership of a turnstile. This must be called with the
+ * turnstile chain locked.
+ */
+void
+turnstile_disown(struct turnstile *ts)
+{
+ struct thread *td;
+ u_char cp, pri;
+
+ MPASS(ts != NULL);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+ MPASS(ts->ts_owner == curthread);
+ MPASS(TAILQ_EMPTY(&ts->ts_pending));
+ MPASS(!TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) ||
+ !TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]));
+
+ /*
+ * Remove the turnstile from this thread's list of contested locks
+ * since this thread doesn't own it anymore. New threads will
+ * not be blocking on the turnstile until it is claimed by a new
+ * owner.
+ */
+ mtx_lock_spin(&td_contested_lock);
+ ts->ts_owner = NULL;
+ LIST_REMOVE(ts, ts_link);
+ mtx_unlock_spin(&td_contested_lock);
+
+ /*
+ * Adjust the priority of curthread based on other contested
+ * locks it owns. Don't lower the priority below the base
+ * priority however.
+ */
+ td = curthread;
+ pri = PRI_MAX;
+ thread_lock(td);
+ mtx_unlock_spin(&ts->ts_lock);
+ mtx_lock_spin(&td_contested_lock);
+ LIST_FOREACH(ts, &td->td_contested, ts_link) {
+ cp = turnstile_first_waiter(ts)->td_priority;
+ if (cp < pri)
+ pri = cp;
+ }
+ mtx_unlock_spin(&td_contested_lock);
+ sched_unlend_prio(td, pri);
+ thread_unlock(td);
+}
+
+/*
+ * Return the first thread in a turnstile.
+ */
+struct thread *
+turnstile_head(struct turnstile *ts, int queue)
+{
+#ifdef INVARIANTS
+
+ MPASS(ts != NULL);
+ MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+#endif
+ return (TAILQ_FIRST(&ts->ts_blocked[queue]));
+}
+
+/*
+ * Returns true if a sub-queue of a turnstile is empty.
+ */
+int
+turnstile_empty(struct turnstile *ts, int queue)
+{
+#ifdef INVARIANTS
+
+ MPASS(ts != NULL);
+ MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
+ mtx_assert(&ts->ts_lock, MA_OWNED);
+#endif
+ return (TAILQ_EMPTY(&ts->ts_blocked[queue]));
+}
+
+#ifdef DDB
+static void
+print_thread(struct thread *td, const char *prefix)
+{
+
+ db_printf("%s%p (tid %d, pid %d, \"%s\")\n", prefix, td, td->td_tid,
+ td->td_proc->p_pid, td->td_name[0] != '\0' ? td->td_name :
+ td->td_name);
+}
+
+static void
+print_queue(struct threadqueue *queue, const char *header, const char *prefix)
+{
+ struct thread *td;
+
+ db_printf("%s:\n", header);
+ if (TAILQ_EMPTY(queue)) {
+ db_printf("%sempty\n", prefix);
+ return;
+ }
+ TAILQ_FOREACH(td, queue, td_lockq) {
+ print_thread(td, prefix);
+ }
+}
+
+DB_SHOW_COMMAND(turnstile, db_show_turnstile)
+{
+ struct turnstile_chain *tc;
+ struct turnstile *ts;
+ struct lock_object *lock;
+ int i;
+
+ if (!have_addr)
+ return;
+
+ /*
+ * First, see if there is an active turnstile for the lock indicated
+ * by the address.
+ */
+ lock = (struct lock_object *)addr;
+ tc = TC_LOOKUP(lock);
+ LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
+ if (ts->ts_lockobj == lock)
+ goto found;
+
+ /*
+ * Second, see if there is an active turnstile at the address
+ * indicated.
+ */
+ for (i = 0; i < TC_TABLESIZE; i++)
+ LIST_FOREACH(ts, &turnstile_chains[i].tc_turnstiles, ts_hash) {
+ if (ts == (struct turnstile *)addr)
+ goto found;
+ }
+
+ db_printf("Unable to locate a turnstile via %p\n", (void *)addr);
+ return;
+found:
+ lock = ts->ts_lockobj;
+ db_printf("Lock: %p - (%s) %s\n", lock, LOCK_CLASS(lock)->lc_name,
+ lock->lo_name);
+ if (ts->ts_owner)
+ print_thread(ts->ts_owner, "Lock Owner: ");
+ else
+ db_printf("Lock Owner: none\n");
+ print_queue(&ts->ts_blocked[TS_SHARED_QUEUE], "Shared Waiters", "\t");
+ print_queue(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE], "Exclusive Waiters",
+ "\t");
+ print_queue(&ts->ts_pending, "Pending Threads", "\t");
+
+}
+
+/*
+ * Show all the threads a particular thread is waiting on based on
+ * non-sleepable and non-spin locks.
+ */
+static void
+print_lockchain(struct thread *td, const char *prefix)
+{
+ struct lock_object *lock;
+ struct lock_class *class;
+ struct turnstile *ts;
+
+ /*
+ * Follow the chain. We keep walking as long as the thread is
+ * blocked on a turnstile that has an owner.
+ */
+ while (!db_pager_quit) {
+ db_printf("%sthread %d (pid %d, %s) ", prefix, td->td_tid,
+ td->td_proc->p_pid, td->td_name[0] != '\0' ? td->td_name :
+ td->td_name);
+ switch (td->td_state) {
+ case TDS_INACTIVE:
+ db_printf("is inactive\n");
+ return;
+ case TDS_CAN_RUN:
+ db_printf("can run\n");
+ return;
+ case TDS_RUNQ:
+ db_printf("is on a run queue\n");
+ return;
+ case TDS_RUNNING:
+ db_printf("running on CPU %d\n", td->td_oncpu);
+ return;
+ case TDS_INHIBITED:
+ if (TD_ON_LOCK(td)) {
+ ts = td->td_blocked;
+ lock = ts->ts_lockobj;
+ class = LOCK_CLASS(lock);
+ db_printf("blocked on lock %p (%s) \"%s\"\n",
+ lock, class->lc_name, lock->lo_name);
+ if (ts->ts_owner == NULL)
+ return;
+ td = ts->ts_owner;
+ break;
+ }
+ db_printf("inhibited\n");
+ return;
+ default:
+ db_printf("??? (%#x)\n", td->td_state);
+ return;
+ }
+ }
+}
+
+DB_SHOW_COMMAND(lockchain, db_show_lockchain)
+{
+ struct thread *td;
+
+ /* Figure out which thread to start with. */
+ if (have_addr)
+ td = db_lookup_thread(addr, TRUE);
+ else
+ td = kdb_thread;
+
+ print_lockchain(td, "");
+}
+
+DB_SHOW_ALL_COMMAND(chains, db_show_allchains)
+{
+ struct thread *td;
+ struct proc *p;
+ int i;
+
+ i = 1;
+ FOREACH_PROC_IN_SYSTEM(p) {
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (TD_ON_LOCK(td) && LIST_EMPTY(&td->td_contested)) {
+ db_printf("chain %d:\n", i++);
+ print_lockchain(td, " ");
+ }
+ if (db_pager_quit)
+ return;
+ }
+ }
+}
+DB_SHOW_ALIAS(allchains, db_show_allchains)
+
+/*
+ * Show all the threads a particular thread is waiting on based on
+ * sleepable locks.
+ */
+static void
+print_sleepchain(struct thread *td, const char *prefix)
+{
+ struct thread *owner;
+
+ /*
+ * Follow the chain. We keep walking as long as the thread is
+ * blocked on a sleep lock that has an owner.
+ */
+ while (!db_pager_quit) {
+ db_printf("%sthread %d (pid %d, %s) ", prefix, td->td_tid,
+ td->td_proc->p_pid, td->td_name[0] != '\0' ? td->td_name :
+ td->td_name);
+ switch (td->td_state) {
+ case TDS_INACTIVE:
+ db_printf("is inactive\n");
+ return;
+ case TDS_CAN_RUN:
+ db_printf("can run\n");
+ return;
+ case TDS_RUNQ:
+ db_printf("is on a run queue\n");
+ return;
+ case TDS_RUNNING:
+ db_printf("running on CPU %d\n", td->td_oncpu);
+ return;
+ case TDS_INHIBITED:
+ if (TD_ON_SLEEPQ(td)) {
+ if (lockmgr_chain(td, &owner) ||
+ sx_chain(td, &owner)) {
+ if (owner == NULL)
+ return;
+ td = owner;
+ break;
+ }
+ db_printf("sleeping on %p \"%s\"\n",
+ td->td_wchan, td->td_wmesg);
+ return;
+ }
+ db_printf("inhibited\n");
+ return;
+ default:
+ db_printf("??? (%#x)\n", td->td_state);
+ return;
+ }
+ }
+}
+
+DB_SHOW_COMMAND(sleepchain, db_show_sleepchain)
+{
+ struct thread *td;
+
+ /* Figure out which thread to start with. */
+ if (have_addr)
+ td = db_lookup_thread(addr, TRUE);
+ else
+ td = kdb_thread;
+
+ print_sleepchain(td, "");
+}
+
+static void print_waiters(struct turnstile *ts, int indent);
+
+static void
+print_waiter(struct thread *td, int indent)
+{
+ struct turnstile *ts;
+ int i;
+
+ if (db_pager_quit)
+ return;
+ for (i = 0; i < indent; i++)
+ db_printf(" ");
+ print_thread(td, "thread ");
+ LIST_FOREACH(ts, &td->td_contested, ts_link)
+ print_waiters(ts, indent + 1);
+}
+
+static void
+print_waiters(struct turnstile *ts, int indent)
+{
+ struct lock_object *lock;
+ struct lock_class *class;
+ struct thread *td;
+ int i;
+
+ if (db_pager_quit)
+ return;
+ lock = ts->ts_lockobj;
+ class = LOCK_CLASS(lock);
+ for (i = 0; i < indent; i++)
+ db_printf(" ");
+ db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name);
+ TAILQ_FOREACH(td, &ts->ts_blocked[TS_EXCLUSIVE_QUEUE], td_lockq)
+ print_waiter(td, indent + 1);
+ TAILQ_FOREACH(td, &ts->ts_blocked[TS_SHARED_QUEUE], td_lockq)
+ print_waiter(td, indent + 1);
+ TAILQ_FOREACH(td, &ts->ts_pending, td_lockq)
+ print_waiter(td, indent + 1);
+}
+
+DB_SHOW_COMMAND(locktree, db_show_locktree)
+{
+ struct lock_object *lock;
+ struct lock_class *class;
+ struct turnstile_chain *tc;
+ struct turnstile *ts;
+
+ if (!have_addr)
+ return;
+ lock = (struct lock_object *)addr;
+ tc = TC_LOOKUP(lock);
+ LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
+ if (ts->ts_lockobj == lock)
+ break;
+ if (ts == NULL) {
+ class = LOCK_CLASS(lock);
+ db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name,
+ lock->lo_name);
+ } else
+ print_waiters(ts, 0);
+}
+#endif
diff --git a/sys/kern/subr_uio.c b/sys/kern/subr_uio.c
new file mode 100644
index 0000000..53f87c0
--- /dev/null
+++ b/sys/kern/subr_uio.c
@@ -0,0 +1,611 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_zero.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mman.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_map.h>
+#ifdef SOCKET_SEND_COW
+#include <vm/vm_object.h>
+#endif
+
+SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
+ "Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");
+
+static int uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault);
+
+#ifdef SOCKET_SEND_COW
+/* Declared in uipc_socket.c */
+extern int so_zero_copy_receive;
+
+/*
+ * Identify the physical page mapped at the given kernel virtual
+ * address. Insert this physical page into the given address space at
+ * the given virtual address, replacing the physical page, if any,
+ * that already exists there.
+ */
+static int
+vm_pgmoveco(vm_map_t mapa, vm_offset_t kaddr, vm_offset_t uaddr)
+{
+ vm_map_t map = mapa;
+ vm_page_t kern_pg, user_pg;
+ vm_object_t uobject;
+ vm_map_entry_t entry;
+ vm_pindex_t upindex;
+ vm_prot_t prot;
+ boolean_t wired;
+
+ KASSERT((uaddr & PAGE_MASK) == 0,
+ ("vm_pgmoveco: uaddr is not page aligned"));
+
+ /*
+ * Herein the physical page is validated and dirtied. It is
+ * unwired in sf_buf_mext().
+ */
+ kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
+ kern_pg->valid = VM_PAGE_BITS_ALL;
+ KASSERT(kern_pg->queue == PQ_NONE && kern_pg->wire_count == 1,
+ ("vm_pgmoveco: kern_pg is not correctly wired"));
+
+ if ((vm_map_lookup(&map, uaddr,
+ VM_PROT_WRITE, &entry, &uobject,
+ &upindex, &prot, &wired)) != KERN_SUCCESS) {
+ return(EFAULT);
+ }
+ VM_OBJECT_WLOCK(uobject);
+retry:
+ if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
+ if (vm_page_sleep_if_busy(user_pg, "vm_pgmoveco"))
+ goto retry;
+ vm_page_lock(user_pg);
+ pmap_remove_all(user_pg);
+ vm_page_free(user_pg);
+ vm_page_unlock(user_pg);
+ } else {
+ /*
+ * Even if a physical page does not exist in the
+ * object chain's first object, a physical page from a
+ * backing object may be mapped read only.
+ */
+ if (uobject->backing_object != NULL)
+ pmap_remove(map->pmap, uaddr, uaddr + PAGE_SIZE);
+ }
+ if (vm_page_insert(kern_pg, uobject, upindex)) {
+ VM_OBJECT_WUNLOCK(uobject);
+ VM_WAIT;
+ VM_OBJECT_WLOCK(uobject);
+ goto retry;
+ }
+ vm_page_dirty(kern_pg);
+ VM_OBJECT_WUNLOCK(uobject);
+ vm_map_lookup_done(map, entry);
+ return(KERN_SUCCESS);
+}
+#endif /* SOCKET_SEND_COW */
+
+int
+copyin_nofault(const void *udaddr, void *kaddr, size_t len)
+{
+ int error, save;
+
+ save = vm_fault_disable_pagefaults();
+ error = copyin(udaddr, kaddr, len);
+ vm_fault_enable_pagefaults(save);
+ return (error);
+}
+
+int
+copyout_nofault(const void *kaddr, void *udaddr, size_t len)
+{
+ int error, save;
+
+ save = vm_fault_disable_pagefaults();
+ error = copyout(kaddr, udaddr, len);
+ vm_fault_enable_pagefaults(save);
+ return (error);
+}
+
+#define PHYS_PAGE_COUNT(len) (howmany(len, PAGE_SIZE) + 1)
+
+int
+physcopyin(void *src, vm_paddr_t dst, size_t len)
+{
+ vm_page_t m[PHYS_PAGE_COUNT(len)];
+ struct iovec iov[1];
+ struct uio uio;
+ int i;
+
+ iov[0].iov_base = src;
+ iov[0].iov_len = len;
+ uio.uio_iov = iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = 0;
+ uio.uio_resid = len;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_rw = UIO_WRITE;
+ for (i = 0; i < PHYS_PAGE_COUNT(len); i++, dst += PAGE_SIZE)
+ m[i] = PHYS_TO_VM_PAGE(dst);
+ return (uiomove_fromphys(m, dst & PAGE_MASK, len, &uio));
+}
+
+int
+physcopyout(vm_paddr_t src, void *dst, size_t len)
+{
+ vm_page_t m[PHYS_PAGE_COUNT(len)];
+ struct iovec iov[1];
+ struct uio uio;
+ int i;
+
+ iov[0].iov_base = dst;
+ iov[0].iov_len = len;
+ uio.uio_iov = iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = 0;
+ uio.uio_resid = len;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_rw = UIO_READ;
+ for (i = 0; i < PHYS_PAGE_COUNT(len); i++, src += PAGE_SIZE)
+ m[i] = PHYS_TO_VM_PAGE(src);
+ return (uiomove_fromphys(m, src & PAGE_MASK, len, &uio));
+}
+
+#undef PHYS_PAGE_COUNT
+
+int
+uiomove(void *cp, int n, struct uio *uio)
+{
+
+ return (uiomove_faultflag(cp, n, uio, 0));
+}
+
+int
+uiomove_nofault(void *cp, int n, struct uio *uio)
+{
+
+ return (uiomove_faultflag(cp, n, uio, 1));
+}
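+
+/*
+ * Typical use (illustrative only; "sc" is a hypothetical softc exposing a
+ * kernel buffer sc_buf of sc_len bytes) from a character device read
+ * routine:
+ *
+ *	while (uio->uio_resid > 0 && uio->uio_offset < sc->sc_len) {
+ *		n = MIN(uio->uio_resid, sc->sc_len - uio->uio_offset);
+ *		error = uiomove(sc->sc_buf + uio->uio_offset, n, uio);
+ *		if (error != 0)
+ *			break;
+ *	}
+ *
+ * uiomove() itself advances uio_offset and uio_resid as it copies.
+ */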
+
+static int
+uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault)
+{
+ struct thread *td;
+ struct iovec *iov;
+ size_t cnt;
+ int error, newflags, save;
+
+ td = curthread;
+ error = 0;
+
+ KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
+ ("uiomove: mode"));
+ KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == td,
+ ("uiomove proc"));
+ if (!nofault)
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "Calling uiomove()");
+
+ /* XXX does it make sense to set TDP_DEADLKTREAT for UIO_SYSSPACE? */
+ newflags = TDP_DEADLKTREAT;
+ if (uio->uio_segflg == UIO_USERSPACE && nofault) {
+ /*
+ * Fail if a non-spurious page fault occurs.
+ */
+ newflags |= TDP_NOFAULTING | TDP_RESETSPUR;
+ }
+ save = curthread_pflags_set(newflags);
+
+ while (n > 0 && uio->uio_resid) {
+ iov = uio->uio_iov;
+ cnt = iov->iov_len;
+ if (cnt == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ continue;
+ }
+ if (cnt > n)
+ cnt = n;
+
+ switch (uio->uio_segflg) {
+
+ case UIO_USERSPACE:
+ maybe_yield();
+ if (uio->uio_rw == UIO_READ)
+ error = copyout(cp, iov->iov_base, cnt);
+ else
+ error = copyin(iov->iov_base, cp, cnt);
+ if (error)
+ goto out;
+ break;
+
+ case UIO_SYSSPACE:
+ if (uio->uio_rw == UIO_READ)
+ bcopy(cp, iov->iov_base, cnt);
+ else
+ bcopy(iov->iov_base, cp, cnt);
+ break;
+ case UIO_NOCOPY:
+ break;
+ }
+ iov->iov_base = (char *)iov->iov_base + cnt;
+ iov->iov_len -= cnt;
+ uio->uio_resid -= cnt;
+ uio->uio_offset += cnt;
+ cp = (char *)cp + cnt;
+ n -= cnt;
+ }
+out:
+ curthread_pflags_restore(save);
+ return (error);
+}
+
+/*
+ * Wrapper for uiomove() that validates the arguments against a known-good
+ * kernel buffer. Currently, uiomove accepts a signed (n) argument, which
+ * is almost definitely a bad thing, so we catch that here as well. We
+ * return a runtime failure, but it might be desirable to generate a runtime
+ * assertion failure instead.
+ */
+int
+uiomove_frombuf(void *buf, int buflen, struct uio *uio)
+{
+ size_t offset, n;
+
+ if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
+ (offset = uio->uio_offset) != uio->uio_offset)
+ return (EINVAL);
+ if (buflen <= 0 || offset >= buflen)
+ return (0);
+ if ((n = buflen - offset) > IOSIZE_MAX)
+ return (EINVAL);
+ return (uiomove((char *)buf + offset, n, uio));
+}
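
As a quick illustration of the intended use of uiomove_frombuf(): a character-device
read routine that exposes a fixed kernel buffer can delegate all offset and length
handling to it. This is only a sketch; hello_msg, hello_read() and the cdevsw glue
are hypothetical, not part of this change.

    /* Hypothetical d_read method built on uiomove_frombuf(). */
    static char hello_msg[] = "hello from the kernel\n";

    static int
    hello_read(struct cdev *dev, struct uio *uio, int ioflag)
    {

            /* Offset clamping and length validation happen in the wrapper. */
            return (uiomove_frombuf(hello_msg, sizeof(hello_msg) - 1, uio));
    }
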
+
+#ifdef SOCKET_RECV_PFLIP
+/*
+ * Experimental support for zero-copy I/O
+ */
+static int
+userspaceco(void *cp, u_int cnt, struct uio *uio, int disposable)
+{
+ struct iovec *iov;
+ int error;
+
+ iov = uio->uio_iov;
+ if (uio->uio_rw == UIO_READ) {
+ if ((so_zero_copy_receive != 0)
+ && ((cnt & PAGE_MASK) == 0)
+ && ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0)
+ && ((uio->uio_offset & PAGE_MASK) == 0)
+ && ((((intptr_t) cp) & PAGE_MASK) == 0)
+ && (disposable != 0)) {
+ /* SOCKET: use page-trading */
+ /*
+ * We only want to call vm_pgmoveco() on
+ * disposable pages, since it gives the
+ * kernel page to the userland process.
+ */
+ error = vm_pgmoveco(&curproc->p_vmspace->vm_map,
+ (vm_offset_t)cp, (vm_offset_t)iov->iov_base);
+
+ /*
+ * If we get an error back, attempt
+ * to use copyout() instead. The
+ * disposable page should be freed
+ * automatically if we weren't able to move
+ * it into userland.
+ */
+ if (error != 0)
+ error = copyout(cp, iov->iov_base, cnt);
+ } else {
+ error = copyout(cp, iov->iov_base, cnt);
+ }
+ } else {
+ error = copyin(iov->iov_base, cp, cnt);
+ }
+ return (error);
+}
+
+int
+uiomoveco(void *cp, int n, struct uio *uio, int disposable)
+{
+ struct iovec *iov;
+ u_int cnt;
+ int error;
+
+ KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
+ ("uiomoveco: mode"));
+ KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
+ ("uiomoveco proc"));
+
+ while (n > 0 && uio->uio_resid) {
+ iov = uio->uio_iov;
+ cnt = iov->iov_len;
+ if (cnt == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ continue;
+ }
+ if (cnt > n)
+ cnt = n;
+
+ switch (uio->uio_segflg) {
+
+ case UIO_USERSPACE:
+ maybe_yield();
+ error = userspaceco(cp, cnt, uio, disposable);
+ if (error)
+ return (error);
+ break;
+
+ case UIO_SYSSPACE:
+ if (uio->uio_rw == UIO_READ)
+ bcopy(cp, iov->iov_base, cnt);
+ else
+ bcopy(iov->iov_base, cp, cnt);
+ break;
+ case UIO_NOCOPY:
+ break;
+ }
+ iov->iov_base = (char *)iov->iov_base + cnt;
+ iov->iov_len -= cnt;
+ uio->uio_resid -= cnt;
+ uio->uio_offset += cnt;
+ cp = (char *)cp + cnt;
+ n -= cnt;
+ }
+ return (0);
+}
+#endif /* SOCKET_RECV_PFLIP */
+
+/*
+ * Give next character to user as result of read.
+ */
+int
+ureadc(int c, struct uio *uio)
+{
+ struct iovec *iov;
+ char *iov_base;
+
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "Calling ureadc()");
+
+again:
+ if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
+ panic("ureadc");
+ iov = uio->uio_iov;
+ if (iov->iov_len == 0) {
+ uio->uio_iovcnt--;
+ uio->uio_iov++;
+ goto again;
+ }
+ switch (uio->uio_segflg) {
+
+ case UIO_USERSPACE:
+ if (subyte(iov->iov_base, c) < 0)
+ return (EFAULT);
+ break;
+
+ case UIO_SYSSPACE:
+ iov_base = iov->iov_base;
+ *iov_base = c;
+ break;
+
+ case UIO_NOCOPY:
+ break;
+ }
+ iov->iov_base = (char *)iov->iov_base + 1;
+ iov->iov_len--;
+ uio->uio_resid--;
+ uio->uio_offset++;
+ return (0);
+}
+
+int
+copyinfrom(const void * __restrict src, void * __restrict dst, size_t len,
+ int seg)
+{
+ int error = 0;
+
+ switch (seg) {
+ case UIO_USERSPACE:
+ error = copyin(src, dst, len);
+ break;
+ case UIO_SYSSPACE:
+ bcopy(src, dst, len);
+ break;
+ default:
+ panic("copyinfrom: bad seg %d\n", seg);
+ }
+ return (error);
+}
+
+int
+copyinstrfrom(const void * __restrict src, void * __restrict dst, size_t len,
+ size_t * __restrict copied, int seg)
+{
+ int error = 0;
+
+ switch (seg) {
+ case UIO_USERSPACE:
+ error = copyinstr(src, dst, len, copied);
+ break;
+ case UIO_SYSSPACE:
+ error = copystr(src, dst, len, copied);
+ break;
+ default:
+ panic("copyinstrfrom: bad seg %d\n", seg);
+ }
+ return (error);
+}
+
+int
+copyiniov(const struct iovec *iovp, u_int iovcnt, struct iovec **iov, int error)
+{
+ u_int iovlen;
+
+ *iov = NULL;
+ if (iovcnt > UIO_MAXIOV)
+ return (error);
+ iovlen = iovcnt * sizeof (struct iovec);
+ *iov = malloc(iovlen, M_IOV, M_WAITOK);
+ error = copyin(iovp, *iov, iovlen);
+ if (error) {
+ free(*iov, M_IOV);
+ *iov = NULL;
+ }
+ return (error);
+}
+
+int
+copyinuio(const struct iovec *iovp, u_int iovcnt, struct uio **uiop)
+{
+ struct iovec *iov;
+ struct uio *uio;
+ u_int iovlen;
+ int error, i;
+
+ *uiop = NULL;
+ if (iovcnt > UIO_MAXIOV)
+ return (EINVAL);
+ iovlen = iovcnt * sizeof (struct iovec);
+ uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
+ iov = (struct iovec *)(uio + 1);
+ error = copyin(iovp, iov, iovlen);
+ if (error) {
+ free(uio, M_IOV);
+ return (error);
+ }
+ uio->uio_iov = iov;
+ uio->uio_iovcnt = iovcnt;
+ uio->uio_segflg = UIO_USERSPACE;
+ uio->uio_offset = -1;
+ uio->uio_resid = 0;
+ for (i = 0; i < iovcnt; i++) {
+ if (iov->iov_len > IOSIZE_MAX - uio->uio_resid) {
+ free(uio, M_IOV);
+ return (EINVAL);
+ }
+ uio->uio_resid += iov->iov_len;
+ iov++;
+ }
+ *uiop = uio;
+ return (0);
+}
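
To show how copyinuio() is meant to be consumed, here is a hedged sketch of a
readv-style handler; foo_readv() and struct foo_readv_args are hypothetical and the
actual I/O call is elided. The important detail is that the uio and its iovec array
live in a single M_IOV allocation, so one free() releases both.

    static int
    foo_readv(struct thread *td, struct foo_readv_args *uap)
    {
            struct uio *auio;
            int error;

            error = copyinuio(uap->iovp, uap->iovcnt, &auio);
            if (error != 0)
                    return (error);
            /* copyinuio() already set uio_iov/iovcnt/resid, UIO_USERSPACE, offset -1. */
            auio->uio_rw = UIO_READ;
            auio->uio_td = td;
            /* ... hand auio to the I/O path ... */
            free(auio, M_IOV);      /* frees the uio and its iovec array together */
            return (error);
    }
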
+
+struct uio *
+cloneuio(struct uio *uiop)
+{
+ struct uio *uio;
+ int iovlen;
+
+ iovlen = uiop->uio_iovcnt * sizeof (struct iovec);
+ uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
+ *uio = *uiop;
+ uio->uio_iov = (struct iovec *)(uio + 1);
+ bcopy(uiop->uio_iov, uio->uio_iov, iovlen);
+ return (uio);
+}
+
+/*
+ * Map some anonymous memory in user space of size sz, rounded up to the page
+ * boundary.
+ */
+int
+copyout_map(struct thread *td, vm_offset_t *addr, size_t sz)
+{
+ struct vmspace *vms;
+ int error;
+ vm_size_t size;
+
+ vms = td->td_proc->p_vmspace;
+
+ /*
+ * Map somewhere after the heap in process memory.
+ */
+ PROC_LOCK(td->td_proc);
+ *addr = round_page((vm_offset_t)vms->vm_daddr +
+ lim_max(td->td_proc, RLIMIT_DATA));
+ PROC_UNLOCK(td->td_proc);
+
+ /* round size up to page boundary */
+ size = (vm_size_t)round_page(sz);
+
+ error = vm_mmap(&vms->vm_map, addr, size, PROT_READ | PROT_WRITE,
+ VM_PROT_ALL, MAP_PRIVATE | MAP_ANON, OBJT_DEFAULT, NULL, 0);
+
+ return (error);
+}
+
+/*
+ * Unmap memory in user space.
+ */
+int
+copyout_unmap(struct thread *td, vm_offset_t addr, size_t sz)
+{
+ vm_map_t map;
+ vm_size_t size;
+
+ if (sz == 0)
+ return (0);
+
+ map = &td->td_proc->p_vmspace->vm_map;
+ size = (vm_size_t)round_page(sz);
+
+ if (vm_map_remove(map, addr, addr + size) != KERN_SUCCESS)
+ return (EINVAL);
+
+ return (0);
+}
diff --git a/sys/kern/subr_unit.c b/sys/kern/subr_unit.c
new file mode 100644
index 0000000..3bf7aaf
--- /dev/null
+++ b/sys/kern/subr_unit.c
@@ -0,0 +1,1015 @@
+/*-
+ * Copyright (c) 2004 Poul-Henning Kamp
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ *
+ * Unit number allocation functions.
+ *
+ * These functions implement a mixed run-length/bitmap management of unit
+ * number spaces in a very memory efficient manner.
+ *
+ * Allocation policy is always lowest free number first.
+ *
+ * A return value of -1 signals that no more unit numbers are available.
+ *
+ * There is no cost associated with the range of unit numbers, so unless
+ * the resource really is finite, specify INT_MAX to new_unrhdr() and
+ * forget about checking the return value.
+ *
+ * If a mutex is not provided when the unit number space is created, a
+ * default global mutex is used. The advantage of passing a mutex in is
+ * that the alloc_unrl() function can be called with the mutex already
+ * held (it will not be released by alloc_unrl()).
+ *
+ * The allocation function alloc_unr{l}() never sleeps (but it may block on
+ * the mutex of course).
+ *
+ * Freeing a unit number may require allocating memory, and can therefore
+ * sleep, so the free_unr() function does not come in a pre-locked variant.
+ *
+ * A userland test program is included.
+ *
+ * Memory usage is a very complex function of the exact allocation
+ * pattern, but always very compact:
+ * * For the very typical case where a single unbroken run of unit
+ * numbers is allocated, 44 bytes are used on i386.
+ * * For a unit number space of 1000 units and the random pattern
+ * in the usermode test program included, the worst case usage
+ * was 252 bytes on i386 for 500 allocated and 500 free units.
+ * * For a unit number space of 10000 units and the random pattern
+ * in the usermode test program included, the worst case usage
+ * was 798 bytes on i386 for 5000 allocated and 5000 free units.
+ * * The worst case is where every other unit number is allocated and
+ * the rest are free. In that case 44 + N/4 bytes are used where
+ * N is the number of the highest unit allocated.
+ */
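
A minimal usage sketch of the API described above, assuming a hypothetical consumer
(foo_units, foo_units_demo()); it passes NULL to new_unrhdr() to use the default
global mutex and checks alloc_unr() for -1 as described.

    static struct unrhdr *foo_units;    /* hypothetical consumer */

    static void
    foo_units_demo(void)
    {
            int u;

            foo_units = new_unrhdr(0, INT_MAX, NULL);   /* default global mutex */
            u = alloc_unr(foo_units);   /* lowest free unit, or -1 when exhausted */
            if (u != -1)
                    free_unr(foo_units, u); /* may sleep: it can allocate memory */
            delete_unrhdr(foo_units);       /* legal only once every unit is free */
    }
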
+
+#include <sys/types.h>
+#include <sys/bitstring.h>
+#include <sys/_unrhdr.h>
+
+#ifdef _KERNEL
+
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
+/*
+ * In theory it would be smarter to allocate the individual blocks
+ * with the zone allocator, but at this time the expectation is that
+ * there will typically not even be enough allocations to fill a single
+ * page, so we stick with malloc for now.
+ */
+static MALLOC_DEFINE(M_UNIT, "Unitno", "Unit number allocation");
+
+#define Malloc(foo) malloc(foo, M_UNIT, M_WAITOK | M_ZERO)
+#define Free(foo) free(foo, M_UNIT)
+
+static struct mtx unitmtx;
+
+MTX_SYSINIT(unit, &unitmtx, "unit# allocation", MTX_DEF);
+
+#else /* ...USERLAND */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define KASSERT(cond, arg) \
+ do { \
+ if (!(cond)) { \
+ printf arg; \
+ abort(); \
+ } \
+ } while (0)
+
+static int no_alloc;
+#define Malloc(foo) _Malloc(foo, __LINE__)
+static void *
+_Malloc(size_t foo, int line)
+{
+
+ KASSERT(no_alloc == 0, ("malloc in wrong place() line %d", line));
+ return (calloc(foo, 1));
+}
+#define Free(foo) free(foo)
+
+struct unrhdr;
+
+
+struct mtx {
+ int state;
+} unitmtx;
+
+static void
+mtx_lock(struct mtx *mp)
+{
+ KASSERT(mp->state == 0, ("mutex already locked"));
+ mp->state = 1;
+}
+
+static void
+mtx_unlock(struct mtx *mp)
+{
+ KASSERT(mp->state == 1, ("mutex not locked"));
+ mp->state = 0;
+}
+
+#define MA_OWNED 9
+
+static void
+mtx_assert(struct mtx *mp, int flag)
+{
+ if (flag == MA_OWNED) {
+ KASSERT(mp->state == 1, ("mtx_assert(MA_OWNED) not true"));
+ }
+}
+
+#define CTASSERT(foo)
+#define WITNESS_WARN(flags, lock, fmt, ...) (void)0
+
+#endif /* USERLAND */
+
+/*
+ * This is our basic building block.
+ *
+ * It can be used in three different ways depending on the value of the ptr
+ * element:
+ * If ptr is NULL, it represents a run of free items.
+ * If ptr points to the unrhdr it represents a run of allocated items.
+ * Otherwise it points to a bitstring of allocated items.
+ *
+ * For runs the len field is the length of the run.
+ * For bitmaps the len field represents the number of allocated items.
+ *
+ * The bitmap is the same size as struct unr to optimize memory management.
+ */
+struct unr {
+ TAILQ_ENTRY(unr) list;
+ u_int len;
+ void *ptr;
+};
+
+struct unrb {
+ u_char busy;
+ bitstr_t map[sizeof(struct unr) - 1];
+};
+
+CTASSERT(sizeof(struct unr) == sizeof(struct unrb));
+
+/* Number of bits in the bitmap */
+#define NBITS ((int)sizeof(((struct unrb *)NULL)->map) * 8)
+
+#if defined(DIAGNOSTIC) || !defined(_KERNEL)
+/*
+ * Consistency check function.
+ *
+ * Checks the internal consistency as well as we can.
+ *
+ * Called at all boundaries of this API.
+ */
+static void
+check_unrhdr(struct unrhdr *uh, int line)
+{
+ struct unr *up;
+ struct unrb *ub;
+ u_int x, y, z, w;
+
+ y = uh->first;
+ z = 0;
+ TAILQ_FOREACH(up, &uh->head, list) {
+ z++;
+ if (up->ptr != uh && up->ptr != NULL) {
+ ub = up->ptr;
+ KASSERT (up->len <= NBITS,
+ ("UNR inconsistency: len %u max %d (line %d)\n",
+ up->len, NBITS, line));
+ z++;
+ w = 0;
+ for (x = 0; x < up->len; x++)
+ if (bit_test(ub->map, x))
+ w++;
+ KASSERT (w == ub->busy,
+ ("UNR inconsistency: busy %u found %u (line %d)\n",
+ ub->busy, w, line));
+ y += w;
+ } else if (up->ptr != NULL)
+ y += up->len;
+ }
+ KASSERT (y == uh->busy,
+ ("UNR inconsistency: items %u found %u (line %d)\n",
+ uh->busy, y, line));
+ KASSERT (z == uh->alloc,
+ ("UNR inconsistency: chunks %u found %u (line %d)\n",
+ uh->alloc, z, line));
+}
+
+#else
+
+static __inline void
+check_unrhdr(struct unrhdr *uh, int line)
+{
+
+}
+
+#endif
+
+
+/*
+ * Userland memory management. Just use calloc and keep track of how
+ * many elements we have allocated for check_unrhdr().
+ */
+
+static __inline void *
+new_unr(struct unrhdr *uh, void **p1, void **p2)
+{
+ void *p;
+
+ uh->alloc++;
+ KASSERT(*p1 != NULL || *p2 != NULL, ("Out of cached memory"));
+ if (*p1 != NULL) {
+ p = *p1;
+ *p1 = NULL;
+ return (p);
+ } else {
+ p = *p2;
+ *p2 = NULL;
+ return (p);
+ }
+}
+
+static __inline void
+delete_unr(struct unrhdr *uh, void *ptr)
+{
+ struct unr *up;
+
+ uh->alloc--;
+ up = ptr;
+ TAILQ_INSERT_TAIL(&uh->ppfree, up, list);
+}
+
+void
+clean_unrhdrl(struct unrhdr *uh)
+{
+ struct unr *up;
+
+ mtx_assert(uh->mtx, MA_OWNED);
+ while ((up = TAILQ_FIRST(&uh->ppfree)) != NULL) {
+ TAILQ_REMOVE(&uh->ppfree, up, list);
+ mtx_unlock(uh->mtx);
+ Free(up);
+ mtx_lock(uh->mtx);
+ }
+
+}
+
+void
+clean_unrhdr(struct unrhdr *uh)
+{
+
+ mtx_lock(uh->mtx);
+ clean_unrhdrl(uh);
+ mtx_unlock(uh->mtx);
+}
+
+void
+init_unrhdr(struct unrhdr *uh, int low, int high, struct mtx *mutex)
+{
+
+ KASSERT(low >= 0 && low <= high,
+ ("UNR: use error: new_unrhdr(%d, %d)", low, high));
+ if (mutex != NULL)
+ uh->mtx = mutex;
+ else
+ uh->mtx = &unitmtx;
+ TAILQ_INIT(&uh->head);
+ TAILQ_INIT(&uh->ppfree);
+ uh->low = low;
+ uh->high = high;
+ uh->first = 0;
+ uh->last = 1 + (high - low);
+ check_unrhdr(uh, __LINE__);
+}
+
+/*
+ * Allocate a new unrheader set.
+ *
+ * Highest and lowest valid values given as parameters.
+ */
+
+struct unrhdr *
+new_unrhdr(int low, int high, struct mtx *mutex)
+{
+ struct unrhdr *uh;
+
+ uh = Malloc(sizeof *uh);
+ init_unrhdr(uh, low, high, mutex);
+ return (uh);
+}
+
+void
+delete_unrhdr(struct unrhdr *uh)
+{
+
+ check_unrhdr(uh, __LINE__);
+ KASSERT(uh->busy == 0, ("unrhdr has %u allocations", uh->busy));
+ KASSERT(uh->alloc == 0, ("UNR memory leak in delete_unrhdr"));
+ KASSERT(TAILQ_FIRST(&uh->ppfree) == NULL,
+ ("unrhdr has postponed item for free"));
+ Free(uh);
+}
+
+static __inline int
+is_bitmap(struct unrhdr *uh, struct unr *up)
+{
+ return (up->ptr != uh && up->ptr != NULL);
+}
+
+/*
+ * Look for a sequence of items which can be combined into a bitmap; if
+ * multiple are present, take the one which saves the most memory.
+ *
+ * Return (1) if a sequence was found to indicate that another call
+ * might be able to do more. Return (0) if we found no suitable sequence.
+ *
+ * NB: called from alloc_unr(), no new memory allocation allowed.
+ */
+static int
+optimize_unr(struct unrhdr *uh)
+{
+ struct unr *up, *uf, *us;
+ struct unrb *ub, *ubf;
+ u_int a, l, ba;
+
+ /*
+ * Look for the run of items (if any) which, when collapsed into
+ * a bitmap, would save the most memory.
+ */
+ us = NULL;
+ ba = 0;
+ TAILQ_FOREACH(uf, &uh->head, list) {
+ if (uf->len >= NBITS)
+ continue;
+ a = 1;
+ if (is_bitmap(uh, uf))
+ a++;
+ l = uf->len;
+ up = uf;
+ while (1) {
+ up = TAILQ_NEXT(up, list);
+ if (up == NULL)
+ break;
+ if ((up->len + l) > NBITS)
+ break;
+ a++;
+ if (is_bitmap(uh, up))
+ a++;
+ l += up->len;
+ }
+ if (a > ba) {
+ ba = a;
+ us = uf;
+ }
+ }
+ if (ba < 3)
+ return (0);
+
+ /*
+ * If the first element is not a bitmap, make it one.
+ * Trying to do so without allocating more memory complicates things
+ * a bit.
+ */
+ if (!is_bitmap(uh, us)) {
+ uf = TAILQ_NEXT(us, list);
+ TAILQ_REMOVE(&uh->head, us, list);
+ a = us->len;
+ l = us->ptr == uh ? 1 : 0;
+ ub = (void *)us;
+ ub->busy = 0;
+ if (l) {
+ bit_nset(ub->map, 0, a);
+ ub->busy += a;
+ } else {
+ bit_nclear(ub->map, 0, a);
+ }
+ if (!is_bitmap(uh, uf)) {
+ if (uf->ptr == NULL) {
+ bit_nclear(ub->map, a, a + uf->len - 1);
+ } else {
+ bit_nset(ub->map, a, a + uf->len - 1);
+ ub->busy += uf->len;
+ }
+ uf->ptr = ub;
+ uf->len += a;
+ us = uf;
+ } else {
+ ubf = uf->ptr;
+ for (l = 0; l < uf->len; l++, a++) {
+ if (bit_test(ubf->map, l)) {
+ bit_set(ub->map, a);
+ ub->busy++;
+ } else {
+ bit_clear(ub->map, a);
+ }
+ }
+ uf->len = a;
+ delete_unr(uh, uf->ptr);
+ uf->ptr = ub;
+ us = uf;
+ }
+ }
+ ub = us->ptr;
+ while (1) {
+ uf = TAILQ_NEXT(us, list);
+ if (uf == NULL)
+ return (1);
+ if (uf->len + us->len > NBITS)
+ return (1);
+ if (uf->ptr == NULL) {
+ bit_nclear(ub->map, us->len, us->len + uf->len - 1);
+ us->len += uf->len;
+ TAILQ_REMOVE(&uh->head, uf, list);
+ delete_unr(uh, uf);
+ } else if (uf->ptr == uh) {
+ bit_nset(ub->map, us->len, us->len + uf->len - 1);
+ ub->busy += uf->len;
+ us->len += uf->len;
+ TAILQ_REMOVE(&uh->head, uf, list);
+ delete_unr(uh, uf);
+ } else {
+ ubf = uf->ptr;
+ for (l = 0; l < uf->len; l++, us->len++) {
+ if (bit_test(ubf->map, l)) {
+ bit_set(ub->map, us->len);
+ ub->busy++;
+ } else {
+ bit_clear(ub->map, us->len);
+ }
+ }
+ TAILQ_REMOVE(&uh->head, uf, list);
+ delete_unr(uh, ubf);
+ delete_unr(uh, uf);
+ }
+ }
+}
+
+/*
+ * See if a given unr should be collapsed with a neighbor.
+ *
+ * NB: called from alloc_unr(), no new memory allocation allowed.
+ */
+static void
+collapse_unr(struct unrhdr *uh, struct unr *up)
+{
+ struct unr *upp;
+ struct unrb *ub;
+
+ /* If bitmap is all set or clear, change it to runlength */
+ if (is_bitmap(uh, up)) {
+ ub = up->ptr;
+ if (ub->busy == up->len) {
+ delete_unr(uh, up->ptr);
+ up->ptr = uh;
+ } else if (ub->busy == 0) {
+ delete_unr(uh, up->ptr);
+ up->ptr = NULL;
+ }
+ }
+
+ /* If nothing left in runlength, delete it */
+ if (up->len == 0) {
+ upp = TAILQ_PREV(up, unrhd, list);
+ if (upp == NULL)
+ upp = TAILQ_NEXT(up, list);
+ TAILQ_REMOVE(&uh->head, up, list);
+ delete_unr(uh, up);
+ up = upp;
+ }
+
+ /* If we still have a "hot-spot", merge with a neighbor if possible */
+ if (up != NULL) {
+ upp = TAILQ_PREV(up, unrhd, list);
+ if (upp != NULL && up->ptr == upp->ptr) {
+ up->len += upp->len;
+ TAILQ_REMOVE(&uh->head, upp, list);
+ delete_unr(uh, upp);
+ }
+ upp = TAILQ_NEXT(up, list);
+ if (upp != NULL && up->ptr == upp->ptr) {
+ up->len += upp->len;
+ TAILQ_REMOVE(&uh->head, upp, list);
+ delete_unr(uh, upp);
+ }
+ }
+
+ /* Merge into ->first if possible */
+ upp = TAILQ_FIRST(&uh->head);
+ if (upp != NULL && upp->ptr == uh) {
+ uh->first += upp->len;
+ TAILQ_REMOVE(&uh->head, upp, list);
+ delete_unr(uh, upp);
+ if (up == upp)
+ up = NULL;
+ }
+
+ /* Merge into ->last if possible */
+ upp = TAILQ_LAST(&uh->head, unrhd);
+ if (upp != NULL && upp->ptr == NULL) {
+ uh->last += upp->len;
+ TAILQ_REMOVE(&uh->head, upp, list);
+ delete_unr(uh, upp);
+ if (up == upp)
+ up = NULL;
+ }
+
+ /* Try to make bitmaps */
+ while (optimize_unr(uh))
+ continue;
+}
+
+/*
+ * Allocate a free unr.
+ */
+int
+alloc_unrl(struct unrhdr *uh)
+{
+ struct unr *up;
+ struct unrb *ub;
+ u_int x;
+ int y;
+
+ mtx_assert(uh->mtx, MA_OWNED);
+ check_unrhdr(uh, __LINE__);
+ x = uh->low + uh->first;
+
+ up = TAILQ_FIRST(&uh->head);
+
+ /*
+ * If we have an ideal split, just adjust the first+last
+ */
+ if (up == NULL && uh->last > 0) {
+ uh->first++;
+ uh->last--;
+ uh->busy++;
+ return (x);
+ }
+
+ /*
+ * We can always allocate from the first list element, so if we have
+ * nothing on the list, we must have run out of unit numbers.
+ */
+ if (up == NULL)
+ return (-1);
+
+ KASSERT(up->ptr != uh, ("UNR first element is allocated"));
+
+ if (up->ptr == NULL) { /* free run */
+ uh->first++;
+ up->len--;
+ } else { /* bitmap */
+ ub = up->ptr;
+ KASSERT(ub->busy < up->len, ("UNR bitmap confusion"));
+ bit_ffc(ub->map, up->len, &y);
+ KASSERT(y != -1, ("UNR corruption: No clear bit in bitmap."));
+ bit_set(ub->map, y);
+ ub->busy++;
+ x += y;
+ }
+ uh->busy++;
+ collapse_unr(uh, up);
+ return (x);
+}
+
+int
+alloc_unr(struct unrhdr *uh)
+{
+ int i;
+
+ mtx_lock(uh->mtx);
+ i = alloc_unrl(uh);
+ clean_unrhdrl(uh);
+ mtx_unlock(uh->mtx);
+ return (i);
+}
+
+static int
+alloc_unr_specificl(struct unrhdr *uh, u_int item, void **p1, void **p2)
+{
+ struct unr *up, *upn;
+ struct unrb *ub;
+ u_int i, last, tl;
+
+ mtx_assert(uh->mtx, MA_OWNED);
+
+ if (item < uh->low + uh->first || item > uh->high)
+ return (-1);
+
+ up = TAILQ_FIRST(&uh->head);
+ /* Ideal split. */
+ if (up == NULL && item - uh->low == uh->first) {
+ uh->first++;
+ uh->last--;
+ uh->busy++;
+ check_unrhdr(uh, __LINE__);
+ return (item);
+ }
+
+ i = item - uh->low - uh->first;
+
+ if (up == NULL) {
+ up = new_unr(uh, p1, p2);
+ up->ptr = NULL;
+ up->len = i;
+ TAILQ_INSERT_TAIL(&uh->head, up, list);
+ up = new_unr(uh, p1, p2);
+ up->ptr = uh;
+ up->len = 1;
+ TAILQ_INSERT_TAIL(&uh->head, up, list);
+ uh->last = uh->high - uh->low - i;
+ uh->busy++;
+ check_unrhdr(uh, __LINE__);
+ return (item);
+ } else {
+ /* Find the item which contains the unit we want to allocate. */
+ TAILQ_FOREACH(up, &uh->head, list) {
+ if (up->len > i)
+ break;
+ i -= up->len;
+ }
+ }
+
+ if (up == NULL) {
+ if (i > 0) {
+ up = new_unr(uh, p1, p2);
+ up->ptr = NULL;
+ up->len = i;
+ TAILQ_INSERT_TAIL(&uh->head, up, list);
+ }
+ up = new_unr(uh, p1, p2);
+ up->ptr = uh;
+ up->len = 1;
+ TAILQ_INSERT_TAIL(&uh->head, up, list);
+ goto done;
+ }
+
+ if (is_bitmap(uh, up)) {
+ ub = up->ptr;
+ if (bit_test(ub->map, i) == 0) {
+ bit_set(ub->map, i);
+ ub->busy++;
+ goto done;
+ } else
+ return (-1);
+ } else if (up->ptr == uh)
+ return (-1);
+
+ KASSERT(up->ptr == NULL,
+ ("alloc_unr_specificl: up->ptr != NULL (up=%p)", up));
+
+ /* Split off the tail end, if any. */
+ tl = up->len - (1 + i);
+ if (tl > 0) {
+ upn = new_unr(uh, p1, p2);
+ upn->ptr = NULL;
+ upn->len = tl;
+ TAILQ_INSERT_AFTER(&uh->head, up, upn, list);
+ }
+
+ /* Split off head end, if any */
+ if (i > 0) {
+ upn = new_unr(uh, p1, p2);
+ upn->len = i;
+ upn->ptr = NULL;
+ TAILQ_INSERT_BEFORE(up, upn, list);
+ }
+ up->len = 1;
+ up->ptr = uh;
+
+done:
+ last = uh->high - uh->low - (item - uh->low);
+ if (uh->last > last)
+ uh->last = last;
+ uh->busy++;
+ collapse_unr(uh, up);
+ check_unrhdr(uh, __LINE__);
+ return (item);
+}
+
+int
+alloc_unr_specific(struct unrhdr *uh, u_int item)
+{
+ void *p1, *p2;
+ int i;
+
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "alloc_unr_specific");
+
+ p1 = Malloc(sizeof(struct unr));
+ p2 = Malloc(sizeof(struct unr));
+
+ mtx_lock(uh->mtx);
+ i = alloc_unr_specificl(uh, item, &p1, &p2);
+ mtx_unlock(uh->mtx);
+
+ if (p1 != NULL)
+ Free(p1);
+ if (p2 != NULL)
+ Free(p2);
+
+ return (i);
+}
+
+/*
+ * Free a unr.
+ *
+ * If we can save unrs by using a bitmap, do so.
+ */
+static void
+free_unrl(struct unrhdr *uh, u_int item, void **p1, void **p2)
+{
+ struct unr *up, *upp, *upn;
+ struct unrb *ub;
+ u_int pl;
+
+ KASSERT(item >= uh->low && item <= uh->high,
+ ("UNR: free_unr(%u) out of range [%u...%u]",
+ item, uh->low, uh->high));
+ check_unrhdr(uh, __LINE__);
+ item -= uh->low;
+ upp = TAILQ_FIRST(&uh->head);
+ /*
+ * Freeing in the ideal split case
+ */
+ if (item + 1 == uh->first && upp == NULL) {
+ uh->last++;
+ uh->first--;
+ uh->busy--;
+ check_unrhdr(uh, __LINE__);
+ return;
+ }
+ /*
+ * Freeing in the ->first section. Create a run starting at the
+ * freed item. The code below will subdivide it.
+ */
+ if (item < uh->first) {
+ up = new_unr(uh, p1, p2);
+ up->ptr = uh;
+ up->len = uh->first - item;
+ TAILQ_INSERT_HEAD(&uh->head, up, list);
+ uh->first -= up->len;
+ }
+
+ item -= uh->first;
+
+ /* Find the item which contains the unit we want to free */
+ TAILQ_FOREACH(up, &uh->head, list) {
+ if (up->len > item)
+ break;
+ item -= up->len;
+ }
+
+ /* Handle bitmap items */
+ if (is_bitmap(uh, up)) {
+ ub = up->ptr;
+
+ KASSERT(bit_test(ub->map, item) != 0,
+ ("UNR: Freeing free item %d (bitmap)\n", item));
+ bit_clear(ub->map, item);
+ uh->busy--;
+ ub->busy--;
+ collapse_unr(uh, up);
+ return;
+ }
+
+ KASSERT(up->ptr == uh, ("UNR Freeing free item %d (run))\n", item));
+
+ /* Just this one left, reap it */
+ if (up->len == 1) {
+ up->ptr = NULL;
+ uh->busy--;
+ collapse_unr(uh, up);
+ return;
+ }
+
+ /* Check if we can shift the item into the previous 'free' run */
+ upp = TAILQ_PREV(up, unrhd, list);
+ if (item == 0 && upp != NULL && upp->ptr == NULL) {
+ upp->len++;
+ up->len--;
+ uh->busy--;
+ collapse_unr(uh, up);
+ return;
+ }
+
+ /* Check if we can shift the item to the next 'free' run */
+ upn = TAILQ_NEXT(up, list);
+ if (item == up->len - 1 && upn != NULL && upn->ptr == NULL) {
+ upn->len++;
+ up->len--;
+ uh->busy--;
+ collapse_unr(uh, up);
+ return;
+ }
+
+ /* Split off the tail end, if any. */
+ pl = up->len - (1 + item);
+ if (pl > 0) {
+ upp = new_unr(uh, p1, p2);
+ upp->ptr = uh;
+ upp->len = pl;
+ TAILQ_INSERT_AFTER(&uh->head, up, upp, list);
+ }
+
+ /* Split off head end, if any */
+ if (item > 0) {
+ upp = new_unr(uh, p1, p2);
+ upp->len = item;
+ upp->ptr = uh;
+ TAILQ_INSERT_BEFORE(up, upp, list);
+ }
+ up->len = 1;
+ up->ptr = NULL;
+ uh->busy--;
+ collapse_unr(uh, up);
+}
+
+void
+free_unr(struct unrhdr *uh, u_int item)
+{
+ void *p1, *p2;
+
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "free_unr");
+ p1 = Malloc(sizeof(struct unr));
+ p2 = Malloc(sizeof(struct unr));
+ mtx_lock(uh->mtx);
+ free_unrl(uh, item, &p1, &p2);
+ clean_unrhdrl(uh);
+ mtx_unlock(uh->mtx);
+ if (p1 != NULL)
+ Free(p1);
+ if (p2 != NULL)
+ Free(p2);
+}
+
+#ifndef _KERNEL /* USERLAND test driver */
+
+/*
+ * Simple stochastic test driver for the above functions
+ */
+
+static void
+print_unr(struct unrhdr *uh, struct unr *up)
+{
+ u_int x;
+ struct unrb *ub;
+
+ printf(" %p len = %5u ", up, up->len);
+ if (up->ptr == NULL)
+ printf("free\n");
+ else if (up->ptr == uh)
+ printf("alloc\n");
+ else {
+ ub = up->ptr;
+ printf("bitmap(%d) [", ub->busy);
+ for (x = 0; x < up->len; x++) {
+ if (bit_test(ub->map, x))
+ printf("#");
+ else
+ printf(" ");
+ }
+ printf("]\n");
+ }
+}
+
+static void
+print_unrhdr(struct unrhdr *uh)
+{
+ struct unr *up;
+ u_int x;
+
+ printf(
+ "%p low = %u high = %u first = %u last = %u busy %u chunks = %u\n",
+ uh, uh->low, uh->high, uh->first, uh->last, uh->busy, uh->alloc);
+ x = uh->low + uh->first;
+ TAILQ_FOREACH(up, &uh->head, list) {
+ printf(" from = %5u", x);
+ print_unr(uh, up);
+ if (up->ptr == NULL || up->ptr == uh)
+ x += up->len;
+ else
+ x += NBITS;
+ }
+}
+
+static void
+test_alloc_unr(struct unrhdr *uh, u_int i, char a[])
+{
+ int j;
+
+ if (a[i]) {
+ printf("F %u\n", i);
+ free_unr(uh, i);
+ a[i] = 0;
+ } else {
+ no_alloc = 1;
+ j = alloc_unr(uh);
+ if (j != -1) {
+ a[j] = 1;
+ printf("A %d\n", j);
+ }
+ no_alloc = 0;
+ }
+}
+
+static void
+test_alloc_unr_specific(struct unrhdr *uh, u_int i, char a[])
+{
+ int j;
+
+ j = alloc_unr_specific(uh, i);
+ if (j == -1) {
+ printf("F %u\n", i);
+ a[i] = 0;
+ free_unr(uh, i);
+ } else {
+ a[i] = 1;
+ printf("A %d\n", j);
+ }
+}
+
+/* Number of unrs to test */
+#define NN 10000
+
+int
+main(int argc __unused, const char **argv __unused)
+{
+ struct unrhdr *uh;
+ u_int i, x, m, j;
+ char a[NN];
+
+ setbuf(stdout, NULL);
+ uh = new_unrhdr(0, NN - 1, NULL);
+ print_unrhdr(uh);
+
+ memset(a, 0, sizeof a);
+ srandomdev();
+
+ fprintf(stderr, "sizeof(struct unr) %zu\n", sizeof(struct unr));
+ fprintf(stderr, "sizeof(struct unrb) %zu\n", sizeof(struct unrb));
+ fprintf(stderr, "sizeof(struct unrhdr) %zu\n", sizeof(struct unrhdr));
+ fprintf(stderr, "NBITS %d\n", NBITS);
+ x = 1;
+ for (m = 0; m < NN * 100; m++) {
+ j = random();
+ i = (j >> 1) % NN;
+#if 0
+ if (a[i] && (j & 1))
+ continue;
+#endif
+ if ((random() & 1) != 0)
+ test_alloc_unr(uh, i, a);
+ else
+ test_alloc_unr_specific(uh, i, a);
+
+ if (1) /* XXX: change this for detailed debug printout */
+ print_unrhdr(uh);
+ check_unrhdr(uh, __LINE__);
+ }
+ for (i = 0; i < NN; i++) {
+ if (a[i]) {
+ printf("C %u\n", i);
+ free_unr(uh, i);
+ print_unrhdr(uh);
+ }
+ }
+ print_unrhdr(uh);
+ delete_unrhdr(uh);
+ return (0);
+}
+#endif
diff --git a/sys/kern/subr_vmem.c b/sys/kern/subr_vmem.c
new file mode 100644
index 0000000..f3f3eec
--- /dev/null
+++ b/sys/kern/subr_vmem.c
@@ -0,0 +1,1487 @@
+/*-
+ * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi,
+ * Copyright (c) 2013 EMC Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * From:
+ * $NetBSD: vmem_impl.h,v 1.2 2013/01/29 21:26:24 para Exp $
+ * $NetBSD: subr_vmem.c,v 1.83 2013/03/06 11:20:10 yamt Exp $
+ */
+
+/*
+ * reference:
+ * - Magazines and Vmem: Extending the Slab Allocator
+ * to Many CPUs and Arbitrary Resources
+ * http://www.usenix.org/event/usenix01/bonwick.html
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/callout.h>
+#include <sys/hash.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/smp.h>
+#include <sys/condvar.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/vmem.h>
+
+#include "opt_vm.h"
+
+#include <vm/uma.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_param.h>
+#include <vm/vm_pageout.h>
+
+#define VMEM_MAXORDER (sizeof(vmem_size_t) * NBBY)
+
+#define VMEM_HASHSIZE_MIN 16
+#define VMEM_HASHSIZE_MAX 131072
+
+#define VMEM_QCACHE_IDX_MAX 16
+
+#define VMEM_FITMASK (M_BESTFIT | M_FIRSTFIT)
+
+#define VMEM_FLAGS \
+ (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM | M_BESTFIT | M_FIRSTFIT)
+
+#define BT_FLAGS (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM)
+
+#define QC_NAME_MAX 16
+
+/*
+ * Data structures private to vmem.
+ */
+MALLOC_DEFINE(M_VMEM, "vmem", "vmem internal structures");
+
+typedef struct vmem_btag bt_t;
+
+TAILQ_HEAD(vmem_seglist, vmem_btag);
+LIST_HEAD(vmem_freelist, vmem_btag);
+LIST_HEAD(vmem_hashlist, vmem_btag);
+
+struct qcache {
+ uma_zone_t qc_cache;
+ vmem_t *qc_vmem;
+ vmem_size_t qc_size;
+ char qc_name[QC_NAME_MAX];
+};
+typedef struct qcache qcache_t;
+#define QC_POOL_TO_QCACHE(pool) ((qcache_t *)(pool->pr_qcache))
+
+#define VMEM_NAME_MAX 16
+
+/* vmem arena */
+struct vmem {
+ struct mtx_padalign vm_lock;
+ struct cv vm_cv;
+ char vm_name[VMEM_NAME_MAX+1];
+ LIST_ENTRY(vmem) vm_alllist;
+ struct vmem_hashlist vm_hash0[VMEM_HASHSIZE_MIN];
+ struct vmem_freelist vm_freelist[VMEM_MAXORDER];
+ struct vmem_seglist vm_seglist;
+ struct vmem_hashlist *vm_hashlist;
+ vmem_size_t vm_hashsize;
+
+ /* Constant after init */
+ vmem_size_t vm_qcache_max;
+ vmem_size_t vm_quantum_mask;
+ vmem_size_t vm_import_quantum;
+ int vm_quantum_shift;
+
+ /* Written on alloc/free */
+ LIST_HEAD(, vmem_btag) vm_freetags;
+ int vm_nfreetags;
+ int vm_nbusytag;
+ vmem_size_t vm_inuse;
+ vmem_size_t vm_size;
+
+ /* Used on import. */
+ vmem_import_t *vm_importfn;
+ vmem_release_t *vm_releasefn;
+ void *vm_arg;
+
+ /* Space exhaustion callback. */
+ vmem_reclaim_t *vm_reclaimfn;
+
+ /* quantum cache */
+ qcache_t vm_qcache[VMEM_QCACHE_IDX_MAX];
+};
+
+/* boundary tag */
+struct vmem_btag {
+ TAILQ_ENTRY(vmem_btag) bt_seglist;
+ union {
+ LIST_ENTRY(vmem_btag) u_freelist; /* BT_TYPE_FREE */
+ LIST_ENTRY(vmem_btag) u_hashlist; /* BT_TYPE_BUSY */
+ } bt_u;
+#define bt_hashlist bt_u.u_hashlist
+#define bt_freelist bt_u.u_freelist
+ vmem_addr_t bt_start;
+ vmem_size_t bt_size;
+ int bt_type;
+};
+
+#define BT_TYPE_SPAN 1 /* Allocated from importfn */
+#define BT_TYPE_SPAN_STATIC 2 /* vmem_add() or create. */
+#define BT_TYPE_FREE 3 /* Available space. */
+#define BT_TYPE_BUSY 4 /* Used space. */
+#define BT_ISSPAN_P(bt) ((bt)->bt_type <= BT_TYPE_SPAN_STATIC)
+
+#define BT_END(bt) ((bt)->bt_start + (bt)->bt_size - 1)
+
+#if defined(DIAGNOSTIC)
+static int enable_vmem_check = 1;
+SYSCTL_INT(_debug, OID_AUTO, vmem_check, CTLFLAG_RW,
+ &enable_vmem_check, 0, "Enable vmem check");
+static void vmem_check(vmem_t *);
+#endif
+
+static struct callout vmem_periodic_ch;
+static int vmem_periodic_interval;
+static struct task vmem_periodic_wk;
+
+static struct mtx_padalign vmem_list_lock;
+static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list);
+
+/* ---- misc */
+#define VMEM_CONDVAR_INIT(vm, wchan) cv_init(&vm->vm_cv, wchan)
+#define VMEM_CONDVAR_DESTROY(vm) cv_destroy(&vm->vm_cv)
+#define VMEM_CONDVAR_WAIT(vm) cv_wait(&vm->vm_cv, &vm->vm_lock)
+#define VMEM_CONDVAR_BROADCAST(vm) cv_broadcast(&vm->vm_cv)
+
+
+#define VMEM_LOCK(vm) mtx_lock(&vm->vm_lock)
+#define VMEM_TRYLOCK(vm) mtx_trylock(&vm->vm_lock)
+#define VMEM_UNLOCK(vm) mtx_unlock(&vm->vm_lock)
+#define VMEM_LOCK_INIT(vm, name) mtx_init(&vm->vm_lock, (name), NULL, MTX_DEF)
+#define VMEM_LOCK_DESTROY(vm) mtx_destroy(&vm->vm_lock)
+#define VMEM_ASSERT_LOCKED(vm) mtx_assert(&vm->vm_lock, MA_OWNED);
+
+#define VMEM_ALIGNUP(addr, align) (-(-(addr) & -(align)))
+
+#define VMEM_CROSS_P(addr1, addr2, boundary) \
+ ((((addr1) ^ (addr2)) & -(boundary)) != 0)
+
+#define ORDER2SIZE(order) ((vmem_size_t)1 << (order))
+#define SIZE2ORDER(size) ((int)flsl(size) - 1)
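
A brief aside on the VMEM_ALIGNUP() trick above: with unsigned arithmetic and a
power-of-two align, negating, masking and negating again rounds addr up to the next
align boundary while leaving already-aligned values untouched. The values below are
purely illustrative.

    /*
     * Illustrative values (unsigned arithmetic, power-of-two align):
     *   VMEM_ALIGNUP(0x1234, 0x1000):
     *     -(0x1234)   == 0x...ffffedcc
     *     & -(0x1000) == 0x...ffffe000   (low 12 bits cleared)
     *     negated     ==       0x2000    (rounded up to the boundary)
     *   VMEM_ALIGNUP(0x2000, 0x1000) == 0x2000 (already aligned, unchanged)
     */
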
+
+/*
+ * Maximum number of boundary tags that may be required to satisfy an
+ * allocation. Two may be required to import. Another two may be
+ * required to clip edges.
+ */
+#define BT_MAXALLOC 4
+
+/*
+ * Max free limits the number of locally cached boundary tags. We
+ * just want to avoid hitting the zone allocator for every call.
+ */
+#define BT_MAXFREE (BT_MAXALLOC * 8)
+
+/* Allocator for boundary tags. */
+static uma_zone_t vmem_bt_zone;
+
+/* boot time arena storage. */
+static struct vmem kernel_arena_storage;
+static struct vmem kmem_arena_storage;
+static struct vmem buffer_arena_storage;
+static struct vmem transient_arena_storage;
+vmem_t *kernel_arena = &kernel_arena_storage;
+vmem_t *kmem_arena = &kmem_arena_storage;
+vmem_t *buffer_arena = &buffer_arena_storage;
+vmem_t *transient_arena = &transient_arena_storage;
+
+#ifdef DEBUG_MEMGUARD
+static struct vmem memguard_arena_storage;
+vmem_t *memguard_arena = &memguard_arena_storage;
+#endif
+
+/*
+ * Fill the vmem's boundary tag cache. We guarantee that boundary tag
+ * allocation will not fail once bt_fill() passes. To do so we cache
+ * at least the maximum possible tag allocations in the arena.
+ */
+static int
+bt_fill(vmem_t *vm, int flags)
+{
+ bt_t *bt;
+
+ VMEM_ASSERT_LOCKED(vm);
+
+ /*
+ * Only allow the kmem arena to dip into reserve tags. It is the
+ * vmem where new tags come from.
+ */
+ flags &= BT_FLAGS;
+ if (vm != kmem_arena)
+ flags &= ~M_USE_RESERVE;
+
+ /*
+ * Loop until we meet the reserve. To minimize the lock shuffle
+ * and prevent simultaneous fills we first try a NOWAIT regardless
+ * of the caller's flags. Specify M_NOVM so we don't recurse while
+ * holding a vmem lock.
+ */
+ while (vm->vm_nfreetags < BT_MAXALLOC) {
+ bt = uma_zalloc(vmem_bt_zone,
+ (flags & M_USE_RESERVE) | M_NOWAIT | M_NOVM);
+ if (bt == NULL) {
+ VMEM_UNLOCK(vm);
+ bt = uma_zalloc(vmem_bt_zone, flags);
+ VMEM_LOCK(vm);
+ if (bt == NULL && (flags & M_NOWAIT) != 0)
+ break;
+ }
+ LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
+ vm->vm_nfreetags++;
+ }
+
+ if (vm->vm_nfreetags < BT_MAXALLOC)
+ return ENOMEM;
+
+ return 0;
+}
+
+/*
+ * Pop a tag off of the freetag stack.
+ */
+static bt_t *
+bt_alloc(vmem_t *vm)
+{
+ bt_t *bt;
+
+ VMEM_ASSERT_LOCKED(vm);
+ bt = LIST_FIRST(&vm->vm_freetags);
+ MPASS(bt != NULL);
+ LIST_REMOVE(bt, bt_freelist);
+ vm->vm_nfreetags--;
+
+ return bt;
+}
+
+/*
+ * Trim the per-vmem free list. Returns with the lock released to
+ * avoid allocator recursions.
+ */
+static void
+bt_freetrim(vmem_t *vm, int freelimit)
+{
+ LIST_HEAD(, vmem_btag) freetags;
+ bt_t *bt;
+
+ LIST_INIT(&freetags);
+ VMEM_ASSERT_LOCKED(vm);
+ while (vm->vm_nfreetags > freelimit) {
+ bt = LIST_FIRST(&vm->vm_freetags);
+ LIST_REMOVE(bt, bt_freelist);
+ vm->vm_nfreetags--;
+ LIST_INSERT_HEAD(&freetags, bt, bt_freelist);
+ }
+ VMEM_UNLOCK(vm);
+ while ((bt = LIST_FIRST(&freetags)) != NULL) {
+ LIST_REMOVE(bt, bt_freelist);
+ uma_zfree(vmem_bt_zone, bt);
+ }
+}
+
+static inline void
+bt_free(vmem_t *vm, bt_t *bt)
+{
+
+ VMEM_ASSERT_LOCKED(vm);
+ MPASS(LIST_FIRST(&vm->vm_freetags) != bt);
+ LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
+ vm->vm_nfreetags++;
+}
+
+/*
+ * freelist[0] ... [1, 1]
+ * freelist[1] ... [2, 3]
+ * freelist[2] ... [4, 7]
+ * freelist[3] ... [8, 15]
+ * :
+ * freelist[n] ... [(1 << n), (1 << (n + 1)) - 1]
+ * :
+ */
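
To make the bucketing above concrete (an illustrative example, assuming a 4 KB
quantum):

    /*
     * Example: a 20 KB free segment has qsize == 5 and
     * SIZE2ORDER(5) == flsl(5) - 1 == 2, so bt_insfree() places it on
     * vm_freelist[2], the [4, 7] bucket above.  For an M_FIRSTFIT request of
     * those same 5 quanta, bt_freehead_toalloc() starts one list higher
     * (index 3) so that any block found there is already big enough, while
     * M_BESTFIT starts its scan at index 2.
     */
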
+
+static struct vmem_freelist *
+bt_freehead_tofree(vmem_t *vm, vmem_size_t size)
+{
+ const vmem_size_t qsize = size >> vm->vm_quantum_shift;
+ const int idx = SIZE2ORDER(qsize);
+
+ MPASS(size != 0 && qsize != 0);
+ MPASS((size & vm->vm_quantum_mask) == 0);
+ MPASS(idx >= 0);
+ MPASS(idx < VMEM_MAXORDER);
+
+ return &vm->vm_freelist[idx];
+}
+
+/*
+ * bt_freehead_toalloc: return the freelist for the given size and allocation
+ * strategy.
+ *
+ * For M_FIRSTFIT, return the list in which any blocks are large enough
+ * for the requested size. Otherwise, return the list which can have blocks
+ * large enough for the requested size.
+ */
+static struct vmem_freelist *
+bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, int strat)
+{
+ const vmem_size_t qsize = size >> vm->vm_quantum_shift;
+ int idx = SIZE2ORDER(qsize);
+
+ MPASS(size != 0 && qsize != 0);
+ MPASS((size & vm->vm_quantum_mask) == 0);
+
+ if (strat == M_FIRSTFIT && ORDER2SIZE(idx) != qsize) {
+ idx++;
+ /* check for a too-large request? */
+ }
+ MPASS(idx >= 0);
+ MPASS(idx < VMEM_MAXORDER);
+
+ return &vm->vm_freelist[idx];
+}
+
+/* ---- boundary tag hash */
+
+static struct vmem_hashlist *
+bt_hashhead(vmem_t *vm, vmem_addr_t addr)
+{
+ struct vmem_hashlist *list;
+ unsigned int hash;
+
+ hash = hash32_buf(&addr, sizeof(addr), 0);
+ list = &vm->vm_hashlist[hash % vm->vm_hashsize];
+
+ return list;
+}
+
+static bt_t *
+bt_lookupbusy(vmem_t *vm, vmem_addr_t addr)
+{
+ struct vmem_hashlist *list;
+ bt_t *bt;
+
+ VMEM_ASSERT_LOCKED(vm);
+ list = bt_hashhead(vm, addr);
+ LIST_FOREACH(bt, list, bt_hashlist) {
+ if (bt->bt_start == addr) {
+ break;
+ }
+ }
+
+ return bt;
+}
+
+static void
+bt_rembusy(vmem_t *vm, bt_t *bt)
+{
+
+ VMEM_ASSERT_LOCKED(vm);
+ MPASS(vm->vm_nbusytag > 0);
+ vm->vm_inuse -= bt->bt_size;
+ vm->vm_nbusytag--;
+ LIST_REMOVE(bt, bt_hashlist);
+}
+
+static void
+bt_insbusy(vmem_t *vm, bt_t *bt)
+{
+ struct vmem_hashlist *list;
+
+ VMEM_ASSERT_LOCKED(vm);
+ MPASS(bt->bt_type == BT_TYPE_BUSY);
+
+ list = bt_hashhead(vm, bt->bt_start);
+ LIST_INSERT_HEAD(list, bt, bt_hashlist);
+ vm->vm_nbusytag++;
+ vm->vm_inuse += bt->bt_size;
+}
+
+/* ---- boundary tag list */
+
+static void
+bt_remseg(vmem_t *vm, bt_t *bt)
+{
+
+ TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist);
+ bt_free(vm, bt);
+}
+
+static void
+bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev)
+{
+
+ TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist);
+}
+
+static void
+bt_insseg_tail(vmem_t *vm, bt_t *bt)
+{
+
+ TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist);
+}
+
+static void
+bt_remfree(vmem_t *vm, bt_t *bt)
+{
+
+ MPASS(bt->bt_type == BT_TYPE_FREE);
+
+ LIST_REMOVE(bt, bt_freelist);
+}
+
+static void
+bt_insfree(vmem_t *vm, bt_t *bt)
+{
+ struct vmem_freelist *list;
+
+ list = bt_freehead_tofree(vm, bt->bt_size);
+ LIST_INSERT_HEAD(list, bt, bt_freelist);
+}
+
+/* ---- vmem internal functions */
+
+/*
+ * Import from the arena into the quantum cache in UMA.
+ */
+static int
+qc_import(void *arg, void **store, int cnt, int flags)
+{
+ qcache_t *qc;
+ vmem_addr_t addr;
+ int i;
+
+ qc = arg;
+ flags |= M_BESTFIT;
+ for (i = 0; i < cnt; i++) {
+ if (vmem_xalloc(qc->qc_vmem, qc->qc_size, 0, 0, 0,
+ VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, &addr) != 0)
+ break;
+ store[i] = (void *)addr;
+ /* Only guarantee one allocation. */
+ flags &= ~M_WAITOK;
+ flags |= M_NOWAIT;
+ }
+ return i;
+}
+
+/*
+ * Release memory from the UMA cache to the arena.
+ */
+static void
+qc_release(void *arg, void **store, int cnt)
+{
+ qcache_t *qc;
+ int i;
+
+ qc = arg;
+ for (i = 0; i < cnt; i++)
+ vmem_xfree(qc->qc_vmem, (vmem_addr_t)store[i], qc->qc_size);
+}
+
+static void
+qc_init(vmem_t *vm, vmem_size_t qcache_max)
+{
+ qcache_t *qc;
+ vmem_size_t size;
+ int qcache_idx_max;
+ int i;
+
+ MPASS((qcache_max & vm->vm_quantum_mask) == 0);
+ qcache_idx_max = MIN(qcache_max >> vm->vm_quantum_shift,
+ VMEM_QCACHE_IDX_MAX);
+ vm->vm_qcache_max = qcache_idx_max << vm->vm_quantum_shift;
+ for (i = 0; i < qcache_idx_max; i++) {
+ qc = &vm->vm_qcache[i];
+ size = (i + 1) << vm->vm_quantum_shift;
+ snprintf(qc->qc_name, sizeof(qc->qc_name), "%s-%zu",
+ vm->vm_name, size);
+ qc->qc_vmem = vm;
+ qc->qc_size = size;
+ qc->qc_cache = uma_zcache_create(qc->qc_name, size,
+ NULL, NULL, NULL, NULL, qc_import, qc_release, qc,
+ UMA_ZONE_VM);
+ MPASS(qc->qc_cache);
+ }
+}
+
+static void
+qc_destroy(vmem_t *vm)
+{
+ int qcache_idx_max;
+ int i;
+
+ qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
+ for (i = 0; i < qcache_idx_max; i++)
+ uma_zdestroy(vm->vm_qcache[i].qc_cache);
+}
+
+static void
+qc_drain(vmem_t *vm)
+{
+ int qcache_idx_max;
+ int i;
+
+ qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
+ for (i = 0; i < qcache_idx_max; i++)
+ zone_drain(vm->vm_qcache[i].qc_cache);
+}
+
+#ifndef UMA_MD_SMALL_ALLOC
+
+static struct mtx_padalign vmem_bt_lock;
+
+/*
+ * vmem_bt_alloc: Allocate a new page of boundary tags.
+ *
+ * On architectures with uma_small_alloc there is no recursion; no address
+ * space need be allocated to allocate boundary tags. For the others, we
+ * must handle recursion. Boundary tags are necessary to allocate new
+ * boundary tags.
+ *
+ * UMA guarantees that enough tags are held in reserve to allocate a new
+ * page of kva. We dip into this reserve by specifying M_USE_RESERVE only
+ * when allocating the page to hold new boundary tags. In this way the
+ * reserve is automatically filled by the allocation that uses the reserve.
+ *
+ * We still have to guarantee that the new tags are allocated atomically since
+ * many threads may try concurrently. The bt_lock provides this guarantee.
+ * We convert WAITOK allocations to NOWAIT and then handle the blocking here
+ * on failure. It's ok to return NULL for a WAITOK allocation as UMA will
+ * loop again after checking to see if we lost the race to allocate.
+ *
+ * There is a small race between vmem_bt_alloc() returning the page and the
+ * zone lock being acquired to add the page to the zone. For WAITOK
+ * allocations we just pause briefly. NOWAIT may experience a transient
+ * failure. To alleviate this we permit a small number of simultaneous
+ * fills to proceed concurrently so NOWAIT is less likely to fail unless
+ * we are really out of KVA.
+ */
+static void *
+vmem_bt_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait)
+{
+ vmem_addr_t addr;
+
+ *pflag = UMA_SLAB_KMEM;
+
+ /*
+ * Single thread boundary tag allocation so that the address space
+ * and memory are added in one atomic operation.
+ */
+ mtx_lock(&vmem_bt_lock);
+ if (vmem_xalloc(kmem_arena, bytes, 0, 0, 0, VMEM_ADDR_MIN,
+ VMEM_ADDR_MAX, M_NOWAIT | M_NOVM | M_USE_RESERVE | M_BESTFIT,
+ &addr) == 0) {
+ if (kmem_back(kmem_object, addr, bytes,
+ M_NOWAIT | M_USE_RESERVE) == 0) {
+ mtx_unlock(&vmem_bt_lock);
+ return ((void *)addr);
+ }
+ vmem_xfree(kmem_arena, addr, bytes);
+ mtx_unlock(&vmem_bt_lock);
+ /*
+ * Out of memory, not address space. This may not even be
+ * possible due to M_USE_RESERVE page allocation.
+ */
+ if (wait & M_WAITOK)
+ VM_WAIT;
+ return (NULL);
+ }
+ mtx_unlock(&vmem_bt_lock);
+ /*
+ * We're either out of address space or lost a fill race.
+ */
+ if (wait & M_WAITOK)
+ pause("btalloc", 1);
+
+ return (NULL);
+}
+#endif
+
+void
+vmem_startup(void)
+{
+
+ mtx_init(&vmem_list_lock, "vmem list lock", NULL, MTX_DEF);
+ vmem_bt_zone = uma_zcreate("vmem btag",
+ sizeof(struct vmem_btag), NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, UMA_ZONE_VM);
+#ifndef UMA_MD_SMALL_ALLOC
+ mtx_init(&vmem_bt_lock, "btag lock", NULL, MTX_DEF);
+ uma_prealloc(vmem_bt_zone, BT_MAXALLOC);
+ /*
+ * Reserve enough tags to allocate new tags. We allow multiple
+ * CPUs to attempt to allocate new tags concurrently to limit
+ * false restarts in UMA.
+ */
+ uma_zone_reserve(vmem_bt_zone, BT_MAXALLOC * (mp_ncpus + 1) / 2);
+ uma_zone_set_allocf(vmem_bt_zone, vmem_bt_alloc);
+#endif
+}
+
+/* ---- rehash */
+
+static int
+vmem_rehash(vmem_t *vm, vmem_size_t newhashsize)
+{
+ bt_t *bt;
+ int i;
+ struct vmem_hashlist *newhashlist;
+ struct vmem_hashlist *oldhashlist;
+ vmem_size_t oldhashsize;
+
+ MPASS(newhashsize > 0);
+
+ newhashlist = malloc(sizeof(struct vmem_hashlist) * newhashsize,
+ M_VMEM, M_NOWAIT);
+ if (newhashlist == NULL)
+ return ENOMEM;
+ for (i = 0; i < newhashsize; i++) {
+ LIST_INIT(&newhashlist[i]);
+ }
+
+ VMEM_LOCK(vm);
+ oldhashlist = vm->vm_hashlist;
+ oldhashsize = vm->vm_hashsize;
+ vm->vm_hashlist = newhashlist;
+ vm->vm_hashsize = newhashsize;
+ if (oldhashlist == NULL) {
+ VMEM_UNLOCK(vm);
+ return 0;
+ }
+ for (i = 0; i < oldhashsize; i++) {
+ while ((bt = LIST_FIRST(&oldhashlist[i])) != NULL) {
+ bt_rembusy(vm, bt);
+ bt_insbusy(vm, bt);
+ }
+ }
+ VMEM_UNLOCK(vm);
+
+ if (oldhashlist != vm->vm_hash0) {
+ free(oldhashlist, M_VMEM);
+ }
+
+ return 0;
+}
+
+static void
+vmem_periodic_kick(void *dummy)
+{
+
+ taskqueue_enqueue(taskqueue_thread, &vmem_periodic_wk);
+}
+
+static void
+vmem_periodic(void *unused, int pending)
+{
+ vmem_t *vm;
+ vmem_size_t desired;
+ vmem_size_t current;
+
+ mtx_lock(&vmem_list_lock);
+ LIST_FOREACH(vm, &vmem_list, vm_alllist) {
+#ifdef DIAGNOSTIC
+ /* Convenient time to verify vmem state. */
+ if (enable_vmem_check == 1) {
+ VMEM_LOCK(vm);
+ vmem_check(vm);
+ VMEM_UNLOCK(vm);
+ }
+#endif
+ desired = 1 << flsl(vm->vm_nbusytag);
+ desired = MIN(MAX(desired, VMEM_HASHSIZE_MIN),
+ VMEM_HASHSIZE_MAX);
+ current = vm->vm_hashsize;
+
+ /* Grow in powers of two. Shrink less aggressively. */
+ if (desired >= current * 2 || desired * 4 <= current)
+ vmem_rehash(vm, desired);
+ }
+ mtx_unlock(&vmem_list_lock);
+
+ callout_reset(&vmem_periodic_ch, vmem_periodic_interval,
+ vmem_periodic_kick, NULL);
+}
+
+static void
+vmem_start_callout(void *unused)
+{
+
+ TASK_INIT(&vmem_periodic_wk, 0, vmem_periodic, NULL);
+ vmem_periodic_interval = hz * 10;
+ callout_init(&vmem_periodic_ch, CALLOUT_MPSAFE);
+ callout_reset(&vmem_periodic_ch, vmem_periodic_interval,
+ vmem_periodic_kick, NULL);
+}
+SYSINIT(vfs, SI_SUB_CONFIGURE, SI_ORDER_ANY, vmem_start_callout, NULL);
+
+static void
+vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int type)
+{
+ bt_t *btspan;
+ bt_t *btfree;
+
+ MPASS(type == BT_TYPE_SPAN || type == BT_TYPE_SPAN_STATIC);
+ MPASS((size & vm->vm_quantum_mask) == 0);
+
+ btspan = bt_alloc(vm);
+ btspan->bt_type = type;
+ btspan->bt_start = addr;
+ btspan->bt_size = size;
+ bt_insseg_tail(vm, btspan);
+
+ btfree = bt_alloc(vm);
+ btfree->bt_type = BT_TYPE_FREE;
+ btfree->bt_start = addr;
+ btfree->bt_size = size;
+ bt_insseg(vm, btfree, btspan);
+ bt_insfree(vm, btfree);
+
+ vm->vm_size += size;
+}
+
+static void
+vmem_destroy1(vmem_t *vm)
+{
+ bt_t *bt;
+
+ /*
+ * Drain per-cpu quantum caches.
+ */
+ qc_destroy(vm);
+
+ /*
+ * The vmem should now only contain empty segments.
+ */
+ VMEM_LOCK(vm);
+ MPASS(vm->vm_nbusytag == 0);
+
+ while ((bt = TAILQ_FIRST(&vm->vm_seglist)) != NULL)
+ bt_remseg(vm, bt);
+
+ if (vm->vm_hashlist != NULL && vm->vm_hashlist != vm->vm_hash0)
+ free(vm->vm_hashlist, M_VMEM);
+
+ bt_freetrim(vm, 0);
+
+ VMEM_CONDVAR_DESTROY(vm);
+ VMEM_LOCK_DESTROY(vm);
+ free(vm, M_VMEM);
+}
+
+static int
+vmem_import(vmem_t *vm, vmem_size_t size, vmem_size_t align, int flags)
+{
+ vmem_addr_t addr;
+ int error;
+
+ if (vm->vm_importfn == NULL)
+ return EINVAL;
+
+ /*
+ * To make sure we get a span that meets the alignment, we double the
+ * alignment and add it to the requested size. This slightly overestimates.
+ */
+ if (align != vm->vm_quantum_mask + 1)
+ size = (align * 2) + size;
+ size = roundup(size, vm->vm_import_quantum);
+
+ /*
+ * Hide MAXALLOC tags so we're guaranteed to be able to add this
+ * span and the tag we want to allocate from it.
+ */
+ MPASS(vm->vm_nfreetags >= BT_MAXALLOC);
+ vm->vm_nfreetags -= BT_MAXALLOC;
+ VMEM_UNLOCK(vm);
+ error = (vm->vm_importfn)(vm->vm_arg, size, flags, &addr);
+ VMEM_LOCK(vm);
+ vm->vm_nfreetags += BT_MAXALLOC;
+ if (error)
+ return ENOMEM;
+
+ vmem_add1(vm, addr, size, BT_TYPE_SPAN);
+
+ return 0;
+}
+
+/*
+ * vmem_fit: check if a bt can satisfy the given restrictions.
+ *
+ * It is the caller's responsibility to ensure the region is big enough
+ * before calling us.
+ */
+static int
+vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align,
+ vmem_size_t phase, vmem_size_t nocross, vmem_addr_t minaddr,
+ vmem_addr_t maxaddr, vmem_addr_t *addrp)
+{
+ vmem_addr_t start;
+ vmem_addr_t end;
+
+ MPASS(size > 0);
+ MPASS(bt->bt_size >= size); /* caller's responsibility */
+
+ /*
+ * XXX assumption: vmem_addr_t and vmem_size_t are
+ * unsigned integers of the same size.
+ */
+
+ start = bt->bt_start;
+ if (start < minaddr) {
+ start = minaddr;
+ }
+ end = BT_END(bt);
+ if (end > maxaddr)
+ end = maxaddr;
+ if (start > end)
+ return (ENOMEM);
+
+ start = VMEM_ALIGNUP(start - phase, align) + phase;
+ if (start < bt->bt_start)
+ start += align;
+ if (VMEM_CROSS_P(start, start + size - 1, nocross)) {
+ MPASS(align < nocross);
+ start = VMEM_ALIGNUP(start - phase, nocross) + phase;
+ }
+ if (start <= end && end - start >= size - 1) {
+ MPASS((start & (align - 1)) == phase);
+ MPASS(!VMEM_CROSS_P(start, start + size - 1, nocross));
+ MPASS(minaddr <= start);
+ MPASS(maxaddr == 0 || start + size - 1 <= maxaddr);
+ MPASS(bt->bt_start <= start);
+ MPASS(BT_END(bt) - start >= size - 1);
+ *addrp = start;
+
+ return (0);
+ }
+ return (ENOMEM);
+}
+
+/*
+ * vmem_clip: Trim the boundary tag edges to the requested start and size.
+ */
+static void
+vmem_clip(vmem_t *vm, bt_t *bt, vmem_addr_t start, vmem_size_t size)
+{
+ bt_t *btnew;
+ bt_t *btprev;
+
+ VMEM_ASSERT_LOCKED(vm);
+ MPASS(bt->bt_type == BT_TYPE_FREE);
+ MPASS(bt->bt_size >= size);
+ bt_remfree(vm, bt);
+ if (bt->bt_start != start) {
+ btprev = bt_alloc(vm);
+ btprev->bt_type = BT_TYPE_FREE;
+ btprev->bt_start = bt->bt_start;
+ btprev->bt_size = start - bt->bt_start;
+ bt->bt_start = start;
+ bt->bt_size -= btprev->bt_size;
+ bt_insfree(vm, btprev);
+ bt_insseg(vm, btprev,
+ TAILQ_PREV(bt, vmem_seglist, bt_seglist));
+ }
+ MPASS(bt->bt_start == start);
+ if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) {
+ /* split */
+ btnew = bt_alloc(vm);
+ btnew->bt_type = BT_TYPE_BUSY;
+ btnew->bt_start = bt->bt_start;
+ btnew->bt_size = size;
+ bt->bt_start = bt->bt_start + size;
+ bt->bt_size -= size;
+ bt_insfree(vm, bt);
+ bt_insseg(vm, btnew,
+ TAILQ_PREV(bt, vmem_seglist, bt_seglist));
+ bt_insbusy(vm, btnew);
+ bt = btnew;
+ } else {
+ bt->bt_type = BT_TYPE_BUSY;
+ bt_insbusy(vm, bt);
+ }
+ MPASS(bt->bt_size >= size);
+ bt->bt_type = BT_TYPE_BUSY;
+}
+
+/* ---- vmem API */
+
+void
+vmem_set_import(vmem_t *vm, vmem_import_t *importfn,
+ vmem_release_t *releasefn, void *arg, vmem_size_t import_quantum)
+{
+
+ VMEM_LOCK(vm);
+ vm->vm_importfn = importfn;
+ vm->vm_releasefn = releasefn;
+ vm->vm_arg = arg;
+ vm->vm_import_quantum = import_quantum;
+ VMEM_UNLOCK(vm);
+}
+
+void
+vmem_set_reclaim(vmem_t *vm, vmem_reclaim_t *reclaimfn)
+{
+
+ VMEM_LOCK(vm);
+ vm->vm_reclaimfn = reclaimfn;
+ VMEM_UNLOCK(vm);
+}
+
+/*
+ * vmem_init: Initializes vmem arena.
+ */
+vmem_t *
+vmem_init(vmem_t *vm, const char *name, vmem_addr_t base, vmem_size_t size,
+ vmem_size_t quantum, vmem_size_t qcache_max, int flags)
+{
+ int i;
+
+ MPASS(quantum > 0);
+
+ bzero(vm, sizeof(*vm));
+
+ VMEM_CONDVAR_INIT(vm, name);
+ VMEM_LOCK_INIT(vm, name);
+ vm->vm_nfreetags = 0;
+ LIST_INIT(&vm->vm_freetags);
+ strlcpy(vm->vm_name, name, sizeof(vm->vm_name));
+ vm->vm_quantum_mask = quantum - 1;
+ vm->vm_quantum_shift = SIZE2ORDER(quantum);
+ MPASS(ORDER2SIZE(vm->vm_quantum_shift) == quantum);
+ vm->vm_nbusytag = 0;
+ vm->vm_size = 0;
+ vm->vm_inuse = 0;
+ qc_init(vm, qcache_max);
+
+ TAILQ_INIT(&vm->vm_seglist);
+ for (i = 0; i < VMEM_MAXORDER; i++) {
+ LIST_INIT(&vm->vm_freelist[i]);
+ }
+ memset(&vm->vm_hash0, 0, sizeof(vm->vm_hash0));
+ vm->vm_hashsize = VMEM_HASHSIZE_MIN;
+ vm->vm_hashlist = vm->vm_hash0;
+
+ if (size != 0) {
+ if (vmem_add(vm, base, size, flags) != 0) {
+ vmem_destroy1(vm);
+ return NULL;
+ }
+ }
+
+ mtx_lock(&vmem_list_lock);
+ LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist);
+ mtx_unlock(&vmem_list_lock);
+
+ return vm;
+}
+
+/*
+ * vmem_create: create an arena.
+ */
+vmem_t *
+vmem_create(const char *name, vmem_addr_t base, vmem_size_t size,
+ vmem_size_t quantum, vmem_size_t qcache_max, int flags)
+{
+
+ vmem_t *vm;
+
+ vm = malloc(sizeof(*vm), M_VMEM, flags & (M_WAITOK|M_NOWAIT));
+ if (vm == NULL)
+ return (NULL);
+ if (vmem_init(vm, name, base, size, quantum, qcache_max,
+ flags) == NULL) {
+ free(vm, M_VMEM);
+ return (NULL);
+ }
+ return (vm);
+}
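
A hedged sketch of arena usage with the functions defined in this file; foo_arena,
the address range and the sizes are illustrative only, and error handling is reduced
to checking return values.

    static void
    foo_arena_demo(void)
    {
            vmem_t *foo_arena;
            vmem_addr_t addr;

            /* Arena covering [0x100000, 0x200000) with a 4 KB quantum, no qcache. */
            foo_arena = vmem_create("foo", 0x100000, 0x100000, 4096, 0, M_WAITOK);

            /* Best-fit allocation of 64 KB; the address comes back in addr. */
            if (vmem_alloc(foo_arena, 65536, M_BESTFIT | M_WAITOK, &addr) == 0)
                    vmem_xfree(foo_arena, addr, 65536);

            vmem_destroy(foo_arena);
    }
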
+
+void
+vmem_destroy(vmem_t *vm)
+{
+
+ mtx_lock(&vmem_list_lock);
+ LIST_REMOVE(vm, vm_alllist);
+ mtx_unlock(&vmem_list_lock);
+
+ vmem_destroy1(vm);
+}
+
+vmem_size_t
+vmem_roundup_size(vmem_t *vm, vmem_size_t size)
+{
+
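+ /* Round size up to a multiple of the arena's quantum. */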
+ return (size + vm->vm_quantum_mask) & ~vm->vm_quantum_mask;
+}
+
+/*
+ * vmem_alloc: allocate a resource from the arena.
+ */
+int
+vmem_alloc(vmem_t *vm, vmem_size_t size, int flags, vmem_addr_t *addrp)
+{
+ const int strat __unused = flags & VMEM_FITMASK;
+ qcache_t *qc;
+
+ flags &= VMEM_FLAGS;
+ MPASS(size > 0);
+ MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT);
+ if ((flags & M_NOWAIT) == 0)
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_alloc");
+
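+ /*
+ * Requests no larger than the quantum cache limit are satisfied
+ * directly from the per-size UMA caches instead of going through
+ * vmem_xalloc().
+ */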
+ if (size <= vm->vm_qcache_max) {
+ qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift];
+ *addrp = (vmem_addr_t)uma_zalloc(qc->qc_cache, flags);
+ if (*addrp == 0)
+ return (ENOMEM);
+ return (0);
+ }
+
+ return vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX,
+ flags, addrp);
+}
+
+int
+vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align,
+ const vmem_size_t phase, const vmem_size_t nocross,
+ const vmem_addr_t minaddr, const vmem_addr_t maxaddr, int flags,
+ vmem_addr_t *addrp)
+{
+ const vmem_size_t size = vmem_roundup_size(vm, size0);
+ struct vmem_freelist *list;
+ struct vmem_freelist *first;
+ struct vmem_freelist *end;
+ vmem_size_t avail;
+ bt_t *bt;
+ int error;
+ int strat;
+
+ flags &= VMEM_FLAGS;
+ strat = flags & VMEM_FITMASK;
+ MPASS(size0 > 0);
+ MPASS(size > 0);
+ MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT);
+ MPASS((flags & (M_NOWAIT|M_WAITOK)) != (M_NOWAIT|M_WAITOK));
+ if ((flags & M_NOWAIT) == 0)
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_xalloc");
+ MPASS((align & vm->vm_quantum_mask) == 0);
+ MPASS((align & (align - 1)) == 0);
+ MPASS((phase & vm->vm_quantum_mask) == 0);
+ MPASS((nocross & vm->vm_quantum_mask) == 0);
+ MPASS((nocross & (nocross - 1)) == 0);
+ MPASS((align == 0 && phase == 0) || phase < align);
+ MPASS(nocross == 0 || nocross >= size);
+ MPASS(minaddr <= maxaddr);
+ MPASS(!VMEM_CROSS_P(phase, phase + size - 1, nocross));
+
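+ /* An alignment of 0 means the arena's natural (quantum) alignment. */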
+ if (align == 0)
+ align = vm->vm_quantum_mask + 1;
+
+ *addrp = 0;
+ end = &vm->vm_freelist[VMEM_MAXORDER];
+ /*
+ * choose a free block from which we allocate.
+ */
+ first = bt_freehead_toalloc(vm, size, strat);
+ VMEM_LOCK(vm);
+ for (;;) {
+ /*
+ * Make sure we have enough tags to complete the
+ * operation.
+ */
+ if (vm->vm_nfreetags < BT_MAXALLOC &&
+ bt_fill(vm, flags) != 0) {
+ error = ENOMEM;
+ break;
+ }
+ /*
+ * Scan freelists looking for a tag that satisfies the
+ * allocation. If we're doing BESTFIT we may encounter
+ * sizes below the request. If we're doing FIRSTFIT we
+ * inspect only the first element from each list.
+ */
+ for (list = first; list < end; list++) {
+ LIST_FOREACH(bt, list, bt_freelist) {
+ if (bt->bt_size >= size) {
+ error = vmem_fit(bt, size, align, phase,
+ nocross, minaddr, maxaddr, addrp);
+ if (error == 0) {
+ vmem_clip(vm, bt, *addrp, size);
+ goto out;
+ }
+ }
+ /* FIRST skips to the next list. */
+ if (strat == M_FIRSTFIT)
+ break;
+ }
+ }
+ /*
+ * Retry if the fast algorithm failed.
+ */
+ if (strat == M_FIRSTFIT) {
+ strat = M_BESTFIT;
+ first = bt_freehead_toalloc(vm, size, strat);
+ continue;
+ }
+ /*
+ * XXX it is possible to fail to meet restrictions with the
+ * imported region. It is up to the user to specify the
+ * import quantum such that it can satisfy any allocation.
+ */
+ if (vmem_import(vm, size, align, flags) == 0)
+ continue;
+
+ /*
+ * Try to free some space from the quantum cache or reclaim
+ * functions if available.
+ */
+ if (vm->vm_qcache_max != 0 || vm->vm_reclaimfn != NULL) {
+ avail = vm->vm_size - vm->vm_inuse;
+ VMEM_UNLOCK(vm);
+ if (vm->vm_qcache_max != 0)
+ qc_drain(vm);
+ if (vm->vm_reclaimfn != NULL)
+ vm->vm_reclaimfn(vm, flags);
+ VMEM_LOCK(vm);
+ /* If we were successful retry even NOWAIT. */
+ if (vm->vm_size - vm->vm_inuse > avail)
+ continue;
+ }
+ if ((flags & M_NOWAIT) != 0) {
+ error = ENOMEM;
+ break;
+ }
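+ /* Wait for resources to be freed; vmem_xfree() broadcasts the condvar. */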
+ VMEM_CONDVAR_WAIT(vm);
+ }
+out:
+ VMEM_UNLOCK(vm);
+ if (error != 0 && (flags & M_NOWAIT) == 0)
+ panic("failed to allocate waiting allocation\n");
+
+ return (error);
+}
+
+/*
+ * vmem_free: free the resource to the arena.
+ */
+void
+vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
+{
+ qcache_t *qc;
+ MPASS(size > 0);
+
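+ /* Allocations small enough to have come from the quantum cache go back there. */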
+ if (size <= vm->vm_qcache_max) {
+ qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift];
+ uma_zfree(qc->qc_cache, (void *)addr);
+ } else
+ vmem_xfree(vm, addr, size);
+}
+
+void
+vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
+{
+ bt_t *bt;
+ bt_t *t;
+
+ MPASS(size > 0);
+
+ VMEM_LOCK(vm);
+ bt = bt_lookupbusy(vm, addr);
+ MPASS(bt != NULL);
+ MPASS(bt->bt_start == addr);
+ MPASS(bt->bt_size == vmem_roundup_size(vm, size) ||
+ bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask);
+ MPASS(bt->bt_type == BT_TYPE_BUSY);
+ bt_rembusy(vm, bt);
+ bt->bt_type = BT_TYPE_FREE;
+
+ /* coalesce */
+ t = TAILQ_NEXT(bt, bt_seglist);
+ if (t != NULL && t->bt_type == BT_TYPE_FREE) {
+ MPASS(BT_END(bt) < t->bt_start); /* YYY */
+ bt->bt_size += t->bt_size;
+ bt_remfree(vm, t);
+ bt_remseg(vm, t);
+ }
+ t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
+ if (t != NULL && t->bt_type == BT_TYPE_FREE) {
+ MPASS(BT_END(t) < bt->bt_start); /* YYY */
+ bt->bt_size += t->bt_size;
+ bt->bt_start = t->bt_start;
+ bt_remfree(vm, t);
+ bt_remseg(vm, t);
+ }
+
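+ /*
+ * If the coalesced free segment covers its entire imported span,
+ * return the whole span to the source via the release function.
+ */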
+ t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
+ MPASS(t != NULL);
+ MPASS(BT_ISSPAN_P(t) || t->bt_type == BT_TYPE_BUSY);
+ if (vm->vm_releasefn != NULL && t->bt_type == BT_TYPE_SPAN &&
+ t->bt_size == bt->bt_size) {
+ vmem_addr_t spanaddr;
+ vmem_size_t spansize;
+
+ MPASS(t->bt_start == bt->bt_start);
+ spanaddr = bt->bt_start;
+ spansize = bt->bt_size;
+ bt_remseg(vm, bt);
+ bt_remseg(vm, t);
+ vm->vm_size -= spansize;
+ VMEM_CONDVAR_BROADCAST(vm);
+ bt_freetrim(vm, BT_MAXFREE);
+ (*vm->vm_releasefn)(vm->vm_arg, spanaddr, spansize);
+ } else {
+ bt_insfree(vm, bt);
+ VMEM_CONDVAR_BROADCAST(vm);
+ bt_freetrim(vm, BT_MAXFREE);
+ }
+}
+
+/*
+ * vmem_add: add a contiguous span of addresses to the arena.
+ */
+int
+vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int flags)
+{
+ int error;
+
+ error = 0;
+ flags &= VMEM_FLAGS;
+ VMEM_LOCK(vm);
+ if (vm->vm_nfreetags >= BT_MAXALLOC || bt_fill(vm, flags) == 0)
+ vmem_add1(vm, addr, size, BT_TYPE_SPAN_STATIC);
+ else
+ error = ENOMEM;
+ VMEM_UNLOCK(vm);
+
+ return (error);
+}
+
+/*
+ * vmem_size: report the arena's allocated, free, or total size.
+ */
+vmem_size_t
+vmem_size(vmem_t *vm, int typemask)
+{
+
+ switch (typemask) {
+ case VMEM_ALLOC:
+ return vm->vm_inuse;
+ case VMEM_FREE:
+ return vm->vm_size - vm->vm_inuse;
+ case VMEM_FREE|VMEM_ALLOC:
+ return vm->vm_size;
+ default:
+ panic("vmem_size");
+ }
+}
+
+/* ---- debug */
+
+#if defined(DDB) || defined(DIAGNOSTIC)
+
+static void bt_dump(const bt_t *, int (*)(const char *, ...)
+ __printflike(1, 2));
+
+static const char *
+bt_type_string(int type)
+{
+
+ switch (type) {
+ case BT_TYPE_BUSY:
+ return "busy";
+ case BT_TYPE_FREE:
+ return "free";
+ case BT_TYPE_SPAN:
+ return "span";
+ case BT_TYPE_SPAN_STATIC:
+ return "static span";
+ default:
+ break;
+ }
+ return "BOGUS";
+}
+
+static void
+bt_dump(const bt_t *bt, int (*pr)(const char *, ...))
+{
+
+ (*pr)("\t%p: %jx %jx, %d(%s)\n",
+ bt, (intmax_t)bt->bt_start, (intmax_t)bt->bt_size,
+ bt->bt_type, bt_type_string(bt->bt_type));
+}
+
+static void
+vmem_dump(const vmem_t *vm, int (*pr)(const char *, ...) __printflike(1, 2))
+{
+ const bt_t *bt;
+ int i;
+
+ (*pr)("vmem %p '%s'\n", vm, vm->vm_name);
+ TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
+ bt_dump(bt, pr);
+ }
+
+ for (i = 0; i < VMEM_MAXORDER; i++) {
+ const struct vmem_freelist *fl = &vm->vm_freelist[i];
+
+ if (LIST_EMPTY(fl)) {
+ continue;
+ }
+
+ (*pr)("freelist[%d]\n", i);
+ LIST_FOREACH(bt, fl, bt_freelist) {
+ bt_dump(bt, pr);
+ }
+ }
+}
+
+#endif /* defined(DDB) || defined(DIAGNOSTIC) */
+
+#if defined(DDB)
+static bt_t *
+vmem_whatis_lookup(vmem_t *vm, vmem_addr_t addr)
+{
+ bt_t *bt;
+
+ TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
+ if (BT_ISSPAN_P(bt)) {
+ continue;
+ }
+ if (bt->bt_start <= addr && addr <= BT_END(bt)) {
+ return bt;
+ }
+ }
+
+ return NULL;
+}
+
+void
+vmem_whatis(vmem_addr_t addr, int (*pr)(const char *, ...))
+{
+ vmem_t *vm;
+
+ LIST_FOREACH(vm, &vmem_list, vm_alllist) {
+ bt_t *bt;
+
+ bt = vmem_whatis_lookup(vm, addr);
+ if (bt == NULL) {
+ continue;
+ }
+ (*pr)("%p is %p+%zu in VMEM '%s' (%s)\n",
+ (void *)addr, (void *)bt->bt_start,
+ (vmem_size_t)(addr - bt->bt_start), vm->vm_name,
+ (bt->bt_type == BT_TYPE_BUSY) ? "allocated" : "free");
+ }
+}
+
+void
+vmem_printall(const char *modif, int (*pr)(const char *, ...))
+{
+ const vmem_t *vm;
+
+ LIST_FOREACH(vm, &vmem_list, vm_alllist) {
+ vmem_dump(vm, pr);
+ }
+}
+
+void
+vmem_print(vmem_addr_t addr, const char *modif, int (*pr)(const char *, ...))
+{
+ const vmem_t *vm = (const void *)addr;
+
+ vmem_dump(vm, pr);
+}
+#endif /* defined(DDB) */
+
+#define vmem_printf printf
+
+#if defined(DIAGNOSTIC)
+
+static bool
+vmem_check_sanity(vmem_t *vm)
+{
+ const bt_t *bt, *bt2;
+
+ MPASS(vm != NULL);
+
+ TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
+ if (bt->bt_start > BT_END(bt)) {
+ printf("corrupted tag\n");
+ bt_dump(bt, vmem_printf);
+ return false;
+ }
+ }
+ TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
+ TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) {
+ if (bt == bt2) {
+ continue;
+ }
+ if (BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2)) {
+ continue;
+ }
+ if (bt->bt_start <= BT_END(bt2) &&
+ bt2->bt_start <= BT_END(bt)) {
+ printf("overwrapped tags\n");
+ bt_dump(bt, vmem_printf);
+ bt_dump(bt2, vmem_printf);
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+static void
+vmem_check(vmem_t *vm)
+{
+
+ if (!vmem_check_sanity(vm)) {
+ panic("insanity vmem %p", vm);
+ }
+}
+
+#endif /* defined(DIAGNOSTIC) */
diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c
new file mode 100644
index 0000000..9d3040d
--- /dev/null
+++ b/sys/kern/subr_witness.c
@@ -0,0 +1,2912 @@
+/*-
+ * Copyright (c) 2008 Isilon Systems, Inc.
+ * Copyright (c) 2008 Ilya Maykov <ivmaykov@gmail.com>
+ * Copyright (c) 1998 Berkeley Software Design, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Berkeley Software Design Inc's name may not be used to endorse or
+ * promote products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
+ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
+ */
+
+/*
+ * Implementation of the `witness' lock verifier. Originally implemented for
+ * mutexes in BSD/OS. Extended to handle generic lock objects and lock
+ * classes in FreeBSD.
+ */
+
+/*
+ * Main Entry: witness
+ * Pronunciation: 'wit-n&s
+ * Function: noun
+ * Etymology: Middle English witnesse, from Old English witnes knowledge,
+ * testimony, witness, from 2wit
+ * Date: before 12th century
+ * 1 : attestation of a fact or event : TESTIMONY
+ * 2 : one that gives evidence; specifically : one who testifies in
+ * a cause or before a judicial tribunal
+ * 3 : one asked to be present at a transaction so as to be able to
+ * testify to its having taken place
+ * 4 : one who has personal knowledge of something
+ * 5 a : something serving as evidence or proof : SIGN
+ * b : public affirmation by word or example of usually
+ * religious faith or conviction <the heroic witness to divine
+ * life -- Pilot>
+ * 6 capitalized : a member of the Jehovah's Witnesses
+ */
+
+/*
+ * Special rules concerning Giant and lock orders:
+ *
+ * 1) Giant must be acquired before any other mutexes. Stated another way,
+ * no other mutex may be held when Giant is acquired.
+ *
+ * 2) Giant must be released when blocking on a sleepable lock.
+ *
+ * This rule is less obvious, but is a result of Giant providing the same
+ * semantics as spl(). Basically, when a thread sleeps, it must release
+ * Giant. When a thread blocks on a sleepable lock, it sleeps. Hence rule
+ * 2).
+ *
+ * 3) Giant may be acquired before or after sleepable locks.
+ *
+ * This rule is also not quite as obvious. Giant may be acquired after
+ * a sleepable lock because it is a non-sleepable lock and non-sleepable
+ * locks may always be acquired while holding a sleepable lock. The second
+ * case, Giant before a sleepable lock, follows from rule 2) above. Suppose
+ * you have two threads T1 and T2 and a sleepable lock X. Suppose that T1
+ * acquires X and blocks on Giant. Then suppose that T2 acquires Giant and
+ * blocks on X. When T2 blocks on X, T2 will release Giant allowing T1 to
+ * execute. Thus, acquiring Giant both before and after a sleepable lock
+ * will not result in a lock order reversal.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_hwpmc_hooks.h"
+#include "opt_stack.h"
+#include "opt_witness.h"
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/sched.h>
+#include <sys/stack.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <machine/stdarg.h>
+
+#if !defined(DDB) && !defined(STACK)
+#error "DDB or STACK options are required for WITNESS"
+#endif
+
+/* Note that these traces do not work with KTR_ALQ. */
+#if 0
+#define KTR_WITNESS KTR_SUBSYS
+#else
+#define KTR_WITNESS 0
+#endif
+
+#define LI_RECURSEMASK 0x0000ffff /* Recursion depth of lock instance. */
+#define LI_EXCLUSIVE 0x00010000 /* Exclusive lock instance. */
+#define LI_NORELEASE 0x00020000 /* Lock not allowed to be released. */
+
+/* Define this to check for blessed mutexes */
+#undef BLESSING
+
+#define WITNESS_COUNT 1024
+#define WITNESS_CHILDCOUNT (WITNESS_COUNT * 4)
+#define WITNESS_HASH_SIZE 251 /* Prime, gives load factor < 2 */
+#define WITNESS_PENDLIST 1024
+
+/* Allocate 256 KB of stack data space */
+#define WITNESS_LO_DATA_COUNT 2048
+
+/* Prime, gives load factor of ~2 at full load */
+#define WITNESS_LO_HASH_SIZE 1021
+
+/*
+ * XXX: This is somewhat bogus, as we assume here that at most 2048 threads
+ * will hold LOCK_NCHILDREN locks. We handle failure gracefully, and we
+ * should probably be safe for the most part, but it's still a SWAG.
+ */
+#define LOCK_NCHILDREN 5
+#define LOCK_CHILDCOUNT 2048
+
+#define MAX_W_NAME 64
+
+#define BADSTACK_SBUF_SIZE (256 * WITNESS_COUNT)
+#define FULLGRAPH_SBUF_SIZE 512
+
+/*
+ * These flags go in the witness relationship matrix and describe the
+ * relationship between any two struct witness objects.
+ */
+#define WITNESS_UNRELATED 0x00 /* No lock order relation. */
+#define WITNESS_PARENT 0x01 /* Parent, aka direct ancestor. */
+#define WITNESS_ANCESTOR 0x02 /* Direct or indirect ancestor. */
+#define WITNESS_CHILD 0x04 /* Child, aka direct descendant. */
+#define WITNESS_DESCENDANT 0x08 /* Direct or indirect descendant. */
+#define WITNESS_ANCESTOR_MASK (WITNESS_PARENT | WITNESS_ANCESTOR)
+#define WITNESS_DESCENDANT_MASK (WITNESS_CHILD | WITNESS_DESCENDANT)
+#define WITNESS_RELATED_MASK \
+ (WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK)
+#define WITNESS_REVERSAL 0x10 /* A lock order reversal has been
+ * observed. */
+#define WITNESS_RESERVED1 0x20 /* Unused flag, reserved. */
+#define WITNESS_RESERVED2 0x40 /* Unused flag, reserved. */
+#define WITNESS_LOCK_ORDER_KNOWN 0x80 /* This lock order is known. */
+
+/* Descendant to ancestor flags */
+#define WITNESS_DTOA(x) (((x) & WITNESS_RELATED_MASK) >> 2)
+
+/* Ancestor to descendant flags */
+#define WITNESS_ATOD(x) (((x) & WITNESS_RELATED_MASK) << 2)
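+/* A shift by 2 converts between the PARENT/ANCESTOR and CHILD/DESCENDANT bits. */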
+
+#define WITNESS_INDEX_ASSERT(i) \
+ MPASS((i) > 0 && (i) <= w_max_used_index && (i) < WITNESS_COUNT)
+
+static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness");
+
+/*
+ * Lock instances. A lock instance is the data associated with a lock while
+ * it is held by witness. For example, a lock instance will hold the
+ * recursion count of a lock. Lock instances are held in lists. Spin locks
+ * are held in a per-cpu list while sleep locks are held in a per-thread list.
+ */
+struct lock_instance {
+ struct lock_object *li_lock;
+ const char *li_file;
+ int li_line;
+ u_int li_flags;
+};
+
+/*
+ * A simple list type used to build the list of locks held by a thread
+ * or CPU. We can't simply embed the list in struct lock_object since a
+ * lock may be held by more than one thread if it is a shared lock. Locks
+ * are added to the head of the list, so we fill up each list entry from
+ * "the back" logically. To ease some of the arithmetic, we actually fill
+ * in each list entry the normal way (children[0] then children[1], etc.) but
+ * when we traverse the list we read children[count-1] as the first entry
+ * down to children[0] as the final entry.
+ */
+struct lock_list_entry {
+ struct lock_list_entry *ll_next;
+ struct lock_instance ll_children[LOCK_NCHILDREN];
+ u_int ll_count;
+};
+
+/*
+ * The main witness structure. One of these per named lock type in the system
+ * (for example, "vnode interlock").
+ */
+struct witness {
+ char w_name[MAX_W_NAME];
+ uint32_t w_index; /* Index in the relationship matrix */
+ struct lock_class *w_class;
+ STAILQ_ENTRY(witness) w_list; /* List of all witnesses. */
+ STAILQ_ENTRY(witness) w_typelist; /* Witnesses of a type. */
+ struct witness *w_hash_next; /* Linked list in hash buckets. */
+ const char *w_file; /* File where last acquired */
+ uint32_t w_line; /* Line where last acquired */
+ uint32_t w_refcount;
+ uint16_t w_num_ancestors; /* direct/indirect
+ * ancestor count */
+ uint16_t w_num_descendants; /* direct/indirect
+ * descendant count */
+ int16_t w_ddb_level;
+ unsigned w_displayed:1;
+ unsigned w_reversed:1;
+};
+
+STAILQ_HEAD(witness_list, witness);
+
+/*
+ * The witness hash table. Keys are witness names (const char *), elements are
+ * witness objects (struct witness *).
+ */
+struct witness_hash {
+ struct witness *wh_array[WITNESS_HASH_SIZE];
+ uint32_t wh_size;
+ uint32_t wh_count;
+};
+
+/*
+ * Key type for the lock order data hash table.
+ */
+struct witness_lock_order_key {
+ uint16_t from;
+ uint16_t to;
+};
+
+struct witness_lock_order_data {
+ struct stack wlod_stack;
+ struct witness_lock_order_key wlod_key;
+ struct witness_lock_order_data *wlod_next;
+};
+
+/*
+ * The witness lock order data hash table. Keys are witness index tuples
+ * (struct witness_lock_order_key), elements are lock order data objects
+ * (struct witness_lock_order_data).
+ */
+struct witness_lock_order_hash {
+ struct witness_lock_order_data *wloh_array[WITNESS_LO_HASH_SIZE];
+ u_int wloh_size;
+ u_int wloh_count;
+};
+
+#ifdef BLESSING
+struct witness_blessed {
+ const char *b_lock1;
+ const char *b_lock2;
+};
+#endif
+
+struct witness_pendhelp {
+ const char *wh_type;
+ struct lock_object *wh_lock;
+};
+
+struct witness_order_list_entry {
+ const char *w_name;
+ struct lock_class *w_class;
+};
+
+/*
+ * Returns 0 if one of the locks is a spin lock and the other is not.
+ * Returns 1 otherwise.
+ */
+static __inline int
+witness_lock_type_equal(struct witness *w1, struct witness *w2)
+{
+
+ return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) ==
+ (w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)));
+}
+
+static __inline int
+witness_lock_order_key_empty(const struct witness_lock_order_key *key)
+{
+
+ return (key->from == 0 && key->to == 0);
+}
+
+static __inline int
+witness_lock_order_key_equal(const struct witness_lock_order_key *a,
+ const struct witness_lock_order_key *b)
+{
+
+ return (a->from == b->from && a->to == b->to);
+}
+
+static int _isitmyx(struct witness *w1, struct witness *w2, int rmask,
+ const char *fname);
+#ifdef KDB
+static void _witness_debugger(int cond, const char *msg);
+#endif
+static void adopt(struct witness *parent, struct witness *child);
+#ifdef BLESSING
+static int blessed(struct witness *, struct witness *);
+#endif
+static void depart(struct witness *w);
+static struct witness *enroll(const char *description,
+ struct lock_class *lock_class);
+static struct lock_instance *find_instance(struct lock_list_entry *list,
+ const struct lock_object *lock);
+static int isitmychild(struct witness *parent, struct witness *child);
+static int isitmydescendant(struct witness *parent, struct witness *child);
+static void itismychild(struct witness *parent, struct witness *child);
+static int sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS);
+static int sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS);
+static int sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS);
+static void witness_add_fullgraph(struct sbuf *sb, struct witness *parent);
+#ifdef DDB
+static void witness_ddb_compute_levels(void);
+static void witness_ddb_display(int(*)(const char *fmt, ...));
+static void witness_ddb_display_descendants(int(*)(const char *fmt, ...),
+ struct witness *, int indent);
+static void witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
+ struct witness_list *list);
+static void witness_ddb_level_descendants(struct witness *parent, int l);
+static void witness_ddb_list(struct thread *td);
+#endif
+static void witness_free(struct witness *m);
+static struct witness *witness_get(void);
+static uint32_t witness_hash_djb2(const uint8_t *key, uint32_t size);
+static struct witness *witness_hash_get(const char *key);
+static void witness_hash_put(struct witness *w);
+static void witness_init_hash_tables(void);
+static void witness_increment_graph_generation(void);
+static void witness_lock_list_free(struct lock_list_entry *lle);
+static struct lock_list_entry *witness_lock_list_get(void);
+static int witness_lock_order_add(struct witness *parent,
+ struct witness *child);
+static int witness_lock_order_check(struct witness *parent,
+ struct witness *child);
+static struct witness_lock_order_data *witness_lock_order_get(
+ struct witness *parent,
+ struct witness *child);
+static void witness_list_lock(struct lock_instance *instance,
+ int (*prnt)(const char *fmt, ...));
+static void witness_setflag(struct lock_object *lock, int flag, int set);
+
+#ifdef KDB
+#define witness_debugger(c) _witness_debugger(c, __func__)
+#else
+#define witness_debugger(c)
+#endif
+
+static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, NULL,
+ "Witness Locking");
+
+/*
+ * If set to 0, lock order checking is disabled. If set to -1,
+ * witness is completely disabled. Otherwise witness performs full
+ * lock order checking for all locks. At runtime, lock order checking
+ * may be toggled. However, witness cannot be reenabled once it is
+ * completely disabled.
+ */
+static int witness_watch = 1;
+TUNABLE_INT("debug.witness.watch", &witness_watch);
+SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RW | CTLTYPE_INT, NULL, 0,
+ sysctl_debug_witness_watch, "I", "witness is watching lock operations");
+
+#ifdef KDB
+/*
+ * When KDB is enabled and witness_kdb is 1, it will cause the system
+ * to drop into kdebug() when:
+ * - a lock hierarchy violation occurs
+ * - locks are held when going to sleep.
+ */
+#ifdef WITNESS_KDB
+int witness_kdb = 1;
+#else
+int witness_kdb = 0;
+#endif
+TUNABLE_INT("debug.witness.kdb", &witness_kdb);
+SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RW, &witness_kdb, 0, "");
+
+/*
+ * When KDB is enabled and witness_trace is 1, it will cause the system
+ * to print a stack trace when:
+ * - a lock hierarchy violation occurs
+ * - locks are held when going to sleep.
+ */
+int witness_trace = 1;
+TUNABLE_INT("debug.witness.trace", &witness_trace);
+SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RW, &witness_trace, 0, "");
+#endif /* KDB */
+
+#ifdef WITNESS_SKIPSPIN
+int witness_skipspin = 1;
+#else
+int witness_skipspin = 0;
+#endif
+TUNABLE_INT("debug.witness.skipspin", &witness_skipspin);
+SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin,
+ 0, "");
+
+/*
+ * Call this to print out the relations between locks.
+ */
+SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD,
+ NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs");
+
+/*
+ * Call this to print out the witness faulty stacks.
+ */
+SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD,
+ NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks");
+
+static struct mtx w_mtx;
+
+/* w_list */
+static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free);
+static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all);
+
+/* w_typelist */
+static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin);
+static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep);
+
+/* lock list */
+static struct lock_list_entry *w_lock_list_free = NULL;
+static struct witness_pendhelp pending_locks[WITNESS_PENDLIST];
+static u_int pending_cnt;
+
+static int w_free_cnt, w_spin_cnt, w_sleep_cnt;
+SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, "");
+SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, "");
+SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0,
+ "");
+
+static struct witness *w_data;
+static uint8_t w_rmatrix[WITNESS_COUNT+1][WITNESS_COUNT+1];
+static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT];
+static struct witness_hash w_hash; /* The witness hash table. */
+
+/* The lock order data hash */
+static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT];
+static struct witness_lock_order_data *w_lofree = NULL;
+static struct witness_lock_order_hash w_lohash;
+static int w_max_used_index = 0;
+static unsigned int w_generation = 0;
+static const char w_notrunning[] = "Witness not running\n";
+static const char w_stillcold[] = "Witness is still cold\n";
+
+static struct witness_order_list_entry order_lists[] = {
+ /*
+ * sx locks
+ */
+ { "proctree", &lock_class_sx },
+ { "allproc", &lock_class_sx },
+ { "allprison", &lock_class_sx },
+ { NULL, NULL },
+ /*
+ * Various mutexes
+ */
+ { "Giant", &lock_class_mtx_sleep },
+ { "pipe mutex", &lock_class_mtx_sleep },
+ { "sigio lock", &lock_class_mtx_sleep },
+ { "process group", &lock_class_mtx_sleep },
+ { "process lock", &lock_class_mtx_sleep },
+ { "session", &lock_class_mtx_sleep },
+ { "uidinfo hash", &lock_class_rw },
+#ifdef HWPMC_HOOKS
+ { "pmc-sleep", &lock_class_mtx_sleep },
+#endif
+ { "time lock", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * Sockets
+ */
+ { "accept", &lock_class_mtx_sleep },
+ { "so_snd", &lock_class_mtx_sleep },
+ { "so_rcv", &lock_class_mtx_sleep },
+ { "sellck", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * Routing
+ */
+ { "so_rcv", &lock_class_mtx_sleep },
+ { "radix node head", &lock_class_rw },
+ { "rtentry", &lock_class_mtx_sleep },
+ { "ifaddr", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * IPv4 multicast:
+ * protocol locks before interface locks, after UDP locks.
+ */
+ { "udpinp", &lock_class_rw },
+ { "in_multi_mtx", &lock_class_mtx_sleep },
+ { "igmp_mtx", &lock_class_mtx_sleep },
+ { "if_addr_lock", &lock_class_rw },
+ { NULL, NULL },
+ /*
+ * IPv6 multicast:
+ * protocol locks before interface locks, after UDP locks.
+ */
+ { "udpinp", &lock_class_rw },
+ { "in6_multi_mtx", &lock_class_mtx_sleep },
+ { "mld_mtx", &lock_class_mtx_sleep },
+ { "if_addr_lock", &lock_class_rw },
+ { NULL, NULL },
+ /*
+ * UNIX Domain Sockets
+ */
+ { "unp_global_rwlock", &lock_class_rw },
+ { "unp_list_lock", &lock_class_mtx_sleep },
+ { "unp", &lock_class_mtx_sleep },
+ { "so_snd", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * UDP/IP
+ */
+ { "udp", &lock_class_rw },
+ { "udpinp", &lock_class_rw },
+ { "so_snd", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * TCP/IP
+ */
+ { "tcp", &lock_class_rw },
+ { "tcpinp", &lock_class_rw },
+ { "so_snd", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * netatalk
+ */
+ { "ddp_list_mtx", &lock_class_mtx_sleep },
+ { "ddp_mtx", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * BPF
+ */
+ { "bpf global lock", &lock_class_mtx_sleep },
+ { "bpf interface lock", &lock_class_rw },
+ { "bpf cdev lock", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * NFS server
+ */
+ { "nfsd_mtx", &lock_class_mtx_sleep },
+ { "so_snd", &lock_class_mtx_sleep },
+ { NULL, NULL },
+
+ /*
+ * IEEE 802.11
+ */
+ { "802.11 com lock", &lock_class_mtx_sleep},
+ { NULL, NULL },
+ /*
+ * Network drivers
+ */
+ { "network driver", &lock_class_mtx_sleep},
+ { NULL, NULL },
+
+ /*
+ * Netgraph
+ */
+ { "ng_node", &lock_class_mtx_sleep },
+ { "ng_worklist", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * CDEV
+ */
+ { "vm map (system)", &lock_class_mtx_sleep },
+ { "vm page queue", &lock_class_mtx_sleep },
+ { "vnode interlock", &lock_class_mtx_sleep },
+ { "cdev", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * VM
+ */
+ { "vm map (user)", &lock_class_sx },
+ { "vm object", &lock_class_rw },
+ { "vm page", &lock_class_mtx_sleep },
+ { "vm page queue", &lock_class_mtx_sleep },
+ { "pmap pv global", &lock_class_rw },
+ { "pmap", &lock_class_mtx_sleep },
+ { "pmap pv list", &lock_class_rw },
+ { "vm page free queue", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * kqueue/VFS interaction
+ */
+ { "kqueue", &lock_class_mtx_sleep },
+ { "struct mount mtx", &lock_class_mtx_sleep },
+ { "vnode interlock", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
+ * ZFS locking
+ */
+ { "dn->dn_mtx", &lock_class_sx },
+ { "dr->dt.di.dr_mtx", &lock_class_sx },
+ { "db->db_mtx", &lock_class_sx },
+ { NULL, NULL },
+ /*
+ * spin locks
+ */
+#ifdef SMP
+ { "ap boot", &lock_class_mtx_spin },
+#endif
+ { "rm.mutex_mtx", &lock_class_mtx_spin },
+ { "sio", &lock_class_mtx_spin },
+ { "scrlock", &lock_class_mtx_spin },
+#ifdef __i386__
+ { "cy", &lock_class_mtx_spin },
+#endif
+#ifdef __sparc64__
+ { "pcib_mtx", &lock_class_mtx_spin },
+ { "rtc_mtx", &lock_class_mtx_spin },
+#endif
+ { "scc_hwmtx", &lock_class_mtx_spin },
+ { "uart_hwmtx", &lock_class_mtx_spin },
+ { "fast_taskqueue", &lock_class_mtx_spin },
+ { "intr table", &lock_class_mtx_spin },
+#ifdef HWPMC_HOOKS
+ { "pmc-per-proc", &lock_class_mtx_spin },
+#endif
+ { "process slock", &lock_class_mtx_spin },
+ { "sleepq chain", &lock_class_mtx_spin },
+ { "umtx lock", &lock_class_mtx_spin },
+ { "rm_spinlock", &lock_class_mtx_spin },
+ { "turnstile chain", &lock_class_mtx_spin },
+ { "turnstile lock", &lock_class_mtx_spin },
+ { "sched lock", &lock_class_mtx_spin },
+ { "td_contested", &lock_class_mtx_spin },
+ { "callout", &lock_class_mtx_spin },
+ { "entropy harvest mutex", &lock_class_mtx_spin },
+ { "syscons video lock", &lock_class_mtx_spin },
+#ifdef SMP
+ { "smp rendezvous", &lock_class_mtx_spin },
+#endif
+#ifdef __powerpc__
+ { "tlb0", &lock_class_mtx_spin },
+#endif
+ /*
+ * leaf locks
+ */
+ { "intrcnt", &lock_class_mtx_spin },
+ { "icu", &lock_class_mtx_spin },
+#ifdef __i386__
+ { "allpmaps", &lock_class_mtx_spin },
+ { "descriptor tables", &lock_class_mtx_spin },
+#endif
+ { "clk", &lock_class_mtx_spin },
+ { "cpuset", &lock_class_mtx_spin },
+ { "mprof lock", &lock_class_mtx_spin },
+ { "zombie lock", &lock_class_mtx_spin },
+ { "ALD Queue", &lock_class_mtx_spin },
+#ifdef __ia64__
+ { "MCA spin lock", &lock_class_mtx_spin },
+#endif
+#if defined(__i386__) || defined(__amd64__)
+ { "pcicfg", &lock_class_mtx_spin },
+ { "NDIS thread lock", &lock_class_mtx_spin },
+#endif
+ { "tw_osl_io_lock", &lock_class_mtx_spin },
+ { "tw_osl_q_lock", &lock_class_mtx_spin },
+ { "tw_cl_io_lock", &lock_class_mtx_spin },
+ { "tw_cl_intr_lock", &lock_class_mtx_spin },
+ { "tw_cl_gen_lock", &lock_class_mtx_spin },
+#ifdef HWPMC_HOOKS
+ { "pmc-leaf", &lock_class_mtx_spin },
+#endif
+ { "blocked lock", &lock_class_mtx_spin },
+ { NULL, NULL },
+ { NULL, NULL }
+};
+
+#ifdef BLESSING
+/*
+ * Pairs of locks which have been blessed
+ * Don't complain about order problems with blessed locks
+ */
+static struct witness_blessed blessed_list[] = {
+};
+static int blessed_count =
+ sizeof(blessed_list) / sizeof(struct witness_blessed);
+#endif
+
+/*
+ * This global is set to 0 once it becomes safe to use the witness code.
+ */
+static int witness_cold = 1;
+
+/*
+ * This global is set to 1 once the static lock orders have been enrolled
+ * so that a warning can be issued for any spin locks enrolled later.
+ */
+static int witness_spin_warn = 0;
+
+/* Trim useless garbage from filenames. */
+static const char *
+fixup_filename(const char *file)
+{
+
+ if (file == NULL)
+ return (NULL);
+ while (strncmp(file, "../", 3) == 0)
+ file += 3;
+ return (file);
+}
+
+/*
+ * The WITNESS-enabled diagnostic code. Note that the witness code does
+ * assume that the early boot is single-threaded at least until after this
+ * routine is completed.
+ */
+static void
+witness_initialize(void *dummy __unused)
+{
+ struct lock_object *lock;
+ struct witness_order_list_entry *order;
+ struct witness *w, *w1;
+ int i;
+
+ w_data = malloc(sizeof (struct witness) * WITNESS_COUNT, M_WITNESS,
+ M_NOWAIT | M_ZERO);
+
+ /*
+ * We have to release Giant before initializing its witness
+ * structure so that WITNESS doesn't get confused.
+ */
+ mtx_unlock(&Giant);
+ mtx_assert(&Giant, MA_NOTOWNED);
+
+ CTR1(KTR_WITNESS, "%s: initializing witness", __func__);
+ mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET |
+ MTX_NOWITNESS | MTX_NOPROFILE);
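+ /*
+ * Set up the witness records in reverse index order so that the
+ * entry with index 0 ends up at the head of the free list, as the
+ * KASSERT below expects.
+ */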
+ for (i = WITNESS_COUNT - 1; i >= 0; i--) {
+ w = &w_data[i];
+ memset(w, 0, sizeof(*w));
+ w_data[i].w_index = i; /* Witness index never changes. */
+ witness_free(w);
+ }
+ KASSERT(STAILQ_FIRST(&w_free)->w_index == 0,
+ ("%s: Invalid list of free witness objects", __func__));
+
+ /* Witness with index 0 is deliberately left unused, to aid in debugging. */
+ STAILQ_REMOVE_HEAD(&w_free, w_list);
+ w_free_cnt--;
+
+ memset(w_rmatrix, 0,
+ (sizeof(**w_rmatrix) * (WITNESS_COUNT+1) * (WITNESS_COUNT+1)));
+
+ for (i = 0; i < LOCK_CHILDCOUNT; i++)
+ witness_lock_list_free(&w_locklistdata[i]);
+ witness_init_hash_tables();
+
+ /* First add in all the specified order lists. */
+ for (order = order_lists; order->w_name != NULL; order++) {
+ w = enroll(order->w_name, order->w_class);
+ if (w == NULL)
+ continue;
+ w->w_file = "order list";
+ for (order++; order->w_name != NULL; order++) {
+ w1 = enroll(order->w_name, order->w_class);
+ if (w1 == NULL)
+ continue;
+ w1->w_file = "order list";
+ itismychild(w, w1);
+ w = w1;
+ }
+ }
+ witness_spin_warn = 1;
+
+ /* Iterate through all locks and add them to witness. */
+ for (i = 0; pending_locks[i].wh_lock != NULL; i++) {
+ lock = pending_locks[i].wh_lock;
+ KASSERT(lock->lo_flags & LO_WITNESS,
+ ("%s: lock %s is on pending list but not LO_WITNESS",
+ __func__, lock->lo_name));
+ lock->lo_witness = enroll(pending_locks[i].wh_type,
+ LOCK_CLASS(lock));
+ }
+
+ /* Mark the witness code as being ready for use. */
+ witness_cold = 0;
+
+ mtx_lock(&Giant);
+}
+SYSINIT(witness_init, SI_SUB_WITNESS, SI_ORDER_FIRST, witness_initialize,
+ NULL);
+
+void
+witness_init(struct lock_object *lock, const char *type)
+{
+ struct lock_class *class;
+
+ /* Various sanity checks. */
+ class = LOCK_CLASS(lock);
+ if ((lock->lo_flags & LO_RECURSABLE) != 0 &&
+ (class->lc_flags & LC_RECURSABLE) == 0)
+ kassert_panic("%s: lock (%s) %s can not be recursable",
+ __func__, class->lc_name, lock->lo_name);
+ if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
+ (class->lc_flags & LC_SLEEPABLE) == 0)
+ kassert_panic("%s: lock (%s) %s can not be sleepable",
+ __func__, class->lc_name, lock->lo_name);
+ if ((lock->lo_flags & LO_UPGRADABLE) != 0 &&
+ (class->lc_flags & LC_UPGRADABLE) == 0)
+ kassert_panic("%s: lock (%s) %s can not be upgradable",
+ __func__, class->lc_name, lock->lo_name);
+
+ /*
+ * If we shouldn't watch this lock, then just clear lo_witness.
+ * Otherwise, if witness_cold is set, then it is too early to
+ * enroll this lock, so defer it to witness_initialize() by adding
+ * it to the pending_locks list. If it is not too early, then enroll
+ * the lock now.
+ */
+ if (witness_watch < 1 || panicstr != NULL ||
+ (lock->lo_flags & LO_WITNESS) == 0)
+ lock->lo_witness = NULL;
+ else if (witness_cold) {
+ pending_locks[pending_cnt].wh_lock = lock;
+ pending_locks[pending_cnt++].wh_type = type;
+ if (pending_cnt > WITNESS_PENDLIST)
+ panic("%s: pending locks list is too small, "
+ "increase WITNESS_PENDLIST\n",
+ __func__);
+ } else
+ lock->lo_witness = enroll(type, class);
+}
+
+void
+witness_destroy(struct lock_object *lock)
+{
+ struct lock_class *class;
+ struct witness *w;
+
+ class = LOCK_CLASS(lock);
+
+ if (witness_cold)
+ panic("lock (%s) %s destroyed while witness_cold",
+ class->lc_name, lock->lo_name);
+
+ /* XXX: need to verify that no one holds the lock */
+ if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL)
+ return;
+ w = lock->lo_witness;
+
+ mtx_lock_spin(&w_mtx);
+ MPASS(w->w_refcount > 0);
+ w->w_refcount--;
+
+ if (w->w_refcount == 0)
+ depart(w);
+ mtx_unlock_spin(&w_mtx);
+}
+
+#ifdef DDB
+static void
+witness_ddb_compute_levels(void)
+{
+ struct witness *w;
+
+ /*
+ * First clear all levels.
+ */
+ STAILQ_FOREACH(w, &w_all, w_list)
+ w->w_ddb_level = -1;
+
+ /*
+ * Look for locks with no parents and level all their descendants.
+ */
+ STAILQ_FOREACH(w, &w_all, w_list) {
+
+ /* If the witness has ancestors (is not a root), skip it. */
+ if (w->w_num_ancestors > 0)
+ continue;
+ witness_ddb_level_descendants(w, 0);
+ }
+}
+
+static void
+witness_ddb_level_descendants(struct witness *w, int l)
+{
+ int i;
+
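+ /* A witness keeps the deepest level at which it has been seen. */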
+ if (w->w_ddb_level >= l)
+ return;
+
+ w->w_ddb_level = l;
+ l++;
+
+ for (i = 1; i <= w_max_used_index; i++) {
+ if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
+ witness_ddb_level_descendants(&w_data[i], l);
+ }
+}
+
+static void
+witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...),
+ struct witness *w, int indent)
+{
+ int i;
+
+ for (i = 0; i < indent; i++)
+ prnt(" ");
+ prnt("%s (type: %s, depth: %d, active refs: %d)",
+ w->w_name, w->w_class->lc_name,
+ w->w_ddb_level, w->w_refcount);
+ if (w->w_displayed) {
+ prnt(" -- (already displayed)\n");
+ return;
+ }
+ w->w_displayed = 1;
+ if (w->w_file != NULL && w->w_line != 0)
+ prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file),
+ w->w_line);
+ else
+ prnt(" -- never acquired\n");
+ indent++;
+ WITNESS_INDEX_ASSERT(w->w_index);
+ for (i = 1; i <= w_max_used_index; i++) {
+ if (db_pager_quit)
+ return;
+ if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
+ witness_ddb_display_descendants(prnt, &w_data[i],
+ indent);
+ }
+}
+
+static void
+witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
+ struct witness_list *list)
+{
+ struct witness *w;
+
+ STAILQ_FOREACH(w, list, w_typelist) {
+ if (w->w_file == NULL || w->w_ddb_level > 0)
+ continue;
+
+ /* This lock has no ancestors - display its descendants. */
+ witness_ddb_display_descendants(prnt, w, 0);
+ if (db_pager_quit)
+ return;
+ }
+}
+
+static void
+witness_ddb_display(int(*prnt)(const char *fmt, ...))
+{
+ struct witness *w;
+
+ KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
+ witness_ddb_compute_levels();
+
+ /* Clear all the displayed flags. */
+ STAILQ_FOREACH(w, &w_all, w_list)
+ w->w_displayed = 0;
+
+ /*
+ * First, handle sleep locks which have been acquired at least
+ * once.
+ */
+ prnt("Sleep locks:\n");
+ witness_ddb_display_list(prnt, &w_sleep);
+ if (db_pager_quit)
+ return;
+
+ /*
+ * Now do spin locks which have been acquired at least once.
+ */
+ prnt("\nSpin locks:\n");
+ witness_ddb_display_list(prnt, &w_spin);
+ if (db_pager_quit)
+ return;
+
+ /*
+ * Finally, any locks which have not been acquired yet.
+ */
+ prnt("\nLocks which were never acquired:\n");
+ STAILQ_FOREACH(w, &w_all, w_list) {
+ if (w->w_file != NULL || w->w_refcount == 0)
+ continue;
+ prnt("%s (type: %s, depth: %d)\n", w->w_name,
+ w->w_class->lc_name, w->w_ddb_level);
+ if (db_pager_quit)
+ return;
+ }
+}
+#endif /* DDB */
+
+int
+witness_defineorder(struct lock_object *lock1, struct lock_object *lock2)
+{
+
+ if (witness_watch == -1 || panicstr != NULL)
+ return (0);
+
+ /* Require locks that witness knows about. */
+ if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL ||
+ lock2->lo_witness == NULL)
+ return (EINVAL);
+
+ mtx_assert(&w_mtx, MA_NOTOWNED);
+ mtx_lock_spin(&w_mtx);
+
+ /*
+ * If we already have either an explicit or implied lock order that
+ * is the other way around, then return an error.
+ */
+ if (witness_watch &&
+ isitmydescendant(lock2->lo_witness, lock1->lo_witness)) {
+ mtx_unlock_spin(&w_mtx);
+ return (EDOOFUS);
+ }
+
+ /* Try to add the new order. */
+ CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
+ lock2->lo_witness->w_name, lock1->lo_witness->w_name);
+ itismychild(lock1->lo_witness, lock2->lo_witness);
+ mtx_unlock_spin(&w_mtx);
+ return (0);
+}
+
+void
+witness_checkorder(struct lock_object *lock, int flags, const char *file,
+ int line, struct lock_object *interlock)
+{
+ struct lock_list_entry *lock_list, *lle;
+ struct lock_instance *lock1, *lock2, *plock;
+ struct lock_class *class, *iclass;
+ struct witness *w, *w1;
+ struct thread *td;
+ int i, j;
+
+ if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL ||
+ panicstr != NULL)
+ return;
+
+ w = lock->lo_witness;
+ class = LOCK_CLASS(lock);
+ td = curthread;
+
+ if (class->lc_flags & LC_SLEEPLOCK) {
+
+ /*
+ * Since spin locks include a critical section, this check
+ * implicitly enforces a lock order of all sleep locks before
+ * all spin locks.
+ */
+ if (td->td_critnest != 0 && !kdb_active)
+ kassert_panic("acquiring blockable sleep lock with "
+ "spinlock or critical section held (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+
+ /*
+ * If this is the first lock acquired then just return as
+ * no order checking is needed.
+ */
+ lock_list = td->td_sleeplocks;
+ if (lock_list == NULL || lock_list->ll_count == 0)
+ return;
+ } else {
+
+ /*
+ * If this is the first lock, just return as no order
+ * checking is needed. Avoid problems with thread
+ * migration pinning the thread while checking if
+ * spinlocks are held. If at least one spinlock is held
+ * the thread is in a safe path and it is allowed to
+ * unpin it.
+ */
+ sched_pin();
+ lock_list = PCPU_GET(spinlocks);
+ if (lock_list == NULL || lock_list->ll_count == 0) {
+ sched_unpin();
+ return;
+ }
+ sched_unpin();
+ }
+
+ /*
+ * Check to see if we are recursing on a lock we already own. If
+ * so, make sure that we don't mismatch exclusive and shared lock
+ * acquires.
+ */
+ lock1 = find_instance(lock_list, lock);
+ if (lock1 != NULL) {
+ if ((lock1->li_flags & LI_EXCLUSIVE) != 0 &&
+ (flags & LOP_EXCLUSIVE) == 0) {
+ printf("shared lock of (%s) %s @ %s:%d\n",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ printf("while exclusively locked from %s:%d\n",
+ fixup_filename(lock1->li_file), lock1->li_line);
+ kassert_panic("excl->share");
+ }
+ if ((lock1->li_flags & LI_EXCLUSIVE) == 0 &&
+ (flags & LOP_EXCLUSIVE) != 0) {
+ printf("exclusive lock of (%s) %s @ %s:%d\n",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ printf("while share locked from %s:%d\n",
+ fixup_filename(lock1->li_file), lock1->li_line);
+ kassert_panic("share->excl");
+ }
+ return;
+ }
+
+ /* Warn if the interlock is not locked exactly once. */
+ if (interlock != NULL) {
+ iclass = LOCK_CLASS(interlock);
+ lock1 = find_instance(lock_list, interlock);
+ if (lock1 == NULL)
+ kassert_panic("interlock (%s) %s not locked @ %s:%d",
+ iclass->lc_name, interlock->lo_name,
+ fixup_filename(file), line);
+ else if ((lock1->li_flags & LI_RECURSEMASK) != 0)
+ kassert_panic("interlock (%s) %s recursed @ %s:%d",
+ iclass->lc_name, interlock->lo_name,
+ fixup_filename(file), line);
+ }
+
+ /*
+ * Find the previously acquired lock, but ignore interlocks.
+ */
+ plock = &lock_list->ll_children[lock_list->ll_count - 1];
+ if (interlock != NULL && plock->li_lock == interlock) {
+ if (lock_list->ll_count > 1)
+ plock =
+ &lock_list->ll_children[lock_list->ll_count - 2];
+ else {
+ lle = lock_list->ll_next;
+
+ /*
+ * The interlock is the only lock we hold, so
+ * simply return.
+ */
+ if (lle == NULL)
+ return;
+ plock = &lle->ll_children[lle->ll_count - 1];
+ }
+ }
+
+ /*
+ * Try to perform most checks without a lock. If this succeeds we
+ * can skip acquiring the lock and return success.
+ */
+ w1 = plock->li_lock->lo_witness;
+ if (witness_lock_order_check(w1, w))
+ return;
+
+ /*
+ * Check for duplicate locks of the same type. Note that we only
+ * have to check for this on the last lock we just acquired. Any
+ * other cases will be caught as lock order violations.
+ */
+ mtx_lock_spin(&w_mtx);
+ witness_lock_order_add(w1, w);
+ if (w1 == w) {
+ i = w->w_index;
+ if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) &&
+ !(w_rmatrix[i][i] & WITNESS_REVERSAL)) {
+ w_rmatrix[i][i] |= WITNESS_REVERSAL;
+ w->w_reversed = 1;
+ mtx_unlock_spin(&w_mtx);
+ printf(
+ "acquiring duplicate lock of same type: \"%s\"\n",
+ w->w_name);
+ printf(" 1st %s @ %s:%d\n", plock->li_lock->lo_name,
+ fixup_filename(plock->li_file), plock->li_line);
+ printf(" 2nd %s @ %s:%d\n", lock->lo_name,
+ fixup_filename(file), line);
+ witness_debugger(1);
+ } else
+ mtx_unlock_spin(&w_mtx);
+ return;
+ }
+ mtx_assert(&w_mtx, MA_OWNED);
+
+ /*
+ * If we know that the lock we are acquiring comes after
+ * the lock we most recently acquired in the lock order tree,
+ * then there is no need for any further checks.
+ */
+ if (isitmychild(w1, w))
+ goto out;
+
+ for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) {
+ for (i = lle->ll_count - 1; i >= 0; i--, j++) {
+
+ MPASS(j < WITNESS_COUNT);
+ lock1 = &lle->ll_children[i];
+
+ /*
+ * Ignore the interlock.
+ */
+ if (interlock == lock1->li_lock)
+ continue;
+
+ /*
+ * If this lock doesn't undergo witness checking,
+ * then skip it.
+ */
+ w1 = lock1->li_lock->lo_witness;
+ if (w1 == NULL) {
+ KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0,
+ ("lock missing witness structure"));
+ continue;
+ }
+
+ /*
+ * If we are locking Giant and this is a sleepable
+ * lock, then skip it.
+ */
+ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 &&
+ lock == &Giant.lock_object)
+ continue;
+
+ /*
+ * If we are locking a sleepable lock and this lock
+ * is Giant, then skip it.
+ */
+ if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
+ lock1->li_lock == &Giant.lock_object)
+ continue;
+
+ /*
+ * If we are locking a sleepable lock and this lock
+ * isn't sleepable, we want to treat it as a lock
+ * order violation to enforce a general lock order of
+ * sleepable locks before non-sleepable locks.
+ */
+ if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
+ (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
+ goto reversal;
+
+ /*
+ * If we are locking Giant and this is a non-sleepable
+ * lock, then treat it as a reversal.
+ */
+ if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 &&
+ lock == &Giant.lock_object)
+ goto reversal;
+
+ /*
+ * Check the lock order hierarchy for a reversal.
+ */
+ if (!isitmydescendant(w, w1))
+ continue;
+ reversal:
+
+ /*
+ * We have a lock order violation, check to see if it
+ * is allowed or has already been yelled about.
+ */
+#ifdef BLESSING
+
+ /*
+ * If the lock order is blessed, just bail. We don't
+ * look for other lock order violations though, which
+ * may be a bug.
+ */
+ if (blessed(w, w1))
+ goto out;
+#endif
+
+ /* Bail if this violation is known */
+ if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL)
+ goto out;
+
+ /* Record this as a violation */
+ w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL;
+ w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL;
+ w->w_reversed = w1->w_reversed = 1;
+ witness_increment_graph_generation();
+ mtx_unlock_spin(&w_mtx);
+
+#ifdef WITNESS_NO_VNODE
+ /*
+ * There are known LORs between VNODE locks. They are
+ * not an indication of a bug. VNODE locks are flagged
+ * as such (LO_IS_VNODE) and we don't yell if the LOR
+ * is between 2 VNODE locks.
+ */
+ if ((lock->lo_flags & LO_IS_VNODE) != 0 &&
+ (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0)
+ return;
+#endif
+
+ /*
+ * Ok, yell about it.
+ */
+ if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
+ (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
+ printf(
+ "lock order reversal: (sleepable after non-sleepable)\n");
+ else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0
+ && lock == &Giant.lock_object)
+ printf(
+ "lock order reversal: (Giant after non-sleepable)\n");
+ else
+ printf("lock order reversal:\n");
+
+ /*
+ * Try to locate an earlier lock with
+ * witness w in our list.
+ */
+ do {
+ lock2 = &lle->ll_children[i];
+ MPASS(lock2->li_lock != NULL);
+ if (lock2->li_lock->lo_witness == w)
+ break;
+ if (i == 0 && lle->ll_next != NULL) {
+ lle = lle->ll_next;
+ i = lle->ll_count - 1;
+ MPASS(i >= 0 && i < LOCK_NCHILDREN);
+ } else
+ i--;
+ } while (i >= 0);
+ if (i < 0) {
+ printf(" 1st %p %s (%s) @ %s:%d\n",
+ lock1->li_lock, lock1->li_lock->lo_name,
+ w1->w_name, fixup_filename(lock1->li_file),
+ lock1->li_line);
+ printf(" 2nd %p %s (%s) @ %s:%d\n", lock,
+ lock->lo_name, w->w_name,
+ fixup_filename(file), line);
+ } else {
+ printf(" 1st %p %s (%s) @ %s:%d\n",
+ lock2->li_lock, lock2->li_lock->lo_name,
+ lock2->li_lock->lo_witness->w_name,
+ fixup_filename(lock2->li_file),
+ lock2->li_line);
+ printf(" 2nd %p %s (%s) @ %s:%d\n",
+ lock1->li_lock, lock1->li_lock->lo_name,
+ w1->w_name, fixup_filename(lock1->li_file),
+ lock1->li_line);
+ printf(" 3rd %p %s (%s) @ %s:%d\n", lock,
+ lock->lo_name, w->w_name,
+ fixup_filename(file), line);
+ }
+ witness_debugger(1);
+ return;
+ }
+ }
+
+ /*
+ * If requested, build a new lock order. However, don't build a new
+ * relationship between a sleepable lock and Giant if it is in the
+ * wrong direction. The correct lock order is that sleepable locks
+ * always come before Giant.
+ */
+ if (flags & LOP_NEWORDER &&
+ !(plock->li_lock == &Giant.lock_object &&
+ (lock->lo_flags & LO_SLEEPABLE) != 0)) {
+ CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
+ w->w_name, plock->li_lock->lo_witness->w_name);
+ itismychild(plock->li_lock->lo_witness, w);
+ }
+out:
+ mtx_unlock_spin(&w_mtx);
+}
+
+void
+witness_lock(struct lock_object *lock, int flags, const char *file, int line)
+{
+ struct lock_list_entry **lock_list, *lle;
+ struct lock_instance *instance;
+ struct witness *w;
+ struct thread *td;
+
+ if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL ||
+ panicstr != NULL)
+ return;
+ w = lock->lo_witness;
+ td = curthread;
+
+ /* Determine lock list for this lock. */
+ if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK)
+ lock_list = &td->td_sleeplocks;
+ else
+ lock_list = PCPU_PTR(spinlocks);
+
+ /* Check to see if we are recursing on a lock we already own. */
+ instance = find_instance(*lock_list, lock);
+ if (instance != NULL) {
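+ /* The low bits of li_flags (LI_RECURSEMASK) hold the recursion count. */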
+ instance->li_flags++;
+ CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__,
+ td->td_proc->p_pid, lock->lo_name,
+ instance->li_flags & LI_RECURSEMASK);
+ instance->li_file = file;
+ instance->li_line = line;
+ return;
+ }
+
+ /* Update per-witness last file and line acquire. */
+ w->w_file = file;
+ w->w_line = line;
+
+ /* Find the next open lock instance in the list and fill it. */
+ lle = *lock_list;
+ if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) {
+ lle = witness_lock_list_get();
+ if (lle == NULL)
+ return;
+ lle->ll_next = *lock_list;
+ CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__,
+ td->td_proc->p_pid, lle);
+ *lock_list = lle;
+ }
+ instance = &lle->ll_children[lle->ll_count++];
+ instance->li_lock = lock;
+ instance->li_line = line;
+ instance->li_file = file;
+ if ((flags & LOP_EXCLUSIVE) != 0)
+ instance->li_flags = LI_EXCLUSIVE;
+ else
+ instance->li_flags = 0;
+ CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__,
+ td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1);
+}
+
+void
+witness_upgrade(struct lock_object *lock, int flags, const char *file, int line)
+{
+ struct lock_instance *instance;
+ struct lock_class *class;
+
+ KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
+ if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
+ return;
+ class = LOCK_CLASS(lock);
+ if (witness_watch) {
+ if ((lock->lo_flags & LO_UPGRADABLE) == 0)
+ kassert_panic(
+ "upgrade of non-upgradable lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ if ((class->lc_flags & LC_SLEEPLOCK) == 0)
+ kassert_panic(
+ "upgrade of non-sleep lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ }
+ instance = find_instance(curthread->td_sleeplocks, lock);
+ if (instance == NULL) {
+ kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ return;
+ }
+ if (witness_watch) {
+ if ((instance->li_flags & LI_EXCLUSIVE) != 0)
+ kassert_panic(
+ "upgrade of exclusive lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ if ((instance->li_flags & LI_RECURSEMASK) != 0)
+ kassert_panic(
+ "upgrade of recursed lock (%s) %s r=%d @ %s:%d",
+ class->lc_name, lock->lo_name,
+ instance->li_flags & LI_RECURSEMASK,
+ fixup_filename(file), line);
+ }
+ instance->li_flags |= LI_EXCLUSIVE;
+}
+
+void
+witness_downgrade(struct lock_object *lock, int flags, const char *file,
+ int line)
+{
+ struct lock_instance *instance;
+ struct lock_class *class;
+
+ KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
+ if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
+ return;
+ class = LOCK_CLASS(lock);
+ if (witness_watch) {
+ if ((lock->lo_flags & LO_UPGRADABLE) == 0)
+ kassert_panic(
+ "downgrade of non-upgradable lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ if ((class->lc_flags & LC_SLEEPLOCK) == 0)
+ kassert_panic(
+ "downgrade of non-sleep lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ }
+ instance = find_instance(curthread->td_sleeplocks, lock);
+ if (instance == NULL) {
+ kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ return;
+ }
+ if (witness_watch) {
+ if ((instance->li_flags & LI_EXCLUSIVE) == 0)
+ kassert_panic(
+ "downgrade of shared lock (%s) %s @ %s:%d",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ if ((instance->li_flags & LI_RECURSEMASK) != 0)
+ kassert_panic(
+ "downgrade of recursed lock (%s) %s r=%d @ %s:%d",
+ class->lc_name, lock->lo_name,
+ instance->li_flags & LI_RECURSEMASK,
+ fixup_filename(file), line);
+ }
+ instance->li_flags &= ~LI_EXCLUSIVE;
+}
+
+void
+witness_unlock(struct lock_object *lock, int flags, const char *file, int line)
+{
+ struct lock_list_entry **lock_list, *lle;
+ struct lock_instance *instance;
+ struct lock_class *class;
+ struct thread *td;
+ register_t s;
+ int i, j;
+
+ if (witness_cold || lock->lo_witness == NULL || panicstr != NULL)
+ return;
+ td = curthread;
+ class = LOCK_CLASS(lock);
+
+ /* Find lock instance associated with this lock. */
+ if (class->lc_flags & LC_SLEEPLOCK)
+ lock_list = &td->td_sleeplocks;
+ else
+ lock_list = PCPU_PTR(spinlocks);
+ lle = *lock_list;
+ for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next)
+ for (i = 0; i < (*lock_list)->ll_count; i++) {
+ instance = &(*lock_list)->ll_children[i];
+ if (instance->li_lock == lock)
+ goto found;
+ }
+
+ /*
+ * When disabling WITNESS through witness_watch we could end up with
+ * registered locks still present in the td_sleeplocks queue.
+ * We have to make sure those queues are flushed, so just search for
+ * any such leftover registered locks and remove them.
+ */
+ if (witness_watch > 0) {
+ kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name,
+ lock->lo_name, fixup_filename(file), line);
+ return;
+ } else {
+ return;
+ }
+found:
+
+ /* First, check for shared/exclusive mismatches. */
+ if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 &&
+ (flags & LOP_EXCLUSIVE) == 0) {
+ printf("shared unlock of (%s) %s @ %s:%d\n", class->lc_name,
+ lock->lo_name, fixup_filename(file), line);
+ printf("while exclusively locked from %s:%d\n",
+ fixup_filename(instance->li_file), instance->li_line);
+ kassert_panic("excl->ushare");
+ }
+ if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 &&
+ (flags & LOP_EXCLUSIVE) != 0) {
+ printf("exclusive unlock of (%s) %s @ %s:%d\n", class->lc_name,
+ lock->lo_name, fixup_filename(file), line);
+ printf("while share locked from %s:%d\n",
+ fixup_filename(instance->li_file),
+ instance->li_line);
+ kassert_panic("share->uexcl");
+ }
+ /* If we are recursed, unrecurse. */
+ if ((instance->li_flags & LI_RECURSEMASK) > 0) {
+ CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__,
+ td->td_proc->p_pid, instance->li_lock->lo_name,
+ instance->li_flags);
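+ /* The recursion count lives in the low bits of li_flags. */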
+ instance->li_flags--;
+ return;
+ }
+ /* The lock is now being dropped; check for the NORELEASE flag. */
+ if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) {
+ printf("forbidden unlock of (%s) %s @ %s:%d\n", class->lc_name,
+ lock->lo_name, fixup_filename(file), line);
+ kassert_panic("lock marked norelease");
+ }
+
+ /* Otherwise, remove this item from the list. */
+ s = intr_disable();
+ CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__,
+ td->td_proc->p_pid, instance->li_lock->lo_name,
+ (*lock_list)->ll_count - 1);
+ for (j = i; j < (*lock_list)->ll_count - 1; j++)
+ (*lock_list)->ll_children[j] =
+ (*lock_list)->ll_children[j + 1];
+ (*lock_list)->ll_count--;
+ intr_restore(s);
+
+ /*
+ * To reduce contention on w_mtx, we always try to keep a head object
+ * on each list so that frequent allocation from the free witness
+ * pool (and the locking that goes with it) is avoided.  To keep the
+ * code simple, an emptied entry is unlinked and returned to the free
+ * pool unless it is the last entry on the list, in which case it is
+ * retained as the head.
+ */
+ if ((*lock_list)->ll_count == 0) {
+ if (*lock_list == lle) {
+ if (lle->ll_next == NULL)
+ return;
+ } else
+ lle = *lock_list;
+ *lock_list = lle->ll_next;
+ CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__,
+ td->td_proc->p_pid, lle);
+ witness_lock_list_free(lle);
+ }
+}
+
+void
+witness_thread_exit(struct thread *td)
+{
+ struct lock_list_entry *lle;
+ int i, n;
+
+ lle = td->td_sleeplocks;
+ if (lle == NULL || panicstr != NULL)
+ return;
+ if (lle->ll_count != 0) {
+ for (n = 0; lle != NULL; lle = lle->ll_next)
+ for (i = lle->ll_count - 1; i >= 0; i--) {
+ if (n == 0)
+ printf("Thread %p exiting with the following locks held:\n",
+ td);
+ n++;
+ witness_list_lock(&lle->ll_children[i], printf);
+
+ }
+ kassert_panic(
+ "Thread %p cannot exit while holding sleeplocks\n", td);
+ }
+ witness_lock_list_free(lle);
+}
+
+/*
+ * Warn if any locks other than 'lock' are held. Flags can be passed in to
+ * exempt Giant and sleepable locks from the checks as well. If any
+ * non-exempt locks are held, then a supplied message is printed to the
+ * console along with a list of the offending locks. If indicated in the
+ * flags then a failure results in a panic as well.
+ */
+int
+witness_warn(int flags, struct lock_object *lock, const char *fmt, ...)
+{
+ struct lock_list_entry *lock_list, *lle;
+ struct lock_instance *lock1;
+ struct thread *td;
+ va_list ap;
+ int i, n;
+
+ if (witness_cold || witness_watch < 1 || panicstr != NULL)
+ return (0);
+ n = 0;
+ td = curthread;
+ for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next)
+ for (i = lle->ll_count - 1; i >= 0; i--) {
+ lock1 = &lle->ll_children[i];
+ if (lock1->li_lock == lock)
+ continue;
+ if (flags & WARN_GIANTOK &&
+ lock1->li_lock == &Giant.lock_object)
+ continue;
+ if (flags & WARN_SLEEPOK &&
+ (lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0)
+ continue;
+ if (n == 0) {
+ va_start(ap, fmt);
+ vprintf(fmt, ap);
+ va_end(ap);
+ printf(" with the following");
+ if (flags & WARN_SLEEPOK)
+ printf(" non-sleepable");
+ printf(" locks held:\n");
+ }
+ n++;
+ witness_list_lock(lock1, printf);
+ }
+
+ /*
+ * Pin the thread to avoid problems with migration between CPUs.
+ * Once all of the checks on spin lock ownership have passed, the
+ * thread is on a safe path and can be unpinned.
+ */
+ sched_pin();
+ lock_list = PCPU_GET(spinlocks);
+ if (lock_list != NULL && lock_list->ll_count != 0) {
+ sched_unpin();
+
+ /*
+ * We should only have one spin lock and, since the exemption
+ * flags cannot apply to this lock class, just check whether
+ * the single spin lock held is the one curthread is allowed
+ * to hold.
+ */
+ lock1 = &lock_list->ll_children[lock_list->ll_count - 1];
+ if (lock_list->ll_count == 1 && lock_list->ll_next == NULL &&
+ lock1->li_lock == lock && n == 0)
+ return (0);
+
+ va_start(ap, fmt);
+ vprintf(fmt, ap);
+ va_end(ap);
+ printf(" with the following");
+ if (flags & WARN_SLEEPOK)
+ printf(" non-sleepable");
+ printf(" locks held:\n");
+ n += witness_list_locks(&lock_list, printf);
+ } else
+ sched_unpin();
+ if (flags & WARN_PANIC && n)
+ kassert_panic("%s", __func__);
+ else
+ witness_debugger(n);
+ return (n);
+}
+
+const char *
+witness_file(struct lock_object *lock)
+{
+ struct witness *w;
+
+ if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL)
+ return ("?");
+ w = lock->lo_witness;
+ return (w->w_file);
+}
+
+int
+witness_line(struct lock_object *lock)
+{
+ struct witness *w;
+
+ if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL)
+ return (0);
+ w = lock->lo_witness;
+ return (w->w_line);
+}
+
+static struct witness *
+enroll(const char *description, struct lock_class *lock_class)
+{
+ struct witness *w;
+ struct witness_list *typelist;
+
+ MPASS(description != NULL);
+
+ if (witness_watch == -1 || panicstr != NULL)
+ return (NULL);
+ if ((lock_class->lc_flags & LC_SPINLOCK)) {
+ if (witness_skipspin)
+ return (NULL);
+ else
+ typelist = &w_spin;
+ } else if ((lock_class->lc_flags & LC_SLEEPLOCK)) {
+ typelist = &w_sleep;
+ } else {
+ kassert_panic("lock class %s is not sleep or spin",
+ lock_class->lc_name);
+ return (NULL);
+ }
+
+ mtx_lock_spin(&w_mtx);
+ w = witness_hash_get(description);
+ if (w)
+ goto found;
+ if ((w = witness_get()) == NULL)
+ return (NULL);
+ MPASS(strlen(description) < MAX_W_NAME);
+ strcpy(w->w_name, description);
+ w->w_class = lock_class;
+ w->w_refcount = 1;
+ STAILQ_INSERT_HEAD(&w_all, w, w_list);
+ if (lock_class->lc_flags & LC_SPINLOCK) {
+ STAILQ_INSERT_HEAD(&w_spin, w, w_typelist);
+ w_spin_cnt++;
+ } else if (lock_class->lc_flags & LC_SLEEPLOCK) {
+ STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist);
+ w_sleep_cnt++;
+ }
+
+ /* Insert new witness into the hash */
+ witness_hash_put(w);
+ witness_increment_graph_generation();
+ mtx_unlock_spin(&w_mtx);
+ return (w);
+found:
+ w->w_refcount++;
+ mtx_unlock_spin(&w_mtx);
+ if (lock_class != w->w_class)
+ kassert_panic(
+ "lock (%s) %s does not match earlier (%s) lock",
+ description, lock_class->lc_name,
+ w->w_class->lc_name);
+ return (w);
+}
+
+static void
+depart(struct witness *w)
+{
+ struct witness_list *list;
+
+ MPASS(w->w_refcount == 0);
+ if (w->w_class->lc_flags & LC_SLEEPLOCK) {
+ list = &w_sleep;
+ w_sleep_cnt--;
+ } else {
+ list = &w_spin;
+ w_spin_cnt--;
+ }
+ /*
+ * Set file to NULL as it may point into a loadable module.
+ */
+ w->w_file = NULL;
+ w->w_line = 0;
+ witness_increment_graph_generation();
+}
+
+
+static void
+adopt(struct witness *parent, struct witness *child)
+{
+ int pi, ci, i, j;
+
+ if (witness_cold == 0)
+ mtx_assert(&w_mtx, MA_OWNED);
+
+ /* If the relationship is already known, there's no work to be done. */
+ if (isitmychild(parent, child))
+ return;
+
+ /* When the structure of the graph changes, bump up the generation. */
+ witness_increment_graph_generation();
+
+ /*
+ * The hard part ... create the direct relationship, then propagate all
+ * indirect relationships.
+ */
+ pi = parent->w_index;
+ ci = child->w_index;
+ WITNESS_INDEX_ASSERT(pi);
+ WITNESS_INDEX_ASSERT(ci);
+ MPASS(pi != ci);
+ w_rmatrix[pi][ci] |= WITNESS_PARENT;
+ w_rmatrix[ci][pi] |= WITNESS_CHILD;
+
+ /*
+ * If parent was not already an ancestor of child,
+ * then we increment the descendant and ancestor counters.
+ */
+ if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) {
+ parent->w_num_descendants++;
+ child->w_num_ancestors++;
+ }
+
+ /*
+ * Find each ancestor of 'pi'. Note that 'pi' itself is counted as
+ * an ancestor of 'pi' during this loop.
+ */
+ for (i = 1; i <= w_max_used_index; i++) {
+ if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 &&
+ (i != pi))
+ continue;
+
+ /* Find each descendant of 'i' and mark it as a descendant. */
+ for (j = 1; j <= w_max_used_index; j++) {
+
+ /*
+ * Skip children that are already marked as
+ * descendants of 'i'.
+ */
+ if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK)
+ continue;
+
+ /*
+ * We are only interested in descendants of 'ci'. Note
+ * that 'ci' itself is counted as a descendant of 'ci'.
+ */
+ if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 &&
+ (j != ci))
+ continue;
+ w_rmatrix[i][j] |= WITNESS_ANCESTOR;
+ w_rmatrix[j][i] |= WITNESS_DESCENDANT;
+ w_data[i].w_num_descendants++;
+ w_data[j].w_num_ancestors++;
+
+ /*
+ * Make sure we aren't marking a node as both an
+ * ancestor and descendant. We should have caught
+ * this as a lock order reversal earlier.
+ */
+ if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) &&
+ (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) {
+ printf("witness rmatrix paradox! [%d][%d]=%d "
+ "both ancestor and descendant\n",
+ i, j, w_rmatrix[i][j]);
+ kdb_backtrace();
+ printf("Witness disabled.\n");
+ witness_watch = -1;
+ }
+ if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) &&
+ (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) {
+ printf("witness rmatrix paradox! [%d][%d]=%d "
+ "both ancestor and descendant\n",
+ j, i, w_rmatrix[j][i]);
+ kdb_backtrace();
+ printf("Witness disabled.\n");
+ witness_watch = -1;
+ }
+ }
+ }
+}
+
+static void
+itismychild(struct witness *parent, struct witness *child)
+{
+ int unlocked;
+
+ MPASS(child != NULL && parent != NULL);
+ if (witness_cold == 0)
+ mtx_assert(&w_mtx, MA_OWNED);
+
+ if (!witness_lock_type_equal(parent, child)) {
+ if (witness_cold == 0) {
+ unlocked = 1;
+ mtx_unlock_spin(&w_mtx);
+ } else {
+ unlocked = 0;
+ }
+ kassert_panic(
+ "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not "
+ "the same lock type", __func__, parent->w_name,
+ parent->w_class->lc_name, child->w_name,
+ child->w_class->lc_name);
+ if (unlocked)
+ mtx_lock_spin(&w_mtx);
+ }
+ adopt(parent, child);
+}
+
+/*
+ * Generic code for the isitmy*() functions. The rmask parameter is the
+ * expected relationship of w1 to w2.
+ */
+static int
+_isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname)
+{
+ unsigned char r1, r2;
+ int i1, i2;
+
+ i1 = w1->w_index;
+ i2 = w2->w_index;
+ WITNESS_INDEX_ASSERT(i1);
+ WITNESS_INDEX_ASSERT(i2);
+ r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK;
+ r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK;
+
+ /* The flags on one better be the inverse of the flags on the other */
+ if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) ||
+ (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) {
+ printf("%s: rmatrix mismatch between %s (index %d) and %s "
+ "(index %d): w_rmatrix[%d][%d] == %hhx but "
+ "w_rmatrix[%d][%d] == %hhx\n",
+ fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1,
+ i2, i1, r2);
+ kdb_backtrace();
+ printf("Witness disabled.\n");
+ witness_watch = -1;
+ }
+ return (r1 & rmask);
+}
+
+/*
+ * Checks if @child is a direct child of @parent.
+ */
+static int
+isitmychild(struct witness *parent, struct witness *child)
+{
+
+ return (_isitmyx(parent, child, WITNESS_PARENT, __func__));
+}
+
+/*
+ * Checks if @descendant is a direct or indirect descendant of @ancestor.
+ */
+static int
+isitmydescendant(struct witness *ancestor, struct witness *descendant)
+{
+
+ return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK,
+ __func__));
+}
+
+#ifdef BLESSING
+static int
+blessed(struct witness *w1, struct witness *w2)
+{
+ int i;
+ struct witness_blessed *b;
+
+ for (i = 0; i < blessed_count; i++) {
+ b = &blessed_list[i];
+ if (strcmp(w1->w_name, b->b_lock1) == 0) {
+ if (strcmp(w2->w_name, b->b_lock2) == 0)
+ return (1);
+ continue;
+ }
+ if (strcmp(w1->w_name, b->b_lock2) == 0)
+ if (strcmp(w2->w_name, b->b_lock1) == 0)
+ return (1);
+ }
+ return (0);
+}
+#endif
+
+static struct witness *
+witness_get(void)
+{
+ struct witness *w;
+ int index;
+
+ if (witness_cold == 0)
+ mtx_assert(&w_mtx, MA_OWNED);
+
+ if (witness_watch == -1) {
+ mtx_unlock_spin(&w_mtx);
+ return (NULL);
+ }
+ if (STAILQ_EMPTY(&w_free)) {
+ witness_watch = -1;
+ mtx_unlock_spin(&w_mtx);
+ printf("WITNESS: unable to allocate a new witness object\n");
+ return (NULL);
+ }
+ w = STAILQ_FIRST(&w_free);
+ STAILQ_REMOVE_HEAD(&w_free, w_list);
+ w_free_cnt--;
+ index = w->w_index;
+ MPASS(index > 0 && index == w_max_used_index+1 &&
+ index < WITNESS_COUNT);
+ bzero(w, sizeof(*w));
+ w->w_index = index;
+ if (index > w_max_used_index)
+ w_max_used_index = index;
+ return (w);
+}
+
+static void
+witness_free(struct witness *w)
+{
+
+ STAILQ_INSERT_HEAD(&w_free, w, w_list);
+ w_free_cnt++;
+}
+
+static struct lock_list_entry *
+witness_lock_list_get(void)
+{
+ struct lock_list_entry *lle;
+
+ if (witness_watch == -1)
+ return (NULL);
+ mtx_lock_spin(&w_mtx);
+ lle = w_lock_list_free;
+ if (lle == NULL) {
+ witness_watch = -1;
+ mtx_unlock_spin(&w_mtx);
+ printf("%s: witness exhausted\n", __func__);
+ return (NULL);
+ }
+ w_lock_list_free = lle->ll_next;
+ mtx_unlock_spin(&w_mtx);
+ bzero(lle, sizeof(*lle));
+ return (lle);
+}
+
+static void
+witness_lock_list_free(struct lock_list_entry *lle)
+{
+
+ mtx_lock_spin(&w_mtx);
+ lle->ll_next = w_lock_list_free;
+ w_lock_list_free = lle;
+ mtx_unlock_spin(&w_mtx);
+}
+
+static struct lock_instance *
+find_instance(struct lock_list_entry *list, const struct lock_object *lock)
+{
+ struct lock_list_entry *lle;
+ struct lock_instance *instance;
+ int i;
+
+ for (lle = list; lle != NULL; lle = lle->ll_next)
+ for (i = lle->ll_count - 1; i >= 0; i--) {
+ instance = &lle->ll_children[i];
+ if (instance->li_lock == lock)
+ return (instance);
+ }
+ return (NULL);
+}
+
+static void
+witness_list_lock(struct lock_instance *instance,
+ int (*prnt)(const char *fmt, ...))
+{
+ struct lock_object *lock;
+
+ lock = instance->li_lock;
+ prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ?
+ "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name);
+ if (lock->lo_witness->w_name != lock->lo_name)
+ prnt(" (%s)", lock->lo_witness->w_name);
+ prnt(" r = %d (%p) locked @ %s:%d\n",
+ instance->li_flags & LI_RECURSEMASK, lock,
+ fixup_filename(instance->li_file), instance->li_line);
+}
+
+#ifdef DDB
+static int
+witness_thread_has_locks(struct thread *td)
+{
+
+ if (td->td_sleeplocks == NULL)
+ return (0);
+ return (td->td_sleeplocks->ll_count != 0);
+}
+
+static int
+witness_proc_has_locks(struct proc *p)
+{
+ struct thread *td;
+
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (witness_thread_has_locks(td))
+ return (1);
+ }
+ return (0);
+}
+#endif
+
+int
+witness_list_locks(struct lock_list_entry **lock_list,
+ int (*prnt)(const char *fmt, ...))
+{
+ struct lock_list_entry *lle;
+ int i, nheld;
+
+ nheld = 0;
+ for (lle = *lock_list; lle != NULL; lle = lle->ll_next)
+ for (i = lle->ll_count - 1; i >= 0; i--) {
+ witness_list_lock(&lle->ll_children[i], prnt);
+ nheld++;
+ }
+ return (nheld);
+}
+
+/*
+ * This is a bit risky at best. We call this function when we have timed
+ * out acquiring a spin lock, and we assume that the other CPU is stuck
+ * with this lock held. So, we go groveling around in the other CPU's
+ * per-cpu data to try to find the lock instance for this spin lock to
+ * see when it was last acquired.
+ */
+void
+witness_display_spinlock(struct lock_object *lock, struct thread *owner,
+ int (*prnt)(const char *fmt, ...))
+{
+ struct lock_instance *instance;
+ struct pcpu *pc;
+
+ if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU)
+ return;
+ pc = pcpu_find(owner->td_oncpu);
+ instance = find_instance(pc->pc_spinlocks, lock);
+ if (instance != NULL)
+ witness_list_lock(instance, prnt);
+}
+
+void
+witness_save(struct lock_object *lock, const char **filep, int *linep)
+{
+ struct lock_list_entry *lock_list;
+ struct lock_instance *instance;
+ struct lock_class *class;
+
+ /*
+ * This function is used independently in locking code to deal with
+ * Giant; the SCHEDULER_STOPPED() check can be removed here once
+ * Giant is gone.
+ */
+ if (SCHEDULER_STOPPED())
+ return;
+ KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
+ if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
+ return;
+ class = LOCK_CLASS(lock);
+ if (class->lc_flags & LC_SLEEPLOCK)
+ lock_list = curthread->td_sleeplocks;
+ else {
+ if (witness_skipspin)
+ return;
+ lock_list = PCPU_GET(spinlocks);
+ }
+ instance = find_instance(lock_list, lock);
+ if (instance == NULL) {
+ kassert_panic("%s: lock (%s) %s not locked", __func__,
+ class->lc_name, lock->lo_name);
+ return;
+ }
+ *filep = instance->li_file;
+ *linep = instance->li_line;
+}
+
+void
+witness_restore(struct lock_object *lock, const char *file, int line)
+{
+ struct lock_list_entry *lock_list;
+ struct lock_instance *instance;
+ struct lock_class *class;
+
+ /*
+ * This function is used independently in locking code to deal with
+ * Giant; the SCHEDULER_STOPPED() check can be removed here once
+ * Giant is gone.
+ */
+ if (SCHEDULER_STOPPED())
+ return;
+ KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
+ if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
+ return;
+ class = LOCK_CLASS(lock);
+ if (class->lc_flags & LC_SLEEPLOCK)
+ lock_list = curthread->td_sleeplocks;
+ else {
+ if (witness_skipspin)
+ return;
+ lock_list = PCPU_GET(spinlocks);
+ }
+ instance = find_instance(lock_list, lock);
+ if (instance == NULL)
+ kassert_panic("%s: lock (%s) %s not locked", __func__,
+ class->lc_name, lock->lo_name);
+ lock->lo_witness->w_file = file;
+ lock->lo_witness->w_line = line;
+ if (instance == NULL)
+ return;
+ instance->li_file = file;
+ instance->li_line = line;
+}
+
+void
+witness_assert(const struct lock_object *lock, int flags, const char *file,
+ int line)
+{
+#ifdef INVARIANT_SUPPORT
+ struct lock_instance *instance;
+ struct lock_class *class;
+
+ if (lock->lo_witness == NULL || witness_watch < 1 || panicstr != NULL)
+ return;
+ class = LOCK_CLASS(lock);
+ if ((class->lc_flags & LC_SLEEPLOCK) != 0)
+ instance = find_instance(curthread->td_sleeplocks, lock);
+ else if ((class->lc_flags & LC_SPINLOCK) != 0)
+ instance = find_instance(PCPU_GET(spinlocks), lock);
+ else {
+ kassert_panic("Lock (%s) %s is not sleep or spin!",
+ class->lc_name, lock->lo_name);
+ return;
+ }
+ switch (flags) {
+ case LA_UNLOCKED:
+ if (instance != NULL)
+ kassert_panic("Lock (%s) %s locked @ %s:%d.",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ break;
+ case LA_LOCKED:
+ case LA_LOCKED | LA_RECURSED:
+ case LA_LOCKED | LA_NOTRECURSED:
+ case LA_SLOCKED:
+ case LA_SLOCKED | LA_RECURSED:
+ case LA_SLOCKED | LA_NOTRECURSED:
+ case LA_XLOCKED:
+ case LA_XLOCKED | LA_RECURSED:
+ case LA_XLOCKED | LA_NOTRECURSED:
+ if (instance == NULL) {
+ kassert_panic("Lock (%s) %s not locked @ %s:%d.",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ break;
+ }
+ if ((flags & LA_XLOCKED) != 0 &&
+ (instance->li_flags & LI_EXCLUSIVE) == 0)
+ kassert_panic(
+ "Lock (%s) %s not exclusively locked @ %s:%d.",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ if ((flags & LA_SLOCKED) != 0 &&
+ (instance->li_flags & LI_EXCLUSIVE) != 0)
+ kassert_panic(
+ "Lock (%s) %s exclusively locked @ %s:%d.",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ if ((flags & LA_RECURSED) != 0 &&
+ (instance->li_flags & LI_RECURSEMASK) == 0)
+ kassert_panic("Lock (%s) %s not recursed @ %s:%d.",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ if ((flags & LA_NOTRECURSED) != 0 &&
+ (instance->li_flags & LI_RECURSEMASK) != 0)
+ kassert_panic("Lock (%s) %s recursed @ %s:%d.",
+ class->lc_name, lock->lo_name,
+ fixup_filename(file), line);
+ break;
+ default:
+ kassert_panic("Invalid lock assertion at %s:%d.",
+ fixup_filename(file), line);
+
+ }
+#endif /* INVARIANT_SUPPORT */
+}
+
+static void
+witness_setflag(struct lock_object *lock, int flag, int set)
+{
+ struct lock_list_entry *lock_list;
+ struct lock_instance *instance;
+ struct lock_class *class;
+
+ if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
+ return;
+ class = LOCK_CLASS(lock);
+ if (class->lc_flags & LC_SLEEPLOCK)
+ lock_list = curthread->td_sleeplocks;
+ else {
+ if (witness_skipspin)
+ return;
+ lock_list = PCPU_GET(spinlocks);
+ }
+ instance = find_instance(lock_list, lock);
+ if (instance == NULL) {
+ kassert_panic("%s: lock (%s) %s not locked", __func__,
+ class->lc_name, lock->lo_name);
+ return;
+ }
+
+ if (set)
+ instance->li_flags |= flag;
+ else
+ instance->li_flags &= ~flag;
+}
+
+void
+witness_norelease(struct lock_object *lock)
+{
+
+ witness_setflag(lock, LI_NORELEASE, 1);
+}
+
+void
+witness_releaseok(struct lock_object *lock)
+{
+
+ witness_setflag(lock, LI_NORELEASE, 0);
+}
+
+#ifdef DDB
+static void
+witness_ddb_list(struct thread *td)
+{
+
+ KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
+ KASSERT(kdb_active, ("%s: not in the debugger", __func__));
+
+ if (witness_watch < 1)
+ return;
+
+ witness_list_locks(&td->td_sleeplocks, db_printf);
+
+ /*
+ * We only handle spinlocks if td == curthread. This is somewhat broken
+ * if td is currently executing on some other CPU and holds spin locks
+ * as we won't display those locks. If we had an MI way of getting
+ * the per-cpu data for a given cpu then we could use
+ * td->td_oncpu to get the list of spinlocks for this thread
+ * and "fix" this.
+ *
+ * That still wouldn't really fix this unless we locked the scheduler
+ * lock or stopped the other CPU to make sure it wasn't changing the
+ * list out from under us. It is probably best to just not try to
+ * handle threads on other CPUs for now.
+ */
+ if (td == curthread && PCPU_GET(spinlocks) != NULL)
+ witness_list_locks(PCPU_PTR(spinlocks), db_printf);
+}
+
+DB_SHOW_COMMAND(locks, db_witness_list)
+{
+ struct thread *td;
+
+ if (have_addr)
+ td = db_lookup_thread(addr, TRUE);
+ else
+ td = kdb_thread;
+ witness_ddb_list(td);
+}
+
+DB_SHOW_ALL_COMMAND(locks, db_witness_list_all)
+{
+ struct thread *td;
+ struct proc *p;
+
+ /*
+ * It would be nice to list only threads and processes that actually
+ * hold sleep locks, but that information is currently not exported
+ * by WITNESS.
+ */
+ FOREACH_PROC_IN_SYSTEM(p) {
+ if (!witness_proc_has_locks(p))
+ continue;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ if (!witness_thread_has_locks(td))
+ continue;
+ db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid,
+ p->p_comm, td, td->td_tid);
+ witness_ddb_list(td);
+ if (db_pager_quit)
+ return;
+ }
+ }
+}
+DB_SHOW_ALIAS(alllocks, db_witness_list_all)
+
+DB_SHOW_COMMAND(witness, db_witness_display)
+{
+
+ witness_ddb_display(db_printf);
+}
+#endif
+
+static int
+sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS)
+{
+ struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2;
+ struct witness *tmp_w1, *tmp_w2, *w1, *w2;
+ struct sbuf *sb;
+ u_int w_rmatrix1, w_rmatrix2;
+ int error, generation, i, j;
+
+ tmp_data1 = NULL;
+ tmp_data2 = NULL;
+ tmp_w1 = NULL;
+ tmp_w2 = NULL;
+ if (witness_watch < 1) {
+ error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
+ return (error);
+ }
+ if (witness_cold) {
+ error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
+ return (error);
+ }
+ error = 0;
+ sb = sbuf_new(NULL, NULL, BADSTACK_SBUF_SIZE, SBUF_AUTOEXTEND);
+ if (sb == NULL)
+ return (ENOMEM);
+
+ /* Allocate and init temporary storage space. */
+ tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
+ tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
+ tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP,
+ M_WAITOK | M_ZERO);
+ tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP,
+ M_WAITOK | M_ZERO);
+ stack_zero(&tmp_data1->wlod_stack);
+ stack_zero(&tmp_data2->wlod_stack);
+
+restart:
+ mtx_lock_spin(&w_mtx);
+ generation = w_generation;
+ mtx_unlock_spin(&w_mtx);
+ sbuf_printf(sb, "Number of known direct relationships is %d\n",
+ w_lohash.wloh_count);
+ for (i = 1; i < w_max_used_index; i++) {
+ mtx_lock_spin(&w_mtx);
+ if (generation != w_generation) {
+ mtx_unlock_spin(&w_mtx);
+
+ /* The graph has changed, try again. */
+ req->oldidx = 0;
+ sbuf_clear(sb);
+ goto restart;
+ }
+
+ w1 = &w_data[i];
+ if (w1->w_reversed == 0) {
+ mtx_unlock_spin(&w_mtx);
+ continue;
+ }
+
+ /* Copy w1 locally so we can release the spin lock. */
+ *tmp_w1 = *w1;
+ mtx_unlock_spin(&w_mtx);
+
+ if (tmp_w1->w_reversed == 0)
+ continue;
+ for (j = 1; j < w_max_used_index; j++) {
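+ /*
+ * Only pairs with a recorded reversal are of interest, and
+ * each unordered pair is visited once (i <= j).
+ */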
+ if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j)
+ continue;
+
+ mtx_lock_spin(&w_mtx);
+ if (generation != w_generation) {
+ mtx_unlock_spin(&w_mtx);
+
+ /* The graph has changed, try again. */
+ req->oldidx = 0;
+ sbuf_clear(sb);
+ goto restart;
+ }
+
+ w2 = &w_data[j];
+ data1 = witness_lock_order_get(w1, w2);
+ data2 = witness_lock_order_get(w2, w1);
+
+ /*
+ * Copy information locally so we can release the
+ * spin lock.
+ */
+ *tmp_w2 = *w2;
+ w_rmatrix1 = (unsigned int)w_rmatrix[i][j];
+ w_rmatrix2 = (unsigned int)w_rmatrix[j][i];
+
+ if (data1) {
+ stack_zero(&tmp_data1->wlod_stack);
+ stack_copy(&data1->wlod_stack,
+ &tmp_data1->wlod_stack);
+ }
+ if (data2 && data2 != data1) {
+ stack_zero(&tmp_data2->wlod_stack);
+ stack_copy(&data2->wlod_stack,
+ &tmp_data2->wlod_stack);
+ }
+ mtx_unlock_spin(&w_mtx);
+
+ sbuf_printf(sb,
+ "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n",
+ tmp_w1->w_name, tmp_w1->w_class->lc_name,
+ tmp_w2->w_name, tmp_w2->w_class->lc_name);
+#if 0
+ sbuf_printf(sb,
+ "w_rmatrix[%s][%s] == %x, w_rmatrix[%s][%s] == %x\n",
+ tmp_w1->name, tmp_w2->w_name, w_rmatrix1,
+ tmp_w2->name, tmp_w1->w_name, w_rmatrix2);
+#endif
+ if (data1) {
+ sbuf_printf(sb,
+ "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
+ tmp_w1->w_name, tmp_w1->w_class->lc_name,
+ tmp_w2->w_name, tmp_w2->w_class->lc_name);
+ stack_sbuf_print(sb, &tmp_data1->wlod_stack);
+ sbuf_printf(sb, "\n");
+ }
+ if (data2 && data2 != data1) {
+ sbuf_printf(sb,
+ "Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
+ tmp_w2->w_name, tmp_w2->w_class->lc_name,
+ tmp_w1->w_name, tmp_w1->w_class->lc_name);
+ stack_sbuf_print(sb, &tmp_data2->wlod_stack);
+ sbuf_printf(sb, "\n");
+ }
+ }
+ }
+ mtx_lock_spin(&w_mtx);
+ if (generation != w_generation) {
+ mtx_unlock_spin(&w_mtx);
+
+ /*
+ * The graph changed while we were printing stack data,
+ * try again.
+ */
+ req->oldidx = 0;
+ sbuf_clear(sb);
+ goto restart;
+ }
+ mtx_unlock_spin(&w_mtx);
+
+ /* Free temporary storage space. */
+ free(tmp_data1, M_TEMP);
+ free(tmp_data2, M_TEMP);
+ free(tmp_w1, M_TEMP);
+ free(tmp_w2, M_TEMP);
+
+ sbuf_finish(sb);
+ error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
+ sbuf_delete(sb);
+
+ return (error);
+}
+
+static int
+sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS)
+{
+ struct witness *w;
+ struct sbuf *sb;
+ int error;
+
+ if (witness_watch < 1) {
+ error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
+ return (error);
+ }
+ if (witness_cold) {
+ error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
+ return (error);
+ }
+ error = 0;
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req);
+ if (sb == NULL)
+ return (ENOMEM);
+ sbuf_printf(sb, "\n");
+
+ mtx_lock_spin(&w_mtx);
+ STAILQ_FOREACH(w, &w_all, w_list)
+ w->w_displayed = 0;
+ STAILQ_FOREACH(w, &w_all, w_list)
+ witness_add_fullgraph(sb, w);
+ mtx_unlock_spin(&w_mtx);
+
+ /*
+ * Close the sbuf and return to userland.
+ */
+ error = sbuf_finish(sb);
+ sbuf_delete(sb);
+
+ return (error);
+}
+
+static int
+sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS)
+{
+ int error, value;
+
+ value = witness_watch;
+ error = sysctl_handle_int(oidp, &value, 0, req);
+ if (error != 0 || req->newptr == NULL)
+ return (error);
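+ /*
+ * Valid settings are -1, 0 and 1; once WITNESS has been disabled
+ * permanently (-1) it cannot be switched back on.
+ */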
+ if (value > 1 || value < -1 ||
+ (witness_watch == -1 && value != witness_watch))
+ return (EINVAL);
+ witness_watch = value;
+ return (0);
+}
+
+static void
+witness_add_fullgraph(struct sbuf *sb, struct witness *w)
+{
+ int i;
+
+ if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0))
+ return;
+ w->w_displayed = 1;
+
+ WITNESS_INDEX_ASSERT(w->w_index);
+ for (i = 1; i <= w_max_used_index; i++) {
+ if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) {
+ sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name,
+ w_data[i].w_name);
+ witness_add_fullgraph(sb, &w_data[i]);
+ }
+ }
+}
+
+/*
+ * A simple hash function.  Takes a key pointer and a key size.  If size == 0,
+ * the key is interpreted as a string and read up to the null terminator.
+ * Otherwise, the first size bytes are read.  Returns an unsigned 32-bit hash
+ * value computed from the key.
+ */
+static uint32_t
+witness_hash_djb2(const uint8_t *key, uint32_t size)
+{
+ unsigned int hash = 5381;
+ int i;
+
+ /* hash = hash * 33 + key[i] */
+ if (size)
+ for (i = 0; i < size; i++)
+ hash = ((hash << 5) + hash) + (unsigned int)key[i];
+ else
+ for (i = 0; key[i] != 0; i++)
+ hash = ((hash << 5) + hash) + (unsigned int)key[i];
+
+ return (hash);
+}
+
+
+/*
+ * Initializes the two witness hash tables. Called exactly once from
+ * witness_initialize().
+ */
+static void
+witness_init_hash_tables(void)
+{
+ int i;
+
+ MPASS(witness_cold);
+
+ /* Initialize the hash tables. */
+ for (i = 0; i < WITNESS_HASH_SIZE; i++)
+ w_hash.wh_array[i] = NULL;
+
+ w_hash.wh_size = WITNESS_HASH_SIZE;
+ w_hash.wh_count = 0;
+
+ /* Initialize the lock order data hash. */
+ w_lofree = NULL;
+ for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) {
+ memset(&w_lodata[i], 0, sizeof(w_lodata[i]));
+ w_lodata[i].wlod_next = w_lofree;
+ w_lofree = &w_lodata[i];
+ }
+ w_lohash.wloh_size = WITNESS_LO_HASH_SIZE;
+ w_lohash.wloh_count = 0;
+ for (i = 0; i < WITNESS_LO_HASH_SIZE; i++)
+ w_lohash.wloh_array[i] = NULL;
+}
+
+static struct witness *
+witness_hash_get(const char *key)
+{
+ struct witness *w;
+ uint32_t hash;
+
+ MPASS(key != NULL);
+ if (witness_cold == 0)
+ mtx_assert(&w_mtx, MA_OWNED);
+ hash = witness_hash_djb2(key, 0) % w_hash.wh_size;
+ w = w_hash.wh_array[hash];
+ while (w != NULL) {
+ if (strcmp(w->w_name, key) == 0)
+ goto out;
+ w = w->w_hash_next;
+ }
+
+out:
+ return (w);
+}
+
+static void
+witness_hash_put(struct witness *w)
+{
+ uint32_t hash;
+
+ MPASS(w != NULL);
+ MPASS(w->w_name != NULL);
+ if (witness_cold == 0)
+ mtx_assert(&w_mtx, MA_OWNED);
+ KASSERT(witness_hash_get(w->w_name) == NULL,
+ ("%s: trying to add a hash entry that already exists!", __func__));
+ KASSERT(w->w_hash_next == NULL,
+ ("%s: w->w_hash_next != NULL", __func__));
+
+ hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size;
+ w->w_hash_next = w_hash.wh_array[hash];
+ w_hash.wh_array[hash] = w;
+ w_hash.wh_count++;
+}
+
+
+static struct witness_lock_order_data *
+witness_lock_order_get(struct witness *parent, struct witness *child)
+{
+ struct witness_lock_order_data *data = NULL;
+ struct witness_lock_order_key key;
+ unsigned int hash;
+
+ MPASS(parent != NULL && child != NULL);
+ key.from = parent->w_index;
+ key.to = child->w_index;
+ WITNESS_INDEX_ASSERT(key.from);
+ WITNESS_INDEX_ASSERT(key.to);
+ if ((w_rmatrix[parent->w_index][child->w_index]
+ & WITNESS_LOCK_ORDER_KNOWN) == 0)
+ goto out;
+
+ hash = witness_hash_djb2((const char*)&key,
+ sizeof(key)) % w_lohash.wloh_size;
+ data = w_lohash.wloh_array[hash];
+ while (data != NULL) {
+ if (witness_lock_order_key_equal(&data->wlod_key, &key))
+ break;
+ data = data->wlod_next;
+ }
+
+out:
+ return (data);
+}
+
+/*
+ * Verify that parent and child have a known relationship, are not the same,
+ * and child is actually a child of parent. This is done without w_mtx
+ * to avoid contention in the common case.
+ */
+static int
+witness_lock_order_check(struct witness *parent, struct witness *child)
+{
+
+ if (parent != child &&
+ w_rmatrix[parent->w_index][child->w_index]
+ & WITNESS_LOCK_ORDER_KNOWN &&
+ isitmychild(parent, child))
+ return (1);
+
+ return (0);
+}
+
+static int
+witness_lock_order_add(struct witness *parent, struct witness *child)
+{
+ struct witness_lock_order_data *data = NULL;
+ struct witness_lock_order_key key;
+ unsigned int hash;
+
+ MPASS(parent != NULL && child != NULL);
+ key.from = parent->w_index;
+ key.to = child->w_index;
+ WITNESS_INDEX_ASSERT(key.from);
+ WITNESS_INDEX_ASSERT(key.to);
+ if (w_rmatrix[parent->w_index][child->w_index]
+ & WITNESS_LOCK_ORDER_KNOWN)
+ return (1);
+
+ hash = witness_hash_djb2((const char*)&key,
+ sizeof(key)) % w_lohash.wloh_size;
+ w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN;
+ data = w_lofree;
+ if (data == NULL)
+ return (0);
+ w_lofree = data->wlod_next;
+ data->wlod_next = w_lohash.wloh_array[hash];
+ data->wlod_key = key;
+ w_lohash.wloh_array[hash] = data;
+ w_lohash.wloh_count++;
+ stack_zero(&data->wlod_stack);
+ stack_save(&data->wlod_stack);
+ return (1);
+}
+
+/* Call this whenever the structure of the witness graph changes. */
+static void
+witness_increment_graph_generation(void)
+{
+
+ if (witness_cold == 0)
+ mtx_assert(&w_mtx, MA_OWNED);
+ w_generation++;
+}
+
+#ifdef KDB
+static void
+_witness_debugger(int cond, const char *msg)
+{
+
+ if (witness_trace && cond)
+ kdb_backtrace();
+ if (witness_kdb && cond)
+ kdb_enter(KDB_WHY_WITNESS, msg);
+}
+#endif
diff --git a/sys/kern/sys_capability.c b/sys/kern/sys_capability.c
new file mode 100644
index 0000000..7a82017
--- /dev/null
+++ b/sys/kern/sys_capability.c
@@ -0,0 +1,613 @@
+/*-
+ * Copyright (c) 2008-2011 Robert N. M. Watson
+ * Copyright (c) 2010-2011 Jonathan Anderson
+ * Copyright (c) 2012 FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed at the University of Cambridge Computer
+ * Laboratory with support from a grant from Google, Inc.
+ *
+ * Portions of this software were developed by Pawel Jakub Dawidek under
+ * sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * FreeBSD kernel capability facility.
+ *
+ * Two kernel features are implemented here: capability mode, a sandboxed mode
+ * of execution for processes, and capabilities, a refinement on file
+ * descriptors that allows fine-grained control over operations on the file
+ * descriptor. Collectively, these allow processes to run in the style of a
+ * historic "capability system" in which they can use only resources
+ * explicitly delegated to them. This model is enforced by restricting access
+ * to global namespaces in capability mode.
+ *
+ * Capabilities wrap other file descriptor types, binding them to a constant
+ * rights mask set when the capability is created. New capabilities may be
+ * derived from existing capabilities, but only if they have the same or a
+ * strict subset of the rights on the original capability.
+ *
+ * System calls permitted in capability mode are defined in capabilities.conf;
+ * calls must be carefully audited for safety to ensure that they don't allow
+ * escape from a sandbox. Some calls permit only a subset of operations in
+ * capability mode -- for example, shm_open(2) is limited to creating
+ * anonymous, rather than named, POSIX shared memory objects.
+ */
+
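+/*
+ * An illustrative userland sketch of the intended usage (interfaces
+ * assumed here: cap_rights_init(3), cap_rights_limit(2), cap_enter(2)):
+ *
+ *	cap_rights_t rights;
+ *	int fd = open("/etc/passwd", O_RDONLY);
+ *
+ *	cap_rights_init(&rights, CAP_READ, CAP_FSTAT);
+ *	cap_rights_limit(fd, &rights);
+ *	cap_enter();
+ *
+ * After cap_enter() the process can still read(2) from fd, but open(2)
+ * of a new path fails with ECAPMODE, and operations on fd beyond the
+ * granted rights fail with ENOTCAPABLE.
+ */
+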
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/ucred.h>
+#include <sys/uio.h>
+#include <sys/ktrace.h>
+
+#include <security/audit/audit.h>
+
+#include <vm/uma.h>
+#include <vm/vm.h>
+
+#ifdef CAPABILITY_MODE
+
+FEATURE(security_capability_mode, "Capsicum Capability Mode");
+
+/*
+ * System call to enter capability mode for the process.
+ */
+int
+sys_cap_enter(struct thread *td, struct cap_enter_args *uap)
+{
+ struct ucred *newcred, *oldcred;
+ struct proc *p;
+
+ if (IN_CAPABILITY_MODE(td))
+ return (0);
+
+ newcred = crget();
+ p = td->td_proc;
+ PROC_LOCK(p);
+ oldcred = p->p_ucred;
+ crcopy(newcred, oldcred);
+ newcred->cr_flags |= CRED_FLAG_CAPMODE;
+ p->p_ucred = newcred;
+ PROC_UNLOCK(p);
+ crfree(oldcred);
+ return (0);
+}
+
+/*
+ * System call to query whether the process is in capability mode.
+ */
+int
+sys_cap_getmode(struct thread *td, struct cap_getmode_args *uap)
+{
+ u_int i;
+
+ i = IN_CAPABILITY_MODE(td) ? 1 : 0;
+ return (copyout(&i, uap->modep, sizeof(i)));
+}
+
+#else /* !CAPABILITY_MODE */
+
+int
+sys_cap_enter(struct thread *td, struct cap_enter_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_cap_getmode(struct thread *td, struct cap_getmode_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+#endif /* CAPABILITY_MODE */
+
+#ifdef CAPABILITIES
+
+FEATURE(security_capabilities, "Capsicum Capabilities");
+
+MALLOC_DECLARE(M_FILECAPS);
+
+static inline int
+_cap_check(const cap_rights_t *havep, const cap_rights_t *needp,
+ enum ktr_cap_fail_type type)
+{
+ int i;
+
+ for (i = 0; i < nitems(havep->cr_rights); i++) {
+ if (!cap_rights_contains(havep, needp)) {
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_CAPFAIL))
+ ktrcapfail(type, needp, havep);
+#endif
+ return (ENOTCAPABLE);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Test whether a capability grants the requested rights.
+ */
+int
+cap_check(const cap_rights_t *havep, const cap_rights_t *needp)
+{
+
+ return (_cap_check(havep, needp, CAPFAIL_NOTCAPABLE));
+}
+
+/*
+ * Convert capability rights into VM access flags.
+ */
+u_char
+cap_rights_to_vmprot(cap_rights_t *havep)
+{
+ u_char maxprot;
+
+ maxprot = VM_PROT_NONE;
+ if (cap_rights_is_set(havep, CAP_MMAP_R))
+ maxprot |= VM_PROT_READ;
+ if (cap_rights_is_set(havep, CAP_MMAP_W))
+ maxprot |= VM_PROT_WRITE;
+ if (cap_rights_is_set(havep, CAP_MMAP_X))
+ maxprot |= VM_PROT_EXECUTE;
+
+ return (maxprot);
+}
+
+/*
+ * Extract rights from a capability for monitoring purposes -- not for use in
+ * any other way, as we want to keep all capability permission evaluation in
+ * this one file.
+ */
+cap_rights_t *
+cap_rights(struct filedesc *fdp, int fd)
+{
+
+ return (&fdp->fd_ofiles[fd].fde_rights);
+}
+
+/*
+ * System call to limit rights of the given capability.
+ */
+int
+sys_cap_rights_limit(struct thread *td, struct cap_rights_limit_args *uap)
+{
+ struct filedesc *fdp;
+ cap_rights_t rights;
+ int error, fd, version;
+
+ cap_rights_init(&rights);
+
+ error = copyin(uap->rightsp, &rights, sizeof(rights.cr_rights[0]));
+ if (error != 0)
+ return (error);
+ version = CAPVER(&rights);
+ if (version != CAP_RIGHTS_VERSION_00)
+ return (EINVAL);
+
+ error = copyin(uap->rightsp, &rights,
+ sizeof(rights.cr_rights[0]) * CAPARSIZE(&rights));
+ if (error != 0)
+ return (error);
+ /* Check for race. */
+ if (CAPVER(&rights) != version)
+ return (EINVAL);
+
+ if (!cap_rights_is_valid(&rights))
+ return (EINVAL);
+
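+ /*
+ * The rights version is kept in the top two bits of the first
+ * array element; upgrade older (but valid) versions to the
+ * current CAP_RIGHTS_VERSION before storing the rights.
+ */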
+ if (version != CAP_RIGHTS_VERSION) {
+ rights.cr_rights[0] &= ~(0x3ULL << 62);
+ rights.cr_rights[0] |= ((uint64_t)CAP_RIGHTS_VERSION << 62);
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrcaprights(&rights);
+#endif
+
+ fd = uap->fd;
+
+ AUDIT_ARG_FD(fd);
+ AUDIT_ARG_RIGHTS(&rights);
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_XLOCK(fdp);
+ if (fget_locked(fdp, fd) == NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ return (EBADF);
+ }
+ error = _cap_check(cap_rights(fdp, fd), &rights, CAPFAIL_INCREASE);
+ if (error == 0) {
+ fdp->fd_ofiles[fd].fde_rights = rights;
+ if (!cap_rights_is_set(&rights, CAP_IOCTL)) {
+ free(fdp->fd_ofiles[fd].fde_ioctls, M_FILECAPS);
+ fdp->fd_ofiles[fd].fde_ioctls = NULL;
+ fdp->fd_ofiles[fd].fde_nioctls = 0;
+ }
+ if (!cap_rights_is_set(&rights, CAP_FCNTL))
+ fdp->fd_ofiles[fd].fde_fcntls = 0;
+ }
+ FILEDESC_XUNLOCK(fdp);
+ return (error);
+}
+
+/*
+ * System call to query the rights mask associated with a capability.
+ */
+int
+sys___cap_rights_get(struct thread *td, struct __cap_rights_get_args *uap)
+{
+ struct filedesc *fdp;
+ cap_rights_t rights;
+ int error, fd, i, n;
+
+ if (uap->version != CAP_RIGHTS_VERSION_00)
+ return (EINVAL);
+
+ fd = uap->fd;
+
+ AUDIT_ARG_FD(fd);
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_SLOCK(fdp);
+ if (fget_locked(fdp, fd) == NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ return (EBADF);
+ }
+ rights = *cap_rights(fdp, fd);
+ FILEDESC_SUNLOCK(fdp);
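+ /*
+ * A structure of version v carries v + 2 array elements; only
+ * copy out as many elements as the caller's version understands.
+ */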
+ n = uap->version + 2;
+ if (uap->version != CAPVER(&rights)) {
+ /*
+ * For older versions we need to check that all of the rights on
+ * the descriptor are understood by the caller; if any are not,
+ * we have to return an error.
+ */
+ for (i = n; i < CAPARSIZE(&rights); i++) {
+ if ((rights.cr_rights[i] & ~(0x7FULL << 57)) != 0)
+ return (EINVAL);
+ }
+ }
+ error = copyout(&rights, uap->rightsp, sizeof(rights.cr_rights[0]) * n);
+#ifdef KTRACE
+ if (error == 0 && KTRPOINT(td, KTR_STRUCT))
+ ktrcaprights(&rights);
+#endif
+ return (error);
+}
+
+/*
+ * Test whether a capability grants the given ioctl command.
+ * If the descriptor doesn't have CAP_IOCTL, its ioctl list is empty and
+ * ENOTCAPABLE will be returned.
+ */
+int
+cap_ioctl_check(struct filedesc *fdp, int fd, u_long cmd)
+{
+ u_long *cmds;
+ ssize_t ncmds;
+ long i;
+
+ FILEDESC_LOCK_ASSERT(fdp);
+ KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
+ ("%s: invalid fd=%d", __func__, fd));
+
+ ncmds = fdp->fd_ofiles[fd].fde_nioctls;
+ if (ncmds == -1)
+ return (0);
+
+ cmds = fdp->fd_ofiles[fd].fde_ioctls;
+ for (i = 0; i < ncmds; i++) {
+ if (cmds[i] == cmd)
+ return (0);
+ }
+
+ return (ENOTCAPABLE);
+}
+
+/*
+ * Check if the current ioctls list can be replaced by the new one.
+ */
+static int
+cap_ioctl_limit_check(struct filedesc *fdp, int fd, const u_long *cmds,
+ size_t ncmds)
+{
+ u_long *ocmds;
+ ssize_t oncmds;
+ u_long i;
+ long j;
+
+ oncmds = fdp->fd_ofiles[fd].fde_nioctls;
+ if (oncmds == -1)
+ return (0);
+ if (oncmds < (ssize_t)ncmds)
+ return (ENOTCAPABLE);
+
+ ocmds = fdp->fd_ofiles[fd].fde_ioctls;
+ for (i = 0; i < ncmds; i++) {
+ for (j = 0; j < oncmds; j++) {
+ if (cmds[i] == ocmds[j])
+ break;
+ }
+ if (j == oncmds)
+ return (ENOTCAPABLE);
+ }
+
+ return (0);
+}
+
+int
+kern_cap_ioctls_limit(struct thread *td, int fd, u_long *cmds, size_t ncmds)
+{
+ struct filedesc *fdp;
+ u_long *ocmds;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_XLOCK(fdp);
+
+ if (fget_locked(fdp, fd) == NULL) {
+ error = EBADF;
+ goto out;
+ }
+
+ error = cap_ioctl_limit_check(fdp, fd, cmds, ncmds);
+ if (error != 0)
+ goto out;
+
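+ /*
+ * Install the new list and let the common exit path free the old
+ * one: after the swap 'cmds' points at the previous list, which
+ * is what the free() below expects.
+ */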
+ ocmds = fdp->fd_ofiles[fd].fde_ioctls;
+ fdp->fd_ofiles[fd].fde_ioctls = cmds;
+ fdp->fd_ofiles[fd].fde_nioctls = ncmds;
+
+ cmds = ocmds;
+ error = 0;
+out:
+ FILEDESC_XUNLOCK(fdp);
+ free(cmds, M_FILECAPS);
+ return (error);
+}
+
+int
+sys_cap_ioctls_limit(struct thread *td, struct cap_ioctls_limit_args *uap)
+{
+ u_long *cmds;
+ size_t ncmds;
+ int error;
+
+ ncmds = uap->ncmds;
+
+ if (ncmds > 256) /* XXX: Is 256 sane? */
+ return (EINVAL);
+
+ if (ncmds == 0) {
+ cmds = NULL;
+ } else {
+ cmds = malloc(sizeof(cmds[0]) * ncmds, M_FILECAPS, M_WAITOK);
+ error = copyin(uap->cmds, cmds, sizeof(cmds[0]) * ncmds);
+ if (error != 0) {
+ free(cmds, M_FILECAPS);
+ return (error);
+ }
+ }
+
+ return (kern_cap_ioctls_limit(td, uap->fd, cmds, ncmds));
+}
+
+int
+sys_cap_ioctls_get(struct thread *td, struct cap_ioctls_get_args *uap)
+{
+ struct filedesc *fdp;
+ struct filedescent *fdep;
+ u_long *cmds;
+ size_t maxcmds;
+ int error, fd;
+
+ fd = uap->fd;
+ cmds = uap->cmds;
+ maxcmds = uap->maxcmds;
+
+ AUDIT_ARG_FD(fd);
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_SLOCK(fdp);
+
+ if (fget_locked(fdp, fd) == NULL) {
+ error = EBADF;
+ goto out;
+ }
+
+ /*
+ * If all ioctls are allowed (fde_nioctls == -1 && fde_ioctls == NULL)
+ * the only sane thing we can do is to leave the given array untouched and
+ * return CAP_IOCTLS_ALL.
+ */
+
+ fdep = &fdp->fd_ofiles[fd];
+ if (cmds != NULL && fdep->fde_ioctls != NULL) {
+ error = copyout(fdep->fde_ioctls, cmds,
+ sizeof(cmds[0]) * MIN(fdep->fde_nioctls, maxcmds));
+ if (error != 0)
+ goto out;
+ }
+ if (fdep->fde_nioctls == -1)
+ td->td_retval[0] = CAP_IOCTLS_ALL;
+ else
+ td->td_retval[0] = fdep->fde_nioctls;
+
+ error = 0;
+out:
+ FILEDESC_SUNLOCK(fdp);
+ return (error);
+}
+
+/*
+ * Test whether a capability grants the given fcntl command.
+ */
+int
+cap_fcntl_check(struct filedesc *fdp, int fd, int cmd)
+{
+ uint32_t fcntlcap;
+
+ KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
+ ("%s: invalid fd=%d", __func__, fd));
+
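+ /* fcntl rights are kept as a bitmask indexed by command number. */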
+ fcntlcap = (1 << cmd);
+ KASSERT((CAP_FCNTL_ALL & fcntlcap) != 0,
+ ("Unsupported fcntl=%d.", cmd));
+
+ if ((fdp->fd_ofiles[fd].fde_fcntls & fcntlcap) != 0)
+ return (0);
+
+ return (ENOTCAPABLE);
+}
+
+int
+sys_cap_fcntls_limit(struct thread *td, struct cap_fcntls_limit_args *uap)
+{
+ struct filedesc *fdp;
+ uint32_t fcntlrights;
+ int fd;
+
+ fd = uap->fd;
+ fcntlrights = uap->fcntlrights;
+
+ AUDIT_ARG_FD(fd);
+ AUDIT_ARG_FCNTL_RIGHTS(fcntlrights);
+
+ if ((fcntlrights & ~CAP_FCNTL_ALL) != 0)
+ return (EINVAL);
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_XLOCK(fdp);
+
+ if (fget_locked(fdp, fd) == NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ return (EBADF);
+ }
+
+ if ((fcntlrights & ~fdp->fd_ofiles[fd].fde_fcntls) != 0) {
+ FILEDESC_XUNLOCK(fdp);
+ return (ENOTCAPABLE);
+ }
+
+ fdp->fd_ofiles[fd].fde_fcntls = fcntlrights;
+ FILEDESC_XUNLOCK(fdp);
+
+ return (0);
+}
+
+int
+sys_cap_fcntls_get(struct thread *td, struct cap_fcntls_get_args *uap)
+{
+ struct filedesc *fdp;
+ uint32_t rights;
+ int fd;
+
+ fd = uap->fd;
+
+ AUDIT_ARG_FD(fd);
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_SLOCK(fdp);
+ if (fget_locked(fdp, fd) == NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ return (EBADF);
+ }
+ rights = fdp->fd_ofiles[fd].fde_fcntls;
+ FILEDESC_SUNLOCK(fdp);
+
+ return (copyout(&rights, uap->fcntlrightsp, sizeof(rights)));
+}
+
+#else /* !CAPABILITIES */
+
+/*
+ * Stub capability functions for kernels built without "options CAPABILITIES".
+ */
+
+int
+sys_cap_rights_limit(struct thread *td, struct cap_rights_limit_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys___cap_rights_get(struct thread *td, struct __cap_rights_get_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_cap_ioctls_limit(struct thread *td, struct cap_ioctls_limit_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_cap_ioctls_get(struct thread *td, struct cap_ioctls_get_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_cap_fcntls_limit(struct thread *td, struct cap_fcntls_limit_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+int
+sys_cap_fcntls_get(struct thread *td, struct cap_fcntls_get_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+#endif /* CAPABILITIES */
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
new file mode 100644
index 0000000..d4d6293
--- /dev/null
+++ b/sys/kern/sys_generic.c
@@ -0,0 +1,1815 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_compat.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/capability.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/lock.h>
+#include <sys/proc.h>
+#include <sys/signalvar.h>
+#include <sys/socketvar.h>
+#include <sys/uio.h>
+#include <sys/kernel.h>
+#include <sys/ktr.h>
+#include <sys/limits.h>
+#include <sys/malloc.h>
+#include <sys/poll.h>
+#include <sys/resourcevar.h>
+#include <sys/selinfo.h>
+#include <sys/sleepqueue.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/vnode.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/condvar.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <security/audit/audit.h>
+
+int iosize_max_clamp = 1;
+SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW,
+ &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX");
+/*
+ * Assert that the return value of read(2) and write(2) syscalls fits
+ * into a register. If not, an architecture will need to provide the
+ * usermode wrappers to reconstruct the result.
+ */
+CTASSERT(sizeof(register_t) >= sizeof(size_t));
+
+static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
+static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
+MALLOC_DEFINE(M_IOV, "iov", "large iov's");
+
+static int pollout(struct thread *, struct pollfd *, struct pollfd *,
+ u_int);
+static int pollscan(struct thread *, struct pollfd *, u_int);
+static int pollrescan(struct thread *);
+static int selscan(struct thread *, fd_mask **, fd_mask **, int);
+static int selrescan(struct thread *, fd_mask **, fd_mask **);
+static void selfdalloc(struct thread *, void *);
+static void selfdfree(struct seltd *, struct selfd *);
+static int dofileread(struct thread *, int, struct file *, struct uio *,
+ off_t, int);
+static int dofilewrite(struct thread *, int, struct file *, struct uio *,
+ off_t, int);
+static void doselwakeup(struct selinfo *, int);
+static void seltdinit(struct thread *);
+static int seltdwait(struct thread *, sbintime_t, sbintime_t);
+static void seltdclear(struct thread *);
+
+/*
+ * One seltd is allocated per thread, on demand.
+ *
+ * t - protected by st_mtx
+ * k - Only accessed by curthread or read-only
+ */
+struct seltd {
+ STAILQ_HEAD(, selfd) st_selq; /* (k) List of selfds. */
+ struct selfd *st_free1; /* (k) free fd for read set. */
+ struct selfd *st_free2; /* (k) free fd for write set. */
+ struct mtx st_mtx; /* Protects struct seltd */
+ struct cv st_wait; /* (t) Wait channel. */
+ int st_flags; /* (t) SELTD_ flags. */
+};
+
+#define SELTD_PENDING 0x0001 /* We have pending events. */
+#define SELTD_RESCAN 0x0002 /* Doing a rescan. */
+
+/*
+ * One selfd is allocated per thread per file descriptor.
+ * f - protected by sf_mtx
+ */
+struct selfd {
+ STAILQ_ENTRY(selfd) sf_link; /* (k) fds owned by this td. */
+ TAILQ_ENTRY(selfd) sf_threads; /* (f) fds on this selinfo. */
+ struct selinfo *sf_si; /* (f) selinfo when linked. */
+ struct mtx *sf_mtx; /* Pointer to selinfo mtx. */
+ struct seltd *sf_td; /* (k) owning seltd. */
+ void *sf_cookie; /* (k) fd or pollfd. */
+};
+
+static uma_zone_t selfd_zone;
+static struct mtx_pool *mtxpool_select;
+
+#ifndef _SYS_SYSPROTO_H_
+struct read_args {
+ int fd;
+ void *buf;
+ size_t nbyte;
+};
+#endif
+int
+sys_read(td, uap)
+ struct thread *td;
+ struct read_args *uap;
+{
+ struct uio auio;
+ struct iovec aiov;
+ int error;
+
+ if (uap->nbyte > IOSIZE_MAX)
+ return (EINVAL);
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->nbyte;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = uap->nbyte;
+ auio.uio_segflg = UIO_USERSPACE;
+ error = kern_readv(td, uap->fd, &auio);
+ return(error);
+}
+
+/*
+ * Positioned read system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pread_args {
+ int fd;
+ void *buf;
+ size_t nbyte;
+ int pad;
+ off_t offset;
+};
+#endif
+int
+sys_pread(td, uap)
+ struct thread *td;
+ struct pread_args *uap;
+{
+ struct uio auio;
+ struct iovec aiov;
+ int error;
+
+ if (uap->nbyte > IOSIZE_MAX)
+ return (EINVAL);
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->nbyte;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = uap->nbyte;
+ auio.uio_segflg = UIO_USERSPACE;
+ error = kern_preadv(td, uap->fd, &auio, uap->offset);
+ return(error);
+}
+
+int
+freebsd6_pread(td, uap)
+ struct thread *td;
+ struct freebsd6_pread_args *uap;
+{
+ struct pread_args oargs;
+
+ oargs.fd = uap->fd;
+ oargs.buf = uap->buf;
+ oargs.nbyte = uap->nbyte;
+ oargs.offset = uap->offset;
+ return (sys_pread(td, &oargs));
+}
+
+/*
+ * Scatter read system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct readv_args {
+ int fd;
+ struct iovec *iovp;
+ u_int iovcnt;
+};
+#endif
+int
+sys_readv(struct thread *td, struct readv_args *uap)
+{
+ struct uio *auio;
+ int error;
+
+ error = copyinuio(uap->iovp, uap->iovcnt, &auio);
+ if (error)
+ return (error);
+ error = kern_readv(td, uap->fd, auio);
+ free(auio, M_IOV);
+ return (error);
+}
+
+int
+kern_readv(struct thread *td, int fd, struct uio *auio)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = fget_read(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
+ if (error)
+ return (error);
+ error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Scatter positioned read system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct preadv_args {
+ int fd;
+ struct iovec *iovp;
+ u_int iovcnt;
+ off_t offset;
+};
+#endif
+int
+sys_preadv(struct thread *td, struct preadv_args *uap)
+{
+ struct uio *auio;
+ int error;
+
+ error = copyinuio(uap->iovp, uap->iovcnt, &auio);
+ if (error)
+ return (error);
+ error = kern_preadv(td, uap->fd, auio, uap->offset);
+ free(auio, M_IOV);
+ return (error);
+}
+
+int
+kern_preadv(td, fd, auio, offset)
+ struct thread *td;
+ int fd;
+ struct uio *auio;
+ off_t offset;
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = fget_read(td, fd, cap_rights_init(&rights, CAP_PREAD), &fp);
+ if (error)
+ return (error);
+ if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
+ error = ESPIPE;
+ else if (offset < 0 && fp->f_vnode->v_type != VCHR)
+ error = EINVAL;
+ else
+ error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common code for readv and preadv that reads data in
+ * from a file using the passed in uio, offset, and flags.
+ */
+static int
+dofileread(td, fd, fp, auio, offset, flags)
+ struct thread *td;
+ int fd;
+ struct file *fp;
+ struct uio *auio;
+ off_t offset;
+ int flags;
+{
+ ssize_t cnt;
+ int error;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+
+ /* Finish zero length reads right here */
+ if (auio->uio_resid == 0) {
+ td->td_retval[0] = 0;
+ return(0);
+ }
+ auio->uio_rw = UIO_READ;
+ auio->uio_offset = offset;
+ auio->uio_td = td;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_GENIO))
+ ktruio = cloneuio(auio);
+#endif
+ cnt = auio->uio_resid;
+ if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
+ if (auio->uio_resid != cnt && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ }
+ cnt -= auio->uio_resid;
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = cnt;
+ ktrgenio(fd, UIO_READ, ktruio, error);
+ }
+#endif
+ td->td_retval[0] = cnt;
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct write_args {
+ int fd;
+ const void *buf;
+ size_t nbyte;
+};
+#endif
+int
+sys_write(td, uap)
+ struct thread *td;
+ struct write_args *uap;
+{
+ struct uio auio;
+ struct iovec aiov;
+ int error;
+
+ if (uap->nbyte > IOSIZE_MAX)
+ return (EINVAL);
+ aiov.iov_base = (void *)(uintptr_t)uap->buf;
+ aiov.iov_len = uap->nbyte;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = uap->nbyte;
+ auio.uio_segflg = UIO_USERSPACE;
+ error = kern_writev(td, uap->fd, &auio);
+ return(error);
+}
+
+/*
+ * Positioned write system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pwrite_args {
+ int fd;
+ const void *buf;
+ size_t nbyte;
+ int pad;
+ off_t offset;
+};
+#endif
+int
+sys_pwrite(td, uap)
+ struct thread *td;
+ struct pwrite_args *uap;
+{
+ struct uio auio;
+ struct iovec aiov;
+ int error;
+
+ if (uap->nbyte > IOSIZE_MAX)
+ return (EINVAL);
+ aiov.iov_base = (void *)(uintptr_t)uap->buf;
+ aiov.iov_len = uap->nbyte;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = uap->nbyte;
+ auio.uio_segflg = UIO_USERSPACE;
+ error = kern_pwritev(td, uap->fd, &auio, uap->offset);
+ return(error);
+}
+
+int
+freebsd6_pwrite(td, uap)
+ struct thread *td;
+ struct freebsd6_pwrite_args *uap;
+{
+ struct pwrite_args oargs;
+
+ oargs.fd = uap->fd;
+ oargs.buf = uap->buf;
+ oargs.nbyte = uap->nbyte;
+ oargs.offset = uap->offset;
+ return (sys_pwrite(td, &oargs));
+}
+
+/*
+ * Gather write system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct writev_args {
+ int fd;
+ struct iovec *iovp;
+ u_int iovcnt;
+};
+#endif
+int
+sys_writev(struct thread *td, struct writev_args *uap)
+{
+ struct uio *auio;
+ int error;
+
+ error = copyinuio(uap->iovp, uap->iovcnt, &auio);
+ if (error)
+ return (error);
+ error = kern_writev(td, uap->fd, auio);
+ free(auio, M_IOV);
+ return (error);
+}
+
+int
+kern_writev(struct thread *td, int fd, struct uio *auio)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = fget_write(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
+ if (error)
+ return (error);
+ error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Gather positioned write system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pwritev_args {
+ int fd;
+ struct iovec *iovp;
+ u_int iovcnt;
+ off_t offset;
+};
+#endif
+int
+sys_pwritev(struct thread *td, struct pwritev_args *uap)
+{
+ struct uio *auio;
+ int error;
+
+ error = copyinuio(uap->iovp, uap->iovcnt, &auio);
+ if (error)
+ return (error);
+ error = kern_pwritev(td, uap->fd, auio, uap->offset);
+ free(auio, M_IOV);
+ return (error);
+}
+
+int
+kern_pwritev(td, fd, auio, offset)
+ struct thread *td;
+ struct uio *auio;
+ int fd;
+ off_t offset;
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = fget_write(td, fd, cap_rights_init(&rights, CAP_PWRITE), &fp);
+ if (error)
+ return (error);
+ if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
+ error = ESPIPE;
+ else if (offset < 0 && fp->f_vnode->v_type != VCHR)
+ error = EINVAL;
+ else
+ error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common code for writev and pwritev that writes data to
+ * a file using the passed in uio, offset, and flags.
+ */
+static int
+dofilewrite(td, fd, fp, auio, offset, flags)
+ struct thread *td;
+ int fd;
+ struct file *fp;
+ struct uio *auio;
+ off_t offset;
+ int flags;
+{
+ ssize_t cnt;
+ int error;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+
+ auio->uio_rw = UIO_WRITE;
+ auio->uio_td = td;
+ auio->uio_offset = offset;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_GENIO))
+ ktruio = cloneuio(auio);
+#endif
+ cnt = auio->uio_resid;
+ if (fp->f_type == DTYPE_VNODE &&
+ (fp->f_vnread_flags & FDEVFS_VNODE) == 0)
+ bwillwrite();
+ if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
+ if (auio->uio_resid != cnt && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ /* Socket layer is responsible for issuing SIGPIPE. */
+ if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
+ PROC_LOCK(td->td_proc);
+ tdsignal(td, SIGPIPE);
+ PROC_UNLOCK(td->td_proc);
+ }
+ }
+ cnt -= auio->uio_resid;
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = cnt;
+ ktrgenio(fd, UIO_WRITE, ktruio, error);
+ }
+#endif
+ td->td_retval[0] = cnt;
+ return (error);
+}
+
+/*
+ * Truncate a file given a file descriptor.
+ *
+ * Can't use fget_write() here, since we must return EINVAL and not EBADF if
+ * the descriptor isn't writable.
+ */
+int
+kern_ftruncate(td, fd, length)
+ struct thread *td;
+ int fd;
+ off_t length;
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ if (length < 0)
+ return (EINVAL);
+ error = fget(td, fd, cap_rights_init(&rights, CAP_FTRUNCATE), &fp);
+ if (error)
+ return (error);
+ AUDIT_ARG_FILE(td->td_proc, fp);
+ if (!(fp->f_flag & FWRITE)) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ error = fo_truncate(fp, length, td->td_ucred, td);
+ fdrop(fp, td);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ftruncate_args {
+ int fd;
+ int pad;
+ off_t length;
+};
+#endif
+int
+sys_ftruncate(td, uap)
+ struct thread *td;
+ struct ftruncate_args *uap;
+{
+
+ return (kern_ftruncate(td, uap->fd, uap->length));
+}
+
+#if defined(COMPAT_43)
+#ifndef _SYS_SYSPROTO_H_
+struct oftruncate_args {
+ int fd;
+ long length;
+};
+#endif
+int
+oftruncate(td, uap)
+ struct thread *td;
+ struct oftruncate_args *uap;
+{
+
+ return (kern_ftruncate(td, uap->fd, uap->length));
+}
+#endif /* COMPAT_43 */
+
+#ifndef _SYS_SYSPROTO_H_
+struct ioctl_args {
+ int fd;
+ u_long com;
+ caddr_t data;
+};
+#endif
+/* ARGSUSED */
+int
+sys_ioctl(struct thread *td, struct ioctl_args *uap)
+{
+ u_long com;
+ int arg, error;
+ u_int size;
+ caddr_t data;
+
+ if (uap->com > 0xffffffff) {
+ printf(
+ "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
+ td->td_proc->p_pid, td->td_name, uap->com);
+ uap->com &= 0xffffffff;
+ }
+ com = uap->com;
+
+ /*
+ * Interpret the high-order word to find the amount of data to be
+ * copied to/from the user's address space.
+ */
+ size = IOCPARM_LEN(com);
+ if ((size > IOCPARM_MAX) ||
+ ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
+#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+ ((com & IOC_OUT) && size == 0) ||
+#else
+ ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
+#endif
+ ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
+ return (ENOTTY);
+
+ if (size > 0) {
+ if (com & IOC_VOID) {
+ /* Integer argument. */
+ arg = (intptr_t)uap->data;
+ data = (void *)&arg;
+ size = 0;
+ } else
+ data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
+ } else
+ data = (void *)&uap->data;
+ if (com & IOC_IN) {
+ error = copyin(uap->data, data, (u_int)size);
+ if (error) {
+ if (size > 0)
+ free(data, M_IOCTLOPS);
+ return (error);
+ }
+ } else if (com & IOC_OUT) {
+ /*
+ * Zero the buffer so the user always
+ * gets back something deterministic.
+ */
+ bzero(data, size);
+ }
+
+ error = kern_ioctl(td, uap->fd, com, data);
+
+ if (error == 0 && (com & IOC_OUT))
+ error = copyout(data, uap->data, (u_int)size);
+
+ if (size > 0)
+ free(data, M_IOCTLOPS);
+ return (error);
+}
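+
+/*
+ * Editor's note: an illustrative sketch, not part of the original change.
+ * The size/direction test above relies on how ioctl commands are encoded by
+ * the macros in <sys/ioccom.h>.  The structure and command below are
+ * hypothetical and only show what IOCPARM_LEN() and IOC_OUT recover.
+ */
+#if 0
+#include <sys/ioccom.h>
+
+struct mydev_stats {			/* hypothetical argument structure */
+	int	rx_packets;
+	int	tx_packets;
+};
+#define	MYDEVGSTATS	_IOR('m', 1, struct mydev_stats)	/* hypothetical */
+
+/*
+ * For this command IOCPARM_LEN(MYDEVGSTATS) == sizeof(struct mydev_stats)
+ * and (MYDEVGSTATS & IOC_OUT) != 0, so sys_ioctl() above allocates and
+ * zeroes a kernel buffer of that size, calls kern_ioctl(), and copies the
+ * result back out to uap->data.
+ */
+#endif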
+
+int
+kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
+{
+ struct file *fp;
+ struct filedesc *fdp;
+#ifndef CAPABILITIES
+ cap_rights_t rights;
+#endif
+ int error, tmp, locked;
+
+ AUDIT_ARG_FD(fd);
+ AUDIT_ARG_CMD(com);
+
+ fdp = td->td_proc->p_fd;
+
+ switch (com) {
+ case FIONCLEX:
+ case FIOCLEX:
+ FILEDESC_XLOCK(fdp);
+ locked = LA_XLOCKED;
+ break;
+ default:
+#ifdef CAPABILITIES
+ FILEDESC_SLOCK(fdp);
+ locked = LA_SLOCKED;
+#else
+ locked = LA_UNLOCKED;
+#endif
+ break;
+ }
+
+#ifdef CAPABILITIES
+ if ((fp = fget_locked(fdp, fd)) == NULL) {
+ error = EBADF;
+ goto out;
+ }
+ if ((error = cap_ioctl_check(fdp, fd, com)) != 0) {
+ fp = NULL; /* fhold() was not called yet */
+ goto out;
+ }
+ fhold(fp);
+ if (locked == LA_SLOCKED) {
+ FILEDESC_SUNLOCK(fdp);
+ locked = LA_UNLOCKED;
+ }
+#else
+ error = fget(td, fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
+ if (error != 0) {
+ fp = NULL;
+ goto out;
+ }
+#endif
+ if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
+ error = EBADF;
+ goto out;
+ }
+
+ switch (com) {
+ case FIONCLEX:
+ fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE;
+ goto out;
+ case FIOCLEX:
+ fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE;
+ goto out;
+ case FIONBIO:
+ if ((tmp = *(int *)data))
+ atomic_set_int(&fp->f_flag, FNONBLOCK);
+ else
+ atomic_clear_int(&fp->f_flag, FNONBLOCK);
+ data = (void *)&tmp;
+ break;
+ case FIOASYNC:
+ if ((tmp = *(int *)data))
+ atomic_set_int(&fp->f_flag, FASYNC);
+ else
+ atomic_clear_int(&fp->f_flag, FASYNC);
+ data = (void *)&tmp;
+ break;
+ }
+
+ error = fo_ioctl(fp, com, data, td->td_ucred, td);
+out:
+ switch (locked) {
+ case LA_XLOCKED:
+ FILEDESC_XUNLOCK(fdp);
+ break;
+#ifdef CAPABILITIES
+ case LA_SLOCKED:
+ FILEDESC_SUNLOCK(fdp);
+ break;
+#endif
+ default:
+ FILEDESC_UNLOCK_ASSERT(fdp);
+ break;
+ }
+ if (fp != NULL)
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+poll_no_poll(int events)
+{
+ /*
+ * Return true for read/write. If the user asked for something
+ * special, return POLLNVAL, so that clients have a way of
+ * determining reliably whether or not the extended
+ * functionality is present without hard-coding knowledge
+ * of specific filesystem implementations.
+ */
+ if (events & ~POLLSTANDARD)
+ return (POLLNVAL);
+
+ return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
+}
+
+int
+sys_pselect(struct thread *td, struct pselect_args *uap)
+{
+ struct timespec ts;
+ struct timeval tv, *tvp;
+ sigset_t set, *uset;
+ int error;
+
+ if (uap->ts != NULL) {
+ error = copyin(uap->ts, &ts, sizeof(ts));
+ if (error != 0)
+ return (error);
+ TIMESPEC_TO_TIMEVAL(&tv, &ts);
+ tvp = &tv;
+ } else
+ tvp = NULL;
+ if (uap->sm != NULL) {
+ error = copyin(uap->sm, &set, sizeof(set));
+ if (error != 0)
+ return (error);
+ uset = &set;
+ } else
+ uset = NULL;
+ return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
+ uset, NFDBITS));
+}
+
+int
+kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
+ struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
+{
+ int error;
+
+ if (uset != NULL) {
+ error = kern_sigprocmask(td, SIG_SETMASK, uset,
+ &td->td_oldsigmask, 0);
+ if (error != 0)
+ return (error);
+ td->td_pflags |= TDP_OLDMASK;
+ /*
+ * Make sure that ast() is called on return to
+ * usermode and TDP_OLDMASK is cleared, restoring old
+ * sigmask.
+ */
+ thread_lock(td);
+ td->td_flags |= TDF_ASTPENDING;
+ thread_unlock(td);
+ }
+ error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct select_args {
+ int nd;
+ fd_set *in, *ou, *ex;
+ struct timeval *tv;
+};
+#endif
+int
+sys_select(struct thread *td, struct select_args *uap)
+{
+ struct timeval tv, *tvp;
+ int error;
+
+ if (uap->tv != NULL) {
+ error = copyin(uap->tv, &tv, sizeof(tv));
+ if (error)
+ return (error);
+ tvp = &tv;
+ } else
+ tvp = NULL;
+
+ return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
+ NFDBITS));
+}
+
+/*
+ * In the unlikely case when the user specified an nd greater than the last
+ * open file descriptor, check that no bits are set after the last
+ * valid fd.  We must return EBADF if any are set.
+ *
+ * There are applications that rely on this behaviour.
+ *
+ * nd is fd_lastfile + 1.
+ */
+static int
+select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits)
+{
+ char *addr, *oaddr;
+ int b, i, res;
+ uint8_t bits;
+
+ if (nd >= ndu || fd_in == NULL)
+ return (0);
+
+ oaddr = NULL;
+ bits = 0; /* silence gcc */
+ for (i = nd; i < ndu; i++) {
+ b = i / NBBY;
+#if BYTE_ORDER == LITTLE_ENDIAN
+ addr = (char *)fd_in + b;
+#else
+ addr = (char *)fd_in;
+ if (abi_nfdbits == NFDBITS) {
+ addr += rounddown(b, sizeof(fd_mask)) +
+ sizeof(fd_mask) - 1 - b % sizeof(fd_mask);
+ } else {
+ addr += rounddown(b, sizeof(uint32_t)) +
+ sizeof(uint32_t) - 1 - b % sizeof(uint32_t);
+ }
+#endif
+ if (addr != oaddr) {
+ res = fubyte(addr);
+ if (res == -1)
+ return (EFAULT);
+ oaddr = addr;
+ bits = res;
+ }
+ if ((bits & (1 << (i % NBBY))) != 0)
+ return (EBADF);
+ }
+ return (0);
+}
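+
+/*
+ * Editor's note: an illustrative userland sketch, not part of the original
+ * change, showing the case select_check_badfd() guards against.  It assumes
+ * descriptor 42 is not open and is above every open descriptor in the
+ * process; the function name is hypothetical.
+ */
+#if 0
+#include <sys/select.h>
+#include <errno.h>
+#include <stddef.h>
+
+int
+stray_bit_returns_ebadf(void)
+{
+	fd_set rd;
+
+	FD_ZERO(&rd);
+	FD_SET(42, &rd);
+	/*
+	 * nd (64) is larger than fd_lastfile + 1, so the scan loop would
+	 * never look at bit 42; select_check_badfd() makes the call fail
+	 * with EBADF instead of silently ignoring the stray bit.
+	 */
+	return (select(64, &rd, NULL, NULL, NULL) == -1 && errno == EBADF);
+}
+#endif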
+
+int
+kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
+ fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
+{
+ struct filedesc *fdp;
+ /*
+ * The magic 2048 here is chosen to be just enough for FD_SETSIZE
+ * infds with the new FD_SETSIZE of 1024, and more than enough for
+ * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
+ * of 256.
+ */
+ fd_mask s_selbits[howmany(2048, NFDBITS)];
+ fd_mask *ibits[3], *obits[3], *selbits, *sbp;
+ struct timeval rtv;
+ sbintime_t asbt, precision, rsbt;
+ u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
+ int error, lf, ndu;
+
+ if (nd < 0)
+ return (EINVAL);
+ fdp = td->td_proc->p_fd;
+ ndu = nd;
+ lf = fdp->fd_lastfile;
+ if (nd > lf + 1)
+ nd = lf + 1;
+
+ error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits);
+ if (error != 0)
+ return (error);
+ error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits);
+ if (error != 0)
+ return (error);
+ error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Allocate just enough bits for the non-null fd_sets. Use the
+ * preallocated auto buffer if possible.
+ */
+ nfdbits = roundup(nd, NFDBITS);
+ ncpbytes = nfdbits / NBBY;
+ ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
+ nbufbytes = 0;
+ if (fd_in != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (fd_ou != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (fd_ex != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (nbufbytes <= sizeof s_selbits)
+ selbits = &s_selbits[0];
+ else
+ selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
+
+ /*
+ * Assign pointers into the bit buffers and fetch the input bits.
+ * Put the output buffers together so that they can be bzeroed
+ * together.
+ */
+ sbp = selbits;
+#define getbits(name, x) \
+ do { \
+ if (name == NULL) { \
+ ibits[x] = NULL; \
+ obits[x] = NULL; \
+ } else { \
+ ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
+ obits[x] = sbp; \
+ sbp += ncpbytes / sizeof *sbp; \
+ error = copyin(name, ibits[x], ncpubytes); \
+ if (error != 0) \
+ goto done; \
+ bzero((char *)ibits[x] + ncpubytes, \
+ ncpbytes - ncpubytes); \
+ } \
+ } while (0)
+ getbits(fd_in, 0);
+ getbits(fd_ou, 1);
+ getbits(fd_ex, 2);
+#undef getbits
+
+#if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
+ /*
+ * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
+ * we are running under 32-bit emulation. This should be more
+ * generic.
+ */
+#define swizzle_fdset(bits) \
+ if (abi_nfdbits != NFDBITS && bits != NULL) { \
+ int i; \
+ for (i = 0; i < ncpbytes / sizeof *sbp; i++) \
+ bits[i] = (bits[i] >> 32) | (bits[i] << 32); \
+ }
+#else
+#define swizzle_fdset(bits)
+#endif
+
+ /* Make sure the bit order makes it through an ABI transition */
+ swizzle_fdset(ibits[0]);
+ swizzle_fdset(ibits[1]);
+ swizzle_fdset(ibits[2]);
+
+ if (nbufbytes != 0)
+ bzero(selbits, nbufbytes / 2);
+
+ precision = 0;
+ if (tvp != NULL) {
+ rtv = *tvp;
+ if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
+ rtv.tv_usec >= 1000000) {
+ error = EINVAL;
+ goto done;
+ }
+ if (!timevalisset(&rtv))
+ asbt = 0;
+ else if (rtv.tv_sec <= INT32_MAX) {
+ rsbt = tvtosbt(rtv);
+ precision = rsbt;
+ precision >>= tc_precexp;
+ if (TIMESEL(&asbt, rsbt))
+ asbt += tc_tick_sbt;
+ if (asbt <= INT64_MAX - rsbt)
+ asbt += rsbt;
+ else
+ asbt = -1;
+ } else
+ asbt = -1;
+ } else
+ asbt = -1;
+ seltdinit(td);
+ /* Iterate until the timeout expires or descriptors become ready. */
+ for (;;) {
+ error = selscan(td, ibits, obits, nd);
+ if (error || td->td_retval[0] != 0)
+ break;
+ error = seltdwait(td, asbt, precision);
+ if (error)
+ break;
+ error = selrescan(td, ibits, obits);
+ if (error || td->td_retval[0] != 0)
+ break;
+ }
+ seltdclear(td);
+
+done:
+ /* select is not restarted after signals... */
+ if (error == ERESTART)
+ error = EINTR;
+ if (error == EWOULDBLOCK)
+ error = 0;
+
+ /* swizzle bit order back, if necessary */
+ swizzle_fdset(obits[0]);
+ swizzle_fdset(obits[1]);
+ swizzle_fdset(obits[2]);
+#undef swizzle_fdset
+
+#define putbits(name, x) \
+ if (name && (error2 = copyout(obits[x], name, ncpubytes))) \
+ error = error2;
+ if (error == 0) {
+ int error2;
+
+ putbits(fd_in, 0);
+ putbits(fd_ou, 1);
+ putbits(fd_ex, 2);
+#undef putbits
+ }
+ if (selbits != &s_selbits[0])
+ free(selbits, M_SELECT);
+
+ return (error);
+}
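+
+/*
+ * Editor's note: a worked sizing example, not part of the original change,
+ * for the buffer layout set up by getbits() above.  Numbers assume LP64
+ * (NFDBITS == 64) and a native caller (abi_nfdbits == NFDBITS).
+ *
+ * For nd = 100 with all three sets non-NULL:
+ *	nfdbits   = roundup(100, 64)        = 128
+ *	ncpbytes  = 128 / NBBY              = 16 bytes per set (kernel copy)
+ *	ncpubytes = roundup(100, 64) / NBBY = 16 bytes per set (user copy)
+ *	nbufbytes = 3 * 2 * 16              = 96 bytes
+ * which fits easily in s_selbits (howmany(2048, NFDBITS) fd_masks == 256
+ * bytes), so no malloc() is needed.  getbits() points obits[x] into the
+ * first half of the buffer and ibits[x] into the second half at the same
+ * offset, which is why a single bzero(selbits, nbufbytes / 2) clears all
+ * of the output sets at once.
+ */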
+/*
+ * Convert a select bit set to poll flags.
+ *
+ * The backend always returns POLLHUP/POLLERR if appropriate and we
+ * return this as a set bit in any set.
+ */
+static int select_flags[3] = {
+ POLLRDNORM | POLLHUP | POLLERR,
+ POLLWRNORM | POLLHUP | POLLERR,
+ POLLRDBAND | POLLERR
+};
+
+/*
+ * Compute the fo_poll flags required for a fd given by the index and
+ * bit position in the fd_mask array.
+ */
+static __inline int
+selflags(fd_mask **ibits, int idx, fd_mask bit)
+{
+ int flags;
+ int msk;
+
+ flags = 0;
+ for (msk = 0; msk < 3; msk++) {
+ if (ibits[msk] == NULL)
+ continue;
+ if ((ibits[msk][idx] & bit) == 0)
+ continue;
+ flags |= select_flags[msk];
+ }
+ return (flags);
+}
+
+/*
+ * Set the appropriate output bits given a mask of fired events and the
+ * input bits originally requested.
+ */
+static __inline int
+selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
+{
+ int msk;
+ int n;
+
+ n = 0;
+ for (msk = 0; msk < 3; msk++) {
+ if ((events & select_flags[msk]) == 0)
+ continue;
+ if (ibits[msk] == NULL)
+ continue;
+ if ((ibits[msk][idx] & bit) == 0)
+ continue;
+ /*
+ * XXX Check for a duplicate set. This can occur because a
+ * socket calls selrecord() twice for each poll() call
+ * resulting in two selfds per real fd. selrescan() will
+ * call selsetbits twice as a result.
+ */
+ if ((obits[msk][idx] & bit) != 0)
+ continue;
+ obits[msk][idx] |= bit;
+ n++;
+ }
+
+ return (n);
+}
+
+static __inline int
+getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp)
+{
+ cap_rights_t rights;
+
+ return (fget_unlocked(fdp, fd, cap_rights_init(&rights, CAP_POLL_EVENT),
+ 0, fpp, NULL));
+}
+
+/*
+ * Traverse the list of fds attached to this thread's seltd and check for
+ * completion.
+ */
+static int
+selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
+{
+ struct filedesc *fdp;
+ struct selinfo *si;
+ struct seltd *stp;
+ struct selfd *sfp;
+ struct selfd *sfn;
+ struct file *fp;
+ fd_mask bit;
+ int fd, ev, n, idx;
+ int error;
+
+ fdp = td->td_proc->p_fd;
+ stp = td->td_sel;
+ n = 0;
+ STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
+ fd = (int)(uintptr_t)sfp->sf_cookie;
+ si = sfp->sf_si;
+ selfdfree(stp, sfp);
+ /* If the selinfo wasn't cleared the event didn't fire. */
+ if (si != NULL)
+ continue;
+ error = getselfd_cap(fdp, fd, &fp);
+ if (error)
+ return (error);
+ idx = fd / NFDBITS;
+ bit = (fd_mask)1 << (fd % NFDBITS);
+ ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
+ fdrop(fp, td);
+ if (ev != 0)
+ n += selsetbits(ibits, obits, idx, bit, ev);
+ }
+ stp->st_flags = 0;
+ td->td_retval[0] = n;
+ return (0);
+}
+
+/*
+ * Perform the initial file descriptor scan and register ourselves with
+ * each selinfo.
+ */
+static int
+selscan(td, ibits, obits, nfd)
+ struct thread *td;
+ fd_mask **ibits, **obits;
+ int nfd;
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ fd_mask bit;
+ int ev, flags, end, fd;
+ int n, idx;
+ int error;
+
+ fdp = td->td_proc->p_fd;
+ n = 0;
+ for (idx = 0, fd = 0; fd < nfd; idx++) {
+ end = imin(fd + NFDBITS, nfd);
+ for (bit = 1; fd < end; bit <<= 1, fd++) {
+ /* Compute the list of events we're interested in. */
+ flags = selflags(ibits, idx, bit);
+ if (flags == 0)
+ continue;
+ error = getselfd_cap(fdp, fd, &fp);
+ if (error)
+ return (error);
+ selfdalloc(td, (void *)(uintptr_t)fd);
+ ev = fo_poll(fp, flags, td->td_ucred, td);
+ fdrop(fp, td);
+ if (ev != 0)
+ n += selsetbits(ibits, obits, idx, bit, ev);
+ }
+ }
+
+ td->td_retval[0] = n;
+ return (0);
+}
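+
+/*
+ * Editor's note: an illustrative sketch, not part of the original change,
+ * of the descriptor-to-bit mapping that selscan(), selrescan() and
+ * selsetbits() all use.  The helper name is hypothetical.
+ */
+#if 0
+static __inline void
+fd_to_maskpos(int fd, int *idxp, fd_mask *bitp)
+{
+	/* For fd = 70 with NFDBITS == 64: idx = 1, bit = (fd_mask)1 << 6. */
+	*idxp = fd / NFDBITS;
+	*bitp = (fd_mask)1 << (fd % NFDBITS);
+}
+#endif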
+
+#ifndef _SYS_SYSPROTO_H_
+struct poll_args {
+ struct pollfd *fds;
+ u_int nfds;
+ int timeout;
+};
+#endif
+int
+sys_poll(td, uap)
+ struct thread *td;
+ struct poll_args *uap;
+{
+ struct pollfd *bits;
+ struct pollfd smallbits[32];
+ sbintime_t asbt, precision, rsbt;
+ u_int nfds;
+ int error;
+ size_t ni;
+
+ nfds = uap->nfds;
+ if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
+ return (EINVAL);
+ ni = nfds * sizeof(struct pollfd);
+ if (ni > sizeof(smallbits))
+ bits = malloc(ni, M_TEMP, M_WAITOK);
+ else
+ bits = smallbits;
+ error = copyin(uap->fds, bits, ni);
+ if (error)
+ goto done;
+ precision = 0;
+ if (uap->timeout != INFTIM) {
+ if (uap->timeout < 0) {
+ error = EINVAL;
+ goto done;
+ }
+ if (uap->timeout == 0)
+ asbt = 0;
+ else {
+ rsbt = SBT_1MS * uap->timeout;
+ precision = rsbt;
+ precision >>= tc_precexp;
+ if (TIMESEL(&asbt, rsbt))
+ asbt += tc_tick_sbt;
+ asbt += rsbt;
+ }
+ } else
+ asbt = -1;
+ seltdinit(td);
+ /* Iterate until the timeout expires or descriptors become ready. */
+ for (;;) {
+ error = pollscan(td, bits, nfds);
+ if (error || td->td_retval[0] != 0)
+ break;
+ error = seltdwait(td, asbt, precision);
+ if (error)
+ break;
+ error = pollrescan(td);
+ if (error || td->td_retval[0] != 0)
+ break;
+ }
+ seltdclear(td);
+
+done:
+ /* poll is not restarted after signals... */
+ if (error == ERESTART)
+ error = EINTR;
+ if (error == EWOULDBLOCK)
+ error = 0;
+ if (error == 0) {
+ error = pollout(td, bits, uap->fds, nfds);
+ if (error)
+ goto out;
+ }
+out:
+ if (ni > sizeof(smallbits))
+ free(bits, M_TEMP);
+ return (error);
+}
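+
+/*
+ * Editor's note: a worked example, not part of the original change, of the
+ * timeout handling above, assuming a caller passed timeout = 250 (ms):
+ *
+ *	rsbt      = SBT_1MS * 250		relative interval
+ *	precision = rsbt >> tc_precexp		allowed callout slop
+ *	asbt      = "now" + rsbt		absolute deadline
+ *
+ * TIMESEL() fills in "now" (plus one tc_tick_sbt of slop if the cheap
+ * timecounter read was used) and seltdwait() then sleeps with C_ABSOLUTE
+ * until the deadline, an event, or a signal.  timeout == 0 polls once
+ * (EWOULDBLOCK is mapped to success in done:), and INFTIM gives asbt = -1,
+ * i.e. no deadline at all.
+ */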
+
+static int
+pollrescan(struct thread *td)
+{
+ struct seltd *stp;
+ struct selfd *sfp;
+ struct selfd *sfn;
+ struct selinfo *si;
+ struct filedesc *fdp;
+ struct file *fp;
+ struct pollfd *fd;
+#ifdef CAPABILITIES
+ cap_rights_t rights;
+#endif
+ int n;
+
+ n = 0;
+ fdp = td->td_proc->p_fd;
+ stp = td->td_sel;
+ FILEDESC_SLOCK(fdp);
+ STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
+ fd = (struct pollfd *)sfp->sf_cookie;
+ si = sfp->sf_si;
+ selfdfree(stp, sfp);
+ /* If the selinfo wasn't cleared the event didn't fire. */
+ if (si != NULL)
+ continue;
+ fp = fdp->fd_ofiles[fd->fd].fde_file;
+#ifdef CAPABILITIES
+ if (fp == NULL ||
+ cap_check(cap_rights(fdp, fd->fd),
+ cap_rights_init(&rights, CAP_POLL_EVENT)) != 0)
+#else
+ if (fp == NULL)
+#endif
+ {
+ fd->revents = POLLNVAL;
+ n++;
+ continue;
+ }
+
+ /*
+ * Note: backend also returns POLLHUP and
+ * POLLERR if appropriate.
+ */
+ fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
+ if (fd->revents != 0)
+ n++;
+ }
+ FILEDESC_SUNLOCK(fdp);
+ stp->st_flags = 0;
+ td->td_retval[0] = n;
+ return (0);
+}
+
+
+static int
+pollout(td, fds, ufds, nfd)
+ struct thread *td;
+ struct pollfd *fds;
+ struct pollfd *ufds;
+ u_int nfd;
+{
+ int error = 0;
+ u_int i = 0;
+ u_int n = 0;
+
+ for (i = 0; i < nfd; i++) {
+ error = copyout(&fds->revents, &ufds->revents,
+ sizeof(ufds->revents));
+ if (error)
+ return (error);
+ if (fds->revents != 0)
+ n++;
+ fds++;
+ ufds++;
+ }
+ td->td_retval[0] = n;
+ return (0);
+}
+
+static int
+pollscan(td, fds, nfd)
+ struct thread *td;
+ struct pollfd *fds;
+ u_int nfd;
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+ struct file *fp;
+#ifdef CAPABILITIES
+ cap_rights_t rights;
+#endif
+ int i, n = 0;
+
+ FILEDESC_SLOCK(fdp);
+ for (i = 0; i < nfd; i++, fds++) {
+ if (fds->fd >= fdp->fd_nfiles) {
+ fds->revents = POLLNVAL;
+ n++;
+ } else if (fds->fd < 0) {
+ fds->revents = 0;
+ } else {
+ fp = fdp->fd_ofiles[fds->fd].fde_file;
+#ifdef CAPABILITIES
+ if (fp == NULL ||
+ cap_check(cap_rights(fdp, fds->fd),
+ cap_rights_init(&rights, CAP_POLL_EVENT)) != 0)
+#else
+ if (fp == NULL)
+#endif
+ {
+ fds->revents = POLLNVAL;
+ n++;
+ } else {
+ /*
+ * Note: backend also returns POLLHUP and
+ * POLLERR if appropriate.
+ */
+ selfdalloc(td, fds);
+ fds->revents = fo_poll(fp, fds->events,
+ td->td_ucred, td);
+ /*
+ * POSIX requires that POLLOUT never be
+ * set simultaneously with POLLHUP.
+ */
+ if ((fds->revents & POLLHUP) != 0)
+ fds->revents &= ~POLLOUT;
+
+ if (fds->revents != 0)
+ n++;
+ }
+ }
+ }
+ FILEDESC_SUNLOCK(fdp);
+ td->td_retval[0] = n;
+ return (0);
+}
+
+/*
+ * OpenBSD poll system call.
+ *
+ * XXX this isn't quite a true representation.  OpenBSD uses select ops.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct openbsd_poll_args {
+ struct pollfd *fds;
+ u_int nfds;
+ int timeout;
+};
+#endif
+int
+sys_openbsd_poll(td, uap)
+ register struct thread *td;
+ register struct openbsd_poll_args *uap;
+{
+ return (sys_poll(td, (struct poll_args *)uap));
+}
+
+/*
+ * XXX This was created specifically to support netncp and netsmb. This
+ * allows the caller to specify a socket to wait for events on. It returns
+ * 0 if any events matched and an error otherwise. There is no way to
+ * determine which events fired.
+ */
+int
+selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
+{
+ struct timeval rtv;
+ sbintime_t asbt, precision, rsbt;
+ int error;
+
+ precision = 0; /* stupid gcc! */
+ if (tvp != NULL) {
+ rtv = *tvp;
+ if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
+ rtv.tv_usec >= 1000000)
+ return (EINVAL);
+ if (!timevalisset(&rtv))
+ asbt = 0;
+ else if (rtv.tv_sec <= INT32_MAX) {
+ rsbt = tvtosbt(rtv);
+ precision = rsbt;
+ precision >>= tc_precexp;
+ if (TIMESEL(&asbt, rsbt))
+ asbt += tc_tick_sbt;
+ if (asbt <= INT64_MAX - rsbt)
+ asbt += rsbt;
+ else
+ asbt = -1;
+ } else
+ asbt = -1;
+ } else
+ asbt = -1;
+ seltdinit(td);
+ /*
+ * Iterate until the timeout expires or the socket becomes ready.
+ */
+ for (;;) {
+ selfdalloc(td, NULL);
+ error = sopoll(so, events, NULL, td);
+ /* error here is actually the ready events. */
+ if (error)
+ return (0);
+ error = seltdwait(td, asbt, precision);
+ if (error)
+ break;
+ }
+ seltdclear(td);
+ /* XXX Duplicates ncp/smb behavior. */
+ if (error == ERESTART)
+ error = 0;
+ return (error);
+}
+
+/*
+ * Preallocate two selfds associated with 'cookie'. Some fo_poll routines
+ * have two select sets, one for read and another for write.
+ */
+static void
+selfdalloc(struct thread *td, void *cookie)
+{
+ struct seltd *stp;
+
+ stp = td->td_sel;
+ if (stp->st_free1 == NULL)
+ stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
+ stp->st_free1->sf_td = stp;
+ stp->st_free1->sf_cookie = cookie;
+ if (stp->st_free2 == NULL)
+ stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
+ stp->st_free2->sf_td = stp;
+ stp->st_free2->sf_cookie = cookie;
+}
+
+static void
+selfdfree(struct seltd *stp, struct selfd *sfp)
+{
+ STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
+ mtx_lock(sfp->sf_mtx);
+ if (sfp->sf_si)
+ TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
+ mtx_unlock(sfp->sf_mtx);
+ uma_zfree(selfd_zone, sfp);
+}
+
+/* Drain the waiters tied to all the selfds belonging to the specified selinfo. */
+void
+seldrain(sip)
+ struct selinfo *sip;
+{
+
+ /*
+ * This functionality is already provided by doselwakeup(), so it is
+ * enough to call it here.
+ * Eventually the caller's context should take care to avoid races
+ * between a thread calling select()/poll() and file descriptor
+ * detaching, but, again, those races are just the same as for
+ * selwakeup().
+ */
+ doselwakeup(sip, -1);
+}
+
+/*
+ * Record a select request.
+ */
+void
+selrecord(selector, sip)
+ struct thread *selector;
+ struct selinfo *sip;
+{
+ struct selfd *sfp;
+ struct seltd *stp;
+ struct mtx *mtxp;
+
+ stp = selector->td_sel;
+ /*
+ * Don't record when doing a rescan.
+ */
+ if (stp->st_flags & SELTD_RESCAN)
+ return;
+ /*
+ * Grab one of the preallocated descriptors.
+ */
+ sfp = NULL;
+ if ((sfp = stp->st_free1) != NULL)
+ stp->st_free1 = NULL;
+ else if ((sfp = stp->st_free2) != NULL)
+ stp->st_free2 = NULL;
+ else
+ panic("selrecord: No free selfd on selq");
+ mtxp = sip->si_mtx;
+ if (mtxp == NULL)
+ mtxp = mtx_pool_find(mtxpool_select, sip);
+ /*
+ * Initialize the sfp and queue it in the thread.
+ */
+ sfp->sf_si = sip;
+ sfp->sf_mtx = mtxp;
+ STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
+ /*
+ * Now that we've locked the sip, check for initialization.
+ */
+ mtx_lock(mtxp);
+ if (sip->si_mtx == NULL) {
+ sip->si_mtx = mtxp;
+ TAILQ_INIT(&sip->si_tdlist);
+ }
+ /*
+ * Add this thread to the list of selfds listening on this selinfo.
+ */
+ TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
+ mtx_unlock(sip->si_mtx);
+}
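+
+/*
+ * Editor's note: an illustrative sketch, not part of the original change,
+ * of how a device driver's poll method typically calls selrecord() above.
+ * The softc, its fields and the function names are hypothetical; only the
+ * pattern (test readiness under the driver lock, otherwise selrecord())
+ * is the point.
+ */
+#if 0
+struct mydev_softc {
+	struct mtx	sc_mtx;
+	struct selinfo	sc_rsel;	/* threads waiting for readability */
+	int		sc_nread;	/* bytes currently available */
+};
+
+static int
+mydev_poll(struct cdev *dev, int events, struct thread *td)
+{
+	struct mydev_softc *sc = dev->si_drv1;
+	int revents = 0;
+
+	mtx_lock(&sc->sc_mtx);
+	if (events & (POLLIN | POLLRDNORM)) {
+		if (sc->sc_nread > 0)
+			revents |= events & (POLLIN | POLLRDNORM);
+		else
+			/* Queue a selfd for this thread on sc_rsel. */
+			selrecord(td, &sc->sc_rsel);
+	}
+	mtx_unlock(&sc->sc_mtx);
+	return (revents);
+}
+#endif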
+
+/* Wake up a selecting thread. */
+void
+selwakeup(sip)
+ struct selinfo *sip;
+{
+ doselwakeup(sip, -1);
+}
+
+/* Wake up a selecting thread, and set its priority. */
+void
+selwakeuppri(sip, pri)
+ struct selinfo *sip;
+ int pri;
+{
+ doselwakeup(sip, pri);
+}
+
+/*
+ * Do a wakeup when a selectable event occurs.
+ */
+static void
+doselwakeup(sip, pri)
+ struct selinfo *sip;
+ int pri;
+{
+ struct selfd *sfp;
+ struct selfd *sfn;
+ struct seltd *stp;
+
+ /* If it's not initialized there can't be any waiters. */
+ if (sip->si_mtx == NULL)
+ return;
+ /*
+ * Locking the selinfo locks all selfds associated with it.
+ */
+ mtx_lock(sip->si_mtx);
+ TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
+ /*
+ * Once we remove this sfp from the list and clear its
+ * sf_si, seltdclear() will know to ignore this si.
+ */
+ TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
+ sfp->sf_si = NULL;
+ stp = sfp->sf_td;
+ mtx_lock(&stp->st_mtx);
+ stp->st_flags |= SELTD_PENDING;
+ cv_broadcastpri(&stp->st_wait, pri);
+ mtx_unlock(&stp->st_mtx);
+ }
+ mtx_unlock(sip->si_mtx);
+}
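+
+/*
+ * Editor's note: an illustrative sketch, not part of the original change,
+ * of the producer side that pairs with the hypothetical mydev_poll() sketch
+ * after selrecord() above.  When data arrives the driver wakes the recorded
+ * waiters, which ends up in doselwakeup().
+ */
+#if 0
+static void
+mydev_data_arrived(struct mydev_softc *sc, int nbytes)
+{
+	mtx_lock(&sc->sc_mtx);
+	sc->sc_nread += nbytes;
+	/* Marks the waiters' seltd SELTD_PENDING and broadcasts st_wait. */
+	selwakeup(&sc->sc_rsel);
+	mtx_unlock(&sc->sc_mtx);
+}
+#endif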
+
+static void
+seltdinit(struct thread *td)
+{
+ struct seltd *stp;
+
+ if ((stp = td->td_sel) != NULL)
+ goto out;
+ td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
+ mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
+ cv_init(&stp->st_wait, "select");
+out:
+ stp->st_flags = 0;
+ STAILQ_INIT(&stp->st_selq);
+}
+
+static int
+seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision)
+{
+ struct seltd *stp;
+ int error;
+
+ stp = td->td_sel;
+ /*
+ * An event of interest may occur while we do not hold the seltd
+ * locked so check the pending flag before we sleep.
+ */
+ mtx_lock(&stp->st_mtx);
+ /*
+ * Any further calls to selrecord will be a rescan.
+ */
+ stp->st_flags |= SELTD_RESCAN;
+ if (stp->st_flags & SELTD_PENDING) {
+ mtx_unlock(&stp->st_mtx);
+ return (0);
+ }
+ if (sbt == 0)
+ error = EWOULDBLOCK;
+ else if (sbt != -1)
+ error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx,
+ sbt, precision, C_ABSOLUTE);
+ else
+ error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
+ mtx_unlock(&stp->st_mtx);
+
+ return (error);
+}
+
+void
+seltdfini(struct thread *td)
+{
+ struct seltd *stp;
+
+ stp = td->td_sel;
+ if (stp == NULL)
+ return;
+ if (stp->st_free1)
+ uma_zfree(selfd_zone, stp->st_free1);
+ if (stp->st_free2)
+ uma_zfree(selfd_zone, stp->st_free2);
+ td->td_sel = NULL;
+ free(stp, M_SELECT);
+}
+
+/*
+ * Remove the references to the thread from all of the objects we were
+ * polling.
+ */
+static void
+seltdclear(struct thread *td)
+{
+ struct seltd *stp;
+ struct selfd *sfp;
+ struct selfd *sfn;
+
+ stp = td->td_sel;
+ STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
+ selfdfree(stp, sfp);
+ stp->st_flags = 0;
+}
+
+static void selectinit(void *);
+SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
+static void
+selectinit(void *dummy __unused)
+{
+
+ selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, 0);
+ mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
+}
diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c
new file mode 100644
index 0000000..76c295e
--- /dev/null
+++ b/sys/kern/sys_pipe.c
@@ -0,0 +1,1834 @@
+/*-
+ * Copyright (c) 1996 John S. Dyson
+ * Copyright (c) 2012 Giovanni Trematerra
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice immediately at the beginning of the file, without modification,
+ * this list of conditions, and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Absolutely no warranty of function or purpose is made by the author
+ * John S. Dyson.
+ * 4. Modifications may be freely made to this file if the above conditions
+ * are met.
+ */
+
+/*
+ * This file contains a high-performance replacement for the socket-based
+ * pipes scheme originally used in FreeBSD/4.4Lite. It does not support
+ * all features of sockets, but does do everything that pipes normally
+ * do.
+ */
+
+/*
+ * This code has two modes of operation, a small write mode and a large
+ * write mode. The small write mode acts like conventional pipes with
+ * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the
+ * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT
+ * and PIPE_SIZE in size, the sending process pins the underlying pages in
+ * memory, and the receiving process copies directly from these pinned pages
+ * in the sending process.
+ *
+ * If the sending process receives a signal, it is possible that it will
+ * go away, and certainly its address space can change, because control
+ * is returned back to the user-mode side. In that case, the pipe code
+ * arranges to copy the buffer supplied by the user process, to a pageable
+ * kernel buffer, and the receiving process will grab the data from the
+ * pageable kernel buffer. Since signals don't happen all that often,
+ * the copy operation is normally eliminated.
+ *
+ * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
+ * happen for small transfers so that the system will not spend all of
+ * its time context switching.
+ *
+ * In order to limit the resource use of pipes, two sysctls exist:
+ *
+ * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
+ * address space available to us in pipe_map. This value is normally
+ * autotuned, but may also be loader tuned.
+ *
+ * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
+ * memory in use by pipes.
+ *
+ * Based on how large pipekva is relative to maxpipekva, the following
+ * will happen:
+ *
+ * 0% - 50%:
+ * New pipes are given 16K of memory backing, pipes may dynamically
+ * grow to as large as 64K where needed.
+ * 50% - 75%:
+ * New pipes are given 4K (or PAGE_SIZE) of memory backing,
+ * existing pipes may NOT grow.
+ * 75% - 100%:
+ * New pipes are given 4K (or PAGE_SIZE) of memory backing,
+ * existing pipes will be shrunk down to 4K whenever possible.
+ *
+ * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If
+ * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
+ * resize which MUST occur for reverse-direction pipes when they are
+ * first used.
+ *
+ * Additional information about the current state of pipes may be obtained
+ * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
+ * and kern.ipc.piperesizefail.
+ *
+ * Locking rules: There are two locks present here: A mutex, used via
+ * PIPE_LOCK, and a flag, used via pipelock(). All locking is done via
+ * the flag, as mutexes cannot be held across uiomove.  The mutex
+ * exists only to guard access to the flag, and is not in itself a
+ * locking mechanism. Also note that there is only a single mutex for
+ * both directions of a pipe.
+ *
+ * As pipelock() may have to sleep before it can acquire the flag, it
+ * is important to reread all data after a call to pipelock(); everything
+ * in the structure may have changed.
+ */
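+
+/*
+ * Editor's note: a minimal sketch, not part of the original change, of the
+ * locking pattern described above and followed by pipe_read()/pipe_write()
+ * below: take the pair mutex, take the long-term flag lock, and drop the
+ * mutex around anything that may sleep in uiomove().  The function name is
+ * hypothetical and the uiomove() step is elided.
+ */
+#if 0
+static int
+pipe_io_pattern(struct pipe *cpipe, struct uio *uio)
+{
+	int error;
+
+	PIPE_LOCK(cpipe);
+	error = pipelock(cpipe, 1);	/* may sleep; catches signals */
+	if (error == 0) {
+		PIPE_UNLOCK(cpipe);
+		/* ... uiomove() to or from pipe_buffer would go here ... */
+		PIPE_LOCK(cpipe);
+		/* Reread pipe state here: it may have changed while unlocked. */
+		pipeunlock(cpipe);
+	}
+	PIPE_UNLOCK(cpipe);
+	return (error);
+}
+#endif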
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/ttycom.h>
+#include <sys/stat.h>
+#include <sys/malloc.h>
+#include <sys/poll.h>
+#include <sys/selinfo.h>
+#include <sys/signalvar.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/pipe.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/uio.h>
+#include <sys/event.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+/*
+ * Use this define if you want to disable *fancy* VM things. Expect an
+ * approx 30% decrease in transfer rate. This could be useful for
+ * NetBSD or OpenBSD.
+ */
+/* #define PIPE_NODIRECT */
+
+#define PIPE_PEER(pipe) \
+ (((pipe)->pipe_state & PIPE_NAMED) ? (pipe) : ((pipe)->pipe_peer))
+
+/*
+ * interfaces to the outside world
+ */
+static fo_rdwr_t pipe_read;
+static fo_rdwr_t pipe_write;
+static fo_truncate_t pipe_truncate;
+static fo_ioctl_t pipe_ioctl;
+static fo_poll_t pipe_poll;
+static fo_kqfilter_t pipe_kqfilter;
+static fo_stat_t pipe_stat;
+static fo_close_t pipe_close;
+static fo_chmod_t pipe_chmod;
+static fo_chown_t pipe_chown;
+
+struct fileops pipeops = {
+ .fo_read = pipe_read,
+ .fo_write = pipe_write,
+ .fo_truncate = pipe_truncate,
+ .fo_ioctl = pipe_ioctl,
+ .fo_poll = pipe_poll,
+ .fo_kqfilter = pipe_kqfilter,
+ .fo_stat = pipe_stat,
+ .fo_close = pipe_close,
+ .fo_chmod = pipe_chmod,
+ .fo_chown = pipe_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_flags = DFLAG_PASSABLE
+};
+
+static void filt_pipedetach(struct knote *kn);
+static void filt_pipedetach_notsup(struct knote *kn);
+static int filt_pipenotsup(struct knote *kn, long hint);
+static int filt_piperead(struct knote *kn, long hint);
+static int filt_pipewrite(struct knote *kn, long hint);
+
+static struct filterops pipe_nfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_pipedetach_notsup,
+ .f_event = filt_pipenotsup
+};
+static struct filterops pipe_rfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_pipedetach,
+ .f_event = filt_piperead
+};
+static struct filterops pipe_wfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_pipedetach,
+ .f_event = filt_pipewrite
+};
+
+/*
+ * Default pipe buffer size(s); these can be fairly large now because pipe
+ * space is pageable. The pipe code will try to maintain locality of
+ * reference for performance reasons, so small amounts of outstanding I/O
+ * will not wipe the cache.
+ */
+#define MINPIPESIZE (PIPE_SIZE/3)
+#define MAXPIPESIZE (2*PIPE_SIZE/3)
+
+static long amountpipekva;
+static int pipefragretry;
+static int pipeallocfail;
+static int piperesizefail;
+static int piperesizeallowed = 1;
+
+SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
+ &maxpipekva, 0, "Pipe KVA limit");
+SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
+ &amountpipekva, 0, "Pipe KVA usage");
+SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
+ &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
+SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
+ &pipeallocfail, 0, "Pipe allocation failures");
+SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
+ &piperesizefail, 0, "Pipe resize failures");
+SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
+ &piperesizeallowed, 0, "Pipe resizing allowed");
+
+static void pipeinit(void *dummy __unused);
+static void pipeclose(struct pipe *cpipe);
+static void pipe_free_kmem(struct pipe *cpipe);
+static int pipe_create(struct pipe *pipe, int backing);
+static int pipe_paircreate(struct thread *td, struct pipepair **p_pp);
+static __inline int pipelock(struct pipe *cpipe, int catch);
+static __inline void pipeunlock(struct pipe *cpipe);
+#ifndef PIPE_NODIRECT
+static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
+static void pipe_destroy_write_buffer(struct pipe *wpipe);
+static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
+static void pipe_clone_write_buffer(struct pipe *wpipe);
+#endif
+static int pipespace(struct pipe *cpipe, int size);
+static int pipespace_new(struct pipe *cpipe, int size);
+
+static int pipe_zone_ctor(void *mem, int size, void *arg, int flags);
+static int pipe_zone_init(void *mem, int size, int flags);
+static void pipe_zone_fini(void *mem, int size);
+
+static uma_zone_t pipe_zone;
+static struct unrhdr *pipeino_unr;
+static dev_t pipedev_ino;
+
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
+
+static void
+pipeinit(void *dummy __unused)
+{
+
+ pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
+ pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
+ UMA_ALIGN_PTR, 0);
+ KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
+ pipeino_unr = new_unrhdr(1, INT32_MAX, NULL);
+ KASSERT(pipeino_unr != NULL, ("pipe fake inodes not initialized"));
+ pipedev_ino = devfs_alloc_cdp_inode();
+ KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized"));
+}
+
+static int
+pipe_zone_ctor(void *mem, int size, void *arg, int flags)
+{
+ struct pipepair *pp;
+ struct pipe *rpipe, *wpipe;
+
+ KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
+
+ pp = (struct pipepair *)mem;
+
+ /*
+ * We zero both pipe endpoints to make sure all the kmem pointers
+ * are NULL, flag fields are zero'd, etc. We timestamp both
+ * endpoints with the same time.
+ */
+ rpipe = &pp->pp_rpipe;
+ bzero(rpipe, sizeof(*rpipe));
+ vfs_timestamp(&rpipe->pipe_ctime);
+ rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
+
+ wpipe = &pp->pp_wpipe;
+ bzero(wpipe, sizeof(*wpipe));
+ wpipe->pipe_ctime = rpipe->pipe_ctime;
+ wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
+
+ rpipe->pipe_peer = wpipe;
+ rpipe->pipe_pair = pp;
+ wpipe->pipe_peer = rpipe;
+ wpipe->pipe_pair = pp;
+
+ /*
+ * Mark both endpoints as present; they will later get free'd
+ * one at a time. When both are free'd, then the whole pair
+ * is released.
+ */
+ rpipe->pipe_present = PIPE_ACTIVE;
+ wpipe->pipe_present = PIPE_ACTIVE;
+
+ /*
+ * Eventually, the MAC Framework may initialize the label
+ * in ctor or init, but for now we do it elsewhere to avoid
+ * blocking in ctor or init.
+ */
+ pp->pp_label = NULL;
+
+ return (0);
+}
+
+static int
+pipe_zone_init(void *mem, int size, int flags)
+{
+ struct pipepair *pp;
+
+ KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
+
+ pp = (struct pipepair *)mem;
+
+ mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
+ return (0);
+}
+
+static void
+pipe_zone_fini(void *mem, int size)
+{
+ struct pipepair *pp;
+
+ KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
+
+ pp = (struct pipepair *)mem;
+
+ mtx_destroy(&pp->pp_mtx);
+}
+
+static int
+pipe_paircreate(struct thread *td, struct pipepair **p_pp)
+{
+ struct pipepair *pp;
+ struct pipe *rpipe, *wpipe;
+ int error;
+
+ *p_pp = pp = uma_zalloc(pipe_zone, M_WAITOK);
+#ifdef MAC
+ /*
+ * The MAC label is shared between the connected endpoints. As a
+ * result mac_pipe_init() and mac_pipe_create() are called once
+ * for the pair, and not on the endpoints.
+ */
+ mac_pipe_init(pp);
+ mac_pipe_create(td->td_ucred, pp);
+#endif
+ rpipe = &pp->pp_rpipe;
+ wpipe = &pp->pp_wpipe;
+
+ knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe));
+ knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe));
+
+ /* Only the forward direction pipe is backed by default */
+ if ((error = pipe_create(rpipe, 1)) != 0 ||
+ (error = pipe_create(wpipe, 0)) != 0) {
+ pipeclose(rpipe);
+ pipeclose(wpipe);
+ return (error);
+ }
+
+ rpipe->pipe_state |= PIPE_DIRECTOK;
+ wpipe->pipe_state |= PIPE_DIRECTOK;
+ return (0);
+}
+
+int
+pipe_named_ctor(struct pipe **ppipe, struct thread *td)
+{
+ struct pipepair *pp;
+ int error;
+
+ error = pipe_paircreate(td, &pp);
+ if (error != 0)
+ return (error);
+ pp->pp_rpipe.pipe_state |= PIPE_NAMED;
+ *ppipe = &pp->pp_rpipe;
+ return (0);
+}
+
+void
+pipe_dtor(struct pipe *dpipe)
+{
+ ino_t ino;
+
+ ino = dpipe->pipe_ino;
+ funsetown(&dpipe->pipe_sigio);
+ pipeclose(dpipe);
+ if (dpipe->pipe_state & PIPE_NAMED) {
+ dpipe = dpipe->pipe_peer;
+ funsetown(&dpipe->pipe_sigio);
+ pipeclose(dpipe);
+ }
+ if (ino != 0 && ino != (ino_t)-1)
+ free_unr(pipeino_unr, ino);
+}
+
+/*
+ * The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let
+ * the zone pick up the pieces via pipeclose().
+ */
+int
+kern_pipe(struct thread *td, int fildes[2])
+{
+
+ return (kern_pipe2(td, fildes, 0));
+}
+
+int
+kern_pipe2(struct thread *td, int fildes[2], int flags)
+{
+ struct filedesc *fdp;
+ struct file *rf, *wf;
+ struct pipe *rpipe, *wpipe;
+ struct pipepair *pp;
+ int fd, fflags, error;
+
+ fdp = td->td_proc->p_fd;
+ error = pipe_paircreate(td, &pp);
+ if (error != 0)
+ return (error);
+ rpipe = &pp->pp_rpipe;
+ wpipe = &pp->pp_wpipe;
+ error = falloc(td, &rf, &fd, flags);
+ if (error) {
+ pipeclose(rpipe);
+ pipeclose(wpipe);
+ return (error);
+ }
+ /* An extra reference on `rf' has been held for us by falloc(). */
+ fildes[0] = fd;
+
+ fflags = FREAD | FWRITE;
+ if ((flags & O_NONBLOCK) != 0)
+ fflags |= FNONBLOCK;
+
+ /*
+ * Warning: once we've gotten past allocation of the fd for the
+ * read-side, we can only drop the read side via fdrop() in order
+ * to avoid races against processes which manage to dup() the read
+ * side while we are blocked trying to allocate the write side.
+ */
+ finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops);
+ error = falloc(td, &wf, &fd, flags);
+ if (error) {
+ fdclose(fdp, rf, fildes[0], td);
+ fdrop(rf, td);
+ /* rpipe has been closed by fdrop(). */
+ pipeclose(wpipe);
+ return (error);
+ }
+ /* An extra reference on `wf' has been held for us by falloc(). */
+ finit(wf, fflags, DTYPE_PIPE, wpipe, &pipeops);
+ fdrop(wf, td);
+ fildes[1] = fd;
+ fdrop(rf, td);
+
+ return (0);
+}
+
+/* ARGSUSED */
+int
+sys_pipe(struct thread *td, struct pipe_args *uap)
+{
+ int error;
+ int fildes[2];
+
+ error = kern_pipe(td, fildes);
+ if (error)
+ return (error);
+
+ td->td_retval[0] = fildes[0];
+ td->td_retval[1] = fildes[1];
+
+ return (0);
+}
+
+int
+sys_pipe2(struct thread *td, struct pipe2_args *uap)
+{
+ int error, fildes[2];
+
+ if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK))
+ return (EINVAL);
+ error = kern_pipe2(td, fildes, uap->flags);
+ if (error)
+ return (error);
+ error = copyout(fildes, uap->fildes, 2 * sizeof(int));
+ if (error) {
+ (void)kern_close(td, fildes[0]);
+ (void)kern_close(td, fildes[1]);
+ }
+ return (error);
+}
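+
+/*
+ * Editor's note: an illustrative userland sketch, not part of the original
+ * change, showing the flags sys_pipe2() accepts.  The function name is
+ * hypothetical.
+ */
+#if 0
+#include <fcntl.h>
+#include <unistd.h>
+
+int
+make_cloexec_pipe(int fds[2])
+{
+	/*
+	 * Only O_CLOEXEC and O_NONBLOCK are allowed; any other flag makes
+	 * sys_pipe2() return EINVAL before the pipe is created.
+	 */
+	return (pipe2(fds, O_CLOEXEC | O_NONBLOCK));
+}
+#endif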
+
+/*
+ * Allocate kva for the pipe circular buffer; the space is pageable.
+ * This routine will 'realloc' the size of a pipe safely: if it fails,
+ * it will retain the old buffer and return ENOMEM.
+ */
+static int
+pipespace_new(cpipe, size)
+ struct pipe *cpipe;
+ int size;
+{
+ caddr_t buffer;
+ int error, cnt, firstseg;
+ static int curfail = 0;
+ static struct timeval lastfail;
+
+ KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
+ KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
+ ("pipespace: resize of direct writes not allowed"));
+retry:
+ cnt = cpipe->pipe_buffer.cnt;
+ if (cnt > size)
+ size = cnt;
+
+ size = round_page(size);
+ buffer = (caddr_t) vm_map_min(pipe_map);
+
+ error = vm_map_find(pipe_map, NULL, 0,
+ (vm_offset_t *) &buffer, size, 1,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error != KERN_SUCCESS) {
+ if ((cpipe->pipe_buffer.buffer == NULL) &&
+ (size > SMALL_PIPE_SIZE)) {
+ size = SMALL_PIPE_SIZE;
+ pipefragretry++;
+ goto retry;
+ }
+ if (cpipe->pipe_buffer.buffer == NULL) {
+ pipeallocfail++;
+ if (ppsratecheck(&lastfail, &curfail, 1))
+ printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
+ } else {
+ piperesizefail++;
+ }
+ return (ENOMEM);
+ }
+
+ /* copy data, then free old resources if we're resizing */
+ if (cnt > 0) {
+ if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
+ firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
+ bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
+ buffer, firstseg);
+ if ((cnt - firstseg) > 0)
+ bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
+ cpipe->pipe_buffer.in);
+ } else {
+ bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
+ buffer, cnt);
+ }
+ }
+ pipe_free_kmem(cpipe);
+ cpipe->pipe_buffer.buffer = buffer;
+ cpipe->pipe_buffer.size = size;
+ cpipe->pipe_buffer.in = cnt;
+ cpipe->pipe_buffer.out = 0;
+ cpipe->pipe_buffer.cnt = cnt;
+ atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size);
+ return (0);
+}
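+
+/*
+ * Editor's note: a worked example, not part of the original change, of the
+ * wrapped-data copy above.  Suppose the old buffer has size 16384 with
+ * out = 12288, in = 2048 and cnt = 6144, i.e. the data wraps past the end:
+ *
+ *	firstseg = 16384 - 12288 = 4096 bytes copied from old[12288..16384)
+ *	in       = 2048          bytes copied from old[0..2048)
+ *
+ * land back to back at the start of the new buffer, after which the pipe
+ * is left with in = cnt = 6144 and out = 0, so the data has been
+ * linearized as a side effect of the resize.
+ */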
+
+/*
+ * Wrapper for pipespace_new() that performs locking assertions.
+ */
+static int
+pipespace(cpipe, size)
+ struct pipe *cpipe;
+ int size;
+{
+
+ KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
+ ("Unlocked pipe passed to pipespace"));
+ return (pipespace_new(cpipe, size));
+}
+
+/*
+ * lock a pipe for I/O, blocking other access
+ */
+static __inline int
+pipelock(cpipe, catch)
+ struct pipe *cpipe;
+ int catch;
+{
+ int error;
+
+ PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
+ while (cpipe->pipe_state & PIPE_LOCKFL) {
+ cpipe->pipe_state |= PIPE_LWANT;
+ error = msleep(cpipe, PIPE_MTX(cpipe),
+ catch ? (PRIBIO | PCATCH) : PRIBIO,
+ "pipelk", 0);
+ if (error != 0)
+ return (error);
+ }
+ cpipe->pipe_state |= PIPE_LOCKFL;
+ return (0);
+}
+
+/*
+ * unlock a pipe I/O lock
+ */
+static __inline void
+pipeunlock(cpipe)
+ struct pipe *cpipe;
+{
+
+ PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
+ KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
+ ("Unlocked pipe passed to pipeunlock"));
+ cpipe->pipe_state &= ~PIPE_LOCKFL;
+ if (cpipe->pipe_state & PIPE_LWANT) {
+ cpipe->pipe_state &= ~PIPE_LWANT;
+ wakeup(cpipe);
+ }
+}
+
+void
+pipeselwakeup(cpipe)
+ struct pipe *cpipe;
+{
+
+ PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
+ if (cpipe->pipe_state & PIPE_SEL) {
+ selwakeuppri(&cpipe->pipe_sel, PSOCK);
+ if (!SEL_WAITING(&cpipe->pipe_sel))
+ cpipe->pipe_state &= ~PIPE_SEL;
+ }
+ if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
+ pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
+ KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
+}
+
+/*
+ * Initialize and allocate VM and memory for pipe. The structure
+ * will start out zero'd from the ctor, so we just manage the kmem.
+ */
+static int
+pipe_create(pipe, backing)
+ struct pipe *pipe;
+ int backing;
+{
+ int error;
+
+ if (backing) {
+ if (amountpipekva > maxpipekva / 2)
+ error = pipespace_new(pipe, SMALL_PIPE_SIZE);
+ else
+ error = pipespace_new(pipe, PIPE_SIZE);
+ } else {
+ /* If we're not backing this pipe, no need to do anything. */
+ error = 0;
+ }
+ pipe->pipe_ino = -1;
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+pipe_read(fp, uio, active_cred, flags, td)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *active_cred;
+ struct thread *td;
+ int flags;
+{
+ struct pipe *rpipe;
+ int error;
+ int nread = 0;
+ int size;
+
+ rpipe = fp->f_data;
+ PIPE_LOCK(rpipe);
+ ++rpipe->pipe_busy;
+ error = pipelock(rpipe, 1);
+ if (error)
+ goto unlocked_error;
+
+#ifdef MAC
+ error = mac_pipe_check_read(active_cred, rpipe->pipe_pair);
+ if (error)
+ goto locked_error;
+#endif
+ if (amountpipekva > (3 * maxpipekva) / 4) {
+ if (!(rpipe->pipe_state & PIPE_DIRECTW) &&
+ (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
+ (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
+ (piperesizeallowed == 1)) {
+ PIPE_UNLOCK(rpipe);
+ pipespace(rpipe, SMALL_PIPE_SIZE);
+ PIPE_LOCK(rpipe);
+ }
+ }
+
+ while (uio->uio_resid) {
+ /*
+ * normal pipe buffer receive
+ */
+ if (rpipe->pipe_buffer.cnt > 0) {
+ size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
+ if (size > rpipe->pipe_buffer.cnt)
+ size = rpipe->pipe_buffer.cnt;
+ if (size > uio->uio_resid)
+ size = uio->uio_resid;
+
+ PIPE_UNLOCK(rpipe);
+ error = uiomove(
+ &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
+ size, uio);
+ PIPE_LOCK(rpipe);
+ if (error)
+ break;
+
+ rpipe->pipe_buffer.out += size;
+ if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
+ rpipe->pipe_buffer.out = 0;
+
+ rpipe->pipe_buffer.cnt -= size;
+
+ /*
+ * If there is no more to read in the pipe, reset
+ * its pointers to the beginning. This improves
+ * cache hit stats.
+ */
+ if (rpipe->pipe_buffer.cnt == 0) {
+ rpipe->pipe_buffer.in = 0;
+ rpipe->pipe_buffer.out = 0;
+ }
+ nread += size;
+#ifndef PIPE_NODIRECT
+ /*
+ * Direct copy, bypassing a kernel buffer.
+ */
+ } else if ((size = rpipe->pipe_map.cnt) &&
+ (rpipe->pipe_state & PIPE_DIRECTW)) {
+ if (size > uio->uio_resid)
+ size = (u_int) uio->uio_resid;
+
+ PIPE_UNLOCK(rpipe);
+ error = uiomove_fromphys(rpipe->pipe_map.ms,
+ rpipe->pipe_map.pos, size, uio);
+ PIPE_LOCK(rpipe);
+ if (error)
+ break;
+ nread += size;
+ rpipe->pipe_map.pos += size;
+ rpipe->pipe_map.cnt -= size;
+ if (rpipe->pipe_map.cnt == 0) {
+ rpipe->pipe_state &= ~(PIPE_DIRECTW|PIPE_WANTW);
+ wakeup(rpipe);
+ }
+#endif
+ } else {
+ /*
+ * detect EOF condition
+ * read returns 0 on EOF, no need to set error
+ */
+ if (rpipe->pipe_state & PIPE_EOF)
+ break;
+
+ /*
+ * If the "write-side" has been blocked, wake it up now.
+ */
+ if (rpipe->pipe_state & PIPE_WANTW) {
+ rpipe->pipe_state &= ~PIPE_WANTW;
+ wakeup(rpipe);
+ }
+
+ /*
+ * Break if some data was read.
+ */
+ if (nread > 0)
+ break;
+
+ /*
+ * Unlock the pipe buffer for our remaining processing.
+ * We will either break out with an error or we will
+ * sleep and relock to loop.
+ */
+ pipeunlock(rpipe);
+
+ /*
+ * Handle non-blocking mode operation or
+ * wait for more data.
+ */
+ if (fp->f_flag & FNONBLOCK) {
+ error = EAGAIN;
+ } else {
+ rpipe->pipe_state |= PIPE_WANTR;
+ if ((error = msleep(rpipe, PIPE_MTX(rpipe),
+ PRIBIO | PCATCH,
+ "piperd", 0)) == 0)
+ error = pipelock(rpipe, 1);
+ }
+ if (error)
+ goto unlocked_error;
+ }
+ }
+#ifdef MAC
+locked_error:
+#endif
+ pipeunlock(rpipe);
+
+ /* XXX: should probably do this before getting any locks. */
+ if (error == 0)
+ vfs_timestamp(&rpipe->pipe_atime);
+unlocked_error:
+ --rpipe->pipe_busy;
+
+ /*
+ * PIPE_WANT processing only makes sense if pipe_busy is 0.
+ */
+ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
+ rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
+ wakeup(rpipe);
+ } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
+ /*
+ * Handle write blocking hysteresis.
+ */
+ if (rpipe->pipe_state & PIPE_WANTW) {
+ rpipe->pipe_state &= ~PIPE_WANTW;
+ wakeup(rpipe);
+ }
+ }
+
+ if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
+ pipeselwakeup(rpipe);
+
+ PIPE_UNLOCK(rpipe);
+ return (error);
+}
+
+#ifndef PIPE_NODIRECT
+/*
+ * Map the sending process's buffer into kernel space and wire it.
+ * This is similar to a physical write operation.
+ */
+static int
+pipe_build_write_buffer(wpipe, uio)
+ struct pipe *wpipe;
+ struct uio *uio;
+{
+ u_int size;
+ int i;
+
+ PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
+ KASSERT(wpipe->pipe_state & PIPE_DIRECTW,
+ ("Clone attempt on non-direct write pipe!"));
+
+ if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size)
+ size = wpipe->pipe_buffer.size;
+ else
+ size = uio->uio_iov->iov_len;
+
+ if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
+ (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ,
+ wpipe->pipe_map.ms, PIPENPAGES)) < 0)
+ return (EFAULT);
+
+	/* Set up the control block. */
+ wpipe->pipe_map.npages = i;
+ wpipe->pipe_map.pos =
+ ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
+ wpipe->pipe_map.cnt = size;
+
+	/*
+	 * And update the uio data.
+	 */
+ uio->uio_iov->iov_len -= size;
+ uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
+ if (uio->uio_iov->iov_len == 0)
+ uio->uio_iov++;
+ uio->uio_resid -= size;
+ uio->uio_offset += size;
+ return (0);
+}
+
+/*
+ * unmap and unwire the process buffer
+ */
+static void
+pipe_destroy_write_buffer(wpipe)
+ struct pipe *wpipe;
+{
+
+ PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
+ vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages);
+ wpipe->pipe_map.npages = 0;
+}
+
+/*
+ * In the case of a signal, the writing process might go away. This
+ * code copies the data into the circular buffer so that the source
+ * pages can be freed without loss of data.
+ */
+static void
+pipe_clone_write_buffer(wpipe)
+ struct pipe *wpipe;
+{
+ struct uio uio;
+ struct iovec iov;
+ int size;
+ int pos;
+
+ PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
+ size = wpipe->pipe_map.cnt;
+ pos = wpipe->pipe_map.pos;
+
+ wpipe->pipe_buffer.in = size;
+ wpipe->pipe_buffer.out = 0;
+ wpipe->pipe_buffer.cnt = size;
+ wpipe->pipe_state &= ~PIPE_DIRECTW;
+
+ PIPE_UNLOCK(wpipe);
+ iov.iov_base = wpipe->pipe_buffer.buffer;
+ iov.iov_len = size;
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = 0;
+ uio.uio_resid = size;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_rw = UIO_READ;
+ uio.uio_td = curthread;
+ uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
+ PIPE_LOCK(wpipe);
+ pipe_destroy_write_buffer(wpipe);
+}
+
+/*
+ * This implements the pipe buffer write mechanism. Note that only
+ * a direct write OR a normal pipe write can be pending at any given time.
+ * If there are any characters in the pipe buffer, the direct write will
+ * be deferred until the receiving process grabs all of the bytes from
+ * the pipe buffer. Then the direct mapping write is set-up.
+ */
+static int
+pipe_direct_write(wpipe, uio)
+ struct pipe *wpipe;
+ struct uio *uio;
+{
+ int error;
+
+retry:
+ PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
+ error = pipelock(wpipe, 1);
+ if (wpipe->pipe_state & PIPE_EOF)
+ error = EPIPE;
+ if (error) {
+ pipeunlock(wpipe);
+ goto error1;
+ }
+ while (wpipe->pipe_state & PIPE_DIRECTW) {
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ pipeselwakeup(wpipe);
+ wpipe->pipe_state |= PIPE_WANTW;
+ pipeunlock(wpipe);
+ error = msleep(wpipe, PIPE_MTX(wpipe),
+ PRIBIO | PCATCH, "pipdww", 0);
+ if (error)
+ goto error1;
+ else
+ goto retry;
+ }
+ wpipe->pipe_map.cnt = 0; /* transfer not ready yet */
+ if (wpipe->pipe_buffer.cnt > 0) {
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ pipeselwakeup(wpipe);
+ wpipe->pipe_state |= PIPE_WANTW;
+ pipeunlock(wpipe);
+ error = msleep(wpipe, PIPE_MTX(wpipe),
+ PRIBIO | PCATCH, "pipdwc", 0);
+ if (error)
+ goto error1;
+ else
+ goto retry;
+ }
+
+ wpipe->pipe_state |= PIPE_DIRECTW;
+
+ PIPE_UNLOCK(wpipe);
+ error = pipe_build_write_buffer(wpipe, uio);
+ PIPE_LOCK(wpipe);
+ if (error) {
+ wpipe->pipe_state &= ~PIPE_DIRECTW;
+ pipeunlock(wpipe);
+ goto error1;
+ }
+
+ error = 0;
+ while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
+ if (wpipe->pipe_state & PIPE_EOF) {
+ pipe_destroy_write_buffer(wpipe);
+ pipeselwakeup(wpipe);
+ pipeunlock(wpipe);
+ error = EPIPE;
+ goto error1;
+ }
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ pipeselwakeup(wpipe);
+ wpipe->pipe_state |= PIPE_WANTW;
+ pipeunlock(wpipe);
+ error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
+ "pipdwt", 0);
+ pipelock(wpipe, 0);
+ }
+
+ if (wpipe->pipe_state & PIPE_EOF)
+ error = EPIPE;
+ if (wpipe->pipe_state & PIPE_DIRECTW) {
+ /*
+		 * This bit of trickery substitutes a kernel buffer for
+		 * the user pages of a writer that might be going away.
+ */
+ pipe_clone_write_buffer(wpipe);
+ } else {
+ pipe_destroy_write_buffer(wpipe);
+ }
+ pipeunlock(wpipe);
+ return (error);
+
+error1:
+ wakeup(wpipe);
+ return (error);
+}
+#endif
+
+static int
+pipe_write(fp, uio, active_cred, flags, td)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *active_cred;
+ struct thread *td;
+ int flags;
+{
+ int error = 0;
+ int desiredsize;
+ ssize_t orig_resid;
+ struct pipe *wpipe, *rpipe;
+
+ rpipe = fp->f_data;
+ wpipe = PIPE_PEER(rpipe);
+ PIPE_LOCK(rpipe);
+ error = pipelock(wpipe, 1);
+ if (error) {
+ PIPE_UNLOCK(rpipe);
+ return (error);
+ }
+ /*
+ * detect loss of pipe read side, issue SIGPIPE if lost.
+ */
+ if (wpipe->pipe_present != PIPE_ACTIVE ||
+ (wpipe->pipe_state & PIPE_EOF)) {
+ pipeunlock(wpipe);
+ PIPE_UNLOCK(rpipe);
+ return (EPIPE);
+ }
+#ifdef MAC
+ error = mac_pipe_check_write(active_cred, wpipe->pipe_pair);
+ if (error) {
+ pipeunlock(wpipe);
+ PIPE_UNLOCK(rpipe);
+ return (error);
+ }
+#endif
+ ++wpipe->pipe_busy;
+
+ /* Choose a larger size if it's advantageous */
+ desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
+ while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
+ if (piperesizeallowed != 1)
+ break;
+ if (amountpipekva > maxpipekva / 2)
+ break;
+ if (desiredsize == BIG_PIPE_SIZE)
+ break;
+ desiredsize = desiredsize * 2;
+ }
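+
+	/*
+	 * Illustrative note (not part of the original code): assuming the
+	 * usual 16K default pipe buffer and a 64K BIG_PIPE_SIZE, an 80K
+	 * pending write makes desiredsize double 16K -> 32K -> 64K and
+	 * then stop at the BIG_PIPE_SIZE check above.
+	 */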
+
+	/* Choose a smaller size if we're in an OOM situation */
+ if ((amountpipekva > (3 * maxpipekva) / 4) &&
+ (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
+ (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
+ (piperesizeallowed == 1))
+ desiredsize = SMALL_PIPE_SIZE;
+
+ /* Resize if the above determined that a new size was necessary */
+ if ((desiredsize != wpipe->pipe_buffer.size) &&
+ ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) {
+ PIPE_UNLOCK(wpipe);
+ pipespace(wpipe, desiredsize);
+ PIPE_LOCK(wpipe);
+ }
+ if (wpipe->pipe_buffer.size == 0) {
+ /*
+ * This can only happen for reverse direction use of pipes
+ * in a complete OOM situation.
+ */
+ error = ENOMEM;
+ --wpipe->pipe_busy;
+ pipeunlock(wpipe);
+ PIPE_UNLOCK(wpipe);
+ return (error);
+ }
+
+ pipeunlock(wpipe);
+
+ orig_resid = uio->uio_resid;
+
+ while (uio->uio_resid) {
+ int space;
+
+ pipelock(wpipe, 0);
+ if (wpipe->pipe_state & PIPE_EOF) {
+ pipeunlock(wpipe);
+ error = EPIPE;
+ break;
+ }
+#ifndef PIPE_NODIRECT
+ /*
+ * If the transfer is large, we can gain performance if
+ * we do process-to-process copies directly.
+ * If the write is non-blocking, we don't use the
+ * direct write mechanism.
+ *
+ * The direct write mechanism will detect the reader going
+ * away on us.
+ */
+ if (uio->uio_segflg == UIO_USERSPACE &&
+ uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
+ wpipe->pipe_buffer.size >= PIPE_MINDIRECT &&
+ (fp->f_flag & FNONBLOCK) == 0) {
+ pipeunlock(wpipe);
+ error = pipe_direct_write(wpipe, uio);
+ if (error)
+ break;
+ continue;
+ }
+#endif
+
+ /*
+		 * Pipe buffered writes cannot coexist with
+		 * direct writes.  We wait until the currently executing
+		 * direct write has completed before we start filling the
+		 * pipe buffer.  We break out if a signal occurs or the
+ * reader goes away.
+ */
+ if (wpipe->pipe_state & PIPE_DIRECTW) {
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ pipeselwakeup(wpipe);
+ wpipe->pipe_state |= PIPE_WANTW;
+ pipeunlock(wpipe);
+ error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
+ "pipbww", 0);
+ if (error)
+ break;
+ else
+ continue;
+ }
+
+ space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
+
+ /* Writes of size <= PIPE_BUF must be atomic. */
+ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
+ space = 0;
+
+ if (space > 0) {
+ int size; /* Transfer size */
+ int segsize; /* first segment to transfer */
+
+ /*
+ * Transfer size is minimum of uio transfer
+ * and free space in pipe buffer.
+ */
+ if (space > uio->uio_resid)
+ size = uio->uio_resid;
+ else
+ size = space;
+ /*
+ * First segment to transfer is minimum of
+ * transfer size and contiguous space in
+ * pipe buffer. If first segment to transfer
+ * is less than the transfer size, we've got
+ * a wraparound in the buffer.
+ */
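+			/*
+			 * Illustrative example (not from the original code):
+			 * with a 16K buffer, in = 12K and a 6K transfer,
+			 * segsize is clamped to 4K here; the remaining 2K is
+			 * copied to the start of the buffer by the wraparound
+			 * path below.
+			 */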
+ segsize = wpipe->pipe_buffer.size -
+ wpipe->pipe_buffer.in;
+ if (segsize > size)
+ segsize = size;
+
+ /* Transfer first segment */
+
+ PIPE_UNLOCK(rpipe);
+ error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
+ segsize, uio);
+ PIPE_LOCK(rpipe);
+
+ if (error == 0 && segsize < size) {
+ KASSERT(wpipe->pipe_buffer.in + segsize ==
+ wpipe->pipe_buffer.size,
+ ("Pipe buffer wraparound disappeared"));
+ /*
+ * Transfer remaining part now, to
+ * support atomic writes. Wraparound
+ * happened.
+ */
+
+ PIPE_UNLOCK(rpipe);
+ error = uiomove(
+ &wpipe->pipe_buffer.buffer[0],
+ size - segsize, uio);
+ PIPE_LOCK(rpipe);
+ }
+ if (error == 0) {
+ wpipe->pipe_buffer.in += size;
+ if (wpipe->pipe_buffer.in >=
+ wpipe->pipe_buffer.size) {
+ KASSERT(wpipe->pipe_buffer.in ==
+ size - segsize +
+ wpipe->pipe_buffer.size,
+ ("Expected wraparound bad"));
+ wpipe->pipe_buffer.in = size - segsize;
+ }
+
+ wpipe->pipe_buffer.cnt += size;
+ KASSERT(wpipe->pipe_buffer.cnt <=
+ wpipe->pipe_buffer.size,
+ ("Pipe buffer overflow"));
+ }
+ pipeunlock(wpipe);
+ if (error != 0)
+ break;
+ } else {
+ /*
+ * If the "read-side" has been blocked, wake it up now.
+ */
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+
+ /*
+ * don't block on non-blocking I/O
+ */
+ if (fp->f_flag & FNONBLOCK) {
+ error = EAGAIN;
+ pipeunlock(wpipe);
+ break;
+ }
+
+ /*
+ * We have no more space and have something to offer,
+ * wake up select/poll.
+ */
+ pipeselwakeup(wpipe);
+
+ wpipe->pipe_state |= PIPE_WANTW;
+ pipeunlock(wpipe);
+ error = msleep(wpipe, PIPE_MTX(rpipe),
+ PRIBIO | PCATCH, "pipewr", 0);
+ if (error != 0)
+ break;
+ }
+ }
+
+ pipelock(wpipe, 0);
+ --wpipe->pipe_busy;
+
+ if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
+ wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
+ wakeup(wpipe);
+ } else if (wpipe->pipe_buffer.cnt > 0) {
+ /*
+ * If we have put any characters in the buffer, we wake up
+ * the reader.
+ */
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ }
+
+ /*
+ * Don't return EPIPE if I/O was successful
+ */
+ if ((wpipe->pipe_buffer.cnt == 0) &&
+ (uio->uio_resid == 0) &&
+ (error == EPIPE)) {
+ error = 0;
+ }
+
+ if (error == 0)
+ vfs_timestamp(&wpipe->pipe_mtime);
+
+ /*
+ * We have something to offer,
+ * wake up select/poll.
+ */
+ if (wpipe->pipe_buffer.cnt)
+ pipeselwakeup(wpipe);
+
+ pipeunlock(wpipe);
+ PIPE_UNLOCK(rpipe);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+pipe_truncate(fp, length, active_cred, td)
+ struct file *fp;
+ off_t length;
+ struct ucred *active_cred;
+ struct thread *td;
+{
+
+ /* For named pipes call the vnode operation. */
+ if (fp->f_vnode != NULL)
+ return (vnops.fo_truncate(fp, length, active_cred, td));
+ return (EINVAL);
+}
+
+/*
+ * we implement a very minimal set of ioctls for compatibility with sockets.
+ */
+static int
+pipe_ioctl(fp, cmd, data, active_cred, td)
+ struct file *fp;
+ u_long cmd;
+ void *data;
+ struct ucred *active_cred;
+ struct thread *td;
+{
+ struct pipe *mpipe = fp->f_data;
+ int error;
+
+ PIPE_LOCK(mpipe);
+
+#ifdef MAC
+ error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
+ if (error) {
+ PIPE_UNLOCK(mpipe);
+ return (error);
+ }
+#endif
+
+ error = 0;
+ switch (cmd) {
+
+ case FIONBIO:
+ break;
+
+ case FIOASYNC:
+ if (*(int *)data) {
+ mpipe->pipe_state |= PIPE_ASYNC;
+ } else {
+ mpipe->pipe_state &= ~PIPE_ASYNC;
+ }
+ break;
+
+ case FIONREAD:
+ if (!(fp->f_flag & FREAD)) {
+ *(int *)data = 0;
+ PIPE_UNLOCK(mpipe);
+ return (0);
+ }
+ if (mpipe->pipe_state & PIPE_DIRECTW)
+ *(int *)data = mpipe->pipe_map.cnt;
+ else
+ *(int *)data = mpipe->pipe_buffer.cnt;
+ break;
+
+ case FIOSETOWN:
+ PIPE_UNLOCK(mpipe);
+ error = fsetown(*(int *)data, &mpipe->pipe_sigio);
+ goto out_unlocked;
+
+ case FIOGETOWN:
+ *(int *)data = fgetown(&mpipe->pipe_sigio);
+ break;
+
+ /* This is deprecated, FIOSETOWN should be used instead. */
+ case TIOCSPGRP:
+ PIPE_UNLOCK(mpipe);
+ error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
+ goto out_unlocked;
+
+ /* This is deprecated, FIOGETOWN should be used instead. */
+ case TIOCGPGRP:
+ *(int *)data = -fgetown(&mpipe->pipe_sigio);
+ break;
+
+ default:
+ error = ENOTTY;
+ break;
+ }
+ PIPE_UNLOCK(mpipe);
+out_unlocked:
+ return (error);
+}
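+
+/*
+ * Illustrative userland sketch (not part of this change): the FIONREAD case
+ * above can be exercised as follows, where pfd[] came from pipe(2):
+ *
+ *	int nbytes;
+ *
+ *	if (ioctl(pfd[0], FIONREAD, &nbytes) == 0)
+ *		printf("%d bytes buffered\n", nbytes);
+ */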
+
+static int
+pipe_poll(fp, events, active_cred, td)
+ struct file *fp;
+ int events;
+ struct ucred *active_cred;
+ struct thread *td;
+{
+ struct pipe *rpipe;
+ struct pipe *wpipe;
+ int levents, revents;
+#ifdef MAC
+ int error;
+#endif
+
+ revents = 0;
+ rpipe = fp->f_data;
+ wpipe = PIPE_PEER(rpipe);
+ PIPE_LOCK(rpipe);
+#ifdef MAC
+ error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair);
+ if (error)
+ goto locked_error;
+#endif
+ if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM))
+ if ((rpipe->pipe_state & PIPE_DIRECTW) ||
+ (rpipe->pipe_buffer.cnt > 0))
+ revents |= events & (POLLIN | POLLRDNORM);
+
+ if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM))
+ if (wpipe->pipe_present != PIPE_ACTIVE ||
+ (wpipe->pipe_state & PIPE_EOF) ||
+ (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
+ ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF ||
+ wpipe->pipe_buffer.size == 0)))
+ revents |= events & (POLLOUT | POLLWRNORM);
+
+ levents = events &
+ (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND);
+ if (rpipe->pipe_state & PIPE_NAMED && fp->f_flag & FREAD && levents &&
+ fp->f_seqcount == rpipe->pipe_wgen)
+ events |= POLLINIGNEOF;
+
+ if ((events & POLLINIGNEOF) == 0) {
+ if (rpipe->pipe_state & PIPE_EOF) {
+ revents |= (events & (POLLIN | POLLRDNORM));
+ if (wpipe->pipe_present != PIPE_ACTIVE ||
+ (wpipe->pipe_state & PIPE_EOF))
+ revents |= POLLHUP;
+ }
+ }
+
+ if (revents == 0) {
+ if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM)) {
+ selrecord(td, &rpipe->pipe_sel);
+ if (SEL_WAITING(&rpipe->pipe_sel))
+ rpipe->pipe_state |= PIPE_SEL;
+ }
+
+ if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM)) {
+ selrecord(td, &wpipe->pipe_sel);
+ if (SEL_WAITING(&wpipe->pipe_sel))
+ wpipe->pipe_state |= PIPE_SEL;
+ }
+ }
+#ifdef MAC
+locked_error:
+#endif
+ PIPE_UNLOCK(rpipe);
+
+ return (revents);
+}
+
+/*
+ * We shouldn't need locks here as we're doing a read and this should
+ * be a natural race.
+ */
+static int
+pipe_stat(fp, ub, active_cred, td)
+ struct file *fp;
+ struct stat *ub;
+ struct ucred *active_cred;
+ struct thread *td;
+{
+ struct pipe *pipe;
+ int new_unr;
+#ifdef MAC
+ int error;
+#endif
+
+ pipe = fp->f_data;
+ PIPE_LOCK(pipe);
+#ifdef MAC
+ error = mac_pipe_check_stat(active_cred, pipe->pipe_pair);
+ if (error) {
+ PIPE_UNLOCK(pipe);
+ return (error);
+ }
+#endif
+
+ /* For named pipes ask the underlying filesystem. */
+ if (pipe->pipe_state & PIPE_NAMED) {
+ PIPE_UNLOCK(pipe);
+ return (vnops.fo_stat(fp, ub, active_cred, td));
+ }
+
+ /*
+ * Lazily allocate an inode number for the pipe. Most pipe
+	 * users do not call fstat(2) on the pipe, so postponing the
+	 * inode allocation until it must be returned to userland is
+	 * useful.  If alloc_unr failed,
+ * assign st_ino zero instead of returning an error.
+ * Special pipe_ino values:
+ * -1 - not yet initialized;
+ * 0 - alloc_unr failed, return 0 as st_ino forever.
+ */
+ if (pipe->pipe_ino == (ino_t)-1) {
+ new_unr = alloc_unr(pipeino_unr);
+ if (new_unr != -1)
+ pipe->pipe_ino = new_unr;
+ else
+ pipe->pipe_ino = 0;
+ }
+ PIPE_UNLOCK(pipe);
+
+ bzero(ub, sizeof(*ub));
+ ub->st_mode = S_IFIFO;
+ ub->st_blksize = PAGE_SIZE;
+ if (pipe->pipe_state & PIPE_DIRECTW)
+ ub->st_size = pipe->pipe_map.cnt;
+ else
+ ub->st_size = pipe->pipe_buffer.cnt;
+ ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
+ ub->st_atim = pipe->pipe_atime;
+ ub->st_mtim = pipe->pipe_mtime;
+ ub->st_ctim = pipe->pipe_ctime;
+ ub->st_uid = fp->f_cred->cr_uid;
+ ub->st_gid = fp->f_cred->cr_gid;
+ ub->st_dev = pipedev_ino;
+ ub->st_ino = pipe->pipe_ino;
+ /*
+ * Left as 0: st_nlink, st_rdev, st_flags, st_gen.
+ */
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+pipe_close(fp, td)
+ struct file *fp;
+ struct thread *td;
+{
+
+ if (fp->f_vnode != NULL)
+ return vnops.fo_close(fp, td);
+ fp->f_ops = &badfileops;
+ pipe_dtor(fp->f_data);
+ fp->f_data = NULL;
+ return (0);
+}
+
+static int
+pipe_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td)
+{
+ struct pipe *cpipe;
+ int error;
+
+ cpipe = fp->f_data;
+ if (cpipe->pipe_state & PIPE_NAMED)
+ error = vn_chmod(fp, mode, active_cred, td);
+ else
+ error = invfo_chmod(fp, mode, active_cred, td);
+ return (error);
+}
+
+static int
+pipe_chown(fp, uid, gid, active_cred, td)
+ struct file *fp;
+ uid_t uid;
+ gid_t gid;
+ struct ucred *active_cred;
+ struct thread *td;
+{
+ struct pipe *cpipe;
+ int error;
+
+ cpipe = fp->f_data;
+ if (cpipe->pipe_state & PIPE_NAMED)
+ error = vn_chown(fp, uid, gid, active_cred, td);
+ else
+ error = invfo_chown(fp, uid, gid, active_cred, td);
+ return (error);
+}
+
+static void
+pipe_free_kmem(cpipe)
+ struct pipe *cpipe;
+{
+
+ KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
+ ("pipe_free_kmem: pipe mutex locked"));
+
+ if (cpipe->pipe_buffer.buffer != NULL) {
+ atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size);
+ vm_map_remove(pipe_map,
+ (vm_offset_t)cpipe->pipe_buffer.buffer,
+ (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
+ cpipe->pipe_buffer.buffer = NULL;
+ }
+#ifndef PIPE_NODIRECT
+ {
+ cpipe->pipe_map.cnt = 0;
+ cpipe->pipe_map.pos = 0;
+ cpipe->pipe_map.npages = 0;
+ }
+#endif
+}
+
+/*
+ * shutdown the pipe
+ */
+static void
+pipeclose(cpipe)
+ struct pipe *cpipe;
+{
+ struct pipepair *pp;
+ struct pipe *ppipe;
+
+ KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
+
+ PIPE_LOCK(cpipe);
+ pipelock(cpipe, 0);
+ pp = cpipe->pipe_pair;
+
+ pipeselwakeup(cpipe);
+
+ /*
+ * If the other side is blocked, wake it up saying that
+ * we want to close it down.
+ */
+ cpipe->pipe_state |= PIPE_EOF;
+ while (cpipe->pipe_busy) {
+ wakeup(cpipe);
+ cpipe->pipe_state |= PIPE_WANT;
+ pipeunlock(cpipe);
+ msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
+ pipelock(cpipe, 0);
+ }
+
+ /*
+ * Disconnect from peer, if any.
+ */
+ ppipe = cpipe->pipe_peer;
+ if (ppipe->pipe_present == PIPE_ACTIVE) {
+ pipeselwakeup(ppipe);
+
+ ppipe->pipe_state |= PIPE_EOF;
+ wakeup(ppipe);
+ KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
+ }
+
+ /*
+	 * Release this endpoint's kmem resources.  We don't mark the
+	 * endpoint as closed until we've finished doing that, or the
+	 * pipe might disappear out from under us.
+ */
+ PIPE_UNLOCK(cpipe);
+ pipe_free_kmem(cpipe);
+ PIPE_LOCK(cpipe);
+ cpipe->pipe_present = PIPE_CLOSING;
+ pipeunlock(cpipe);
+
+ /*
+	 * knlist_clear() may sleep, dropping the PIPE_MTX.  Set
+	 * PIPE_FINALIZED, which allows the other end to free the
+	 * pipe_pair, only after the knotes are completely dismantled.
+ */
+ knlist_clear(&cpipe->pipe_sel.si_note, 1);
+ cpipe->pipe_present = PIPE_FINALIZED;
+ seldrain(&cpipe->pipe_sel);
+ knlist_destroy(&cpipe->pipe_sel.si_note);
+
+ /*
+ * If both endpoints are now closed, release the memory for the
+ * pipe pair. If not, unlock.
+ */
+ if (ppipe->pipe_present == PIPE_FINALIZED) {
+ PIPE_UNLOCK(cpipe);
+#ifdef MAC
+ mac_pipe_destroy(pp);
+#endif
+ uma_zfree(pipe_zone, cpipe->pipe_pair);
+ } else
+ PIPE_UNLOCK(cpipe);
+}
+
+/*ARGSUSED*/
+static int
+pipe_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct pipe *cpipe;
+
+ /*
+ * If a filter is requested that is not supported by this file
+ * descriptor, don't return an error, but also don't ever generate an
+ * event.
+ */
+ if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) {
+ kn->kn_fop = &pipe_nfiltops;
+ return (0);
+ }
+ if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) {
+ kn->kn_fop = &pipe_nfiltops;
+ return (0);
+ }
+ cpipe = fp->f_data;
+ PIPE_LOCK(cpipe);
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ kn->kn_fop = &pipe_rfiltops;
+ break;
+ case EVFILT_WRITE:
+ kn->kn_fop = &pipe_wfiltops;
+ if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) {
+ /* other end of pipe has been closed */
+ PIPE_UNLOCK(cpipe);
+ return (EPIPE);
+ }
+ cpipe = PIPE_PEER(cpipe);
+ break;
+ default:
+ PIPE_UNLOCK(cpipe);
+ return (EINVAL);
+ }
+
+ kn->kn_hook = cpipe;
+ knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
+ PIPE_UNLOCK(cpipe);
+ return (0);
+}
+
+static void
+filt_pipedetach(struct knote *kn)
+{
+ struct pipe *cpipe = kn->kn_hook;
+
+ PIPE_LOCK(cpipe);
+ knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
+ PIPE_UNLOCK(cpipe);
+}
+
+/*ARGSUSED*/
+static int
+filt_piperead(struct knote *kn, long hint)
+{
+ struct pipe *rpipe = kn->kn_hook;
+ struct pipe *wpipe = rpipe->pipe_peer;
+ int ret;
+
+ PIPE_LOCK(rpipe);
+ kn->kn_data = rpipe->pipe_buffer.cnt;
+ if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
+ kn->kn_data = rpipe->pipe_map.cnt;
+
+ if ((rpipe->pipe_state & PIPE_EOF) ||
+ wpipe->pipe_present != PIPE_ACTIVE ||
+ (wpipe->pipe_state & PIPE_EOF)) {
+ kn->kn_flags |= EV_EOF;
+ PIPE_UNLOCK(rpipe);
+ return (1);
+ }
+ ret = kn->kn_data > 0;
+ PIPE_UNLOCK(rpipe);
+ return ret;
+}
+
+/*ARGSUSED*/
+static int
+filt_pipewrite(struct knote *kn, long hint)
+{
+ struct pipe *wpipe;
+
+ wpipe = kn->kn_hook;
+ PIPE_LOCK(wpipe);
+ if (wpipe->pipe_present != PIPE_ACTIVE ||
+ (wpipe->pipe_state & PIPE_EOF)) {
+ kn->kn_data = 0;
+ kn->kn_flags |= EV_EOF;
+ PIPE_UNLOCK(wpipe);
+ return (1);
+ }
+ kn->kn_data = (wpipe->pipe_buffer.size > 0) ?
+ (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) : PIPE_BUF;
+ if (wpipe->pipe_state & PIPE_DIRECTW)
+ kn->kn_data = 0;
+
+ PIPE_UNLOCK(wpipe);
+ return (kn->kn_data >= PIPE_BUF);
+}
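+
+/*
+ * Illustrative userland sketch (not part of this change): the filters above
+ * are reached through kevent(2), e.g. to wait until the read side of a
+ * pipe(2) descriptor pair pfd[] becomes readable:
+ *
+ *	struct kevent ev;
+ *	int kq = kqueue();
+ *
+ *	EV_SET(&ev, pfd[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
+ *	kevent(kq, &ev, 1, NULL, 0, NULL);
+ *	kevent(kq, NULL, 0, &ev, 1, NULL);	-- blocks; ev.data = bytes readable
+ */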
+
+static void
+filt_pipedetach_notsup(struct knote *kn)
+{
+
+}
+
+static int
+filt_pipenotsup(struct knote *kn, long hint)
+{
+
+ return (0);
+}
diff --git a/sys/kern/sys_procdesc.c b/sys/kern/sys_procdesc.c
new file mode 100644
index 0000000..4bafeab
--- /dev/null
+++ b/sys/kern/sys_procdesc.c
@@ -0,0 +1,535 @@
+/*-
+ * Copyright (c) 2009 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed at the University of Cambridge Computer
+ * Laboratory with support from a grant from Google, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*-
+ * FreeBSD process descriptor facility.
+ *
+ * Some processes are represented by a file descriptor, which will be used in
+ * preference to signaling and pids for the purposes of process management,
+ * and is, in effect, a form of capability. When a process descriptor is
+ * used with a process, it ceases to be visible to certain traditional UNIX
+ * process facilities, such as waitpid(2).
+ *
+ * Some semantics:
+ *
+ * - At most one process descriptor will exist for any process, although
+ * references to that descriptor may be held from many processes (or even
+ * be in flight between processes over a local domain socket).
+ * - Last close on the process descriptor will terminate the process using
+ * SIGKILL and reparent it to init so that there's a process to reap it
+ * when it's done exiting.
+ * - If the process exits before the descriptor is closed, it will not
+ * generate SIGCHLD on termination, or be picked up by waitpid().
+ * - The pdkill(2) system call may be used to deliver a signal to the process
+ * using its process descriptor.
+ * - The pdwait4(2) system call may be used to block (or not) on a process
+ * descriptor to collect termination information.
+ *
+ * Open questions:
+ *
+ * - How to handle ptrace(2)?
+ * - Will we want to add a pidtoprocdesc(2) system call to allow process
+ * descriptors to be created for processes without pdfork(2)?
+ */
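+
+/*
+ * Illustrative userland sketch (not part of this change): the facility is
+ * typically consumed through pdfork(2), pdgetpid(2) and pdkill(2), e.g.:
+ *
+ *	int pd;
+ *	pid_t pid;
+ *
+ *	pid = pdfork(&pd, 0);
+ *	if (pid == 0)
+ *		_exit(do_child_work());	-- do_child_work() is hypothetical
+ *	pdkill(pd, SIGTERM);
+ *	close(pd);	-- last close reparents to init(8); SIGKILL if still running
+ */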
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_procdesc.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/procdesc.h>
+#include <sys/resourcevar.h>
+#include <sys/stat.h>
+#include <sys/sysproto.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/ucred.h>
+
+#include <security/audit/audit.h>
+
+#include <vm/uma.h>
+
+#ifdef PROCDESC
+
+FEATURE(process_descriptors, "Process Descriptors");
+
+static uma_zone_t procdesc_zone;
+
+static fo_rdwr_t procdesc_read;
+static fo_rdwr_t procdesc_write;
+static fo_truncate_t procdesc_truncate;
+static fo_ioctl_t procdesc_ioctl;
+static fo_poll_t procdesc_poll;
+static fo_kqfilter_t procdesc_kqfilter;
+static fo_stat_t procdesc_stat;
+static fo_close_t procdesc_close;
+static fo_chmod_t procdesc_chmod;
+static fo_chown_t procdesc_chown;
+
+static struct fileops procdesc_ops = {
+ .fo_read = procdesc_read,
+ .fo_write = procdesc_write,
+ .fo_truncate = procdesc_truncate,
+ .fo_ioctl = procdesc_ioctl,
+ .fo_poll = procdesc_poll,
+ .fo_kqfilter = procdesc_kqfilter,
+ .fo_stat = procdesc_stat,
+ .fo_close = procdesc_close,
+ .fo_chmod = procdesc_chmod,
+ .fo_chown = procdesc_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_flags = DFLAG_PASSABLE,
+};
+
+/*
+ * Initialize with VFS so that process descriptors are available along with
+ * other file descriptor types. As long as it runs before init(8) starts,
+ * there shouldn't be a problem.
+ */
+static void
+procdesc_init(void *dummy __unused)
+{
+
+ procdesc_zone = uma_zcreate("procdesc", sizeof(struct procdesc),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ if (procdesc_zone == NULL)
+ panic("procdesc_init: procdesc_zone not initialized");
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, procdesc_init, NULL);
+
+/*
+ * Return a locked process given a process descriptor, or ESRCH if it has
+ * died.
+ */
+int
+procdesc_find(struct thread *td, int fd, cap_rights_t *rightsp,
+ struct proc **p)
+{
+ struct procdesc *pd;
+ struct file *fp;
+ int error;
+
+ error = fget(td, fd, rightsp, &fp);
+ if (error)
+ return (error);
+ if (fp->f_type != DTYPE_PROCDESC) {
+ error = EBADF;
+ goto out;
+ }
+ pd = fp->f_data;
+ sx_slock(&proctree_lock);
+ if (pd->pd_proc != NULL) {
+ *p = pd->pd_proc;
+ PROC_LOCK(*p);
+ } else
+ error = ESRCH;
+ sx_sunlock(&proctree_lock);
+out:
+ fdrop(fp, td);
+ return (error);
+}
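+
+/*
+ * Illustrative in-kernel usage sketch (not part of this change): callers are
+ * expected to drop the returned process lock themselves, e.g.:
+ *
+ *	cap_rights_t rights;
+ *	struct proc *p;
+ *
+ *	error = procdesc_find(td, fd, cap_rights_init(&rights, CAP_PDKILL), &p);
+ *	if (error == 0) {
+ *		... operate on the locked process ...
+ *		PROC_UNLOCK(p);
+ *	}
+ */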
+
+/*
+ * Function to be used by procstat(1) sysctls when returning procdesc
+ * information.
+ */
+pid_t
+procdesc_pid(struct file *fp_procdesc)
+{
+ struct procdesc *pd;
+
+ KASSERT(fp_procdesc->f_type == DTYPE_PROCDESC,
+ ("procdesc_pid: !procdesc"));
+
+ pd = fp_procdesc->f_data;
+ return (pd->pd_pid);
+}
+
+/*
+ * Retrieve the PID associated with a process descriptor.
+ */
+int
+kern_pdgetpid(struct thread *td, int fd, cap_rights_t *rightsp, pid_t *pidp)
+{
+ struct file *fp;
+ int error;
+
+ error = fget(td, fd, rightsp, &fp);
+ if (error)
+ return (error);
+ if (fp->f_type != DTYPE_PROCDESC) {
+ error = EBADF;
+ goto out;
+ }
+ *pidp = procdesc_pid(fp);
+out:
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * System call to return the pid of a process given its process descriptor.
+ */
+int
+sys_pdgetpid(struct thread *td, struct pdgetpid_args *uap)
+{
+ cap_rights_t rights;
+ pid_t pid;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ error = kern_pdgetpid(td, uap->fd,
+ cap_rights_init(&rights, CAP_PDGETPID), &pid);
+ if (error == 0)
+ error = copyout(&pid, uap->pidp, sizeof(pid));
+ return (error);
+}
+
+/*
+ * When a new process is forked by pdfork(), a file descriptor is allocated
+ * by the fork code first, then the process is forked, and then we get a
+ * chance to set up the process descriptor. Failure is not permitted at this
+ * point, so procdesc_new() must succeed.
+ */
+void
+procdesc_new(struct proc *p, int flags)
+{
+ struct procdesc *pd;
+
+ pd = uma_zalloc(procdesc_zone, M_WAITOK | M_ZERO);
+ pd->pd_proc = p;
+ pd->pd_pid = p->p_pid;
+ p->p_procdesc = pd;
+ pd->pd_flags = 0;
+ if (flags & PD_DAEMON)
+ pd->pd_flags |= PDF_DAEMON;
+ PROCDESC_LOCK_INIT(pd);
+
+ /*
+ * Process descriptors start out with two references: one from their
+ * struct file, and the other from their struct proc.
+ */
+ refcount_init(&pd->pd_refcount, 2);
+}
+
+/*
+ * Initialize a file with a process descriptor.
+ */
+void
+procdesc_finit(struct procdesc *pdp, struct file *fp)
+{
+
+ finit(fp, FREAD | FWRITE, DTYPE_PROCDESC, pdp, &procdesc_ops);
+}
+
+static void
+procdesc_free(struct procdesc *pd)
+{
+
+ /*
+	 * When the last reference is released, we assert that the descriptor
+	 * has been closed, but not that the process has exited, as we will
+	 * detach the descriptor before the process dies if the descriptor is
+	 * closed, since we can't wait synchronously.
+ */
+ if (refcount_release(&pd->pd_refcount)) {
+ KASSERT(pd->pd_proc == NULL,
+ ("procdesc_free: pd_proc != NULL"));
+ KASSERT((pd->pd_flags & PDF_CLOSED),
+ ("procdesc_free: !PDF_CLOSED"));
+
+ PROCDESC_LOCK_DESTROY(pd);
+ uma_zfree(procdesc_zone, pd);
+ }
+}
+
+/*
+ * procdesc_exit() - notify a process descriptor that its process is exiting.
+ * We use the proctree_lock to ensure that process exit either happens
+ * strictly before or strictly after a concurrent call to procdesc_close().
+ */
+int
+procdesc_exit(struct proc *p)
+{
+ struct procdesc *pd;
+
+ sx_assert(&proctree_lock, SA_XLOCKED);
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL"));
+
+ pd = p->p_procdesc;
+
+ PROCDESC_LOCK(pd);
+ KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == initproc,
+ ("procdesc_exit: closed && parent not init"));
+
+ pd->pd_flags |= PDF_EXITED;
+
+ /*
+ * If the process descriptor has been closed, then we have nothing
+ * to do; return 1 so that init will get SIGCHLD and do the reaping.
+ * Clean up the procdesc now rather than letting it happen during
+ * that reap.
+ */
+ if (pd->pd_flags & PDF_CLOSED) {
+ PROCDESC_UNLOCK(pd);
+ pd->pd_proc = NULL;
+ p->p_procdesc = NULL;
+ procdesc_free(pd);
+ return (1);
+ }
+ if (pd->pd_flags & PDF_SELECTED) {
+ pd->pd_flags &= ~PDF_SELECTED;
+ selwakeup(&pd->pd_selinfo);
+ }
+ PROCDESC_UNLOCK(pd);
+ return (0);
+}
+
+/*
+ * When a process descriptor is reaped, perhaps as a result of close() or
+ * pdwait4(), release the process's reference on the process descriptor.
+ */
+void
+procdesc_reap(struct proc *p)
+{
+ struct procdesc *pd;
+
+ sx_assert(&proctree_lock, SA_XLOCKED);
+ KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL"));
+
+ pd = p->p_procdesc;
+ pd->pd_proc = NULL;
+ p->p_procdesc = NULL;
+ procdesc_free(pd);
+}
+
+/*
+ * procdesc_close() - last close on a process descriptor. If the process is
+ * still running, terminate with SIGKILL (unless PDF_DAEMON is set) and let
+ * init(8) clean up the mess; if not, we have to clean up the zombie ourselves.
+ */
+static int
+procdesc_close(struct file *fp, struct thread *td)
+{
+ struct procdesc *pd;
+ struct proc *p;
+
+ KASSERT(fp->f_type == DTYPE_PROCDESC, ("procdesc_close: !procdesc"));
+
+ pd = fp->f_data;
+ fp->f_ops = &badfileops;
+ fp->f_data = NULL;
+
+ sx_xlock(&proctree_lock);
+ PROCDESC_LOCK(pd);
+ pd->pd_flags |= PDF_CLOSED;
+ PROCDESC_UNLOCK(pd);
+ p = pd->pd_proc;
+ if (p == NULL) {
+ /*
+		 * This is the case where the process's exit status was already
+ * collected and procdesc_reap() was already called.
+ */
+ sx_xunlock(&proctree_lock);
+ } else if (p->p_state == PRS_ZOMBIE) {
+ /*
+ * If the process is already dead and just awaiting reaping,
+ * do that now. This will release the process's reference to
+ * the process descriptor when it calls back into
+ * procdesc_reap().
+ */
+ PROC_LOCK(p);
+ PROC_SLOCK(p);
+ proc_reap(curthread, p, NULL, 0);
+ } else {
+ /*
+ * If the process is not yet dead, we need to kill it, but we
+ * can't wait around synchronously for it to go away, as that
+ * path leads to madness (and deadlocks). First, detach the
+ * process from its descriptor so that its exit status will
+ * be reported normally.
+ */
+ PROC_LOCK(p);
+ pd->pd_proc = NULL;
+ p->p_procdesc = NULL;
+ procdesc_free(pd);
+
+ /*
+ * Next, reparent it to init(8) so that there's someone to
+ * pick up the pieces; finally, terminate with prejudice.
+ */
+ p->p_sigparent = SIGCHLD;
+ proc_reparent(p, initproc);
+ if ((pd->pd_flags & PDF_DAEMON) == 0)
+ kern_psignal(p, SIGKILL);
+ PROC_UNLOCK(p);
+ sx_xunlock(&proctree_lock);
+ }
+
+ /*
+ * Release the file descriptor's reference on the process descriptor.
+ */
+ procdesc_free(pd);
+ return (0);
+}
+
+static int
+procdesc_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+procdesc_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+procdesc_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+procdesc_ioctl(struct file *fp, u_long com, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+procdesc_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct procdesc *pd;
+ int revents;
+
+ revents = 0;
+ pd = fp->f_data;
+ PROCDESC_LOCK(pd);
+ if (pd->pd_flags & PDF_EXITED)
+ revents |= POLLHUP;
+ if (revents == 0) {
+ selrecord(td, &pd->pd_selinfo);
+ pd->pd_flags |= PDF_SELECTED;
+ }
+ PROCDESC_UNLOCK(pd);
+ return (revents);
+}
+
+static int
+procdesc_kqfilter(struct file *fp, struct knote *kn)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+procdesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct procdesc *pd;
+ struct timeval pstart;
+
+ /*
+ * XXXRW: Perhaps we should cache some more information from the
+ * process so that we can return it reliably here even after it has
+ * died. For example, caching its credential data.
+ */
+ bzero(sb, sizeof(*sb));
+ pd = fp->f_data;
+ sx_slock(&proctree_lock);
+ if (pd->pd_proc != NULL) {
+ PROC_LOCK(pd->pd_proc);
+
+ /* Set birth and [acm] times to process start time. */
+ pstart = pd->pd_proc->p_stats->p_start;
+ timevaladd(&pstart, &boottime);
+ TIMEVAL_TO_TIMESPEC(&pstart, &sb->st_birthtim);
+ sb->st_atim = sb->st_birthtim;
+ sb->st_ctim = sb->st_birthtim;
+ sb->st_mtim = sb->st_birthtim;
+ if (pd->pd_proc->p_state != PRS_ZOMBIE)
+ sb->st_mode = S_IFREG | S_IRWXU;
+ else
+ sb->st_mode = S_IFREG;
+ sb->st_uid = pd->pd_proc->p_ucred->cr_ruid;
+ sb->st_gid = pd->pd_proc->p_ucred->cr_rgid;
+ PROC_UNLOCK(pd->pd_proc);
+ } else
+ sb->st_mode = S_IFREG;
+ sx_sunlock(&proctree_lock);
+ return (0);
+}
+
+static int
+procdesc_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+procdesc_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+#else /* !PROCDESC */
+
+int
+sys_pdgetpid(struct thread *td, struct pdgetpid_args *uap)
+{
+
+ return (ENOSYS);
+}
+
+#endif /* PROCDESC */
diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c
new file mode 100644
index 0000000..5508dcf
--- /dev/null
+++ b/sys/kern/sys_process.c
@@ -0,0 +1,1242 @@
+/*-
+ * Copyright (c) 1994, Sean Eric Fagan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Sean Eric Fagan.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/ptrace.h>
+#include <sys/rwlock.h>
+#include <sys/sx.h>
+#include <sys/malloc.h>
+#include <sys/signalvar.h>
+
+#include <machine/reg.h>
+
+#include <security/audit/audit.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_param.h>
+
+#ifdef COMPAT_FREEBSD32
+#include <sys/procfs.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+
+struct ptrace_io_desc32 {
+ int piod_op;
+ uint32_t piod_offs;
+ uint32_t piod_addr;
+ uint32_t piod_len;
+};
+
+struct ptrace_vm_entry32 {
+ int pve_entry;
+ int pve_timestamp;
+ uint32_t pve_start;
+ uint32_t pve_end;
+ uint32_t pve_offset;
+ u_int pve_prot;
+ u_int pve_pathlen;
+ int32_t pve_fileid;
+ u_int pve_fsid;
+ uint32_t pve_path;
+};
+
+struct ptrace_lwpinfo32 {
+ lwpid_t pl_lwpid; /* LWP described. */
+ int pl_event; /* Event that stopped the LWP. */
+ int pl_flags; /* LWP flags. */
+ sigset_t pl_sigmask; /* LWP signal mask */
+ sigset_t pl_siglist; /* LWP pending signal */
+ struct siginfo32 pl_siginfo; /* siginfo for signal */
+ char pl_tdname[MAXCOMLEN + 1]; /* LWP name. */
+ int pl_child_pid; /* New child pid */
+};
+
+#endif
+
+/*
+ * Functions implemented using PROC_ACTION():
+ *
+ * proc_read_regs(proc, regs)
+ * Get the current user-visible register set from the process
+ * and copy it into the regs structure (<machine/reg.h>).
+ * The process is stopped at the time read_regs is called.
+ *
+ * proc_write_regs(proc, regs)
+ * Update the current register set from the passed in regs
+ * structure. Take care to avoid clobbering special CPU
+ * registers or privileged bits in the PSL.
+ * Depending on the architecture this may have fix-up work to do,
+ * especially if the IAR or PCW are modified.
+ * The process is stopped at the time write_regs is called.
+ *
+ * proc_read_fpregs, proc_write_fpregs
+ * deal with the floating point register set, otherwise as above.
+ *
+ * proc_read_dbregs, proc_write_dbregs
+ * deal with the processor debug register set, otherwise as above.
+ *
+ * proc_sstep(proc)
+ * Arrange for the process to trap after executing a single instruction.
+ */
+
+#define PROC_ACTION(action) do { \
+ int error; \
+ \
+ PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); \
+ if ((td->td_proc->p_flag & P_INMEM) == 0) \
+ error = EIO; \
+ else \
+ error = (action); \
+ return (error); \
+} while(0)
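+
+/*
+ * Illustrative expansion (not part of this change): proc_read_regs() below
+ * is therefore equivalent to:
+ *
+ *	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
+ *	if ((td->td_proc->p_flag & P_INMEM) == 0)
+ *		return (EIO);
+ *	return (fill_regs(td, regs));
+ */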
+
+int
+proc_read_regs(struct thread *td, struct reg *regs)
+{
+
+ PROC_ACTION(fill_regs(td, regs));
+}
+
+int
+proc_write_regs(struct thread *td, struct reg *regs)
+{
+
+ PROC_ACTION(set_regs(td, regs));
+}
+
+int
+proc_read_dbregs(struct thread *td, struct dbreg *dbregs)
+{
+
+ PROC_ACTION(fill_dbregs(td, dbregs));
+}
+
+int
+proc_write_dbregs(struct thread *td, struct dbreg *dbregs)
+{
+
+ PROC_ACTION(set_dbregs(td, dbregs));
+}
+
+/*
+ * Ptrace doesn't support fpregs at all, and there are no security holes
+ * or translations for fpregs, so we can just copy them.
+ */
+int
+proc_read_fpregs(struct thread *td, struct fpreg *fpregs)
+{
+
+ PROC_ACTION(fill_fpregs(td, fpregs));
+}
+
+int
+proc_write_fpregs(struct thread *td, struct fpreg *fpregs)
+{
+
+ PROC_ACTION(set_fpregs(td, fpregs));
+}
+
+#ifdef COMPAT_FREEBSD32
+/* For 32 bit binaries, we need to expose the 32 bit regs layouts. */
+int
+proc_read_regs32(struct thread *td, struct reg32 *regs32)
+{
+
+ PROC_ACTION(fill_regs32(td, regs32));
+}
+
+int
+proc_write_regs32(struct thread *td, struct reg32 *regs32)
+{
+
+ PROC_ACTION(set_regs32(td, regs32));
+}
+
+int
+proc_read_dbregs32(struct thread *td, struct dbreg32 *dbregs32)
+{
+
+ PROC_ACTION(fill_dbregs32(td, dbregs32));
+}
+
+int
+proc_write_dbregs32(struct thread *td, struct dbreg32 *dbregs32)
+{
+
+ PROC_ACTION(set_dbregs32(td, dbregs32));
+}
+
+int
+proc_read_fpregs32(struct thread *td, struct fpreg32 *fpregs32)
+{
+
+ PROC_ACTION(fill_fpregs32(td, fpregs32));
+}
+
+int
+proc_write_fpregs32(struct thread *td, struct fpreg32 *fpregs32)
+{
+
+ PROC_ACTION(set_fpregs32(td, fpregs32));
+}
+#endif
+
+int
+proc_sstep(struct thread *td)
+{
+
+ PROC_ACTION(ptrace_single_step(td));
+}
+
+int
+proc_rwmem(struct proc *p, struct uio *uio)
+{
+ vm_map_t map;
+ vm_offset_t pageno; /* page number */
+ vm_prot_t reqprot;
+ int error, fault_flags, page_offset, writing;
+
+ /*
+ * Assert that someone has locked this vmspace. (Should be
+ * curthread but we can't assert that.) This keeps the process
+ * from exiting out from under us until this operation completes.
+ */
+ KASSERT(p->p_lock >= 1, ("%s: process %p (pid %d) not held", __func__,
+ p, p->p_pid));
+
+ /*
+ * The map we want...
+ */
+ map = &p->p_vmspace->vm_map;
+
+ /*
+ * If we are writing, then we request vm_fault() to create a private
+ * copy of each page. Since these copies will not be writeable by the
+	 * process, we must explicitly request that they be dirtied.
+ */
+ writing = uio->uio_rw == UIO_WRITE;
+ reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ;
+ fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL;
+
+ /*
+ * Only map in one page at a time. We don't have to, but it
+ * makes things easier. This way is trivial - right?
+ */
+ do {
+ vm_offset_t uva;
+ u_int len;
+ vm_page_t m;
+
+ uva = (vm_offset_t)uio->uio_offset;
+
+ /*
+ * Get the page number of this segment.
+ */
+ pageno = trunc_page(uva);
+ page_offset = uva - pageno;
+
+ /*
+ * How many bytes to copy
+ */
+ len = min(PAGE_SIZE - page_offset, uio->uio_resid);
+
+ /*
+ * Fault and hold the page on behalf of the process.
+ */
+ error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m);
+ if (error != KERN_SUCCESS) {
+ if (error == KERN_RESOURCE_SHORTAGE)
+ error = ENOMEM;
+ else
+ error = EFAULT;
+ break;
+ }
+
+ /*
+ * Now do the i/o move.
+ */
+ error = uiomove_fromphys(&m, page_offset, len, uio);
+
+ /* Make the I-cache coherent for breakpoints. */
+ if (writing && error == 0) {
+ vm_map_lock_read(map);
+ if (vm_map_check_protection(map, pageno, pageno +
+ PAGE_SIZE, VM_PROT_EXECUTE))
+ vm_sync_icache(map, uva, len);
+ vm_map_unlock_read(map);
+ }
+
+ /*
+ * Release the page.
+ */
+ vm_page_lock(m);
+ vm_page_unhold(m);
+ vm_page_unlock(m);
+
+ } while (error == 0 && uio->uio_resid > 0);
+
+ return (error);
+}
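+
+/*
+ * Illustrative in-kernel usage sketch (not part of this change), mirroring
+ * the PT_READ_I/PT_READ_D case in kern_ptrace() below: the caller supplies a
+ * uio whose uio_offset is the address in the target process:
+ *
+ *	iov.iov_base = (caddr_t)&tmp;
+ *	iov.iov_len = sizeof(int);
+ *	uio.uio_iov = &iov;
+ *	uio.uio_iovcnt = 1;
+ *	uio.uio_offset = (off_t)(uintptr_t)addr;
+ *	uio.uio_resid = sizeof(int);
+ *	uio.uio_segflg = UIO_SYSSPACE;
+ *	uio.uio_rw = UIO_READ;
+ *	uio.uio_td = td;
+ *	error = proc_rwmem(p, &uio);
+ */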
+
+static int
+ptrace_vm_entry(struct thread *td, struct proc *p, struct ptrace_vm_entry *pve)
+{
+ struct vattr vattr;
+ vm_map_t map;
+ vm_map_entry_t entry;
+ vm_object_t obj, tobj, lobj;
+ struct vmspace *vm;
+ struct vnode *vp;
+ char *freepath, *fullpath;
+ u_int pathlen;
+ int error, index;
+
+ error = 0;
+ obj = NULL;
+
+ vm = vmspace_acquire_ref(p);
+ map = &vm->vm_map;
+ vm_map_lock_read(map);
+
+ do {
+ entry = map->header.next;
+ index = 0;
+ while (index < pve->pve_entry && entry != &map->header) {
+ entry = entry->next;
+ index++;
+ }
+ if (index != pve->pve_entry) {
+ error = EINVAL;
+ break;
+ }
+ while (entry != &map->header &&
+ (entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
+ entry = entry->next;
+ index++;
+ }
+ if (entry == &map->header) {
+ error = ENOENT;
+ break;
+ }
+
+ /* We got an entry. */
+ pve->pve_entry = index + 1;
+ pve->pve_timestamp = map->timestamp;
+ pve->pve_start = entry->start;
+ pve->pve_end = entry->end - 1;
+ pve->pve_offset = entry->offset;
+ pve->pve_prot = entry->protection;
+
+ /* Backing object's path needed? */
+ if (pve->pve_pathlen == 0)
+ break;
+
+ pathlen = pve->pve_pathlen;
+ pve->pve_pathlen = 0;
+
+ obj = entry->object.vm_object;
+ if (obj != NULL)
+ VM_OBJECT_RLOCK(obj);
+ } while (0);
+
+ vm_map_unlock_read(map);
+ vmspace_free(vm);
+
+ pve->pve_fsid = VNOVAL;
+ pve->pve_fileid = VNOVAL;
+
+ if (error == 0 && obj != NULL) {
+ lobj = obj;
+ for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) {
+ if (tobj != obj)
+ VM_OBJECT_RLOCK(tobj);
+ if (lobj != obj)
+ VM_OBJECT_RUNLOCK(lobj);
+ lobj = tobj;
+ pve->pve_offset += tobj->backing_object_offset;
+ }
+ vp = (lobj->type == OBJT_VNODE) ? lobj->handle : NULL;
+ if (vp != NULL)
+ vref(vp);
+ if (lobj != obj)
+ VM_OBJECT_RUNLOCK(lobj);
+ VM_OBJECT_RUNLOCK(obj);
+
+ if (vp != NULL) {
+ freepath = NULL;
+ fullpath = NULL;
+ vn_fullpath(td, vp, &fullpath, &freepath);
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ if (VOP_GETATTR(vp, &vattr, td->td_ucred) == 0) {
+ pve->pve_fileid = vattr.va_fileid;
+ pve->pve_fsid = vattr.va_fsid;
+ }
+ vput(vp);
+
+ if (fullpath != NULL) {
+ pve->pve_pathlen = strlen(fullpath) + 1;
+ if (pve->pve_pathlen <= pathlen) {
+ error = copyout(fullpath, pve->pve_path,
+ pve->pve_pathlen);
+ } else
+ error = ENAMETOOLONG;
+ }
+ if (freepath != NULL)
+ free(freepath, M_TEMP);
+ }
+ }
+
+ return (error);
+}
+
+#ifdef COMPAT_FREEBSD32
+static int
+ptrace_vm_entry32(struct thread *td, struct proc *p,
+ struct ptrace_vm_entry32 *pve32)
+{
+ struct ptrace_vm_entry pve;
+ int error;
+
+ pve.pve_entry = pve32->pve_entry;
+ pve.pve_pathlen = pve32->pve_pathlen;
+ pve.pve_path = (void *)(uintptr_t)pve32->pve_path;
+
+ error = ptrace_vm_entry(td, p, &pve);
+ if (error == 0) {
+ pve32->pve_entry = pve.pve_entry;
+ pve32->pve_timestamp = pve.pve_timestamp;
+ pve32->pve_start = pve.pve_start;
+ pve32->pve_end = pve.pve_end;
+ pve32->pve_offset = pve.pve_offset;
+ pve32->pve_prot = pve.pve_prot;
+ pve32->pve_fileid = pve.pve_fileid;
+ pve32->pve_fsid = pve.pve_fsid;
+ }
+
+ pve32->pve_pathlen = pve.pve_pathlen;
+ return (error);
+}
+
+static void
+ptrace_lwpinfo_to32(const struct ptrace_lwpinfo *pl,
+ struct ptrace_lwpinfo32 *pl32)
+{
+
+ pl32->pl_lwpid = pl->pl_lwpid;
+ pl32->pl_event = pl->pl_event;
+ pl32->pl_flags = pl->pl_flags;
+ pl32->pl_sigmask = pl->pl_sigmask;
+ pl32->pl_siglist = pl->pl_siglist;
+ siginfo_to_siginfo32(&pl->pl_siginfo, &pl32->pl_siginfo);
+ strcpy(pl32->pl_tdname, pl->pl_tdname);
+ pl32->pl_child_pid = pl->pl_child_pid;
+}
+#endif /* COMPAT_FREEBSD32 */
+
+/*
+ * Process debugging system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ptrace_args {
+ int req;
+ pid_t pid;
+ caddr_t addr;
+ int data;
+};
+#endif
+
+#ifdef COMPAT_FREEBSD32
+/*
+ * This CPP subterfuge is to try to reduce the number of ifdefs in
+ * the body of the code.
+ * COPYIN(uap->addr, &r.reg, sizeof r.reg);
+ * becomes either:
+ * copyin(uap->addr, &r.reg, sizeof r.reg);
+ * or
+ * copyin(uap->addr, &r.reg32, sizeof r.reg32);
+ * .. except this is done at runtime.
+ */
+#define COPYIN(u, k, s) wrap32 ? \
+ copyin(u, k ## 32, s ## 32) : \
+ copyin(u, k, s)
+#define COPYOUT(k, u, s) wrap32 ? \
+ copyout(k ## 32, u, s ## 32) : \
+ copyout(k, u, s)
+#else
+#define COPYIN(u, k, s) copyin(u, k, s)
+#define COPYOUT(k, u, s) copyout(k, u, s)
+#endif
+int
+sys_ptrace(struct thread *td, struct ptrace_args *uap)
+{
+ /*
+ * XXX this obfuscation is to reduce stack usage, but the register
+ * structs may be too large to put on the stack anyway.
+ */
+ union {
+ struct ptrace_io_desc piod;
+ struct ptrace_lwpinfo pl;
+ struct ptrace_vm_entry pve;
+ struct dbreg dbreg;
+ struct fpreg fpreg;
+ struct reg reg;
+#ifdef COMPAT_FREEBSD32
+ struct dbreg32 dbreg32;
+ struct fpreg32 fpreg32;
+ struct reg32 reg32;
+ struct ptrace_io_desc32 piod32;
+ struct ptrace_lwpinfo32 pl32;
+ struct ptrace_vm_entry32 pve32;
+#endif
+ } r;
+ void *addr;
+ int error = 0;
+#ifdef COMPAT_FREEBSD32
+ int wrap32 = 0;
+
+ if (SV_CURPROC_FLAG(SV_ILP32))
+ wrap32 = 1;
+#endif
+ AUDIT_ARG_PID(uap->pid);
+ AUDIT_ARG_CMD(uap->req);
+ AUDIT_ARG_VALUE(uap->data);
+ addr = &r;
+ switch (uap->req) {
+ case PT_GETREGS:
+ case PT_GETFPREGS:
+ case PT_GETDBREGS:
+ case PT_LWPINFO:
+ break;
+ case PT_SETREGS:
+ error = COPYIN(uap->addr, &r.reg, sizeof r.reg);
+ break;
+ case PT_SETFPREGS:
+ error = COPYIN(uap->addr, &r.fpreg, sizeof r.fpreg);
+ break;
+ case PT_SETDBREGS:
+ error = COPYIN(uap->addr, &r.dbreg, sizeof r.dbreg);
+ break;
+ case PT_IO:
+ error = COPYIN(uap->addr, &r.piod, sizeof r.piod);
+ break;
+ case PT_VM_ENTRY:
+ error = COPYIN(uap->addr, &r.pve, sizeof r.pve);
+ break;
+ default:
+ addr = uap->addr;
+ break;
+ }
+ if (error)
+ return (error);
+
+ error = kern_ptrace(td, uap->req, uap->pid, addr, uap->data);
+ if (error)
+ return (error);
+
+ switch (uap->req) {
+ case PT_VM_ENTRY:
+ error = COPYOUT(&r.pve, uap->addr, sizeof r.pve);
+ break;
+ case PT_IO:
+ error = COPYOUT(&r.piod, uap->addr, sizeof r.piod);
+ break;
+ case PT_GETREGS:
+ error = COPYOUT(&r.reg, uap->addr, sizeof r.reg);
+ break;
+ case PT_GETFPREGS:
+ error = COPYOUT(&r.fpreg, uap->addr, sizeof r.fpreg);
+ break;
+ case PT_GETDBREGS:
+ error = COPYOUT(&r.dbreg, uap->addr, sizeof r.dbreg);
+ break;
+ case PT_LWPINFO:
+ error = copyout(&r.pl, uap->addr, uap->data);
+ break;
+ }
+
+ return (error);
+}
+#undef COPYIN
+#undef COPYOUT
+
+#ifdef COMPAT_FREEBSD32
+/*
+ * PROC_READ(regs, td2, addr);
+ * becomes either:
+ * proc_read_regs(td2, addr);
+ * or
+ * proc_read_regs32(td2, addr);
+ * .. except this is done at runtime. There is an additional
+ * complication in that PROC_WRITE disallows 32 bit consumers
+ * from writing to 64 bit address space targets.
+ */
+#define PROC_READ(w, t, a) wrap32 ? \
+ proc_read_ ## w ## 32(t, a) : \
+ proc_read_ ## w (t, a)
+#define PROC_WRITE(w, t, a) wrap32 ? \
+ (safe ? proc_write_ ## w ## 32(t, a) : EINVAL ) : \
+ proc_write_ ## w (t, a)
+#else
+#define PROC_READ(w, t, a) proc_read_ ## w (t, a)
+#define PROC_WRITE(w, t, a) proc_write_ ## w (t, a)
+#endif
+
+int
+kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data)
+{
+ struct iovec iov;
+ struct uio uio;
+ struct proc *curp, *p, *pp;
+ struct thread *td2 = NULL, *td3;
+ struct ptrace_io_desc *piod = NULL;
+ struct ptrace_lwpinfo *pl;
+ int error, write, tmp, num;
+ int proctree_locked = 0;
+ lwpid_t tid = 0, *buf;
+#ifdef COMPAT_FREEBSD32
+ int wrap32 = 0, safe = 0;
+ struct ptrace_io_desc32 *piod32 = NULL;
+ struct ptrace_lwpinfo32 *pl32 = NULL;
+ struct ptrace_lwpinfo plr;
+#endif
+
+ curp = td->td_proc;
+
+ /* Lock proctree before locking the process. */
+ switch (req) {
+ case PT_TRACE_ME:
+ case PT_ATTACH:
+ case PT_STEP:
+ case PT_CONTINUE:
+ case PT_TO_SCE:
+ case PT_TO_SCX:
+ case PT_SYSCALL:
+ case PT_FOLLOW_FORK:
+ case PT_DETACH:
+ sx_xlock(&proctree_lock);
+ proctree_locked = 1;
+ break;
+ default:
+ break;
+ }
+
+ write = 0;
+ if (req == PT_TRACE_ME) {
+ p = td->td_proc;
+ PROC_LOCK(p);
+ } else {
+ if (pid <= PID_MAX) {
+ if ((p = pfind(pid)) == NULL) {
+ if (proctree_locked)
+ sx_xunlock(&proctree_lock);
+ return (ESRCH);
+ }
+ } else {
+ td2 = tdfind(pid, -1);
+ if (td2 == NULL) {
+ if (proctree_locked)
+ sx_xunlock(&proctree_lock);
+ return (ESRCH);
+ }
+ p = td2->td_proc;
+ tid = pid;
+ pid = p->p_pid;
+ }
+ }
+ AUDIT_ARG_PROCESS(p);
+
+ if ((p->p_flag & P_WEXIT) != 0) {
+ error = ESRCH;
+ goto fail;
+ }
+ if ((error = p_cansee(td, p)) != 0)
+ goto fail;
+
+ if ((error = p_candebug(td, p)) != 0)
+ goto fail;
+
+ /*
+ * System processes can't be debugged.
+ */
+ if ((p->p_flag & P_SYSTEM) != 0) {
+ error = EINVAL;
+ goto fail;
+ }
+
+ if (tid == 0) {
+ if ((p->p_flag & P_STOPPED_TRACE) != 0) {
+ KASSERT(p->p_xthread != NULL, ("NULL p_xthread"));
+ td2 = p->p_xthread;
+ } else {
+ td2 = FIRST_THREAD_IN_PROC(p);
+ }
+ tid = td2->td_tid;
+ }
+
+#ifdef COMPAT_FREEBSD32
+ /*
+ * Test if we're a 32 bit client and what the target is.
+ * Set the wrap controls accordingly.
+ */
+ if (SV_CURPROC_FLAG(SV_ILP32)) {
+ if (SV_PROC_FLAG(td2->td_proc, SV_ILP32))
+ safe = 1;
+ wrap32 = 1;
+ }
+#endif
+ /*
+ * Permissions check
+ */
+ switch (req) {
+ case PT_TRACE_ME:
+ /* Always legal. */
+ break;
+
+ case PT_ATTACH:
+ /* Self */
+ if (p->p_pid == td->td_proc->p_pid) {
+ error = EINVAL;
+ goto fail;
+ }
+
+ /* Already traced */
+ if (p->p_flag & P_TRACED) {
+ error = EBUSY;
+ goto fail;
+ }
+
+ /* Can't trace an ancestor if you're being traced. */
+ if (curp->p_flag & P_TRACED) {
+ for (pp = curp->p_pptr; pp != NULL; pp = pp->p_pptr) {
+ if (pp == p) {
+ error = EINVAL;
+ goto fail;
+ }
+ }
+ }
+
+
+ /* OK */
+ break;
+
+ case PT_CLEARSTEP:
+ /* Allow thread to clear single step for itself */
+ if (td->td_tid == tid)
+ break;
+
+ /* FALLTHROUGH */
+ default:
+ /* not being traced... */
+ if ((p->p_flag & P_TRACED) == 0) {
+ error = EPERM;
+ goto fail;
+ }
+
+ /* not being traced by YOU */
+ if (p->p_pptr != td->td_proc) {
+ error = EBUSY;
+ goto fail;
+ }
+
+ /* not currently stopped */
+ if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) == 0 ||
+ p->p_suspcount != p->p_numthreads ||
+ (p->p_flag & P_WAITED) == 0) {
+ error = EBUSY;
+ goto fail;
+ }
+
+ if ((p->p_flag & P_STOPPED_TRACE) == 0) {
+ static int count = 0;
+ if (count++ == 0)
+ printf("P_STOPPED_TRACE not set.\n");
+ }
+
+ /* OK */
+ break;
+ }
+
+ /* Keep this process around until we finish this request. */
+ _PHOLD(p);
+
+#ifdef FIX_SSTEP
+ /*
+ * Single step fixup ala procfs
+ */
+ FIX_SSTEP(td2);
+#endif
+
+ /*
+ * Actually do the requests
+ */
+
+ td->td_retval[0] = 0;
+
+ switch (req) {
+ case PT_TRACE_ME:
+ /* set my trace flag and "owner" so it can read/write me */
+ p->p_flag |= P_TRACED;
+ if (p->p_flag & P_PPWAIT)
+ p->p_flag |= P_PPTRACE;
+ p->p_oppid = p->p_pptr->p_pid;
+ break;
+
+ case PT_ATTACH:
+ /* security check done above */
+ /*
+ * It would be nice if the tracing relationship was separate
+ * from the parent relationship but that would require
+ * another set of links in the proc struct or for "wait"
+ * to scan the entire proc table. To make life easier,
+ * we just re-parent the process we're trying to trace.
+ * The old parent is remembered so we can put things back
+ * on a "detach".
+ */
+ p->p_flag |= P_TRACED;
+ p->p_oppid = p->p_pptr->p_pid;
+ if (p->p_pptr != td->td_proc) {
+ proc_reparent(p, td->td_proc);
+ }
+ data = SIGSTOP;
+ goto sendsig; /* in PT_CONTINUE below */
+
+ case PT_CLEARSTEP:
+ error = ptrace_clear_single_step(td2);
+ break;
+
+ case PT_SETSTEP:
+ error = ptrace_single_step(td2);
+ break;
+
+ case PT_SUSPEND:
+ td2->td_dbgflags |= TDB_SUSPEND;
+ thread_lock(td2);
+ td2->td_flags |= TDF_NEEDSUSPCHK;
+ thread_unlock(td2);
+ break;
+
+ case PT_RESUME:
+ td2->td_dbgflags &= ~TDB_SUSPEND;
+ break;
+
+ case PT_FOLLOW_FORK:
+ if (data)
+ p->p_flag |= P_FOLLOWFORK;
+ else
+ p->p_flag &= ~P_FOLLOWFORK;
+ break;
+
+ case PT_STEP:
+ case PT_CONTINUE:
+ case PT_TO_SCE:
+ case PT_TO_SCX:
+ case PT_SYSCALL:
+ case PT_DETACH:
+ /* Zero means do not send any signal */
+ if (data < 0 || data > _SIG_MAXSIG) {
+ error = EINVAL;
+ break;
+ }
+
+ switch (req) {
+ case PT_STEP:
+ error = ptrace_single_step(td2);
+ if (error)
+ goto out;
+ break;
+ case PT_CONTINUE:
+ case PT_TO_SCE:
+ case PT_TO_SCX:
+ case PT_SYSCALL:
+ if (addr != (void *)1) {
+ error = ptrace_set_pc(td2,
+ (u_long)(uintfptr_t)addr);
+ if (error)
+ goto out;
+ }
+ switch (req) {
+ case PT_TO_SCE:
+ p->p_stops |= S_PT_SCE;
+ break;
+ case PT_TO_SCX:
+ p->p_stops |= S_PT_SCX;
+ break;
+ case PT_SYSCALL:
+ p->p_stops |= S_PT_SCE | S_PT_SCX;
+ break;
+ }
+ break;
+ case PT_DETACH:
+ /* reset process parent */
+ if (p->p_oppid != p->p_pptr->p_pid) {
+ struct proc *pp;
+
+ PROC_LOCK(p->p_pptr);
+ sigqueue_take(p->p_ksi);
+ PROC_UNLOCK(p->p_pptr);
+
+ PROC_UNLOCK(p);
+ pp = pfind(p->p_oppid);
+ if (pp == NULL)
+ pp = initproc;
+ else
+ PROC_UNLOCK(pp);
+ PROC_LOCK(p);
+ proc_reparent(p, pp);
+ if (pp == initproc)
+ p->p_sigparent = SIGCHLD;
+ }
+ p->p_oppid = 0;
+ p->p_flag &= ~(P_TRACED | P_WAITED | P_FOLLOWFORK);
+
+ /* should we send SIGCHLD? */
+ /* childproc_continued(p); */
+ break;
+ }
+
+ sendsig:
+ if (proctree_locked) {
+ sx_xunlock(&proctree_lock);
+ proctree_locked = 0;
+ }
+ p->p_xstat = data;
+ p->p_xthread = NULL;
+ if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) != 0) {
+ /* deliver or queue signal */
+ td2->td_dbgflags &= ~TDB_XSIG;
+ td2->td_xsig = data;
+
+ if (req == PT_DETACH) {
+ FOREACH_THREAD_IN_PROC(p, td3)
+ td3->td_dbgflags &= ~TDB_SUSPEND;
+ }
+		/*
+		 * Unsuspend all threads.  To keep a particular thread from
+		 * running, suspend it with PT_SUSPEND before continuing
+		 * the process.
+		 */
+ PROC_SLOCK(p);
+ p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG|P_WAITED);
+ thread_unsuspend(p);
+ PROC_SUNLOCK(p);
+ if (req == PT_ATTACH)
+ kern_psignal(p, data);
+ } else {
+ if (data)
+ kern_psignal(p, data);
+ }
+ break;
+
+ case PT_WRITE_I:
+ case PT_WRITE_D:
+ td2->td_dbgflags |= TDB_USERWR;
+ write = 1;
+ /* FALLTHROUGH */
+ case PT_READ_I:
+ case PT_READ_D:
+ PROC_UNLOCK(p);
+ tmp = 0;
+ /* write = 0 set above */
+ iov.iov_base = write ? (caddr_t)&data : (caddr_t)&tmp;
+ iov.iov_len = sizeof(int);
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = (off_t)(uintptr_t)addr;
+ uio.uio_resid = sizeof(int);
+ uio.uio_segflg = UIO_SYSSPACE; /* i.e.: the uap */
+ uio.uio_rw = write ? UIO_WRITE : UIO_READ;
+ uio.uio_td = td;
+ error = proc_rwmem(p, &uio);
+ if (uio.uio_resid != 0) {
+ /*
+ * XXX proc_rwmem() doesn't currently return ENOSPC,
+ * so I think write() can bogusly return 0.
+ * XXX what happens for short writes? We don't want
+ * to write partial data.
+ * XXX proc_rwmem() returns EPERM for other invalid
+ * addresses. Convert this to EINVAL. Does this
+ * clobber returns of EPERM for other reasons?
+ */
+ if (error == 0 || error == ENOSPC || error == EPERM)
+ error = EINVAL; /* EOF */
+ }
+ if (!write)
+ td->td_retval[0] = tmp;
+ PROC_LOCK(p);
+ break;
+
+ case PT_IO:
+#ifdef COMPAT_FREEBSD32
+ if (wrap32) {
+ piod32 = addr;
+ iov.iov_base = (void *)(uintptr_t)piod32->piod_addr;
+ iov.iov_len = piod32->piod_len;
+ uio.uio_offset = (off_t)(uintptr_t)piod32->piod_offs;
+ uio.uio_resid = piod32->piod_len;
+ } else
+#endif
+ {
+ piod = addr;
+ iov.iov_base = piod->piod_addr;
+ iov.iov_len = piod->piod_len;
+ uio.uio_offset = (off_t)(uintptr_t)piod->piod_offs;
+ uio.uio_resid = piod->piod_len;
+ }
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_segflg = UIO_USERSPACE;
+ uio.uio_td = td;
+#ifdef COMPAT_FREEBSD32
+ tmp = wrap32 ? piod32->piod_op : piod->piod_op;
+#else
+ tmp = piod->piod_op;
+#endif
+ switch (tmp) {
+ case PIOD_READ_D:
+ case PIOD_READ_I:
+ uio.uio_rw = UIO_READ;
+ break;
+ case PIOD_WRITE_D:
+ case PIOD_WRITE_I:
+ td2->td_dbgflags |= TDB_USERWR;
+ uio.uio_rw = UIO_WRITE;
+ break;
+ default:
+ error = EINVAL;
+ goto out;
+ }
+ PROC_UNLOCK(p);
+ error = proc_rwmem(p, &uio);
+#ifdef COMPAT_FREEBSD32
+ if (wrap32)
+ piod32->piod_len -= uio.uio_resid;
+ else
+#endif
+ piod->piod_len -= uio.uio_resid;
+ PROC_LOCK(p);
+ break;
+
+ case PT_KILL:
+ data = SIGKILL;
+ goto sendsig; /* in PT_CONTINUE above */
+
+ case PT_SETREGS:
+ td2->td_dbgflags |= TDB_USERWR;
+ error = PROC_WRITE(regs, td2, addr);
+ break;
+
+ case PT_GETREGS:
+ error = PROC_READ(regs, td2, addr);
+ break;
+
+ case PT_SETFPREGS:
+ td2->td_dbgflags |= TDB_USERWR;
+ error = PROC_WRITE(fpregs, td2, addr);
+ break;
+
+ case PT_GETFPREGS:
+ error = PROC_READ(fpregs, td2, addr);
+ break;
+
+ case PT_SETDBREGS:
+ td2->td_dbgflags |= TDB_USERWR;
+ error = PROC_WRITE(dbregs, td2, addr);
+ break;
+
+ case PT_GETDBREGS:
+ error = PROC_READ(dbregs, td2, addr);
+ break;
+
+ case PT_LWPINFO:
+ if (data <= 0 ||
+#ifdef COMPAT_FREEBSD32
+ (!wrap32 && data > sizeof(*pl)) ||
+ (wrap32 && data > sizeof(*pl32))) {
+#else
+ data > sizeof(*pl)) {
+#endif
+ error = EINVAL;
+ break;
+ }
+#ifdef COMPAT_FREEBSD32
+ if (wrap32) {
+ pl = &plr;
+ pl32 = addr;
+ } else
+#endif
+ pl = addr;
+ pl->pl_lwpid = td2->td_tid;
+ pl->pl_event = PL_EVENT_NONE;
+ pl->pl_flags = 0;
+ if (td2->td_dbgflags & TDB_XSIG) {
+ pl->pl_event = PL_EVENT_SIGNAL;
+ if (td2->td_dbgksi.ksi_signo != 0 &&
+#ifdef COMPAT_FREEBSD32
+ ((!wrap32 && data >= offsetof(struct ptrace_lwpinfo,
+ pl_siginfo) + sizeof(pl->pl_siginfo)) ||
+ (wrap32 && data >= offsetof(struct ptrace_lwpinfo32,
+ pl_siginfo) + sizeof(struct siginfo32)))
+#else
+ data >= offsetof(struct ptrace_lwpinfo, pl_siginfo)
+ + sizeof(pl->pl_siginfo)
+#endif
+ ){
+ pl->pl_flags |= PL_FLAG_SI;
+ pl->pl_siginfo = td2->td_dbgksi.ksi_info;
+ }
+ }
+ if ((pl->pl_flags & PL_FLAG_SI) == 0)
+ bzero(&pl->pl_siginfo, sizeof(pl->pl_siginfo));
+ if (td2->td_dbgflags & TDB_SCE)
+ pl->pl_flags |= PL_FLAG_SCE;
+ else if (td2->td_dbgflags & TDB_SCX)
+ pl->pl_flags |= PL_FLAG_SCX;
+ if (td2->td_dbgflags & TDB_EXEC)
+ pl->pl_flags |= PL_FLAG_EXEC;
+ if (td2->td_dbgflags & TDB_FORK) {
+ pl->pl_flags |= PL_FLAG_FORKED;
+ pl->pl_child_pid = td2->td_dbg_forked;
+ }
+ if (td2->td_dbgflags & TDB_CHILD)
+ pl->pl_flags |= PL_FLAG_CHILD;
+ pl->pl_sigmask = td2->td_sigmask;
+ pl->pl_siglist = td2->td_siglist;
+ strcpy(pl->pl_tdname, td2->td_name);
+#ifdef COMPAT_FREEBSD32
+ if (wrap32)
+ ptrace_lwpinfo_to32(pl, pl32);
+#endif
+ break;
+
+ case PT_GETNUMLWPS:
+ td->td_retval[0] = p->p_numthreads;
+ break;
+
+ case PT_GETLWPLIST:
+ if (data <= 0) {
+ error = EINVAL;
+ break;
+ }
+ num = imin(p->p_numthreads, data);
+ PROC_UNLOCK(p);
+ buf = malloc(num * sizeof(lwpid_t), M_TEMP, M_WAITOK);
+ tmp = 0;
+ PROC_LOCK(p);
+ FOREACH_THREAD_IN_PROC(p, td2) {
+ if (tmp >= num)
+ break;
+ buf[tmp++] = td2->td_tid;
+ }
+ PROC_UNLOCK(p);
+ error = copyout(buf, addr, tmp * sizeof(lwpid_t));
+ free(buf, M_TEMP);
+ if (!error)
+ td->td_retval[0] = tmp;
+ PROC_LOCK(p);
+ break;
+
+ case PT_VM_TIMESTAMP:
+ td->td_retval[0] = p->p_vmspace->vm_map.timestamp;
+ break;
+
+ case PT_VM_ENTRY:
+ PROC_UNLOCK(p);
+#ifdef COMPAT_FREEBSD32
+ if (wrap32)
+ error = ptrace_vm_entry32(td, p, addr);
+ else
+#endif
+ error = ptrace_vm_entry(td, p, addr);
+ PROC_LOCK(p);
+ break;
+
+ default:
+#ifdef __HAVE_PTRACE_MACHDEP
+ if (req >= PT_FIRSTMACH) {
+ PROC_UNLOCK(p);
+ error = cpu_ptrace(td2, req, addr, data);
+ PROC_LOCK(p);
+ } else
+#endif
+ /* Unknown request. */
+ error = EINVAL;
+ break;
+ }
+
+out:
+ /* Drop our hold on this process now that the request has completed. */
+ _PRELE(p);
+fail:
+ PROC_UNLOCK(p);
+ if (proctree_locked)
+ sx_xunlock(&proctree_lock);
+ return (error);
+}
+#undef PROC_READ
+#undef PROC_WRITE
+
+/*
+ * Stop a process because of a debugging event;
+ * stay stopped until p->p_step is cleared
+ * (cleared by PIOCCONT in procfs).
+ */
+void
+stopevent(struct proc *p, unsigned int event, unsigned int val)
+{
+
+ PROC_LOCK_ASSERT(p, MA_OWNED);
+ p->p_step = 1;
+ do {
+ p->p_xstat = val;
+ p->p_xthread = NULL;
+ p->p_stype = event; /* Which event caused the stop? */
+ wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */
+ msleep(&p->p_step, &p->p_mtx, PWAIT, "stopevent", 0);
+ } while (p->p_step);
+}
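
The request dispatch above is what a userland debugger reaches through ptrace(2). A minimal sketch of the attach/peek/detach sequence serviced by the PT_ATTACH, PT_READ_D and PT_DETACH cases (the pid and address are placeholders; error handling is trimmed):

#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <stdio.h>

/*
 * Sketch: attach to a target, wait for the SIGSTOP queued by PT_ATTACH,
 * read one word of its memory via PT_READ_D, then detach with data == 0
 * ("deliver no signal"), which lets the target resume.
 */
static int
peek_word(pid_t pid, caddr_t addr)
{
	int status, word;

	if (ptrace(PT_ATTACH, pid, NULL, 0) == -1)
		return (-1);
	waitpid(pid, &status, 0);	/* target stops with SIGSTOP */

	word = ptrace(PT_READ_D, pid, addr, 0);
	printf("word at %p: %#x\n", (void *)addr, word);

	return (ptrace(PT_DETACH, pid, NULL, 0));
}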
diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c
new file mode 100644
index 0000000..6a766af
--- /dev/null
+++ b/sys/kern/sys_socket.c
@@ -0,0 +1,297 @@
+/*-
+ * Copyright (c) 1982, 1986, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/sigio.h>
+#include <sys/signal.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/filio.h> /* XXX */
+#include <sys/sockio.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/ucred.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#include <security/mac/mac_framework.h>
+
+struct fileops socketops = {
+ .fo_read = soo_read,
+ .fo_write = soo_write,
+ .fo_truncate = soo_truncate,
+ .fo_ioctl = soo_ioctl,
+ .fo_poll = soo_poll,
+ .fo_kqfilter = soo_kqfilter,
+ .fo_stat = soo_stat,
+ .fo_close = soo_close,
+ .fo_chmod = invfo_chmod,
+ .fo_chown = invfo_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_flags = DFLAG_PASSABLE
+};
+
+/* ARGSUSED */
+int
+soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ struct socket *so = fp->f_data;
+ int error;
+
+#ifdef MAC
+ error = mac_socket_check_receive(active_cred, so);
+ if (error)
+ return (error);
+#endif
+ error = soreceive(so, 0, uio, 0, 0, 0);
+ return (error);
+}
+
+/* ARGSUSED */
+int
+soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ struct socket *so = fp->f_data;
+ int error;
+
+#ifdef MAC
+ error = mac_socket_check_send(active_cred, so);
+ if (error)
+ return (error);
+#endif
+ error = sosend(so, 0, uio, 0, 0, 0, uio->uio_td);
+ if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) {
+ PROC_LOCK(uio->uio_td->td_proc);
+ tdsignal(uio->uio_td, SIGPIPE);
+ PROC_UNLOCK(uio->uio_td->td_proc);
+ }
+ return (error);
+}
+
+int
+soo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+int
+soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct socket *so = fp->f_data;
+ int error = 0;
+
+ switch (cmd) {
+ case FIONBIO:
+ SOCK_LOCK(so);
+ if (*(int *)data)
+ so->so_state |= SS_NBIO;
+ else
+ so->so_state &= ~SS_NBIO;
+ SOCK_UNLOCK(so);
+ break;
+
+ case FIOASYNC:
+ /*
+ * XXXRW: This code separately acquires SOCK_LOCK(so) and
+ * SOCKBUF_LOCK(&so->so_rcv) even though they are the same
+ * mutex to avoid introducing the assumption that they are
+ * the same.
+ */
+ if (*(int *)data) {
+ SOCK_LOCK(so);
+ so->so_state |= SS_ASYNC;
+ SOCK_UNLOCK(so);
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_rcv.sb_flags |= SB_ASYNC;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_flags |= SB_ASYNC;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ } else {
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_ASYNC;
+ SOCK_UNLOCK(so);
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_rcv.sb_flags &= ~SB_ASYNC;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_flags &= ~SB_ASYNC;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ }
+ break;
+
+ case FIONREAD:
+ /* Unlocked read. */
+ *(int *)data = so->so_rcv.sb_cc;
+ break;
+
+ case FIONWRITE:
+ /* Unlocked read. */
+ *(int *)data = so->so_snd.sb_cc;
+ break;
+
+ case FIONSPACE:
+ if ((so->so_snd.sb_hiwat < so->so_snd.sb_cc) ||
+ (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt))
+ *(int *)data = 0;
+ else
+ *(int *)data = sbspace(&so->so_snd);
+ break;
+
+ case FIOSETOWN:
+ error = fsetown(*(int *)data, &so->so_sigio);
+ break;
+
+ case FIOGETOWN:
+ *(int *)data = fgetown(&so->so_sigio);
+ break;
+
+ case SIOCSPGRP:
+ error = fsetown(-(*(int *)data), &so->so_sigio);
+ break;
+
+ case SIOCGPGRP:
+ *(int *)data = -fgetown(&so->so_sigio);
+ break;
+
+ case SIOCATMARK:
+ /* Unlocked read. */
+ *(int *)data = (so->so_rcv.sb_state & SBS_RCVATMARK) != 0;
+ break;
+ default:
+ /*
+ * Interface/routing/protocol specific ioctls: interface and
+ * routing ioctls should have a different entry since a
+ * socket is unnecessary.
+ */
+ if (IOCGROUP(cmd) == 'i')
+ error = ifioctl(so, cmd, data, td);
+ else if (IOCGROUP(cmd) == 'r') {
+ CURVNET_SET(so->so_vnet);
+ error = rtioctl_fib(cmd, data, so->so_fibnum);
+ CURVNET_RESTORE();
+ } else {
+ CURVNET_SET(so->so_vnet);
+ error = ((*so->so_proto->pr_usrreqs->pru_control)
+ (so, cmd, data, 0, td));
+ CURVNET_RESTORE();
+ }
+ break;
+ }
+ return (error);
+}
+
+int
+soo_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct socket *so = fp->f_data;
+#ifdef MAC
+ int error;
+
+ error = mac_socket_check_poll(active_cred, so);
+ if (error)
+ return (error);
+#endif
+ return (sopoll(so, events, fp->f_cred, td));
+}
+
+int
+soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct socket *so = fp->f_data;
+#ifdef MAC
+ int error;
+#endif
+
+ bzero((caddr_t)ub, sizeof (*ub));
+ ub->st_mode = S_IFSOCK;
+#ifdef MAC
+ error = mac_socket_check_stat(active_cred, so);
+ if (error)
+ return (error);
+#endif
+ /*
+ * If SBS_CANTRCVMORE is set, but there's still data left in the
+ * receive buffer, the socket is still readable.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 ||
+ so->so_rcv.sb_cc != 0)
+ ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
+ ub->st_size = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ /* Unlocked read. */
+ if ((so->so_snd.sb_state & SBS_CANTSENDMORE) == 0)
+ ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
+ ub->st_uid = so->so_cred->cr_uid;
+ ub->st_gid = so->so_cred->cr_gid;
+ return (*so->so_proto->pr_usrreqs->pru_sense)(so, ub);
+}
+
+/*
+ * API socket close on file pointer. We call soclose() to close the socket
+ * (including initiating closing protocols). soclose() will sorele() the
+ * file reference but the actual socket will not go away until the socket's
+ * ref count hits 0.
+ */
+/* ARGSUSED */
+int
+soo_close(struct file *fp, struct thread *td)
+{
+ int error = 0;
+ struct socket *so;
+
+ so = fp->f_data;
+ fp->f_ops = &badfileops;
+ fp->f_data = NULL;
+
+ if (so)
+ error = soclose(so);
+ return (error);
+}
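
The FIONBIO and FIONREAD branches of soo_ioctl() above are what back the usual "non-blocking" and "bytes pending" ioctls on a socket descriptor. A minimal userland sketch (s is assumed to be an already-created socket; error handling is trimmed):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <stdio.h>

/*
 * Sketch: mark a socket non-blocking (soo_ioctl() sets SS_NBIO) and ask
 * how many bytes are sitting in its receive buffer (so_rcv.sb_cc).
 */
static int
socket_ioctl_demo(int s)
{
	int on = 1, pending;

	if (ioctl(s, FIONBIO, &on) == -1)
		return (-1);
	if (ioctl(s, FIONREAD, &pending) == -1)
		return (-1);
	printf("%d byte(s) waiting in the receive buffer\n", pending);
	return (0);
}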
diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c
new file mode 100644
index 0000000..f330879
--- /dev/null
+++ b/sys/kern/syscalls.c
@@ -0,0 +1,554 @@
+/*
+ * System call names.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * $FreeBSD$
+ * created from FreeBSD: head/sys/kern/syscalls.master 255219 2013-09-05 00:09:56Z pjd
+ */
+
+const char *syscallnames[] = {
+ "syscall", /* 0 = syscall */
+ "exit", /* 1 = exit */
+ "fork", /* 2 = fork */
+ "read", /* 3 = read */
+ "write", /* 4 = write */
+ "open", /* 5 = open */
+ "close", /* 6 = close */
+ "wait4", /* 7 = wait4 */
+ "compat.creat", /* 8 = old creat */
+ "link", /* 9 = link */
+ "unlink", /* 10 = unlink */
+ "obs_execv", /* 11 = obsolete execv */
+ "chdir", /* 12 = chdir */
+ "fchdir", /* 13 = fchdir */
+ "mknod", /* 14 = mknod */
+ "chmod", /* 15 = chmod */
+ "chown", /* 16 = chown */
+ "break", /* 17 = break */
+ "compat4.getfsstat", /* 18 = freebsd4 getfsstat */
+ "compat.lseek", /* 19 = old lseek */
+ "getpid", /* 20 = getpid */
+ "mount", /* 21 = mount */
+ "unmount", /* 22 = unmount */
+ "setuid", /* 23 = setuid */
+ "getuid", /* 24 = getuid */
+ "geteuid", /* 25 = geteuid */
+ "ptrace", /* 26 = ptrace */
+ "recvmsg", /* 27 = recvmsg */
+ "sendmsg", /* 28 = sendmsg */
+ "recvfrom", /* 29 = recvfrom */
+ "accept", /* 30 = accept */
+ "getpeername", /* 31 = getpeername */
+ "getsockname", /* 32 = getsockname */
+ "access", /* 33 = access */
+ "chflags", /* 34 = chflags */
+ "fchflags", /* 35 = fchflags */
+ "sync", /* 36 = sync */
+ "kill", /* 37 = kill */
+ "compat.stat", /* 38 = old stat */
+ "getppid", /* 39 = getppid */
+ "compat.lstat", /* 40 = old lstat */
+ "dup", /* 41 = dup */
+ "pipe", /* 42 = pipe */
+ "getegid", /* 43 = getegid */
+ "profil", /* 44 = profil */
+ "ktrace", /* 45 = ktrace */
+ "compat.sigaction", /* 46 = old sigaction */
+ "getgid", /* 47 = getgid */
+ "compat.sigprocmask", /* 48 = old sigprocmask */
+ "getlogin", /* 49 = getlogin */
+ "setlogin", /* 50 = setlogin */
+ "acct", /* 51 = acct */
+ "compat.sigpending", /* 52 = old sigpending */
+ "sigaltstack", /* 53 = sigaltstack */
+ "ioctl", /* 54 = ioctl */
+ "reboot", /* 55 = reboot */
+ "revoke", /* 56 = revoke */
+ "symlink", /* 57 = symlink */
+ "readlink", /* 58 = readlink */
+ "execve", /* 59 = execve */
+ "umask", /* 60 = umask */
+ "chroot", /* 61 = chroot */
+ "compat.fstat", /* 62 = old fstat */
+ "compat.getkerninfo", /* 63 = old getkerninfo */
+ "compat.getpagesize", /* 64 = old getpagesize */
+ "msync", /* 65 = msync */
+ "vfork", /* 66 = vfork */
+ "obs_vread", /* 67 = obsolete vread */
+ "obs_vwrite", /* 68 = obsolete vwrite */
+ "sbrk", /* 69 = sbrk */
+ "sstk", /* 70 = sstk */
+ "compat.mmap", /* 71 = old mmap */
+ "vadvise", /* 72 = vadvise */
+ "munmap", /* 73 = munmap */
+ "mprotect", /* 74 = mprotect */
+ "madvise", /* 75 = madvise */
+ "obs_vhangup", /* 76 = obsolete vhangup */
+ "obs_vlimit", /* 77 = obsolete vlimit */
+ "mincore", /* 78 = mincore */
+ "getgroups", /* 79 = getgroups */
+ "setgroups", /* 80 = setgroups */
+ "getpgrp", /* 81 = getpgrp */
+ "setpgid", /* 82 = setpgid */
+ "setitimer", /* 83 = setitimer */
+ "compat.wait", /* 84 = old wait */
+ "swapon", /* 85 = swapon */
+ "getitimer", /* 86 = getitimer */
+ "compat.gethostname", /* 87 = old gethostname */
+ "compat.sethostname", /* 88 = old sethostname */
+ "getdtablesize", /* 89 = getdtablesize */
+ "dup2", /* 90 = dup2 */
+ "#91", /* 91 = getdopt */
+ "fcntl", /* 92 = fcntl */
+ "select", /* 93 = select */
+ "#94", /* 94 = setdopt */
+ "fsync", /* 95 = fsync */
+ "setpriority", /* 96 = setpriority */
+ "socket", /* 97 = socket */
+ "connect", /* 98 = connect */
+ "compat.accept", /* 99 = old accept */
+ "getpriority", /* 100 = getpriority */
+ "compat.send", /* 101 = old send */
+ "compat.recv", /* 102 = old recv */
+ "compat.sigreturn", /* 103 = old sigreturn */
+ "bind", /* 104 = bind */
+ "setsockopt", /* 105 = setsockopt */
+ "listen", /* 106 = listen */
+ "obs_vtimes", /* 107 = obsolete vtimes */
+ "compat.sigvec", /* 108 = old sigvec */
+ "compat.sigblock", /* 109 = old sigblock */
+ "compat.sigsetmask", /* 110 = old sigsetmask */
+ "compat.sigsuspend", /* 111 = old sigsuspend */
+ "compat.sigstack", /* 112 = old sigstack */
+ "compat.recvmsg", /* 113 = old recvmsg */
+ "compat.sendmsg", /* 114 = old sendmsg */
+ "obs_vtrace", /* 115 = obsolete vtrace */
+ "gettimeofday", /* 116 = gettimeofday */
+ "getrusage", /* 117 = getrusage */
+ "getsockopt", /* 118 = getsockopt */
+ "#119", /* 119 = resuba */
+ "readv", /* 120 = readv */
+ "writev", /* 121 = writev */
+ "settimeofday", /* 122 = settimeofday */
+ "fchown", /* 123 = fchown */
+ "fchmod", /* 124 = fchmod */
+ "compat.recvfrom", /* 125 = old recvfrom */
+ "setreuid", /* 126 = setreuid */
+ "setregid", /* 127 = setregid */
+ "rename", /* 128 = rename */
+ "compat.truncate", /* 129 = old truncate */
+ "compat.ftruncate", /* 130 = old ftruncate */
+ "flock", /* 131 = flock */
+ "mkfifo", /* 132 = mkfifo */
+ "sendto", /* 133 = sendto */
+ "shutdown", /* 134 = shutdown */
+ "socketpair", /* 135 = socketpair */
+ "mkdir", /* 136 = mkdir */
+ "rmdir", /* 137 = rmdir */
+ "utimes", /* 138 = utimes */
+ "obs_4.2", /* 139 = obsolete 4.2 sigreturn */
+ "adjtime", /* 140 = adjtime */
+ "compat.getpeername", /* 141 = old getpeername */
+ "compat.gethostid", /* 142 = old gethostid */
+ "compat.sethostid", /* 143 = old sethostid */
+ "compat.getrlimit", /* 144 = old getrlimit */
+ "compat.setrlimit", /* 145 = old setrlimit */
+ "compat.killpg", /* 146 = old killpg */
+ "setsid", /* 147 = setsid */
+ "quotactl", /* 148 = quotactl */
+ "compat.quota", /* 149 = old quota */
+ "compat.getsockname", /* 150 = old getsockname */
+ "#151", /* 151 = sem_lock */
+ "#152", /* 152 = sem_wakeup */
+ "#153", /* 153 = asyncdaemon */
+ "nlm_syscall", /* 154 = nlm_syscall */
+ "nfssvc", /* 155 = nfssvc */
+ "compat.getdirentries", /* 156 = old getdirentries */
+ "compat4.statfs", /* 157 = freebsd4 statfs */
+ "compat4.fstatfs", /* 158 = freebsd4 fstatfs */
+ "#159", /* 159 = nosys */
+ "lgetfh", /* 160 = lgetfh */
+ "getfh", /* 161 = getfh */
+ "compat4.getdomainname", /* 162 = freebsd4 getdomainname */
+ "compat4.setdomainname", /* 163 = freebsd4 setdomainname */
+ "compat4.uname", /* 164 = freebsd4 uname */
+ "sysarch", /* 165 = sysarch */
+ "rtprio", /* 166 = rtprio */
+ "#167", /* 167 = nosys */
+ "#168", /* 168 = nosys */
+ "semsys", /* 169 = semsys */
+ "msgsys", /* 170 = msgsys */
+ "shmsys", /* 171 = shmsys */
+ "#172", /* 172 = nosys */
+ "freebsd6_pread", /* 173 = freebsd6_pread */
+ "freebsd6_pwrite", /* 174 = freebsd6_pwrite */
+ "setfib", /* 175 = setfib */
+ "ntp_adjtime", /* 176 = ntp_adjtime */
+ "#177", /* 177 = sfork */
+ "#178", /* 178 = getdescriptor */
+ "#179", /* 179 = setdescriptor */
+ "#180", /* 180 = nosys */
+ "setgid", /* 181 = setgid */
+ "setegid", /* 182 = setegid */
+ "seteuid", /* 183 = seteuid */
+ "#184", /* 184 = lfs_bmapv */
+ "#185", /* 185 = lfs_markv */
+ "#186", /* 186 = lfs_segclean */
+ "#187", /* 187 = lfs_segwait */
+ "stat", /* 188 = stat */
+ "fstat", /* 189 = fstat */
+ "lstat", /* 190 = lstat */
+ "pathconf", /* 191 = pathconf */
+ "fpathconf", /* 192 = fpathconf */
+ "#193", /* 193 = nosys */
+ "getrlimit", /* 194 = getrlimit */
+ "setrlimit", /* 195 = setrlimit */
+ "getdirentries", /* 196 = getdirentries */
+ "freebsd6_mmap", /* 197 = freebsd6_mmap */
+ "__syscall", /* 198 = __syscall */
+ "freebsd6_lseek", /* 199 = freebsd6_lseek */
+ "freebsd6_truncate", /* 200 = freebsd6_truncate */
+ "freebsd6_ftruncate", /* 201 = freebsd6_ftruncate */
+ "__sysctl", /* 202 = __sysctl */
+ "mlock", /* 203 = mlock */
+ "munlock", /* 204 = munlock */
+ "undelete", /* 205 = undelete */
+ "futimes", /* 206 = futimes */
+ "getpgid", /* 207 = getpgid */
+ "#208", /* 208 = newreboot */
+ "poll", /* 209 = poll */
+ "lkmnosys", /* 210 = lkmnosys */
+ "lkmnosys", /* 211 = lkmnosys */
+ "lkmnosys", /* 212 = lkmnosys */
+ "lkmnosys", /* 213 = lkmnosys */
+ "lkmnosys", /* 214 = lkmnosys */
+ "lkmnosys", /* 215 = lkmnosys */
+ "lkmnosys", /* 216 = lkmnosys */
+ "lkmnosys", /* 217 = lkmnosys */
+ "lkmnosys", /* 218 = lkmnosys */
+ "lkmnosys", /* 219 = lkmnosys */
+ "compat7.__semctl", /* 220 = freebsd7 __semctl */
+ "semget", /* 221 = semget */
+ "semop", /* 222 = semop */
+ "#223", /* 223 = semconfig */
+ "compat7.msgctl", /* 224 = freebsd7 msgctl */
+ "msgget", /* 225 = msgget */
+ "msgsnd", /* 226 = msgsnd */
+ "msgrcv", /* 227 = msgrcv */
+ "shmat", /* 228 = shmat */
+ "compat7.shmctl", /* 229 = freebsd7 shmctl */
+ "shmdt", /* 230 = shmdt */
+ "shmget", /* 231 = shmget */
+ "clock_gettime", /* 232 = clock_gettime */
+ "clock_settime", /* 233 = clock_settime */
+ "clock_getres", /* 234 = clock_getres */
+ "ktimer_create", /* 235 = ktimer_create */
+ "ktimer_delete", /* 236 = ktimer_delete */
+ "ktimer_settime", /* 237 = ktimer_settime */
+ "ktimer_gettime", /* 238 = ktimer_gettime */
+ "ktimer_getoverrun", /* 239 = ktimer_getoverrun */
+ "nanosleep", /* 240 = nanosleep */
+ "ffclock_getcounter", /* 241 = ffclock_getcounter */
+ "ffclock_setestimate", /* 242 = ffclock_setestimate */
+ "ffclock_getestimate", /* 243 = ffclock_getestimate */
+ "#244", /* 244 = nosys */
+ "#245", /* 245 = nosys */
+ "#246", /* 246 = nosys */
+ "clock_getcpuclockid2", /* 247 = clock_getcpuclockid2 */
+ "ntp_gettime", /* 248 = ntp_gettime */
+ "#249", /* 249 = nosys */
+ "minherit", /* 250 = minherit */
+ "rfork", /* 251 = rfork */
+ "openbsd_poll", /* 252 = openbsd_poll */
+ "issetugid", /* 253 = issetugid */
+ "lchown", /* 254 = lchown */
+ "aio_read", /* 255 = aio_read */
+ "aio_write", /* 256 = aio_write */
+ "lio_listio", /* 257 = lio_listio */
+ "#258", /* 258 = nosys */
+ "#259", /* 259 = nosys */
+ "#260", /* 260 = nosys */
+ "#261", /* 261 = nosys */
+ "#262", /* 262 = nosys */
+ "#263", /* 263 = nosys */
+ "#264", /* 264 = nosys */
+ "#265", /* 265 = nosys */
+ "#266", /* 266 = nosys */
+ "#267", /* 267 = nosys */
+ "#268", /* 268 = nosys */
+ "#269", /* 269 = nosys */
+ "#270", /* 270 = nosys */
+ "#271", /* 271 = nosys */
+ "getdents", /* 272 = getdents */
+ "#273", /* 273 = nosys */
+ "lchmod", /* 274 = lchmod */
+ "netbsd_lchown", /* 275 = netbsd_lchown */
+ "lutimes", /* 276 = lutimes */
+ "netbsd_msync", /* 277 = netbsd_msync */
+ "nstat", /* 278 = nstat */
+ "nfstat", /* 279 = nfstat */
+ "nlstat", /* 280 = nlstat */
+ "#281", /* 281 = nosys */
+ "#282", /* 282 = nosys */
+ "#283", /* 283 = nosys */
+ "#284", /* 284 = nosys */
+ "#285", /* 285 = nosys */
+ "#286", /* 286 = nosys */
+ "#287", /* 287 = nosys */
+ "#288", /* 288 = nosys */
+ "preadv", /* 289 = preadv */
+ "pwritev", /* 290 = pwritev */
+ "#291", /* 291 = nosys */
+ "#292", /* 292 = nosys */
+ "#293", /* 293 = nosys */
+ "#294", /* 294 = nosys */
+ "#295", /* 295 = nosys */
+ "#296", /* 296 = nosys */
+ "compat4.fhstatfs", /* 297 = freebsd4 fhstatfs */
+ "fhopen", /* 298 = fhopen */
+ "fhstat", /* 299 = fhstat */
+ "modnext", /* 300 = modnext */
+ "modstat", /* 301 = modstat */
+ "modfnext", /* 302 = modfnext */
+ "modfind", /* 303 = modfind */
+ "kldload", /* 304 = kldload */
+ "kldunload", /* 305 = kldunload */
+ "kldfind", /* 306 = kldfind */
+ "kldnext", /* 307 = kldnext */
+ "kldstat", /* 308 = kldstat */
+ "kldfirstmod", /* 309 = kldfirstmod */
+ "getsid", /* 310 = getsid */
+ "setresuid", /* 311 = setresuid */
+ "setresgid", /* 312 = setresgid */
+ "obs_signanosleep", /* 313 = obsolete signanosleep */
+ "aio_return", /* 314 = aio_return */
+ "aio_suspend", /* 315 = aio_suspend */
+ "aio_cancel", /* 316 = aio_cancel */
+ "aio_error", /* 317 = aio_error */
+ "oaio_read", /* 318 = oaio_read */
+ "oaio_write", /* 319 = oaio_write */
+ "olio_listio", /* 320 = olio_listio */
+ "yield", /* 321 = yield */
+ "obs_thr_sleep", /* 322 = obsolete thr_sleep */
+ "obs_thr_wakeup", /* 323 = obsolete thr_wakeup */
+ "mlockall", /* 324 = mlockall */
+ "munlockall", /* 325 = munlockall */
+ "__getcwd", /* 326 = __getcwd */
+ "sched_setparam", /* 327 = sched_setparam */
+ "sched_getparam", /* 328 = sched_getparam */
+ "sched_setscheduler", /* 329 = sched_setscheduler */
+ "sched_getscheduler", /* 330 = sched_getscheduler */
+ "sched_yield", /* 331 = sched_yield */
+ "sched_get_priority_max", /* 332 = sched_get_priority_max */
+ "sched_get_priority_min", /* 333 = sched_get_priority_min */
+ "sched_rr_get_interval", /* 334 = sched_rr_get_interval */
+ "utrace", /* 335 = utrace */
+ "compat4.sendfile", /* 336 = freebsd4 sendfile */
+ "kldsym", /* 337 = kldsym */
+ "jail", /* 338 = jail */
+ "nnpfs_syscall", /* 339 = nnpfs_syscall */
+ "sigprocmask", /* 340 = sigprocmask */
+ "sigsuspend", /* 341 = sigsuspend */
+ "compat4.sigaction", /* 342 = freebsd4 sigaction */
+ "sigpending", /* 343 = sigpending */
+ "compat4.sigreturn", /* 344 = freebsd4 sigreturn */
+ "sigtimedwait", /* 345 = sigtimedwait */
+ "sigwaitinfo", /* 346 = sigwaitinfo */
+ "__acl_get_file", /* 347 = __acl_get_file */
+ "__acl_set_file", /* 348 = __acl_set_file */
+ "__acl_get_fd", /* 349 = __acl_get_fd */
+ "__acl_set_fd", /* 350 = __acl_set_fd */
+ "__acl_delete_file", /* 351 = __acl_delete_file */
+ "__acl_delete_fd", /* 352 = __acl_delete_fd */
+ "__acl_aclcheck_file", /* 353 = __acl_aclcheck_file */
+ "__acl_aclcheck_fd", /* 354 = __acl_aclcheck_fd */
+ "extattrctl", /* 355 = extattrctl */
+ "extattr_set_file", /* 356 = extattr_set_file */
+ "extattr_get_file", /* 357 = extattr_get_file */
+ "extattr_delete_file", /* 358 = extattr_delete_file */
+ "aio_waitcomplete", /* 359 = aio_waitcomplete */
+ "getresuid", /* 360 = getresuid */
+ "getresgid", /* 361 = getresgid */
+ "kqueue", /* 362 = kqueue */
+ "kevent", /* 363 = kevent */
+ "#364", /* 364 = __cap_get_proc */
+ "#365", /* 365 = __cap_set_proc */
+ "#366", /* 366 = __cap_get_fd */
+ "#367", /* 367 = __cap_get_file */
+ "#368", /* 368 = __cap_set_fd */
+ "#369", /* 369 = __cap_set_file */
+ "#370", /* 370 = nosys */
+ "extattr_set_fd", /* 371 = extattr_set_fd */
+ "extattr_get_fd", /* 372 = extattr_get_fd */
+ "extattr_delete_fd", /* 373 = extattr_delete_fd */
+ "__setugid", /* 374 = __setugid */
+ "#375", /* 375 = nfsclnt */
+ "eaccess", /* 376 = eaccess */
+ "afs3_syscall", /* 377 = afs3_syscall */
+ "nmount", /* 378 = nmount */
+ "#379", /* 379 = kse_exit */
+ "#380", /* 380 = kse_wakeup */
+ "#381", /* 381 = kse_create */
+ "#382", /* 382 = kse_thr_interrupt */
+ "#383", /* 383 = kse_release */
+ "__mac_get_proc", /* 384 = __mac_get_proc */
+ "__mac_set_proc", /* 385 = __mac_set_proc */
+ "__mac_get_fd", /* 386 = __mac_get_fd */
+ "__mac_get_file", /* 387 = __mac_get_file */
+ "__mac_set_fd", /* 388 = __mac_set_fd */
+ "__mac_set_file", /* 389 = __mac_set_file */
+ "kenv", /* 390 = kenv */
+ "lchflags", /* 391 = lchflags */
+ "uuidgen", /* 392 = uuidgen */
+ "sendfile", /* 393 = sendfile */
+ "mac_syscall", /* 394 = mac_syscall */
+ "getfsstat", /* 395 = getfsstat */
+ "statfs", /* 396 = statfs */
+ "fstatfs", /* 397 = fstatfs */
+ "fhstatfs", /* 398 = fhstatfs */
+ "#399", /* 399 = nosys */
+ "ksem_close", /* 400 = ksem_close */
+ "ksem_post", /* 401 = ksem_post */
+ "ksem_wait", /* 402 = ksem_wait */
+ "ksem_trywait", /* 403 = ksem_trywait */
+ "ksem_init", /* 404 = ksem_init */
+ "ksem_open", /* 405 = ksem_open */
+ "ksem_unlink", /* 406 = ksem_unlink */
+ "ksem_getvalue", /* 407 = ksem_getvalue */
+ "ksem_destroy", /* 408 = ksem_destroy */
+ "__mac_get_pid", /* 409 = __mac_get_pid */
+ "__mac_get_link", /* 410 = __mac_get_link */
+ "__mac_set_link", /* 411 = __mac_set_link */
+ "extattr_set_link", /* 412 = extattr_set_link */
+ "extattr_get_link", /* 413 = extattr_get_link */
+ "extattr_delete_link", /* 414 = extattr_delete_link */
+ "__mac_execve", /* 415 = __mac_execve */
+ "sigaction", /* 416 = sigaction */
+ "sigreturn", /* 417 = sigreturn */
+ "#418", /* 418 = __xstat */
+ "#419", /* 419 = __xfstat */
+ "#420", /* 420 = __xlstat */
+ "getcontext", /* 421 = getcontext */
+ "setcontext", /* 422 = setcontext */
+ "swapcontext", /* 423 = swapcontext */
+ "swapoff", /* 424 = swapoff */
+ "__acl_get_link", /* 425 = __acl_get_link */
+ "__acl_set_link", /* 426 = __acl_set_link */
+ "__acl_delete_link", /* 427 = __acl_delete_link */
+ "__acl_aclcheck_link", /* 428 = __acl_aclcheck_link */
+ "sigwait", /* 429 = sigwait */
+ "thr_create", /* 430 = thr_create */
+ "thr_exit", /* 431 = thr_exit */
+ "thr_self", /* 432 = thr_self */
+ "thr_kill", /* 433 = thr_kill */
+ "_umtx_lock", /* 434 = _umtx_lock */
+ "_umtx_unlock", /* 435 = _umtx_unlock */
+ "jail_attach", /* 436 = jail_attach */
+ "extattr_list_fd", /* 437 = extattr_list_fd */
+ "extattr_list_file", /* 438 = extattr_list_file */
+ "extattr_list_link", /* 439 = extattr_list_link */
+ "#440", /* 440 = kse_switchin */
+ "ksem_timedwait", /* 441 = ksem_timedwait */
+ "thr_suspend", /* 442 = thr_suspend */
+ "thr_wake", /* 443 = thr_wake */
+ "kldunloadf", /* 444 = kldunloadf */
+ "audit", /* 445 = audit */
+ "auditon", /* 446 = auditon */
+ "getauid", /* 447 = getauid */
+ "setauid", /* 448 = setauid */
+ "getaudit", /* 449 = getaudit */
+ "setaudit", /* 450 = setaudit */
+ "getaudit_addr", /* 451 = getaudit_addr */
+ "setaudit_addr", /* 452 = setaudit_addr */
+ "auditctl", /* 453 = auditctl */
+ "_umtx_op", /* 454 = _umtx_op */
+ "thr_new", /* 455 = thr_new */
+ "sigqueue", /* 456 = sigqueue */
+ "kmq_open", /* 457 = kmq_open */
+ "kmq_setattr", /* 458 = kmq_setattr */
+ "kmq_timedreceive", /* 459 = kmq_timedreceive */
+ "kmq_timedsend", /* 460 = kmq_timedsend */
+ "kmq_notify", /* 461 = kmq_notify */
+ "kmq_unlink", /* 462 = kmq_unlink */
+ "abort2", /* 463 = abort2 */
+ "thr_set_name", /* 464 = thr_set_name */
+ "aio_fsync", /* 465 = aio_fsync */
+ "rtprio_thread", /* 466 = rtprio_thread */
+ "#467", /* 467 = nosys */
+ "#468", /* 468 = nosys */
+ "#469", /* 469 = __getpath_fromfd */
+ "#470", /* 470 = __getpath_fromaddr */
+ "sctp_peeloff", /* 471 = sctp_peeloff */
+ "sctp_generic_sendmsg", /* 472 = sctp_generic_sendmsg */
+ "sctp_generic_sendmsg_iov", /* 473 = sctp_generic_sendmsg_iov */
+ "sctp_generic_recvmsg", /* 474 = sctp_generic_recvmsg */
+ "pread", /* 475 = pread */
+ "pwrite", /* 476 = pwrite */
+ "mmap", /* 477 = mmap */
+ "lseek", /* 478 = lseek */
+ "truncate", /* 479 = truncate */
+ "ftruncate", /* 480 = ftruncate */
+ "thr_kill2", /* 481 = thr_kill2 */
+ "shm_open", /* 482 = shm_open */
+ "shm_unlink", /* 483 = shm_unlink */
+ "cpuset", /* 484 = cpuset */
+ "cpuset_setid", /* 485 = cpuset_setid */
+ "cpuset_getid", /* 486 = cpuset_getid */
+ "cpuset_getaffinity", /* 487 = cpuset_getaffinity */
+ "cpuset_setaffinity", /* 488 = cpuset_setaffinity */
+ "faccessat", /* 489 = faccessat */
+ "fchmodat", /* 490 = fchmodat */
+ "fchownat", /* 491 = fchownat */
+ "fexecve", /* 492 = fexecve */
+ "fstatat", /* 493 = fstatat */
+ "futimesat", /* 494 = futimesat */
+ "linkat", /* 495 = linkat */
+ "mkdirat", /* 496 = mkdirat */
+ "mkfifoat", /* 497 = mkfifoat */
+ "mknodat", /* 498 = mknodat */
+ "openat", /* 499 = openat */
+ "readlinkat", /* 500 = readlinkat */
+ "renameat", /* 501 = renameat */
+ "symlinkat", /* 502 = symlinkat */
+ "unlinkat", /* 503 = unlinkat */
+ "posix_openpt", /* 504 = posix_openpt */
+ "gssd_syscall", /* 505 = gssd_syscall */
+ "jail_get", /* 506 = jail_get */
+ "jail_set", /* 507 = jail_set */
+ "jail_remove", /* 508 = jail_remove */
+ "closefrom", /* 509 = closefrom */
+ "__semctl", /* 510 = __semctl */
+ "msgctl", /* 511 = msgctl */
+ "shmctl", /* 512 = shmctl */
+ "lpathconf", /* 513 = lpathconf */
+ "obs_cap_new", /* 514 = obsolete cap_new */
+ "__cap_rights_get", /* 515 = __cap_rights_get */
+ "cap_enter", /* 516 = cap_enter */
+ "cap_getmode", /* 517 = cap_getmode */
+ "pdfork", /* 518 = pdfork */
+ "pdkill", /* 519 = pdkill */
+ "pdgetpid", /* 520 = pdgetpid */
+ "#521", /* 521 = pdwait4 */
+ "pselect", /* 522 = pselect */
+ "getloginclass", /* 523 = getloginclass */
+ "setloginclass", /* 524 = setloginclass */
+ "rctl_get_racct", /* 525 = rctl_get_racct */
+ "rctl_get_rules", /* 526 = rctl_get_rules */
+ "rctl_get_limits", /* 527 = rctl_get_limits */
+ "rctl_add_rule", /* 528 = rctl_add_rule */
+ "rctl_remove_rule", /* 529 = rctl_remove_rule */
+ "posix_fallocate", /* 530 = posix_fallocate */
+ "posix_fadvise", /* 531 = posix_fadvise */
+ "wait6", /* 532 = wait6 */
+ "cap_rights_limit", /* 533 = cap_rights_limit */
+ "cap_ioctls_limit", /* 534 = cap_ioctls_limit */
+ "cap_ioctls_get", /* 535 = cap_ioctls_get */
+ "cap_fcntls_limit", /* 536 = cap_fcntls_limit */
+ "cap_fcntls_get", /* 537 = cap_fcntls_get */
+ "bindat", /* 538 = bindat */
+ "connectat", /* 539 = connectat */
+ "chflagsat", /* 540 = chflagsat */
+ "accept4", /* 541 = accept4 */
+ "pipe2", /* 542 = pipe2 */
+ "aio_mlock", /* 543 = aio_mlock */
+};
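
syscallnames[] is a plain number-to-name table; in-kernel tracing code typically indexes it by syscall number. A hedged sketch of the usual bounds-checked lookup, where SYS_MAXSYSCALL comes from the generated sys/syscall.h:

#include <sys/types.h>
#include <sys/syscall.h>

extern const char *syscallnames[];

/*
 * Sketch: map a syscall number to the name recorded in the generated
 * table, falling back to a placeholder for out-of-range codes.
 */
static const char *
syscall_name(u_int code)
{
	return (code < SYS_MAXSYSCALL ? syscallnames[code] : "unknown");
}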
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
new file mode 100644
index 0000000..e19e310
--- /dev/null
+++ b/sys/kern/syscalls.master
@@ -0,0 +1,982 @@
+ $FreeBSD$
+; from: @(#)syscalls.master 8.2 (Berkeley) 1/13/94
+;
+; System call name/number master file.
+; Processed to create init_sysent.c, syscalls.c and syscall.h.
+
+; Columns: number audit type name alt{name,tag,rtyp}/comments
+; number system call number, must be in order
+; audit the audit event associated with the system call
+; A value of AUE_NULL means no auditing, but it also means that
+; there is no audit event for the call at this time. For the
+; case where the event exists, but we don't want auditing, the
+; event should be #defined to AUE_NULL in audit_kevents.h.
+; type one of STD, OBSOL, UNIMPL, COMPAT, COMPAT4, COMPAT6,
+; COMPAT7, NODEF, NOARGS, NOPROTO, NOSTD
+; The COMPAT* options may be combined with one or more NO*
+; options separated by '|' with no spaces (e.g. COMPAT|NOARGS)
+; name	pseudo-prototype of syscall routine
+; If one of the following alts is different, then all appear:
+; altname name of system call if different
+; alttag name of args struct tag if different from [o]`name'"_args"
+; altrtyp return type if not int (bogus - syscalls always return int)
+; for UNIMPL/OBSOL, name continues with comments
+
+; types:
+; STD always included
+; COMPAT included on COMPAT #ifdef
+; COMPAT4 included on COMPAT4 #ifdef (FreeBSD 4 compat)
+; COMPAT6 included on COMPAT6 #ifdef (FreeBSD 6 compat)
+; COMPAT7 included on COMPAT7 #ifdef (FreeBSD 7 compat)
+; OBSOL obsolete, not included in system, only specifies name
+; UNIMPL not implemented, placeholder only
+; NOSTD	implemented but as an lkm that can be statically
+; compiled in; sysent entry will be filled with lkmressys
+; so the SYSCALL_MODULE macro works
+; NOARGS same as STD except do not create structure in sys/sysproto.h
+; NODEF same as STD except only have the entry in the syscall table
+; added. Meaning - do not create structure or function
+; prototype in sys/sysproto.h
+; NOPROTO same as STD except do not create structure or
+; function prototype in sys/sysproto.h. Does add a
+; definition to syscall.h besides adding a sysent.
+; NOTSTATIC syscall is loadable
+;
+; Please copy any additions and changes to the following compatibility tables:
+; sys/compat/freebsd32/syscalls.master
+
+; #ifdef's, etc. may be included, and are copied to the output files.
+
+#include <sys/param.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+
+; Reserved/unimplemented system calls in the range 0-150 inclusive
+; are reserved for use in future Berkeley releases.
+; Additional system calls implemented in vendor and other
+; redistributions should be placed in the reserved range at the end
+; of the current calls.
+
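+; As an illustration: an STD entry below (e.g. the read(2) line) is expanded
+; by makesyscalls.sh into, roughly, a sys_read() prototype and a
+; "struct read_args" definition in sys/sysproto.h, a sysent[] slot in
+; init_sysent.c, the "read" string in syscalls.c and the SYS_read number
+; in syscall.h.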
+0 AUE_NULL STD { int nosys(void); } syscall nosys_args int
+1 AUE_EXIT STD { void sys_exit(int rval); } exit \
+ sys_exit_args void
+2 AUE_FORK STD { int fork(void); }
+3 AUE_NULL STD { ssize_t read(int fd, void *buf, \
+ size_t nbyte); }
+4 AUE_NULL STD { ssize_t write(int fd, const void *buf, \
+ size_t nbyte); }
+5 AUE_OPEN_RWTC STD { int open(char *path, int flags, int mode); }
+; XXX should be { int open(const char *path, int flags, ...); }
+; but we're not ready for `const' or varargs.
+; XXX man page says `mode_t mode'.
+6 AUE_CLOSE STD { int close(int fd); }
+7 AUE_WAIT4 STD { int wait4(int pid, int *status, \
+ int options, struct rusage *rusage); }
+8 AUE_CREAT COMPAT { int creat(char *path, int mode); }
+9 AUE_LINK STD { int link(char *path, char *link); }
+10 AUE_UNLINK STD { int unlink(char *path); }
+11 AUE_NULL OBSOL execv
+12 AUE_CHDIR STD { int chdir(char *path); }
+13 AUE_FCHDIR STD { int fchdir(int fd); }
+14 AUE_MKNOD STD { int mknod(char *path, int mode, int dev); }
+15 AUE_CHMOD STD { int chmod(char *path, int mode); }
+16 AUE_CHOWN STD { int chown(char *path, int uid, int gid); }
+17 AUE_NULL STD { int obreak(char *nsize); } break \
+ obreak_args int
+18 AUE_GETFSSTAT COMPAT4 { int getfsstat(struct ostatfs *buf, \
+ long bufsize, int flags); }
+19 AUE_LSEEK COMPAT { long lseek(int fd, long offset, \
+ int whence); }
+20 AUE_GETPID STD { pid_t getpid(void); }
+21 AUE_MOUNT STD { int mount(char *type, char *path, \
+ int flags, caddr_t data); }
+; XXX `path' should have type `const char *' but we're not ready for that.
+22 AUE_UMOUNT STD { int unmount(char *path, int flags); }
+23 AUE_SETUID STD { int setuid(uid_t uid); }
+24 AUE_GETUID STD { uid_t getuid(void); }
+25 AUE_GETEUID STD { uid_t geteuid(void); }
+26 AUE_PTRACE STD { int ptrace(int req, pid_t pid, \
+ caddr_t addr, int data); }
+27 AUE_RECVMSG STD { int recvmsg(int s, struct msghdr *msg, \
+ int flags); }
+28 AUE_SENDMSG STD { int sendmsg(int s, struct msghdr *msg, \
+ int flags); }
+29 AUE_RECVFROM STD { int recvfrom(int s, caddr_t buf, \
+ size_t len, int flags, \
+ struct sockaddr * __restrict from, \
+ __socklen_t * __restrict fromlenaddr); }
+30 AUE_ACCEPT STD { int accept(int s, \
+ struct sockaddr * __restrict name, \
+ __socklen_t * __restrict anamelen); }
+31 AUE_GETPEERNAME STD { int getpeername(int fdes, \
+ struct sockaddr * __restrict asa, \
+ __socklen_t * __restrict alen); }
+32 AUE_GETSOCKNAME STD { int getsockname(int fdes, \
+ struct sockaddr * __restrict asa, \
+ __socklen_t * __restrict alen); }
+33 AUE_ACCESS STD { int access(char *path, int amode); }
+34 AUE_CHFLAGS STD { int chflags(const char *path, u_long flags); }
+35 AUE_FCHFLAGS STD { int fchflags(int fd, u_long flags); }
+36 AUE_SYNC STD { int sync(void); }
+37 AUE_KILL STD { int kill(int pid, int signum); }
+38 AUE_STAT COMPAT { int stat(char *path, struct ostat *ub); }
+39 AUE_GETPPID STD { pid_t getppid(void); }
+40 AUE_LSTAT COMPAT { int lstat(char *path, struct ostat *ub); }
+41 AUE_DUP STD { int dup(u_int fd); }
+42 AUE_PIPE STD { int pipe(void); }
+43 AUE_GETEGID STD { gid_t getegid(void); }
+44 AUE_PROFILE STD { int profil(caddr_t samples, size_t size, \
+ size_t offset, u_int scale); }
+45 AUE_KTRACE STD { int ktrace(const char *fname, int ops, \
+ int facs, int pid); }
+46 AUE_SIGACTION COMPAT { int sigaction(int signum, \
+ struct osigaction *nsa, \
+ struct osigaction *osa); }
+47 AUE_GETGID STD { gid_t getgid(void); }
+48 AUE_SIGPROCMASK COMPAT { int sigprocmask(int how, osigset_t mask); }
+; XXX note nonstandard (bogus) calling convention - the libc stub passes
+; us the mask, not a pointer to it, and we return the old mask as the
+; (int) return value.
+49 AUE_GETLOGIN STD { int getlogin(char *namebuf, u_int \
+ namelen); }
+50 AUE_SETLOGIN STD { int setlogin(char *namebuf); }
+51 AUE_ACCT STD { int acct(char *path); }
+52 AUE_SIGPENDING COMPAT { int sigpending(void); }
+53 AUE_SIGALTSTACK STD { int sigaltstack(stack_t *ss, \
+ stack_t *oss); }
+54 AUE_IOCTL STD { int ioctl(int fd, u_long com, \
+ caddr_t data); }
+55 AUE_REBOOT STD { int reboot(int opt); }
+56 AUE_REVOKE STD { int revoke(char *path); }
+57 AUE_SYMLINK STD { int symlink(char *path, char *link); }
+58 AUE_READLINK STD { ssize_t readlink(char *path, char *buf, \
+ size_t count); }
+59 AUE_EXECVE STD { int execve(char *fname, char **argv, \
+ char **envv); }
+60 AUE_UMASK STD { int umask(int newmask); } umask umask_args \
+ int
+61 AUE_CHROOT STD { int chroot(char *path); }
+62 AUE_FSTAT COMPAT { int fstat(int fd, struct ostat *sb); }
+63 AUE_NULL COMPAT { int getkerninfo(int op, char *where, \
+ size_t *size, int arg); } getkerninfo \
+ getkerninfo_args int
+64 AUE_NULL COMPAT { int getpagesize(void); } getpagesize \
+ getpagesize_args int
+65 AUE_MSYNC STD { int msync(void *addr, size_t len, \
+ int flags); }
+66 AUE_VFORK STD { int vfork(void); }
+67 AUE_NULL OBSOL vread
+68 AUE_NULL OBSOL vwrite
+69 AUE_SBRK STD { int sbrk(int incr); }
+70 AUE_SSTK STD { int sstk(int incr); }
+71 AUE_MMAP COMPAT { int mmap(void *addr, int len, int prot, \
+ int flags, int fd, long pos); }
+72 AUE_O_VADVISE STD { int ovadvise(int anom); } vadvise \
+ ovadvise_args int
+73 AUE_MUNMAP STD { int munmap(void *addr, size_t len); }
+74 AUE_MPROTECT STD { int mprotect(const void *addr, size_t len, \
+ int prot); }
+75 AUE_MADVISE STD { int madvise(void *addr, size_t len, \
+ int behav); }
+76 AUE_NULL OBSOL vhangup
+77 AUE_NULL OBSOL vlimit
+78 AUE_MINCORE STD { int mincore(const void *addr, size_t len, \
+ char *vec); }
+79 AUE_GETGROUPS STD { int getgroups(u_int gidsetsize, \
+ gid_t *gidset); }
+80 AUE_SETGROUPS STD { int setgroups(u_int gidsetsize, \
+ gid_t *gidset); }
+81 AUE_GETPGRP STD { int getpgrp(void); }
+82 AUE_SETPGRP STD { int setpgid(int pid, int pgid); }
+83 AUE_SETITIMER STD { int setitimer(u_int which, struct \
+ itimerval *itv, struct itimerval *oitv); }
+84 AUE_WAIT4 COMPAT { int wait(void); }
+85 AUE_SWAPON STD { int swapon(char *name); }
+86 AUE_GETITIMER STD { int getitimer(u_int which, \
+ struct itimerval *itv); }
+87 AUE_SYSCTL COMPAT { int gethostname(char *hostname, \
+ u_int len); } gethostname \
+ gethostname_args int
+88 AUE_SYSCTL COMPAT { int sethostname(char *hostname, \
+ u_int len); } sethostname \
+ sethostname_args int
+89 AUE_GETDTABLESIZE STD { int getdtablesize(void); }
+90 AUE_DUP2 STD { int dup2(u_int from, u_int to); }
+91 AUE_NULL UNIMPL getdopt
+92 AUE_FCNTL STD { int fcntl(int fd, int cmd, long arg); }
+; XXX should be { int fcntl(int fd, int cmd, ...); }
+; but we're not ready for varargs.
+93 AUE_SELECT STD { int select(int nd, fd_set *in, fd_set *ou, \
+ fd_set *ex, struct timeval *tv); }
+94 AUE_NULL UNIMPL setdopt
+95 AUE_FSYNC STD { int fsync(int fd); }
+96 AUE_SETPRIORITY STD { int setpriority(int which, int who, \
+ int prio); }
+97 AUE_SOCKET STD { int socket(int domain, int type, \
+ int protocol); }
+98 AUE_CONNECT STD { int connect(int s, caddr_t name, \
+ int namelen); }
+99 AUE_ACCEPT COMPAT|NOARGS { int accept(int s, caddr_t name, \
+ int *anamelen); } accept accept_args int
+100 AUE_GETPRIORITY STD { int getpriority(int which, int who); }
+101 AUE_SEND COMPAT { int send(int s, caddr_t buf, int len, \
+ int flags); }
+102 AUE_RECV COMPAT { int recv(int s, caddr_t buf, int len, \
+ int flags); }
+103 AUE_SIGRETURN COMPAT { int sigreturn( \
+ struct osigcontext *sigcntxp); }
+104 AUE_BIND STD { int bind(int s, caddr_t name, \
+ int namelen); }
+105 AUE_SETSOCKOPT STD { int setsockopt(int s, int level, int name, \
+ caddr_t val, int valsize); }
+106 AUE_LISTEN STD { int listen(int s, int backlog); }
+107 AUE_NULL OBSOL vtimes
+108 AUE_NULL COMPAT { int sigvec(int signum, struct sigvec *nsv, \
+ struct sigvec *osv); }
+109 AUE_NULL COMPAT { int sigblock(int mask); }
+110 AUE_NULL COMPAT { int sigsetmask(int mask); }
+111 AUE_NULL COMPAT { int sigsuspend(osigset_t mask); }
+; XXX note nonstandard (bogus) calling convention - the libc stub passes
+; us the mask, not a pointer to it.
+112 AUE_NULL COMPAT { int sigstack(struct sigstack *nss, \
+ struct sigstack *oss); }
+113 AUE_RECVMSG COMPAT { int recvmsg(int s, struct omsghdr *msg, \
+ int flags); }
+114 AUE_SENDMSG COMPAT { int sendmsg(int s, caddr_t msg, \
+ int flags); }
+115 AUE_NULL OBSOL vtrace
+116 AUE_GETTIMEOFDAY STD { int gettimeofday(struct timeval *tp, \
+ struct timezone *tzp); }
+117 AUE_GETRUSAGE STD { int getrusage(int who, \
+ struct rusage *rusage); }
+118 AUE_GETSOCKOPT STD { int getsockopt(int s, int level, int name, \
+ caddr_t val, int *avalsize); }
+119 AUE_NULL UNIMPL resuba (BSD/OS 2.x)
+120 AUE_READV STD { int readv(int fd, struct iovec *iovp, \
+ u_int iovcnt); }
+121 AUE_WRITEV STD { int writev(int fd, struct iovec *iovp, \
+ u_int iovcnt); }
+122 AUE_SETTIMEOFDAY STD { int settimeofday(struct timeval *tv, \
+ struct timezone *tzp); }
+123 AUE_FCHOWN STD { int fchown(int fd, int uid, int gid); }
+124 AUE_FCHMOD STD { int fchmod(int fd, int mode); }
+125 AUE_RECVFROM COMPAT|NOARGS { int recvfrom(int s, caddr_t buf, \
+ size_t len, int flags, caddr_t from, int \
+ *fromlenaddr); } recvfrom recvfrom_args \
+ int
+126 AUE_SETREUID STD { int setreuid(int ruid, int euid); }
+127 AUE_SETREGID STD { int setregid(int rgid, int egid); }
+128 AUE_RENAME STD { int rename(char *from, char *to); }
+129 AUE_TRUNCATE COMPAT { int truncate(char *path, long length); }
+130 AUE_FTRUNCATE COMPAT { int ftruncate(int fd, long length); }
+131 AUE_FLOCK STD { int flock(int fd, int how); }
+132 AUE_MKFIFO STD { int mkfifo(char *path, int mode); }
+133 AUE_SENDTO STD { int sendto(int s, caddr_t buf, size_t len, \
+ int flags, caddr_t to, int tolen); }
+134 AUE_SHUTDOWN STD { int shutdown(int s, int how); }
+135 AUE_SOCKETPAIR STD { int socketpair(int domain, int type, \
+ int protocol, int *rsv); }
+136 AUE_MKDIR STD { int mkdir(char *path, int mode); }
+137 AUE_RMDIR STD { int rmdir(char *path); }
+138 AUE_UTIMES STD { int utimes(char *path, \
+ struct timeval *tptr); }
+139 AUE_NULL OBSOL 4.2 sigreturn
+140 AUE_ADJTIME STD { int adjtime(struct timeval *delta, \
+ struct timeval *olddelta); }
+141 AUE_GETPEERNAME COMPAT { int getpeername(int fdes, caddr_t asa, \
+ int *alen); }
+142 AUE_SYSCTL COMPAT { long gethostid(void); }
+143 AUE_SYSCTL COMPAT { int sethostid(long hostid); }
+144 AUE_GETRLIMIT COMPAT { int getrlimit(u_int which, struct \
+ orlimit *rlp); }
+145 AUE_SETRLIMIT COMPAT { int setrlimit(u_int which, \
+ struct orlimit *rlp); }
+146 AUE_KILLPG COMPAT { int killpg(int pgid, int signum); }
+147 AUE_SETSID STD { int setsid(void); }
+148 AUE_QUOTACTL STD { int quotactl(char *path, int cmd, int uid, \
+ caddr_t arg); }
+149 AUE_O_QUOTA COMPAT { int quota(void); }
+150 AUE_GETSOCKNAME COMPAT|NOARGS { int getsockname(int fdec, \
+ caddr_t asa, int *alen); } getsockname \
+ getsockname_args int
+
+; Syscalls 151-180 inclusive are reserved for vendor-specific
+; system calls. (This includes various calls added for compatibility
+; with other Unix variants.)
+; Some of these calls are now supported by BSD...
+151 AUE_NULL UNIMPL sem_lock (BSD/OS 2.x)
+152 AUE_NULL UNIMPL sem_wakeup (BSD/OS 2.x)
+153 AUE_NULL UNIMPL asyncdaemon (BSD/OS 2.x)
+; 154 is initialized by the NLM code, if present.
+154 AUE_NULL NOSTD { int nlm_syscall(int debug_level, int grace_period, int addr_count, char **addrs); }
+; 155 is initialized by the NFS code, if present.
+155 AUE_NFS_SVC NOSTD { int nfssvc(int flag, caddr_t argp); }
+156 AUE_GETDIRENTRIES COMPAT { int getdirentries(int fd, char *buf, \
+ u_int count, long *basep); }
+157 AUE_STATFS COMPAT4 { int statfs(char *path, \
+ struct ostatfs *buf); }
+158 AUE_FSTATFS COMPAT4 { int fstatfs(int fd, \
+ struct ostatfs *buf); }
+159 AUE_NULL UNIMPL nosys
+160 AUE_LGETFH STD { int lgetfh(char *fname, \
+ struct fhandle *fhp); }
+161 AUE_NFS_GETFH STD { int getfh(char *fname, \
+ struct fhandle *fhp); }
+162 AUE_SYSCTL COMPAT4 { int getdomainname(char *domainname, \
+ int len); }
+163 AUE_SYSCTL COMPAT4 { int setdomainname(char *domainname, \
+ int len); }
+164 AUE_NULL COMPAT4 { int uname(struct utsname *name); }
+165 AUE_SYSARCH STD { int sysarch(int op, char *parms); }
+166 AUE_RTPRIO STD { int rtprio(int function, pid_t pid, \
+ struct rtprio *rtp); }
+167 AUE_NULL UNIMPL nosys
+168 AUE_NULL UNIMPL nosys
+169 AUE_SEMSYS NOSTD { int semsys(int which, int a2, int a3, \
+ int a4, int a5); }
+; XXX should be { int semsys(int which, ...); }
+170 AUE_MSGSYS NOSTD { int msgsys(int which, int a2, int a3, \
+ int a4, int a5, int a6); }
+; XXX should be { int msgsys(int which, ...); }
+171 AUE_SHMSYS NOSTD { int shmsys(int which, int a2, int a3, \
+ int a4); }
+; XXX should be { int shmsys(int which, ...); }
+172 AUE_NULL UNIMPL nosys
+173 AUE_PREAD STD { ssize_t freebsd6_pread(int fd, void *buf, \
+ size_t nbyte, int pad, off_t offset); }
+174 AUE_PWRITE STD { ssize_t freebsd6_pwrite(int fd, \
+ const void *buf, \
+ size_t nbyte, int pad, off_t offset); }
+175 AUE_NULL STD { int setfib(int fibnum); }
+176 AUE_NTP_ADJTIME STD { int ntp_adjtime(struct timex *tp); }
+177 AUE_NULL UNIMPL sfork (BSD/OS 2.x)
+178 AUE_NULL UNIMPL getdescriptor (BSD/OS 2.x)
+179 AUE_NULL UNIMPL setdescriptor (BSD/OS 2.x)
+180 AUE_NULL UNIMPL nosys
+
+; Syscalls 181-199 are used by/reserved for BSD
+181 AUE_SETGID STD { int setgid(gid_t gid); }
+182 AUE_SETEGID STD { int setegid(gid_t egid); }
+183 AUE_SETEUID STD { int seteuid(uid_t euid); }
+184 AUE_NULL UNIMPL lfs_bmapv
+185 AUE_NULL UNIMPL lfs_markv
+186 AUE_NULL UNIMPL lfs_segclean
+187 AUE_NULL UNIMPL lfs_segwait
+188 AUE_STAT STD { int stat(char *path, struct stat *ub); }
+189 AUE_FSTAT STD { int fstat(int fd, struct stat *sb); }
+190 AUE_LSTAT STD { int lstat(char *path, struct stat *ub); }
+191 AUE_PATHCONF STD { int pathconf(char *path, int name); }
+192 AUE_FPATHCONF STD { int fpathconf(int fd, int name); }
+193 AUE_NULL UNIMPL nosys
+194 AUE_GETRLIMIT STD { int getrlimit(u_int which, \
+ struct rlimit *rlp); } getrlimit \
+ __getrlimit_args int
+195 AUE_SETRLIMIT STD { int setrlimit(u_int which, \
+ struct rlimit *rlp); } setrlimit \
+ __setrlimit_args int
+196 AUE_GETDIRENTRIES STD { int getdirentries(int fd, char *buf, \
+ u_int count, long *basep); }
+197 AUE_MMAP STD { caddr_t freebsd6_mmap(caddr_t addr, \
+ size_t len, int prot, int flags, int fd, \
+ int pad, off_t pos); }
+198 AUE_NULL NOPROTO { int nosys(void); } __syscall \
+ __syscall_args int
+199 AUE_LSEEK STD { off_t freebsd6_lseek(int fd, int pad, \
+ off_t offset, int whence); }
+200 AUE_TRUNCATE STD { int freebsd6_truncate(char *path, int pad, \
+ off_t length); }
+201 AUE_FTRUNCATE STD { int freebsd6_ftruncate(int fd, int pad, \
+ off_t length); }
+202 AUE_SYSCTL STD { int __sysctl(int *name, u_int namelen, \
+ void *old, size_t *oldlenp, void *new, \
+ size_t newlen); } __sysctl sysctl_args int
+203 AUE_MLOCK STD { int mlock(const void *addr, size_t len); }
+204 AUE_MUNLOCK STD { int munlock(const void *addr, size_t len); }
+205 AUE_UNDELETE STD { int undelete(char *path); }
+206 AUE_FUTIMES STD { int futimes(int fd, struct timeval *tptr); }
+207 AUE_GETPGID STD { int getpgid(pid_t pid); }
+208 AUE_NULL UNIMPL newreboot (NetBSD)
+209 AUE_POLL STD { int poll(struct pollfd *fds, u_int nfds, \
+ int timeout); }
+
+;
+; The following are reserved for loadable syscalls
+;
+210 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+211 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+212 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+213 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+214 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+215 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+216 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+217 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+218 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+219 AUE_NULL NODEF|NOTSTATIC lkmnosys lkmnosys nosys_args int
+
+;
+; The following were introduced with NetBSD/4.4Lite-2
+220 AUE_SEMCTL COMPAT7|NOSTD { int __semctl(int semid, int semnum, \
+ int cmd, union semun_old *arg); }
+221 AUE_SEMGET NOSTD { int semget(key_t key, int nsems, \
+ int semflg); }
+222 AUE_SEMOP NOSTD { int semop(int semid, struct sembuf *sops, \
+ size_t nsops); }
+223 AUE_NULL UNIMPL semconfig
+224 AUE_MSGCTL COMPAT7|NOSTD { int msgctl(int msqid, int cmd, \
+ struct msqid_ds_old *buf); }
+225 AUE_MSGGET NOSTD { int msgget(key_t key, int msgflg); }
+226 AUE_MSGSND NOSTD { int msgsnd(int msqid, const void *msgp, \
+ size_t msgsz, int msgflg); }
+227 AUE_MSGRCV NOSTD { int msgrcv(int msqid, void *msgp, \
+ size_t msgsz, long msgtyp, int msgflg); }
+228 AUE_SHMAT NOSTD { int shmat(int shmid, const void *shmaddr, \
+ int shmflg); }
+229 AUE_SHMCTL COMPAT7|NOSTD { int shmctl(int shmid, int cmd, \
+ struct shmid_ds_old *buf); }
+230 AUE_SHMDT NOSTD { int shmdt(const void *shmaddr); }
+231 AUE_SHMGET NOSTD { int shmget(key_t key, size_t size, \
+ int shmflg); }
+;
+232 AUE_NULL STD { int clock_gettime(clockid_t clock_id, \
+ struct timespec *tp); }
+233 AUE_CLOCK_SETTIME STD { int clock_settime( \
+ clockid_t clock_id, \
+ const struct timespec *tp); }
+234 AUE_NULL STD { int clock_getres(clockid_t clock_id, \
+ struct timespec *tp); }
+235 AUE_NULL STD { int ktimer_create(clockid_t clock_id, \
+ struct sigevent *evp, int *timerid); }
+236 AUE_NULL STD { int ktimer_delete(int timerid); }
+237 AUE_NULL STD { int ktimer_settime(int timerid, int flags, \
+ const struct itimerspec *value, \
+ struct itimerspec *ovalue); }
+238 AUE_NULL STD { int ktimer_gettime(int timerid, struct \
+ itimerspec *value); }
+239 AUE_NULL STD { int ktimer_getoverrun(int timerid); }
+240 AUE_NULL STD { int nanosleep(const struct timespec *rqtp, \
+ struct timespec *rmtp); }
+241 AUE_NULL STD { int ffclock_getcounter(ffcounter *ffcount); }
+242 AUE_NULL STD { int ffclock_setestimate( \
+ struct ffclock_estimate *cest); }
+243 AUE_NULL STD { int ffclock_getestimate( \
+ struct ffclock_estimate *cest); }
+244 AUE_NULL UNIMPL nosys
+245 AUE_NULL UNIMPL nosys
+246 AUE_NULL UNIMPL nosys
+247 AUE_NULL STD { int clock_getcpuclockid2(id_t id,\
+ int which, clockid_t *clock_id); }
+248 AUE_NULL STD { int ntp_gettime(struct ntptimeval *ntvp); }
+249 AUE_NULL UNIMPL nosys
+; syscall numbers initially used in OpenBSD
+250 AUE_MINHERIT STD { int minherit(void *addr, size_t len, \
+ int inherit); }
+251 AUE_RFORK STD { int rfork(int flags); }
+252 AUE_POLL STD { int openbsd_poll(struct pollfd *fds, \
+ u_int nfds, int timeout); }
+253 AUE_ISSETUGID STD { int issetugid(void); }
+254 AUE_LCHOWN STD { int lchown(char *path, int uid, int gid); }
+255 AUE_NULL NOSTD { int aio_read(struct aiocb *aiocbp); }
+256 AUE_NULL NOSTD { int aio_write(struct aiocb *aiocbp); }
+257 AUE_NULL NOSTD { int lio_listio(int mode, \
+ struct aiocb * const *acb_list, \
+ int nent, struct sigevent *sig); }
+258 AUE_NULL UNIMPL nosys
+259 AUE_NULL UNIMPL nosys
+260 AUE_NULL UNIMPL nosys
+261 AUE_NULL UNIMPL nosys
+262 AUE_NULL UNIMPL nosys
+263 AUE_NULL UNIMPL nosys
+264 AUE_NULL UNIMPL nosys
+265 AUE_NULL UNIMPL nosys
+266 AUE_NULL UNIMPL nosys
+267 AUE_NULL UNIMPL nosys
+268 AUE_NULL UNIMPL nosys
+269 AUE_NULL UNIMPL nosys
+270 AUE_NULL UNIMPL nosys
+271 AUE_NULL UNIMPL nosys
+272 AUE_O_GETDENTS STD { int getdents(int fd, char *buf, \
+ size_t count); }
+273 AUE_NULL UNIMPL nosys
+274 AUE_LCHMOD STD { int lchmod(char *path, mode_t mode); }
+275 AUE_LCHOWN NOPROTO { int lchown(char *path, uid_t uid, \
+ gid_t gid); } netbsd_lchown lchown_args \
+ int
+276 AUE_LUTIMES STD { int lutimes(char *path, \
+ struct timeval *tptr); }
+277 AUE_MSYNC NOPROTO { int msync(void *addr, size_t len, \
+ int flags); } netbsd_msync msync_args int
+278 AUE_STAT STD { int nstat(char *path, struct nstat *ub); }
+279 AUE_FSTAT STD { int nfstat(int fd, struct nstat *sb); }
+280 AUE_LSTAT STD { int nlstat(char *path, struct nstat *ub); }
+281 AUE_NULL UNIMPL nosys
+282 AUE_NULL UNIMPL nosys
+283 AUE_NULL UNIMPL nosys
+284 AUE_NULL UNIMPL nosys
+285 AUE_NULL UNIMPL nosys
+286 AUE_NULL UNIMPL nosys
+287 AUE_NULL UNIMPL nosys
+288 AUE_NULL UNIMPL nosys
+; 289 and 290 from NetBSD (OpenBSD: 267 and 268)
+289 AUE_PREADV STD { ssize_t preadv(int fd, struct iovec *iovp, \
+ u_int iovcnt, off_t offset); }
+290 AUE_PWRITEV STD { ssize_t pwritev(int fd, struct iovec *iovp, \
+ u_int iovcnt, off_t offset); }
+291 AUE_NULL UNIMPL nosys
+292 AUE_NULL UNIMPL nosys
+293 AUE_NULL UNIMPL nosys
+294 AUE_NULL UNIMPL nosys
+295 AUE_NULL UNIMPL nosys
+296 AUE_NULL UNIMPL nosys
+; XXX 297 is 300 in NetBSD
+297 AUE_FHSTATFS COMPAT4 { int fhstatfs( \
+ const struct fhandle *u_fhp, \
+ struct ostatfs *buf); }
+298 AUE_FHOPEN STD { int fhopen(const struct fhandle *u_fhp, \
+ int flags); }
+299 AUE_FHSTAT STD { int fhstat(const struct fhandle *u_fhp, \
+ struct stat *sb); }
+; syscall numbers for FreeBSD
+300 AUE_NULL STD { int modnext(int modid); }
+301 AUE_NULL STD { int modstat(int modid, \
+ struct module_stat *stat); }
+302 AUE_NULL STD { int modfnext(int modid); }
+303 AUE_NULL STD { int modfind(const char *name); }
+304 AUE_MODLOAD STD { int kldload(const char *file); }
+305 AUE_MODUNLOAD STD { int kldunload(int fileid); }
+306 AUE_NULL STD { int kldfind(const char *file); }
+307 AUE_NULL STD { int kldnext(int fileid); }
+308 AUE_NULL STD { int kldstat(int fileid, struct \
+ kld_file_stat* stat); }
+309 AUE_NULL STD { int kldfirstmod(int fileid); }
+310 AUE_GETSID STD { int getsid(pid_t pid); }
+311 AUE_SETRESUID STD { int setresuid(uid_t ruid, uid_t euid, \
+ uid_t suid); }
+312 AUE_SETRESGID STD { int setresgid(gid_t rgid, gid_t egid, \
+ gid_t sgid); }
+313 AUE_NULL OBSOL signanosleep
+314 AUE_NULL NOSTD { int aio_return(struct aiocb *aiocbp); }
+315 AUE_NULL NOSTD { int aio_suspend( \
+ struct aiocb * const * aiocbp, int nent, \
+ const struct timespec *timeout); }
+316 AUE_NULL NOSTD { int aio_cancel(int fd, \
+ struct aiocb *aiocbp); }
+317 AUE_NULL NOSTD { int aio_error(struct aiocb *aiocbp); }
+318 AUE_NULL NOSTD { int oaio_read(struct oaiocb *aiocbp); }
+319 AUE_NULL NOSTD { int oaio_write(struct oaiocb *aiocbp); }
+320 AUE_NULL NOSTD { int olio_listio(int mode, \
+ struct oaiocb * const *acb_list, \
+ int nent, struct osigevent *sig); }
+321 AUE_NULL STD { int yield(void); }
+322 AUE_NULL OBSOL thr_sleep
+323 AUE_NULL OBSOL thr_wakeup
+324 AUE_MLOCKALL STD { int mlockall(int how); }
+325 AUE_MUNLOCKALL STD { int munlockall(void); }
+326 AUE_GETCWD STD { int __getcwd(u_char *buf, u_int buflen); }
+
+327 AUE_NULL STD { int sched_setparam (pid_t pid, \
+ const struct sched_param *param); }
+328 AUE_NULL STD { int sched_getparam (pid_t pid, struct \
+ sched_param *param); }
+
+329 AUE_NULL STD { int sched_setscheduler (pid_t pid, int \
+ policy, const struct sched_param \
+ *param); }
+330 AUE_NULL STD { int sched_getscheduler (pid_t pid); }
+
+331 AUE_NULL STD { int sched_yield (void); }
+332 AUE_NULL STD { int sched_get_priority_max (int policy); }
+333 AUE_NULL STD { int sched_get_priority_min (int policy); }
+334 AUE_NULL STD { int sched_rr_get_interval (pid_t pid, \
+ struct timespec *interval); }
+335 AUE_NULL STD { int utrace(const void *addr, size_t len); }
+336 AUE_SENDFILE COMPAT4 { int sendfile(int fd, int s, \
+ off_t offset, size_t nbytes, \
+ struct sf_hdtr *hdtr, off_t *sbytes, \
+ int flags); }
+337 AUE_NULL STD { int kldsym(int fileid, int cmd, \
+ void *data); }
+338 AUE_JAIL STD { int jail(struct jail *jail); }
+339 AUE_NULL NOSTD|NOTSTATIC { int nnpfs_syscall(int operation, \
+ char *a_pathP, int a_opcode, \
+ void *a_paramsP, int a_followSymlinks); }
+340 AUE_SIGPROCMASK STD { int sigprocmask(int how, \
+ const sigset_t *set, sigset_t *oset); }
+341 AUE_SIGSUSPEND STD { int sigsuspend(const sigset_t *sigmask); }
+342 AUE_SIGACTION COMPAT4 { int sigaction(int sig, const \
+ struct sigaction *act, \
+ struct sigaction *oact); }
+343 AUE_SIGPENDING STD { int sigpending(sigset_t *set); }
+344 AUE_SIGRETURN COMPAT4 { int sigreturn( \
+ const struct ucontext4 *sigcntxp); }
+345 AUE_SIGWAIT STD { int sigtimedwait(const sigset_t *set, \
+ siginfo_t *info, \
+ const struct timespec *timeout); }
+346 AUE_NULL STD { int sigwaitinfo(const sigset_t *set, \
+ siginfo_t *info); }
+347 AUE_NULL STD { int __acl_get_file(const char *path, \
+ acl_type_t type, struct acl *aclp); }
+348 AUE_NULL STD { int __acl_set_file(const char *path, \
+ acl_type_t type, struct acl *aclp); }
+349 AUE_NULL STD { int __acl_get_fd(int filedes, \
+ acl_type_t type, struct acl *aclp); }
+350 AUE_NULL STD { int __acl_set_fd(int filedes, \
+ acl_type_t type, struct acl *aclp); }
+351 AUE_NULL STD { int __acl_delete_file(const char *path, \
+ acl_type_t type); }
+352 AUE_NULL STD { int __acl_delete_fd(int filedes, \
+ acl_type_t type); }
+353 AUE_NULL STD { int __acl_aclcheck_file(const char *path, \
+ acl_type_t type, struct acl *aclp); }
+354 AUE_NULL STD { int __acl_aclcheck_fd(int filedes, \
+ acl_type_t type, struct acl *aclp); }
+355 AUE_EXTATTRCTL STD { int extattrctl(const char *path, int cmd, \
+ const char *filename, int attrnamespace, \
+ const char *attrname); }
+356 AUE_EXTATTR_SET_FILE STD { ssize_t extattr_set_file( \
+ const char *path, int attrnamespace, \
+ const char *attrname, void *data, \
+ size_t nbytes); }
+357 AUE_EXTATTR_GET_FILE STD { ssize_t extattr_get_file( \
+ const char *path, int attrnamespace, \
+ const char *attrname, void *data, \
+ size_t nbytes); }
+358 AUE_EXTATTR_DELETE_FILE STD { int extattr_delete_file(const char *path, \
+ int attrnamespace, \
+ const char *attrname); }
+359 AUE_NULL NOSTD { int aio_waitcomplete( \
+ struct aiocb **aiocbp, \
+ struct timespec *timeout); }
+360 AUE_GETRESUID STD { int getresuid(uid_t *ruid, uid_t *euid, \
+ uid_t *suid); }
+361 AUE_GETRESGID STD { int getresgid(gid_t *rgid, gid_t *egid, \
+ gid_t *sgid); }
+362 AUE_KQUEUE STD { int kqueue(void); }
+363 AUE_NULL STD { int kevent(int fd, \
+ struct kevent *changelist, int nchanges, \
+ struct kevent *eventlist, int nevents, \
+ const struct timespec *timeout); }
+364 AUE_NULL UNIMPL __cap_get_proc
+365 AUE_NULL UNIMPL __cap_set_proc
+366 AUE_NULL UNIMPL __cap_get_fd
+367 AUE_NULL UNIMPL __cap_get_file
+368 AUE_NULL UNIMPL __cap_set_fd
+369 AUE_NULL UNIMPL __cap_set_file
+370 AUE_NULL UNIMPL nosys
+371 AUE_EXTATTR_SET_FD STD { ssize_t extattr_set_fd(int fd, \
+ int attrnamespace, const char *attrname, \
+ void *data, size_t nbytes); }
+372 AUE_EXTATTR_GET_FD STD { ssize_t extattr_get_fd(int fd, \
+ int attrnamespace, const char *attrname, \
+ void *data, size_t nbytes); }
+373 AUE_EXTATTR_DELETE_FD STD { int extattr_delete_fd(int fd, \
+ int attrnamespace, \
+ const char *attrname); }
+374 AUE_NULL STD { int __setugid(int flag); }
+375 AUE_NULL UNIMPL nfsclnt
+376 AUE_EACCESS STD { int eaccess(char *path, int amode); }
+377 AUE_NULL NOSTD|NOTSTATIC { int afs3_syscall(long syscall, \
+ long parm1, long parm2, long parm3, \
+ long parm4, long parm5, long parm6); }
+378 AUE_NMOUNT STD { int nmount(struct iovec *iovp, \
+ unsigned int iovcnt, int flags); }
+379 AUE_NULL UNIMPL kse_exit
+380 AUE_NULL UNIMPL kse_wakeup
+381 AUE_NULL UNIMPL kse_create
+382 AUE_NULL UNIMPL kse_thr_interrupt
+383 AUE_NULL UNIMPL kse_release
+384 AUE_NULL STD { int __mac_get_proc(struct mac *mac_p); }
+385 AUE_NULL STD { int __mac_set_proc(struct mac *mac_p); }
+386 AUE_NULL STD { int __mac_get_fd(int fd, \
+ struct mac *mac_p); }
+387 AUE_NULL STD { int __mac_get_file(const char *path_p, \
+ struct mac *mac_p); }
+388 AUE_NULL STD { int __mac_set_fd(int fd, \
+ struct mac *mac_p); }
+389 AUE_NULL STD { int __mac_set_file(const char *path_p, \
+ struct mac *mac_p); }
+390 AUE_NULL STD { int kenv(int what, const char *name, \
+ char *value, int len); }
+391 AUE_LCHFLAGS STD { int lchflags(const char *path, \
+ u_long flags); }
+392 AUE_NULL STD { int uuidgen(struct uuid *store, \
+ int count); }
+393 AUE_SENDFILE STD { int sendfile(int fd, int s, off_t offset, \
+ size_t nbytes, struct sf_hdtr *hdtr, \
+ off_t *sbytes, int flags); }
+394 AUE_NULL STD { int mac_syscall(const char *policy, \
+ int call, void *arg); }
+395 AUE_GETFSSTAT STD { int getfsstat(struct statfs *buf, \
+ long bufsize, int flags); }
+396 AUE_STATFS STD { int statfs(char *path, \
+ struct statfs *buf); }
+397 AUE_FSTATFS STD { int fstatfs(int fd, struct statfs *buf); }
+398 AUE_FHSTATFS STD { int fhstatfs(const struct fhandle *u_fhp, \
+ struct statfs *buf); }
+399 AUE_NULL UNIMPL nosys
+400 AUE_NULL NOSTD { int ksem_close(semid_t id); }
+401 AUE_NULL NOSTD { int ksem_post(semid_t id); }
+402 AUE_NULL NOSTD { int ksem_wait(semid_t id); }
+403 AUE_NULL NOSTD { int ksem_trywait(semid_t id); }
+404 AUE_NULL NOSTD { int ksem_init(semid_t *idp, \
+ unsigned int value); }
+405 AUE_NULL NOSTD { int ksem_open(semid_t *idp, \
+ const char *name, int oflag, \
+ mode_t mode, unsigned int value); }
+406 AUE_NULL NOSTD { int ksem_unlink(const char *name); }
+407 AUE_NULL NOSTD { int ksem_getvalue(semid_t id, int *val); }
+408 AUE_NULL NOSTD { int ksem_destroy(semid_t id); }
+409 AUE_NULL STD { int __mac_get_pid(pid_t pid, \
+ struct mac *mac_p); }
+410 AUE_NULL STD { int __mac_get_link(const char *path_p, \
+ struct mac *mac_p); }
+411 AUE_NULL STD { int __mac_set_link(const char *path_p, \
+ struct mac *mac_p); }
+412 AUE_EXTATTR_SET_LINK STD { ssize_t extattr_set_link( \
+ const char *path, int attrnamespace, \
+ const char *attrname, void *data, \
+ size_t nbytes); }
+413 AUE_EXTATTR_GET_LINK STD { ssize_t extattr_get_link( \
+ const char *path, int attrnamespace, \
+ const char *attrname, void *data, \
+ size_t nbytes); }
+414 AUE_EXTATTR_DELETE_LINK STD { int extattr_delete_link( \
+ const char *path, int attrnamespace, \
+ const char *attrname); }
+415 AUE_NULL STD { int __mac_execve(char *fname, char **argv, \
+ char **envv, struct mac *mac_p); }
+416 AUE_SIGACTION STD { int sigaction(int sig, \
+ const struct sigaction *act, \
+ struct sigaction *oact); }
+417 AUE_SIGRETURN STD { int sigreturn( \
+ const struct __ucontext *sigcntxp); }
+418 AUE_NULL UNIMPL __xstat
+419 AUE_NULL UNIMPL __xfstat
+420 AUE_NULL UNIMPL __xlstat
+421 AUE_NULL STD { int getcontext(struct __ucontext *ucp); }
+422 AUE_NULL STD { int setcontext( \
+ const struct __ucontext *ucp); }
+423 AUE_NULL STD { int swapcontext(struct __ucontext *oucp, \
+ const struct __ucontext *ucp); }
+424 AUE_SWAPOFF STD { int swapoff(const char *name); }
+425 AUE_NULL STD { int __acl_get_link(const char *path, \
+ acl_type_t type, struct acl *aclp); }
+426 AUE_NULL STD { int __acl_set_link(const char *path, \
+ acl_type_t type, struct acl *aclp); }
+427 AUE_NULL STD { int __acl_delete_link(const char *path, \
+ acl_type_t type); }
+428 AUE_NULL STD { int __acl_aclcheck_link(const char *path, \
+ acl_type_t type, struct acl *aclp); }
+429 AUE_SIGWAIT STD { int sigwait(const sigset_t *set, \
+ int *sig); }
+430 AUE_NULL STD { int thr_create(ucontext_t *ctx, long *id, \
+ int flags); }
+431 AUE_NULL STD { void thr_exit(long *state); }
+432 AUE_NULL STD { int thr_self(long *id); }
+433 AUE_NULL STD { int thr_kill(long id, int sig); }
+434 AUE_NULL STD { int _umtx_lock(struct umtx *umtx); }
+435 AUE_NULL STD { int _umtx_unlock(struct umtx *umtx); }
+436 AUE_NULL STD { int jail_attach(int jid); }
+437 AUE_EXTATTR_LIST_FD STD { ssize_t extattr_list_fd(int fd, \
+ int attrnamespace, void *data, \
+ size_t nbytes); }
+438 AUE_EXTATTR_LIST_FILE STD { ssize_t extattr_list_file( \
+ const char *path, int attrnamespace, \
+ void *data, size_t nbytes); }
+439 AUE_EXTATTR_LIST_LINK STD { ssize_t extattr_list_link( \
+ const char *path, int attrnamespace, \
+ void *data, size_t nbytes); }
+440 AUE_NULL UNIMPL kse_switchin
+441 AUE_NULL NOSTD { int ksem_timedwait(semid_t id, \
+ const struct timespec *abstime); }
+442 AUE_NULL STD { int thr_suspend( \
+ const struct timespec *timeout); }
+443 AUE_NULL STD { int thr_wake(long id); }
+444 AUE_MODUNLOAD STD { int kldunloadf(int fileid, int flags); }
+445 AUE_AUDIT STD { int audit(const void *record, \
+ u_int length); }
+446 AUE_AUDITON STD { int auditon(int cmd, void *data, \
+ u_int length); }
+447 AUE_GETAUID STD { int getauid(uid_t *auid); }
+448 AUE_SETAUID STD { int setauid(uid_t *auid); }
+449 AUE_GETAUDIT STD { int getaudit(struct auditinfo *auditinfo); }
+450 AUE_SETAUDIT STD { int setaudit(struct auditinfo *auditinfo); }
+451 AUE_GETAUDIT_ADDR STD { int getaudit_addr( \
+ struct auditinfo_addr *auditinfo_addr, \
+ u_int length); }
+452 AUE_SETAUDIT_ADDR STD { int setaudit_addr( \
+ struct auditinfo_addr *auditinfo_addr, \
+ u_int length); }
+453 AUE_AUDITCTL STD { int auditctl(char *path); }
+454 AUE_NULL STD { int _umtx_op(void *obj, int op, \
+ u_long val, void *uaddr1, void *uaddr2); }
+455 AUE_NULL STD { int thr_new(struct thr_param *param, \
+ int param_size); }
+456 AUE_NULL STD { int sigqueue(pid_t pid, int signum, void *value); }
+457 AUE_NULL NOSTD { int kmq_open(const char *path, int flags, \
+ mode_t mode, const struct mq_attr *attr); }
+458 AUE_NULL NOSTD { int kmq_setattr(int mqd, \
+ const struct mq_attr *attr, \
+ struct mq_attr *oattr); }
+459 AUE_NULL NOSTD { int kmq_timedreceive(int mqd, \
+ char *msg_ptr, size_t msg_len, \
+ unsigned *msg_prio, \
+ const struct timespec *abs_timeout); }
+460 AUE_NULL NOSTD { int kmq_timedsend(int mqd, \
+ const char *msg_ptr, size_t msg_len,\
+ unsigned msg_prio, \
+ const struct timespec *abs_timeout);}
+461 AUE_NULL NOSTD { int kmq_notify(int mqd, \
+ const struct sigevent *sigev); }
+462 AUE_NULL NOSTD { int kmq_unlink(const char *path); }
+463 AUE_NULL STD { int abort2(const char *why, int nargs, void **args); }
+464 AUE_NULL STD { int thr_set_name(long id, const char *name); }
+465 AUE_NULL NOSTD { int aio_fsync(int op, struct aiocb *aiocbp); }
+466 AUE_RTPRIO STD { int rtprio_thread(int function, \
+ lwpid_t lwpid, struct rtprio *rtp); }
+467 AUE_NULL UNIMPL nosys
+468 AUE_NULL UNIMPL nosys
+469 AUE_NULL UNIMPL __getpath_fromfd
+470 AUE_NULL UNIMPL __getpath_fromaddr
+471 AUE_NULL STD { int sctp_peeloff(int sd, uint32_t name); }
+472 AUE_NULL STD { int sctp_generic_sendmsg(int sd, caddr_t msg, int mlen, \
+ caddr_t to, __socklen_t tolen, \
+ struct sctp_sndrcvinfo *sinfo, int flags); }
+473 AUE_NULL STD { int sctp_generic_sendmsg_iov(int sd, struct iovec *iov, int iovlen, \
+ caddr_t to, __socklen_t tolen, \
+ struct sctp_sndrcvinfo *sinfo, int flags); }
+474 AUE_NULL STD { int sctp_generic_recvmsg(int sd, struct iovec *iov, int iovlen, \
+ struct sockaddr * from, __socklen_t *fromlenaddr, \
+ struct sctp_sndrcvinfo *sinfo, int *msg_flags); }
+475 AUE_PREAD STD { ssize_t pread(int fd, void *buf, \
+ size_t nbyte, off_t offset); }
+476 AUE_PWRITE STD { ssize_t pwrite(int fd, const void *buf, \
+ size_t nbyte, off_t offset); }
+477 AUE_MMAP STD { caddr_t mmap(caddr_t addr, size_t len, \
+ int prot, int flags, int fd, off_t pos); }
+478 AUE_LSEEK STD { off_t lseek(int fd, off_t offset, \
+ int whence); }
+479 AUE_TRUNCATE STD { int truncate(char *path, off_t length); }
+480 AUE_FTRUNCATE STD { int ftruncate(int fd, off_t length); }
+481 AUE_KILL STD { int thr_kill2(pid_t pid, long id, int sig); }
+482 AUE_SHMOPEN STD { int shm_open(const char *path, int flags, \
+ mode_t mode); }
+483 AUE_SHMUNLINK STD { int shm_unlink(const char *path); }
+484 AUE_NULL STD { int cpuset(cpusetid_t *setid); }
+485 AUE_NULL STD { int cpuset_setid(cpuwhich_t which, id_t id, \
+ cpusetid_t setid); }
+486 AUE_NULL STD { int cpuset_getid(cpulevel_t level, \
+ cpuwhich_t which, id_t id, \
+ cpusetid_t *setid); }
+487 AUE_NULL STD { int cpuset_getaffinity(cpulevel_t level, \
+ cpuwhich_t which, id_t id, size_t cpusetsize, \
+ cpuset_t *mask); }
+488 AUE_NULL STD { int cpuset_setaffinity(cpulevel_t level, \
+ cpuwhich_t which, id_t id, size_t cpusetsize, \
+ const cpuset_t *mask); }
+489 AUE_FACCESSAT STD { int faccessat(int fd, char *path, int amode, \
+ int flag); }
+490 AUE_FCHMODAT STD { int fchmodat(int fd, char *path, mode_t mode, \
+ int flag); }
+491 AUE_FCHOWNAT STD { int fchownat(int fd, char *path, uid_t uid, \
+ gid_t gid, int flag); }
+492 AUE_FEXECVE STD { int fexecve(int fd, char **argv, \
+ char **envv); }
+493 AUE_FSTATAT STD { int fstatat(int fd, char *path, \
+ struct stat *buf, int flag); }
+494 AUE_FUTIMESAT STD { int futimesat(int fd, char *path, \
+ struct timeval *times); }
+495 AUE_LINKAT STD { int linkat(int fd1, char *path1, int fd2, \
+ char *path2, int flag); }
+496 AUE_MKDIRAT STD { int mkdirat(int fd, char *path, mode_t mode); }
+497 AUE_MKFIFOAT STD { int mkfifoat(int fd, char *path, mode_t mode); }
+498 AUE_MKNODAT STD { int mknodat(int fd, char *path, mode_t mode, \
+ dev_t dev); }
+; XXX: see the comment for open
+499 AUE_OPENAT_RWTC STD { int openat(int fd, char *path, int flag, \
+ mode_t mode); }
+500 AUE_READLINKAT STD { int readlinkat(int fd, char *path, char *buf, \
+ size_t bufsize); }
+501 AUE_RENAMEAT STD { int renameat(int oldfd, char *old, int newfd, \
+ char *new); }
+502 AUE_SYMLINKAT STD { int symlinkat(char *path1, int fd, \
+ char *path2); }
+503 AUE_UNLINKAT STD { int unlinkat(int fd, char *path, int flag); }
+504 AUE_POSIX_OPENPT STD { int posix_openpt(int flags); }
+; 505 is initialised by the kgssapi code, if present.
+505 AUE_NULL NOSTD { int gssd_syscall(char *path); }
+506 AUE_NULL STD { int jail_get(struct iovec *iovp, \
+ unsigned int iovcnt, int flags); }
+507 AUE_NULL STD { int jail_set(struct iovec *iovp, \
+ unsigned int iovcnt, int flags); }
+508 AUE_NULL STD { int jail_remove(int jid); }
+509 AUE_CLOSEFROM STD { int closefrom(int lowfd); }
+510 AUE_SEMCTL NOSTD { int __semctl(int semid, int semnum, \
+ int cmd, union semun *arg); }
+511 AUE_MSGCTL NOSTD { int msgctl(int msqid, int cmd, \
+ struct msqid_ds *buf); }
+512 AUE_SHMCTL NOSTD { int shmctl(int shmid, int cmd, \
+ struct shmid_ds *buf); }
+513 AUE_LPATHCONF STD { int lpathconf(char *path, int name); }
+514 AUE_NULL OBSOL cap_new
+515 AUE_CAP_RIGHTS_GET STD { int __cap_rights_get(int version, \
+ int fd, cap_rights_t *rightsp); }
+516 AUE_CAP_ENTER STD { int cap_enter(void); }
+517 AUE_CAP_GETMODE STD { int cap_getmode(u_int *modep); }
+518 AUE_PDFORK STD { int pdfork(int *fdp, int flags); }
+519 AUE_PDKILL STD { int pdkill(int fd, int signum); }
+520 AUE_PDGETPID STD { int pdgetpid(int fd, pid_t *pidp); }
+521 AUE_PDWAIT UNIMPL pdwait4
+522 AUE_SELECT STD { int pselect(int nd, fd_set *in, \
+ fd_set *ou, fd_set *ex, \
+ const struct timespec *ts, \
+ const sigset_t *sm); }
+523 AUE_NULL STD { int getloginclass(char *namebuf, \
+ size_t namelen); }
+524 AUE_NULL STD { int setloginclass(const char *namebuf); }
+525 AUE_NULL STD { int rctl_get_racct(const void *inbufp, \
+ size_t inbuflen, void *outbufp, \
+ size_t outbuflen); }
+526 AUE_NULL STD { int rctl_get_rules(const void *inbufp, \
+ size_t inbuflen, void *outbufp, \
+ size_t outbuflen); }
+527 AUE_NULL STD { int rctl_get_limits(const void *inbufp, \
+ size_t inbuflen, void *outbufp, \
+ size_t outbuflen); }
+528 AUE_NULL STD { int rctl_add_rule(const void *inbufp, \
+ size_t inbuflen, void *outbufp, \
+ size_t outbuflen); }
+529 AUE_NULL STD { int rctl_remove_rule(const void *inbufp, \
+ size_t inbuflen, void *outbufp, \
+ size_t outbuflen); }
+530 AUE_NULL STD { int posix_fallocate(int fd, \
+ off_t offset, off_t len); }
+531 AUE_NULL STD { int posix_fadvise(int fd, off_t offset, \
+ off_t len, int advice); }
+532 AUE_WAIT6 STD { int wait6(int idtype, id_t id, \
+ int *status, int options, \
+ struct __wrusage *wrusage, \
+ siginfo_t *info); }
+533 AUE_CAP_RIGHTS_LIMIT STD { int cap_rights_limit(int fd, \
+ cap_rights_t *rightsp); }
+534 AUE_CAP_IOCTLS_LIMIT STD { int cap_ioctls_limit(int fd, \
+ const u_long *cmds, size_t ncmds); }
+535 AUE_CAP_IOCTLS_GET STD { ssize_t cap_ioctls_get(int fd, \
+ u_long *cmds, size_t maxcmds); }
+536 AUE_CAP_FCNTLS_LIMIT STD { int cap_fcntls_limit(int fd, \
+ uint32_t fcntlrights); }
+537 AUE_CAP_FCNTLS_GET STD { int cap_fcntls_get(int fd, \
+ uint32_t *fcntlrightsp); }
+538 AUE_BINDAT STD { int bindat(int fd, int s, caddr_t name, \
+ int namelen); }
+539 AUE_CONNECTAT STD { int connectat(int fd, int s, caddr_t name, \
+ int namelen); }
+540 AUE_CHFLAGSAT STD { int chflagsat(int fd, const char *path, \
+ u_long flags, int atflag); }
+541 AUE_ACCEPT STD { int accept4(int s, \
+ struct sockaddr * __restrict name, \
+ __socklen_t * __restrict anamelen, \
+ int flags); }
+542 AUE_PIPE STD { int pipe2(int *fildes, int flags); }
+543 AUE_NULL NOSTD { int aio_mlock(struct aiocb *aiocbp); }
+; Please copy any additions and changes to the following compatibility tables:
+; sys/compat/freebsd32/syscalls.master
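
The table above is consumed by makesyscalls.sh, which emits the systrace_args.c converter in the next file: for each syscall it copies the handler's arguments into a flat uint64_t array, writing integer arguments through a signed int64_t view (iarg) and pointer arguments into uarg via (intptr_t) casts. The short user-space sketch below only illustrates that packing convention; the struct name read_args_sketch, the helper pack_read_args(), and the sample values are hypothetical and are not part of the kernel sources.

    /*
     * Minimal sketch (not kernel code) of the argument-packing convention
     * used by the generated systrace_args(): signed ints go into the
     * int64_t view of the buffer, pointers go into uarg[] as intptr_t.
     * The structure below is a hypothetical stand-in for read_args.
     */
    #include <stdint.h>
    #include <stdio.h>

    struct read_args_sketch {
            int      fd;
            void    *buf;
            size_t   nbyte;
    };

    static void
    pack_read_args(void *params, uint64_t *uarg, int *n_args)
    {
            int64_t *iarg = (int64_t *)uarg;        /* same storage, signed view */
            struct read_args_sketch *p = params;

            iarg[0] = p->fd;                        /* int */
            uarg[1] = (intptr_t)p->buf;             /* void * */
            uarg[2] = p->nbyte;                     /* size_t */
            *n_args = 3;
    }

    int
    main(void)
    {
            char buf[16];
            struct read_args_sketch a = { 0, buf, sizeof(buf) };
            uint64_t uarg[8];
            int n;

            pack_read_args(&a, uarg, &n);
            printf("n_args=%d fd=%lld buf=%#llx nbyte=%llu\n", n,
                (long long)((int64_t *)uarg)[0],
                (unsigned long long)uarg[1],
                (unsigned long long)uarg[2]);
            return (0);
    }
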
diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c
new file mode 100644
index 0000000..0a6bae4
--- /dev/null
+++ b/sys/kern/systrace_args.c
@@ -0,0 +1,10946 @@
+/*
+ * System call argument to DTrace register array conversion.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * $FreeBSD$
+ * This file is part of the DTrace syscall provider.
+ */
+
+static void
+systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)
+{
+ int64_t *iarg = (int64_t *) uarg;
+ switch (sysnum) {
+ /* nosys */
+ case 0: {
+ *n_args = 0;
+ break;
+ }
+ /* sys_exit */
+ case 1: {
+ struct sys_exit_args *p = params;
+ iarg[0] = p->rval; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* fork */
+ case 2: {
+ *n_args = 0;
+ break;
+ }
+ /* read */
+ case 3: {
+ struct read_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* void * */
+ uarg[2] = p->nbyte; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* write */
+ case 4: {
+ struct write_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* const void * */
+ uarg[2] = p->nbyte; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* open */
+ case 5: {
+ struct open_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->flags; /* int */
+ iarg[2] = p->mode; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* close */
+ case 6: {
+ struct close_args *p = params;
+ iarg[0] = p->fd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* wait4 */
+ case 7: {
+ struct wait4_args *p = params;
+ iarg[0] = p->pid; /* int */
+ uarg[1] = (intptr_t) p->status; /* int * */
+ iarg[2] = p->options; /* int */
+ uarg[3] = (intptr_t) p->rusage; /* struct rusage * */
+ *n_args = 4;
+ break;
+ }
+ /* link */
+ case 9: {
+ struct link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->link; /* char * */
+ *n_args = 2;
+ break;
+ }
+ /* unlink */
+ case 10: {
+ struct unlink_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* chdir */
+ case 12: {
+ struct chdir_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* fchdir */
+ case 13: {
+ struct fchdir_args *p = params;
+ iarg[0] = p->fd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* mknod */
+ case 14: {
+ struct mknod_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* int */
+ iarg[2] = p->dev; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* chmod */
+ case 15: {
+ struct chmod_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* chown */
+ case 16: {
+ struct chown_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->uid; /* int */
+ iarg[2] = p->gid; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* obreak */
+ case 17: {
+ struct obreak_args *p = params;
+ uarg[0] = (intptr_t) p->nsize; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* getpid */
+ case 20: {
+ *n_args = 0;
+ break;
+ }
+ /* mount */
+ case 21: {
+ struct mount_args *p = params;
+ uarg[0] = (intptr_t) p->type; /* char * */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->flags; /* int */
+ uarg[3] = (intptr_t) p->data; /* caddr_t */
+ *n_args = 4;
+ break;
+ }
+ /* unmount */
+ case 22: {
+ struct unmount_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* setuid */
+ case 23: {
+ struct setuid_args *p = params;
+ uarg[0] = p->uid; /* uid_t */
+ *n_args = 1;
+ break;
+ }
+ /* getuid */
+ case 24: {
+ *n_args = 0;
+ break;
+ }
+ /* geteuid */
+ case 25: {
+ *n_args = 0;
+ break;
+ }
+ /* ptrace */
+ case 26: {
+ struct ptrace_args *p = params;
+ iarg[0] = p->req; /* int */
+ iarg[1] = p->pid; /* pid_t */
+ uarg[2] = (intptr_t) p->addr; /* caddr_t */
+ iarg[3] = p->data; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* recvmsg */
+ case 27: {
+ struct recvmsg_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->msg; /* struct msghdr * */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* sendmsg */
+ case 28: {
+ struct sendmsg_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->msg; /* struct msghdr * */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* recvfrom */
+ case 29: {
+ struct recvfrom_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->buf; /* caddr_t */
+ uarg[2] = p->len; /* size_t */
+ iarg[3] = p->flags; /* int */
+ uarg[4] = (intptr_t) p->from; /* struct sockaddr *__restrict */
+ uarg[5] = (intptr_t) p->fromlenaddr; /* __socklen_t *__restrict */
+ *n_args = 6;
+ break;
+ }
+ /* accept */
+ case 30: {
+ struct accept_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->name; /* struct sockaddr *__restrict */
+ uarg[2] = (intptr_t) p->anamelen; /* __socklen_t *__restrict */
+ *n_args = 3;
+ break;
+ }
+ /* getpeername */
+ case 31: {
+ struct getpeername_args *p = params;
+ iarg[0] = p->fdes; /* int */
+ uarg[1] = (intptr_t) p->asa; /* struct sockaddr *__restrict */
+ uarg[2] = (intptr_t) p->alen; /* __socklen_t *__restrict */
+ *n_args = 3;
+ break;
+ }
+ /* getsockname */
+ case 32: {
+ struct getsockname_args *p = params;
+ iarg[0] = p->fdes; /* int */
+ uarg[1] = (intptr_t) p->asa; /* struct sockaddr *__restrict */
+ uarg[2] = (intptr_t) p->alen; /* __socklen_t *__restrict */
+ *n_args = 3;
+ break;
+ }
+ /* access */
+ case 33: {
+ struct access_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->amode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* chflags */
+ case 34: {
+ struct chflags_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ uarg[1] = p->flags; /* u_long */
+ *n_args = 2;
+ break;
+ }
+ /* fchflags */
+ case 35: {
+ struct fchflags_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = p->flags; /* u_long */
+ *n_args = 2;
+ break;
+ }
+ /* sync */
+ case 36: {
+ *n_args = 0;
+ break;
+ }
+ /* kill */
+ case 37: {
+ struct kill_args *p = params;
+ iarg[0] = p->pid; /* int */
+ iarg[1] = p->signum; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* getppid */
+ case 39: {
+ *n_args = 0;
+ break;
+ }
+ /* dup */
+ case 41: {
+ struct dup_args *p = params;
+ uarg[0] = p->fd; /* u_int */
+ *n_args = 1;
+ break;
+ }
+ /* pipe */
+ case 42: {
+ *n_args = 0;
+ break;
+ }
+ /* getegid */
+ case 43: {
+ *n_args = 0;
+ break;
+ }
+ /* profil */
+ case 44: {
+ struct profil_args *p = params;
+ uarg[0] = (intptr_t) p->samples; /* caddr_t */
+ uarg[1] = p->size; /* size_t */
+ uarg[2] = p->offset; /* size_t */
+ uarg[3] = p->scale; /* u_int */
+ *n_args = 4;
+ break;
+ }
+ /* ktrace */
+ case 45: {
+ struct ktrace_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* const char * */
+ iarg[1] = p->ops; /* int */
+ iarg[2] = p->facs; /* int */
+ iarg[3] = p->pid; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* getgid */
+ case 47: {
+ *n_args = 0;
+ break;
+ }
+ /* getlogin */
+ case 49: {
+ struct getlogin_args *p = params;
+ uarg[0] = (intptr_t) p->namebuf; /* char * */
+ uarg[1] = p->namelen; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* setlogin */
+ case 50: {
+ struct setlogin_args *p = params;
+ uarg[0] = (intptr_t) p->namebuf; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* acct */
+ case 51: {
+ struct acct_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* sigaltstack */
+ case 53: {
+ struct sigaltstack_args *p = params;
+ uarg[0] = (intptr_t) p->ss; /* stack_t * */
+ uarg[1] = (intptr_t) p->oss; /* stack_t * */
+ *n_args = 2;
+ break;
+ }
+ /* ioctl */
+ case 54: {
+ struct ioctl_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = p->com; /* u_long */
+ uarg[2] = (intptr_t) p->data; /* caddr_t */
+ *n_args = 3;
+ break;
+ }
+ /* reboot */
+ case 55: {
+ struct reboot_args *p = params;
+ iarg[0] = p->opt; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* revoke */
+ case 56: {
+ struct revoke_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* symlink */
+ case 57: {
+ struct symlink_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->link; /* char * */
+ *n_args = 2;
+ break;
+ }
+ /* readlink */
+ case 58: {
+ struct readlink_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->buf; /* char * */
+ uarg[2] = p->count; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* execve */
+ case 59: {
+ struct execve_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* char * */
+ uarg[1] = (intptr_t) p->argv; /* char ** */
+ uarg[2] = (intptr_t) p->envv; /* char ** */
+ *n_args = 3;
+ break;
+ }
+ /* umask */
+ case 60: {
+ struct umask_args *p = params;
+ iarg[0] = p->newmask; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* chroot */
+ case 61: {
+ struct chroot_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* msync */
+ case 65: {
+ struct msync_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* void * */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* vfork */
+ case 66: {
+ *n_args = 0;
+ break;
+ }
+ /* sbrk */
+ case 69: {
+ struct sbrk_args *p = params;
+ iarg[0] = p->incr; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* sstk */
+ case 70: {
+ struct sstk_args *p = params;
+ iarg[0] = p->incr; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* ovadvise */
+ case 72: {
+ struct ovadvise_args *p = params;
+ iarg[0] = p->anom; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* munmap */
+ case 73: {
+ struct munmap_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* void * */
+ uarg[1] = p->len; /* size_t */
+ *n_args = 2;
+ break;
+ }
+ /* mprotect */
+ case 74: {
+ struct mprotect_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->prot; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* madvise */
+ case 75: {
+ struct madvise_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* void * */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->behav; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* mincore */
+ case 78: {
+ struct mincore_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ uarg[2] = (intptr_t) p->vec; /* char * */
+ *n_args = 3;
+ break;
+ }
+ /* getgroups */
+ case 79: {
+ struct getgroups_args *p = params;
+ uarg[0] = p->gidsetsize; /* u_int */
+ uarg[1] = (intptr_t) p->gidset; /* gid_t * */
+ *n_args = 2;
+ break;
+ }
+ /* setgroups */
+ case 80: {
+ struct setgroups_args *p = params;
+ uarg[0] = p->gidsetsize; /* u_int */
+ uarg[1] = (intptr_t) p->gidset; /* gid_t * */
+ *n_args = 2;
+ break;
+ }
+ /* getpgrp */
+ case 81: {
+ *n_args = 0;
+ break;
+ }
+ /* setpgid */
+ case 82: {
+ struct setpgid_args *p = params;
+ iarg[0] = p->pid; /* int */
+ iarg[1] = p->pgid; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* setitimer */
+ case 83: {
+ struct setitimer_args *p = params;
+ uarg[0] = p->which; /* u_int */
+ uarg[1] = (intptr_t) p->itv; /* struct itimerval * */
+ uarg[2] = (intptr_t) p->oitv; /* struct itimerval * */
+ *n_args = 3;
+ break;
+ }
+ /* swapon */
+ case 85: {
+ struct swapon_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* getitimer */
+ case 86: {
+ struct getitimer_args *p = params;
+ uarg[0] = p->which; /* u_int */
+ uarg[1] = (intptr_t) p->itv; /* struct itimerval * */
+ *n_args = 2;
+ break;
+ }
+ /* getdtablesize */
+ case 89: {
+ *n_args = 0;
+ break;
+ }
+ /* dup2 */
+ case 90: {
+ struct dup2_args *p = params;
+ uarg[0] = p->from; /* u_int */
+ uarg[1] = p->to; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* fcntl */
+ case 92: {
+ struct fcntl_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->cmd; /* int */
+ iarg[2] = p->arg; /* long */
+ *n_args = 3;
+ break;
+ }
+ /* select */
+ case 93: {
+ struct select_args *p = params;
+ iarg[0] = p->nd; /* int */
+ uarg[1] = (intptr_t) p->in; /* fd_set * */
+ uarg[2] = (intptr_t) p->ou; /* fd_set * */
+ uarg[3] = (intptr_t) p->ex; /* fd_set * */
+ uarg[4] = (intptr_t) p->tv; /* struct timeval * */
+ *n_args = 5;
+ break;
+ }
+ /* fsync */
+ case 95: {
+ struct fsync_args *p = params;
+ iarg[0] = p->fd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* setpriority */
+ case 96: {
+ struct setpriority_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->who; /* int */
+ iarg[2] = p->prio; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* socket */
+ case 97: {
+ struct socket_args *p = params;
+ iarg[0] = p->domain; /* int */
+ iarg[1] = p->type; /* int */
+ iarg[2] = p->protocol; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* connect */
+ case 98: {
+ struct connect_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->name; /* caddr_t */
+ iarg[2] = p->namelen; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* getpriority */
+ case 100: {
+ struct getpriority_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->who; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* bind */
+ case 104: {
+ struct bind_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->name; /* caddr_t */
+ iarg[2] = p->namelen; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* setsockopt */
+ case 105: {
+ struct setsockopt_args *p = params;
+ iarg[0] = p->s; /* int */
+ iarg[1] = p->level; /* int */
+ iarg[2] = p->name; /* int */
+ uarg[3] = (intptr_t) p->val; /* caddr_t */
+ iarg[4] = p->valsize; /* int */
+ *n_args = 5;
+ break;
+ }
+ /* listen */
+ case 106: {
+ struct listen_args *p = params;
+ iarg[0] = p->s; /* int */
+ iarg[1] = p->backlog; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* gettimeofday */
+ case 116: {
+ struct gettimeofday_args *p = params;
+ uarg[0] = (intptr_t) p->tp; /* struct timeval * */
+ uarg[1] = (intptr_t) p->tzp; /* struct timezone * */
+ *n_args = 2;
+ break;
+ }
+ /* getrusage */
+ case 117: {
+ struct getrusage_args *p = params;
+ iarg[0] = p->who; /* int */
+ uarg[1] = (intptr_t) p->rusage; /* struct rusage * */
+ *n_args = 2;
+ break;
+ }
+ /* getsockopt */
+ case 118: {
+ struct getsockopt_args *p = params;
+ iarg[0] = p->s; /* int */
+ iarg[1] = p->level; /* int */
+ iarg[2] = p->name; /* int */
+ uarg[3] = (intptr_t) p->val; /* caddr_t */
+ uarg[4] = (intptr_t) p->avalsize; /* int * */
+ *n_args = 5;
+ break;
+ }
+ /* readv */
+ case 120: {
+ struct readv_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[2] = p->iovcnt; /* u_int */
+ *n_args = 3;
+ break;
+ }
+ /* writev */
+ case 121: {
+ struct writev_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[2] = p->iovcnt; /* u_int */
+ *n_args = 3;
+ break;
+ }
+ /* settimeofday */
+ case 122: {
+ struct settimeofday_args *p = params;
+ uarg[0] = (intptr_t) p->tv; /* struct timeval * */
+ uarg[1] = (intptr_t) p->tzp; /* struct timezone * */
+ *n_args = 2;
+ break;
+ }
+ /* fchown */
+ case 123: {
+ struct fchown_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->uid; /* int */
+ iarg[2] = p->gid; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* fchmod */
+ case 124: {
+ struct fchmod_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->mode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* setreuid */
+ case 126: {
+ struct setreuid_args *p = params;
+ iarg[0] = p->ruid; /* int */
+ iarg[1] = p->euid; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* setregid */
+ case 127: {
+ struct setregid_args *p = params;
+ iarg[0] = p->rgid; /* int */
+ iarg[1] = p->egid; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* rename */
+ case 128: {
+ struct rename_args *p = params;
+ uarg[0] = (intptr_t) p->from; /* char * */
+ uarg[1] = (intptr_t) p->to; /* char * */
+ *n_args = 2;
+ break;
+ }
+ /* flock */
+ case 131: {
+ struct flock_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->how; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* mkfifo */
+ case 132: {
+ struct mkfifo_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* sendto */
+ case 133: {
+ struct sendto_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->buf; /* caddr_t */
+ uarg[2] = p->len; /* size_t */
+ iarg[3] = p->flags; /* int */
+ uarg[4] = (intptr_t) p->to; /* caddr_t */
+ iarg[5] = p->tolen; /* int */
+ *n_args = 6;
+ break;
+ }
+ /* shutdown */
+ case 134: {
+ struct shutdown_args *p = params;
+ iarg[0] = p->s; /* int */
+ iarg[1] = p->how; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* socketpair */
+ case 135: {
+ struct socketpair_args *p = params;
+ iarg[0] = p->domain; /* int */
+ iarg[1] = p->type; /* int */
+ iarg[2] = p->protocol; /* int */
+ uarg[3] = (intptr_t) p->rsv; /* int * */
+ *n_args = 4;
+ break;
+ }
+ /* mkdir */
+ case 136: {
+ struct mkdir_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* rmdir */
+ case 137: {
+ struct rmdir_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* utimes */
+ case 138: {
+ struct utimes_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->tptr; /* struct timeval * */
+ *n_args = 2;
+ break;
+ }
+ /* adjtime */
+ case 140: {
+ struct adjtime_args *p = params;
+ uarg[0] = (intptr_t) p->delta; /* struct timeval * */
+ uarg[1] = (intptr_t) p->olddelta; /* struct timeval * */
+ *n_args = 2;
+ break;
+ }
+ /* setsid */
+ case 147: {
+ *n_args = 0;
+ break;
+ }
+ /* quotactl */
+ case 148: {
+ struct quotactl_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->cmd; /* int */
+ iarg[2] = p->uid; /* int */
+ uarg[3] = (intptr_t) p->arg; /* caddr_t */
+ *n_args = 4;
+ break;
+ }
+ /* nlm_syscall */
+ case 154: {
+ struct nlm_syscall_args *p = params;
+ iarg[0] = p->debug_level; /* int */
+ iarg[1] = p->grace_period; /* int */
+ iarg[2] = p->addr_count; /* int */
+ uarg[3] = (intptr_t) p->addrs; /* char ** */
+ *n_args = 4;
+ break;
+ }
+ /* nfssvc */
+ case 155: {
+ struct nfssvc_args *p = params;
+ iarg[0] = p->flag; /* int */
+ uarg[1] = (intptr_t) p->argp; /* caddr_t */
+ *n_args = 2;
+ break;
+ }
+ /* lgetfh */
+ case 160: {
+ struct lgetfh_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* char * */
+ uarg[1] = (intptr_t) p->fhp; /* struct fhandle * */
+ *n_args = 2;
+ break;
+ }
+ /* getfh */
+ case 161: {
+ struct getfh_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* char * */
+ uarg[1] = (intptr_t) p->fhp; /* struct fhandle * */
+ *n_args = 2;
+ break;
+ }
+ /* sysarch */
+ case 165: {
+ struct sysarch_args *p = params;
+ iarg[0] = p->op; /* int */
+ uarg[1] = (intptr_t) p->parms; /* char * */
+ *n_args = 2;
+ break;
+ }
+ /* rtprio */
+ case 166: {
+ struct rtprio_args *p = params;
+ iarg[0] = p->function; /* int */
+ iarg[1] = p->pid; /* pid_t */
+ uarg[2] = (intptr_t) p->rtp; /* struct rtprio * */
+ *n_args = 3;
+ break;
+ }
+ /* semsys */
+ case 169: {
+ struct semsys_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->a2; /* int */
+ iarg[2] = p->a3; /* int */
+ iarg[3] = p->a4; /* int */
+ iarg[4] = p->a5; /* int */
+ *n_args = 5;
+ break;
+ }
+ /* msgsys */
+ case 170: {
+ struct msgsys_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->a2; /* int */
+ iarg[2] = p->a3; /* int */
+ iarg[3] = p->a4; /* int */
+ iarg[4] = p->a5; /* int */
+ iarg[5] = p->a6; /* int */
+ *n_args = 6;
+ break;
+ }
+ /* shmsys */
+ case 171: {
+ struct shmsys_args *p = params;
+ iarg[0] = p->which; /* int */
+ iarg[1] = p->a2; /* int */
+ iarg[2] = p->a3; /* int */
+ iarg[3] = p->a4; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* freebsd6_pread */
+ case 173: {
+ struct freebsd6_pread_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* void * */
+ uarg[2] = p->nbyte; /* size_t */
+ iarg[3] = p->pad; /* int */
+ iarg[4] = p->offset; /* off_t */
+ *n_args = 5;
+ break;
+ }
+ /* freebsd6_pwrite */
+ case 174: {
+ struct freebsd6_pwrite_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* const void * */
+ uarg[2] = p->nbyte; /* size_t */
+ iarg[3] = p->pad; /* int */
+ iarg[4] = p->offset; /* off_t */
+ *n_args = 5;
+ break;
+ }
+ /* setfib */
+ case 175: {
+ struct setfib_args *p = params;
+ iarg[0] = p->fibnum; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* ntp_adjtime */
+ case 176: {
+ struct ntp_adjtime_args *p = params;
+ uarg[0] = (intptr_t) p->tp; /* struct timex * */
+ *n_args = 1;
+ break;
+ }
+ /* setgid */
+ case 181: {
+ struct setgid_args *p = params;
+ iarg[0] = p->gid; /* gid_t */
+ *n_args = 1;
+ break;
+ }
+ /* setegid */
+ case 182: {
+ struct setegid_args *p = params;
+ iarg[0] = p->egid; /* gid_t */
+ *n_args = 1;
+ break;
+ }
+ /* seteuid */
+ case 183: {
+ struct seteuid_args *p = params;
+ uarg[0] = p->euid; /* uid_t */
+ *n_args = 1;
+ break;
+ }
+ /* stat */
+ case 188: {
+ struct stat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->ub; /* struct stat * */
+ *n_args = 2;
+ break;
+ }
+ /* fstat */
+ case 189: {
+ struct fstat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->sb; /* struct stat * */
+ *n_args = 2;
+ break;
+ }
+ /* lstat */
+ case 190: {
+ struct lstat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->ub; /* struct stat * */
+ *n_args = 2;
+ break;
+ }
+ /* pathconf */
+ case 191: {
+ struct pathconf_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->name; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* fpathconf */
+ case 192: {
+ struct fpathconf_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->name; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* getrlimit */
+ case 194: {
+ struct __getrlimit_args *p = params;
+ uarg[0] = p->which; /* u_int */
+ uarg[1] = (intptr_t) p->rlp; /* struct rlimit * */
+ *n_args = 2;
+ break;
+ }
+ /* setrlimit */
+ case 195: {
+ struct __setrlimit_args *p = params;
+ uarg[0] = p->which; /* u_int */
+ uarg[1] = (intptr_t) p->rlp; /* struct rlimit * */
+ *n_args = 2;
+ break;
+ }
+ /* getdirentries */
+ case 196: {
+ struct getdirentries_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* char * */
+ uarg[2] = p->count; /* u_int */
+ uarg[3] = (intptr_t) p->basep; /* long * */
+ *n_args = 4;
+ break;
+ }
+ /* freebsd6_mmap */
+ case 197: {
+ struct freebsd6_mmap_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* caddr_t */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->prot; /* int */
+ iarg[3] = p->flags; /* int */
+ iarg[4] = p->fd; /* int */
+ iarg[5] = p->pad; /* int */
+ iarg[6] = p->pos; /* off_t */
+ *n_args = 7;
+ break;
+ }
+ /* nosys */
+ case 198: {
+ *n_args = 0;
+ break;
+ }
+ /* freebsd6_lseek */
+ case 199: {
+ struct freebsd6_lseek_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->pad; /* int */
+ iarg[2] = p->offset; /* off_t */
+ iarg[3] = p->whence; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* freebsd6_truncate */
+ case 200: {
+ struct freebsd6_truncate_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->pad; /* int */
+ iarg[2] = p->length; /* off_t */
+ *n_args = 3;
+ break;
+ }
+ /* freebsd6_ftruncate */
+ case 201: {
+ struct freebsd6_ftruncate_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->pad; /* int */
+ iarg[2] = p->length; /* off_t */
+ *n_args = 3;
+ break;
+ }
+ /* __sysctl */
+ case 202: {
+ struct sysctl_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* int * */
+ uarg[1] = p->namelen; /* u_int */
+ uarg[2] = (intptr_t) p->old; /* void * */
+ uarg[3] = (intptr_t) p->oldlenp; /* size_t * */
+ uarg[4] = (intptr_t) p->new; /* void * */
+ uarg[5] = p->newlen; /* size_t */
+ *n_args = 6;
+ break;
+ }
+ /* mlock */
+ case 203: {
+ struct mlock_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ *n_args = 2;
+ break;
+ }
+ /* munlock */
+ case 204: {
+ struct munlock_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ *n_args = 2;
+ break;
+ }
+ /* undelete */
+ case 205: {
+ struct undelete_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* futimes */
+ case 206: {
+ struct futimes_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->tptr; /* struct timeval * */
+ *n_args = 2;
+ break;
+ }
+ /* getpgid */
+ case 207: {
+ struct getpgid_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ *n_args = 1;
+ break;
+ }
+ /* poll */
+ case 209: {
+ struct poll_args *p = params;
+ uarg[0] = (intptr_t) p->fds; /* struct pollfd * */
+ uarg[1] = p->nfds; /* u_int */
+ iarg[2] = p->timeout; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* lkmnosys */
+ case 210: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 211: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 212: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 213: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 214: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 215: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 216: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 217: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 218: {
+ *n_args = 0;
+ break;
+ }
+ /* lkmnosys */
+ case 219: {
+ *n_args = 0;
+ break;
+ }
+ /* semget */
+ case 221: {
+ struct semget_args *p = params;
+ iarg[0] = p->key; /* key_t */
+ iarg[1] = p->nsems; /* int */
+ iarg[2] = p->semflg; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* semop */
+ case 222: {
+ struct semop_args *p = params;
+ iarg[0] = p->semid; /* int */
+ uarg[1] = (intptr_t) p->sops; /* struct sembuf * */
+ uarg[2] = p->nsops; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* msgget */
+ case 225: {
+ struct msgget_args *p = params;
+ iarg[0] = p->key; /* key_t */
+ iarg[1] = p->msgflg; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* msgsnd */
+ case 226: {
+ struct msgsnd_args *p = params;
+ iarg[0] = p->msqid; /* int */
+ uarg[1] = (intptr_t) p->msgp; /* const void * */
+ uarg[2] = p->msgsz; /* size_t */
+ iarg[3] = p->msgflg; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* msgrcv */
+ case 227: {
+ struct msgrcv_args *p = params;
+ iarg[0] = p->msqid; /* int */
+ uarg[1] = (intptr_t) p->msgp; /* void * */
+ uarg[2] = p->msgsz; /* size_t */
+ iarg[3] = p->msgtyp; /* long */
+ iarg[4] = p->msgflg; /* int */
+ *n_args = 5;
+ break;
+ }
+ /* shmat */
+ case 228: {
+ struct shmat_args *p = params;
+ iarg[0] = p->shmid; /* int */
+ uarg[1] = (intptr_t) p->shmaddr; /* const void * */
+ iarg[2] = p->shmflg; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* shmdt */
+ case 230: {
+ struct shmdt_args *p = params;
+ uarg[0] = (intptr_t) p->shmaddr; /* const void * */
+ *n_args = 1;
+ break;
+ }
+ /* shmget */
+ case 231: {
+ struct shmget_args *p = params;
+ iarg[0] = p->key; /* key_t */
+ uarg[1] = p->size; /* size_t */
+ iarg[2] = p->shmflg; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* clock_gettime */
+ case 232: {
+ struct clock_gettime_args *p = params;
+ iarg[0] = p->clock_id; /* clockid_t */
+ uarg[1] = (intptr_t) p->tp; /* struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* clock_settime */
+ case 233: {
+ struct clock_settime_args *p = params;
+ iarg[0] = p->clock_id; /* clockid_t */
+ uarg[1] = (intptr_t) p->tp; /* const struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* clock_getres */
+ case 234: {
+ struct clock_getres_args *p = params;
+ iarg[0] = p->clock_id; /* clockid_t */
+ uarg[1] = (intptr_t) p->tp; /* struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* ktimer_create */
+ case 235: {
+ struct ktimer_create_args *p = params;
+ iarg[0] = p->clock_id; /* clockid_t */
+ uarg[1] = (intptr_t) p->evp; /* struct sigevent * */
+ uarg[2] = (intptr_t) p->timerid; /* int * */
+ *n_args = 3;
+ break;
+ }
+ /* ktimer_delete */
+ case 236: {
+ struct ktimer_delete_args *p = params;
+ iarg[0] = p->timerid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* ktimer_settime */
+ case 237: {
+ struct ktimer_settime_args *p = params;
+ iarg[0] = p->timerid; /* int */
+ iarg[1] = p->flags; /* int */
+ uarg[2] = (intptr_t) p->value; /* const struct itimerspec * */
+ uarg[3] = (intptr_t) p->ovalue; /* struct itimerspec * */
+ *n_args = 4;
+ break;
+ }
+ /* ktimer_gettime */
+ case 238: {
+ struct ktimer_gettime_args *p = params;
+ iarg[0] = p->timerid; /* int */
+ uarg[1] = (intptr_t) p->value; /* struct itimerspec * */
+ *n_args = 2;
+ break;
+ }
+ /* ktimer_getoverrun */
+ case 239: {
+ struct ktimer_getoverrun_args *p = params;
+ iarg[0] = p->timerid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* nanosleep */
+ case 240: {
+ struct nanosleep_args *p = params;
+ uarg[0] = (intptr_t) p->rqtp; /* const struct timespec * */
+ uarg[1] = (intptr_t) p->rmtp; /* struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* ffclock_getcounter */
+ case 241: {
+ struct ffclock_getcounter_args *p = params;
+ uarg[0] = (intptr_t) p->ffcount; /* ffcounter * */
+ *n_args = 1;
+ break;
+ }
+ /* ffclock_setestimate */
+ case 242: {
+ struct ffclock_setestimate_args *p = params;
+ uarg[0] = (intptr_t) p->cest; /* struct ffclock_estimate * */
+ *n_args = 1;
+ break;
+ }
+ /* ffclock_getestimate */
+ case 243: {
+ struct ffclock_getestimate_args *p = params;
+ uarg[0] = (intptr_t) p->cest; /* struct ffclock_estimate * */
+ *n_args = 1;
+ break;
+ }
+ /* clock_getcpuclockid2 */
+ case 247: {
+ struct clock_getcpuclockid2_args *p = params;
+ iarg[0] = p->id; /* id_t */
+ iarg[1] = p->which; /* int */
+ uarg[2] = (intptr_t) p->clock_id; /* clockid_t * */
+ *n_args = 3;
+ break;
+ }
+ /* ntp_gettime */
+ case 248: {
+ struct ntp_gettime_args *p = params;
+ uarg[0] = (intptr_t) p->ntvp; /* struct ntptimeval * */
+ *n_args = 1;
+ break;
+ }
+ /* minherit */
+ case 250: {
+ struct minherit_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* void * */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->inherit; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* rfork */
+ case 251: {
+ struct rfork_args *p = params;
+ iarg[0] = p->flags; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* openbsd_poll */
+ case 252: {
+ struct openbsd_poll_args *p = params;
+ uarg[0] = (intptr_t) p->fds; /* struct pollfd * */
+ uarg[1] = p->nfds; /* u_int */
+ iarg[2] = p->timeout; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* issetugid */
+ case 253: {
+ *n_args = 0;
+ break;
+ }
+ /* lchown */
+ case 254: {
+ struct lchown_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->uid; /* int */
+ iarg[2] = p->gid; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* aio_read */
+ case 255: {
+ struct aio_read_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* aio_write */
+ case 256: {
+ struct aio_write_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* lio_listio */
+ case 257: {
+ struct lio_listio_args *p = params;
+ iarg[0] = p->mode; /* int */
+ uarg[1] = (intptr_t) p->acb_list; /* struct aiocb *const * */
+ iarg[2] = p->nent; /* int */
+ uarg[3] = (intptr_t) p->sig; /* struct sigevent * */
+ *n_args = 4;
+ break;
+ }
+ /* getdents */
+ case 272: {
+ struct getdents_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* char * */
+ uarg[2] = p->count; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* lchmod */
+ case 274: {
+ struct lchmod_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->mode; /* mode_t */
+ *n_args = 2;
+ break;
+ }
+ /* lchown */
+ case 275: {
+ struct lchown_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = p->uid; /* uid_t */
+ iarg[2] = p->gid; /* gid_t */
+ *n_args = 3;
+ break;
+ }
+ /* lutimes */
+ case 276: {
+ struct lutimes_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->tptr; /* struct timeval * */
+ *n_args = 2;
+ break;
+ }
+ /* msync */
+ case 277: {
+ struct msync_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* void * */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* nstat */
+ case 278: {
+ struct nstat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->ub; /* struct nstat * */
+ *n_args = 2;
+ break;
+ }
+ /* nfstat */
+ case 279: {
+ struct nfstat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->sb; /* struct nstat * */
+ *n_args = 2;
+ break;
+ }
+ /* nlstat */
+ case 280: {
+ struct nlstat_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->ub; /* struct nstat * */
+ *n_args = 2;
+ break;
+ }
+ /* preadv */
+ case 289: {
+ struct preadv_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[2] = p->iovcnt; /* u_int */
+ iarg[3] = p->offset; /* off_t */
+ *n_args = 4;
+ break;
+ }
+ /* pwritev */
+ case 290: {
+ struct pwritev_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[2] = p->iovcnt; /* u_int */
+ iarg[3] = p->offset; /* off_t */
+ *n_args = 4;
+ break;
+ }
+ /* fhopen */
+ case 298: {
+ struct fhopen_args *p = params;
+ uarg[0] = (intptr_t) p->u_fhp; /* const struct fhandle * */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* fhstat */
+ case 299: {
+ struct fhstat_args *p = params;
+ uarg[0] = (intptr_t) p->u_fhp; /* const struct fhandle * */
+ uarg[1] = (intptr_t) p->sb; /* struct stat * */
+ *n_args = 2;
+ break;
+ }
+ /* modnext */
+ case 300: {
+ struct modnext_args *p = params;
+ iarg[0] = p->modid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* modstat */
+ case 301: {
+ struct modstat_args *p = params;
+ iarg[0] = p->modid; /* int */
+ uarg[1] = (intptr_t) p->stat; /* struct module_stat * */
+ *n_args = 2;
+ break;
+ }
+ /* modfnext */
+ case 302: {
+ struct modfnext_args *p = params;
+ iarg[0] = p->modid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* modfind */
+ case 303: {
+ struct modfind_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* kldload */
+ case 304: {
+ struct kldload_args *p = params;
+ uarg[0] = (intptr_t) p->file; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* kldunload */
+ case 305: {
+ struct kldunload_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* kldfind */
+ case 306: {
+ struct kldfind_args *p = params;
+ uarg[0] = (intptr_t) p->file; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* kldnext */
+ case 307: {
+ struct kldnext_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* kldstat */
+ case 308: {
+ struct kldstat_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ uarg[1] = (intptr_t) p->stat; /* struct kld_file_stat * */
+ *n_args = 2;
+ break;
+ }
+ /* kldfirstmod */
+ case 309: {
+ struct kldfirstmod_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* getsid */
+ case 310: {
+ struct getsid_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ *n_args = 1;
+ break;
+ }
+ /* setresuid */
+ case 311: {
+ struct setresuid_args *p = params;
+ uarg[0] = p->ruid; /* uid_t */
+ uarg[1] = p->euid; /* uid_t */
+ uarg[2] = p->suid; /* uid_t */
+ *n_args = 3;
+ break;
+ }
+ /* setresgid */
+ case 312: {
+ struct setresgid_args *p = params;
+ iarg[0] = p->rgid; /* gid_t */
+ iarg[1] = p->egid; /* gid_t */
+ iarg[2] = p->sgid; /* gid_t */
+ *n_args = 3;
+ break;
+ }
+ /* aio_return */
+ case 314: {
+ struct aio_return_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* aio_suspend */
+ case 315: {
+ struct aio_suspend_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb *const * */
+ iarg[1] = p->nent; /* int */
+ uarg[2] = (intptr_t) p->timeout; /* const struct timespec * */
+ *n_args = 3;
+ break;
+ }
+ /* aio_cancel */
+ case 316: {
+ struct aio_cancel_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 2;
+ break;
+ }
+ /* aio_error */
+ case 317: {
+ struct aio_error_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* oaio_read */
+ case 318: {
+ struct oaio_read_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct oaiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* oaio_write */
+ case 319: {
+ struct oaio_write_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct oaiocb * */
+ *n_args = 1;
+ break;
+ }
+ /* olio_listio */
+ case 320: {
+ struct olio_listio_args *p = params;
+ iarg[0] = p->mode; /* int */
+ uarg[1] = (intptr_t) p->acb_list; /* struct oaiocb *const * */
+ iarg[2] = p->nent; /* int */
+ uarg[3] = (intptr_t) p->sig; /* struct osigevent * */
+ *n_args = 4;
+ break;
+ }
+ /* yield */
+ case 321: {
+ *n_args = 0;
+ break;
+ }
+ /* mlockall */
+ case 324: {
+ struct mlockall_args *p = params;
+ iarg[0] = p->how; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* munlockall */
+ case 325: {
+ *n_args = 0;
+ break;
+ }
+ /* __getcwd */
+ case 326: {
+ struct __getcwd_args *p = params;
+ uarg[0] = (intptr_t) p->buf; /* u_char * */
+ uarg[1] = p->buflen; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* sched_setparam */
+ case 327: {
+ struct sched_setparam_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ uarg[1] = (intptr_t) p->param; /* const struct sched_param * */
+ *n_args = 2;
+ break;
+ }
+ /* sched_getparam */
+ case 328: {
+ struct sched_getparam_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ uarg[1] = (intptr_t) p->param; /* struct sched_param * */
+ *n_args = 2;
+ break;
+ }
+ /* sched_setscheduler */
+ case 329: {
+ struct sched_setscheduler_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ iarg[1] = p->policy; /* int */
+ uarg[2] = (intptr_t) p->param; /* const struct sched_param * */
+ *n_args = 3;
+ break;
+ }
+ /* sched_getscheduler */
+ case 330: {
+ struct sched_getscheduler_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ *n_args = 1;
+ break;
+ }
+ /* sched_yield */
+ case 331: {
+ *n_args = 0;
+ break;
+ }
+ /* sched_get_priority_max */
+ case 332: {
+ struct sched_get_priority_max_args *p = params;
+ iarg[0] = p->policy; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* sched_get_priority_min */
+ case 333: {
+ struct sched_get_priority_min_args *p = params;
+ iarg[0] = p->policy; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* sched_rr_get_interval */
+ case 334: {
+ struct sched_rr_get_interval_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ uarg[1] = (intptr_t) p->interval; /* struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* utrace */
+ case 335: {
+ struct utrace_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* const void * */
+ uarg[1] = p->len; /* size_t */
+ *n_args = 2;
+ break;
+ }
+ /* kldsym */
+ case 337: {
+ struct kldsym_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ iarg[1] = p->cmd; /* int */
+ uarg[2] = (intptr_t) p->data; /* void * */
+ *n_args = 3;
+ break;
+ }
+ /* jail */
+ case 338: {
+ struct jail_args *p = params;
+ uarg[0] = (intptr_t) p->jail; /* struct jail * */
+ *n_args = 1;
+ break;
+ }
+ /* nnpfs_syscall */
+ case 339: {
+ struct nnpfs_syscall_args *p = params;
+ iarg[0] = p->operation; /* int */
+ uarg[1] = (intptr_t) p->a_pathP; /* char * */
+ iarg[2] = p->a_opcode; /* int */
+ uarg[3] = (intptr_t) p->a_paramsP; /* void * */
+ iarg[4] = p->a_followSymlinks; /* int */
+ *n_args = 5;
+ break;
+ }
+ /* sigprocmask */
+ case 340: {
+ struct sigprocmask_args *p = params;
+ iarg[0] = p->how; /* int */
+ uarg[1] = (intptr_t) p->set; /* const sigset_t * */
+ uarg[2] = (intptr_t) p->oset; /* sigset_t * */
+ *n_args = 3;
+ break;
+ }
+ /* sigsuspend */
+ case 341: {
+ struct sigsuspend_args *p = params;
+ uarg[0] = (intptr_t) p->sigmask; /* const sigset_t * */
+ *n_args = 1;
+ break;
+ }
+ /* sigpending */
+ case 343: {
+ struct sigpending_args *p = params;
+ uarg[0] = (intptr_t) p->set; /* sigset_t * */
+ *n_args = 1;
+ break;
+ }
+ /* sigtimedwait */
+ case 345: {
+ struct sigtimedwait_args *p = params;
+ uarg[0] = (intptr_t) p->set; /* const sigset_t * */
+ uarg[1] = (intptr_t) p->info; /* siginfo_t * */
+ uarg[2] = (intptr_t) p->timeout; /* const struct timespec * */
+ *n_args = 3;
+ break;
+ }
+ /* sigwaitinfo */
+ case 346: {
+ struct sigwaitinfo_args *p = params;
+ uarg[0] = (intptr_t) p->set; /* const sigset_t * */
+ uarg[1] = (intptr_t) p->info; /* siginfo_t * */
+ *n_args = 2;
+ break;
+ }
+ /* __acl_get_file */
+ case 347: {
+ struct __acl_get_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_set_file */
+ case 348: {
+ struct __acl_set_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_get_fd */
+ case 349: {
+ struct __acl_get_fd_args *p = params;
+ iarg[0] = p->filedes; /* int */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_set_fd */
+ case 350: {
+ struct __acl_set_fd_args *p = params;
+ iarg[0] = p->filedes; /* int */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_delete_file */
+ case 351: {
+ struct __acl_delete_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ *n_args = 2;
+ break;
+ }
+ /* __acl_delete_fd */
+ case 352: {
+ struct __acl_delete_fd_args *p = params;
+ iarg[0] = p->filedes; /* int */
+ iarg[1] = p->type; /* acl_type_t */
+ *n_args = 2;
+ break;
+ }
+ /* __acl_aclcheck_file */
+ case 353: {
+ struct __acl_aclcheck_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_aclcheck_fd */
+ case 354: {
+ struct __acl_aclcheck_fd_args *p = params;
+ iarg[0] = p->filedes; /* int */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* extattrctl */
+ case 355: {
+ struct extattrctl_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->cmd; /* int */
+ uarg[2] = (intptr_t) p->filename; /* const char * */
+ iarg[3] = p->attrnamespace; /* int */
+ uarg[4] = (intptr_t) p->attrname; /* const char * */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_set_file */
+ case 356: {
+ struct extattr_set_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_get_file */
+ case 357: {
+ struct extattr_get_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_delete_file */
+ case 358: {
+ struct extattr_delete_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ *n_args = 3;
+ break;
+ }
+ /* aio_waitcomplete */
+ case 359: {
+ struct aio_waitcomplete_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb ** */
+ uarg[1] = (intptr_t) p->timeout; /* struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* getresuid */
+ case 360: {
+ struct getresuid_args *p = params;
+ uarg[0] = (intptr_t) p->ruid; /* uid_t * */
+ uarg[1] = (intptr_t) p->euid; /* uid_t * */
+ uarg[2] = (intptr_t) p->suid; /* uid_t * */
+ *n_args = 3;
+ break;
+ }
+ /* getresgid */
+ case 361: {
+ struct getresgid_args *p = params;
+ uarg[0] = (intptr_t) p->rgid; /* gid_t * */
+ uarg[1] = (intptr_t) p->egid; /* gid_t * */
+ uarg[2] = (intptr_t) p->sgid; /* gid_t * */
+ *n_args = 3;
+ break;
+ }
+ /* kqueue */
+ case 362: {
+ *n_args = 0;
+ break;
+ }
+ /* kevent */
+ case 363: {
+ struct kevent_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->changelist; /* struct kevent * */
+ iarg[2] = p->nchanges; /* int */
+ uarg[3] = (intptr_t) p->eventlist; /* struct kevent * */
+ iarg[4] = p->nevents; /* int */
+ uarg[5] = (intptr_t) p->timeout; /* const struct timespec * */
+ *n_args = 6;
+ break;
+ }
+ /* extattr_set_fd */
+ case 371: {
+ struct extattr_set_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_get_fd */
+ case 372: {
+ struct extattr_get_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_delete_fd */
+ case 373: {
+ struct extattr_delete_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ *n_args = 3;
+ break;
+ }
+ /* __setugid */
+ case 374: {
+ struct __setugid_args *p = params;
+ iarg[0] = p->flag; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* eaccess */
+ case 376: {
+ struct eaccess_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->amode; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* afs3_syscall */
+ case 377: {
+ struct afs3_syscall_args *p = params;
+ iarg[0] = p->syscall; /* long */
+ iarg[1] = p->parm1; /* long */
+ iarg[2] = p->parm2; /* long */
+ iarg[3] = p->parm3; /* long */
+ iarg[4] = p->parm4; /* long */
+ iarg[5] = p->parm5; /* long */
+ iarg[6] = p->parm6; /* long */
+ *n_args = 7;
+ break;
+ }
+ /* nmount */
+ case 378: {
+ struct nmount_args *p = params;
+ uarg[0] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[1] = p->iovcnt; /* unsigned int */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* __mac_get_proc */
+ case 384: {
+ struct __mac_get_proc_args *p = params;
+ uarg[0] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 1;
+ break;
+ }
+ /* __mac_set_proc */
+ case 385: {
+ struct __mac_set_proc_args *p = params;
+ uarg[0] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 1;
+ break;
+ }
+ /* __mac_get_fd */
+ case 386: {
+ struct __mac_get_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* __mac_get_file */
+ case 387: {
+ struct __mac_get_file_args *p = params;
+ uarg[0] = (intptr_t) p->path_p; /* const char * */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* __mac_set_fd */
+ case 388: {
+ struct __mac_set_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* __mac_set_file */
+ case 389: {
+ struct __mac_set_file_args *p = params;
+ uarg[0] = (intptr_t) p->path_p; /* const char * */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* kenv */
+ case 390: {
+ struct kenv_args *p = params;
+ iarg[0] = p->what; /* int */
+ uarg[1] = (intptr_t) p->name; /* const char * */
+ uarg[2] = (intptr_t) p->value; /* char * */
+ iarg[3] = p->len; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* lchflags */
+ case 391: {
+ struct lchflags_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ uarg[1] = p->flags; /* u_long */
+ *n_args = 2;
+ break;
+ }
+ /* uuidgen */
+ case 392: {
+ struct uuidgen_args *p = params;
+ uarg[0] = (intptr_t) p->store; /* struct uuid * */
+ iarg[1] = p->count; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* sendfile */
+ case 393: {
+ struct sendfile_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->s; /* int */
+ iarg[2] = p->offset; /* off_t */
+ uarg[3] = p->nbytes; /* size_t */
+ uarg[4] = (intptr_t) p->hdtr; /* struct sf_hdtr * */
+ uarg[5] = (intptr_t) p->sbytes; /* off_t * */
+ iarg[6] = p->flags; /* int */
+ *n_args = 7;
+ break;
+ }
+ /* mac_syscall */
+ case 394: {
+ struct mac_syscall_args *p = params;
+ uarg[0] = (intptr_t) p->policy; /* const char * */
+ iarg[1] = p->call; /* int */
+ uarg[2] = (intptr_t) p->arg; /* void * */
+ *n_args = 3;
+ break;
+ }
+ /* getfsstat */
+ case 395: {
+ struct getfsstat_args *p = params;
+ uarg[0] = (intptr_t) p->buf; /* struct statfs * */
+ iarg[1] = p->bufsize; /* long */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* statfs */
+ case 396: {
+ struct statfs_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ uarg[1] = (intptr_t) p->buf; /* struct statfs * */
+ *n_args = 2;
+ break;
+ }
+ /* fstatfs */
+ case 397: {
+ struct fstatfs_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* struct statfs * */
+ *n_args = 2;
+ break;
+ }
+ /* fhstatfs */
+ case 398: {
+ struct fhstatfs_args *p = params;
+ uarg[0] = (intptr_t) p->u_fhp; /* const struct fhandle * */
+ uarg[1] = (intptr_t) p->buf; /* struct statfs * */
+ *n_args = 2;
+ break;
+ }
+ /* ksem_close */
+ case 400: {
+ struct ksem_close_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ *n_args = 1;
+ break;
+ }
+ /* ksem_post */
+ case 401: {
+ struct ksem_post_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ *n_args = 1;
+ break;
+ }
+ /* ksem_wait */
+ case 402: {
+ struct ksem_wait_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ *n_args = 1;
+ break;
+ }
+ /* ksem_trywait */
+ case 403: {
+ struct ksem_trywait_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ *n_args = 1;
+ break;
+ }
+ /* ksem_init */
+ case 404: {
+ struct ksem_init_args *p = params;
+ uarg[0] = (intptr_t) p->idp; /* semid_t * */
+ uarg[1] = p->value; /* unsigned int */
+ *n_args = 2;
+ break;
+ }
+ /* ksem_open */
+ case 405: {
+ struct ksem_open_args *p = params;
+ uarg[0] = (intptr_t) p->idp; /* semid_t * */
+ uarg[1] = (intptr_t) p->name; /* const char * */
+ iarg[2] = p->oflag; /* int */
+ iarg[3] = p->mode; /* mode_t */
+ uarg[4] = p->value; /* unsigned int */
+ *n_args = 5;
+ break;
+ }
+ /* ksem_unlink */
+ case 406: {
+ struct ksem_unlink_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* ksem_getvalue */
+ case 407: {
+ struct ksem_getvalue_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ uarg[1] = (intptr_t) p->val; /* int * */
+ *n_args = 2;
+ break;
+ }
+ /* ksem_destroy */
+ case 408: {
+ struct ksem_destroy_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ *n_args = 1;
+ break;
+ }
+ /* __mac_get_pid */
+ case 409: {
+ struct __mac_get_pid_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* __mac_get_link */
+ case 410: {
+ struct __mac_get_link_args *p = params;
+ uarg[0] = (intptr_t) p->path_p; /* const char * */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* __mac_set_link */
+ case 411: {
+ struct __mac_set_link_args *p = params;
+ uarg[0] = (intptr_t) p->path_p; /* const char * */
+ uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 2;
+ break;
+ }
+ /* extattr_set_link */
+ case 412: {
+ struct extattr_set_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_get_link */
+ case 413: {
+ struct extattr_get_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ uarg[3] = (intptr_t) p->data; /* void * */
+ uarg[4] = p->nbytes; /* size_t */
+ *n_args = 5;
+ break;
+ }
+ /* extattr_delete_link */
+ case 414: {
+ struct extattr_delete_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->attrname; /* const char * */
+ *n_args = 3;
+ break;
+ }
+ /* __mac_execve */
+ case 415: {
+ struct __mac_execve_args *p = params;
+ uarg[0] = (intptr_t) p->fname; /* char * */
+ uarg[1] = (intptr_t) p->argv; /* char ** */
+ uarg[2] = (intptr_t) p->envv; /* char ** */
+ uarg[3] = (intptr_t) p->mac_p; /* struct mac * */
+ *n_args = 4;
+ break;
+ }
+ /* sigaction */
+ case 416: {
+ struct sigaction_args *p = params;
+ iarg[0] = p->sig; /* int */
+ uarg[1] = (intptr_t) p->act; /* const struct sigaction * */
+ uarg[2] = (intptr_t) p->oact; /* struct sigaction * */
+ *n_args = 3;
+ break;
+ }
+ /* sigreturn */
+ case 417: {
+ struct sigreturn_args *p = params;
+ uarg[0] = (intptr_t) p->sigcntxp; /* const struct __ucontext * */
+ *n_args = 1;
+ break;
+ }
+ /* getcontext */
+ case 421: {
+ struct getcontext_args *p = params;
+ uarg[0] = (intptr_t) p->ucp; /* struct __ucontext * */
+ *n_args = 1;
+ break;
+ }
+ /* setcontext */
+ case 422: {
+ struct setcontext_args *p = params;
+ uarg[0] = (intptr_t) p->ucp; /* const struct __ucontext * */
+ *n_args = 1;
+ break;
+ }
+ /* swapcontext */
+ case 423: {
+ struct swapcontext_args *p = params;
+ uarg[0] = (intptr_t) p->oucp; /* struct __ucontext * */
+ uarg[1] = (intptr_t) p->ucp; /* const struct __ucontext * */
+ *n_args = 2;
+ break;
+ }
+ /* swapoff */
+ case 424: {
+ struct swapoff_args *p = params;
+ uarg[0] = (intptr_t) p->name; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* __acl_get_link */
+ case 425: {
+ struct __acl_get_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_set_link */
+ case 426: {
+ struct __acl_set_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* __acl_delete_link */
+ case 427: {
+ struct __acl_delete_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ *n_args = 2;
+ break;
+ }
+ /* __acl_aclcheck_link */
+ case 428: {
+ struct __acl_aclcheck_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->type; /* acl_type_t */
+ uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+ *n_args = 3;
+ break;
+ }
+ /* sigwait */
+ case 429: {
+ struct sigwait_args *p = params;
+ uarg[0] = (intptr_t) p->set; /* const sigset_t * */
+ uarg[1] = (intptr_t) p->sig; /* int * */
+ *n_args = 2;
+ break;
+ }
+ /* thr_create */
+ case 430: {
+ struct thr_create_args *p = params;
+ uarg[0] = (intptr_t) p->ctx; /* ucontext_t * */
+ uarg[1] = (intptr_t) p->id; /* long * */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* thr_exit */
+ case 431: {
+ struct thr_exit_args *p = params;
+ uarg[0] = (intptr_t) p->state; /* long * */
+ *n_args = 1;
+ break;
+ }
+ /* thr_self */
+ case 432: {
+ struct thr_self_args *p = params;
+ uarg[0] = (intptr_t) p->id; /* long * */
+ *n_args = 1;
+ break;
+ }
+ /* thr_kill */
+ case 433: {
+ struct thr_kill_args *p = params;
+ iarg[0] = p->id; /* long */
+ iarg[1] = p->sig; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* _umtx_lock */
+ case 434: {
+ struct _umtx_lock_args *p = params;
+ uarg[0] = (intptr_t) p->umtx; /* struct umtx * */
+ *n_args = 1;
+ break;
+ }
+ /* _umtx_unlock */
+ case 435: {
+ struct _umtx_unlock_args *p = params;
+ uarg[0] = (intptr_t) p->umtx; /* struct umtx * */
+ *n_args = 1;
+ break;
+ }
+ /* jail_attach */
+ case 436: {
+ struct jail_attach_args *p = params;
+ iarg[0] = p->jid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* extattr_list_fd */
+ case 437: {
+ struct extattr_list_fd_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->data; /* void * */
+ uarg[3] = p->nbytes; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* extattr_list_file */
+ case 438: {
+ struct extattr_list_file_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->data; /* void * */
+ uarg[3] = p->nbytes; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* extattr_list_link */
+ case 439: {
+ struct extattr_list_link_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->attrnamespace; /* int */
+ uarg[2] = (intptr_t) p->data; /* void * */
+ uarg[3] = p->nbytes; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* ksem_timedwait */
+ case 441: {
+ struct ksem_timedwait_args *p = params;
+ iarg[0] = p->id; /* semid_t */
+ uarg[1] = (intptr_t) p->abstime; /* const struct timespec * */
+ *n_args = 2;
+ break;
+ }
+ /* thr_suspend */
+ case 442: {
+ struct thr_suspend_args *p = params;
+ uarg[0] = (intptr_t) p->timeout; /* const struct timespec * */
+ *n_args = 1;
+ break;
+ }
+ /* thr_wake */
+ case 443: {
+ struct thr_wake_args *p = params;
+ iarg[0] = p->id; /* long */
+ *n_args = 1;
+ break;
+ }
+ /* kldunloadf */
+ case 444: {
+ struct kldunloadf_args *p = params;
+ iarg[0] = p->fileid; /* int */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* audit */
+ case 445: {
+ struct audit_args *p = params;
+ uarg[0] = (intptr_t) p->record; /* const void * */
+ uarg[1] = p->length; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* auditon */
+ case 446: {
+ struct auditon_args *p = params;
+ iarg[0] = p->cmd; /* int */
+ uarg[1] = (intptr_t) p->data; /* void * */
+ uarg[2] = p->length; /* u_int */
+ *n_args = 3;
+ break;
+ }
+ /* getauid */
+ case 447: {
+ struct getauid_args *p = params;
+ uarg[0] = (intptr_t) p->auid; /* uid_t * */
+ *n_args = 1;
+ break;
+ }
+ /* setauid */
+ case 448: {
+ struct setauid_args *p = params;
+ uarg[0] = (intptr_t) p->auid; /* uid_t * */
+ *n_args = 1;
+ break;
+ }
+ /* getaudit */
+ case 449: {
+ struct getaudit_args *p = params;
+ uarg[0] = (intptr_t) p->auditinfo; /* struct auditinfo * */
+ *n_args = 1;
+ break;
+ }
+ /* setaudit */
+ case 450: {
+ struct setaudit_args *p = params;
+ uarg[0] = (intptr_t) p->auditinfo; /* struct auditinfo * */
+ *n_args = 1;
+ break;
+ }
+ /* getaudit_addr */
+ case 451: {
+ struct getaudit_addr_args *p = params;
+ uarg[0] = (intptr_t) p->auditinfo_addr; /* struct auditinfo_addr * */
+ uarg[1] = p->length; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* setaudit_addr */
+ case 452: {
+ struct setaudit_addr_args *p = params;
+ uarg[0] = (intptr_t) p->auditinfo_addr; /* struct auditinfo_addr * */
+ uarg[1] = p->length; /* u_int */
+ *n_args = 2;
+ break;
+ }
+ /* auditctl */
+ case 453: {
+ struct auditctl_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* _umtx_op */
+ case 454: {
+ struct _umtx_op_args *p = params;
+ uarg[0] = (intptr_t) p->obj; /* void * */
+ iarg[1] = p->op; /* int */
+ uarg[2] = p->val; /* u_long */
+ uarg[3] = (intptr_t) p->uaddr1; /* void * */
+ uarg[4] = (intptr_t) p->uaddr2; /* void * */
+ *n_args = 5;
+ break;
+ }
+ /* thr_new */
+ case 455: {
+ struct thr_new_args *p = params;
+ uarg[0] = (intptr_t) p->param; /* struct thr_param * */
+ iarg[1] = p->param_size; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* sigqueue */
+ case 456: {
+ struct sigqueue_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ iarg[1] = p->signum; /* int */
+ uarg[2] = (intptr_t) p->value; /* void * */
+ *n_args = 3;
+ break;
+ }
+ /* kmq_open */
+ case 457: {
+ struct kmq_open_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->flags; /* int */
+ iarg[2] = p->mode; /* mode_t */
+ uarg[3] = (intptr_t) p->attr; /* const struct mq_attr * */
+ *n_args = 4;
+ break;
+ }
+ /* kmq_setattr */
+ case 458: {
+ struct kmq_setattr_args *p = params;
+ iarg[0] = p->mqd; /* int */
+ uarg[1] = (intptr_t) p->attr; /* const struct mq_attr * */
+ uarg[2] = (intptr_t) p->oattr; /* struct mq_attr * */
+ *n_args = 3;
+ break;
+ }
+ /* kmq_timedreceive */
+ case 459: {
+ struct kmq_timedreceive_args *p = params;
+ iarg[0] = p->mqd; /* int */
+ uarg[1] = (intptr_t) p->msg_ptr; /* char * */
+ uarg[2] = p->msg_len; /* size_t */
+ uarg[3] = (intptr_t) p->msg_prio; /* unsigned * */
+ uarg[4] = (intptr_t) p->abs_timeout; /* const struct timespec * */
+ *n_args = 5;
+ break;
+ }
+ /* kmq_timedsend */
+ case 460: {
+ struct kmq_timedsend_args *p = params;
+ iarg[0] = p->mqd; /* int */
+ uarg[1] = (intptr_t) p->msg_ptr; /* const char * */
+ uarg[2] = p->msg_len; /* size_t */
+ uarg[3] = p->msg_prio; /* unsigned */
+ uarg[4] = (intptr_t) p->abs_timeout; /* const struct timespec * */
+ *n_args = 5;
+ break;
+ }
+ /* kmq_notify */
+ case 461: {
+ struct kmq_notify_args *p = params;
+ iarg[0] = p->mqd; /* int */
+ uarg[1] = (intptr_t) p->sigev; /* const struct sigevent * */
+ *n_args = 2;
+ break;
+ }
+ /* kmq_unlink */
+ case 462: {
+ struct kmq_unlink_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* abort2 */
+ case 463: {
+ struct abort2_args *p = params;
+ uarg[0] = (intptr_t) p->why; /* const char * */
+ iarg[1] = p->nargs; /* int */
+ uarg[2] = (intptr_t) p->args; /* void ** */
+ *n_args = 3;
+ break;
+ }
+ /* thr_set_name */
+ case 464: {
+ struct thr_set_name_args *p = params;
+ iarg[0] = p->id; /* long */
+ uarg[1] = (intptr_t) p->name; /* const char * */
+ *n_args = 2;
+ break;
+ }
+ /* aio_fsync */
+ case 465: {
+ struct aio_fsync_args *p = params;
+ iarg[0] = p->op; /* int */
+ uarg[1] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 2;
+ break;
+ }
+ /* rtprio_thread */
+ case 466: {
+ struct rtprio_thread_args *p = params;
+ iarg[0] = p->function; /* int */
+ iarg[1] = p->lwpid; /* lwpid_t */
+ uarg[2] = (intptr_t) p->rtp; /* struct rtprio * */
+ *n_args = 3;
+ break;
+ }
+ /* sctp_peeloff */
+ case 471: {
+ struct sctp_peeloff_args *p = params;
+ iarg[0] = p->sd; /* int */
+ uarg[1] = p->name; /* uint32_t */
+ *n_args = 2;
+ break;
+ }
+ /* sctp_generic_sendmsg */
+ case 472: {
+ struct sctp_generic_sendmsg_args *p = params;
+ iarg[0] = p->sd; /* int */
+ uarg[1] = (intptr_t) p->msg; /* caddr_t */
+ iarg[2] = p->mlen; /* int */
+ uarg[3] = (intptr_t) p->to; /* caddr_t */
+ iarg[4] = p->tolen; /* __socklen_t */
+ uarg[5] = (intptr_t) p->sinfo; /* struct sctp_sndrcvinfo * */
+ iarg[6] = p->flags; /* int */
+ *n_args = 7;
+ break;
+ }
+ /* sctp_generic_sendmsg_iov */
+ case 473: {
+ struct sctp_generic_sendmsg_iov_args *p = params;
+ iarg[0] = p->sd; /* int */
+ uarg[1] = (intptr_t) p->iov; /* struct iovec * */
+ iarg[2] = p->iovlen; /* int */
+ uarg[3] = (intptr_t) p->to; /* caddr_t */
+ iarg[4] = p->tolen; /* __socklen_t */
+ uarg[5] = (intptr_t) p->sinfo; /* struct sctp_sndrcvinfo * */
+ iarg[6] = p->flags; /* int */
+ *n_args = 7;
+ break;
+ }
+ /* sctp_generic_recvmsg */
+ case 474: {
+ struct sctp_generic_recvmsg_args *p = params;
+ iarg[0] = p->sd; /* int */
+ uarg[1] = (intptr_t) p->iov; /* struct iovec * */
+ iarg[2] = p->iovlen; /* int */
+ uarg[3] = (intptr_t) p->from; /* struct sockaddr * */
+ uarg[4] = (intptr_t) p->fromlenaddr; /* __socklen_t * */
+ uarg[5] = (intptr_t) p->sinfo; /* struct sctp_sndrcvinfo * */
+ uarg[6] = (intptr_t) p->msg_flags; /* int * */
+ *n_args = 7;
+ break;
+ }
+ /* pread */
+ case 475: {
+ struct pread_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* void * */
+ uarg[2] = p->nbyte; /* size_t */
+ iarg[3] = p->offset; /* off_t */
+ *n_args = 4;
+ break;
+ }
+ /* pwrite */
+ case 476: {
+ struct pwrite_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->buf; /* const void * */
+ uarg[2] = p->nbyte; /* size_t */
+ iarg[3] = p->offset; /* off_t */
+ *n_args = 4;
+ break;
+ }
+ /* mmap */
+ case 477: {
+ struct mmap_args *p = params;
+ uarg[0] = (intptr_t) p->addr; /* caddr_t */
+ uarg[1] = p->len; /* size_t */
+ iarg[2] = p->prot; /* int */
+ iarg[3] = p->flags; /* int */
+ iarg[4] = p->fd; /* int */
+ iarg[5] = p->pos; /* off_t */
+ *n_args = 6;
+ break;
+ }
+ /* lseek */
+ case 478: {
+ struct lseek_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->offset; /* off_t */
+ iarg[2] = p->whence; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* truncate */
+ case 479: {
+ struct truncate_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->length; /* off_t */
+ *n_args = 2;
+ break;
+ }
+ /* ftruncate */
+ case 480: {
+ struct ftruncate_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->length; /* off_t */
+ *n_args = 2;
+ break;
+ }
+ /* thr_kill2 */
+ case 481: {
+ struct thr_kill2_args *p = params;
+ iarg[0] = p->pid; /* pid_t */
+ iarg[1] = p->id; /* long */
+ iarg[2] = p->sig; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* shm_open */
+ case 482: {
+ struct shm_open_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ iarg[1] = p->flags; /* int */
+ iarg[2] = p->mode; /* mode_t */
+ *n_args = 3;
+ break;
+ }
+ /* shm_unlink */
+ case 483: {
+ struct shm_unlink_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* cpuset */
+ case 484: {
+ struct cpuset_args *p = params;
+ uarg[0] = (intptr_t) p->setid; /* cpusetid_t * */
+ *n_args = 1;
+ break;
+ }
+ /* cpuset_setid */
+ case 485: {
+ struct cpuset_setid_args *p = params;
+ iarg[0] = p->which; /* cpuwhich_t */
+ iarg[1] = p->id; /* id_t */
+ iarg[2] = p->setid; /* cpusetid_t */
+ *n_args = 3;
+ break;
+ }
+ /* cpuset_getid */
+ case 486: {
+ struct cpuset_getid_args *p = params;
+ iarg[0] = p->level; /* cpulevel_t */
+ iarg[1] = p->which; /* cpuwhich_t */
+ iarg[2] = p->id; /* id_t */
+ uarg[3] = (intptr_t) p->setid; /* cpusetid_t * */
+ *n_args = 4;
+ break;
+ }
+ /* cpuset_getaffinity */
+ case 487: {
+ struct cpuset_getaffinity_args *p = params;
+ iarg[0] = p->level; /* cpulevel_t */
+ iarg[1] = p->which; /* cpuwhich_t */
+ iarg[2] = p->id; /* id_t */
+ uarg[3] = p->cpusetsize; /* size_t */
+ uarg[4] = (intptr_t) p->mask; /* cpuset_t * */
+ *n_args = 5;
+ break;
+ }
+ /* cpuset_setaffinity */
+ case 488: {
+ struct cpuset_setaffinity_args *p = params;
+ iarg[0] = p->level; /* cpulevel_t */
+ iarg[1] = p->which; /* cpuwhich_t */
+ iarg[2] = p->id; /* id_t */
+ uarg[3] = p->cpusetsize; /* size_t */
+ uarg[4] = (intptr_t) p->mask; /* const cpuset_t * */
+ *n_args = 5;
+ break;
+ }
+ /* faccessat */
+ case 489: {
+ struct faccessat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->amode; /* int */
+ iarg[3] = p->flag; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* fchmodat */
+ case 490: {
+ struct fchmodat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->mode; /* mode_t */
+ iarg[3] = p->flag; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* fchownat */
+ case 491: {
+ struct fchownat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ uarg[2] = p->uid; /* uid_t */
+ iarg[3] = p->gid; /* gid_t */
+ iarg[4] = p->flag; /* int */
+ *n_args = 5;
+ break;
+ }
+ /* fexecve */
+ case 492: {
+ struct fexecve_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->argv; /* char ** */
+ uarg[2] = (intptr_t) p->envv; /* char ** */
+ *n_args = 3;
+ break;
+ }
+ /* fstatat */
+ case 493: {
+ struct fstatat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ uarg[2] = (intptr_t) p->buf; /* struct stat * */
+ iarg[3] = p->flag; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* futimesat */
+ case 494: {
+ struct futimesat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ uarg[2] = (intptr_t) p->times; /* struct timeval * */
+ *n_args = 3;
+ break;
+ }
+ /* linkat */
+ case 495: {
+ struct linkat_args *p = params;
+ iarg[0] = p->fd1; /* int */
+ uarg[1] = (intptr_t) p->path1; /* char * */
+ iarg[2] = p->fd2; /* int */
+ uarg[3] = (intptr_t) p->path2; /* char * */
+ iarg[4] = p->flag; /* int */
+ *n_args = 5;
+ break;
+ }
+ /* mkdirat */
+ case 496: {
+ struct mkdirat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->mode; /* mode_t */
+ *n_args = 3;
+ break;
+ }
+ /* mkfifoat */
+ case 497: {
+ struct mkfifoat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->mode; /* mode_t */
+ *n_args = 3;
+ break;
+ }
+ /* mknodat */
+ case 498: {
+ struct mknodat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->mode; /* mode_t */
+ iarg[3] = p->dev; /* dev_t */
+ *n_args = 4;
+ break;
+ }
+ /* openat */
+ case 499: {
+ struct openat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->flag; /* int */
+ iarg[3] = p->mode; /* mode_t */
+ *n_args = 4;
+ break;
+ }
+ /* readlinkat */
+ case 500: {
+ struct readlinkat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ uarg[2] = (intptr_t) p->buf; /* char * */
+ uarg[3] = p->bufsize; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* renameat */
+ case 501: {
+ struct renameat_args *p = params;
+ iarg[0] = p->oldfd; /* int */
+ uarg[1] = (intptr_t) p->old; /* char * */
+ iarg[2] = p->newfd; /* int */
+ uarg[3] = (intptr_t) p->new; /* char * */
+ *n_args = 4;
+ break;
+ }
+ /* symlinkat */
+ case 502: {
+ struct symlinkat_args *p = params;
+ uarg[0] = (intptr_t) p->path1; /* char * */
+ iarg[1] = p->fd; /* int */
+ uarg[2] = (intptr_t) p->path2; /* char * */
+ *n_args = 3;
+ break;
+ }
+ /* unlinkat */
+ case 503: {
+ struct unlinkat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* char * */
+ iarg[2] = p->flag; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* posix_openpt */
+ case 504: {
+ struct posix_openpt_args *p = params;
+ iarg[0] = p->flags; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* gssd_syscall */
+ case 505: {
+ struct gssd_syscall_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ *n_args = 1;
+ break;
+ }
+ /* jail_get */
+ case 506: {
+ struct jail_get_args *p = params;
+ uarg[0] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[1] = p->iovcnt; /* unsigned int */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* jail_set */
+ case 507: {
+ struct jail_set_args *p = params;
+ uarg[0] = (intptr_t) p->iovp; /* struct iovec * */
+ uarg[1] = p->iovcnt; /* unsigned int */
+ iarg[2] = p->flags; /* int */
+ *n_args = 3;
+ break;
+ }
+ /* jail_remove */
+ case 508: {
+ struct jail_remove_args *p = params;
+ iarg[0] = p->jid; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* closefrom */
+ case 509: {
+ struct closefrom_args *p = params;
+ iarg[0] = p->lowfd; /* int */
+ *n_args = 1;
+ break;
+ }
+ /* __semctl */
+ case 510: {
+ struct __semctl_args *p = params;
+ iarg[0] = p->semid; /* int */
+ iarg[1] = p->semnum; /* int */
+ iarg[2] = p->cmd; /* int */
+ uarg[3] = (intptr_t) p->arg; /* union semun * */
+ *n_args = 4;
+ break;
+ }
+ /* msgctl */
+ case 511: {
+ struct msgctl_args *p = params;
+ iarg[0] = p->msqid; /* int */
+ iarg[1] = p->cmd; /* int */
+ uarg[2] = (intptr_t) p->buf; /* struct msqid_ds * */
+ *n_args = 3;
+ break;
+ }
+ /* shmctl */
+ case 512: {
+ struct shmctl_args *p = params;
+ iarg[0] = p->shmid; /* int */
+ iarg[1] = p->cmd; /* int */
+ uarg[2] = (intptr_t) p->buf; /* struct shmid_ds * */
+ *n_args = 3;
+ break;
+ }
+ /* lpathconf */
+ case 513: {
+ struct lpathconf_args *p = params;
+ uarg[0] = (intptr_t) p->path; /* char * */
+ iarg[1] = p->name; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* __cap_rights_get */
+ case 515: {
+ struct __cap_rights_get_args *p = params;
+ iarg[0] = p->version; /* int */
+ iarg[1] = p->fd; /* int */
+ uarg[2] = (intptr_t) p->rightsp; /* cap_rights_t * */
+ *n_args = 3;
+ break;
+ }
+ /* cap_enter */
+ case 516: {
+ *n_args = 0;
+ break;
+ }
+ /* cap_getmode */
+ case 517: {
+ struct cap_getmode_args *p = params;
+ uarg[0] = (intptr_t) p->modep; /* u_int * */
+ *n_args = 1;
+ break;
+ }
+ /* pdfork */
+ case 518: {
+ struct pdfork_args *p = params;
+ uarg[0] = (intptr_t) p->fdp; /* int * */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* pdkill */
+ case 519: {
+ struct pdkill_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->signum; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* pdgetpid */
+ case 520: {
+ struct pdgetpid_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->pidp; /* pid_t * */
+ *n_args = 2;
+ break;
+ }
+ /* pselect */
+ case 522: {
+ struct pselect_args *p = params;
+ iarg[0] = p->nd; /* int */
+ uarg[1] = (intptr_t) p->in; /* fd_set * */
+ uarg[2] = (intptr_t) p->ou; /* fd_set * */
+ uarg[3] = (intptr_t) p->ex; /* fd_set * */
+ uarg[4] = (intptr_t) p->ts; /* const struct timespec * */
+ uarg[5] = (intptr_t) p->sm; /* const sigset_t * */
+ *n_args = 6;
+ break;
+ }
+ /* getloginclass */
+ case 523: {
+ struct getloginclass_args *p = params;
+ uarg[0] = (intptr_t) p->namebuf; /* char * */
+ uarg[1] = p->namelen; /* size_t */
+ *n_args = 2;
+ break;
+ }
+ /* setloginclass */
+ case 524: {
+ struct setloginclass_args *p = params;
+ uarg[0] = (intptr_t) p->namebuf; /* const char * */
+ *n_args = 1;
+ break;
+ }
+ /* rctl_get_racct */
+ case 525: {
+ struct rctl_get_racct_args *p = params;
+ uarg[0] = (intptr_t) p->inbufp; /* const void * */
+ uarg[1] = p->inbuflen; /* size_t */
+ uarg[2] = (intptr_t) p->outbufp; /* void * */
+ uarg[3] = p->outbuflen; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* rctl_get_rules */
+ case 526: {
+ struct rctl_get_rules_args *p = params;
+ uarg[0] = (intptr_t) p->inbufp; /* const void * */
+ uarg[1] = p->inbuflen; /* size_t */
+ uarg[2] = (intptr_t) p->outbufp; /* void * */
+ uarg[3] = p->outbuflen; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* rctl_get_limits */
+ case 527: {
+ struct rctl_get_limits_args *p = params;
+ uarg[0] = (intptr_t) p->inbufp; /* const void * */
+ uarg[1] = p->inbuflen; /* size_t */
+ uarg[2] = (intptr_t) p->outbufp; /* void * */
+ uarg[3] = p->outbuflen; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* rctl_add_rule */
+ case 528: {
+ struct rctl_add_rule_args *p = params;
+ uarg[0] = (intptr_t) p->inbufp; /* const void * */
+ uarg[1] = p->inbuflen; /* size_t */
+ uarg[2] = (intptr_t) p->outbufp; /* void * */
+ uarg[3] = p->outbuflen; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* rctl_remove_rule */
+ case 529: {
+ struct rctl_remove_rule_args *p = params;
+ uarg[0] = (intptr_t) p->inbufp; /* const void * */
+ uarg[1] = p->inbuflen; /* size_t */
+ uarg[2] = (intptr_t) p->outbufp; /* void * */
+ uarg[3] = p->outbuflen; /* size_t */
+ *n_args = 4;
+ break;
+ }
+ /* posix_fallocate */
+ case 530: {
+ struct posix_fallocate_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->offset; /* off_t */
+ iarg[2] = p->len; /* off_t */
+ *n_args = 3;
+ break;
+ }
+ /* posix_fadvise */
+ case 531: {
+ struct posix_fadvise_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->offset; /* off_t */
+ iarg[2] = p->len; /* off_t */
+ iarg[3] = p->advice; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* wait6 */
+ case 532: {
+ struct wait6_args *p = params;
+ iarg[0] = p->idtype; /* int */
+ iarg[1] = p->id; /* id_t */
+ uarg[2] = (intptr_t) p->status; /* int * */
+ iarg[3] = p->options; /* int */
+ uarg[4] = (intptr_t) p->wrusage; /* struct __wrusage * */
+ uarg[5] = (intptr_t) p->info; /* siginfo_t * */
+ *n_args = 6;
+ break;
+ }
+ /* cap_rights_limit */
+ case 533: {
+ struct cap_rights_limit_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->rightsp; /* cap_rights_t * */
+ *n_args = 2;
+ break;
+ }
+ /* cap_ioctls_limit */
+ case 534: {
+ struct cap_ioctls_limit_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->cmds; /* const u_long * */
+ uarg[2] = p->ncmds; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* cap_ioctls_get */
+ case 535: {
+ struct cap_ioctls_get_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->cmds; /* u_long * */
+ uarg[2] = p->maxcmds; /* size_t */
+ *n_args = 3;
+ break;
+ }
+ /* cap_fcntls_limit */
+ case 536: {
+ struct cap_fcntls_limit_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = p->fcntlrights; /* uint32_t */
+ *n_args = 2;
+ break;
+ }
+ /* cap_fcntls_get */
+ case 537: {
+ struct cap_fcntls_get_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->fcntlrightsp; /* uint32_t * */
+ *n_args = 2;
+ break;
+ }
+ /* bindat */
+ case 538: {
+ struct bindat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->s; /* int */
+ uarg[2] = (intptr_t) p->name; /* caddr_t */
+ iarg[3] = p->namelen; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* connectat */
+ case 539: {
+ struct connectat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ iarg[1] = p->s; /* int */
+ uarg[2] = (intptr_t) p->name; /* caddr_t */
+ iarg[3] = p->namelen; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* chflagsat */
+ case 540: {
+ struct chflagsat_args *p = params;
+ iarg[0] = p->fd; /* int */
+ uarg[1] = (intptr_t) p->path; /* const char * */
+ uarg[2] = p->flags; /* u_long */
+ iarg[3] = p->atflag; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* accept4 */
+ case 541: {
+ struct accept4_args *p = params;
+ iarg[0] = p->s; /* int */
+ uarg[1] = (intptr_t) p->name; /* struct sockaddr *__restrict */
+ uarg[2] = (intptr_t) p->anamelen; /* __socklen_t *__restrict */
+ iarg[3] = p->flags; /* int */
+ *n_args = 4;
+ break;
+ }
+ /* pipe2 */
+ case 542: {
+ struct pipe2_args *p = params;
+ uarg[0] = (intptr_t) p->fildes; /* int * */
+ iarg[1] = p->flags; /* int */
+ *n_args = 2;
+ break;
+ }
+ /* aio_mlock */
+ case 543: {
+ struct aio_mlock_args *p = params;
+ uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+ *n_args = 1;
+ break;
+ }
+ default:
+ *n_args = 0;
+ break;
+ };
+}
+static void
+systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
+{
+ const char *p = NULL;
+ switch (sysnum) {
+ /* nosys */
+ case 0:
+ break;
+ /* sys_exit */
+ case 1:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fork */
+ case 2:
+ break;
+ /* read */
+ case 3:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "void *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* write */
+ case 4:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const void *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* open */
+ case 5:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* close */
+ case 6:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* wait4 */
+ case 7:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "struct rusage *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* link */
+ case 9:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* unlink */
+ case 10:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* chdir */
+ case 12:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fchdir */
+ case 13:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mknod */
+ case 14:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* chmod */
+ case 15:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* chown */
+ case 16:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* obreak */
+ case 17:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getpid */
+ case 20:
+ break;
+ /* mount */
+ case 21:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "caddr_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* unmount */
+ case 22:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setuid */
+ case 23:
+ switch(ndx) {
+ case 0:
+ p = "uid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getuid */
+ case 24:
+ break;
+ /* geteuid */
+ case 25:
+ break;
+ /* ptrace */
+ case 26:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "pid_t";
+ break;
+ case 2:
+ p = "caddr_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* recvmsg */
+ case 27:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct msghdr *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sendmsg */
+ case 28:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct msghdr *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* recvfrom */
+ case 29:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "caddr_t";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "struct sockaddr *__restrict";
+ break;
+ case 5:
+ p = "__socklen_t *__restrict";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* accept */
+ case 30:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct sockaddr *__restrict";
+ break;
+ case 2:
+ p = "__socklen_t *__restrict";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getpeername */
+ case 31:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct sockaddr *__restrict";
+ break;
+ case 2:
+ p = "__socklen_t *__restrict";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getsockname */
+ case 32:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct sockaddr *__restrict";
+ break;
+ case 2:
+ p = "__socklen_t *__restrict";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* access */
+ case 33:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* chflags */
+ case 34:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "u_long";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fchflags */
+ case 35:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "u_long";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sync */
+ case 36:
+ break;
+ /* kill */
+ case 37:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getppid */
+ case 39:
+ break;
+ /* dup */
+ case 41:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pipe */
+ case 42:
+ break;
+ /* getegid */
+ case 43:
+ break;
+ /* profil */
+ case 44:
+ switch(ndx) {
+ case 0:
+ p = "caddr_t";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ktrace */
+ case 45:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getgid */
+ case 47:
+ break;
+ /* getlogin */
+ case 49:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setlogin */
+ case 50:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* acct */
+ case 51:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigaltstack */
+ case 53:
+ switch(ndx) {
+ case 0:
+ p = "stack_t *";
+ break;
+ case 1:
+ p = "stack_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ioctl */
+ case 54:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "u_long";
+ break;
+ case 2:
+ p = "caddr_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* reboot */
+ case 55:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* revoke */
+ case 56:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* symlink */
+ case 57:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* readlink */
+ case 58:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* execve */
+ case 59:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char **";
+ break;
+ case 2:
+ p = "char **";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* umask */
+ case 60:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* chroot */
+ case 61:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* msync */
+ case 65:
+ switch(ndx) {
+ case 0:
+ p = "void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* vfork */
+ case 66:
+ break;
+ /* sbrk */
+ case 69:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sstk */
+ case 70:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ovadvise */
+ case 72:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* munmap */
+ case 73:
+ switch(ndx) {
+ case 0:
+ p = "void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mprotect */
+ case 74:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* madvise */
+ case 75:
+ switch(ndx) {
+ case 0:
+ p = "void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mincore */
+ case 78:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getgroups */
+ case 79:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ case 1:
+ p = "gid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setgroups */
+ case 80:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ case 1:
+ p = "gid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getpgrp */
+ case 81:
+ break;
+ /* setpgid */
+ case 82:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setitimer */
+ case 83:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ case 1:
+ p = "struct itimerval *";
+ break;
+ case 2:
+ p = "struct itimerval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* swapon */
+ case 85:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getitimer */
+ case 86:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ case 1:
+ p = "struct itimerval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getdtablesize */
+ case 89:
+ break;
+ /* dup2 */
+ case 90:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fcntl */
+ case 92:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "long";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* select */
+ case 93:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "fd_set *";
+ break;
+ case 2:
+ p = "fd_set *";
+ break;
+ case 3:
+ p = "fd_set *";
+ break;
+ case 4:
+ p = "struct timeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fsync */
+ case 95:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setpriority */
+ case 96:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* socket */
+ case 97:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* connect */
+ case 98:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "caddr_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getpriority */
+ case 100:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* bind */
+ case 104:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "caddr_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setsockopt */
+ case 105:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "caddr_t";
+ break;
+ case 4:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* listen */
+ case 106:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* gettimeofday */
+ case 116:
+ switch(ndx) {
+ case 0:
+ p = "struct timeval *";
+ break;
+ case 1:
+ p = "struct timezone *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getrusage */
+ case 117:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct rusage *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getsockopt */
+ case 118:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "caddr_t";
+ break;
+ case 4:
+ p = "int *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* readv */
+ case 120:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct iovec *";
+ break;
+ case 2:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* writev */
+ case 121:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct iovec *";
+ break;
+ case 2:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* settimeofday */
+ case 122:
+ switch(ndx) {
+ case 0:
+ p = "struct timeval *";
+ break;
+ case 1:
+ p = "struct timezone *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fchown */
+ case 123:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fchmod */
+ case 124:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setreuid */
+ case 126:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setregid */
+ case 127:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rename */
+ case 128:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* flock */
+ case 131:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mkfifo */
+ case 132:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sendto */
+ case 133:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "caddr_t";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "caddr_t";
+ break;
+ case 5:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* shutdown */
+ case 134:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* socketpair */
+ case 135:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "int *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mkdir */
+ case 136:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rmdir */
+ case 137:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* utimes */
+ case 138:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct timeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* adjtime */
+ case 140:
+ switch(ndx) {
+ case 0:
+ p = "struct timeval *";
+ break;
+ case 1:
+ p = "struct timeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setsid */
+ case 147:
+ break;
+ /* quotactl */
+ case 148:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "caddr_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nlm_syscall */
+ case 154:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "char **";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nfssvc */
+ case 155:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "caddr_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lgetfh */
+ case 160:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct fhandle *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getfh */
+ case 161:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct fhandle *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sysarch */
+ case 165:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rtprio */
+ case 166:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "pid_t";
+ break;
+ case 2:
+ p = "struct rtprio *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* semsys */
+ case 169:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* msgsys */
+ case 170:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "int";
+ break;
+ case 5:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* shmsys */
+ case 171:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* freebsd6_pread */
+ case 173:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "void *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* freebsd6_pwrite */
+ case 174:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const void *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setfib */
+ case 175:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ntp_adjtime */
+ case 176:
+ switch(ndx) {
+ case 0:
+ p = "struct timex *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setgid */
+ case 181:
+ switch(ndx) {
+ case 0:
+ p = "gid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setegid */
+ case 182:
+ switch(ndx) {
+ case 0:
+ p = "gid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* seteuid */
+ case 183:
+ switch(ndx) {
+ case 0:
+ p = "uid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* stat */
+ case 188:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct stat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fstat */
+ case 189:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct stat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lstat */
+ case 190:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct stat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pathconf */
+ case 191:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fpathconf */
+ case 192:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getrlimit */
+ case 194:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ case 1:
+ p = "struct rlimit *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setrlimit */
+ case 195:
+ switch(ndx) {
+ case 0:
+ p = "u_int";
+ break;
+ case 1:
+ p = "struct rlimit *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getdirentries */
+ case 196:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "u_int";
+ break;
+ case 3:
+ p = "long *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* freebsd6_mmap */
+ case 197:
+ switch(ndx) {
+ case 0:
+ p = "caddr_t";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "int";
+ break;
+ case 5:
+ p = "int";
+ break;
+ case 6:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nosys */
+ case 198:
+ break;
+ /* freebsd6_lseek */
+ case 199:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "off_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* freebsd6_truncate */
+ case 200:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* freebsd6_ftruncate */
+ case 201:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __sysctl */
+ case 202:
+ switch(ndx) {
+ case 0:
+ p = "int *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t *";
+ break;
+ case 4:
+ p = "void *";
+ break;
+ case 5:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mlock */
+ case 203:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* munlock */
+ case 204:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* undelete */
+ case 205:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* futimes */
+ case 206:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct timeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getpgid */
+ case 207:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* poll */
+ case 209:
+ switch(ndx) {
+ case 0:
+ p = "struct pollfd *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lkmnosys */
+ case 210:
+ break;
+ /* lkmnosys */
+ case 211:
+ break;
+ /* lkmnosys */
+ case 212:
+ break;
+ /* lkmnosys */
+ case 213:
+ break;
+ /* lkmnosys */
+ case 214:
+ break;
+ /* lkmnosys */
+ case 215:
+ break;
+ /* lkmnosys */
+ case 216:
+ break;
+ /* lkmnosys */
+ case 217:
+ break;
+ /* lkmnosys */
+ case 218:
+ break;
+ /* lkmnosys */
+ case 219:
+ break;
+ /* semget */
+ case 221:
+ switch(ndx) {
+ case 0:
+ p = "key_t";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* semop */
+ case 222:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct sembuf *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* msgget */
+ case 225:
+ switch(ndx) {
+ case 0:
+ p = "key_t";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* msgsnd */
+ case 226:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const void *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* msgrcv */
+ case 227:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "void *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "long";
+ break;
+ case 4:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* shmat */
+ case 228:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const void *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* shmdt */
+ case 230:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* shmget */
+ case 231:
+ switch(ndx) {
+ case 0:
+ p = "key_t";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* clock_gettime */
+ case 232:
+ switch(ndx) {
+ case 0:
+ p = "clockid_t";
+ break;
+ case 1:
+ p = "struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* clock_settime */
+ case 233:
+ switch(ndx) {
+ case 0:
+ p = "clockid_t";
+ break;
+ case 1:
+ p = "const struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* clock_getres */
+ case 234:
+ switch(ndx) {
+ case 0:
+ p = "clockid_t";
+ break;
+ case 1:
+ p = "struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ktimer_create */
+ case 235:
+ switch(ndx) {
+ case 0:
+ p = "clockid_t";
+ break;
+ case 1:
+ p = "struct sigevent *";
+ break;
+ case 2:
+ p = "int *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ktimer_delete */
+ case 236:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ktimer_settime */
+ case 237:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const struct itimerspec *";
+ break;
+ case 3:
+ p = "struct itimerspec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ktimer_gettime */
+ case 238:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct itimerspec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ktimer_getoverrun */
+ case 239:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nanosleep */
+ case 240:
+ switch(ndx) {
+ case 0:
+ p = "const struct timespec *";
+ break;
+ case 1:
+ p = "struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ffclock_getcounter */
+ case 241:
+ switch(ndx) {
+ case 0:
+ p = "ffcounter *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ffclock_setestimate */
+ case 242:
+ switch(ndx) {
+ case 0:
+ p = "struct ffclock_estimate *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ffclock_getestimate */
+ case 243:
+ switch(ndx) {
+ case 0:
+ p = "struct ffclock_estimate *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* clock_getcpuclockid2 */
+ case 247:
+ switch(ndx) {
+ case 0:
+ p = "id_t";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "clockid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ntp_gettime */
+ case 248:
+ switch(ndx) {
+ case 0:
+ p = "struct ntptimeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* minherit */
+ case 250:
+ switch(ndx) {
+ case 0:
+ p = "void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rfork */
+ case 251:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* openbsd_poll */
+ case 252:
+ switch(ndx) {
+ case 0:
+ p = "struct pollfd *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* issetugid */
+ case 253:
+ break;
+ /* lchown */
+ case 254:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_read */
+ case 255:
+ switch(ndx) {
+ case 0:
+ p = "struct aiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_write */
+ case 256:
+ switch(ndx) {
+ case 0:
+ p = "struct aiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lio_listio */
+ case 257:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct aiocb *const *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "struct sigevent *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getdents */
+ case 272:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lchmod */
+ case 274:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "mode_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lchown */
+ case 275:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "uid_t";
+ break;
+ case 2:
+ p = "gid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lutimes */
+ case 276:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct timeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* msync */
+ case 277:
+ switch(ndx) {
+ case 0:
+ p = "void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nstat */
+ case 278:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct nstat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nfstat */
+ case 279:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct nstat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nlstat */
+ case 280:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct nstat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* preadv */
+ case 289:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct iovec *";
+ break;
+ case 2:
+ p = "u_int";
+ break;
+ case 3:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pwritev */
+ case 290:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct iovec *";
+ break;
+ case 2:
+ p = "u_int";
+ break;
+ case 3:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fhopen */
+ case 298:
+ switch(ndx) {
+ case 0:
+ p = "const struct fhandle *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fhstat */
+ case 299:
+ switch(ndx) {
+ case 0:
+ p = "const struct fhandle *";
+ break;
+ case 1:
+ p = "struct stat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* modnext */
+ case 300:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* modstat */
+ case 301:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct module_stat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* modfnext */
+ case 302:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* modfind */
+ case 303:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kldload */
+ case 304:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kldunload */
+ case 305:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kldfind */
+ case 306:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kldnext */
+ case 307:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kldstat */
+ case 308:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct kld_file_stat *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kldfirstmod */
+ case 309:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getsid */
+ case 310:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setresuid */
+ case 311:
+ switch(ndx) {
+ case 0:
+ p = "uid_t";
+ break;
+ case 1:
+ p = "uid_t";
+ break;
+ case 2:
+ p = "uid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setresgid */
+ case 312:
+ switch(ndx) {
+ case 0:
+ p = "gid_t";
+ break;
+ case 1:
+ p = "gid_t";
+ break;
+ case 2:
+ p = "gid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_return */
+ case 314:
+ switch(ndx) {
+ case 0:
+ p = "struct aiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_suspend */
+ case 315:
+ switch(ndx) {
+ case 0:
+ p = "struct aiocb *const *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_cancel */
+ case 316:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct aiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_error */
+ case 317:
+ switch(ndx) {
+ case 0:
+ p = "struct aiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* oaio_read */
+ case 318:
+ switch(ndx) {
+ case 0:
+ p = "struct oaiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* oaio_write */
+ case 319:
+ switch(ndx) {
+ case 0:
+ p = "struct oaiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* olio_listio */
+ case 320:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct oaiocb *const *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "struct osigevent *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* yield */
+ case 321:
+ break;
+ /* mlockall */
+ case 324:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* munlockall */
+ case 325:
+ break;
+ /* __getcwd */
+ case 326:
+ switch(ndx) {
+ case 0:
+ p = "u_char *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sched_setparam */
+ case 327:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ case 1:
+ p = "const struct sched_param *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sched_getparam */
+ case 328:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ case 1:
+ p = "struct sched_param *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sched_setscheduler */
+ case 329:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const struct sched_param *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sched_getscheduler */
+ case 330:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sched_yield */
+ case 331:
+ break;
+ /* sched_get_priority_max */
+ case 332:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sched_get_priority_min */
+ case 333:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sched_rr_get_interval */
+ case 334:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ case 1:
+ p = "struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* utrace */
+ case 335:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kldsym */
+ case 337:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* jail */
+ case 338:
+ switch(ndx) {
+ case 0:
+ p = "struct jail *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nnpfs_syscall */
+ case 339:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigprocmask */
+ case 340:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const sigset_t *";
+ break;
+ case 2:
+ p = "sigset_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigsuspend */
+ case 341:
+ switch(ndx) {
+ case 0:
+ p = "const sigset_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigpending */
+ case 343:
+ switch(ndx) {
+ case 0:
+ p = "sigset_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigtimedwait */
+ case 345:
+ switch(ndx) {
+ case 0:
+ p = "const sigset_t *";
+ break;
+ case 1:
+ p = "siginfo_t *";
+ break;
+ case 2:
+ p = "const struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigwaitinfo */
+ case 346:
+ switch(ndx) {
+ case 0:
+ p = "const sigset_t *";
+ break;
+ case 1:
+ p = "siginfo_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_get_file */
+ case 347:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_set_file */
+ case 348:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_get_fd */
+ case 349:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_set_fd */
+ case 350:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_delete_file */
+ case 351:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_delete_fd */
+ case 352:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_aclcheck_file */
+ case 353:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_aclcheck_fd */
+ case 354:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattrctl */
+ case 355:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_set_file */
+ case 356:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_get_file */
+ case 357:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_delete_file */
+ case 358:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_waitcomplete */
+ case 359:
+ switch(ndx) {
+ case 0:
+ p = "struct aiocb **";
+ break;
+ case 1:
+ p = "struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getresuid */
+ case 360:
+ switch(ndx) {
+ case 0:
+ p = "uid_t *";
+ break;
+ case 1:
+ p = "uid_t *";
+ break;
+ case 2:
+ p = "uid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getresgid */
+ case 361:
+ switch(ndx) {
+ case 0:
+ p = "gid_t *";
+ break;
+ case 1:
+ p = "gid_t *";
+ break;
+ case 2:
+ p = "gid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kqueue */
+ case 362:
+ break;
+ /* kevent */
+ case 363:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct kevent *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "struct kevent *";
+ break;
+ case 4:
+ p = "int";
+ break;
+ case 5:
+ p = "const struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_set_fd */
+ case 371:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_get_fd */
+ case 372:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_delete_fd */
+ case 373:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __setugid */
+ case 374:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* eaccess */
+ case 376:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* afs3_syscall */
+ case 377:
+ switch(ndx) {
+ case 0:
+ p = "long";
+ break;
+ case 1:
+ p = "long";
+ break;
+ case 2:
+ p = "long";
+ break;
+ case 3:
+ p = "long";
+ break;
+ case 4:
+ p = "long";
+ break;
+ case 5:
+ p = "long";
+ break;
+ case 6:
+ p = "long";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* nmount */
+ case 378:
+ switch(ndx) {
+ case 0:
+ p = "struct iovec *";
+ break;
+ case 1:
+ p = "unsigned int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_get_proc */
+ case 384:
+ switch(ndx) {
+ case 0:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_set_proc */
+ case 385:
+ switch(ndx) {
+ case 0:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_get_fd */
+ case 386:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_get_file */
+ case 387:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_set_fd */
+ case 388:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_set_file */
+ case 389:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kenv */
+ case 390:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "char *";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lchflags */
+ case 391:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "u_long";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* uuidgen */
+ case 392:
+ switch(ndx) {
+ case 0:
+ p = "struct uuid *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sendfile */
+ case 393:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "off_t";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ case 4:
+ p = "struct sf_hdtr *";
+ break;
+ case 5:
+ p = "off_t *";
+ break;
+ case 6:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mac_syscall */
+ case 394:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getfsstat */
+ case 395:
+ switch(ndx) {
+ case 0:
+ p = "struct statfs *";
+ break;
+ case 1:
+ p = "long";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* statfs */
+ case 396:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "struct statfs *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fstatfs */
+ case 397:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct statfs *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fhstatfs */
+ case 398:
+ switch(ndx) {
+ case 0:
+ p = "const struct fhandle *";
+ break;
+ case 1:
+ p = "struct statfs *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_close */
+ case 400:
+ switch(ndx) {
+ case 0:
+ p = "semid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_post */
+ case 401:
+ switch(ndx) {
+ case 0:
+ p = "semid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_wait */
+ case 402:
+ switch(ndx) {
+ case 0:
+ p = "semid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_trywait */
+ case 403:
+ switch(ndx) {
+ case 0:
+ p = "semid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_init */
+ case 404:
+ switch(ndx) {
+ case 0:
+ p = "semid_t *";
+ break;
+ case 1:
+ p = "unsigned int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_open */
+ case 405:
+ switch(ndx) {
+ case 0:
+ p = "semid_t *";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "mode_t";
+ break;
+ case 4:
+ p = "unsigned int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_unlink */
+ case 406:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_getvalue */
+ case 407:
+ switch(ndx) {
+ case 0:
+ p = "semid_t";
+ break;
+ case 1:
+ p = "int *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_destroy */
+ case 408:
+ switch(ndx) {
+ case 0:
+ p = "semid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_get_pid */
+ case 409:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ case 1:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_get_link */
+ case 410:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_set_link */
+ case 411:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_set_link */
+ case 412:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_get_link */
+ case 413:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_delete_link */
+ case 414:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __mac_execve */
+ case 415:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "char **";
+ break;
+ case 2:
+ p = "char **";
+ break;
+ case 3:
+ p = "struct mac *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigaction */
+ case 416:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const struct sigaction *";
+ break;
+ case 2:
+ p = "struct sigaction *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigreturn */
+ case 417:
+ switch(ndx) {
+ case 0:
+ p = "const struct __ucontext *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getcontext */
+ case 421:
+ switch(ndx) {
+ case 0:
+ p = "struct __ucontext *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setcontext */
+ case 422:
+ switch(ndx) {
+ case 0:
+ p = "const struct __ucontext *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* swapcontext */
+ case 423:
+ switch(ndx) {
+ case 0:
+ p = "struct __ucontext *";
+ break;
+ case 1:
+ p = "const struct __ucontext *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* swapoff */
+ case 424:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_get_link */
+ case 425:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_set_link */
+ case 426:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_delete_link */
+ case 427:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __acl_aclcheck_link */
+ case 428:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "acl_type_t";
+ break;
+ case 2:
+ p = "struct acl *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigwait */
+ case 429:
+ switch(ndx) {
+ case 0:
+ p = "const sigset_t *";
+ break;
+ case 1:
+ p = "int *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_create */
+ case 430:
+ switch(ndx) {
+ case 0:
+ p = "ucontext_t *";
+ break;
+ case 1:
+ p = "long *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_exit */
+ case 431:
+ switch(ndx) {
+ case 0:
+ p = "long *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_self */
+ case 432:
+ switch(ndx) {
+ case 0:
+ p = "long *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_kill */
+ case 433:
+ switch(ndx) {
+ case 0:
+ p = "long";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* _umtx_lock */
+ case 434:
+ switch(ndx) {
+ case 0:
+ p = "struct umtx *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* _umtx_unlock */
+ case 435:
+ switch(ndx) {
+ case 0:
+ p = "struct umtx *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* jail_attach */
+ case 436:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_list_fd */
+ case 437:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_list_file */
+ case 438:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* extattr_list_link */
+ case 439:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ksem_timedwait */
+ case 441:
+ switch(ndx) {
+ case 0:
+ p = "semid_t";
+ break;
+ case 1:
+ p = "const struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_suspend */
+ case 442:
+ switch(ndx) {
+ case 0:
+ p = "const struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_wake */
+ case 443:
+ switch(ndx) {
+ case 0:
+ p = "long";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kldunloadf */
+ case 444:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* audit */
+ case 445:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* auditon */
+ case 446:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "void *";
+ break;
+ case 2:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getauid */
+ case 447:
+ switch(ndx) {
+ case 0:
+ p = "uid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setauid */
+ case 448:
+ switch(ndx) {
+ case 0:
+ p = "uid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getaudit */
+ case 449:
+ switch(ndx) {
+ case 0:
+ p = "struct auditinfo *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setaudit */
+ case 450:
+ switch(ndx) {
+ case 0:
+ p = "struct auditinfo *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getaudit_addr */
+ case 451:
+ switch(ndx) {
+ case 0:
+ p = "struct auditinfo_addr *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setaudit_addr */
+ case 452:
+ switch(ndx) {
+ case 0:
+ p = "struct auditinfo_addr *";
+ break;
+ case 1:
+ p = "u_int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* auditctl */
+ case 453:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* _umtx_op */
+ case 454:
+ switch(ndx) {
+ case 0:
+ p = "void *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "u_long";
+ break;
+ case 3:
+ p = "void *";
+ break;
+ case 4:
+ p = "void *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_new */
+ case 455:
+ switch(ndx) {
+ case 0:
+ p = "struct thr_param *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sigqueue */
+ case 456:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kmq_open */
+ case 457:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "mode_t";
+ break;
+ case 3:
+ p = "const struct mq_attr *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kmq_setattr */
+ case 458:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const struct mq_attr *";
+ break;
+ case 2:
+ p = "struct mq_attr *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kmq_timedreceive */
+ case 459:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "unsigned *";
+ break;
+ case 4:
+ p = "const struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kmq_timedsend */
+ case 460:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "unsigned";
+ break;
+ case 4:
+ p = "const struct timespec *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kmq_notify */
+ case 461:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const struct sigevent *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* kmq_unlink */
+ case 462:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* abort2 */
+ case 463:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "void **";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_set_name */
+ case 464:
+ switch(ndx) {
+ case 0:
+ p = "long";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_fsync */
+ case 465:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct aiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rtprio_thread */
+ case 466:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "lwpid_t";
+ break;
+ case 2:
+ p = "struct rtprio *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sctp_peeloff */
+ case 471:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "uint32_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sctp_generic_sendmsg */
+ case 472:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "caddr_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "caddr_t";
+ break;
+ case 4:
+ p = "__socklen_t";
+ break;
+ case 5:
+ p = "struct sctp_sndrcvinfo *";
+ break;
+ case 6:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sctp_generic_sendmsg_iov */
+ case 473:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct iovec *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "caddr_t";
+ break;
+ case 4:
+ p = "__socklen_t";
+ break;
+ case 5:
+ p = "struct sctp_sndrcvinfo *";
+ break;
+ case 6:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* sctp_generic_recvmsg */
+ case 474:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct iovec *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "struct sockaddr *";
+ break;
+ case 4:
+ p = "__socklen_t *";
+ break;
+ case 5:
+ p = "struct sctp_sndrcvinfo *";
+ break;
+ case 6:
+ p = "int *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pread */
+ case 475:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "void *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pwrite */
+ case 476:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const void *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ case 3:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mmap */
+ case 477:
+ switch(ndx) {
+ case 0:
+ p = "caddr_t";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "int";
+ break;
+ case 5:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lseek */
+ case 478:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "off_t";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* truncate */
+ case 479:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* ftruncate */
+ case 480:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* thr_kill2 */
+ case 481:
+ switch(ndx) {
+ case 0:
+ p = "pid_t";
+ break;
+ case 1:
+ p = "long";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* shm_open */
+ case 482:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "mode_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* shm_unlink */
+ case 483:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cpuset */
+ case 484:
+ switch(ndx) {
+ case 0:
+ p = "cpusetid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cpuset_setid */
+ case 485:
+ switch(ndx) {
+ case 0:
+ p = "cpuwhich_t";
+ break;
+ case 1:
+ p = "id_t";
+ break;
+ case 2:
+ p = "cpusetid_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cpuset_getid */
+ case 486:
+ switch(ndx) {
+ case 0:
+ p = "cpulevel_t";
+ break;
+ case 1:
+ p = "cpuwhich_t";
+ break;
+ case 2:
+ p = "id_t";
+ break;
+ case 3:
+ p = "cpusetid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cpuset_getaffinity */
+ case 487:
+ switch(ndx) {
+ case 0:
+ p = "cpulevel_t";
+ break;
+ case 1:
+ p = "cpuwhich_t";
+ break;
+ case 2:
+ p = "id_t";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ case 4:
+ p = "cpuset_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cpuset_setaffinity */
+ case 488:
+ switch(ndx) {
+ case 0:
+ p = "cpulevel_t";
+ break;
+ case 1:
+ p = "cpuwhich_t";
+ break;
+ case 2:
+ p = "id_t";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ case 4:
+ p = "const cpuset_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* faccessat */
+ case 489:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fchmodat */
+ case 490:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "mode_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fchownat */
+ case 491:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "uid_t";
+ break;
+ case 3:
+ p = "gid_t";
+ break;
+ case 4:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fexecve */
+ case 492:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char **";
+ break;
+ case 2:
+ p = "char **";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* fstatat */
+ case 493:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "struct stat *";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* futimesat */
+ case 494:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "struct timeval *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* linkat */
+ case 495:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "char *";
+ break;
+ case 4:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mkdirat */
+ case 496:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "mode_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mkfifoat */
+ case 497:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "mode_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* mknodat */
+ case 498:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "mode_t";
+ break;
+ case 3:
+ p = "dev_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* openat */
+ case 499:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "mode_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* readlinkat */
+ case 500:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "char *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* renameat */
+ case 501:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* symlinkat */
+ case 502:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* unlinkat */
+ case 503:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "char *";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* posix_openpt */
+ case 504:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* gssd_syscall */
+ case 505:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* jail_get */
+ case 506:
+ switch(ndx) {
+ case 0:
+ p = "struct iovec *";
+ break;
+ case 1:
+ p = "unsigned int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* jail_set */
+ case 507:
+ switch(ndx) {
+ case 0:
+ p = "struct iovec *";
+ break;
+ case 1:
+ p = "unsigned int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* jail_remove */
+ case 508:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* closefrom */
+ case 509:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __semctl */
+ case 510:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "int";
+ break;
+ case 3:
+ p = "union semun *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* msgctl */
+ case 511:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "struct msqid_ds *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* shmctl */
+ case 512:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "struct shmid_ds *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* lpathconf */
+ case 513:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* __cap_rights_get */
+ case 515:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "cap_rights_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cap_enter */
+ case 516:
+ break;
+ /* cap_getmode */
+ case 517:
+ switch(ndx) {
+ case 0:
+ p = "u_int *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pdfork */
+ case 518:
+ switch(ndx) {
+ case 0:
+ p = "int *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pdkill */
+ case 519:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pdgetpid */
+ case 520:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "pid_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pselect */
+ case 522:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "fd_set *";
+ break;
+ case 2:
+ p = "fd_set *";
+ break;
+ case 3:
+ p = "fd_set *";
+ break;
+ case 4:
+ p = "const struct timespec *";
+ break;
+ case 5:
+ p = "const sigset_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* getloginclass */
+ case 523:
+ switch(ndx) {
+ case 0:
+ p = "char *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* setloginclass */
+ case 524:
+ switch(ndx) {
+ case 0:
+ p = "const char *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rctl_get_racct */
+ case 525:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rctl_get_rules */
+ case 526:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rctl_get_limits */
+ case 527:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rctl_add_rule */
+ case 528:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* rctl_remove_rule */
+ case 529:
+ switch(ndx) {
+ case 0:
+ p = "const void *";
+ break;
+ case 1:
+ p = "size_t";
+ break;
+ case 2:
+ p = "void *";
+ break;
+ case 3:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* posix_fallocate */
+ case 530:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "off_t";
+ break;
+ case 2:
+ p = "off_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* posix_fadvise */
+ case 531:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "off_t";
+ break;
+ case 2:
+ p = "off_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* wait6 */
+ case 532:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "id_t";
+ break;
+ case 2:
+ p = "int *";
+ break;
+ case 3:
+ p = "int";
+ break;
+ case 4:
+ p = "struct __wrusage *";
+ break;
+ case 5:
+ p = "siginfo_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cap_rights_limit */
+ case 533:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "cap_rights_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cap_ioctls_limit */
+ case 534:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const u_long *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cap_ioctls_get */
+ case 535:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "u_long *";
+ break;
+ case 2:
+ p = "size_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cap_fcntls_limit */
+ case 536:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "uint32_t";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* cap_fcntls_get */
+ case 537:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "uint32_t *";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* bindat */
+ case 538:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "caddr_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* connectat */
+ case 539:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "int";
+ break;
+ case 2:
+ p = "caddr_t";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* chflagsat */
+ case 540:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "const char *";
+ break;
+ case 2:
+ p = "u_long";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* accept4 */
+ case 541:
+ switch(ndx) {
+ case 0:
+ p = "int";
+ break;
+ case 1:
+ p = "struct sockaddr *__restrict";
+ break;
+ case 2:
+ p = "__socklen_t *__restrict";
+ break;
+ case 3:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* pipe2 */
+ case 542:
+ switch(ndx) {
+ case 0:
+ p = "int *";
+ break;
+ case 1:
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ break;
+ /* aio_mlock */
+ case 543:
+ switch(ndx) {
+ case 0:
+ p = "struct aiocb *";
+ break;
+ default:
+ break;
+ };
+ break;
+ default:
+ break;
+ };
+ if (p != NULL)
+ strlcpy(desc, p, descsz);
+}
+static void
+systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)
+{
+ const char *p = NULL;
+ switch (sysnum) {
+ /* nosys */
+ case 0:
+ /* sys_exit */
+ case 1:
+ if (ndx == 0 || ndx == 1)
+ p = "void";
+ break;
+ /* fork */
+ case 2:
+ /* read */
+ case 3:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* write */
+ case 4:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* open */
+ case 5:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* close */
+ case 6:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* wait4 */
+ case 7:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* link */
+ case 9:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* unlink */
+ case 10:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* chdir */
+ case 12:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fchdir */
+ case 13:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mknod */
+ case 14:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* chmod */
+ case 15:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* chown */
+ case 16:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* obreak */
+ case 17:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getpid */
+ case 20:
+ /* mount */
+ case 21:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* unmount */
+ case 22:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setuid */
+ case 23:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getuid */
+ case 24:
+ /* geteuid */
+ case 25:
+ /* ptrace */
+ case 26:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* recvmsg */
+ case 27:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sendmsg */
+ case 28:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* recvfrom */
+ case 29:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* accept */
+ case 30:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getpeername */
+ case 31:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getsockname */
+ case 32:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* access */
+ case 33:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* chflags */
+ case 34:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fchflags */
+ case 35:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sync */
+ case 36:
+ /* kill */
+ case 37:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getppid */
+ case 39:
+ /* dup */
+ case 41:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* pipe */
+ case 42:
+ /* getegid */
+ case 43:
+ /* profil */
+ case 44:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ktrace */
+ case 45:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getgid */
+ case 47:
+ /* getlogin */
+ case 49:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setlogin */
+ case 50:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* acct */
+ case 51:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigaltstack */
+ case 53:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ioctl */
+ case 54:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* reboot */
+ case 55:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* revoke */
+ case 56:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* symlink */
+ case 57:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* readlink */
+ case 58:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* execve */
+ case 59:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* umask */
+ case 60:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* chroot */
+ case 61:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* msync */
+ case 65:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* vfork */
+ case 66:
+ /* sbrk */
+ case 69:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sstk */
+ case 70:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ovadvise */
+ case 72:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* munmap */
+ case 73:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mprotect */
+ case 74:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* madvise */
+ case 75:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mincore */
+ case 78:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getgroups */
+ case 79:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setgroups */
+ case 80:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getpgrp */
+ case 81:
+ /* setpgid */
+ case 82:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setitimer */
+ case 83:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* swapon */
+ case 85:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getitimer */
+ case 86:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getdtablesize */
+ case 89:
+ /* dup2 */
+ case 90:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fcntl */
+ case 92:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* select */
+ case 93:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fsync */
+ case 95:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setpriority */
+ case 96:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* socket */
+ case 97:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* connect */
+ case 98:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getpriority */
+ case 100:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* bind */
+ case 104:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setsockopt */
+ case 105:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* listen */
+ case 106:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* gettimeofday */
+ case 116:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getrusage */
+ case 117:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getsockopt */
+ case 118:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* readv */
+ case 120:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* writev */
+ case 121:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* settimeofday */
+ case 122:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fchown */
+ case 123:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fchmod */
+ case 124:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setreuid */
+ case 126:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setregid */
+ case 127:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rename */
+ case 128:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* flock */
+ case 131:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mkfifo */
+ case 132:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sendto */
+ case 133:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* shutdown */
+ case 134:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* socketpair */
+ case 135:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mkdir */
+ case 136:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rmdir */
+ case 137:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* utimes */
+ case 138:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* adjtime */
+ case 140:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setsid */
+ case 147:
+ /* quotactl */
+ case 148:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* nlm_syscall */
+ case 154:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* nfssvc */
+ case 155:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lgetfh */
+ case 160:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getfh */
+ case 161:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sysarch */
+ case 165:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rtprio */
+ case 166:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* semsys */
+ case 169:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* msgsys */
+ case 170:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* shmsys */
+ case 171:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* freebsd6_pread */
+ case 173:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* freebsd6_pwrite */
+ case 174:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* setfib */
+ case 175:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ntp_adjtime */
+ case 176:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setgid */
+ case 181:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setegid */
+ case 182:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* seteuid */
+ case 183:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* stat */
+ case 188:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fstat */
+ case 189:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lstat */
+ case 190:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* pathconf */
+ case 191:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fpathconf */
+ case 192:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getrlimit */
+ case 194:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setrlimit */
+ case 195:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getdirentries */
+ case 196:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* freebsd6_mmap */
+ case 197:
+ if (ndx == 0 || ndx == 1)
+ p = "caddr_t";
+ break;
+ /* nosys */
+ case 198:
+ /* freebsd6_lseek */
+ case 199:
+ if (ndx == 0 || ndx == 1)
+ p = "off_t";
+ break;
+ /* freebsd6_truncate */
+ case 200:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* freebsd6_ftruncate */
+ case 201:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __sysctl */
+ case 202:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mlock */
+ case 203:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* munlock */
+ case 204:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* undelete */
+ case 205:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* futimes */
+ case 206:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getpgid */
+ case 207:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* poll */
+ case 209:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lkmnosys */
+ case 210:
+ /* lkmnosys */
+ case 211:
+ /* lkmnosys */
+ case 212:
+ /* lkmnosys */
+ case 213:
+ /* lkmnosys */
+ case 214:
+ /* lkmnosys */
+ case 215:
+ /* lkmnosys */
+ case 216:
+ /* lkmnosys */
+ case 217:
+ /* lkmnosys */
+ case 218:
+ /* lkmnosys */
+ case 219:
+ /* semget */
+ case 221:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* semop */
+ case 222:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* msgget */
+ case 225:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* msgsnd */
+ case 226:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* msgrcv */
+ case 227:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* shmat */
+ case 228:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* shmdt */
+ case 230:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* shmget */
+ case 231:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* clock_gettime */
+ case 232:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* clock_settime */
+ case 233:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* clock_getres */
+ case 234:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ktimer_create */
+ case 235:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ktimer_delete */
+ case 236:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ktimer_settime */
+ case 237:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ktimer_gettime */
+ case 238:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ktimer_getoverrun */
+ case 239:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* nanosleep */
+ case 240:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ffclock_getcounter */
+ case 241:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ffclock_setestimate */
+ case 242:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ffclock_getestimate */
+ case 243:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* clock_getcpuclockid2 */
+ case 247:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ntp_gettime */
+ case 248:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* minherit */
+ case 250:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rfork */
+ case 251:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* openbsd_poll */
+ case 252:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* issetugid */
+ case 253:
+ /* lchown */
+ case 254:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_read */
+ case 255:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_write */
+ case 256:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lio_listio */
+ case 257:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getdents */
+ case 272:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lchmod */
+ case 274:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lchown */
+ case 275:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lutimes */
+ case 276:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* msync */
+ case 277:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* nstat */
+ case 278:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* nfstat */
+ case 279:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* nlstat */
+ case 280:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* preadv */
+ case 289:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* pwritev */
+ case 290:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* fhopen */
+ case 298:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fhstat */
+ case 299:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* modnext */
+ case 300:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* modstat */
+ case 301:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* modfnext */
+ case 302:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* modfind */
+ case 303:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kldload */
+ case 304:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kldunload */
+ case 305:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kldfind */
+ case 306:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kldnext */
+ case 307:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kldstat */
+ case 308:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kldfirstmod */
+ case 309:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getsid */
+ case 310:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setresuid */
+ case 311:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setresgid */
+ case 312:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_return */
+ case 314:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_suspend */
+ case 315:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_cancel */
+ case 316:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_error */
+ case 317:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* oaio_read */
+ case 318:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* oaio_write */
+ case 319:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* olio_listio */
+ case 320:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* yield */
+ case 321:
+ /* mlockall */
+ case 324:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* munlockall */
+ case 325:
+ /* __getcwd */
+ case 326:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sched_setparam */
+ case 327:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sched_getparam */
+ case 328:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sched_setscheduler */
+ case 329:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sched_getscheduler */
+ case 330:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sched_yield */
+ case 331:
+ /* sched_get_priority_max */
+ case 332:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sched_get_priority_min */
+ case 333:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sched_rr_get_interval */
+ case 334:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* utrace */
+ case 335:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kldsym */
+ case 337:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* jail */
+ case 338:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* nnpfs_syscall */
+ case 339:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigprocmask */
+ case 340:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigsuspend */
+ case 341:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigpending */
+ case 343:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigtimedwait */
+ case 345:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigwaitinfo */
+ case 346:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_get_file */
+ case 347:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_set_file */
+ case 348:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_get_fd */
+ case 349:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_set_fd */
+ case 350:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_delete_file */
+ case 351:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_delete_fd */
+ case 352:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_aclcheck_file */
+ case 353:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_aclcheck_fd */
+ case 354:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* extattrctl */
+ case 355:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* extattr_set_file */
+ case 356:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* extattr_get_file */
+ case 357:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* extattr_delete_file */
+ case 358:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_waitcomplete */
+ case 359:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getresuid */
+ case 360:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getresgid */
+ case 361:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kqueue */
+ case 362:
+ /* kevent */
+ case 363:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* extattr_set_fd */
+ case 371:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* extattr_get_fd */
+ case 372:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* extattr_delete_fd */
+ case 373:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __setugid */
+ case 374:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* eaccess */
+ case 376:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* afs3_syscall */
+ case 377:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* nmount */
+ case 378:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_get_proc */
+ case 384:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_set_proc */
+ case 385:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_get_fd */
+ case 386:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_get_file */
+ case 387:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_set_fd */
+ case 388:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_set_file */
+ case 389:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kenv */
+ case 390:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lchflags */
+ case 391:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* uuidgen */
+ case 392:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sendfile */
+ case 393:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mac_syscall */
+ case 394:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getfsstat */
+ case 395:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* statfs */
+ case 396:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fstatfs */
+ case 397:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fhstatfs */
+ case 398:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_close */
+ case 400:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_post */
+ case 401:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_wait */
+ case 402:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_trywait */
+ case 403:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_init */
+ case 404:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_open */
+ case 405:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_unlink */
+ case 406:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_getvalue */
+ case 407:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ksem_destroy */
+ case 408:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_get_pid */
+ case 409:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_get_link */
+ case 410:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_set_link */
+ case 411:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* extattr_set_link */
+ case 412:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* extattr_get_link */
+ case 413:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* extattr_delete_link */
+ case 414:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __mac_execve */
+ case 415:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigaction */
+ case 416:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigreturn */
+ case 417:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getcontext */
+ case 421:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setcontext */
+ case 422:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* swapcontext */
+ case 423:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* swapoff */
+ case 424:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_get_link */
+ case 425:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_set_link */
+ case 426:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_delete_link */
+ case 427:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __acl_aclcheck_link */
+ case 428:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigwait */
+ case 429:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* thr_create */
+ case 430:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* thr_exit */
+ case 431:
+ if (ndx == 0 || ndx == 1)
+ p = "void";
+ break;
+ /* thr_self */
+ case 432:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* thr_kill */
+ case 433:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* _umtx_lock */
+ case 434:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* _umtx_unlock */
+ case 435:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* jail_attach */
+ case 436:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* extattr_list_fd */
+ case 437:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* extattr_list_file */
+ case 438:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* extattr_list_link */
+ case 439:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* ksem_timedwait */
+ case 441:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* thr_suspend */
+ case 442:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* thr_wake */
+ case 443:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kldunloadf */
+ case 444:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* audit */
+ case 445:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* auditon */
+ case 446:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getauid */
+ case 447:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setauid */
+ case 448:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getaudit */
+ case 449:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setaudit */
+ case 450:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getaudit_addr */
+ case 451:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setaudit_addr */
+ case 452:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* auditctl */
+ case 453:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* _umtx_op */
+ case 454:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* thr_new */
+ case 455:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sigqueue */
+ case 456:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kmq_open */
+ case 457:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kmq_setattr */
+ case 458:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kmq_timedreceive */
+ case 459:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kmq_timedsend */
+ case 460:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kmq_notify */
+ case 461:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* kmq_unlink */
+ case 462:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* abort2 */
+ case 463:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* thr_set_name */
+ case 464:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_fsync */
+ case 465:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rtprio_thread */
+ case 466:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sctp_peeloff */
+ case 471:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sctp_generic_sendmsg */
+ case 472:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sctp_generic_sendmsg_iov */
+ case 473:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* sctp_generic_recvmsg */
+ case 474:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* pread */
+ case 475:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* pwrite */
+ case 476:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* mmap */
+ case 477:
+ if (ndx == 0 || ndx == 1)
+ p = "caddr_t";
+ break;
+ /* lseek */
+ case 478:
+ if (ndx == 0 || ndx == 1)
+ p = "off_t";
+ break;
+ /* truncate */
+ case 479:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* ftruncate */
+ case 480:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* thr_kill2 */
+ case 481:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* shm_open */
+ case 482:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* shm_unlink */
+ case 483:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cpuset */
+ case 484:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cpuset_setid */
+ case 485:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cpuset_getid */
+ case 486:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cpuset_getaffinity */
+ case 487:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cpuset_setaffinity */
+ case 488:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* faccessat */
+ case 489:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fchmodat */
+ case 490:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fchownat */
+ case 491:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fexecve */
+ case 492:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* fstatat */
+ case 493:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* futimesat */
+ case 494:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* linkat */
+ case 495:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mkdirat */
+ case 496:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mkfifoat */
+ case 497:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* mknodat */
+ case 498:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* openat */
+ case 499:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* readlinkat */
+ case 500:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* renameat */
+ case 501:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* symlinkat */
+ case 502:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* unlinkat */
+ case 503:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* posix_openpt */
+ case 504:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* gssd_syscall */
+ case 505:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* jail_get */
+ case 506:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* jail_set */
+ case 507:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* jail_remove */
+ case 508:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* closefrom */
+ case 509:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __semctl */
+ case 510:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* msgctl */
+ case 511:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* shmctl */
+ case 512:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* lpathconf */
+ case 513:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* __cap_rights_get */
+ case 515:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cap_enter */
+ case 516:
+ /* cap_getmode */
+ case 517:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* pdfork */
+ case 518:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* pdkill */
+ case 519:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* pdgetpid */
+ case 520:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* pselect */
+ case 522:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* getloginclass */
+ case 523:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* setloginclass */
+ case 524:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rctl_get_racct */
+ case 525:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rctl_get_rules */
+ case 526:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rctl_get_limits */
+ case 527:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rctl_add_rule */
+ case 528:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* rctl_remove_rule */
+ case 529:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* posix_fallocate */
+ case 530:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* posix_fadvise */
+ case 531:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* wait6 */
+ case 532:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cap_rights_limit */
+ case 533:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cap_ioctls_limit */
+ case 534:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cap_ioctls_get */
+ case 535:
+ if (ndx == 0 || ndx == 1)
+ p = "ssize_t";
+ break;
+ /* cap_fcntls_limit */
+ case 536:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* cap_fcntls_get */
+ case 537:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* bindat */
+ case 538:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* connectat */
+ case 539:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* chflagsat */
+ case 540:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* accept4 */
+ case 541:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* pipe2 */
+ case 542:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ /* aio_mlock */
+ case 543:
+ if (ndx == 0 || ndx == 1)
+ p = "int";
+ break;
+ default:
+ break;
+ };
+ if (p != NULL)
+ strlcpy(desc, p, descsz);
+}
diff --git a/sys/kern/sysv_ipc.c b/sys/kern/sysv_ipc.c
new file mode 100644
index 0000000..e402cb5
--- /dev/null
+++ b/sys/kern/sysv_ipc.c
@@ -0,0 +1,246 @@
+/* $NetBSD: sysv_ipc.c,v 1.7 1994/06/29 06:33:11 cgd Exp $ */
+/*-
+ * Copyright (c) 1994 Herb Peyerl <hpeyerl@novatel.ca>
+ * Copyright (c) 2006 nCircle Network Security, Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson for the TrustedBSD
+ * Project under contract to nCircle Network Security, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Herb Peyerl.
+ * 4. The name of Herb Peyerl may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_sysvipc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sem.h>
+#include <sys/shm.h>
+#include <sys/ipc.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/ucred.h>
+
+void (*shmfork_hook)(struct proc *, struct proc *) = NULL;
+void (*shmexit_hook)(struct vmspace *) = NULL;
+
+/* called from kern_fork.c */
+void
+shmfork(p1, p2)
+ struct proc *p1, *p2;
+{
+
+ if (shmfork_hook != NULL)
+ shmfork_hook(p1, p2);
+ return;
+}
+
+/* called from kern_exit.c */
+void
+shmexit(struct vmspace *vm)
+{
+
+ if (shmexit_hook != NULL)
+ shmexit_hook(vm);
+ return;
+}
+
+/*
+ * Check for IPC permission.
+ *
+ * Note: The MAC Framework does not require any modifications to the
+ * ipcperm() function, as access control checks are performed throughout the
+ * implementation of each primitive. Those entry point calls complement the
+ * ipcperm() discretionary checks. Unlike file system discretionary access
+ * control, the original creator of an object is given the same rights as the
+ * current owner.
+ */
+int
+ipcperm(struct thread *td, struct ipc_perm *perm, int acc_mode)
+{
+ struct ucred *cred = td->td_ucred;
+ int error, obj_mode, dac_granted, priv_granted;
+
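+ /*
+ * Select the permission bits in perm->mode that apply to the caller:
+ * the owner/creator bits are used as-is (and the owner is granted
+ * IPC_M outright), group bits are shifted left by 3 and "other" bits
+ * by 6 so that the IPC_R and IPC_W tests below examine the right set.
+ */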
+ dac_granted = 0;
+ if (cred->cr_uid == perm->cuid || cred->cr_uid == perm->uid) {
+ obj_mode = perm->mode;
+ dac_granted |= IPC_M;
+ } else if (groupmember(perm->gid, cred) ||
+ groupmember(perm->cgid, cred)) {
+ obj_mode = perm->mode;
+ obj_mode <<= 3;
+ } else {
+ obj_mode = perm->mode;
+ obj_mode <<= 6;
+ }
+
+ /*
+ * While the System V IPC permission model allows IPC_M to be
+ * granted, as part of the mode, our implementation requires
+ * privilege to administer the object if not the owner or creator.
+ */
+#if 0
+ if (obj_mode & IPC_M)
+ dac_granted |= IPC_M;
+#endif
+ if (obj_mode & IPC_R)
+ dac_granted |= IPC_R;
+ if (obj_mode & IPC_W)
+ dac_granted |= IPC_W;
+
+ /*
+ * Simple case: all required rights are granted by DAC.
+ */
+ if ((dac_granted & acc_mode) == acc_mode)
+ return (0);
+
+ /*
+ * Privilege is required to satisfy the request.
+ */
+ priv_granted = 0;
+ if ((acc_mode & IPC_M) && !(dac_granted & IPC_M)) {
+ error = priv_check(td, PRIV_IPC_ADMIN);
+ if (error == 0)
+ priv_granted |= IPC_M;
+ }
+
+ if ((acc_mode & IPC_R) && !(dac_granted & IPC_R)) {
+ error = priv_check(td, PRIV_IPC_READ);
+ if (error == 0)
+ priv_granted |= IPC_R;
+ }
+
+ if ((acc_mode & IPC_W) && !(dac_granted & IPC_W)) {
+ error = priv_check(td, PRIV_IPC_WRITE);
+ if (error == 0)
+ priv_granted |= IPC_W;
+ }
+
+ if (((dac_granted | priv_granted) & acc_mode) == acc_mode)
+ return (0);
+ else
+ return (EACCES);
+}
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+void
+ipcperm_old2new(struct ipc_perm_old *old, struct ipc_perm *new)
+{
+
+ new->cuid = old->cuid;
+ new->cgid = old->cgid;
+ new->uid = old->uid;
+ new->gid = old->gid;
+ new->mode = old->mode;
+ new->seq = old->seq;
+ new->key = old->key;
+}
+
+void
+ipcperm_new2old(struct ipc_perm *new, struct ipc_perm_old *old)
+{
+
+ /* XXX: How to handle ID's > USHORT_MAX? */
+ old->cuid = new->cuid;
+ old->cgid = new->cgid;
+ old->uid = new->uid;
+ old->gid = new->gid;
+ old->mode = new->mode;
+ old->seq = new->seq;
+ old->key = new->key;
+}
+#endif
+
+#ifdef COMPAT_FREEBSD32
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_ipc.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/freebsd32/freebsd32_util.h>
+
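+/*
+ * CP() (from the freebsd32 compat headers above) copies a single named
+ * field between the native and 32-bit ipc_perm layouts in the converters
+ * below.
+ */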
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+void
+freebsd32_ipcperm_old_in(struct ipc_perm32_old *ip32, struct ipc_perm *ip)
+{
+
+ CP(*ip32, *ip, cuid);
+ CP(*ip32, *ip, cgid);
+ CP(*ip32, *ip, uid);
+ CP(*ip32, *ip, gid);
+ CP(*ip32, *ip, mode);
+ CP(*ip32, *ip, seq);
+ CP(*ip32, *ip, key);
+}
+
+void
+freebsd32_ipcperm_old_out(struct ipc_perm *ip, struct ipc_perm32_old *ip32)
+{
+
+ CP(*ip, *ip32, cuid);
+ CP(*ip, *ip32, cgid);
+ CP(*ip, *ip32, uid);
+ CP(*ip, *ip32, gid);
+ CP(*ip, *ip32, mode);
+ CP(*ip, *ip32, seq);
+ CP(*ip, *ip32, key);
+}
+#endif
+
+void
+freebsd32_ipcperm_in(struct ipc_perm32 *ip32, struct ipc_perm *ip)
+{
+
+ CP(*ip32, *ip, cuid);
+ CP(*ip32, *ip, cgid);
+ CP(*ip32, *ip, uid);
+ CP(*ip32, *ip, gid);
+ CP(*ip32, *ip, mode);
+ CP(*ip32, *ip, seq);
+ CP(*ip32, *ip, key);
+}
+
+void
+freebsd32_ipcperm_out(struct ipc_perm *ip, struct ipc_perm32 *ip32)
+{
+
+ CP(*ip, *ip32, cuid);
+ CP(*ip, *ip32, cgid);
+ CP(*ip, *ip32, uid);
+ CP(*ip, *ip32, gid);
+ CP(*ip, *ip32, mode);
+ CP(*ip, *ip32, seq);
+ CP(*ip, *ip32, key);
+}
+#endif
diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c
new file mode 100644
index 0000000..d58cb7e
--- /dev/null
+++ b/sys/kern/sysv_msg.c
@@ -0,0 +1,1592 @@
+/*-
+ * Implementation of SVID messages
+ *
+ * Author: Daniel Boulet
+ *
+ * Copyright 1993 Daniel Boulet and RTMX Inc.
+ *
+ * This system call was implemented by Daniel Boulet under contract from RTMX.
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ */
+/*-
+ * Copyright (c) 2003-2005 McAfee, Inc.
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project in part by McAfee
+ * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
+ * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
+ * program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_sysvipc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/module.h>
+#include <sys/msg.h>
+#include <sys/racct.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/jail.h>
+
+#include <security/mac/mac_framework.h>
+
+FEATURE(sysv_msg, "System V message queues support");
+
+static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues");
+
+static int msginit(void);
+static int msgunload(void);
+static int sysvmsg_modload(struct module *, int, void *);
+
+#ifdef MSG_DEBUG
+#define DPRINTF(a) printf a
+#else
+#define DPRINTF(a) (void)0
+#endif
+
+static void msg_freehdr(struct msg *msghdr);
+
+#ifndef MSGSSZ
+#define MSGSSZ 8 /* Each segment must be 2^N long */
+#endif
+#ifndef MSGSEG
+#define MSGSEG 2048 /* must be less than 32767 */
+#endif
+#define MSGMAX (MSGSSZ*MSGSEG)
+#ifndef MSGMNB
+#define MSGMNB 2048 /* max # of bytes in a queue */
+#endif
+#ifndef MSGMNI
+#define MSGMNI 40
+#endif
+#ifndef MSGTQL
+#define MSGTQL 40
+#endif
+
+/*
+ * Based on the configuration parameters described in an SVR2 (yes, two)
+ * config(1m) man page.
+ *
+ * Each message is broken up and stored in segments that are msgssz bytes
+ * long. For efficiency reasons, this should be a power of two. Also,
+ * it doesn't make sense if it is less than 8 or greater than about 256.
+ * Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of
+ * two between 8 and 1024 inclusive (and panics if it isn't).
+ */
+struct msginfo msginfo = {
+ MSGMAX, /* max chars in a message */
+ MSGMNI, /* # of message queue identifiers */
+ MSGMNB, /* max chars in a queue */
+ MSGTQL, /* max messages in system */
+ MSGSSZ, /* size of a message segment */
+ /* (must be small power of 2 greater than 4) */
+ MSGSEG /* number of message segments */
+};
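+
+/*
+ * With the defaults above the segment pool is MSGMAX = MSGSSZ * MSGSEG =
+ * 8 * 2048 = 16384 bytes, while MSGMNB, MSGMNI and MSGTQL cap the bytes per
+ * queue, the number of queues and the number of queued messages; msginit()
+ * below lets the kern.ipc.* tunables override these at boot.
+ */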
+
+/*
+ * macros to convert between msqid_ds's and msqid's.
+ * (specific to this implementation)
+ */
+#define MSQID(ix,ds) ((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000))
+#define MSQID_IX(id) ((id) & 0xffff)
+#define MSQID_SEQ(id) (((id) >> 16) & 0xffff)
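+/*
+ * Example: index 3 in a queue whose msg_perm.seq is 2 encodes to the id
+ * 0x00020003, and MSQID_IX() and MSQID_SEQ() recover the two halves.
+ */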
+
+/*
+ * The rest of this file is specific to this particular implementation.
+ */
+
+struct msgmap {
+ short next; /* next segment in buffer */
+ /* -1 -> available */
+ /* 0..(MSGSEG-1) -> index of next segment */
+};
+
+#define MSG_LOCKED 01000 /* Is this msqid_ds locked? */
+
+static int nfree_msgmaps; /* # of free map entries */
+static short free_msgmaps; /* head of linked list of free map entries */
+static struct msg *free_msghdrs;/* list of free msg headers */
+static char *msgpool; /* MSGMAX byte long msg buffer pool */
+static struct msgmap *msgmaps; /* MSGSEG msgmap structures */
+static struct msg *msghdrs; /* MSGTQL msg headers */
+static struct msqid_kernel *msqids; /* MSGMNI msqid_kernel struct's */
+static struct mtx msq_mtx; /* global mutex for message queues. */
+
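+/*
+ * System call slots claimed only while the module is loaded: msginit()
+ * installs these tables with syscall_helper_register() and msgunload()
+ * removes them again.
+ */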
+static struct syscall_helper_data msg_syscalls[] = {
+ SYSCALL_INIT_HELPER(msgctl),
+ SYSCALL_INIT_HELPER(msgget),
+ SYSCALL_INIT_HELPER(msgsnd),
+ SYSCALL_INIT_HELPER(msgrcv),
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ SYSCALL_INIT_HELPER(msgsys),
+ SYSCALL_INIT_HELPER_COMPAT(freebsd7_msgctl),
+#endif
+ SYSCALL_INIT_LAST
+};
+
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_ipc.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/freebsd32/freebsd32_util.h>
+
+static struct syscall_helper_data msg32_syscalls[] = {
+ SYSCALL32_INIT_HELPER(freebsd32_msgctl),
+ SYSCALL32_INIT_HELPER(freebsd32_msgsnd),
+ SYSCALL32_INIT_HELPER(freebsd32_msgrcv),
+ SYSCALL32_INIT_HELPER_COMPAT(msgget),
+ SYSCALL32_INIT_HELPER(freebsd32_msgsys),
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ SYSCALL32_INIT_HELPER(freebsd7_freebsd32_msgctl),
+#endif
+ SYSCALL_INIT_LAST
+};
+#endif
+
+static int
+msginit()
+{
+ int i, error;
+
+ TUNABLE_INT_FETCH("kern.ipc.msgseg", &msginfo.msgseg);
+ TUNABLE_INT_FETCH("kern.ipc.msgssz", &msginfo.msgssz);
+ msginfo.msgmax = msginfo.msgseg * msginfo.msgssz;
+ TUNABLE_INT_FETCH("kern.ipc.msgmni", &msginfo.msgmni);
+ TUNABLE_INT_FETCH("kern.ipc.msgmnb", &msginfo.msgmnb);
+ TUNABLE_INT_FETCH("kern.ipc.msgtql", &msginfo.msgtql);
+
+ msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK);
+ msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK);
+ msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK);
+ msqids = malloc(sizeof(struct msqid_kernel) * msginfo.msgmni, M_MSG,
+ M_WAITOK);
+
+ /*
+ * msginfo.msgssz should be a power of two for efficiency reasons.
+ * It is also pretty silly if msginfo.msgssz is less than 8
+ * or greater than about 256 so ...
+ */
+
+ i = 8;
+ while (i < 1024 && i != msginfo.msgssz)
+ i <<= 1;
+ if (i != msginfo.msgssz) {
+ DPRINTF(("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz,
+ msginfo.msgssz));
+ panic("msginfo.msgssz not a small power of 2");
+ }
+
+ if (msginfo.msgseg > 32767) {
+ DPRINTF(("msginfo.msgseg=%d\n", msginfo.msgseg));
+ panic("msginfo.msgseg > 32767");
+ }
+
+ for (i = 0; i < msginfo.msgseg; i++) {
+ if (i > 0)
+ msgmaps[i-1].next = i;
+ msgmaps[i].next = -1; /* implies entry is available */
+ }
+ free_msgmaps = 0;
+ nfree_msgmaps = msginfo.msgseg;
+
+ for (i = 0; i < msginfo.msgtql; i++) {
+ msghdrs[i].msg_type = 0;
+ if (i > 0)
+ msghdrs[i-1].msg_next = &msghdrs[i];
+ msghdrs[i].msg_next = NULL;
+#ifdef MAC
+ mac_sysvmsg_init(&msghdrs[i]);
+#endif
+ }
+ free_msghdrs = &msghdrs[0];
+
+ for (i = 0; i < msginfo.msgmni; i++) {
+ msqids[i].u.msg_qbytes = 0; /* implies entry is available */
+ msqids[i].u.msg_perm.seq = 0; /* reset to a known value */
+ msqids[i].u.msg_perm.mode = 0;
+#ifdef MAC
+ mac_sysvmsq_init(&msqids[i]);
+#endif
+ }
+ mtx_init(&msq_mtx, "msq", NULL, MTX_DEF);
+
+ error = syscall_helper_register(msg_syscalls);
+ if (error != 0)
+ return (error);
+#ifdef COMPAT_FREEBSD32
+ error = syscall32_helper_register(msg32_syscalls);
+ if (error != 0)
+ return (error);
+#endif
+ return (0);
+}
+
+static int
+msgunload()
+{
+ struct msqid_kernel *msqkptr;
+ int msqid;
+#ifdef MAC
+ int i;
+#endif
+
+ syscall_helper_unregister(msg_syscalls);
+#ifdef COMPAT_FREEBSD32
+ syscall32_helper_unregister(msg32_syscalls);
+#endif
+
+ for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
+ /*
+ * Look for an unallocated and unlocked msqid_ds.
+ * msqid_ds's can be locked by msgsnd or msgrcv while
+ * they are copying the message in/out. We can't
+ * re-use the entry until they release it.
+ */
+ msqkptr = &msqids[msqid];
+ if (msqkptr->u.msg_qbytes != 0 ||
+ (msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0)
+ break;
+ }
+ if (msqid != msginfo.msgmni)
+ return (EBUSY);
+
+#ifdef MAC
+ for (i = 0; i < msginfo.msgtql; i++)
+ mac_sysvmsg_destroy(&msghdrs[i]);
+ for (msqid = 0; msqid < msginfo.msgmni; msqid++)
+ mac_sysvmsq_destroy(&msqids[msqid]);
+#endif
+ free(msgpool, M_MSG);
+ free(msgmaps, M_MSG);
+ free(msghdrs, M_MSG);
+ free(msqids, M_MSG);
+ mtx_destroy(&msq_mtx);
+ return (0);
+}
+
+static int
+sysvmsg_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ error = msginit();
+ if (error != 0)
+ msgunload();
+ break;
+ case MOD_UNLOAD:
+ error = msgunload();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t sysvmsg_mod = {
+ "sysvmsg",
+ &sysvmsg_modload,
+ NULL
+};
+
+DECLARE_MODULE(sysvmsg, sysvmsg_mod, SI_SUB_SYSV_MSG, SI_ORDER_FIRST);
+MODULE_VERSION(sysvmsg, 1);
+
+static void
+msg_freehdr(msghdr)
+ struct msg *msghdr;
+{
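+ /*
+ * Return each of the message's segments to the free-map list, then put
+ * the header itself back on the free header list.
+ */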
+ while (msghdr->msg_ts > 0) {
+ short next;
+ if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg)
+ panic("msghdr->msg_spot out of range");
+ next = msgmaps[msghdr->msg_spot].next;
+ msgmaps[msghdr->msg_spot].next = free_msgmaps;
+ free_msgmaps = msghdr->msg_spot;
+ nfree_msgmaps++;
+ msghdr->msg_spot = next;
+ if (msghdr->msg_ts >= msginfo.msgssz)
+ msghdr->msg_ts -= msginfo.msgssz;
+ else
+ msghdr->msg_ts = 0;
+ }
+ if (msghdr->msg_spot != -1)
+ panic("msghdr->msg_spot != -1");
+ msghdr->msg_next = free_msghdrs;
+ free_msghdrs = msghdr;
+#ifdef MAC
+ mac_sysvmsg_cleanup(msghdr);
+#endif
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgctl_args {
+ int msqid;
+ int cmd;
+ struct msqid_ds *buf;
+};
+#endif
+int
+sys_msgctl(td, uap)
+ struct thread *td;
+ register struct msgctl_args *uap;
+{
+ int msqid = uap->msqid;
+ int cmd = uap->cmd;
+ struct msqid_ds msqbuf;
+ int error;
+
+ DPRINTF(("call to msgctl(%d, %d, %p)\n", msqid, cmd, uap->buf));
+ if (cmd == IPC_SET &&
+ (error = copyin(uap->buf, &msqbuf, sizeof(msqbuf))) != 0)
+ return (error);
+ error = kern_msgctl(td, msqid, cmd, &msqbuf);
+ if (cmd == IPC_STAT && error == 0)
+ error = copyout(&msqbuf, uap->buf, sizeof(struct msqid_ds));
+ return (error);
+}
+
+int
+kern_msgctl(td, msqid, cmd, msqbuf)
+ struct thread *td;
+ int msqid;
+ int cmd;
+ struct msqid_ds *msqbuf;
+{
+ int rval, error, msqix;
+ register struct msqid_kernel *msqkptr;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+
+ msqix = IPCID_TO_IX(msqid);
+
+ if (msqix < 0 || msqix >= msginfo.msgmni) {
+ DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
+ msginfo.msgmni));
+ return (EINVAL);
+ }
+
+ msqkptr = &msqids[msqix];
+
+ mtx_lock(&msq_mtx);
+ if (msqkptr->u.msg_qbytes == 0) {
+ DPRINTF(("no such msqid\n"));
+ error = EINVAL;
+ goto done2;
+ }
+ if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
+ DPRINTF(("wrong sequence number\n"));
+ error = EINVAL;
+ goto done2;
+ }
+#ifdef MAC
+ error = mac_sysvmsq_check_msqctl(td->td_ucred, msqkptr, cmd);
+ if (error != 0)
+ goto done2;
+#endif
+
+ error = 0;
+ rval = 0;
+
+ switch (cmd) {
+
+ case IPC_RMID:
+ {
+ struct msg *msghdr;
+ if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M)))
+ goto done2;
+
+#ifdef MAC
+ /*
+ * Check that the thread has MAC access permissions to
+ * individual msghdrs. Note: We need to do this in a
+ * separate loop because the actual loop alters the
+ * msq/msghdr info as it progresses, and there is no going
+ * back if, halfway through, we discover that the
+ * thread cannot free a certain msghdr. The msq will get
+ * into an inconsistent state.
+ */
+ for (msghdr = msqkptr->u.msg_first; msghdr != NULL;
+ msghdr = msghdr->msg_next) {
+ error = mac_sysvmsq_check_msgrmid(td->td_ucred, msghdr);
+ if (error != 0)
+ goto done2;
+ }
+#endif
+
+ racct_sub_cred(msqkptr->cred, RACCT_NMSGQ, 1);
+ racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, msqkptr->u.msg_qnum);
+ racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msqkptr->u.msg_cbytes);
+ crfree(msqkptr->cred);
+ msqkptr->cred = NULL;
+
+ /* Free the message headers */
+ msghdr = msqkptr->u.msg_first;
+ while (msghdr != NULL) {
+ struct msg *msghdr_tmp;
+
+ /* Free the segments of each message */
+ msqkptr->u.msg_cbytes -= msghdr->msg_ts;
+ msqkptr->u.msg_qnum--;
+ msghdr_tmp = msghdr;
+ msghdr = msghdr->msg_next;
+ msg_freehdr(msghdr_tmp);
+ }
+
+ if (msqkptr->u.msg_cbytes != 0)
+ panic("msg_cbytes is screwed up");
+ if (msqkptr->u.msg_qnum != 0)
+ panic("msg_qnum is screwed up");
+
+ msqkptr->u.msg_qbytes = 0; /* Mark it as free */
+
+#ifdef MAC
+ mac_sysvmsq_cleanup(msqkptr);
+#endif
+
+ wakeup(msqkptr);
+ }
+
+ break;
+
+ case IPC_SET:
+ if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M)))
+ goto done2;
+ if (msqbuf->msg_qbytes > msqkptr->u.msg_qbytes) {
+ error = priv_check(td, PRIV_IPC_MSGSIZE);
+ if (error)
+ goto done2;
+ }
+ if (msqbuf->msg_qbytes > msginfo.msgmnb) {
+ DPRINTF(("can't increase msg_qbytes beyond %d "
+ "(truncating)\n", msginfo.msgmnb));
+ msqbuf->msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */
+ }
+ if (msqbuf->msg_qbytes == 0) {
+ DPRINTF(("can't reduce msg_qbytes to 0\n"));
+ error = EINVAL; /* non-standard errno! */
+ goto done2;
+ }
+ msqkptr->u.msg_perm.uid = msqbuf->msg_perm.uid; /* change the owner */
+ msqkptr->u.msg_perm.gid = msqbuf->msg_perm.gid; /* change the owner */
+ msqkptr->u.msg_perm.mode = (msqkptr->u.msg_perm.mode & ~0777) |
+ (msqbuf->msg_perm.mode & 0777);
+ msqkptr->u.msg_qbytes = msqbuf->msg_qbytes;
+ msqkptr->u.msg_ctime = time_second;
+ break;
+
+ case IPC_STAT:
+ if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) {
+ DPRINTF(("requester doesn't have read access\n"));
+ goto done2;
+ }
+ *msqbuf = msqkptr->u;
+ break;
+
+ default:
+ DPRINTF(("invalid command %d\n", cmd));
+ error = EINVAL;
+ goto done2;
+ }
+
+ if (error == 0)
+ td->td_retval[0] = rval;
+done2:
+ mtx_unlock(&msq_mtx);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgget_args {
+ key_t key;
+ int msgflg;
+};
+#endif
+
+int
+sys_msgget(td, uap)
+ struct thread *td;
+ register struct msgget_args *uap;
+{
+ int msqid, error = 0;
+ int key = uap->key;
+ int msgflg = uap->msgflg;
+ struct ucred *cred = td->td_ucred;
+ register struct msqid_kernel *msqkptr = NULL;
+
+ DPRINTF(("msgget(0x%x, 0%o)\n", key, msgflg));
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+
+ mtx_lock(&msq_mtx);
+ if (key != IPC_PRIVATE) {
+ for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
+ msqkptr = &msqids[msqid];
+ if (msqkptr->u.msg_qbytes != 0 &&
+ msqkptr->u.msg_perm.key == key)
+ break;
+ }
+ if (msqid < msginfo.msgmni) {
+ DPRINTF(("found public key\n"));
+ if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) {
+ DPRINTF(("not exclusive\n"));
+ error = EEXIST;
+ goto done2;
+ }
+ if ((error = ipcperm(td, &msqkptr->u.msg_perm,
+ msgflg & 0700))) {
+ DPRINTF(("requester doesn't have 0%o access\n",
+ msgflg & 0700));
+ goto done2;
+ }
+#ifdef MAC
+ error = mac_sysvmsq_check_msqget(cred, msqkptr);
+ if (error != 0)
+ goto done2;
+#endif
+ goto found;
+ }
+ }
+
+ DPRINTF(("need to allocate the msqid_ds\n"));
+ if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) {
+ for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
+ /*
+ * Look for an unallocated and unlocked msqid_ds.
+ * msqid_ds's can be locked by msgsnd or msgrcv while
+ * they are copying the message in/out. We can't
+ * re-use the entry until they release it.
+ */
+ msqkptr = &msqids[msqid];
+ if (msqkptr->u.msg_qbytes == 0 &&
+ (msqkptr->u.msg_perm.mode & MSG_LOCKED) == 0)
+ break;
+ }
+ if (msqid == msginfo.msgmni) {
+ DPRINTF(("no more msqid_ds's available\n"));
+ error = ENOSPC;
+ goto done2;
+ }
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ error = racct_add(td->td_proc, RACCT_NMSGQ, 1);
+ PROC_UNLOCK(td->td_proc);
+ if (error != 0) {
+ error = ENOSPC;
+ goto done2;
+ }
+#endif
+ DPRINTF(("msqid %d is available\n", msqid));
+ msqkptr->u.msg_perm.key = key;
+ msqkptr->u.msg_perm.cuid = cred->cr_uid;
+ msqkptr->u.msg_perm.uid = cred->cr_uid;
+ msqkptr->u.msg_perm.cgid = cred->cr_gid;
+ msqkptr->u.msg_perm.gid = cred->cr_gid;
+ msqkptr->u.msg_perm.mode = (msgflg & 0777);
+ msqkptr->cred = crhold(cred);
+ /* Make sure that the returned msqid is unique */
+ msqkptr->u.msg_perm.seq = (msqkptr->u.msg_perm.seq + 1) & 0x7fff;
+ msqkptr->u.msg_first = NULL;
+ msqkptr->u.msg_last = NULL;
+ msqkptr->u.msg_cbytes = 0;
+ msqkptr->u.msg_qnum = 0;
+ msqkptr->u.msg_qbytes = msginfo.msgmnb;
+ msqkptr->u.msg_lspid = 0;
+ msqkptr->u.msg_lrpid = 0;
+ msqkptr->u.msg_stime = 0;
+ msqkptr->u.msg_rtime = 0;
+ msqkptr->u.msg_ctime = time_second;
+#ifdef MAC
+ mac_sysvmsq_create(cred, msqkptr);
+#endif
+ } else {
+ DPRINTF(("didn't find it and wasn't asked to create it\n"));
+ error = ENOENT;
+ goto done2;
+ }
+
+found:
+ /* Construct the unique msqid */
+ td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm);
+done2:
+ mtx_unlock(&msq_mtx);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgsnd_args {
+ int msqid;
+ const void *msgp;
+ size_t msgsz;
+ int msgflg;
+};
+#endif
+int
+kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype)
+ struct thread *td;
+ int msqid;
+ const void *msgp; /* XXX msgp is actually mtext. */
+ size_t msgsz;
+ int msgflg;
+ long mtype;
+{
+ int msqix, segs_needed, error = 0;
+ register struct msqid_kernel *msqkptr;
+ register struct msg *msghdr;
+ short next;
+#ifdef RACCT
+ size_t saved_msgsz;
+#endif
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+
+ mtx_lock(&msq_mtx);
+ msqix = IPCID_TO_IX(msqid);
+
+ if (msqix < 0 || msqix >= msginfo.msgmni) {
+ DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
+ msginfo.msgmni));
+ error = EINVAL;
+ goto done2;
+ }
+
+ msqkptr = &msqids[msqix];
+ if (msqkptr->u.msg_qbytes == 0) {
+ DPRINTF(("no such message queue id\n"));
+ error = EINVAL;
+ goto done2;
+ }
+ if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
+ DPRINTF(("wrong sequence number\n"));
+ error = EINVAL;
+ goto done2;
+ }
+
+ if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_W))) {
+ DPRINTF(("requester doesn't have write access\n"));
+ goto done2;
+ }
+
+#ifdef MAC
+ error = mac_sysvmsq_check_msqsnd(td->td_ucred, msqkptr);
+ if (error != 0)
+ goto done2;
+#endif
+
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) {
+ PROC_UNLOCK(td->td_proc);
+ error = EAGAIN;
+ goto done2;
+ }
+ saved_msgsz = msgsz;
+ if (racct_add(td->td_proc, RACCT_MSGQSIZE, msgsz)) {
+ racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1);
+ PROC_UNLOCK(td->td_proc);
+ error = EAGAIN;
+ goto done2;
+ }
+ PROC_UNLOCK(td->td_proc);
+#endif
+
+ segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz;
+ DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz,
+ msginfo.msgssz, segs_needed));
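+ /*
+ * segs_needed rounds the size up to whole segments: e.g. a 100 byte
+ * message with the default msgssz of 8 occupies 13 segments.
+ */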
+ for (;;) {
+ int need_more_resources = 0;
+
+ /*
+ * check msgsz
+ * (inside this loop in case msg_qbytes changes while we sleep)
+ */
+
+ if (msgsz > msqkptr->u.msg_qbytes) {
+ DPRINTF(("msgsz > msqkptr->u.msg_qbytes\n"));
+ error = EINVAL;
+ goto done3;
+ }
+
+ if (msqkptr->u.msg_perm.mode & MSG_LOCKED) {
+ DPRINTF(("msqid is locked\n"));
+ need_more_resources = 1;
+ }
+ if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) {
+ DPRINTF(("msgsz + msg_cbytes > msg_qbytes\n"));
+ need_more_resources = 1;
+ }
+ if (segs_needed > nfree_msgmaps) {
+ DPRINTF(("segs_needed > nfree_msgmaps\n"));
+ need_more_resources = 1;
+ }
+ if (free_msghdrs == NULL) {
+ DPRINTF(("no more msghdrs\n"));
+ need_more_resources = 1;
+ }
+
+ if (need_more_resources) {
+ int we_own_it;
+
+ if ((msgflg & IPC_NOWAIT) != 0) {
+ DPRINTF(("need more resources but caller "
+ "doesn't want to wait\n"));
+ error = EAGAIN;
+ goto done3;
+ }
+
+ if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) {
+ DPRINTF(("we don't own the msqid_ds\n"));
+ we_own_it = 0;
+ } else {
+ /*
+ * Force later arrivals to wait for our request.
+ */
+ DPRINTF(("we own the msqid_ds\n"));
+ msqkptr->u.msg_perm.mode |= MSG_LOCKED;
+ we_own_it = 1;
+ }
+ DPRINTF(("msgsnd: goodnight\n"));
+ error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
+ "msgsnd", hz);
+ DPRINTF(("msgsnd: good morning, error=%d\n", error));
+ if (we_own_it)
+ msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
+ if (error == EWOULDBLOCK) {
+ DPRINTF(("msgsnd: timed out\n"));
+ continue;
+ }
+ if (error != 0) {
+ DPRINTF(("msgsnd: interrupted system call\n"));
+ error = EINTR;
+ goto done3;
+ }
+
+ /*
+			 * Make sure that the message queue still exists
+ */
+
+ if (msqkptr->u.msg_qbytes == 0) {
+ DPRINTF(("msqid deleted\n"));
+ error = EIDRM;
+ goto done3;
+ }
+
+ } else {
+ DPRINTF(("got all the resources that we need\n"));
+ break;
+ }
+ }
+
+ /*
+ * We have the resources that we need.
+ * Make sure!
+ */
+
+ if (msqkptr->u.msg_perm.mode & MSG_LOCKED)
+ panic("msg_perm.mode & MSG_LOCKED");
+ if (segs_needed > nfree_msgmaps)
+ panic("segs_needed > nfree_msgmaps");
+ if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes)
+ panic("msgsz + msg_cbytes > msg_qbytes");
+ if (free_msghdrs == NULL)
+ panic("no more msghdrs");
+
+ /*
+ * Re-lock the msqid_ds in case we page-fault when copying in the
+ * message
+ */
+
+ if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0)
+ panic("msqid_ds is already locked");
+ msqkptr->u.msg_perm.mode |= MSG_LOCKED;
+
+ /*
+ * Allocate a message header
+ */
+
+ msghdr = free_msghdrs;
+ free_msghdrs = msghdr->msg_next;
+ msghdr->msg_spot = -1;
+ msghdr->msg_ts = msgsz;
+ msghdr->msg_type = mtype;
+#ifdef MAC
+ /*
+ * XXXMAC: Should the mac_sysvmsq_check_msgmsq check follow here
+ * immediately? Or, should it be checked just before the msg is
+ * enqueued in the msgq (as it is done now)?
+ */
+ mac_sysvmsg_create(td->td_ucred, msqkptr, msghdr);
+#endif
+
+ /*
+ * Allocate space for the message
+ */
+
+ while (segs_needed > 0) {
+ if (nfree_msgmaps <= 0)
+ panic("not enough msgmaps");
+ if (free_msgmaps == -1)
+ panic("nil free_msgmaps");
+ next = free_msgmaps;
+ if (next <= -1)
+ panic("next too low #1");
+ if (next >= msginfo.msgseg)
+ panic("next out of range #1");
+ DPRINTF(("allocating segment %d to message\n", next));
+ free_msgmaps = msgmaps[next].next;
+ nfree_msgmaps--;
+ msgmaps[next].next = msghdr->msg_spot;
+ msghdr->msg_spot = next;
+ segs_needed--;
+ }
+
+ /*
+ * Validate the message type
+ */
+
+ if (msghdr->msg_type < 1) {
+ msg_freehdr(msghdr);
+ msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
+ wakeup(msqkptr);
+ DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type));
+ error = EINVAL;
+ goto done3;
+ }
+
+ /*
+ * Copy in the message body
+ */
+
+ next = msghdr->msg_spot;
+ while (msgsz > 0) {
+ size_t tlen;
+ if (msgsz > msginfo.msgssz)
+ tlen = msginfo.msgssz;
+ else
+ tlen = msgsz;
+ if (next <= -1)
+ panic("next too low #2");
+ if (next >= msginfo.msgseg)
+ panic("next out of range #2");
+ mtx_unlock(&msq_mtx);
+ if ((error = copyin(msgp, &msgpool[next * msginfo.msgssz],
+ tlen)) != 0) {
+ mtx_lock(&msq_mtx);
+ DPRINTF(("error %d copying in message segment\n",
+ error));
+ msg_freehdr(msghdr);
+ msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
+ wakeup(msqkptr);
+ goto done3;
+ }
+ mtx_lock(&msq_mtx);
+ msgsz -= tlen;
+ msgp = (const char *)msgp + tlen;
+ next = msgmaps[next].next;
+ }
+ if (next != -1)
+ panic("didn't use all the msg segments");
+
+ /*
+ * We've got the message. Unlock the msqid_ds.
+ */
+
+ msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
+
+ /*
+ * Make sure that the msqid_ds is still allocated.
+ */
+
+ if (msqkptr->u.msg_qbytes == 0) {
+ msg_freehdr(msghdr);
+ wakeup(msqkptr);
+ error = EIDRM;
+ goto done3;
+ }
+
+#ifdef MAC
+ /*
+ * Note: Since the task/thread allocates the msghdr and usually
+ * primes it with its own MAC label, for a majority of policies, it
+ * won't be necessary to check whether the msghdr has access
+ * permissions to the msgq. The mac_sysvmsq_check_msqsnd check would
+ * suffice in that case. However, this hook may be required where
+ * individual policies derive a non-identical label for the msghdr
+ * from the current thread label and may want to check the msghdr
+ * enqueue permissions, along with read/write permissions to the
+ * msgq.
+ */
+ error = mac_sysvmsq_check_msgmsq(td->td_ucred, msghdr, msqkptr);
+ if (error != 0) {
+ msg_freehdr(msghdr);
+ wakeup(msqkptr);
+ goto done3;
+ }
+#endif
+
+ /*
+ * Put the message into the queue
+ */
+ if (msqkptr->u.msg_first == NULL) {
+ msqkptr->u.msg_first = msghdr;
+ msqkptr->u.msg_last = msghdr;
+ } else {
+ msqkptr->u.msg_last->msg_next = msghdr;
+ msqkptr->u.msg_last = msghdr;
+ }
+ msqkptr->u.msg_last->msg_next = NULL;
+
+ msqkptr->u.msg_cbytes += msghdr->msg_ts;
+ msqkptr->u.msg_qnum++;
+ msqkptr->u.msg_lspid = td->td_proc->p_pid;
+ msqkptr->u.msg_stime = time_second;
+
+ wakeup(msqkptr);
+ td->td_retval[0] = 0;
+done3:
+#ifdef RACCT
+ if (error != 0) {
+ PROC_LOCK(td->td_proc);
+ racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1);
+ racct_sub(td->td_proc, RACCT_MSGQSIZE, saved_msgsz);
+ PROC_UNLOCK(td->td_proc);
+ }
+#endif
+done2:
+ mtx_unlock(&msq_mtx);
+ return (error);
+}
+
+int
+sys_msgsnd(struct thread *td, struct msgsnd_args *uap)
+{
+ int error;
+ long mtype;
+
+ DPRINTF(("call to msgsnd(%d, %p, %zu, %d)\n", uap->msqid, uap->msgp,
+ uap->msgsz, uap->msgflg));
+
+ if ((error = copyin(uap->msgp, &mtype, sizeof(mtype))) != 0) {
+ DPRINTF(("error %d copying the message type\n", error));
+ return (error);
+ }
+ return (kern_msgsnd(td, uap->msqid,
+ (const char *)uap->msgp + sizeof(mtype),
+ uap->msgsz, uap->msgflg, mtype));
+}
+
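+/*
+ * Usage sketch from userland (illustrative only; assumes a queue id from
+ * msgget() plus <sys/msg.h> and <err.h>).  The caller's buffer starts with
+ * the long mtype followed by the message text, which is why sys_msgsnd()
+ * peels off mtype before handing the text to kern_msgsnd():
+ *
+ *	struct mymsg {
+ *		long mtype;
+ *		char mtext[64];
+ *	} m = { 1, "ping" };
+ *
+ *	if (msgsnd(id, &m, sizeof(m.mtext), IPC_NOWAIT) == -1)
+ *		warn("msgsnd");
+ */
+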
+#ifndef _SYS_SYSPROTO_H_
+struct msgrcv_args {
+ int msqid;
+ void *msgp;
+ size_t msgsz;
+ long msgtyp;
+ int msgflg;
+};
+#endif
+int
+kern_msgrcv(struct thread *td, int msqid, void *msgp, size_t msgsz,
+    long msgtyp, int msgflg, long *mtype)
+{
+	/* XXX msgp is actually mtext. */
+	size_t len;
+	struct msqid_kernel *msqkptr;
+	struct msg *msghdr;
+	int msqix, error = 0;
+	short next;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+
+ msqix = IPCID_TO_IX(msqid);
+
+ if (msqix < 0 || msqix >= msginfo.msgmni) {
+ DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
+ msginfo.msgmni));
+ return (EINVAL);
+ }
+
+ msqkptr = &msqids[msqix];
+ mtx_lock(&msq_mtx);
+ if (msqkptr->u.msg_qbytes == 0) {
+ DPRINTF(("no such message queue id\n"));
+ error = EINVAL;
+ goto done2;
+ }
+ if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
+ DPRINTF(("wrong sequence number\n"));
+ error = EINVAL;
+ goto done2;
+ }
+
+ if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) {
+ DPRINTF(("requester doesn't have read access\n"));
+ goto done2;
+ }
+
+#ifdef MAC
+ error = mac_sysvmsq_check_msqrcv(td->td_ucred, msqkptr);
+ if (error != 0)
+ goto done2;
+#endif
+
+ msghdr = NULL;
+ while (msghdr == NULL) {
+ if (msgtyp == 0) {
+ msghdr = msqkptr->u.msg_first;
+ if (msghdr != NULL) {
+ if (msgsz < msghdr->msg_ts &&
+ (msgflg & MSG_NOERROR) == 0) {
+ DPRINTF(("first message on the queue "
+ "is too big (want %zu, got %d)\n",
+ msgsz, msghdr->msg_ts));
+ error = E2BIG;
+ goto done2;
+ }
+#ifdef MAC
+ error = mac_sysvmsq_check_msgrcv(td->td_ucred,
+ msghdr);
+ if (error != 0)
+ goto done2;
+#endif
+ if (msqkptr->u.msg_first == msqkptr->u.msg_last) {
+ msqkptr->u.msg_first = NULL;
+ msqkptr->u.msg_last = NULL;
+ } else {
+ msqkptr->u.msg_first = msghdr->msg_next;
+ if (msqkptr->u.msg_first == NULL)
+ panic("msg_first/last screwed up #1");
+ }
+ }
+ } else {
+ struct msg *previous;
+ struct msg **prev;
+
+ previous = NULL;
+ prev = &(msqkptr->u.msg_first);
+ while ((msghdr = *prev) != NULL) {
+ /*
+ * Is this message's type an exact match or is
+ * this message's type less than or equal to
+ * the absolute value of a negative msgtyp?
+ * Note that the second half of this test can
+ * NEVER be true if msgtyp is positive since
+ * msg_type is always positive!
+ */
+
+ if (msgtyp == msghdr->msg_type ||
+ msghdr->msg_type <= -msgtyp) {
+ DPRINTF(("found message type %ld, "
+ "requested %ld\n",
+ msghdr->msg_type, msgtyp));
+ if (msgsz < msghdr->msg_ts &&
+ (msgflg & MSG_NOERROR) == 0) {
+ DPRINTF(("requested message "
+ "on the queue is too big "
+ "(want %zu, got %hu)\n",
+ msgsz, msghdr->msg_ts));
+ error = E2BIG;
+ goto done2;
+ }
+#ifdef MAC
+ error = mac_sysvmsq_check_msgrcv(
+ td->td_ucred, msghdr);
+ if (error != 0)
+ goto done2;
+#endif
+ *prev = msghdr->msg_next;
+ if (msghdr == msqkptr->u.msg_last) {
+ if (previous == NULL) {
+ if (prev !=
+ &msqkptr->u.msg_first)
+ panic("msg_first/last screwed up #2");
+ msqkptr->u.msg_first =
+ NULL;
+ msqkptr->u.msg_last =
+ NULL;
+ } else {
+ if (prev ==
+ &msqkptr->u.msg_first)
+ panic("msg_first/last screwed up #3");
+ msqkptr->u.msg_last =
+ previous;
+ }
+ }
+ break;
+ }
+ previous = msghdr;
+ prev = &(msghdr->msg_next);
+ }
+ }
+
+ /*
+ * We've either extracted the msghdr for the appropriate
+ * message or there isn't one.
+ * If there is one then bail out of this loop.
+ */
+
+ if (msghdr != NULL)
+ break;
+
+ /*
+ * Hmph! No message found. Does the user want to wait?
+ */
+
+ if ((msgflg & IPC_NOWAIT) != 0) {
+ DPRINTF(("no appropriate message found (msgtyp=%ld)\n",
+ msgtyp));
+ /* The SVID says to return ENOMSG. */
+ error = ENOMSG;
+ goto done2;
+ }
+
+ /*
+ * Wait for something to happen
+ */
+
+ DPRINTF(("msgrcv: goodnight\n"));
+ error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
+ "msgrcv", 0);
+ DPRINTF(("msgrcv: good morning (error=%d)\n", error));
+
+ if (error != 0) {
+ DPRINTF(("msgrcv: interrupted system call\n"));
+ error = EINTR;
+ goto done2;
+ }
+
+ /*
+		 * Make sure that the message queue still exists
+ */
+
+ if (msqkptr->u.msg_qbytes == 0 ||
+ msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
+ DPRINTF(("msqid deleted\n"));
+ error = EIDRM;
+ goto done2;
+ }
+ }
+
+ /*
+ * Return the message to the user.
+ *
+ * First, do the bookkeeping (before we risk being interrupted).
+ */
+
+ msqkptr->u.msg_cbytes -= msghdr->msg_ts;
+ msqkptr->u.msg_qnum--;
+ msqkptr->u.msg_lrpid = td->td_proc->p_pid;
+ msqkptr->u.msg_rtime = time_second;
+
+ racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, 1);
+ racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msghdr->msg_ts);
+
+ /*
+ * Make msgsz the actual amount that we'll be returning.
+ * Note that this effectively truncates the message if it is too long
+ * (since msgsz is never increased).
+ */
+
+ DPRINTF(("found a message, msgsz=%zu, msg_ts=%hu\n", msgsz,
+ msghdr->msg_ts));
+ if (msgsz > msghdr->msg_ts)
+ msgsz = msghdr->msg_ts;
+ *mtype = msghdr->msg_type;
+
+ /*
+ * Return the segments to the user
+ */
+
+ next = msghdr->msg_spot;
+ for (len = 0; len < msgsz; len += msginfo.msgssz) {
+ size_t tlen;
+
+ if (msgsz - len > msginfo.msgssz)
+ tlen = msginfo.msgssz;
+ else
+ tlen = msgsz - len;
+ if (next <= -1)
+ panic("next too low #3");
+ if (next >= msginfo.msgseg)
+ panic("next out of range #3");
+ mtx_unlock(&msq_mtx);
+ error = copyout(&msgpool[next * msginfo.msgssz], msgp, tlen);
+ mtx_lock(&msq_mtx);
+ if (error != 0) {
+ DPRINTF(("error (%d) copying out message segment\n",
+ error));
+ msg_freehdr(msghdr);
+ wakeup(msqkptr);
+ goto done2;
+ }
+ msgp = (char *)msgp + tlen;
+ next = msgmaps[next].next;
+ }
+
+ /*
+ * Done, return the actual number of bytes copied out.
+ */
+
+ msg_freehdr(msghdr);
+ wakeup(msqkptr);
+ td->td_retval[0] = msgsz;
+done2:
+ mtx_unlock(&msq_mtx);
+ return (error);
+}
+
+int
+sys_msgrcv(struct thread *td, struct msgrcv_args *uap)
+{
+ int error;
+ long mtype;
+
+ DPRINTF(("call to msgrcv(%d, %p, %zu, %ld, %d)\n", uap->msqid,
+ uap->msgp, uap->msgsz, uap->msgtyp, uap->msgflg));
+
+ if ((error = kern_msgrcv(td, uap->msqid,
+ (char *)uap->msgp + sizeof(mtype), uap->msgsz,
+ uap->msgtyp, uap->msgflg, &mtype)) != 0)
+ return (error);
+ if ((error = copyout(&mtype, uap->msgp, sizeof(mtype))) != 0)
+ DPRINTF(("error %d copying the message type\n", error));
+ return (error);
+}
+
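+/*
+ * Usage sketch from userland (illustrative only; same mymsg layout and
+ * queue id as in the msgsnd() sketch above).  sys_msgrcv() copies the
+ * message type back to the front of the caller's buffer:
+ *
+ *	struct mymsg m;
+ *	ssize_t n = msgrcv(id, &m, sizeof(m.mtext), 0, 0);
+ *
+ *	if (n == -1)
+ *		warn("msgrcv");
+ */
+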
+static int
+sysctl_msqids(SYSCTL_HANDLER_ARGS)
+{
+
+ return (SYSCTL_OUT(req, msqids,
+ sizeof(struct msqid_kernel) * msginfo.msgmni));
+}
+
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0,
+ "Maximum message size");
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RDTUN, &msginfo.msgmni, 0,
+ "Number of message queue identifiers");
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RDTUN, &msginfo.msgmnb, 0,
+ "Maximum number of bytes in a queue");
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RDTUN, &msginfo.msgtql, 0,
+ "Maximum number of messages in the system");
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RDTUN, &msginfo.msgssz, 0,
+ "Size of a message segment");
+SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RDTUN, &msginfo.msgseg, 0,
+ "Number of message segments");
+SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sysctl_msqids, "", "Message queue IDs");
+
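+/*
+ * The limits above can be inspected from userland with sysctl(3).
+ * A minimal sketch (illustrative only; assumes <sys/types.h>,
+ * <sys/sysctl.h> and <stdio.h>):
+ *
+ *	int msgmnb;
+ *	size_t len = sizeof(msgmnb);
+ *
+ *	if (sysctlbyname("kern.ipc.msgmnb", &msgmnb, &len, NULL, 0) == 0)
+ *		printf("max bytes per queue: %d\n", msgmnb);
+ */
+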
+#ifdef COMPAT_FREEBSD32
+int
+freebsd32_msgsys(struct thread *td, struct freebsd32_msgsys_args *uap)
+{
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ switch (uap->which) {
+ case 0:
+ return (freebsd7_freebsd32_msgctl(td,
+ (struct freebsd7_freebsd32_msgctl_args *)&uap->a2));
+ case 2:
+ return (freebsd32_msgsnd(td,
+ (struct freebsd32_msgsnd_args *)&uap->a2));
+ case 3:
+ return (freebsd32_msgrcv(td,
+ (struct freebsd32_msgrcv_args *)&uap->a2));
+ default:
+ return (sys_msgsys(td, (struct msgsys_args *)uap));
+ }
+#else
+ return (nosys(td, NULL));
+#endif
+}
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+int
+freebsd7_freebsd32_msgctl(struct thread *td,
+ struct freebsd7_freebsd32_msgctl_args *uap)
+{
+ struct msqid_ds msqbuf;
+ struct msqid_ds32_old msqbuf32;
+ int error;
+
+ if (uap->cmd == IPC_SET) {
+ error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32));
+ if (error)
+ return (error);
+ freebsd32_ipcperm_old_in(&msqbuf32.msg_perm, &msqbuf.msg_perm);
+ PTRIN_CP(msqbuf32, msqbuf, msg_first);
+ PTRIN_CP(msqbuf32, msqbuf, msg_last);
+ CP(msqbuf32, msqbuf, msg_cbytes);
+ CP(msqbuf32, msqbuf, msg_qnum);
+ CP(msqbuf32, msqbuf, msg_qbytes);
+ CP(msqbuf32, msqbuf, msg_lspid);
+ CP(msqbuf32, msqbuf, msg_lrpid);
+ CP(msqbuf32, msqbuf, msg_stime);
+ CP(msqbuf32, msqbuf, msg_rtime);
+ CP(msqbuf32, msqbuf, msg_ctime);
+ }
+ error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf);
+ if (error)
+ return (error);
+ if (uap->cmd == IPC_STAT) {
+ bzero(&msqbuf32, sizeof(msqbuf32));
+ freebsd32_ipcperm_old_out(&msqbuf.msg_perm, &msqbuf32.msg_perm);
+ PTROUT_CP(msqbuf, msqbuf32, msg_first);
+ PTROUT_CP(msqbuf, msqbuf32, msg_last);
+ CP(msqbuf, msqbuf32, msg_cbytes);
+ CP(msqbuf, msqbuf32, msg_qnum);
+ CP(msqbuf, msqbuf32, msg_qbytes);
+ CP(msqbuf, msqbuf32, msg_lspid);
+ CP(msqbuf, msqbuf32, msg_lrpid);
+ CP(msqbuf, msqbuf32, msg_stime);
+ CP(msqbuf, msqbuf32, msg_rtime);
+ CP(msqbuf, msqbuf32, msg_ctime);
+ error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32));
+ }
+ return (error);
+}
+#endif
+
+int
+freebsd32_msgctl(struct thread *td, struct freebsd32_msgctl_args *uap)
+{
+ struct msqid_ds msqbuf;
+ struct msqid_ds32 msqbuf32;
+ int error;
+
+ if (uap->cmd == IPC_SET) {
+ error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32));
+ if (error)
+ return (error);
+ freebsd32_ipcperm_in(&msqbuf32.msg_perm, &msqbuf.msg_perm);
+ PTRIN_CP(msqbuf32, msqbuf, msg_first);
+ PTRIN_CP(msqbuf32, msqbuf, msg_last);
+ CP(msqbuf32, msqbuf, msg_cbytes);
+ CP(msqbuf32, msqbuf, msg_qnum);
+ CP(msqbuf32, msqbuf, msg_qbytes);
+ CP(msqbuf32, msqbuf, msg_lspid);
+ CP(msqbuf32, msqbuf, msg_lrpid);
+ CP(msqbuf32, msqbuf, msg_stime);
+ CP(msqbuf32, msqbuf, msg_rtime);
+ CP(msqbuf32, msqbuf, msg_ctime);
+ }
+ error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf);
+ if (error)
+ return (error);
+ if (uap->cmd == IPC_STAT) {
+ freebsd32_ipcperm_out(&msqbuf.msg_perm, &msqbuf32.msg_perm);
+ PTROUT_CP(msqbuf, msqbuf32, msg_first);
+ PTROUT_CP(msqbuf, msqbuf32, msg_last);
+ CP(msqbuf, msqbuf32, msg_cbytes);
+ CP(msqbuf, msqbuf32, msg_qnum);
+ CP(msqbuf, msqbuf32, msg_qbytes);
+ CP(msqbuf, msqbuf32, msg_lspid);
+ CP(msqbuf, msqbuf32, msg_lrpid);
+ CP(msqbuf, msqbuf32, msg_stime);
+ CP(msqbuf, msqbuf32, msg_rtime);
+ CP(msqbuf, msqbuf32, msg_ctime);
+ error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32));
+ }
+ return (error);
+}
+
+int
+freebsd32_msgsnd(struct thread *td, struct freebsd32_msgsnd_args *uap)
+{
+ const void *msgp;
+ long mtype;
+ int32_t mtype32;
+ int error;
+
+ msgp = PTRIN(uap->msgp);
+ if ((error = copyin(msgp, &mtype32, sizeof(mtype32))) != 0)
+ return (error);
+ mtype = mtype32;
+ return (kern_msgsnd(td, uap->msqid,
+ (const char *)msgp + sizeof(mtype32),
+ uap->msgsz, uap->msgflg, mtype));
+}
+
+int
+freebsd32_msgrcv(struct thread *td, struct freebsd32_msgrcv_args *uap)
+{
+ void *msgp;
+ long mtype;
+ int32_t mtype32;
+ int error;
+
+ msgp = PTRIN(uap->msgp);
+ if ((error = kern_msgrcv(td, uap->msqid,
+ (char *)msgp + sizeof(mtype32), uap->msgsz,
+ uap->msgtyp, uap->msgflg, &mtype)) != 0)
+ return (error);
+ mtype32 = (int32_t)mtype;
+ return (copyout(&mtype32, msgp, sizeof(mtype32)));
+}
+#endif
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+
+/* XXX casting to (sy_call_t *) is bogus, as usual. */
+static sy_call_t *msgcalls[] = {
+ (sy_call_t *)freebsd7_msgctl, (sy_call_t *)sys_msgget,
+ (sy_call_t *)sys_msgsnd, (sy_call_t *)sys_msgrcv
+};
+
+/*
+ * Entry point for all MSG calls.
+ */
+int
+sys_msgsys(struct thread *td, struct msgsys_args *uap)
+{
+	/*
+	 * XXX actually varargs; the arguments are
+	 * { int which, a2, a3, a4, a5, a6; }.
+	 */
+ int error;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+ if (uap->which < 0 ||
+ uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0]))
+ return (EINVAL);
+ error = (*msgcalls[uap->which])(td, &uap->a2);
+ return (error);
+}
+
+#ifndef CP
+#define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0)
+#endif
+
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd7_msgctl_args {
+ int msqid;
+ int cmd;
+ struct msqid_ds_old *buf;
+};
+#endif
+int
+freebsd7_msgctl(struct thread *td, struct freebsd7_msgctl_args *uap)
+{
+ struct msqid_ds_old msqold;
+ struct msqid_ds msqbuf;
+ int error;
+
+ DPRINTF(("call to freebsd7_msgctl(%d, %d, %p)\n", uap->msqid, uap->cmd,
+ uap->buf));
+ if (uap->cmd == IPC_SET) {
+ error = copyin(uap->buf, &msqold, sizeof(msqold));
+ if (error)
+ return (error);
+ ipcperm_old2new(&msqold.msg_perm, &msqbuf.msg_perm);
+ CP(msqold, msqbuf, msg_first);
+ CP(msqold, msqbuf, msg_last);
+ CP(msqold, msqbuf, msg_cbytes);
+ CP(msqold, msqbuf, msg_qnum);
+ CP(msqold, msqbuf, msg_qbytes);
+ CP(msqold, msqbuf, msg_lspid);
+ CP(msqold, msqbuf, msg_lrpid);
+ CP(msqold, msqbuf, msg_stime);
+ CP(msqold, msqbuf, msg_rtime);
+ CP(msqold, msqbuf, msg_ctime);
+ }
+ error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf);
+ if (error)
+ return (error);
+ if (uap->cmd == IPC_STAT) {
+ bzero(&msqold, sizeof(msqold));
+ ipcperm_new2old(&msqbuf.msg_perm, &msqold.msg_perm);
+ CP(msqbuf, msqold, msg_first);
+ CP(msqbuf, msqold, msg_last);
+ CP(msqbuf, msqold, msg_cbytes);
+ CP(msqbuf, msqold, msg_qnum);
+ CP(msqbuf, msqold, msg_qbytes);
+ CP(msqbuf, msqold, msg_lspid);
+ CP(msqbuf, msqold, msg_lrpid);
+ CP(msqbuf, msqold, msg_stime);
+ CP(msqbuf, msqold, msg_rtime);
+ CP(msqbuf, msqold, msg_ctime);
+ error = copyout(&msqold, uap->buf, sizeof(struct msqid_ds_old));
+ }
+ return (error);
+}
+
+#undef CP
+
+#endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 ||
+ COMPAT_FREEBSD7 */
diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c
new file mode 100644
index 0000000..f9ff217
--- /dev/null
+++ b/sys/kern/sysv_sem.c
@@ -0,0 +1,1666 @@
+/*-
+ * Implementation of SVID semaphores
+ *
+ * Author: Daniel Boulet
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ */
+/*-
+ * Copyright (c) 2003-2005 McAfee, Inc.
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project in part by McAfee
+ * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
+ * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
+ * program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_sysvipc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/racct.h>
+#include <sys/sem.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <sys/malloc.h>
+#include <sys/jail.h>
+
+#include <security/mac/mac_framework.h>
+
+FEATURE(sysv_sem, "System V semaphores support");
+
+static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores");
+
+#ifdef SEM_DEBUG
+#define DPRINTF(a) printf a
+#else
+#define DPRINTF(a)
+#endif
+
+static int seminit(void);
+static int sysvsem_modload(struct module *, int, void *);
+static int semunload(void);
+static void semexit_myhook(void *arg, struct proc *p);
+static int sysctl_sema(SYSCTL_HANDLER_ARGS);
+static int semvalid(int semid, struct semid_kernel *semakptr);
+
+#ifndef _SYS_SYSPROTO_H_
+struct __semctl_args;
+int __semctl(struct thread *td, struct __semctl_args *uap);
+struct semget_args;
+int semget(struct thread *td, struct semget_args *uap);
+struct semop_args;
+int semop(struct thread *td, struct semop_args *uap);
+#endif
+
+static struct sem_undo *semu_alloc(struct thread *td);
+static int semundo_adjust(struct thread *td, struct sem_undo **supptr,
+ int semid, int semseq, int semnum, int adjval);
+static void semundo_clear(int semid, int semnum);
+
+static struct mtx sem_mtx; /* semaphore global lock */
+static struct mtx sem_undo_mtx;
+static int semtot = 0;
+static struct semid_kernel *sema; /* semaphore id pool */
+static struct mtx *sema_mtx; /* semaphore id pool mutexes*/
+static struct sem *sem; /* semaphore pool */
+LIST_HEAD(, sem_undo) semu_list; /* list of active undo structures */
+LIST_HEAD(, sem_undo) semu_free_list; /* list of free undo structures */
+static int *semu; /* undo structure pool */
+static eventhandler_tag semexit_tag;
+
+#define SEMUNDO_MTX sem_undo_mtx
+#define SEMUNDO_LOCK() mtx_lock(&SEMUNDO_MTX);
+#define SEMUNDO_UNLOCK() mtx_unlock(&SEMUNDO_MTX);
+#define SEMUNDO_LOCKASSERT(how) mtx_assert(&SEMUNDO_MTX, (how));
+
+struct sem {
+ u_short semval; /* semaphore value */
+ pid_t sempid; /* pid of last operation */
+ u_short semncnt; /* # awaiting semval > cval */
+ u_short semzcnt; /* # awaiting semval = 0 */
+};
+
+/*
+ * Undo structure (one per process)
+ */
+struct sem_undo {
+ LIST_ENTRY(sem_undo) un_next; /* ptr to next active undo structure */
+ struct proc *un_proc; /* owner of this structure */
+ short un_cnt; /* # of active entries */
+ struct undo {
+ short un_adjval; /* adjust on exit values */
+ short un_num; /* semaphore # */
+ int un_id; /* semid */
+ unsigned short un_seq;
+ } un_ent[1]; /* undo entries */
+};
+
+/*
+ * Configuration parameters
+ */
+#ifndef SEMMNI
+#define SEMMNI 50 /* # of semaphore identifiers */
+#endif
+#ifndef SEMMNS
+#define SEMMNS 340 /* # of semaphores in system */
+#endif
+#ifndef SEMUME
+#define SEMUME 50 /* max # of undo entries per process */
+#endif
+#ifndef SEMMNU
+#define SEMMNU 150 /* # of undo structures in system */
+#endif
+
+/* shouldn't need tuning */
+#ifndef SEMMSL
+#define SEMMSL SEMMNS /* max # of semaphores per id */
+#endif
+#ifndef SEMOPM
+#define SEMOPM 100 /* max # of operations per semop call */
+#endif
+
+#define SEMVMX 32767 /* semaphore maximum value */
+#define SEMAEM 16384 /* adjust on exit max value */
+
+/*
+ * Due to the way semaphore memory is allocated, we have to ensure that
+ * SEMUSZ is properly aligned.
+ */
+
+#define SEM_ALIGN(bytes) (((bytes) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))
+
+/* actual size of an undo structure */
+#define SEMUSZ SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME]))
+
+/*
+ * Macro to find a particular sem_undo vector
+ */
+#define SEMU(ix) \
+ ((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz))
+
+/*
+ * semaphore info struct
+ */
+struct seminfo seminfo = {
+ SEMMNI, /* # of semaphore identifiers */
+ SEMMNS, /* # of semaphores in system */
+ SEMMNU, /* # of undo structures in system */
+ SEMMSL, /* max # of semaphores per id */
+ SEMOPM, /* max # of operations per semop call */
+ SEMUME, /* max # of undo entries per process */
+ SEMUSZ, /* size in bytes of undo structure */
+ SEMVMX, /* semaphore maximum value */
+ SEMAEM /* adjust on exit max value */
+};
+
+SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0,
+ "Number of semaphore identifiers");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0,
+ "Maximum number of semaphores in the system");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RDTUN, &seminfo.semmnu, 0,
+ "Maximum number of undo structures in the system");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RW, &seminfo.semmsl, 0,
+ "Max semaphores per id");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RDTUN, &seminfo.semopm, 0,
+ "Max operations per semop call");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RDTUN, &seminfo.semume, 0,
+ "Max undo entries per process");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RDTUN, &seminfo.semusz, 0,
+ "Size in bytes of undo structure");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RW, &seminfo.semvmx, 0,
+ "Semaphore maximum value");
+SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RW, &seminfo.semaem, 0,
+ "Adjust on exit max value");
+SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sysctl_sema, "", "Semaphore id pool");
+
+static struct syscall_helper_data sem_syscalls[] = {
+ SYSCALL_INIT_HELPER(__semctl),
+ SYSCALL_INIT_HELPER(semget),
+ SYSCALL_INIT_HELPER(semop),
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ SYSCALL_INIT_HELPER(semsys),
+ SYSCALL_INIT_HELPER_COMPAT(freebsd7___semctl),
+#endif
+ SYSCALL_INIT_LAST
+};
+
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_ipc.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/freebsd32/freebsd32_util.h>
+
+static struct syscall_helper_data sem32_syscalls[] = {
+ SYSCALL32_INIT_HELPER(freebsd32_semctl),
+ SYSCALL32_INIT_HELPER_COMPAT(semget),
+ SYSCALL32_INIT_HELPER_COMPAT(semop),
+ SYSCALL32_INIT_HELPER(freebsd32_semsys),
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ SYSCALL32_INIT_HELPER(freebsd7_freebsd32_semctl),
+#endif
+ SYSCALL_INIT_LAST
+};
+#endif
+
+static int
+seminit(void)
+{
+ int i, error;
+
+ TUNABLE_INT_FETCH("kern.ipc.semmni", &seminfo.semmni);
+ TUNABLE_INT_FETCH("kern.ipc.semmns", &seminfo.semmns);
+ TUNABLE_INT_FETCH("kern.ipc.semmnu", &seminfo.semmnu);
+ TUNABLE_INT_FETCH("kern.ipc.semmsl", &seminfo.semmsl);
+ TUNABLE_INT_FETCH("kern.ipc.semopm", &seminfo.semopm);
+ TUNABLE_INT_FETCH("kern.ipc.semume", &seminfo.semume);
+ TUNABLE_INT_FETCH("kern.ipc.semusz", &seminfo.semusz);
+ TUNABLE_INT_FETCH("kern.ipc.semvmx", &seminfo.semvmx);
+ TUNABLE_INT_FETCH("kern.ipc.semaem", &seminfo.semaem);
+
+ sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK);
+ sema = malloc(sizeof(struct semid_kernel) * seminfo.semmni, M_SEM,
+ M_WAITOK);
+ sema_mtx = malloc(sizeof(struct mtx) * seminfo.semmni, M_SEM,
+ M_WAITOK | M_ZERO);
+ semu = malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK);
+
+ for (i = 0; i < seminfo.semmni; i++) {
+ sema[i].u.sem_base = 0;
+ sema[i].u.sem_perm.mode = 0;
+ sema[i].u.sem_perm.seq = 0;
+#ifdef MAC
+ mac_sysvsem_init(&sema[i]);
+#endif
+ }
+ for (i = 0; i < seminfo.semmni; i++)
+ mtx_init(&sema_mtx[i], "semid", NULL, MTX_DEF);
+ LIST_INIT(&semu_free_list);
+ for (i = 0; i < seminfo.semmnu; i++) {
+ struct sem_undo *suptr = SEMU(i);
+ suptr->un_proc = NULL;
+ LIST_INSERT_HEAD(&semu_free_list, suptr, un_next);
+ }
+ LIST_INIT(&semu_list);
+ mtx_init(&sem_mtx, "sem", NULL, MTX_DEF);
+ mtx_init(&sem_undo_mtx, "semu", NULL, MTX_DEF);
+ semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook, NULL,
+ EVENTHANDLER_PRI_ANY);
+
+ error = syscall_helper_register(sem_syscalls);
+ if (error != 0)
+ return (error);
+#ifdef COMPAT_FREEBSD32
+ error = syscall32_helper_register(sem32_syscalls);
+ if (error != 0)
+ return (error);
+#endif
+ return (0);
+}
+
+static int
+semunload(void)
+{
+ int i;
+
+ /* XXXKIB */
+ if (semtot != 0)
+ return (EBUSY);
+
+#ifdef COMPAT_FREEBSD32
+ syscall32_helper_unregister(sem32_syscalls);
+#endif
+ syscall_helper_unregister(sem_syscalls);
+ EVENTHANDLER_DEREGISTER(process_exit, semexit_tag);
+#ifdef MAC
+ for (i = 0; i < seminfo.semmni; i++)
+ mac_sysvsem_destroy(&sema[i]);
+#endif
+ free(sem, M_SEM);
+ free(sema, M_SEM);
+ free(semu, M_SEM);
+ for (i = 0; i < seminfo.semmni; i++)
+ mtx_destroy(&sema_mtx[i]);
+ free(sema_mtx, M_SEM);
+ mtx_destroy(&sem_mtx);
+ mtx_destroy(&sem_undo_mtx);
+ return (0);
+}
+
+static int
+sysvsem_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ error = seminit();
+ if (error != 0)
+ semunload();
+ break;
+ case MOD_UNLOAD:
+ error = semunload();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t sysvsem_mod = {
+ "sysvsem",
+ &sysvsem_modload,
+ NULL
+};
+
+DECLARE_MODULE(sysvsem, sysvsem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST);
+MODULE_VERSION(sysvsem, 1);
+
+/*
+ * Allocate a new sem_undo structure for a process
+ * (returns ptr to structure or NULL if no more room)
+ */
+
+static struct sem_undo *
+semu_alloc(struct thread *td)
+{
+ struct sem_undo *suptr;
+
+ SEMUNDO_LOCKASSERT(MA_OWNED);
+ if ((suptr = LIST_FIRST(&semu_free_list)) == NULL)
+ return (NULL);
+ LIST_REMOVE(suptr, un_next);
+ LIST_INSERT_HEAD(&semu_list, suptr, un_next);
+ suptr->un_cnt = 0;
+ suptr->un_proc = td->td_proc;
+ return (suptr);
+}
+
+static int
+semu_try_free(struct sem_undo *suptr)
+{
+
+ SEMUNDO_LOCKASSERT(MA_OWNED);
+
+ if (suptr->un_cnt != 0)
+ return (0);
+ LIST_REMOVE(suptr, un_next);
+ LIST_INSERT_HEAD(&semu_free_list, suptr, un_next);
+ return (1);
+}
+
+/*
+ * Adjust a particular entry for a particular proc
+ */
+
+static int
+semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid,
+ int semseq, int semnum, int adjval)
+{
+ struct proc *p = td->td_proc;
+ struct sem_undo *suptr;
+ struct undo *sunptr;
+ int i;
+
+ SEMUNDO_LOCKASSERT(MA_OWNED);
+	/*
+	 * Look for and remember the sem_undo if the caller doesn't
+	 * provide it.
+	 */
+
+ suptr = *supptr;
+ if (suptr == NULL) {
+ LIST_FOREACH(suptr, &semu_list, un_next) {
+ if (suptr->un_proc == p) {
+ *supptr = suptr;
+ break;
+ }
+ }
+ if (suptr == NULL) {
+ if (adjval == 0)
+ return(0);
+ suptr = semu_alloc(td);
+ if (suptr == NULL)
+ return (ENOSPC);
+ *supptr = suptr;
+ }
+ }
+
+ /*
+ * Look for the requested entry and adjust it (delete if adjval becomes
+ * 0).
+ */
+ sunptr = &suptr->un_ent[0];
+ for (i = 0; i < suptr->un_cnt; i++, sunptr++) {
+ if (sunptr->un_id != semid || sunptr->un_num != semnum)
+ continue;
+ if (adjval != 0) {
+ adjval += sunptr->un_adjval;
+ if (adjval > seminfo.semaem || adjval < -seminfo.semaem)
+ return (ERANGE);
+ }
+ sunptr->un_adjval = adjval;
+ if (sunptr->un_adjval == 0) {
+ suptr->un_cnt--;
+ if (i < suptr->un_cnt)
+ suptr->un_ent[i] =
+ suptr->un_ent[suptr->un_cnt];
+ if (suptr->un_cnt == 0)
+ semu_try_free(suptr);
+ }
+ return (0);
+ }
+
+ /* Didn't find the right entry - create it */
+ if (adjval == 0)
+ return (0);
+ if (adjval > seminfo.semaem || adjval < -seminfo.semaem)
+ return (ERANGE);
+ if (suptr->un_cnt != seminfo.semume) {
+ sunptr = &suptr->un_ent[suptr->un_cnt];
+ suptr->un_cnt++;
+ sunptr->un_adjval = adjval;
+ sunptr->un_id = semid;
+ sunptr->un_num = semnum;
+ sunptr->un_seq = semseq;
+ } else
+ return (EINVAL);
+ return (0);
+}
+
+static void
+semundo_clear(int semid, int semnum)
+{
+ struct sem_undo *suptr, *suptr1;
+ struct undo *sunptr;
+ int i;
+
+ SEMUNDO_LOCKASSERT(MA_OWNED);
+ LIST_FOREACH_SAFE(suptr, &semu_list, un_next, suptr1) {
+ sunptr = &suptr->un_ent[0];
+ for (i = 0; i < suptr->un_cnt; i++, sunptr++) {
+ if (sunptr->un_id != semid)
+ continue;
+ if (semnum == -1 || sunptr->un_num == semnum) {
+ suptr->un_cnt--;
+ if (i < suptr->un_cnt) {
+ suptr->un_ent[i] =
+ suptr->un_ent[suptr->un_cnt];
+ continue;
+ }
+ semu_try_free(suptr);
+ }
+ if (semnum != -1)
+ break;
+ }
+ }
+}
+
+static int
+semvalid(int semid, struct semid_kernel *semakptr)
+{
+
+ return ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
+ semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) ? EINVAL : 0);
+}
+
+/*
+ * Note that the user-mode half of this passes a union, not a pointer.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct __semctl_args {
+ int semid;
+ int semnum;
+ int cmd;
+ union semun *arg;
+};
+#endif
+int
+sys___semctl(struct thread *td, struct __semctl_args *uap)
+{
+ struct semid_ds dsbuf;
+ union semun arg, semun;
+ register_t rval;
+ int error;
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_SET:
+ case IPC_STAT:
+ case GETALL:
+ case SETVAL:
+ case SETALL:
+ error = copyin(uap->arg, &arg, sizeof(arg));
+ if (error)
+ return (error);
+ break;
+ }
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ semun.buf = &dsbuf;
+ break;
+ case IPC_SET:
+ error = copyin(arg.buf, &dsbuf, sizeof(dsbuf));
+ if (error)
+ return (error);
+ semun.buf = &dsbuf;
+ break;
+ case GETALL:
+ case SETALL:
+ semun.array = arg.array;
+ break;
+ case SETVAL:
+ semun.val = arg.val;
+ break;
+ }
+
+ error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
+ &rval);
+ if (error)
+ return (error);
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ error = copyout(&dsbuf, arg.buf, sizeof(dsbuf));
+ break;
+ }
+
+ if (error == 0)
+ td->td_retval[0] = rval;
+ return (error);
+}
+
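+/*
+ * Usage sketch from userland (illustrative only; assumes a set id from
+ * semget() and union semun from <sys/sem.h>): set and read back the value
+ * of semaphore 0 in the set.
+ *
+ *	union semun arg;
+ *
+ *	arg.val = 1;
+ *	if (semctl(id, 0, SETVAL, arg) == -1)
+ *		warn("semctl(SETVAL)");
+ *	printf("value: %d\n", semctl(id, 0, GETVAL));
+ */
+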
+int
+kern_semctl(struct thread *td, int semid, int semnum, int cmd,
+ union semun *arg, register_t *rval)
+{
+ u_short *array;
+ struct ucred *cred = td->td_ucred;
+ int i, error;
+ struct semid_ds *sbuf;
+ struct semid_kernel *semakptr;
+ struct mtx *sema_mtxp;
+ u_short usval, count;
+ int semidx;
+
+ DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n",
+ semid, semnum, cmd, arg));
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+
+ array = NULL;
+
+ switch(cmd) {
+ case SEM_STAT:
+ /*
+ * For this command we assume semid is an array index
+ * rather than an IPC id.
+ */
+ if (semid < 0 || semid >= seminfo.semmni)
+ return (EINVAL);
+ semakptr = &sema[semid];
+ sema_mtxp = &sema_mtx[semid];
+ mtx_lock(sema_mtxp);
+ if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) {
+ error = EINVAL;
+ goto done2;
+ }
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
+ goto done2;
+#ifdef MAC
+ error = mac_sysvsem_check_semctl(cred, semakptr, cmd);
+ if (error != 0)
+ goto done2;
+#endif
+ bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds));
+ *rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm);
+ mtx_unlock(sema_mtxp);
+ return (0);
+ }
+
+ semidx = IPCID_TO_IX(semid);
+ if (semidx < 0 || semidx >= seminfo.semmni)
+ return (EINVAL);
+
+ semakptr = &sema[semidx];
+ sema_mtxp = &sema_mtx[semidx];
+ if (cmd == IPC_RMID)
+ mtx_lock(&sem_mtx);
+ mtx_lock(sema_mtxp);
+#ifdef MAC
+ error = mac_sysvsem_check_semctl(cred, semakptr, cmd);
+ if (error != 0)
+ goto done2;
+#endif
+
+ error = 0;
+ *rval = 0;
+
+ switch (cmd) {
+ case IPC_RMID:
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M)))
+ goto done2;
+ semakptr->u.sem_perm.cuid = cred->cr_uid;
+ semakptr->u.sem_perm.uid = cred->cr_uid;
+ semakptr->u.sem_perm.mode = 0;
+ racct_sub_cred(semakptr->cred, RACCT_NSEM, semakptr->u.sem_nsems);
+ crfree(semakptr->cred);
+ semakptr->cred = NULL;
+ SEMUNDO_LOCK();
+ semundo_clear(semidx, -1);
+ SEMUNDO_UNLOCK();
+#ifdef MAC
+ mac_sysvsem_cleanup(semakptr);
+#endif
+ wakeup(semakptr);
+ for (i = 0; i < seminfo.semmni; i++) {
+ if ((sema[i].u.sem_perm.mode & SEM_ALLOC) &&
+ sema[i].u.sem_base > semakptr->u.sem_base)
+ mtx_lock_flags(&sema_mtx[i], LOP_DUPOK);
+ }
+ for (i = semakptr->u.sem_base - sem; i < semtot; i++)
+ sem[i] = sem[i + semakptr->u.sem_nsems];
+ for (i = 0; i < seminfo.semmni; i++) {
+ if ((sema[i].u.sem_perm.mode & SEM_ALLOC) &&
+ sema[i].u.sem_base > semakptr->u.sem_base) {
+ sema[i].u.sem_base -= semakptr->u.sem_nsems;
+ mtx_unlock(&sema_mtx[i]);
+ }
+ }
+ semtot -= semakptr->u.sem_nsems;
+ break;
+
+ case IPC_SET:
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M)))
+ goto done2;
+ sbuf = arg->buf;
+ semakptr->u.sem_perm.uid = sbuf->sem_perm.uid;
+ semakptr->u.sem_perm.gid = sbuf->sem_perm.gid;
+ semakptr->u.sem_perm.mode = (semakptr->u.sem_perm.mode &
+ ~0777) | (sbuf->sem_perm.mode & 0777);
+ semakptr->u.sem_ctime = time_second;
+ break;
+
+ case IPC_STAT:
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
+ goto done2;
+ bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds));
+ break;
+
+ case GETNCNT:
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
+ goto done2;
+ if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
+ error = EINVAL;
+ goto done2;
+ }
+ *rval = semakptr->u.sem_base[semnum].semncnt;
+ break;
+
+ case GETPID:
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
+ goto done2;
+ if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
+ error = EINVAL;
+ goto done2;
+ }
+ *rval = semakptr->u.sem_base[semnum].sempid;
+ break;
+
+ case GETVAL:
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
+ goto done2;
+ if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
+ error = EINVAL;
+ goto done2;
+ }
+ *rval = semakptr->u.sem_base[semnum].semval;
+ break;
+
+ case GETALL:
+ /*
+ * Unfortunately, callers of this function don't know
+ * in advance how many semaphores are in this set.
+ * While we could just allocate the maximum size array
+ * and pass the actual size back to the caller, that
+ * won't work for SETALL since we can't copyin() more
+ * data than the user specified as we may return a
+ * spurious EFAULT.
+ *
+ * Note that the number of semaphores in a set is
+ * fixed for the life of that set. The only way that
+	 * the 'count' could change while we are blocked in
+ * malloc() is if this semaphore set were destroyed
+ * and a new one created with the same index.
+ * However, semvalid() will catch that due to the
+ * sequence number unless exactly 0x8000 (or a
+ * multiple thereof) semaphore sets for the same index
+ * are created and destroyed while we are in malloc!
+ *
+ */
+ count = semakptr->u.sem_nsems;
+ mtx_unlock(sema_mtxp);
+ array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK);
+ mtx_lock(sema_mtxp);
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ KASSERT(count == semakptr->u.sem_nsems, ("nsems changed"));
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
+ goto done2;
+ for (i = 0; i < semakptr->u.sem_nsems; i++)
+ array[i] = semakptr->u.sem_base[i].semval;
+ mtx_unlock(sema_mtxp);
+ error = copyout(array, arg->array, count * sizeof(*array));
+ mtx_lock(sema_mtxp);
+ break;
+
+ case GETZCNT:
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
+ goto done2;
+ if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
+ error = EINVAL;
+ goto done2;
+ }
+ *rval = semakptr->u.sem_base[semnum].semzcnt;
+ break;
+
+ case SETVAL:
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W)))
+ goto done2;
+ if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
+ error = EINVAL;
+ goto done2;
+ }
+ if (arg->val < 0 || arg->val > seminfo.semvmx) {
+ error = ERANGE;
+ goto done2;
+ }
+ semakptr->u.sem_base[semnum].semval = arg->val;
+ SEMUNDO_LOCK();
+ semundo_clear(semidx, semnum);
+ SEMUNDO_UNLOCK();
+ wakeup(semakptr);
+ break;
+
+ case SETALL:
+ /*
+ * See comment on GETALL for why 'count' shouldn't change
+ * and why we require a userland buffer.
+ */
+ count = semakptr->u.sem_nsems;
+ mtx_unlock(sema_mtxp);
+ array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK);
+ error = copyin(arg->array, array, count * sizeof(*array));
+ mtx_lock(sema_mtxp);
+ if (error)
+ break;
+ if ((error = semvalid(semid, semakptr)) != 0)
+ goto done2;
+ KASSERT(count == semakptr->u.sem_nsems, ("nsems changed"));
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W)))
+ goto done2;
+ for (i = 0; i < semakptr->u.sem_nsems; i++) {
+ usval = array[i];
+ if (usval > seminfo.semvmx) {
+ error = ERANGE;
+ break;
+ }
+ semakptr->u.sem_base[i].semval = usval;
+ }
+ SEMUNDO_LOCK();
+ semundo_clear(semidx, -1);
+ SEMUNDO_UNLOCK();
+ wakeup(semakptr);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+done2:
+ mtx_unlock(sema_mtxp);
+ if (cmd == IPC_RMID)
+ mtx_unlock(&sem_mtx);
+ if (array != NULL)
+ free(array, M_TEMP);
+ return(error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct semget_args {
+ key_t key;
+ int nsems;
+ int semflg;
+};
+#endif
+int
+sys_semget(struct thread *td, struct semget_args *uap)
+{
+ int semid, error = 0;
+ int key = uap->key;
+ int nsems = uap->nsems;
+ int semflg = uap->semflg;
+ struct ucred *cred = td->td_ucred;
+
+ DPRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg));
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+
+ mtx_lock(&sem_mtx);
+ if (key != IPC_PRIVATE) {
+ for (semid = 0; semid < seminfo.semmni; semid++) {
+ if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) &&
+ sema[semid].u.sem_perm.key == key)
+ break;
+ }
+ if (semid < seminfo.semmni) {
+ DPRINTF(("found public key\n"));
+ if ((error = ipcperm(td, &sema[semid].u.sem_perm,
+ semflg & 0700))) {
+ goto done2;
+ }
+ if (nsems > 0 && sema[semid].u.sem_nsems < nsems) {
+ DPRINTF(("too small\n"));
+ error = EINVAL;
+ goto done2;
+ }
+ if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) {
+ DPRINTF(("not exclusive\n"));
+ error = EEXIST;
+ goto done2;
+ }
+#ifdef MAC
+ error = mac_sysvsem_check_semget(cred, &sema[semid]);
+ if (error != 0)
+ goto done2;
+#endif
+ goto found;
+ }
+ }
+
+ DPRINTF(("need to allocate the semid_kernel\n"));
+ if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) {
+ if (nsems <= 0 || nsems > seminfo.semmsl) {
+ DPRINTF(("nsems out of range (0<%d<=%d)\n", nsems,
+ seminfo.semmsl));
+ error = EINVAL;
+ goto done2;
+ }
+ if (nsems > seminfo.semmns - semtot) {
+ DPRINTF((
+ "not enough semaphores left (need %d, got %d)\n",
+ nsems, seminfo.semmns - semtot));
+ error = ENOSPC;
+ goto done2;
+ }
+ for (semid = 0; semid < seminfo.semmni; semid++) {
+ if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0)
+ break;
+ }
+ if (semid == seminfo.semmni) {
+ DPRINTF(("no more semid_kernel's available\n"));
+ error = ENOSPC;
+ goto done2;
+ }
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ error = racct_add(td->td_proc, RACCT_NSEM, nsems);
+ PROC_UNLOCK(td->td_proc);
+ if (error != 0) {
+ error = ENOSPC;
+ goto done2;
+ }
+#endif
+ DPRINTF(("semid %d is available\n", semid));
+ mtx_lock(&sema_mtx[semid]);
+ KASSERT((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0,
+ ("Lost semaphore %d", semid));
+ sema[semid].u.sem_perm.key = key;
+ sema[semid].u.sem_perm.cuid = cred->cr_uid;
+ sema[semid].u.sem_perm.uid = cred->cr_uid;
+ sema[semid].u.sem_perm.cgid = cred->cr_gid;
+ sema[semid].u.sem_perm.gid = cred->cr_gid;
+ sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC;
+ sema[semid].cred = crhold(cred);
+ sema[semid].u.sem_perm.seq =
+ (sema[semid].u.sem_perm.seq + 1) & 0x7fff;
+ sema[semid].u.sem_nsems = nsems;
+ sema[semid].u.sem_otime = 0;
+ sema[semid].u.sem_ctime = time_second;
+ sema[semid].u.sem_base = &sem[semtot];
+ semtot += nsems;
+ bzero(sema[semid].u.sem_base,
+ sizeof(sema[semid].u.sem_base[0])*nsems);
+#ifdef MAC
+ mac_sysvsem_create(cred, &sema[semid]);
+#endif
+ mtx_unlock(&sema_mtx[semid]);
+ DPRINTF(("sembase = %p, next = %p\n",
+ sema[semid].u.sem_base, &sem[semtot]));
+ } else {
+ DPRINTF(("didn't find it and wasn't asked to create it\n"));
+ error = ENOENT;
+ goto done2;
+ }
+
+found:
+ td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].u.sem_perm);
+done2:
+ mtx_unlock(&sem_mtx);
+ return (error);
+}
+
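+/*
+ * Usage sketch from userland (illustrative only; assumes <sys/sem.h> and
+ * <err.h>): create a private set of three semaphores, taking the
+ * allocation path above.
+ *
+ *	int id = semget(IPC_PRIVATE, 3, IPC_CREAT | 0600);
+ *
+ *	if (id == -1)
+ *		err(1, "semget");
+ */
+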
+#ifndef _SYS_SYSPROTO_H_
+struct semop_args {
+ int semid;
+ struct sembuf *sops;
+ size_t nsops;
+};
+#endif
+int
+sys_semop(struct thread *td, struct semop_args *uap)
+{
+#define SMALL_SOPS 8
+ struct sembuf small_sops[SMALL_SOPS];
+ int semid = uap->semid;
+ size_t nsops = uap->nsops;
+ struct sembuf *sops;
+ struct semid_kernel *semakptr;
+ struct sembuf *sopptr = 0;
+ struct sem *semptr = 0;
+ struct sem_undo *suptr;
+ struct mtx *sema_mtxp;
+ size_t i, j, k;
+ int error;
+ int do_wakeup, do_undos;
+ unsigned short seq;
+
+#ifdef SEM_DEBUG
+ sops = NULL;
+#endif
+	DPRINTF(("call to semop(%d, %p, %zu)\n", semid, sops, nsops));
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+
+ semid = IPCID_TO_IX(semid); /* Convert back to zero origin */
+
+ if (semid < 0 || semid >= seminfo.semmni)
+ return (EINVAL);
+
+ /* Allocate memory for sem_ops */
+ if (nsops <= SMALL_SOPS)
+ sops = small_sops;
+ else if (nsops > seminfo.semopm) {
+		DPRINTF(("too many sops (max=%d, nsops=%zu)\n", seminfo.semopm,
+		    nsops));
+ return (E2BIG);
+ } else {
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ if (nsops > racct_get_available(td->td_proc, RACCT_NSEMOP)) {
+ PROC_UNLOCK(td->td_proc);
+ return (E2BIG);
+ }
+ PROC_UNLOCK(td->td_proc);
+#endif
+
+ sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK);
+ }
+ if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) {
+		DPRINTF(("error = %d from copyin(%p, %p, %zu)\n", error,
+		    uap->sops, sops, nsops * sizeof(sops[0])));
+ if (sops != small_sops)
+			free(sops, M_TEMP);
+ return (error);
+ }
+
+ semakptr = &sema[semid];
+ sema_mtxp = &sema_mtx[semid];
+ mtx_lock(sema_mtxp);
+ if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) {
+ error = EINVAL;
+ goto done2;
+ }
+ seq = semakptr->u.sem_perm.seq;
+ if (seq != IPCID_TO_SEQ(uap->semid)) {
+ error = EINVAL;
+ goto done2;
+ }
+ /*
+ * Initial pass thru sops to see what permissions are needed.
+ * Also perform any checks that don't need repeating on each
+ * attempt to satisfy the request vector.
+ */
+ j = 0; /* permission needed */
+ do_undos = 0;
+ for (i = 0; i < nsops; i++) {
+ sopptr = &sops[i];
+ if (sopptr->sem_num >= semakptr->u.sem_nsems) {
+ error = EFBIG;
+ goto done2;
+ }
+ if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0)
+ do_undos = 1;
+ j |= (sopptr->sem_op == 0) ? SEM_R : SEM_A;
+ }
+
+ if ((error = ipcperm(td, &semakptr->u.sem_perm, j))) {
+ DPRINTF(("error = %d from ipaccess\n", error));
+ goto done2;
+ }
+#ifdef MAC
+ error = mac_sysvsem_check_semop(td->td_ucred, semakptr, j);
+ if (error != 0)
+ goto done2;
+#endif
+
+ /*
+ * Loop trying to satisfy the vector of requests.
+ * If we reach a point where we must wait, any requests already
+ * performed are rolled back and we go to sleep until some other
+ * process wakes us up. At this point, we start all over again.
+ *
+ * This ensures that from the perspective of other tasks, a set
+ * of requests is atomic (never partially satisfied).
+ */
+ for (;;) {
+ do_wakeup = 0;
+ error = 0; /* error return if necessary */
+
+ for (i = 0; i < nsops; i++) {
+ sopptr = &sops[i];
+ semptr = &semakptr->u.sem_base[sopptr->sem_num];
+
+ DPRINTF((
+ "semop: semakptr=%p, sem_base=%p, "
+ "semptr=%p, sem[%d]=%d : op=%d, flag=%s\n",
+ semakptr, semakptr->u.sem_base, semptr,
+ sopptr->sem_num, semptr->semval, sopptr->sem_op,
+ (sopptr->sem_flg & IPC_NOWAIT) ?
+ "nowait" : "wait"));
+
+ if (sopptr->sem_op < 0) {
+ if (semptr->semval + sopptr->sem_op < 0) {
+ DPRINTF(("semop: can't do it now\n"));
+ break;
+ } else {
+ semptr->semval += sopptr->sem_op;
+ if (semptr->semval == 0 &&
+ semptr->semzcnt > 0)
+ do_wakeup = 1;
+ }
+ } else if (sopptr->sem_op == 0) {
+ if (semptr->semval != 0) {
+ DPRINTF(("semop: not zero now\n"));
+ break;
+ }
+ } else if (semptr->semval + sopptr->sem_op >
+ seminfo.semvmx) {
+ error = ERANGE;
+ break;
+ } else {
+ if (semptr->semncnt > 0)
+ do_wakeup = 1;
+ semptr->semval += sopptr->sem_op;
+ }
+ }
+
+ /*
+ * Did we get through the entire vector?
+ */
+ if (i >= nsops)
+ goto done;
+
+ /*
+ * No ... rollback anything that we've already done
+ */
+ DPRINTF(("semop: rollback 0 through %d\n", i-1));
+ for (j = 0; j < i; j++)
+ semakptr->u.sem_base[sops[j].sem_num].semval -=
+ sops[j].sem_op;
+
+ /* If we detected an error, return it */
+ if (error != 0)
+ goto done2;
+
+ /*
+ * If the request that we couldn't satisfy has the
+ * NOWAIT flag set then return with EAGAIN.
+ */
+ if (sopptr->sem_flg & IPC_NOWAIT) {
+ error = EAGAIN;
+ goto done2;
+ }
+
+ if (sopptr->sem_op == 0)
+ semptr->semzcnt++;
+ else
+ semptr->semncnt++;
+
+ DPRINTF(("semop: good night!\n"));
+ error = msleep(semakptr, sema_mtxp, (PZERO - 4) | PCATCH,
+ "semwait", 0);
+ DPRINTF(("semop: good morning (error=%d)!\n", error));
+ /* return code is checked below, after sem[nz]cnt-- */
+
+ /*
+ * Make sure that the semaphore still exists
+ */
+ seq = semakptr->u.sem_perm.seq;
+ if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
+ seq != IPCID_TO_SEQ(uap->semid)) {
+ error = EIDRM;
+ goto done2;
+ }
+
+ /*
+ * Renew the semaphore's pointer after wakeup since
+ * during msleep sem_base may have been modified and semptr
+ * is not valid any more
+ */
+ semptr = &semakptr->u.sem_base[sopptr->sem_num];
+
+ /*
+ * The semaphore is still alive. Readjust the count of
+ * waiting processes.
+ */
+ if (sopptr->sem_op == 0)
+ semptr->semzcnt--;
+ else
+ semptr->semncnt--;
+
+ /*
+ * Is it really morning, or was our sleep interrupted?
+ * (Delayed check of msleep() return code because we
+ * need to decrement sem[nz]cnt either way.)
+ */
+ if (error != 0) {
+ error = EINTR;
+ goto done2;
+ }
+ DPRINTF(("semop: good morning!\n"));
+ }
+
+done:
+ /*
+ * Process any SEM_UNDO requests.
+ */
+ if (do_undos) {
+ SEMUNDO_LOCK();
+ suptr = NULL;
+ for (i = 0; i < nsops; i++) {
+ /*
+ * We only need to deal with SEM_UNDO's for non-zero
+ * op's.
+ */
+ int adjval;
+
+ if ((sops[i].sem_flg & SEM_UNDO) == 0)
+ continue;
+ adjval = sops[i].sem_op;
+ if (adjval == 0)
+ continue;
+ error = semundo_adjust(td, &suptr, semid, seq,
+ sops[i].sem_num, -adjval);
+ if (error == 0)
+ continue;
+
+ /*
+ * Oh-Oh! We ran out of either sem_undo's or undo's.
+ * Rollback the adjustments to this point and then
+			 * rollback the semaphore ups and downs so we can return
+ * with an error with all structures restored. We
+ * rollback the undo's in the exact reverse order that
+ * we applied them. This guarantees that we won't run
+ * out of space as we roll things back out.
+ */
+ for (j = 0; j < i; j++) {
+ k = i - j - 1;
+ if ((sops[k].sem_flg & SEM_UNDO) == 0)
+ continue;
+ adjval = sops[k].sem_op;
+ if (adjval == 0)
+ continue;
+ if (semundo_adjust(td, &suptr, semid, seq,
+ sops[k].sem_num, adjval) != 0)
+ panic("semop - can't undo undos");
+ }
+
+ for (j = 0; j < nsops; j++)
+ semakptr->u.sem_base[sops[j].sem_num].semval -=
+ sops[j].sem_op;
+
+ DPRINTF(("error = %d from semundo_adjust\n", error));
+ SEMUNDO_UNLOCK();
+ goto done2;
+ } /* loop through the sops */
+ SEMUNDO_UNLOCK();
+ } /* if (do_undos) */
+
+ /* We're definitely done - set the sempid's and time */
+ for (i = 0; i < nsops; i++) {
+ sopptr = &sops[i];
+ semptr = &semakptr->u.sem_base[sopptr->sem_num];
+ semptr->sempid = td->td_proc->p_pid;
+ }
+ semakptr->u.sem_otime = time_second;
+
+ /*
+ * Do a wakeup if any semaphore was up'd whilst something was
+ * sleeping on it.
+ */
+ if (do_wakeup) {
+ DPRINTF(("semop: doing wakeup\n"));
+ wakeup(semakptr);
+ DPRINTF(("semop: back from wakeup\n"));
+ }
+ DPRINTF(("semop: done\n"));
+ td->td_retval[0] = 0;
+done2:
+ mtx_unlock(sema_mtxp);
+ if (sops != small_sops)
+		free(sops, M_TEMP);
+ return (error);
+}
+
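+/*
+ * Usage sketch from userland (illustrative only; assumes a set id from
+ * semget()): a P/V pair on semaphore 0.  SEM_UNDO registers an adjustment
+ * that semexit_myhook() below rolls back if the process exits while
+ * holding the semaphore.
+ *
+ *	struct sembuf p = { 0, -1, SEM_UNDO };
+ *	struct sembuf v = { 0,  1, SEM_UNDO };
+ *
+ *	if (semop(id, &p, 1) == -1)
+ *		err(1, "semop");
+ *	... critical section ...
+ *	if (semop(id, &v, 1) == -1)
+ *		err(1, "semop");
+ */
+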
+/*
+ * Go through the undo structures for this process and apply the adjustments to
+ * semaphores.
+ */
+static void
+semexit_myhook(void *arg, struct proc *p)
+{
+ struct sem_undo *suptr;
+ struct semid_kernel *semakptr;
+ struct mtx *sema_mtxp;
+ int semid, semnum, adjval, ix;
+ unsigned short seq;
+
+ /*
+ * Go through the chain of undo vectors looking for one
+ * associated with this process.
+ */
+ SEMUNDO_LOCK();
+ LIST_FOREACH(suptr, &semu_list, un_next) {
+ if (suptr->un_proc == p)
+ break;
+ }
+ if (suptr == NULL) {
+ SEMUNDO_UNLOCK();
+ return;
+ }
+ LIST_REMOVE(suptr, un_next);
+
+ DPRINTF(("proc @%p has undo structure with %d entries\n", p,
+ suptr->un_cnt));
+
+ /*
+ * If there are any active undo elements then process them.
+ */
+ if (suptr->un_cnt > 0) {
+ SEMUNDO_UNLOCK();
+ for (ix = 0; ix < suptr->un_cnt; ix++) {
+ semid = suptr->un_ent[ix].un_id;
+ semnum = suptr->un_ent[ix].un_num;
+ adjval = suptr->un_ent[ix].un_adjval;
+ seq = suptr->un_ent[ix].un_seq;
+ semakptr = &sema[semid];
+ sema_mtxp = &sema_mtx[semid];
+
+ mtx_lock(sema_mtxp);
+ if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
+ (semakptr->u.sem_perm.seq != seq)) {
+ mtx_unlock(sema_mtxp);
+ continue;
+ }
+ if (semnum >= semakptr->u.sem_nsems)
+ panic("semexit - semnum out of range");
+
+ DPRINTF((
+ "semexit: %p id=%d num=%d(adj=%d) ; sem=%d\n",
+ suptr->un_proc, suptr->un_ent[ix].un_id,
+ suptr->un_ent[ix].un_num,
+ suptr->un_ent[ix].un_adjval,
+ semakptr->u.sem_base[semnum].semval));
+
+ if (adjval < 0 && semakptr->u.sem_base[semnum].semval <
+ -adjval)
+ semakptr->u.sem_base[semnum].semval = 0;
+ else
+ semakptr->u.sem_base[semnum].semval += adjval;
+
+ wakeup(semakptr);
+ DPRINTF(("semexit: back from wakeup\n"));
+ mtx_unlock(sema_mtxp);
+ }
+ SEMUNDO_LOCK();
+ }
+
+ /*
+ * Deallocate the undo vector.
+ */
+ DPRINTF(("removing vector\n"));
+ suptr->un_proc = NULL;
+ suptr->un_cnt = 0;
+ LIST_INSERT_HEAD(&semu_free_list, suptr, un_next);
+ SEMUNDO_UNLOCK();
+}
+
+static int
+sysctl_sema(SYSCTL_HANDLER_ARGS)
+{
+
+ return (SYSCTL_OUT(req, sema,
+ sizeof(struct semid_kernel) * seminfo.semmni));
+}
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+
+/* XXX casting to (sy_call_t *) is bogus, as usual. */
+static sy_call_t *semcalls[] = {
+ (sy_call_t *)freebsd7___semctl, (sy_call_t *)sys_semget,
+ (sy_call_t *)sys_semop
+};
+
+/*
+ * Entry point for all SEM calls.
+ */
+int
+sys_semsys(struct thread *td, struct semsys_args *uap)
+{
+	/*
+	 * XXX actually varargs; the arguments are
+	 * { int which, a2, a3, a4, a5; }.
+	 */
+ int error;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+ if (uap->which < 0 ||
+ uap->which >= sizeof(semcalls)/sizeof(semcalls[0]))
+ return (EINVAL);
+ error = (*semcalls[uap->which])(td, &uap->a2);
+ return (error);
+}
+
+#ifndef CP
+#define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0)
+#endif
+
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd7___semctl_args {
+ int semid;
+ int semnum;
+ int cmd;
+ union semun_old *arg;
+};
+#endif
+int
+freebsd7___semctl(struct thread *td, struct freebsd7___semctl_args *uap)
+{
+ struct semid_ds_old dsold;
+ struct semid_ds dsbuf;
+ union semun_old arg;
+ union semun semun;
+ register_t rval;
+ int error;
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_SET:
+ case IPC_STAT:
+ case GETALL:
+ case SETVAL:
+ case SETALL:
+ error = copyin(uap->arg, &arg, sizeof(arg));
+ if (error)
+ return (error);
+ break;
+ }
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ semun.buf = &dsbuf;
+ break;
+ case IPC_SET:
+ error = copyin(arg.buf, &dsold, sizeof(dsold));
+ if (error)
+ return (error);
+ ipcperm_old2new(&dsold.sem_perm, &dsbuf.sem_perm);
+ CP(dsold, dsbuf, sem_base);
+ CP(dsold, dsbuf, sem_nsems);
+ CP(dsold, dsbuf, sem_otime);
+ CP(dsold, dsbuf, sem_ctime);
+ semun.buf = &dsbuf;
+ break;
+ case GETALL:
+ case SETALL:
+ semun.array = arg.array;
+ break;
+ case SETVAL:
+ semun.val = arg.val;
+ break;
+ }
+
+ error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
+ &rval);
+ if (error)
+ return (error);
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ bzero(&dsold, sizeof(dsold));
+ ipcperm_new2old(&dsbuf.sem_perm, &dsold.sem_perm);
+ CP(dsbuf, dsold, sem_base);
+ CP(dsbuf, dsold, sem_nsems);
+ CP(dsbuf, dsold, sem_otime);
+ CP(dsbuf, dsold, sem_ctime);
+ error = copyout(&dsold, arg.buf, sizeof(dsold));
+ break;
+ }
+
+ if (error == 0)
+ td->td_retval[0] = rval;
+ return (error);
+}
+
+#endif /* COMPAT_FREEBSD{4,5,6,7} */
+
+#ifdef COMPAT_FREEBSD32
+
+int
+freebsd32_semsys(struct thread *td, struct freebsd32_semsys_args *uap)
+{
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ switch (uap->which) {
+ case 0:
+ return (freebsd7_freebsd32_semctl(td,
+ (struct freebsd7_freebsd32_semctl_args *)&uap->a2));
+ default:
+ return (sys_semsys(td, (struct semsys_args *)uap));
+ }
+#else
+ return (nosys(td, NULL));
+#endif
+}
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+int
+freebsd7_freebsd32_semctl(struct thread *td,
+ struct freebsd7_freebsd32_semctl_args *uap)
+{
+ struct semid_ds32_old dsbuf32;
+ struct semid_ds dsbuf;
+ union semun semun;
+ union semun32 arg;
+ register_t rval;
+ int error;
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_SET:
+ case IPC_STAT:
+ case GETALL:
+ case SETVAL:
+ case SETALL:
+ error = copyin(uap->arg, &arg, sizeof(arg));
+ if (error)
+ return (error);
+ break;
+ }
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ semun.buf = &dsbuf;
+ break;
+ case IPC_SET:
+ error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32));
+ if (error)
+ return (error);
+ freebsd32_ipcperm_old_in(&dsbuf32.sem_perm, &dsbuf.sem_perm);
+ PTRIN_CP(dsbuf32, dsbuf, sem_base);
+ CP(dsbuf32, dsbuf, sem_nsems);
+ CP(dsbuf32, dsbuf, sem_otime);
+ CP(dsbuf32, dsbuf, sem_ctime);
+ semun.buf = &dsbuf;
+ break;
+ case GETALL:
+ case SETALL:
+ semun.array = PTRIN(arg.array);
+ break;
+ case SETVAL:
+ semun.val = arg.val;
+ break;
+ }
+
+ error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
+ &rval);
+ if (error)
+ return (error);
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ bzero(&dsbuf32, sizeof(dsbuf32));
+ freebsd32_ipcperm_old_out(&dsbuf.sem_perm, &dsbuf32.sem_perm);
+ PTROUT_CP(dsbuf, dsbuf32, sem_base);
+ CP(dsbuf, dsbuf32, sem_nsems);
+ CP(dsbuf, dsbuf32, sem_otime);
+ CP(dsbuf, dsbuf32, sem_ctime);
+ error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32));
+ break;
+ }
+
+ if (error == 0)
+ td->td_retval[0] = rval;
+ return (error);
+}
+#endif
+
+int
+freebsd32_semctl(struct thread *td, struct freebsd32_semctl_args *uap)
+{
+ struct semid_ds32 dsbuf32;
+ struct semid_ds dsbuf;
+ union semun semun;
+ union semun32 arg;
+ register_t rval;
+ int error;
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_SET:
+ case IPC_STAT:
+ case GETALL:
+ case SETVAL:
+ case SETALL:
+ error = copyin(uap->arg, &arg, sizeof(arg));
+ if (error)
+ return (error);
+ break;
+ }
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ semun.buf = &dsbuf;
+ break;
+ case IPC_SET:
+ error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32));
+ if (error)
+ return (error);
+ freebsd32_ipcperm_in(&dsbuf32.sem_perm, &dsbuf.sem_perm);
+ PTRIN_CP(dsbuf32, dsbuf, sem_base);
+ CP(dsbuf32, dsbuf, sem_nsems);
+ CP(dsbuf32, dsbuf, sem_otime);
+ CP(dsbuf32, dsbuf, sem_ctime);
+ semun.buf = &dsbuf;
+ break;
+ case GETALL:
+ case SETALL:
+ semun.array = PTRIN(arg.array);
+ break;
+ case SETVAL:
+ semun.val = arg.val;
+ break;
+ }
+
+ error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
+ &rval);
+ if (error)
+ return (error);
+
+ switch (uap->cmd) {
+ case SEM_STAT:
+ case IPC_STAT:
+ bzero(&dsbuf32, sizeof(dsbuf32));
+ freebsd32_ipcperm_out(&dsbuf.sem_perm, &dsbuf32.sem_perm);
+ PTROUT_CP(dsbuf, dsbuf32, sem_base);
+ CP(dsbuf, dsbuf32, sem_nsems);
+ CP(dsbuf, dsbuf32, sem_otime);
+ CP(dsbuf, dsbuf32, sem_ctime);
+ error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32));
+ break;
+ }
+
+ if (error == 0)
+ td->td_retval[0] = rval;
+ return (error);
+}
+
+#endif /* COMPAT_FREEBSD32 */
diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c
new file mode 100644
index 0000000..90f5d77
--- /dev/null
+++ b/sys/kern/sysv_shm.c
@@ -0,0 +1,1407 @@
+/* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */
+/*-
+ * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Adam Glass and Charles
+ * Hannum.
+ * 4. The names of the authors may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/*-
+ * Copyright (c) 2003-2005 McAfee, Inc.
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project in part by McAfee
+ * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
+ * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
+ * program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_sysvipc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/sysctl.h>
+#include <sys/shm.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/jail.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_object.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+
+FEATURE(sysv_shm, "System V shared memory segments support");
+
+static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments");
+
+static int shmget_allocate_segment(struct thread *td,
+ struct shmget_args *uap, int mode);
+static int shmget_existing(struct thread *td, struct shmget_args *uap,
+ int mode, int segnum);
+
+#define SHMSEG_FREE 0x0200
+#define SHMSEG_REMOVED 0x0400
+#define SHMSEG_ALLOCATED 0x0800
+#define SHMSEG_WANTED 0x1000
+
+static int shm_last_free, shm_nused, shmalloced;
+vm_size_t shm_committed;
+static struct shmid_kernel *shmsegs;
+
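+/*
+ * Per-process attach table (vm_shm): one entry per possible attachment,
+ * recording the attach address and the shmid; a shmid of -1 marks a
+ * free slot.
+ */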
+struct shmmap_state {
+ vm_offset_t va;
+ int shmid;
+};
+
+static void shm_deallocate_segment(struct shmid_kernel *);
+static int shm_find_segment_by_key(key_t);
+static struct shmid_kernel *shm_find_segment_by_shmid(int);
+static struct shmid_kernel *shm_find_segment_by_shmidx(int);
+static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *);
+static void shmrealloc(void);
+static int shminit(void);
+static int sysvshm_modload(struct module *, int, void *);
+static int shmunload(void);
+static void shmexit_myhook(struct vmspace *vm);
+static void shmfork_myhook(struct proc *p1, struct proc *p2);
+static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS);
+
+/*
+ * Tuneable values.
+ */
+#ifndef SHMMAXPGS
+#define SHMMAXPGS 131072 /* Note: sysv shared memory is swap backed. */
+#endif
+#ifndef SHMMAX
+#define SHMMAX (SHMMAXPGS*PAGE_SIZE)
+#endif
+#ifndef SHMMIN
+#define SHMMIN 1
+#endif
+#ifndef SHMMNI
+#define SHMMNI 192
+#endif
+#ifndef SHMSEG
+#define SHMSEG 128
+#endif
+#ifndef SHMALL
+#define SHMALL (SHMMAXPGS)
+#endif
+
+struct shminfo shminfo = {
+ SHMMAX,
+ SHMMIN,
+ SHMMNI,
+ SHMSEG,
+ SHMALL
+};
+
+static int shm_use_phys;
+static int shm_allow_removed;
+
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RW, &shminfo.shmmax, 0,
+ "Maximum shared memory segment size");
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RW, &shminfo.shmmin, 0,
+ "Minimum shared memory segment size");
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0,
+ "Number of shared memory identifiers");
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0,
+ "Number of segments per process");
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RW, &shminfo.shmall, 0,
+ "Maximum number of pages available for shared memory");
+SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RW,
+ &shm_use_phys, 0, "Enable/Disable locking of shared memory pages in core");
+SYSCTL_INT(_kern_ipc, OID_AUTO, shm_allow_removed, CTLFLAG_RW,
+ &shm_allow_removed, 0,
+ "Enable/Disable attachment to attached segments marked for removal");
+SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sysctl_shmsegs, "",
+ "Current number of shared memory segments allocated");
+
+static int
+shm_find_segment_by_key(key)
+ key_t key;
+{
+ int i;
+
+ for (i = 0; i < shmalloced; i++)
+ if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) &&
+ shmsegs[i].u.shm_perm.key == key)
+ return (i);
+ return (-1);
+}
+
+static struct shmid_kernel *
+shm_find_segment_by_shmid(int shmid)
+{
+ int segnum;
+ struct shmid_kernel *shmseg;
+
+ segnum = IPCID_TO_IX(shmid);
+ if (segnum < 0 || segnum >= shmalloced)
+ return (NULL);
+ shmseg = &shmsegs[segnum];
+ if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 ||
+ (!shm_allow_removed &&
+ (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0) ||
+ shmseg->u.shm_perm.seq != IPCID_TO_SEQ(shmid))
+ return (NULL);
+ return (shmseg);
+}
+
+static struct shmid_kernel *
+shm_find_segment_by_shmidx(int segnum)
+{
+ struct shmid_kernel *shmseg;
+
+ if (segnum < 0 || segnum >= shmalloced)
+ return (NULL);
+ shmseg = &shmsegs[segnum];
+ if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 ||
+ (!shm_allow_removed &&
+ (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0))
+ return (NULL);
+ return (shmseg);
+}
+
+static void
+shm_deallocate_segment(shmseg)
+ struct shmid_kernel *shmseg;
+{
+ vm_size_t size;
+
+ GIANT_REQUIRED;
+
+ vm_object_deallocate(shmseg->object);
+ shmseg->object = NULL;
+ size = round_page(shmseg->u.shm_segsz);
+ shm_committed -= btoc(size);
+ shm_nused--;
+ shmseg->u.shm_perm.mode = SHMSEG_FREE;
+#ifdef MAC
+ mac_sysvshm_cleanup(shmseg);
+#endif
+ racct_sub_cred(shmseg->cred, RACCT_NSHM, 1);
+ racct_sub_cred(shmseg->cred, RACCT_SHMSIZE, size);
+ crfree(shmseg->cred);
+ shmseg->cred = NULL;
+}
+
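+/*
+ * Remove a mapping from the given address space and release the segment
+ * once the last attachment of a segment marked SHMSEG_REMOVED is gone.
+ */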
+static int
+shm_delete_mapping(struct vmspace *vm, struct shmmap_state *shmmap_s)
+{
+ struct shmid_kernel *shmseg;
+ int segnum, result;
+ vm_size_t size;
+
+ GIANT_REQUIRED;
+
+ segnum = IPCID_TO_IX(shmmap_s->shmid);
+ shmseg = &shmsegs[segnum];
+ size = round_page(shmseg->u.shm_segsz);
+ result = vm_map_remove(&vm->vm_map, shmmap_s->va, shmmap_s->va + size);
+ if (result != KERN_SUCCESS)
+ return (EINVAL);
+ shmmap_s->shmid = -1;
+ shmseg->u.shm_dtime = time_second;
+ if ((--shmseg->u.shm_nattch <= 0) &&
+ (shmseg->u.shm_perm.mode & SHMSEG_REMOVED)) {
+ shm_deallocate_segment(shmseg);
+ shm_last_free = segnum;
+ }
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmdt_args {
+ const void *shmaddr;
+};
+#endif
+int
+sys_shmdt(td, uap)
+ struct thread *td;
+ struct shmdt_args *uap;
+{
+ struct proc *p = td->td_proc;
+ struct shmmap_state *shmmap_s;
+#ifdef MAC
+ struct shmid_kernel *shmsegptr;
+#endif
+ int i;
+ int error = 0;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+ mtx_lock(&Giant);
+ shmmap_s = p->p_vmspace->vm_shm;
+ if (shmmap_s == NULL) {
+ error = EINVAL;
+ goto done2;
+ }
+ for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) {
+ if (shmmap_s->shmid != -1 &&
+ shmmap_s->va == (vm_offset_t)uap->shmaddr) {
+ break;
+ }
+ }
+ if (i == shminfo.shmseg) {
+ error = EINVAL;
+ goto done2;
+ }
+#ifdef MAC
+ shmsegptr = &shmsegs[IPCID_TO_IX(shmmap_s->shmid)];
+ error = mac_sysvshm_check_shmdt(td->td_ucred, shmsegptr);
+ if (error != 0)
+ goto done2;
+#endif
+ error = shm_delete_mapping(p->p_vmspace, shmmap_s);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmat_args {
+ int shmid;
+ const void *shmaddr;
+ int shmflg;
+};
+#endif
+int
+kern_shmat(td, shmid, shmaddr, shmflg)
+ struct thread *td;
+ int shmid;
+ const void *shmaddr;
+ int shmflg;
+{
+ struct proc *p = td->td_proc;
+ int i, flags;
+ struct shmid_kernel *shmseg;
+ struct shmmap_state *shmmap_s = NULL;
+ vm_offset_t attach_va;
+ vm_prot_t prot;
+ vm_size_t size;
+ int rv;
+ int error = 0;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+ mtx_lock(&Giant);
+ shmmap_s = p->p_vmspace->vm_shm;
+ if (shmmap_s == NULL) {
+ shmmap_s = malloc(shminfo.shmseg * sizeof(struct shmmap_state),
+ M_SHM, M_WAITOK);
+ for (i = 0; i < shminfo.shmseg; i++)
+ shmmap_s[i].shmid = -1;
+ p->p_vmspace->vm_shm = shmmap_s;
+ }
+ shmseg = shm_find_segment_by_shmid(shmid);
+ if (shmseg == NULL) {
+ error = EINVAL;
+ goto done2;
+ }
+ error = ipcperm(td, &shmseg->u.shm_perm,
+ (shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W);
+ if (error)
+ goto done2;
+#ifdef MAC
+ error = mac_sysvshm_check_shmat(td->td_ucred, shmseg, shmflg);
+ if (error != 0)
+ goto done2;
+#endif
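+	/* Find a free slot in the per-process attach table. */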
+ for (i = 0; i < shminfo.shmseg; i++) {
+ if (shmmap_s->shmid == -1)
+ break;
+ shmmap_s++;
+ }
+ if (i >= shminfo.shmseg) {
+ error = EMFILE;
+ goto done2;
+ }
+ size = round_page(shmseg->u.shm_segsz);
+ prot = VM_PROT_READ;
+ if ((shmflg & SHM_RDONLY) == 0)
+ prot |= VM_PROT_WRITE;
+ flags = MAP_ANON | MAP_SHARED;
+ if (shmaddr) {
+ flags |= MAP_FIXED;
+ if (shmflg & SHM_RND) {
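+			/* Round the address down to an SHMLBA boundary. */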
+ attach_va = (vm_offset_t)shmaddr & ~(SHMLBA-1);
+ } else if (((vm_offset_t)shmaddr & (SHMLBA-1)) == 0) {
+ attach_va = (vm_offset_t)shmaddr;
+ } else {
+ error = EINVAL;
+ goto done2;
+ }
+ } else {
+ /*
+ * This is just a hint to vm_map_find() about where to
+ * put it.
+ */
+ PROC_LOCK(p);
+ attach_va = round_page((vm_offset_t)p->p_vmspace->vm_daddr +
+ lim_max(p, RLIMIT_DATA));
+ PROC_UNLOCK(p);
+ }
+
+ vm_object_reference(shmseg->object);
+ rv = vm_map_find(&p->p_vmspace->vm_map, shmseg->object,
+ 0, &attach_va, size, (flags & MAP_FIXED) ? VMFS_NO_SPACE :
+ VMFS_OPTIMAL_SPACE, prot, prot, MAP_INHERIT_SHARE);
+ if (rv != KERN_SUCCESS) {
+ vm_object_deallocate(shmseg->object);
+ error = ENOMEM;
+ goto done2;
+ }
+
+ shmmap_s->va = attach_va;
+ shmmap_s->shmid = shmid;
+ shmseg->u.shm_lpid = p->p_pid;
+ shmseg->u.shm_atime = time_second;
+ shmseg->u.shm_nattch++;
+ td->td_retval[0] = attach_va;
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+int
+sys_shmat(td, uap)
+ struct thread *td;
+ struct shmat_args *uap;
+{
+ return kern_shmat(td, uap->shmid, uap->shmaddr, uap->shmflg);
+}
+
+int
+kern_shmctl(td, shmid, cmd, buf, bufsz)
+ struct thread *td;
+ int shmid;
+ int cmd;
+ void *buf;
+ size_t *bufsz;
+{
+ int error = 0;
+ struct shmid_kernel *shmseg;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+
+ mtx_lock(&Giant);
+ switch (cmd) {
+ /*
+ * It is possible that kern_shmctl is being called from the Linux ABI
+ * layer, in which case, we will need to implement IPC_INFO. It should
+ * be noted that other shmctl calls will be funneled through here for
+	 * Linux binaries as well.
+ *
+ * NB: The Linux ABI layer will convert this data to structure(s) more
+ * consistent with the Linux ABI.
+ */
+ case IPC_INFO:
+ memcpy(buf, &shminfo, sizeof(shminfo));
+ if (bufsz)
+ *bufsz = sizeof(shminfo);
+ td->td_retval[0] = shmalloced;
+ goto done2;
+ case SHM_INFO: {
+ struct shm_info shm_info;
+ shm_info.used_ids = shm_nused;
+ shm_info.shm_rss = 0; /*XXX where to get from ? */
+ shm_info.shm_tot = 0; /*XXX where to get from ? */
+ shm_info.shm_swp = 0; /*XXX where to get from ? */
+ shm_info.swap_attempts = 0; /*XXX where to get from ? */
+ shm_info.swap_successes = 0; /*XXX where to get from ? */
+ memcpy(buf, &shm_info, sizeof(shm_info));
+ if (bufsz)
+ *bufsz = sizeof(shm_info);
+ td->td_retval[0] = shmalloced;
+ goto done2;
+ }
+ }
+ if (cmd == SHM_STAT)
+ shmseg = shm_find_segment_by_shmidx(shmid);
+ else
+ shmseg = shm_find_segment_by_shmid(shmid);
+ if (shmseg == NULL) {
+ error = EINVAL;
+ goto done2;
+ }
+#ifdef MAC
+ error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, cmd);
+ if (error != 0)
+ goto done2;
+#endif
+ switch (cmd) {
+ case SHM_STAT:
+ case IPC_STAT:
+ error = ipcperm(td, &shmseg->u.shm_perm, IPC_R);
+ if (error)
+ goto done2;
+ memcpy(buf, &shmseg->u, sizeof(struct shmid_ds));
+ if (bufsz)
+ *bufsz = sizeof(struct shmid_ds);
+ if (cmd == SHM_STAT)
+ td->td_retval[0] = IXSEQ_TO_IPCID(shmid, shmseg->u.shm_perm);
+ break;
+ case IPC_SET: {
+ struct shmid_ds *shmid;
+
+ shmid = (struct shmid_ds *)buf;
+ error = ipcperm(td, &shmseg->u.shm_perm, IPC_M);
+ if (error)
+ goto done2;
+ shmseg->u.shm_perm.uid = shmid->shm_perm.uid;
+ shmseg->u.shm_perm.gid = shmid->shm_perm.gid;
+ shmseg->u.shm_perm.mode =
+ (shmseg->u.shm_perm.mode & ~ACCESSPERMS) |
+ (shmid->shm_perm.mode & ACCESSPERMS);
+ shmseg->u.shm_ctime = time_second;
+ break;
+ }
+ case IPC_RMID:
+ error = ipcperm(td, &shmseg->u.shm_perm, IPC_M);
+ if (error)
+ goto done2;
+ shmseg->u.shm_perm.key = IPC_PRIVATE;
+ shmseg->u.shm_perm.mode |= SHMSEG_REMOVED;
+ if (shmseg->u.shm_nattch <= 0) {
+ shm_deallocate_segment(shmseg);
+ shm_last_free = IPCID_TO_IX(shmid);
+ }
+ break;
+#if 0
+ case SHM_LOCK:
+ case SHM_UNLOCK:
+#endif
+ default:
+ error = EINVAL;
+ break;
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmctl_args {
+ int shmid;
+ int cmd;
+ struct shmid_ds *buf;
+};
+#endif
+int
+sys_shmctl(td, uap)
+ struct thread *td;
+ struct shmctl_args *uap;
+{
+ int error = 0;
+ struct shmid_ds buf;
+ size_t bufsz;
+
+ /*
+	 * The only reason IPC_INFO, SHM_INFO and SHM_STAT exist is to support
+	 * Linux binaries.  If we see the call come through the FreeBSD ABI,
+	 * return an error back to the user since we do not support them.
+ */
+ if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO ||
+ uap->cmd == SHM_STAT)
+ return (EINVAL);
+
+ /* IPC_SET needs to copyin the buffer before calling kern_shmctl */
+ if (uap->cmd == IPC_SET) {
+ if ((error = copyin(uap->buf, &buf, sizeof(struct shmid_ds))))
+ goto done;
+ }
+
+ error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz);
+ if (error)
+ goto done;
+
+ /* Cases in which we need to copyout */
+ switch (uap->cmd) {
+ case IPC_STAT:
+ error = copyout(&buf, uap->buf, bufsz);
+ break;
+ }
+
+done:
+ if (error) {
+ /* Invalidate the return value */
+ td->td_retval[0] = -1;
+ }
+ return (error);
+}
+
+
+static int
+shmget_existing(td, uap, mode, segnum)
+ struct thread *td;
+ struct shmget_args *uap;
+ int mode;
+ int segnum;
+{
+ struct shmid_kernel *shmseg;
+ int error;
+
+ shmseg = &shmsegs[segnum];
+ if (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) {
+ /*
+ * This segment is in the process of being allocated. Wait
+ * until it's done, and look the key up again (in case the
+ * allocation failed or it was freed).
+ */
+ shmseg->u.shm_perm.mode |= SHMSEG_WANTED;
+ error = tsleep(shmseg, PLOCK | PCATCH, "shmget", 0);
+ if (error)
+ return (error);
+ return (EAGAIN);
+ }
+ if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL))
+ return (EEXIST);
+#ifdef MAC
+ error = mac_sysvshm_check_shmget(td->td_ucred, shmseg, uap->shmflg);
+ if (error != 0)
+ return (error);
+#endif
+ if (uap->size != 0 && uap->size > shmseg->u.shm_segsz)
+ return (EINVAL);
+ td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
+ return (0);
+}
+
+static int
+shmget_allocate_segment(td, uap, mode)
+ struct thread *td;
+ struct shmget_args *uap;
+ int mode;
+{
+ int i, segnum, shmid;
+ size_t size;
+ struct ucred *cred = td->td_ucred;
+ struct shmid_kernel *shmseg;
+ vm_object_t shm_object;
+
+ GIANT_REQUIRED;
+
+ if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax)
+ return (EINVAL);
+ if (shm_nused >= shminfo.shmmni) /* Any shmids left? */
+ return (ENOSPC);
+ size = round_page(uap->size);
+ if (shm_committed + btoc(size) > shminfo.shmall)
+ return (ENOMEM);
+ if (shm_last_free < 0) {
+ shmrealloc(); /* Maybe expand the shmsegs[] array. */
+ for (i = 0; i < shmalloced; i++)
+ if (shmsegs[i].u.shm_perm.mode & SHMSEG_FREE)
+ break;
+ if (i == shmalloced)
+ return (ENOSPC);
+ segnum = i;
+ } else {
+ segnum = shm_last_free;
+ shm_last_free = -1;
+ }
+ shmseg = &shmsegs[segnum];
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ if (racct_add(td->td_proc, RACCT_NSHM, 1)) {
+ PROC_UNLOCK(td->td_proc);
+ return (ENOSPC);
+ }
+ if (racct_add(td->td_proc, RACCT_SHMSIZE, size)) {
+ racct_sub(td->td_proc, RACCT_NSHM, 1);
+ PROC_UNLOCK(td->td_proc);
+ return (ENOMEM);
+ }
+ PROC_UNLOCK(td->td_proc);
+#endif
+ /*
+ * In case we sleep in malloc(), mark the segment present but deleted
+	 * so that no one else tries to create the same key.
+ */
+ shmseg->u.shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED;
+ shmseg->u.shm_perm.key = uap->key;
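+	/*
+	 * Advance the 15-bit sequence number so the IPC id constructed
+	 * below differs from any id previously handed out for this slot;
+	 * stale shmids then fail the seq check in
+	 * shm_find_segment_by_shmid().
+	 */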
+ shmseg->u.shm_perm.seq = (shmseg->u.shm_perm.seq + 1) & 0x7fff;
+ shmid = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
+
+ /*
+	 * Make sure the backing VM object (pager) is allocated before
+	 * we need it.
+ */
+ shm_object = vm_pager_allocate(shm_use_phys ? OBJT_PHYS : OBJT_SWAP,
+ 0, size, VM_PROT_DEFAULT, 0, cred);
+ if (shm_object == NULL) {
+#ifdef RACCT
+ PROC_LOCK(td->td_proc);
+ racct_sub(td->td_proc, RACCT_NSHM, 1);
+ racct_sub(td->td_proc, RACCT_SHMSIZE, size);
+ PROC_UNLOCK(td->td_proc);
+#endif
+ return (ENOMEM);
+ }
+ VM_OBJECT_WLOCK(shm_object);
+ vm_object_clear_flag(shm_object, OBJ_ONEMAPPING);
+ vm_object_set_flag(shm_object, OBJ_NOSPLIT);
+ VM_OBJECT_WUNLOCK(shm_object);
+
+ shmseg->object = shm_object;
+ shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = cred->cr_uid;
+ shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid;
+ shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & SHMSEG_WANTED) |
+ (mode & ACCESSPERMS) | SHMSEG_ALLOCATED;
+ shmseg->cred = crhold(cred);
+ shmseg->u.shm_segsz = uap->size;
+ shmseg->u.shm_cpid = td->td_proc->p_pid;
+ shmseg->u.shm_lpid = shmseg->u.shm_nattch = 0;
+ shmseg->u.shm_atime = shmseg->u.shm_dtime = 0;
+#ifdef MAC
+ mac_sysvshm_create(cred, shmseg);
+#endif
+ shmseg->u.shm_ctime = time_second;
+ shm_committed += btoc(size);
+ shm_nused++;
+ if (shmseg->u.shm_perm.mode & SHMSEG_WANTED) {
+ /*
+ * Somebody else wanted this key while we were asleep. Wake
+ * them up now.
+ */
+ shmseg->u.shm_perm.mode &= ~SHMSEG_WANTED;
+ wakeup(shmseg);
+ }
+ td->td_retval[0] = shmid;
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmget_args {
+ key_t key;
+ size_t size;
+ int shmflg;
+};
+#endif
+int
+sys_shmget(td, uap)
+ struct thread *td;
+ struct shmget_args *uap;
+{
+ int segnum, mode;
+ int error;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+ mtx_lock(&Giant);
+ mode = uap->shmflg & ACCESSPERMS;
+ if (uap->key != IPC_PRIVATE) {
+ again:
+ segnum = shm_find_segment_by_key(uap->key);
+ if (segnum >= 0) {
+ error = shmget_existing(td, uap, mode, segnum);
+ if (error == EAGAIN)
+ goto again;
+ goto done2;
+ }
+ if ((uap->shmflg & IPC_CREAT) == 0) {
+ error = ENOENT;
+ goto done2;
+ }
+ }
+ error = shmget_allocate_segment(td, uap, mode);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+static void
+shmfork_myhook(p1, p2)
+ struct proc *p1, *p2;
+{
+ struct shmmap_state *shmmap_s;
+ size_t size;
+ int i;
+
+ mtx_lock(&Giant);
+ size = shminfo.shmseg * sizeof(struct shmmap_state);
+ shmmap_s = malloc(size, M_SHM, M_WAITOK);
+ bcopy(p1->p_vmspace->vm_shm, shmmap_s, size);
+ p2->p_vmspace->vm_shm = shmmap_s;
+ for (i = 0; i < shminfo.shmseg; i++, shmmap_s++)
+ if (shmmap_s->shmid != -1)
+ shmsegs[IPCID_TO_IX(shmmap_s->shmid)].u.shm_nattch++;
+ mtx_unlock(&Giant);
+}
+
+static void
+shmexit_myhook(struct vmspace *vm)
+{
+ struct shmmap_state *base, *shm;
+ int i;
+
+ if ((base = vm->vm_shm) != NULL) {
+ vm->vm_shm = NULL;
+ mtx_lock(&Giant);
+ for (i = 0, shm = base; i < shminfo.shmseg; i++, shm++) {
+ if (shm->shmid != -1)
+ shm_delete_mapping(vm, shm);
+ }
+ mtx_unlock(&Giant);
+ free(base, M_SHM);
+ }
+}
+
+static void
+shmrealloc(void)
+{
+ int i;
+ struct shmid_kernel *newsegs;
+
+ if (shmalloced >= shminfo.shmmni)
+ return;
+
+ newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM, M_WAITOK);
+ if (newsegs == NULL)
+ return;
+ for (i = 0; i < shmalloced; i++)
+ bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0]));
+ for (; i < shminfo.shmmni; i++) {
+		newsegs[i].u.shm_perm.mode = SHMSEG_FREE;
+		newsegs[i].u.shm_perm.seq = 0;
+#ifdef MAC
+		mac_sysvshm_init(&newsegs[i]);
+#endif
+ }
+ free(shmsegs, M_SHM);
+ shmsegs = newsegs;
+ shmalloced = shminfo.shmmni;
+}
+
+static struct syscall_helper_data shm_syscalls[] = {
+ SYSCALL_INIT_HELPER(shmat),
+ SYSCALL_INIT_HELPER(shmctl),
+ SYSCALL_INIT_HELPER(shmdt),
+ SYSCALL_INIT_HELPER(shmget),
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ SYSCALL_INIT_HELPER_COMPAT(freebsd7_shmctl),
+#endif
+#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
+ SYSCALL_INIT_HELPER(shmsys),
+#endif
+ SYSCALL_INIT_LAST
+};
+
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_ipc.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/freebsd32/freebsd32_util.h>
+
+static struct syscall_helper_data shm32_syscalls[] = {
+ SYSCALL32_INIT_HELPER_COMPAT(shmat),
+ SYSCALL32_INIT_HELPER_COMPAT(shmdt),
+ SYSCALL32_INIT_HELPER_COMPAT(shmget),
+ SYSCALL32_INIT_HELPER(freebsd32_shmsys),
+ SYSCALL32_INIT_HELPER(freebsd32_shmctl),
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ SYSCALL32_INIT_HELPER(freebsd7_freebsd32_shmctl),
+#endif
+ SYSCALL_INIT_LAST
+};
+#endif
+
+static int
+shminit()
+{
+ int i, error;
+
+#ifndef BURN_BRIDGES
+ if (TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall) != 0)
+ printf("kern.ipc.shmmaxpgs is now called kern.ipc.shmall!\n");
+#endif
+ TUNABLE_ULONG_FETCH("kern.ipc.shmall", &shminfo.shmall);
+ if (!TUNABLE_ULONG_FETCH("kern.ipc.shmmax", &shminfo.shmmax)) {
+ /* Initialize shmmax dealing with possible overflow. */
+ for (i = PAGE_SIZE; i > 0; i--) {
+ shminfo.shmmax = shminfo.shmall * i;
+ if (shminfo.shmmax >= shminfo.shmall)
+ break;
+ }
+ }
+ TUNABLE_ULONG_FETCH("kern.ipc.shmmin", &shminfo.shmmin);
+ TUNABLE_ULONG_FETCH("kern.ipc.shmmni", &shminfo.shmmni);
+ TUNABLE_ULONG_FETCH("kern.ipc.shmseg", &shminfo.shmseg);
+ TUNABLE_INT_FETCH("kern.ipc.shm_use_phys", &shm_use_phys);
+
+ shmalloced = shminfo.shmmni;
+ shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM, M_WAITOK);
+ for (i = 0; i < shmalloced; i++) {
+ shmsegs[i].u.shm_perm.mode = SHMSEG_FREE;
+ shmsegs[i].u.shm_perm.seq = 0;
+#ifdef MAC
+ mac_sysvshm_init(&shmsegs[i]);
+#endif
+ }
+ shm_last_free = 0;
+ shm_nused = 0;
+ shm_committed = 0;
+ shmexit_hook = &shmexit_myhook;
+ shmfork_hook = &shmfork_myhook;
+
+ error = syscall_helper_register(shm_syscalls);
+ if (error != 0)
+ return (error);
+#ifdef COMPAT_FREEBSD32
+ error = syscall32_helper_register(shm32_syscalls);
+ if (error != 0)
+ return (error);
+#endif
+ return (0);
+}
+
+static int
+shmunload()
+{
+ int i;
+
+ if (shm_nused > 0)
+ return (EBUSY);
+
+#ifdef COMPAT_FREEBSD32
+ syscall32_helper_unregister(shm32_syscalls);
+#endif
+ syscall_helper_unregister(shm_syscalls);
+
+ for (i = 0; i < shmalloced; i++) {
+#ifdef MAC
+ mac_sysvshm_destroy(&shmsegs[i]);
+#endif
+ /*
+		 * Objects might still be mapped into the processes'
+		 * address spaces.  The actual free happens when the
+		 * last mapping is destroyed.
+ */
+ if (shmsegs[i].u.shm_perm.mode != SHMSEG_FREE)
+ vm_object_deallocate(shmsegs[i].object);
+ }
+ free(shmsegs, M_SHM);
+ shmexit_hook = NULL;
+ shmfork_hook = NULL;
+ return (0);
+}
+
+static int
+sysctl_shmsegs(SYSCTL_HANDLER_ARGS)
+{
+
+ return (SYSCTL_OUT(req, shmsegs, shmalloced * sizeof(shmsegs[0])));
+}
+
+#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
+struct oshmid_ds {
+ struct ipc_perm_old shm_perm; /* operation perms */
+ int shm_segsz; /* size of segment (bytes) */
+ u_short shm_cpid; /* pid, creator */
+ u_short shm_lpid; /* pid, last operation */
+ short shm_nattch; /* no. of current attaches */
+ time_t shm_atime; /* last attach time */
+ time_t shm_dtime; /* last detach time */
+ time_t shm_ctime; /* last change time */
+ void *shm_handle; /* internal handle for shm segment */
+};
+
+struct oshmctl_args {
+ int shmid;
+ int cmd;
+ struct oshmid_ds *ubuf;
+};
+
+static int
+oshmctl(struct thread *td, struct oshmctl_args *uap)
+{
+#ifdef COMPAT_43
+ int error = 0;
+ struct shmid_kernel *shmseg;
+ struct oshmid_ds outbuf;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+ mtx_lock(&Giant);
+ shmseg = shm_find_segment_by_shmid(uap->shmid);
+ if (shmseg == NULL) {
+ error = EINVAL;
+ goto done2;
+ }
+ switch (uap->cmd) {
+ case IPC_STAT:
+ error = ipcperm(td, &shmseg->u.shm_perm, IPC_R);
+ if (error)
+ goto done2;
+#ifdef MAC
+ error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, uap->cmd);
+ if (error != 0)
+ goto done2;
+#endif
+ ipcperm_new2old(&shmseg->u.shm_perm, &outbuf.shm_perm);
+ outbuf.shm_segsz = shmseg->u.shm_segsz;
+ outbuf.shm_cpid = shmseg->u.shm_cpid;
+ outbuf.shm_lpid = shmseg->u.shm_lpid;
+ outbuf.shm_nattch = shmseg->u.shm_nattch;
+ outbuf.shm_atime = shmseg->u.shm_atime;
+ outbuf.shm_dtime = shmseg->u.shm_dtime;
+ outbuf.shm_ctime = shmseg->u.shm_ctime;
+ outbuf.shm_handle = shmseg->object;
+ error = copyout(&outbuf, uap->ubuf, sizeof(outbuf));
+ if (error)
+ goto done2;
+ break;
+ default:
+ error = freebsd7_shmctl(td, (struct freebsd7_shmctl_args *)uap);
+ break;
+ }
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+#else
+ return (EINVAL);
+#endif
+}
+
+/* XXX casting to (sy_call_t *) is bogus, as usual. */
+static sy_call_t *shmcalls[] = {
+ (sy_call_t *)sys_shmat, (sy_call_t *)oshmctl,
+ (sy_call_t *)sys_shmdt, (sy_call_t *)sys_shmget,
+ (sy_call_t *)freebsd7_shmctl
+};
+
+int
+sys_shmsys(td, uap)
+ struct thread *td;
+ /* XXX actually varargs. */
+ struct shmsys_args /* {
+ int which;
+ int a2;
+ int a3;
+ int a4;
+ } */ *uap;
+{
+ int error;
+
+ if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+ return (ENOSYS);
+ if (uap->which < 0 ||
+ uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0]))
+ return (EINVAL);
+ mtx_lock(&Giant);
+ error = (*shmcalls[uap->which])(td, &uap->a2);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+#endif /* i386 && (COMPAT_FREEBSD4 || COMPAT_43) */
+
+#ifdef COMPAT_FREEBSD32
+
+int
+freebsd32_shmsys(struct thread *td, struct freebsd32_shmsys_args *uap)
+{
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+ switch (uap->which) {
+ case 0: { /* shmat */
+ struct shmat_args ap;
+
+ ap.shmid = uap->a2;
+ ap.shmaddr = PTRIN(uap->a3);
+ ap.shmflg = uap->a4;
+ return (sysent[SYS_shmat].sy_call(td, &ap));
+ }
+ case 2: { /* shmdt */
+ struct shmdt_args ap;
+
+ ap.shmaddr = PTRIN(uap->a2);
+ return (sysent[SYS_shmdt].sy_call(td, &ap));
+ }
+ case 3: { /* shmget */
+ struct shmget_args ap;
+
+ ap.key = uap->a2;
+ ap.size = uap->a3;
+ ap.shmflg = uap->a4;
+ return (sysent[SYS_shmget].sy_call(td, &ap));
+ }
+ case 4: { /* shmctl */
+ struct freebsd7_freebsd32_shmctl_args ap;
+
+ ap.shmid = uap->a2;
+ ap.cmd = uap->a3;
+ ap.buf = PTRIN(uap->a4);
+ return (freebsd7_freebsd32_shmctl(td, &ap));
+ }
+ case 1: /* oshmctl */
+ default:
+ return (EINVAL);
+ }
+#else
+ return (nosys(td, NULL));
+#endif
+}
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+int
+freebsd7_freebsd32_shmctl(struct thread *td,
+ struct freebsd7_freebsd32_shmctl_args *uap)
+{
+ int error = 0;
+ union {
+ struct shmid_ds shmid_ds;
+ struct shm_info shm_info;
+ struct shminfo shminfo;
+ } u;
+ union {
+ struct shmid_ds32_old shmid_ds32;
+ struct shm_info32 shm_info32;
+ struct shminfo32 shminfo32;
+ } u32;
+ size_t sz;
+
+ if (uap->cmd == IPC_SET) {
+ if ((error = copyin(uap->buf, &u32.shmid_ds32,
+ sizeof(u32.shmid_ds32))))
+ goto done;
+ freebsd32_ipcperm_old_in(&u32.shmid_ds32.shm_perm,
+ &u.shmid_ds.shm_perm);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_segsz);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_lpid);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_cpid);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_nattch);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_atime);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_dtime);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_ctime);
+ }
+
+ error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz);
+ if (error)
+ goto done;
+
+ /* Cases in which we need to copyout */
+ switch (uap->cmd) {
+ case IPC_INFO:
+ CP(u.shminfo, u32.shminfo32, shmmax);
+ CP(u.shminfo, u32.shminfo32, shmmin);
+ CP(u.shminfo, u32.shminfo32, shmmni);
+ CP(u.shminfo, u32.shminfo32, shmseg);
+ CP(u.shminfo, u32.shminfo32, shmall);
+ error = copyout(&u32.shminfo32, uap->buf,
+ sizeof(u32.shminfo32));
+ break;
+ case SHM_INFO:
+ CP(u.shm_info, u32.shm_info32, used_ids);
+ CP(u.shm_info, u32.shm_info32, shm_rss);
+ CP(u.shm_info, u32.shm_info32, shm_tot);
+ CP(u.shm_info, u32.shm_info32, shm_swp);
+ CP(u.shm_info, u32.shm_info32, swap_attempts);
+ CP(u.shm_info, u32.shm_info32, swap_successes);
+ error = copyout(&u32.shm_info32, uap->buf,
+ sizeof(u32.shm_info32));
+ break;
+ case SHM_STAT:
+ case IPC_STAT:
+ freebsd32_ipcperm_old_out(&u.shmid_ds.shm_perm,
+ &u32.shmid_ds32.shm_perm);
+ if (u.shmid_ds.shm_segsz > INT32_MAX)
+ u32.shmid_ds32.shm_segsz = INT32_MAX;
+ else
+ CP(u.shmid_ds, u32.shmid_ds32, shm_segsz);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_lpid);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_cpid);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_nattch);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_atime);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_dtime);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_ctime);
+ u32.shmid_ds32.shm_internal = 0;
+ error = copyout(&u32.shmid_ds32, uap->buf,
+ sizeof(u32.shmid_ds32));
+ break;
+ }
+
+done:
+ if (error) {
+ /* Invalidate the return value */
+ td->td_retval[0] = -1;
+ }
+ return (error);
+}
+#endif
+
+int
+freebsd32_shmctl(struct thread *td, struct freebsd32_shmctl_args *uap)
+{
+ int error = 0;
+ union {
+ struct shmid_ds shmid_ds;
+ struct shm_info shm_info;
+ struct shminfo shminfo;
+ } u;
+ union {
+ struct shmid_ds32 shmid_ds32;
+ struct shm_info32 shm_info32;
+ struct shminfo32 shminfo32;
+ } u32;
+ size_t sz;
+
+ if (uap->cmd == IPC_SET) {
+ if ((error = copyin(uap->buf, &u32.shmid_ds32,
+ sizeof(u32.shmid_ds32))))
+ goto done;
+ freebsd32_ipcperm_in(&u32.shmid_ds32.shm_perm,
+ &u.shmid_ds.shm_perm);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_segsz);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_lpid);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_cpid);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_nattch);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_atime);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_dtime);
+ CP(u32.shmid_ds32, u.shmid_ds, shm_ctime);
+ }
+
+ error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz);
+ if (error)
+ goto done;
+
+ /* Cases in which we need to copyout */
+ switch (uap->cmd) {
+ case IPC_INFO:
+ CP(u.shminfo, u32.shminfo32, shmmax);
+ CP(u.shminfo, u32.shminfo32, shmmin);
+ CP(u.shminfo, u32.shminfo32, shmmni);
+ CP(u.shminfo, u32.shminfo32, shmseg);
+ CP(u.shminfo, u32.shminfo32, shmall);
+ error = copyout(&u32.shminfo32, uap->buf,
+ sizeof(u32.shminfo32));
+ break;
+ case SHM_INFO:
+ CP(u.shm_info, u32.shm_info32, used_ids);
+ CP(u.shm_info, u32.shm_info32, shm_rss);
+ CP(u.shm_info, u32.shm_info32, shm_tot);
+ CP(u.shm_info, u32.shm_info32, shm_swp);
+ CP(u.shm_info, u32.shm_info32, swap_attempts);
+ CP(u.shm_info, u32.shm_info32, swap_successes);
+ error = copyout(&u32.shm_info32, uap->buf,
+ sizeof(u32.shm_info32));
+ break;
+ case SHM_STAT:
+ case IPC_STAT:
+ freebsd32_ipcperm_out(&u.shmid_ds.shm_perm,
+ &u32.shmid_ds32.shm_perm);
+ if (u.shmid_ds.shm_segsz > INT32_MAX)
+ u32.shmid_ds32.shm_segsz = INT32_MAX;
+ else
+ CP(u.shmid_ds, u32.shmid_ds32, shm_segsz);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_lpid);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_cpid);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_nattch);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_atime);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_dtime);
+ CP(u.shmid_ds, u32.shmid_ds32, shm_ctime);
+ error = copyout(&u32.shmid_ds32, uap->buf,
+ sizeof(u32.shmid_ds32));
+ break;
+ }
+
+done:
+ if (error) {
+ /* Invalidate the return value */
+ td->td_retval[0] = -1;
+ }
+ return (error);
+}
+#endif
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+
+#ifndef CP
+#define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0)
+#endif
+
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd7_shmctl_args {
+ int shmid;
+ int cmd;
+ struct shmid_ds_old *buf;
+};
+#endif
+int
+freebsd7_shmctl(td, uap)
+ struct thread *td;
+ struct freebsd7_shmctl_args *uap;
+{
+ int error = 0;
+ struct shmid_ds_old old;
+ struct shmid_ds buf;
+ size_t bufsz;
+
+ /*
+	 * The only reason IPC_INFO, SHM_INFO and SHM_STAT exist is to support
+	 * Linux binaries.  If we see the call come through the FreeBSD ABI,
+	 * return an error back to the user since we do not support them.
+ */
+ if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO ||
+ uap->cmd == SHM_STAT)
+ return (EINVAL);
+
+ /* IPC_SET needs to copyin the buffer before calling kern_shmctl */
+ if (uap->cmd == IPC_SET) {
+ if ((error = copyin(uap->buf, &old, sizeof(old))))
+ goto done;
+ ipcperm_old2new(&old.shm_perm, &buf.shm_perm);
+ CP(old, buf, shm_segsz);
+ CP(old, buf, shm_lpid);
+ CP(old, buf, shm_cpid);
+ CP(old, buf, shm_nattch);
+ CP(old, buf, shm_atime);
+ CP(old, buf, shm_dtime);
+ CP(old, buf, shm_ctime);
+ }
+
+ error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz);
+ if (error)
+ goto done;
+
+ /* Cases in which we need to copyout */
+ switch (uap->cmd) {
+ case IPC_STAT:
+ ipcperm_new2old(&buf.shm_perm, &old.shm_perm);
+ if (buf.shm_segsz > INT_MAX)
+ old.shm_segsz = INT_MAX;
+ else
+ CP(buf, old, shm_segsz);
+ CP(buf, old, shm_lpid);
+ CP(buf, old, shm_cpid);
+ if (buf.shm_nattch > SHRT_MAX)
+ old.shm_nattch = SHRT_MAX;
+ else
+ CP(buf, old, shm_nattch);
+ CP(buf, old, shm_atime);
+ CP(buf, old, shm_dtime);
+ CP(buf, old, shm_ctime);
+ old.shm_internal = NULL;
+ error = copyout(&old, uap->buf, sizeof(old));
+ break;
+ }
+
+done:
+ if (error) {
+ /* Invalidate the return value */
+ td->td_retval[0] = -1;
+ }
+ return (error);
+}
+
+#endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 ||
+ COMPAT_FREEBSD7 */
+
+static int
+sysvshm_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ error = shminit();
+ if (error != 0)
+ shmunload();
+ break;
+ case MOD_UNLOAD:
+ error = shmunload();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t sysvshm_mod = {
+ "sysvshm",
+ &sysvshm_modload,
+ NULL
+};
+
+DECLARE_MODULE(sysvshm, sysvshm_mod, SI_SUB_SYSV_SHM, SI_ORDER_FIRST);
+MODULE_VERSION(sysvshm, 1);
diff --git a/sys/kern/tty.c b/sys/kern/tty.c
new file mode 100644
index 0000000..4fce607
--- /dev/null
+++ b/sys/kern/tty.c
@@ -0,0 +1,2209 @@
+/*-
+ * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions of this software were developed under sponsorship from Snow
+ * B.V., the Netherlands.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/conf.h>
+#include <sys/cons.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#ifdef COMPAT_43TTY
+#include <sys/ioctl_compat.h>
+#endif /* COMPAT_43TTY */
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/poll.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/serial.h>
+#include <sys/signal.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/tty.h>
+#include <sys/ttycom.h>
+#define TTYDEFCHARS
+#include <sys/ttydefaults.h>
+#undef TTYDEFCHARS
+#include <sys/ucred.h>
+#include <sys/vnode.h>
+
+#include <machine/stdarg.h>
+
+static MALLOC_DEFINE(M_TTY, "tty", "tty device");
+
+static void tty_rel_free(struct tty *tp);
+
+static TAILQ_HEAD(, tty) tty_list = TAILQ_HEAD_INITIALIZER(tty_list);
+static struct sx tty_list_sx;
+SX_SYSINIT(tty_list, &tty_list_sx, "tty list");
+static unsigned int tty_list_count = 0;
+
+/* Character device of /dev/console. */
+static struct cdev *dev_console;
+static const char *dev_console_filename;
+
+/*
+ * Flags that are supported and stored by this implementation.
+ */
+#define TTYSUP_IFLAG (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK|ISTRIP|\
+ INLCR|IGNCR|ICRNL|IXON|IXOFF|IXANY|IMAXBEL)
+#define TTYSUP_OFLAG (OPOST|ONLCR|TAB3|ONOEOT|OCRNL|ONOCR|ONLRET)
+#define TTYSUP_LFLAG (ECHOKE|ECHOE|ECHOK|ECHO|ECHONL|ECHOPRT|\
+ ECHOCTL|ISIG|ICANON|ALTWERASE|IEXTEN|TOSTOP|\
+ FLUSHO|NOKERNINFO|NOFLSH)
+#define TTYSUP_CFLAG (CIGNORE|CSIZE|CSTOPB|CREAD|PARENB|PARODD|\
+ HUPCL|CLOCAL|CCTS_OFLOW|CRTS_IFLOW|CDTR_IFLOW|\
+ CDSR_OFLOW|CCAR_OFLOW)
+
+#define TTY_CALLOUT(tp,d) (dev2unit(d) & TTYUNIT_CALLOUT)
+
+/*
+ * Set TTY buffer sizes.
+ */
+
+#define TTYBUF_MAX 65536
+
+static void
+tty_watermarks(struct tty *tp)
+{
+ size_t bs = 0;
+
+ /* Provide an input buffer for 0.2 seconds of data. */
+ if (tp->t_termios.c_cflag & CREAD)
+ bs = MIN(tp->t_termios.c_ispeed / 5, TTYBUF_MAX);
+ ttyinq_setsize(&tp->t_inq, tp, bs);
+
+ /* Set low watermark at 10% (when 90% is available). */
+ tp->t_inlow = (ttyinq_getallocatedsize(&tp->t_inq) * 9) / 10;
+
+ /* Provide an output buffer for 0.2 seconds of data. */
+ bs = MIN(tp->t_termios.c_ospeed / 5, TTYBUF_MAX);
+ ttyoutq_setsize(&tp->t_outq, tp, bs);
+
+ /* Set low watermark at 10% (when 90% is available). */
+ tp->t_outlow = (ttyoutq_getallocatedsize(&tp->t_outq) * 9) / 10;
+}
+
+static int
+tty_drain(struct tty *tp)
+{
+ int error;
+
+ if (ttyhook_hashook(tp, getc_inject))
+ /* buffer is inaccessible */
+ return (0);
+
+ while (ttyoutq_bytesused(&tp->t_outq) > 0) {
+ ttydevsw_outwakeup(tp);
+ /* Could be handled synchronously. */
+ if (ttyoutq_bytesused(&tp->t_outq) == 0)
+ return (0);
+
+ /* Wait for data to be drained. */
+ error = tty_wait(tp, &tp->t_outwait);
+ if (error)
+ return (error);
+ }
+
+ return (0);
+}
+
+/*
+ * Though ttydev_enter() and ttydev_leave() seem to be related, they
+ * don't have to be used together. ttydev_enter() is used by the cdev
+ * operations to prevent an actual operation from being processed when
+ * the TTY has been abandoned. ttydev_leave() is used by ttydev_open()
+ * and ttydev_close() to determine whether per-TTY data should be
+ * deallocated.
+ */
+
+static __inline int
+ttydev_enter(struct tty *tp)
+{
+ tty_lock(tp);
+
+ if (tty_gone(tp) || !tty_opened(tp)) {
+ /* Device is already gone. */
+ tty_unlock(tp);
+ return (ENXIO);
+ }
+
+ return (0);
+}
+
+static void
+ttydev_leave(struct tty *tp)
+{
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tty_opened(tp) || tp->t_flags & TF_OPENCLOSE) {
+ /* Device is still opened somewhere. */
+ tty_unlock(tp);
+ return;
+ }
+
+ tp->t_flags |= TF_OPENCLOSE;
+
+ /* Stop asynchronous I/O. */
+ funsetown(&tp->t_sigio);
+
+ /* Remove console TTY. */
+ if (constty == tp)
+ constty_clear();
+
+ /* Drain any output. */
+ MPASS((tp->t_flags & TF_STOPPED) == 0);
+ if (!tty_gone(tp))
+ tty_drain(tp);
+
+ ttydisc_close(tp);
+
+ /* Destroy associated buffers already. */
+ ttyinq_free(&tp->t_inq);
+ tp->t_inlow = 0;
+ ttyoutq_free(&tp->t_outq);
+ tp->t_outlow = 0;
+
+ knlist_clear(&tp->t_inpoll.si_note, 1);
+ knlist_clear(&tp->t_outpoll.si_note, 1);
+
+ if (!tty_gone(tp))
+ ttydevsw_close(tp);
+
+ tp->t_flags &= ~TF_OPENCLOSE;
+ cv_broadcast(&tp->t_dcdwait);
+ tty_rel_free(tp);
+}
+
+/*
+ * Operations that are exposed through the character device in /dev.
+ */
+static int
+ttydev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ struct tty *tp;
+ int error = 0;
+
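+	/*
+	 * Wait until the driver has associated a TTY with this character
+	 * device (si_drv1 is set once the device is fully created).
+	 */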
+ while ((tp = dev->si_drv1) == NULL) {
+ error = tsleep(&dev->si_drv1, PCATCH, "ttdrv1", 1);
+ if (error != EWOULDBLOCK)
+ return (error);
+ }
+
+ tty_lock(tp);
+ if (tty_gone(tp)) {
+ /* Device is already gone. */
+ tty_unlock(tp);
+ return (ENXIO);
+ }
+
+ /*
+ * Block when other processes are currently opening or closing
+ * the TTY.
+ */
+ while (tp->t_flags & TF_OPENCLOSE) {
+ error = tty_wait(tp, &tp->t_dcdwait);
+ if (error != 0) {
+ tty_unlock(tp);
+ return (error);
+ }
+ }
+ tp->t_flags |= TF_OPENCLOSE;
+
+ /*
+ * Make sure the "tty" and "cua" device cannot be opened at the
+ * same time.
+ */
+ if (TTY_CALLOUT(tp, dev)) {
+ if (tp->t_flags & TF_OPENED_IN) {
+ error = EBUSY;
+ goto done;
+ }
+ } else {
+ if (tp->t_flags & TF_OPENED_OUT) {
+ error = EBUSY;
+ goto done;
+ }
+ }
+
+ if (tp->t_flags & TF_EXCLUDE && priv_check(td, PRIV_TTY_EXCLUSIVE)) {
+ error = EBUSY;
+ goto done;
+ }
+
+ if (!tty_opened(tp)) {
+ /* Set proper termios flags. */
+ if (TTY_CALLOUT(tp, dev))
+ tp->t_termios = tp->t_termios_init_out;
+ else
+ tp->t_termios = tp->t_termios_init_in;
+ ttydevsw_param(tp, &tp->t_termios);
+ /* Prevent modem control on callout devices and /dev/console. */
+ if (TTY_CALLOUT(tp, dev) || dev == dev_console)
+ tp->t_termios.c_cflag |= CLOCAL;
+
+ ttydevsw_modem(tp, SER_DTR|SER_RTS, 0);
+
+ error = ttydevsw_open(tp);
+ if (error != 0)
+ goto done;
+
+ ttydisc_open(tp);
+ tty_watermarks(tp);
+ }
+
+ /* Wait for Carrier Detect. */
+ if ((oflags & O_NONBLOCK) == 0 &&
+ (tp->t_termios.c_cflag & CLOCAL) == 0) {
+ while ((ttydevsw_modem(tp, 0, 0) & SER_DCD) == 0) {
+ error = tty_wait(tp, &tp->t_dcdwait);
+ if (error != 0)
+ goto done;
+ }
+ }
+
+ if (dev == dev_console)
+ tp->t_flags |= TF_OPENED_CONS;
+ else if (TTY_CALLOUT(tp, dev))
+ tp->t_flags |= TF_OPENED_OUT;
+ else
+ tp->t_flags |= TF_OPENED_IN;
+
+done: tp->t_flags &= ~TF_OPENCLOSE;
+ cv_broadcast(&tp->t_dcdwait);
+ ttydev_leave(tp);
+
+ return (error);
+}
+
+static int
+ttydev_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
+{
+ struct tty *tp = dev->si_drv1;
+
+ tty_lock(tp);
+
+ /*
+ * Don't actually close the device if it is being used as the
+ * console.
+ */
+ MPASS((tp->t_flags & TF_OPENED) != TF_OPENED);
+ if (dev == dev_console)
+ tp->t_flags &= ~TF_OPENED_CONS;
+ else
+ tp->t_flags &= ~(TF_OPENED_IN|TF_OPENED_OUT);
+
+ if (tp->t_flags & TF_OPENED) {
+ tty_unlock(tp);
+ return (0);
+ }
+
+ /*
+ * This can only be called once. The callin and the callout
+ * devices cannot be opened at the same time.
+ */
+ tp->t_flags &= ~(TF_EXCLUDE|TF_STOPPED);
+
+ /* Properly wake up threads that are stuck - revoke(). */
+ tp->t_revokecnt++;
+ tty_wakeup(tp, FREAD|FWRITE);
+ cv_broadcast(&tp->t_bgwait);
+ cv_broadcast(&tp->t_dcdwait);
+
+ ttydev_leave(tp);
+
+ return (0);
+}
+
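+/*
+ * Return non-zero when the terminal is the controlling TTY of process p,
+ * i.e. p belongs to the TTY's session and has P_CONTROLT set.
+ */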
+static __inline int
+tty_is_ctty(struct tty *tp, struct proc *p)
+{
+ tty_lock_assert(tp, MA_OWNED);
+
+ return (p->p_session == tp->t_session && p->p_flag & P_CONTROLT);
+}
+
+int
+tty_wait_background(struct tty *tp, struct thread *td, int sig)
+{
+ struct proc *p = td->td_proc;
+ struct pgrp *pg;
+ ksiginfo_t ksi;
+ int error;
+
+ MPASS(sig == SIGTTIN || sig == SIGTTOU);
+ tty_lock_assert(tp, MA_OWNED);
+
+ for (;;) {
+ PROC_LOCK(p);
+ /*
+		 * The process should only sleep when:
+		 * - This terminal is the controlling terminal
+		 * - Its process group is not the foreground process
+		 *   group
+		 * - The parent process isn't waiting for the child to
+		 *   exit
+		 * - The signal to send to the process isn't masked
+ */
+ if (!tty_is_ctty(tp, p) || p->p_pgrp == tp->t_pgrp) {
+ /* Allow the action to happen. */
+ PROC_UNLOCK(p);
+ return (0);
+ }
+
+ if (SIGISMEMBER(p->p_sigacts->ps_sigignore, sig) ||
+ SIGISMEMBER(td->td_sigmask, sig)) {
+ /* Only allow them in write()/ioctl(). */
+ PROC_UNLOCK(p);
+ return (sig == SIGTTOU ? 0 : EIO);
+ }
+
+ pg = p->p_pgrp;
+ if (p->p_flag & P_PPWAIT || pg->pg_jobc == 0) {
+ /* Don't allow the action to happen. */
+ PROC_UNLOCK(p);
+ return (EIO);
+ }
+ PROC_UNLOCK(p);
+
+ /*
+ * Send the signal and sleep until we're the new
+ * foreground process group.
+ */
+ if (sig != 0) {
+ ksiginfo_init(&ksi);
+ ksi.ksi_code = SI_KERNEL;
+ ksi.ksi_signo = sig;
+ sig = 0;
+ }
+ PGRP_LOCK(pg);
+ pgsignal(pg, ksi.ksi_signo, 1, &ksi);
+ PGRP_UNLOCK(pg);
+
+ error = tty_wait(tp, &tp->t_bgwait);
+ if (error)
+ return (error);
+ }
+}
+
+static int
+ttydev_read(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ struct tty *tp = dev->si_drv1;
+ int error;
+
+ error = ttydev_enter(tp);
+ if (error)
+ goto done;
+ error = ttydisc_read(tp, uio, ioflag);
+ tty_unlock(tp);
+
+ /*
+ * The read() call should not throw an error when the device is
+ * being destroyed. Silently convert it to an EOF.
+ */
+done: if (error == ENXIO)
+ error = 0;
+ return (error);
+}
+
+static int
+ttydev_write(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ struct tty *tp = dev->si_drv1;
+ int error;
+
+ error = ttydev_enter(tp);
+ if (error)
+ return (error);
+
+ if (tp->t_termios.c_lflag & TOSTOP) {
+ error = tty_wait_background(tp, curthread, SIGTTOU);
+ if (error)
+ goto done;
+ }
+
+ if (ioflag & IO_NDELAY && tp->t_flags & TF_BUSY_OUT) {
+ /* Allow non-blocking writes to bypass serialization. */
+ error = ttydisc_write(tp, uio, ioflag);
+ } else {
+ /* Serialize write() calls. */
+ while (tp->t_flags & TF_BUSY_OUT) {
+ error = tty_wait(tp, &tp->t_outserwait);
+ if (error)
+ goto done;
+ }
+
+ tp->t_flags |= TF_BUSY_OUT;
+ error = ttydisc_write(tp, uio, ioflag);
+ tp->t_flags &= ~TF_BUSY_OUT;
+ cv_signal(&tp->t_outserwait);
+ }
+
+done: tty_unlock(tp);
+ return (error);
+}
+
+static int
+ttydev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
+ struct thread *td)
+{
+ struct tty *tp = dev->si_drv1;
+ int error;
+
+ error = ttydev_enter(tp);
+ if (error)
+ return (error);
+
+ switch (cmd) {
+ case TIOCCBRK:
+ case TIOCCONS:
+ case TIOCDRAIN:
+ case TIOCEXCL:
+ case TIOCFLUSH:
+ case TIOCNXCL:
+ case TIOCSBRK:
+ case TIOCSCTTY:
+ case TIOCSETA:
+ case TIOCSETAF:
+ case TIOCSETAW:
+ case TIOCSPGRP:
+ case TIOCSTART:
+ case TIOCSTAT:
+ case TIOCSTI:
+ case TIOCSTOP:
+ case TIOCSWINSZ:
+#if 0
+ case TIOCSDRAINWAIT:
+ case TIOCSETD:
+#endif
+#ifdef COMPAT_43TTY
+ case TIOCLBIC:
+ case TIOCLBIS:
+ case TIOCLSET:
+ case TIOCSETC:
+ case OTIOCSETD:
+ case TIOCSETN:
+ case TIOCSETP:
+ case TIOCSLTC:
+#endif /* COMPAT_43TTY */
+ /*
+ * If the ioctl() causes the TTY to be modified, let it
+ * wait in the background.
+ */
+ error = tty_wait_background(tp, curthread, SIGTTOU);
+ if (error)
+ goto done;
+ }
+
+ if (cmd == TIOCSETA || cmd == TIOCSETAW || cmd == TIOCSETAF) {
+ struct termios *old = &tp->t_termios;
+ struct termios *new = (struct termios *)data;
+ struct termios *lock = TTY_CALLOUT(tp, dev) ?
+ &tp->t_termios_lock_out : &tp->t_termios_lock_in;
+ int cc;
+
+ /*
+		 * Apply the lock-state device: for every flag, control
+		 * character or speed that is locked, keep the currently
+		 * active value and discard the one supplied by the caller.
+ */
+ new->c_iflag = (old->c_iflag & lock->c_iflag) |
+ (new->c_iflag & ~lock->c_iflag);
+ new->c_oflag = (old->c_oflag & lock->c_oflag) |
+ (new->c_oflag & ~lock->c_oflag);
+ new->c_cflag = (old->c_cflag & lock->c_cflag) |
+ (new->c_cflag & ~lock->c_cflag);
+ new->c_lflag = (old->c_lflag & lock->c_lflag) |
+ (new->c_lflag & ~lock->c_lflag);
+ for (cc = 0; cc < NCCS; ++cc)
+ if (lock->c_cc[cc])
+ new->c_cc[cc] = old->c_cc[cc];
+ if (lock->c_ispeed)
+ new->c_ispeed = old->c_ispeed;
+ if (lock->c_ospeed)
+ new->c_ospeed = old->c_ospeed;
+ }
+
+ error = tty_ioctl(tp, cmd, data, fflag, td);
+done: tty_unlock(tp);
+
+ return (error);
+}
+
+static int
+ttydev_poll(struct cdev *dev, int events, struct thread *td)
+{
+ struct tty *tp = dev->si_drv1;
+ int error, revents = 0;
+
+ error = ttydev_enter(tp);
+ if (error)
+ return ((events & (POLLIN|POLLRDNORM)) | POLLHUP);
+
+ if (events & (POLLIN|POLLRDNORM)) {
+ /* See if we can read something. */
+ if (ttydisc_read_poll(tp) > 0)
+ revents |= events & (POLLIN|POLLRDNORM);
+ }
+
+ if (tp->t_flags & TF_ZOMBIE) {
+ /* Hangup flag on zombie state. */
+ revents |= POLLHUP;
+ } else if (events & (POLLOUT|POLLWRNORM)) {
+ /* See if we can write something. */
+ if (ttydisc_write_poll(tp) > 0)
+ revents |= events & (POLLOUT|POLLWRNORM);
+ }
+
+ if (revents == 0) {
+ if (events & (POLLIN|POLLRDNORM))
+ selrecord(td, &tp->t_inpoll);
+ if (events & (POLLOUT|POLLWRNORM))
+ selrecord(td, &tp->t_outpoll);
+ }
+
+ tty_unlock(tp);
+
+ return (revents);
+}
+
+static int
+ttydev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
+ int nprot, vm_memattr_t *memattr)
+{
+ struct tty *tp = dev->si_drv1;
+ int error;
+
+ /* Handle mmap() through the driver. */
+
+ error = ttydev_enter(tp);
+ if (error)
+ return (-1);
+ error = ttydevsw_mmap(tp, offset, paddr, nprot, memattr);
+ tty_unlock(tp);
+
+ return (error);
+}
+
+/*
+ * kqueue support.
+ */
+
+static void
+tty_kqops_read_detach(struct knote *kn)
+{
+ struct tty *tp = kn->kn_hook;
+
+ knlist_remove(&tp->t_inpoll.si_note, kn, 0);
+}
+
+static int
+tty_kqops_read_event(struct knote *kn, long hint)
+{
+ struct tty *tp = kn->kn_hook;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tty_gone(tp) || tp->t_flags & TF_ZOMBIE) {
+ kn->kn_flags |= EV_EOF;
+ return (1);
+ } else {
+ kn->kn_data = ttydisc_read_poll(tp);
+ return (kn->kn_data > 0);
+ }
+}
+
+static void
+tty_kqops_write_detach(struct knote *kn)
+{
+ struct tty *tp = kn->kn_hook;
+
+ knlist_remove(&tp->t_outpoll.si_note, kn, 0);
+}
+
+static int
+tty_kqops_write_event(struct knote *kn, long hint)
+{
+ struct tty *tp = kn->kn_hook;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tty_gone(tp)) {
+ kn->kn_flags |= EV_EOF;
+ return (1);
+ } else {
+ kn->kn_data = ttydisc_write_poll(tp);
+ return (kn->kn_data > 0);
+ }
+}
+
+static struct filterops tty_kqops_read = {
+ .f_isfd = 1,
+ .f_detach = tty_kqops_read_detach,
+ .f_event = tty_kqops_read_event,
+};
+static struct filterops tty_kqops_write = {
+ .f_isfd = 1,
+ .f_detach = tty_kqops_write_detach,
+ .f_event = tty_kqops_write_event,
+};
+
+static int
+ttydev_kqfilter(struct cdev *dev, struct knote *kn)
+{
+ struct tty *tp = dev->si_drv1;
+ int error;
+
+ error = ttydev_enter(tp);
+ if (error)
+ return (error);
+
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ kn->kn_hook = tp;
+ kn->kn_fop = &tty_kqops_read;
+ knlist_add(&tp->t_inpoll.si_note, kn, 1);
+ break;
+ case EVFILT_WRITE:
+ kn->kn_hook = tp;
+ kn->kn_fop = &tty_kqops_write;
+ knlist_add(&tp->t_outpoll.si_note, kn, 1);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ tty_unlock(tp);
+ return (error);
+}
+
+static struct cdevsw ttydev_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = ttydev_open,
+ .d_close = ttydev_close,
+ .d_read = ttydev_read,
+ .d_write = ttydev_write,
+ .d_ioctl = ttydev_ioctl,
+ .d_kqfilter = ttydev_kqfilter,
+ .d_poll = ttydev_poll,
+ .d_mmap = ttydev_mmap,
+ .d_name = "ttydev",
+ .d_flags = D_TTY,
+};
+
+/*
+ * Init/lock-state devices
+ */
+
+static int
+ttyil_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ struct tty *tp;
+ int error = 0;
+
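+	/*
+	 * Wait for tty_makedev() to publish the TTY through si_drv1;
+	 * it wakes us up on &dev->si_drv1 once the pointer is set.
+	 */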
+ while ((tp = dev->si_drv1) == NULL) {
+ error = tsleep(&dev->si_drv1, PCATCH, "ttdrv1", 1);
+ if (error != EWOULDBLOCK)
+ return (error);
+ }
+ tty_lock(tp);
+ if (tty_gone(tp))
+ error = ENODEV;
+ tty_unlock(tp);
+
+ return (error);
+}
+
+static int
+ttyil_close(struct cdev *dev, int flag, int mode, struct thread *td)
+{
+ return (0);
+}
+
+static int
+ttyil_rdwr(struct cdev *dev, struct uio *uio, int ioflag)
+{
+ return (ENODEV);
+}
+
+static int
+ttyil_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
+ struct thread *td)
+{
+ struct tty *tp = dev->si_drv1;
+ int error;
+
+ tty_lock(tp);
+ if (tty_gone(tp)) {
+ error = ENODEV;
+ goto done;
+ }
+
+ error = ttydevsw_cioctl(tp, dev2unit(dev), cmd, data, td);
+ if (error != ENOIOCTL)
+ goto done;
+ error = 0;
+
+ switch (cmd) {
+ case TIOCGETA:
+ /* Obtain terminal flags through tcgetattr(). */
+ *(struct termios*)data = *(struct termios*)dev->si_drv2;
+ break;
+ case TIOCSETA:
+ /* Set terminal flags through tcsetattr(). */
+ error = priv_check(td, PRIV_TTY_SETA);
+ if (error)
+ break;
+ *(struct termios*)dev->si_drv2 = *(struct termios*)data;
+ break;
+ case TIOCGETD:
+ *(int *)data = TTYDISC;
+ break;
+ case TIOCGWINSZ:
+ bzero(data, sizeof(struct winsize));
+ break;
+ default:
+ error = ENOTTY;
+ }
+
+done: tty_unlock(tp);
+ return (error);
+}
+
+static struct cdevsw ttyil_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = ttyil_open,
+ .d_close = ttyil_close,
+ .d_read = ttyil_rdwr,
+ .d_write = ttyil_rdwr,
+ .d_ioctl = ttyil_ioctl,
+ .d_name = "ttyil",
+ .d_flags = D_TTY,
+};
+
+static void
+tty_init_termios(struct tty *tp)
+{
+ struct termios *t = &tp->t_termios_init_in;
+
+ t->c_cflag = TTYDEF_CFLAG;
+ t->c_iflag = TTYDEF_IFLAG;
+ t->c_lflag = TTYDEF_LFLAG;
+ t->c_oflag = TTYDEF_OFLAG;
+ t->c_ispeed = TTYDEF_SPEED;
+ t->c_ospeed = TTYDEF_SPEED;
+ memcpy(&t->c_cc, ttydefchars, sizeof ttydefchars);
+
+ tp->t_termios_init_out = *t;
+}
+
+void
+tty_init_console(struct tty *tp, speed_t s)
+{
+ struct termios *ti = &tp->t_termios_init_in;
+ struct termios *to = &tp->t_termios_init_out;
+
+ if (s != 0) {
+ ti->c_ispeed = ti->c_ospeed = s;
+ to->c_ispeed = to->c_ospeed = s;
+ }
+
+ ti->c_cflag |= CLOCAL;
+ to->c_cflag |= CLOCAL;
+}
+
+/*
+ * Standard device routine implementations, mostly meant for
+ * pseudo-terminal device drivers. When a driver creates a new terminal
+ * device class, missing routines are patched.
+ */
+
+static int
+ttydevsw_defopen(struct tty *tp)
+{
+
+ return (0);
+}
+
+static void
+ttydevsw_defclose(struct tty *tp)
+{
+}
+
+static void
+ttydevsw_defoutwakeup(struct tty *tp)
+{
+
+ panic("Terminal device has output, while not implemented");
+}
+
+static void
+ttydevsw_definwakeup(struct tty *tp)
+{
+}
+
+static int
+ttydevsw_defioctl(struct tty *tp, u_long cmd, caddr_t data, struct thread *td)
+{
+
+ return (ENOIOCTL);
+}
+
+static int
+ttydevsw_defcioctl(struct tty *tp, int unit, u_long cmd, caddr_t data, struct thread *td)
+{
+
+ return (ENOIOCTL);
+}
+
+static int
+ttydevsw_defparam(struct tty *tp, struct termios *t)
+{
+
+ /*
+ * Allow the baud rate to be adjusted for pseudo-devices, but at
+ * least restrict it to 115200 to prevent excessive buffer
+ * usage. Also disallow 0, to prevent foot shooting.
+ */
+ if (t->c_ispeed < B50)
+ t->c_ispeed = B50;
+ else if (t->c_ispeed > B115200)
+ t->c_ispeed = B115200;
+ if (t->c_ospeed < B50)
+ t->c_ospeed = B50;
+ else if (t->c_ospeed > B115200)
+ t->c_ospeed = B115200;
+ t->c_cflag |= CREAD;
+
+ return (0);
+}
+
+static int
+ttydevsw_defmodem(struct tty *tp, int sigon, int sigoff)
+{
+
+ /* Simulate a carrier to make the TTY layer happy. */
+ return (SER_DCD);
+}
+
+static int
+ttydevsw_defmmap(struct tty *tp, vm_ooffset_t offset, vm_paddr_t *paddr,
+ int nprot, vm_memattr_t *memattr)
+{
+
+ return (-1);
+}
+
+static void
+ttydevsw_defpktnotify(struct tty *tp, char event)
+{
+}
+
+static void
+ttydevsw_deffree(void *softc)
+{
+
+ panic("Terminal device freed without a free-handler");
+}
+
+/*
+ * TTY allocation and deallocation. A TTY device can be deallocated when
+ * the driver no longer uses it, when the TTY isn't a session's
+ * controlling TTY, and when the device node isn't opened through devfs.
+ */
+
+struct tty *
+tty_alloc(struct ttydevsw *tsw, void *sc)
+{
+
+ return (tty_alloc_mutex(tsw, sc, NULL));
+}
+
+struct tty *
+tty_alloc_mutex(struct ttydevsw *tsw, void *sc, struct mtx *mutex)
+{
+ struct tty *tp;
+
+ /* Make sure the driver defines all routines. */
+#define PATCH_FUNC(x) do { \
+ if (tsw->tsw_ ## x == NULL) \
+ tsw->tsw_ ## x = ttydevsw_def ## x; \
+} while (0)
+ PATCH_FUNC(open);
+ PATCH_FUNC(close);
+ PATCH_FUNC(outwakeup);
+ PATCH_FUNC(inwakeup);
+ PATCH_FUNC(ioctl);
+ PATCH_FUNC(cioctl);
+ PATCH_FUNC(param);
+ PATCH_FUNC(modem);
+ PATCH_FUNC(mmap);
+ PATCH_FUNC(pktnotify);
+ PATCH_FUNC(free);
+#undef PATCH_FUNC
+
+ tp = malloc(sizeof(struct tty), M_TTY, M_WAITOK|M_ZERO);
+ tp->t_devsw = tsw;
+ tp->t_devswsoftc = sc;
+ tp->t_flags = tsw->tsw_flags;
+
+ tty_init_termios(tp);
+
+ cv_init(&tp->t_inwait, "ttyin");
+ cv_init(&tp->t_outwait, "ttyout");
+ cv_init(&tp->t_outserwait, "ttyosr");
+ cv_init(&tp->t_bgwait, "ttybg");
+ cv_init(&tp->t_dcdwait, "ttydcd");
+
+ /* Allow drivers to use a custom mutex to lock the TTY. */
+ if (mutex != NULL) {
+ tp->t_mtx = mutex;
+ } else {
+ tp->t_mtx = &tp->t_mtxobj;
+ mtx_init(&tp->t_mtxobj, "ttymtx", NULL, MTX_DEF);
+ }
+
+ knlist_init_mtx(&tp->t_inpoll.si_note, tp->t_mtx);
+ knlist_init_mtx(&tp->t_outpoll.si_note, tp->t_mtx);
+
+ sx_xlock(&tty_list_sx);
+ TAILQ_INSERT_TAIL(&tty_list, tp, t_list);
+ tty_list_count++;
+ sx_xunlock(&tty_list_sx);
+
+ return (tp);
+}
+
+static void
+tty_dealloc(void *arg)
+{
+ struct tty *tp = arg;
+
+ sx_xlock(&tty_list_sx);
+ TAILQ_REMOVE(&tty_list, tp, t_list);
+ tty_list_count--;
+ sx_xunlock(&tty_list_sx);
+
+ /* Make sure we haven't leaked buffers. */
+ MPASS(ttyinq_getsize(&tp->t_inq) == 0);
+ MPASS(ttyoutq_getsize(&tp->t_outq) == 0);
+
+ seldrain(&tp->t_inpoll);
+ seldrain(&tp->t_outpoll);
+ knlist_destroy(&tp->t_inpoll.si_note);
+ knlist_destroy(&tp->t_outpoll.si_note);
+
+ cv_destroy(&tp->t_inwait);
+ cv_destroy(&tp->t_outwait);
+ cv_destroy(&tp->t_bgwait);
+ cv_destroy(&tp->t_dcdwait);
+ cv_destroy(&tp->t_outserwait);
+
+ if (tp->t_mtx == &tp->t_mtxobj)
+ mtx_destroy(&tp->t_mtxobj);
+ ttydevsw_free(tp);
+ free(tp, M_TTY);
+}
+
+static void
+tty_rel_free(struct tty *tp)
+{
+ struct cdev *dev;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+#define TF_ACTIVITY (TF_GONE|TF_OPENED|TF_HOOK|TF_OPENCLOSE)
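+	/*
+	 * The TTY may only be freed once TF_GONE is the sole remaining
+	 * activity flag and no sessions reference it anymore.
+	 */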
+ if (tp->t_sessioncnt != 0 || (tp->t_flags & TF_ACTIVITY) != TF_GONE) {
+ /* TTY is still in use. */
+ tty_unlock(tp);
+ return;
+ }
+
+ /* TTY can be deallocated. */
+ dev = tp->t_dev;
+ tp->t_dev = NULL;
+ tty_unlock(tp);
+
+ if (dev != NULL)
+ destroy_dev_sched_cb(dev, tty_dealloc, tp);
+}
+
+void
+tty_rel_pgrp(struct tty *tp, struct pgrp *pg)
+{
+ MPASS(tp->t_sessioncnt > 0);
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tp->t_pgrp == pg)
+ tp->t_pgrp = NULL;
+
+ tty_unlock(tp);
+}
+
+void
+tty_rel_sess(struct tty *tp, struct session *sess)
+{
+ MPASS(tp->t_sessioncnt > 0);
+
+ /* Current session has left. */
+ if (tp->t_session == sess) {
+ tp->t_session = NULL;
+ MPASS(tp->t_pgrp == NULL);
+ }
+ tp->t_sessioncnt--;
+ tty_rel_free(tp);
+}
+
+void
+tty_rel_gone(struct tty *tp)
+{
+ MPASS(!tty_gone(tp));
+
+ /* Simulate carrier removal. */
+ ttydisc_modem(tp, 0);
+
+ /* Wake up all blocked threads. */
+ tty_wakeup(tp, FREAD|FWRITE);
+ cv_broadcast(&tp->t_bgwait);
+ cv_broadcast(&tp->t_dcdwait);
+
+ tp->t_flags |= TF_GONE;
+ tty_rel_free(tp);
+}
+
+/*
+ * Exposing information about current TTYs through sysctl
+ */
+
+static void
+tty_to_xtty(struct tty *tp, struct xtty *xt)
+{
+ tty_lock_assert(tp, MA_OWNED);
+
+ xt->xt_size = sizeof(struct xtty);
+ xt->xt_insize = ttyinq_getsize(&tp->t_inq);
+ xt->xt_incc = ttyinq_bytescanonicalized(&tp->t_inq);
+ xt->xt_inlc = ttyinq_bytesline(&tp->t_inq);
+ xt->xt_inlow = tp->t_inlow;
+ xt->xt_outsize = ttyoutq_getsize(&tp->t_outq);
+ xt->xt_outcc = ttyoutq_bytesused(&tp->t_outq);
+ xt->xt_outlow = tp->t_outlow;
+ xt->xt_column = tp->t_column;
+ xt->xt_pgid = tp->t_pgrp ? tp->t_pgrp->pg_id : 0;
+ xt->xt_sid = tp->t_session ? tp->t_session->s_sid : 0;
+ xt->xt_flags = tp->t_flags;
+ xt->xt_dev = tp->t_dev ? dev2udev(tp->t_dev) : NODEV;
+}
+
+static int
+sysctl_kern_ttys(SYSCTL_HANDLER_ARGS)
+{
+ unsigned long lsize;
+ struct xtty *xtlist, *xt;
+ struct tty *tp;
+ int error;
+
+ sx_slock(&tty_list_sx);
+ lsize = tty_list_count * sizeof(struct xtty);
+ if (lsize == 0) {
+ sx_sunlock(&tty_list_sx);
+ return (0);
+ }
+
+ xtlist = xt = malloc(lsize, M_TTY, M_WAITOK);
+
+ TAILQ_FOREACH(tp, &tty_list, t_list) {
+ tty_lock(tp);
+ tty_to_xtty(tp, xt);
+ tty_unlock(tp);
+ xt++;
+ }
+ sx_sunlock(&tty_list_sx);
+
+ error = SYSCTL_OUT(req, xtlist, lsize);
+ free(xtlist, M_TTY);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, ttys, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
+ 0, 0, sysctl_kern_ttys, "S,xtty", "List of TTYs");
+
+/*
+ * Device node creation. The device has been set up; now we can expose it
+ * to the user.
+ */
+
+void
+tty_makedev(struct tty *tp, struct ucred *cred, const char *fmt, ...)
+{
+ va_list ap;
+ struct cdev *dev;
+ const char *prefix = "tty";
+ char name[SPECNAMELEN - 3]; /* for "tty" and "cua". */
+ uid_t uid;
+ gid_t gid;
+ mode_t mode;
+
+	/* Remove "tty" prefix from devices like PTYs. */
+ if (tp->t_flags & TF_NOPREFIX)
+ prefix = "";
+
+ va_start(ap, fmt);
+ vsnrprintf(name, sizeof name, 32, fmt, ap);
+ va_end(ap);
+
+ if (cred == NULL) {
+ /* System device. */
+ uid = UID_ROOT;
+ gid = GID_WHEEL;
+ mode = S_IRUSR|S_IWUSR;
+ } else {
+ /* User device. */
+ uid = cred->cr_ruid;
+ gid = GID_TTY;
+ mode = S_IRUSR|S_IWUSR|S_IWGRP;
+ }
+
+ /* Master call-in device. */
+ dev = make_dev_cred(&ttydev_cdevsw, 0, cred,
+ uid, gid, mode, "%s%s", prefix, name);
+ dev->si_drv1 = tp;
+ wakeup(&dev->si_drv1);
+ tp->t_dev = dev;
+
+ /* Slave call-in devices. */
+ if (tp->t_flags & TF_INITLOCK) {
+ dev = make_dev_cred(&ttyil_cdevsw, TTYUNIT_INIT, cred,
+ uid, gid, mode, "%s%s.init", prefix, name);
+ dev_depends(tp->t_dev, dev);
+ dev->si_drv1 = tp;
+ wakeup(&dev->si_drv1);
+ dev->si_drv2 = &tp->t_termios_init_in;
+
+ dev = make_dev_cred(&ttyil_cdevsw, TTYUNIT_LOCK, cred,
+ uid, gid, mode, "%s%s.lock", prefix, name);
+ dev_depends(tp->t_dev, dev);
+ dev->si_drv1 = tp;
+ wakeup(&dev->si_drv1);
+ dev->si_drv2 = &tp->t_termios_lock_in;
+ }
+
+ /* Call-out devices. */
+ if (tp->t_flags & TF_CALLOUT) {
+ dev = make_dev_cred(&ttydev_cdevsw, TTYUNIT_CALLOUT, cred,
+ UID_UUCP, GID_DIALER, 0660, "cua%s", name);
+ dev_depends(tp->t_dev, dev);
+ dev->si_drv1 = tp;
+ wakeup(&dev->si_drv1);
+
+ /* Slave call-out devices. */
+ if (tp->t_flags & TF_INITLOCK) {
+ dev = make_dev_cred(&ttyil_cdevsw,
+ TTYUNIT_CALLOUT | TTYUNIT_INIT, cred,
+ UID_UUCP, GID_DIALER, 0660, "cua%s.init", name);
+ dev_depends(tp->t_dev, dev);
+ dev->si_drv1 = tp;
+ wakeup(&dev->si_drv1);
+ dev->si_drv2 = &tp->t_termios_init_out;
+
+ dev = make_dev_cred(&ttyil_cdevsw,
+ TTYUNIT_CALLOUT | TTYUNIT_LOCK, cred,
+ UID_UUCP, GID_DIALER, 0660, "cua%s.lock", name);
+ dev_depends(tp->t_dev, dev);
+ dev->si_drv1 = tp;
+ wakeup(&dev->si_drv1);
+ dev->si_drv2 = &tp->t_termios_lock_out;
+ }
+ }
+}
+
+/*
+ * Signalling processes.
+ */
+
+void
+tty_signal_sessleader(struct tty *tp, int sig)
+{
+ struct proc *p;
+
+ tty_lock_assert(tp, MA_OWNED);
+ MPASS(sig >= 1 && sig < NSIG);
+
+ /* Make signals start output again. */
+ tp->t_flags &= ~TF_STOPPED;
+
+ if (tp->t_session != NULL && tp->t_session->s_leader != NULL) {
+ p = tp->t_session->s_leader;
+ PROC_LOCK(p);
+ kern_psignal(p, sig);
+ PROC_UNLOCK(p);
+ }
+}
+
+void
+tty_signal_pgrp(struct tty *tp, int sig)
+{
+ ksiginfo_t ksi;
+
+ tty_lock_assert(tp, MA_OWNED);
+ MPASS(sig >= 1 && sig < NSIG);
+
+ /* Make signals start output again. */
+ tp->t_flags &= ~TF_STOPPED;
+
+ if (sig == SIGINFO && !(tp->t_termios.c_lflag & NOKERNINFO))
+ tty_info(tp);
+ if (tp->t_pgrp != NULL) {
+ ksiginfo_init(&ksi);
+ ksi.ksi_signo = sig;
+ ksi.ksi_code = SI_KERNEL;
+ PGRP_LOCK(tp->t_pgrp);
+ pgsignal(tp->t_pgrp, sig, 1, &ksi);
+ PGRP_UNLOCK(tp->t_pgrp);
+ }
+}
+
+void
+tty_wakeup(struct tty *tp, int flags)
+{
+ if (tp->t_flags & TF_ASYNC && tp->t_sigio != NULL)
+ pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL));
+
+ if (flags & FWRITE) {
+ cv_broadcast(&tp->t_outwait);
+ selwakeup(&tp->t_outpoll);
+ KNOTE_LOCKED(&tp->t_outpoll.si_note, 0);
+ }
+ if (flags & FREAD) {
+ cv_broadcast(&tp->t_inwait);
+ selwakeup(&tp->t_inpoll);
+ KNOTE_LOCKED(&tp->t_inpoll.si_note, 0);
+ }
+}
+
+int
+tty_wait(struct tty *tp, struct cv *cv)
+{
+ int error;
+ int revokecnt = tp->t_revokecnt;
+
+ tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED);
+ MPASS(!tty_gone(tp));
+
+ error = cv_wait_sig(cv, tp->t_mtx);
+
+ /* Restart the system call when we may have been revoked. */
+ if (tp->t_revokecnt != revokecnt)
+ return (ERESTART);
+
+ /* Bail out when the device slipped away. */
+ if (tty_gone(tp))
+ return (ENXIO);
+
+ return (error);
+}
+
+int
+tty_timedwait(struct tty *tp, struct cv *cv, int hz)
+{
+ int error;
+ int revokecnt = tp->t_revokecnt;
+
+ tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED);
+ MPASS(!tty_gone(tp));
+
+ error = cv_timedwait_sig(cv, tp->t_mtx, hz);
+
+ /* Restart the system call when we may have been revoked. */
+ if (tp->t_revokecnt != revokecnt)
+ return (ERESTART);
+
+ /* Bail out when the device slipped away. */
+ if (tty_gone(tp))
+ return (ENXIO);
+
+ return (error);
+}
+
+void
+tty_flush(struct tty *tp, int flags)
+{
+ if (flags & FWRITE) {
+ tp->t_flags &= ~TF_HIWAT_OUT;
+ ttyoutq_flush(&tp->t_outq);
+ tty_wakeup(tp, FWRITE);
+ ttydevsw_pktnotify(tp, TIOCPKT_FLUSHWRITE);
+ }
+ if (flags & FREAD) {
+ tty_hiwat_in_unblock(tp);
+ ttyinq_flush(&tp->t_inq);
+ ttydevsw_inwakeup(tp);
+ ttydevsw_pktnotify(tp, TIOCPKT_FLUSHREAD);
+ }
+}
+
+void
+tty_set_winsize(struct tty *tp, const struct winsize *wsz)
+{
+
+ if (memcmp(&tp->t_winsize, wsz, sizeof(*wsz)) == 0)
+ return;
+ tp->t_winsize = *wsz;
+ tty_signal_pgrp(tp, SIGWINCH);
+}
+
+static int
+tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag,
+ struct thread *td)
+{
+ int error;
+
+ switch (cmd) {
+ /*
+ * Modem commands.
+ * The SER_* and TIOCM_* flags are the same, but one bit
+ * shifted. I don't know why.
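+	 * For example, SER_DTR corresponds to TIOCM_DTR >> 1, which is
+	 * why the cases below shift the TIOCM_* bits right by one.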
+ */
+ case TIOCSDTR:
+ ttydevsw_modem(tp, SER_DTR, 0);
+ return (0);
+ case TIOCCDTR:
+ ttydevsw_modem(tp, 0, SER_DTR);
+ return (0);
+ case TIOCMSET: {
+ int bits = *(int *)data;
+ ttydevsw_modem(tp,
+ (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1,
+ ((~bits) & (TIOCM_DTR | TIOCM_RTS)) >> 1);
+ return (0);
+ }
+ case TIOCMBIS: {
+ int bits = *(int *)data;
+ ttydevsw_modem(tp, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1, 0);
+ return (0);
+ }
+ case TIOCMBIC: {
+ int bits = *(int *)data;
+ ttydevsw_modem(tp, 0, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1);
+ return (0);
+ }
+ case TIOCMGET:
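+		/*
+		 * Shift the SER_* status bits back into TIOCM_* form;
+		 * TIOCM_LE is always reported.
+		 */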
+ *(int *)data = TIOCM_LE + (ttydevsw_modem(tp, 0, 0) << 1);
+ return (0);
+
+ case FIOASYNC:
+ if (*(int *)data)
+ tp->t_flags |= TF_ASYNC;
+ else
+ tp->t_flags &= ~TF_ASYNC;
+ return (0);
+ case FIONBIO:
+ /* This device supports non-blocking operation. */
+ return (0);
+ case FIONREAD:
+ *(int *)data = ttyinq_bytescanonicalized(&tp->t_inq);
+ return (0);
+ case FIONWRITE:
+ case TIOCOUTQ:
+ *(int *)data = ttyoutq_bytesused(&tp->t_outq);
+ return (0);
+ case FIOSETOWN:
+ if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc))
+ /* Not allowed to set ownership. */
+ return (ENOTTY);
+
+ /* Temporarily unlock the TTY to set ownership. */
+ tty_unlock(tp);
+ error = fsetown(*(int *)data, &tp->t_sigio);
+ tty_lock(tp);
+ return (error);
+ case FIOGETOWN:
+ if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc))
+			/* Not allowed to obtain ownership. */
+ return (ENOTTY);
+
+ /* Get ownership. */
+ *(int *)data = fgetown(&tp->t_sigio);
+ return (0);
+ case TIOCGETA:
+ /* Obtain terminal flags through tcgetattr(). */
+ *(struct termios*)data = tp->t_termios;
+ return (0);
+ case TIOCSETA:
+ case TIOCSETAW:
+ case TIOCSETAF: {
+ struct termios *t = data;
+
+ /*
+ * Who makes up these funny rules? According to POSIX,
+ * input baud rate is set equal to the output baud rate
+ * when zero.
+ */
+ if (t->c_ispeed == 0)
+ t->c_ispeed = t->c_ospeed;
+
+ /* Discard any unsupported bits. */
+ t->c_iflag &= TTYSUP_IFLAG;
+ t->c_oflag &= TTYSUP_OFLAG;
+ t->c_lflag &= TTYSUP_LFLAG;
+ t->c_cflag &= TTYSUP_CFLAG;
+
+ /* Set terminal flags through tcsetattr(). */
+ if (cmd == TIOCSETAW || cmd == TIOCSETAF) {
+ error = tty_drain(tp);
+ if (error)
+ return (error);
+ if (cmd == TIOCSETAF)
+ tty_flush(tp, FREAD);
+ }
+
+ /*
+ * Only call param() when the flags really change.
+ */
+ if ((t->c_cflag & CIGNORE) == 0 &&
+ (tp->t_termios.c_cflag != t->c_cflag ||
+ ((tp->t_termios.c_iflag ^ t->c_iflag) &
+ (IXON|IXOFF|IXANY)) ||
+ tp->t_termios.c_ispeed != t->c_ispeed ||
+ tp->t_termios.c_ospeed != t->c_ospeed)) {
+ error = ttydevsw_param(tp, t);
+ if (error)
+ return (error);
+
+ /* XXX: CLOCAL? */
+
+ tp->t_termios.c_cflag = t->c_cflag & ~CIGNORE;
+ tp->t_termios.c_ispeed = t->c_ispeed;
+ tp->t_termios.c_ospeed = t->c_ospeed;
+
+ /* Baud rate has changed - update watermarks. */
+ tty_watermarks(tp);
+ }
+
+ /* Copy new non-device driver parameters. */
+ tp->t_termios.c_iflag = t->c_iflag;
+ tp->t_termios.c_oflag = t->c_oflag;
+ tp->t_termios.c_lflag = t->c_lflag;
+ memcpy(&tp->t_termios.c_cc, t->c_cc, sizeof t->c_cc);
+
+ ttydisc_optimize(tp);
+
+ if ((t->c_lflag & ICANON) == 0) {
+ /*
+ * When in non-canonical mode, wake up all
+ * readers. Canonicalize any partial input. VMIN
+ * and VTIME could also be adjusted.
+ */
+ ttyinq_canonicalize(&tp->t_inq);
+ tty_wakeup(tp, FREAD);
+ }
+
+ /*
+ * For packet mode: notify the PTY consumer that VSTOP
+ * and VSTART may have been changed.
+ */
+ if (tp->t_termios.c_iflag & IXON &&
+ tp->t_termios.c_cc[VSTOP] == CTRL('S') &&
+ tp->t_termios.c_cc[VSTART] == CTRL('Q'))
+ ttydevsw_pktnotify(tp, TIOCPKT_DOSTOP);
+ else
+ ttydevsw_pktnotify(tp, TIOCPKT_NOSTOP);
+ return (0);
+ }
+ case TIOCGETD:
+ /* For compatibility - we only support TTYDISC. */
+ *(int *)data = TTYDISC;
+ return (0);
+ case TIOCGPGRP:
+ if (!tty_is_ctty(tp, td->td_proc))
+ return (ENOTTY);
+
+ if (tp->t_pgrp != NULL)
+ *(int *)data = tp->t_pgrp->pg_id;
+ else
+ *(int *)data = NO_PID;
+ return (0);
+ case TIOCGSID:
+ if (!tty_is_ctty(tp, td->td_proc))
+ return (ENOTTY);
+
+ MPASS(tp->t_session);
+ *(int *)data = tp->t_session->s_sid;
+ return (0);
+ case TIOCSCTTY: {
+ struct proc *p = td->td_proc;
+
+ /* XXX: This looks awful. */
+ tty_unlock(tp);
+ sx_xlock(&proctree_lock);
+ tty_lock(tp);
+
+ if (!SESS_LEADER(p)) {
+ /* Only the session leader may do this. */
+ sx_xunlock(&proctree_lock);
+ return (EPERM);
+ }
+
+ if (tp->t_session != NULL && tp->t_session == p->p_session) {
+ /* This is already our controlling TTY. */
+ sx_xunlock(&proctree_lock);
+ return (0);
+ }
+
+ if (p->p_session->s_ttyp != NULL ||
+ (tp->t_session != NULL && tp->t_session->s_ttyvp != NULL &&
+ tp->t_session->s_ttyvp->v_type != VBAD)) {
+ /*
+ * There is already a relation between a TTY and
+ * a session, or the caller is not the session
+ * leader.
+ *
+ * Allow the TTY to be stolen when the vnode is
+ * invalid, but the reference to the TTY is
+ * still active. This allows immediate reuse of
+ * TTYs of which the session leader has been
+ * killed or the TTY revoked.
+ */
+ sx_xunlock(&proctree_lock);
+ return (EPERM);
+ }
+
+ /* Connect the session to the TTY. */
+ tp->t_session = p->p_session;
+ tp->t_session->s_ttyp = tp;
+ tp->t_sessioncnt++;
+ sx_xunlock(&proctree_lock);
+
+ /* Assign foreground process group. */
+ tp->t_pgrp = p->p_pgrp;
+ PROC_LOCK(p);
+ p->p_flag |= P_CONTROLT;
+ PROC_UNLOCK(p);
+
+ return (0);
+ }
+ case TIOCSPGRP: {
+ struct pgrp *pg;
+
+ /*
+ * XXX: Temporarily unlock the TTY to locate the process
+		 * group. This code would be a lot nicer if we ever
+		 * decomposed proctree_lock.
+ */
+ tty_unlock(tp);
+ sx_slock(&proctree_lock);
+ pg = pgfind(*(int *)data);
+ if (pg != NULL)
+ PGRP_UNLOCK(pg);
+ if (pg == NULL || pg->pg_session != td->td_proc->p_session) {
+ sx_sunlock(&proctree_lock);
+ tty_lock(tp);
+ return (EPERM);
+ }
+ tty_lock(tp);
+
+ /*
+ * Determine if this TTY is the controlling TTY after
+ * relocking the TTY.
+ */
+ if (!tty_is_ctty(tp, td->td_proc)) {
+ sx_sunlock(&proctree_lock);
+ return (ENOTTY);
+ }
+ tp->t_pgrp = pg;
+ sx_sunlock(&proctree_lock);
+
+ /* Wake up the background process groups. */
+ cv_broadcast(&tp->t_bgwait);
+ return (0);
+ }
+ case TIOCFLUSH: {
+ int flags = *(int *)data;
+
+ if (flags == 0)
+ flags = (FREAD|FWRITE);
+ else
+ flags &= (FREAD|FWRITE);
+ tty_flush(tp, flags);
+ return (0);
+ }
+ case TIOCDRAIN:
+ /* Drain TTY output. */
+ return tty_drain(tp);
+ case TIOCCONS:
+ /* Set terminal as console TTY. */
+ if (*(int *)data) {
+ error = priv_check(td, PRIV_TTY_CONSOLE);
+ if (error)
+ return (error);
+
+ /*
+			 * XXX: constty really should be locked!
+ * XXX: allow disconnected constty's to be stolen!
+ */
+
+ if (constty == tp)
+ return (0);
+ if (constty != NULL)
+ return (EBUSY);
+
+ tty_unlock(tp);
+ constty_set(tp);
+ tty_lock(tp);
+ } else if (constty == tp) {
+ constty_clear();
+ }
+ return (0);
+ case TIOCGWINSZ:
+ /* Obtain window size. */
+ *(struct winsize*)data = tp->t_winsize;
+ return (0);
+ case TIOCSWINSZ:
+ /* Set window size. */
+ tty_set_winsize(tp, data);
+ return (0);
+ case TIOCEXCL:
+ tp->t_flags |= TF_EXCLUDE;
+ return (0);
+ case TIOCNXCL:
+ tp->t_flags &= ~TF_EXCLUDE;
+ return (0);
+ case TIOCSTOP:
+ tp->t_flags |= TF_STOPPED;
+ ttydevsw_pktnotify(tp, TIOCPKT_STOP);
+ return (0);
+ case TIOCSTART:
+ tp->t_flags &= ~TF_STOPPED;
+ ttydevsw_outwakeup(tp);
+ ttydevsw_pktnotify(tp, TIOCPKT_START);
+ return (0);
+ case TIOCSTAT:
+ tty_info(tp);
+ return (0);
+ case TIOCSTI:
+ if ((fflag & FREAD) == 0 && priv_check(td, PRIV_TTY_STI))
+ return (EPERM);
+ if (!tty_is_ctty(tp, td->td_proc) &&
+ priv_check(td, PRIV_TTY_STI))
+ return (EACCES);
+ ttydisc_rint(tp, *(char *)data, 0);
+ ttydisc_rint_done(tp);
+ return (0);
+ }
+
+#ifdef COMPAT_43TTY
+ return tty_ioctl_compat(tp, cmd, data, fflag, td);
+#else /* !COMPAT_43TTY */
+ return (ENOIOCTL);
+#endif /* COMPAT_43TTY */
+}
+
+int
+tty_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td)
+{
+ int error;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tty_gone(tp))
+ return (ENXIO);
+
+ error = ttydevsw_ioctl(tp, cmd, data, td);
+ if (error == ENOIOCTL)
+ error = tty_generic_ioctl(tp, cmd, data, fflag, td);
+
+ return (error);
+}
+
+dev_t
+tty_udev(struct tty *tp)
+{
+ if (tp->t_dev)
+ return dev2udev(tp->t_dev);
+ else
+ return NODEV;
+}
+
+int
+tty_checkoutq(struct tty *tp)
+{
+
+ /* 256 bytes should be enough to print a log message. */
+ return (ttyoutq_bytesleft(&tp->t_outq) >= 256);
+}
+
+void
+tty_hiwat_in_block(struct tty *tp)
+{
+
+ if ((tp->t_flags & TF_HIWAT_IN) == 0 &&
+ tp->t_termios.c_iflag & IXOFF &&
+ tp->t_termios.c_cc[VSTOP] != _POSIX_VDISABLE) {
+ /*
+ * Input flow control. Only enter the high watermark when we
+ * can successfully store the VSTOP character.
+ */
+ if (ttyoutq_write_nofrag(&tp->t_outq,
+ &tp->t_termios.c_cc[VSTOP], 1) == 0)
+ tp->t_flags |= TF_HIWAT_IN;
+ } else {
+ /* No input flow control. */
+ tp->t_flags |= TF_HIWAT_IN;
+ }
+}
+
+void
+tty_hiwat_in_unblock(struct tty *tp)
+{
+
+ if (tp->t_flags & TF_HIWAT_IN &&
+ tp->t_termios.c_iflag & IXOFF &&
+ tp->t_termios.c_cc[VSTART] != _POSIX_VDISABLE) {
+ /*
+ * Input flow control. Only leave the high watermark when we
+ * can successfully store the VSTART character.
+ */
+ if (ttyoutq_write_nofrag(&tp->t_outq,
+ &tp->t_termios.c_cc[VSTART], 1) == 0)
+ tp->t_flags &= ~TF_HIWAT_IN;
+ } else {
+ /* No input flow control. */
+ tp->t_flags &= ~TF_HIWAT_IN;
+ }
+
+ if (!tty_gone(tp))
+ ttydevsw_inwakeup(tp);
+}
+
+/*
+ * TTY hooks interface.
+ */
+
+static int
+ttyhook_defrint(struct tty *tp, char c, int flags)
+{
+
+ if (ttyhook_rint_bypass(tp, &c, 1) != 1)
+ return (-1);
+
+ return (0);
+}
+
+int
+ttyhook_register(struct tty **rtp, struct proc *p, int fd,
+ struct ttyhook *th, void *softc)
+{
+ struct tty *tp;
+ struct file *fp;
+ struct cdev *dev;
+ struct cdevsw *cdp;
+ struct filedesc *fdp;
+ cap_rights_t rights;
+ int error, ref;
+
+ /* Validate the file descriptor. */
+ fdp = p->p_fd;
+ error = fget_unlocked(fdp, fd, cap_rights_init(&rights, CAP_TTYHOOK),
+ 0, &fp, NULL);
+ if (error != 0)
+ return (error);
+ if (fp->f_ops == &badfileops) {
+ error = EBADF;
+ goto done1;
+ }
+
+ /*
+ * Make sure the vnode is bound to a character device.
+	 * An unlocked check of the vnode type is ok here, because we
+	 * only need to prevent calling devvn_refthread on a file that
+	 * has never been opened through a character device.
+ */
+ if (fp->f_type != DTYPE_VNODE || fp->f_vnode->v_type != VCHR) {
+ error = EINVAL;
+ goto done1;
+ }
+
+ /* Make sure it is a TTY. */
+ cdp = devvn_refthread(fp->f_vnode, &dev, &ref);
+ if (cdp == NULL) {
+ error = ENXIO;
+ goto done1;
+ }
+ if (dev != fp->f_data) {
+ error = ENXIO;
+ goto done2;
+ }
+ if (cdp != &ttydev_cdevsw) {
+ error = ENOTTY;
+ goto done2;
+ }
+ tp = dev->si_drv1;
+
+ /* Try to attach the hook to the TTY. */
+ error = EBUSY;
+ tty_lock(tp);
+ MPASS((tp->t_hook == NULL) == ((tp->t_flags & TF_HOOK) == 0));
+ if (tp->t_flags & TF_HOOK)
+ goto done3;
+
+ tp->t_flags |= TF_HOOK;
+ tp->t_hook = th;
+ tp->t_hooksoftc = softc;
+ *rtp = tp;
+ error = 0;
+
+ /* Maybe we can switch into bypass mode now. */
+ ttydisc_optimize(tp);
+
+ /* Silently convert rint() calls to rint_bypass() when possible. */
+ if (!ttyhook_hashook(tp, rint) && ttyhook_hashook(tp, rint_bypass))
+ th->th_rint = ttyhook_defrint;
+
+done3: tty_unlock(tp);
+done2: dev_relthread(dev, ref);
+done1: fdrop(fp, curthread);
+ return (error);
+}
+
+void
+ttyhook_unregister(struct tty *tp)
+{
+
+ tty_lock_assert(tp, MA_OWNED);
+ MPASS(tp->t_flags & TF_HOOK);
+
+ /* Disconnect the hook. */
+ tp->t_flags &= ~TF_HOOK;
+ tp->t_hook = NULL;
+
+ /* Maybe we need to leave bypass mode. */
+ ttydisc_optimize(tp);
+
+ /* Maybe deallocate the TTY as well. */
+ tty_rel_free(tp);
+}
+
+/*
+ * /dev/console handling.
+ */
+
+static int
+ttyconsdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+ struct tty *tp;
+
+ /* System has no console device. */
+ if (dev_console_filename == NULL)
+ return (ENXIO);
+
+ /* Look up corresponding TTY by device name. */
+ sx_slock(&tty_list_sx);
+ TAILQ_FOREACH(tp, &tty_list, t_list) {
+ if (strcmp(dev_console_filename, tty_devname(tp)) == 0) {
+ dev_console->si_drv1 = tp;
+ break;
+ }
+ }
+ sx_sunlock(&tty_list_sx);
+
+ /* System console has no TTY associated. */
+ if (dev_console->si_drv1 == NULL)
+ return (ENXIO);
+
+ return (ttydev_open(dev, oflags, devtype, td));
+}
+
+static int
+ttyconsdev_write(struct cdev *dev, struct uio *uio, int ioflag)
+{
+
+ log_console(uio);
+
+ return (ttydev_write(dev, uio, ioflag));
+}
+
+/*
+ * /dev/console is a little different from normal TTYs. When opened,
+ * it determines which TTY to use. When data gets written to it, it
+ * will be logged in the kernel message buffer.
+ */
+static struct cdevsw ttyconsdev_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = ttyconsdev_open,
+ .d_close = ttydev_close,
+ .d_read = ttydev_read,
+ .d_write = ttyconsdev_write,
+ .d_ioctl = ttydev_ioctl,
+ .d_kqfilter = ttydev_kqfilter,
+ .d_poll = ttydev_poll,
+ .d_mmap = ttydev_mmap,
+ .d_name = "ttyconsdev",
+ .d_flags = D_TTY,
+};
+
+static void
+ttyconsdev_init(void *unused)
+{
+
+ dev_console = make_dev_credf(MAKEDEV_ETERNAL, &ttyconsdev_cdevsw, 0,
+ NULL, UID_ROOT, GID_WHEEL, 0600, "console");
+}
+
+SYSINIT(tty, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyconsdev_init, NULL);
+
+void
+ttyconsdev_select(const char *name)
+{
+
+ dev_console_filename = name;
+}
+
+/*
+ * Debugging routines.
+ */
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+#include <ddb/db_sym.h>
+
+static struct {
+ int flag;
+ char val;
+} ttystates[] = {
+#if 0
+ { TF_NOPREFIX, 'N' },
+#endif
+ { TF_INITLOCK, 'I' },
+ { TF_CALLOUT, 'C' },
+
+ /* Keep these together -> 'Oi' and 'Oo'. */
+ { TF_OPENED, 'O' },
+ { TF_OPENED_IN, 'i' },
+ { TF_OPENED_OUT, 'o' },
+ { TF_OPENED_CONS, 'c' },
+
+ { TF_GONE, 'G' },
+ { TF_OPENCLOSE, 'B' },
+ { TF_ASYNC, 'Y' },
+ { TF_LITERAL, 'L' },
+
+ /* Keep these together -> 'Hi' and 'Ho'. */
+ { TF_HIWAT, 'H' },
+ { TF_HIWAT_IN, 'i' },
+ { TF_HIWAT_OUT, 'o' },
+
+ { TF_STOPPED, 'S' },
+ { TF_EXCLUDE, 'X' },
+ { TF_BYPASS, 'l' },
+ { TF_ZOMBIE, 'Z' },
+ { TF_HOOK, 's' },
+
+ /* Keep these together -> 'bi' and 'bo'. */
+ { TF_BUSY, 'b' },
+ { TF_BUSY_IN, 'i' },
+ { TF_BUSY_OUT, 'o' },
+
+ { 0, '\0'},
+};
+
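+/* Flag-name string consumed by the %b conversion in db_show_tty() below. */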
+#define TTY_FLAG_BITS \
+ "\20\1NOPREFIX\2INITLOCK\3CALLOUT\4OPENED_IN\5OPENED_OUT\6GONE" \
+ "\7OPENCLOSE\10ASYNC\11LITERAL\12HIWAT_IN\13HIWAT_OUT\14STOPPED" \
+ "\15EXCLUDE\16BYPASS\17ZOMBIE\20HOOK"
+
+#define DB_PRINTSYM(name, addr) \
+ db_printf("%s " #name ": ", sep); \
+ db_printsym((db_addr_t) addr, DB_STGY_ANY); \
+ db_printf("\n");
+
+static void
+_db_show_devsw(const char *sep, const struct ttydevsw *tsw)
+{
+ db_printf("%sdevsw: ", sep);
+ db_printsym((db_addr_t)tsw, DB_STGY_ANY);
+ db_printf(" (%p)\n", tsw);
+ DB_PRINTSYM(open, tsw->tsw_open);
+ DB_PRINTSYM(close, tsw->tsw_close);
+ DB_PRINTSYM(outwakeup, tsw->tsw_outwakeup);
+ DB_PRINTSYM(inwakeup, tsw->tsw_inwakeup);
+ DB_PRINTSYM(ioctl, tsw->tsw_ioctl);
+ DB_PRINTSYM(param, tsw->tsw_param);
+ DB_PRINTSYM(modem, tsw->tsw_modem);
+ DB_PRINTSYM(mmap, tsw->tsw_mmap);
+ DB_PRINTSYM(pktnotify, tsw->tsw_pktnotify);
+ DB_PRINTSYM(free, tsw->tsw_free);
+}
+static void
+_db_show_hooks(const char *sep, const struct ttyhook *th)
+{
+ db_printf("%shook: ", sep);
+ db_printsym((db_addr_t)th, DB_STGY_ANY);
+ db_printf(" (%p)\n", th);
+ if (th == NULL)
+ return;
+ DB_PRINTSYM(rint, th->th_rint);
+ DB_PRINTSYM(rint_bypass, th->th_rint_bypass);
+ DB_PRINTSYM(rint_done, th->th_rint_done);
+ DB_PRINTSYM(rint_poll, th->th_rint_poll);
+ DB_PRINTSYM(getc_inject, th->th_getc_inject);
+ DB_PRINTSYM(getc_capture, th->th_getc_capture);
+ DB_PRINTSYM(getc_poll, th->th_getc_poll);
+ DB_PRINTSYM(close, th->th_close);
+}
+
+static void
+_db_show_termios(const char *name, const struct termios *t)
+{
+
+ db_printf("%s: iflag 0x%x oflag 0x%x cflag 0x%x "
+ "lflag 0x%x ispeed %u ospeed %u\n", name,
+ t->c_iflag, t->c_oflag, t->c_cflag, t->c_lflag,
+ t->c_ispeed, t->c_ospeed);
+}
+
+/* DDB command to show TTY statistics. */
+DB_SHOW_COMMAND(tty, db_show_tty)
+{
+ struct tty *tp;
+
+ if (!have_addr) {
+ db_printf("usage: show tty <addr>\n");
+ return;
+ }
+ tp = (struct tty *)addr;
+
+ db_printf("0x%p: %s\n", tp, tty_devname(tp));
+ db_printf("\tmtx: %p\n", tp->t_mtx);
+ db_printf("\tflags: %b\n", tp->t_flags, TTY_FLAG_BITS);
+ db_printf("\trevokecnt: %u\n", tp->t_revokecnt);
+
+ /* Buffering mechanisms. */
+ db_printf("\tinq: %p begin %u linestart %u reprint %u end %u "
+ "nblocks %u quota %u\n", &tp->t_inq, tp->t_inq.ti_begin,
+ tp->t_inq.ti_linestart, tp->t_inq.ti_reprint, tp->t_inq.ti_end,
+ tp->t_inq.ti_nblocks, tp->t_inq.ti_quota);
+ db_printf("\toutq: %p begin %u end %u nblocks %u quota %u\n",
+ &tp->t_outq, tp->t_outq.to_begin, tp->t_outq.to_end,
+ tp->t_outq.to_nblocks, tp->t_outq.to_quota);
+ db_printf("\tinlow: %zu\n", tp->t_inlow);
+ db_printf("\toutlow: %zu\n", tp->t_outlow);
+ _db_show_termios("\ttermios", &tp->t_termios);
+ db_printf("\twinsize: row %u col %u xpixel %u ypixel %u\n",
+ tp->t_winsize.ws_row, tp->t_winsize.ws_col,
+ tp->t_winsize.ws_xpixel, tp->t_winsize.ws_ypixel);
+ db_printf("\tcolumn: %u\n", tp->t_column);
+ db_printf("\twritepos: %u\n", tp->t_writepos);
+ db_printf("\tcompatflags: 0x%x\n", tp->t_compatflags);
+
+ /* Init/lock-state devices. */
+ _db_show_termios("\ttermios_init_in", &tp->t_termios_init_in);
+ _db_show_termios("\ttermios_init_out", &tp->t_termios_init_out);
+ _db_show_termios("\ttermios_lock_in", &tp->t_termios_lock_in);
+ _db_show_termios("\ttermios_lock_out", &tp->t_termios_lock_out);
+
+ /* Hooks */
+ _db_show_devsw("\t", tp->t_devsw);
+ _db_show_hooks("\t", tp->t_hook);
+
+ /* Process info. */
+ db_printf("\tpgrp: %p gid %d jobc %d\n", tp->t_pgrp,
+ tp->t_pgrp ? tp->t_pgrp->pg_id : 0,
+ tp->t_pgrp ? tp->t_pgrp->pg_jobc : 0);
+ db_printf("\tsession: %p", tp->t_session);
+ if (tp->t_session != NULL)
+ db_printf(" count %u leader %p tty %p sid %d login %s",
+ tp->t_session->s_count, tp->t_session->s_leader,
+ tp->t_session->s_ttyp, tp->t_session->s_sid,
+ tp->t_session->s_login);
+ db_printf("\n");
+ db_printf("\tsessioncnt: %u\n", tp->t_sessioncnt);
+ db_printf("\tdevswsoftc: %p\n", tp->t_devswsoftc);
+ db_printf("\thooksoftc: %p\n", tp->t_hooksoftc);
+ db_printf("\tdev: %p\n", tp->t_dev);
+}
+
+/* DDB command to list TTYs. */
+DB_SHOW_ALL_COMMAND(ttys, db_show_all_ttys)
+{
+ struct tty *tp;
+ size_t isiz, osiz;
+ int i, j;
+
+ /* Make the output look like `pstat -t'. */
+ db_printf("PTR ");
+#if defined(__LP64__)
+ db_printf(" ");
+#endif
+ db_printf(" LINE INQ CAN LIN LOW OUTQ USE LOW "
+ "COL SESS PGID STATE\n");
+
+ TAILQ_FOREACH(tp, &tty_list, t_list) {
+ isiz = tp->t_inq.ti_nblocks * TTYINQ_DATASIZE;
+ osiz = tp->t_outq.to_nblocks * TTYOUTQ_DATASIZE;
+
+ db_printf("%p %10s %5zu %4u %4u %4zu %5zu %4u %4zu %5u %5d %5d ",
+ tp,
+ tty_devname(tp),
+ isiz,
+ tp->t_inq.ti_linestart - tp->t_inq.ti_begin,
+ tp->t_inq.ti_end - tp->t_inq.ti_linestart,
+ isiz - tp->t_inlow,
+ osiz,
+ tp->t_outq.to_end - tp->t_outq.to_begin,
+ osiz - tp->t_outlow,
+ MIN(tp->t_column, 99999),
+ tp->t_session ? tp->t_session->s_sid : 0,
+ tp->t_pgrp ? tp->t_pgrp->pg_id : 0);
+
+ /* Flag bits. */
+ for (i = j = 0; ttystates[i].flag; i++)
+ if (tp->t_flags & ttystates[i].flag) {
+ db_printf("%c", ttystates[i].val);
+ j++;
+ }
+ if (j == 0)
+ db_printf("-");
+ db_printf("\n");
+ }
+}
+#endif /* DDB */
diff --git a/sys/kern/tty_compat.c b/sys/kern/tty_compat.c
new file mode 100644
index 0000000..6dce01d
--- /dev/null
+++ b/sys/kern/tty_compat.c
@@ -0,0 +1,484 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tty_compat.c 8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+/*
+ * mapping routines for old line discipline (yuck)
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ioctl_compat.h>
+#include <sys/tty.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+struct speedtab {
+ int sp_speed; /* Speed. */
+ int sp_code; /* Code. */
+};
+
+static int ttcompatgetflags(struct tty *tp);
+static void ttcompatsetflags(struct tty *tp, struct termios *t);
+static void ttcompatsetlflags(struct tty *tp, struct termios *t);
+static int ttcompatspeedtab(int speed, struct speedtab *table);
+
+static int ttydebug = 0;
+SYSCTL_INT(_debug, OID_AUTO, ttydebug, CTLFLAG_RW, &ttydebug, 0, "");
+
+static struct speedtab compatspeeds[] = {
+#define MAX_SPEED 17
+ { 115200, 17 },
+ { 57600, 16 },
+ { 38400, 15 },
+ { 19200, 14 },
+ { 9600, 13 },
+ { 4800, 12 },
+ { 2400, 11 },
+ { 1800, 10 },
+ { 1200, 9 },
+ { 600, 8 },
+ { 300, 7 },
+ { 200, 6 },
+ { 150, 5 },
+ { 134, 4 },
+ { 110, 3 },
+ { 75, 2 },
+ { 50, 1 },
+ { 0, 0 },
+ { -1, -1 },
+};
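+/* compatspcodes[code] maps an sgtty speed code back to a baud rate. */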
+static int compatspcodes[] = {
+ 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200,
+ 1800, 2400, 4800, 9600, 19200, 38400, 57600, 115200,
+};
+
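+/*
+ * Map a termios baud rate to the nearest sgtty speed code, rounding
+ * down; 0 stays 0 (hangup) and rates below 50 map to code 1.
+ */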
+static int
+ttcompatspeedtab(int speed, struct speedtab *table)
+{
+ if (speed == 0)
+ return (0); /* hangup */
+ for ( ; table->sp_speed > 0; table++)
+ if (table->sp_speed <= speed) /* nearest one, rounded down */
+ return (table->sp_code);
+ return (1); /* 50, min and not hangup */
+}
+
+static int
+ttsetcompat(struct tty *tp, u_long *com, caddr_t data, struct termios *term)
+{
+ switch (*com) {
+ case TIOCSETP:
+ case TIOCSETN: {
+ struct sgttyb *sg = (struct sgttyb *)data;
+ int speed;
+
+ if ((speed = sg->sg_ispeed) > MAX_SPEED || speed < 0)
+ return(EINVAL);
+ else if (speed != ttcompatspeedtab(tp->t_termios.c_ispeed,
+ compatspeeds))
+ term->c_ispeed = compatspcodes[speed];
+ else
+ term->c_ispeed = tp->t_termios.c_ispeed;
+ if ((speed = sg->sg_ospeed) > MAX_SPEED || speed < 0)
+ return(EINVAL);
+ else if (speed != ttcompatspeedtab(tp->t_termios.c_ospeed,
+ compatspeeds))
+ term->c_ospeed = compatspcodes[speed];
+ else
+ term->c_ospeed = tp->t_termios.c_ospeed;
+ term->c_cc[VERASE] = sg->sg_erase;
+ term->c_cc[VKILL] = sg->sg_kill;
+ tp->t_compatflags = (tp->t_compatflags&0xffff0000) |
+ (sg->sg_flags&0xffff);
+ ttcompatsetflags(tp, term);
+ *com = (*com == TIOCSETP) ? TIOCSETAF : TIOCSETA;
+ break;
+ }
+ case TIOCSETC: {
+ struct tchars *tc = (struct tchars *)data;
+ cc_t *cc;
+
+ cc = term->c_cc;
+ cc[VINTR] = tc->t_intrc;
+ cc[VQUIT] = tc->t_quitc;
+ cc[VSTART] = tc->t_startc;
+ cc[VSTOP] = tc->t_stopc;
+ cc[VEOF] = tc->t_eofc;
+ cc[VEOL] = tc->t_brkc;
+ if (tc->t_brkc == (char)_POSIX_VDISABLE)
+ cc[VEOL2] = _POSIX_VDISABLE;
+ *com = TIOCSETA;
+ break;
+ }
+ case TIOCSLTC: {
+ struct ltchars *ltc = (struct ltchars *)data;
+ cc_t *cc;
+
+ cc = term->c_cc;
+ cc[VSUSP] = ltc->t_suspc;
+ cc[VDSUSP] = ltc->t_dsuspc;
+ cc[VREPRINT] = ltc->t_rprntc;
+ cc[VDISCARD] = ltc->t_flushc;
+ cc[VWERASE] = ltc->t_werasc;
+ cc[VLNEXT] = ltc->t_lnextc;
+ *com = TIOCSETA;
+ break;
+ }
+ case TIOCLBIS:
+ case TIOCLBIC:
+ case TIOCLSET:
+ if (*com == TIOCLSET)
+ tp->t_compatflags = (tp->t_compatflags&0xffff) |
+ *(int *)data<<16;
+ else {
+ tp->t_compatflags = (ttcompatgetflags(tp)&0xffff0000) |
+ (tp->t_compatflags&0xffff);
+ if (*com == TIOCLBIS)
+ tp->t_compatflags |= *(int *)data<<16;
+ else
+ tp->t_compatflags &= ~(*(int *)data<<16);
+ }
+ ttcompatsetlflags(tp, term);
+ *com = TIOCSETA;
+ break;
+ }
+ return 0;
+}
+
+/*ARGSUSED*/
+int
+tty_ioctl_compat(struct tty *tp, u_long com, caddr_t data, int fflag,
+ struct thread *td)
+{
+ switch (com) {
+ case TIOCSETP:
+ case TIOCSETN:
+ case TIOCSETC:
+ case TIOCSLTC:
+ case TIOCLBIS:
+ case TIOCLBIC:
+ case TIOCLSET: {
+ struct termios term;
+ int error;
+
+ term = tp->t_termios;
+ if ((error = ttsetcompat(tp, &com, data, &term)) != 0)
+ return error;
+ return tty_ioctl(tp, com, &term, fflag, td);
+ }
+ case TIOCGETP: {
+ struct sgttyb *sg = (struct sgttyb *)data;
+ cc_t *cc = tp->t_termios.c_cc;
+
+ sg->sg_ospeed = ttcompatspeedtab(tp->t_termios.c_ospeed,
+ compatspeeds);
+ if (tp->t_termios.c_ispeed == 0)
+ sg->sg_ispeed = sg->sg_ospeed;
+ else
+ sg->sg_ispeed = ttcompatspeedtab(tp->t_termios.c_ispeed,
+ compatspeeds);
+ sg->sg_erase = cc[VERASE];
+ sg->sg_kill = cc[VKILL];
+ sg->sg_flags = tp->t_compatflags = ttcompatgetflags(tp);
+ break;
+ }
+ case TIOCGETC: {
+ struct tchars *tc = (struct tchars *)data;
+ cc_t *cc = tp->t_termios.c_cc;
+
+ tc->t_intrc = cc[VINTR];
+ tc->t_quitc = cc[VQUIT];
+ tc->t_startc = cc[VSTART];
+ tc->t_stopc = cc[VSTOP];
+ tc->t_eofc = cc[VEOF];
+ tc->t_brkc = cc[VEOL];
+ break;
+ }
+ case TIOCGLTC: {
+ struct ltchars *ltc = (struct ltchars *)data;
+ cc_t *cc = tp->t_termios.c_cc;
+
+ ltc->t_suspc = cc[VSUSP];
+ ltc->t_dsuspc = cc[VDSUSP];
+ ltc->t_rprntc = cc[VREPRINT];
+ ltc->t_flushc = cc[VDISCARD];
+ ltc->t_werasc = cc[VWERASE];
+ ltc->t_lnextc = cc[VLNEXT];
+ break;
+ }
+ case TIOCLGET:
+ tp->t_compatflags =
+ (ttcompatgetflags(tp) & 0xffff0000UL)
+ | (tp->t_compatflags & 0xffff);
+ *(int *)data = tp->t_compatflags>>16;
+ if (ttydebug)
+ printf("CLGET: returning %x\n", *(int *)data);
+ break;
+
+ case OTIOCGETD:
+ *(int *)data = 2;
+ break;
+
+ case OTIOCSETD: {
+ int ldisczero = 0;
+
+ return (tty_ioctl(tp, TIOCSETD,
+ *(int *)data == 2 ? (caddr_t)&ldisczero : data,
+ fflag, td));
+ }
+
+ case OTIOCCONS:
+ *(int *)data = 1;
+ return (tty_ioctl(tp, TIOCCONS, data, fflag, td));
+
+ default:
+ return (ENOIOCTL);
+ }
+ return (0);
+}
+
+static int
+ttcompatgetflags(struct tty *tp)
+{
+ tcflag_t iflag = tp->t_termios.c_iflag;
+ tcflag_t lflag = tp->t_termios.c_lflag;
+ tcflag_t oflag = tp->t_termios.c_oflag;
+ tcflag_t cflag = tp->t_termios.c_cflag;
+ int flags = 0;
+
+ if (iflag&IXOFF)
+ flags |= TANDEM;
+ if (iflag&ICRNL || oflag&ONLCR)
+ flags |= CRMOD;
+ if ((cflag&CSIZE) == CS8) {
+ flags |= PASS8;
+ if (iflag&ISTRIP)
+ flags |= ANYP;
+ }
+ else if (cflag&PARENB) {
+ if (iflag&INPCK) {
+ if (cflag&PARODD)
+ flags |= ODDP;
+ else
+ flags |= EVENP;
+ } else
+ flags |= EVENP | ODDP;
+ }
+
+ if ((lflag&ICANON) == 0) {
+ /* fudge */
+ if (iflag&(INPCK|ISTRIP|IXON) || lflag&(IEXTEN|ISIG)
+ || (cflag&(CSIZE|PARENB)) != CS8)
+ flags |= CBREAK;
+ else
+ flags |= RAW;
+ }
+ if (!(flags&RAW) && !(oflag&OPOST) && (cflag&(CSIZE|PARENB)) == CS8)
+ flags |= LITOUT;
+ if (cflag&MDMBUF)
+ flags |= MDMBUF;
+ if ((cflag&HUPCL) == 0)
+ flags |= NOHANG;
+ if (oflag&TAB3)
+ flags |= XTABS;
+ if (lflag&ECHOE)
+ flags |= CRTERA|CRTBS;
+ if (lflag&ECHOKE)
+ flags |= CRTKIL|CRTBS;
+ if (lflag&ECHOPRT)
+ flags |= PRTERA;
+ if (lflag&ECHOCTL)
+ flags |= CTLECH;
+ if ((iflag&IXANY) == 0)
+ flags |= DECCTQ;
+ flags |= lflag&(ECHO|TOSTOP|FLUSHO|PENDIN|NOFLSH);
+ if (ttydebug)
+ printf("getflags: %x\n", flags);
+ return (flags);
+}
+
+static void
+ttcompatsetflags(struct tty *tp, struct termios *t)
+{
+ int flags = tp->t_compatflags;
+ tcflag_t iflag = t->c_iflag;
+ tcflag_t oflag = t->c_oflag;
+ tcflag_t lflag = t->c_lflag;
+ tcflag_t cflag = t->c_cflag;
+
+ if (flags & RAW) {
+ iflag = IGNBRK;
+ lflag &= ~(ECHOCTL|ISIG|ICANON|IEXTEN);
+ } else {
+ iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR);
+ iflag |= BRKINT|IXON|IMAXBEL;
+ lflag |= ISIG|IEXTEN|ECHOCTL; /* XXX was echoctl on ? */
+ if (flags & XTABS)
+ oflag |= TAB3;
+ else
+ oflag &= ~TAB3;
+ if (flags & CBREAK)
+ lflag &= ~ICANON;
+ else
+ lflag |= ICANON;
+ if (flags&CRMOD) {
+ iflag |= ICRNL;
+ oflag |= ONLCR;
+ } else {
+ iflag &= ~ICRNL;
+ oflag &= ~ONLCR;
+ }
+ }
+ if (flags&ECHO)
+ lflag |= ECHO;
+ else
+ lflag &= ~ECHO;
+
+ cflag &= ~(CSIZE|PARENB);
+ if (flags&(RAW|LITOUT|PASS8)) {
+ cflag |= CS8;
+ if (!(flags&(RAW|PASS8))
+ || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP))
+ iflag |= ISTRIP;
+ else
+ iflag &= ~ISTRIP;
+ if (flags&(RAW|LITOUT))
+ oflag &= ~OPOST;
+ else
+ oflag |= OPOST;
+ } else {
+ cflag |= CS7|PARENB;
+ iflag |= ISTRIP;
+ oflag |= OPOST;
+ }
+ /* XXX don't set INPCK if RAW or PASS8? */
+ if ((flags&(EVENP|ODDP)) == EVENP) {
+ iflag |= INPCK;
+ cflag &= ~PARODD;
+ } else if ((flags&(EVENP|ODDP)) == ODDP) {
+ iflag |= INPCK;
+ cflag |= PARODD;
+ } else
+ iflag &= ~INPCK;
+ if (flags&TANDEM)
+ iflag |= IXOFF;
+ else
+ iflag &= ~IXOFF;
+ if ((flags&DECCTQ) == 0)
+ iflag |= IXANY;
+ else
+ iflag &= ~IXANY;
+ t->c_iflag = iflag;
+ t->c_oflag = oflag;
+ t->c_lflag = lflag;
+ t->c_cflag = cflag;
+}
+
+static void
+ttcompatsetlflags(struct tty *tp, struct termios *t)
+{
+ int flags = tp->t_compatflags;
+ tcflag_t iflag = t->c_iflag;
+ tcflag_t oflag = t->c_oflag;
+ tcflag_t lflag = t->c_lflag;
+ tcflag_t cflag = t->c_cflag;
+
+ iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR);
+ if (flags&CRTERA)
+ lflag |= ECHOE;
+ else
+ lflag &= ~ECHOE;
+ if (flags&CRTKIL)
+ lflag |= ECHOKE;
+ else
+ lflag &= ~ECHOKE;
+ if (flags&PRTERA)
+ lflag |= ECHOPRT;
+ else
+ lflag &= ~ECHOPRT;
+ if (flags&CTLECH)
+ lflag |= ECHOCTL;
+ else
+ lflag &= ~ECHOCTL;
+ if (flags&TANDEM)
+ iflag |= IXOFF;
+ else
+ iflag &= ~IXOFF;
+ if ((flags&DECCTQ) == 0)
+ iflag |= IXANY;
+ else
+ iflag &= ~IXANY;
+ if (flags & MDMBUF)
+ cflag |= MDMBUF;
+ else
+ cflag &= ~MDMBUF;
+ if (flags&NOHANG)
+ cflag &= ~HUPCL;
+ else
+ cflag |= HUPCL;
+ lflag &= ~(TOSTOP|FLUSHO|PENDIN|NOFLSH);
+ lflag |= flags&(TOSTOP|FLUSHO|PENDIN|NOFLSH);
+
+ /*
+ * The next if-else statement is copied from above so don't bother
+	 * checking it separately. We could avoid fiddling with the
+	 * character size if the mode is already RAW or if neither the
+	 * LITOUT bit nor the PASS8 bit is being changed, but the delta of
+ * the change is not available here and skipping the RAW case would
+ * make the code different from above.
+ */
+ cflag &= ~(CSIZE|PARENB);
+ if (flags&(RAW|LITOUT|PASS8)) {
+ cflag |= CS8;
+ if (!(flags&(RAW|PASS8))
+ || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP))
+ iflag |= ISTRIP;
+ else
+ iflag &= ~ISTRIP;
+ if (flags&(RAW|LITOUT))
+ oflag &= ~OPOST;
+ else
+ oflag |= OPOST;
+ } else {
+ cflag |= CS7|PARENB;
+ iflag |= ISTRIP;
+ oflag |= OPOST;
+ }
+ t->c_iflag = iflag;
+ t->c_oflag = oflag;
+ t->c_lflag = lflag;
+ t->c_cflag = cflag;
+}
diff --git a/sys/kern/tty_info.c b/sys/kern/tty_info.c
new file mode 100644
index 0000000..6849d0b
--- /dev/null
+++ b/sys/kern/tty_info.c
@@ -0,0 +1,313 @@
+/*-
+ * Copyright (c) 1982, 1986, 1990, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Copyright (c) 2002 Networks Associates Technologies, Inc.
+ * All rights reserved.
+ *
+ * Portions of this software were developed for the FreeBSD Project by
+ * ThinkSec AS and NAI Labs, the Security Research Division of Network
+ * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035
+ * ("CBOSS"), as part of the DARPA CHATS research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/systm.h>
+#include <sys/tty.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+/*
+ * Returns 1 if p2 is "better" than p1
+ *
+ * The algorithm for picking the "interesting" process is thus:
+ *
+ * 1) Only foreground processes are eligible - implied.
+ * 2) Runnable processes are favored over anything else. The runner
+ * with the highest cpu utilization is picked (p_estcpu). Ties are
+ * broken by picking the highest pid.
+ * 3) The sleeper with the shortest sleep time is next. With ties,
+ * we pick out just "short-term" sleepers (P_SINTR == 0).
+ * 4) Further ties are broken by picking the highest pid.
+ */
+
+#define TESTAB(a, b) ((a)<<1 | (b))
+#define ONLYA 2
+#define ONLYB 1
+#define BOTH 3
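+/*
+ * TESTAB() packs two truth values into two bits so a switch can tell
+ * whether only a, only b, or both hold.
+ */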
+
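+/*
+ * Return whether any thread in p is runnable and sum the pctcpu of
+ * all its threads into *estcpup.
+ */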
+static int
+proc_sum(struct proc *p, fixpt_t *estcpup)
+{
+ struct thread *td;
+ int estcpu;
+ int val;
+
+ val = 0;
+ estcpu = 0;
+ FOREACH_THREAD_IN_PROC(p, td) {
+ thread_lock(td);
+ if (TD_ON_RUNQ(td) ||
+ TD_IS_RUNNING(td))
+ val = 1;
+ estcpu += sched_pctcpu(td);
+ thread_unlock(td);
+ }
+ *estcpup = estcpu;
+
+ return (val);
+}
+
+static int
+thread_compare(struct thread *td, struct thread *td2)
+{
+ int runa, runb;
+ int slpa, slpb;
+ fixpt_t esta, estb;
+
+ if (td == NULL)
+ return (1);
+
+ /*
+	 * Fetch running stats, pctcpu usage, and interruptible flag.
+ */
+ thread_lock(td);
+ runa = TD_IS_RUNNING(td) | TD_ON_RUNQ(td);
+ slpa = td->td_flags & TDF_SINTR;
+ esta = sched_pctcpu(td);
+ thread_unlock(td);
+ thread_lock(td2);
+ runb = TD_IS_RUNNING(td2) | TD_ON_RUNQ(td2);
+ estb = sched_pctcpu(td2);
+ slpb = td2->td_flags & TDF_SINTR;
+ thread_unlock(td2);
+ /*
+ * see if at least one of them is runnable
+ */
+ switch (TESTAB(runa, runb)) {
+ case ONLYA:
+ return (0);
+ case ONLYB:
+ return (1);
+ case BOTH:
+ break;
+ }
+ /*
+ * favor one with highest recent cpu utilization
+ */
+ if (estb > esta)
+ return (1);
+ if (esta > estb)
+ return (0);
+ /*
+ * favor one sleeping in a non-interruptible sleep
+ */
+ switch (TESTAB(slpa, slpb)) {
+ case ONLYA:
+ return (0);
+ case ONLYB:
+ return (1);
+ case BOTH:
+ break;
+ }
+
+ return (td < td2);
+}
+
+static int
+proc_compare(struct proc *p1, struct proc *p2)
+{
+
+ int runa, runb;
+ fixpt_t esta, estb;
+
+ if (p1 == NULL)
+ return (1);
+
+ /*
+ * Fetch various stats about these processes. After we drop the
+ * lock the information could be stale but the race is unimportant.
+ */
+ PROC_LOCK(p1);
+ runa = proc_sum(p1, &esta);
+ PROC_UNLOCK(p1);
+ PROC_LOCK(p2);
+ runb = proc_sum(p2, &estb);
+ PROC_UNLOCK(p2);
+
+ /*
+ * see if at least one of them is runnable
+ */
+ switch (TESTAB(runa, runb)) {
+ case ONLYA:
+ return (0);
+ case ONLYB:
+ return (1);
+ case BOTH:
+ break;
+ }
+ /*
+ * favor one with highest recent cpu utilization
+ */
+ if (estb > esta)
+ return (1);
+ if (esta > estb)
+ return (0);
+ /*
+ * weed out zombies
+ */
+ switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) {
+ case ONLYA:
+ return (1);
+ case ONLYB:
+ return (0);
+ case BOTH:
+ break;
+ }
+
+ return (p2->p_pid > p1->p_pid); /* tie - return highest pid */
+}
+
+/*
+ * Report on state of foreground process group.
+ */
+void
+tty_info(struct tty *tp)
+{
+ struct timeval rtime, utime, stime;
+ struct proc *p, *ppick;
+ struct thread *td, *tdpick;
+ const char *stateprefix, *state;
+ long rss;
+ int load, pctcpu;
+ pid_t pid;
+ char comm[MAXCOMLEN + 1];
+ struct rusage ru;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tty_checkoutq(tp) == 0)
+ return;
+
+ /* Print load average. */
+ load = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT;
+ ttyprintf(tp, "%sload: %d.%02d ", tp->t_column == 0 ? "" : "\n",
+ load / 100, load % 100);
+
+ if (tp->t_session == NULL) {
+ ttyprintf(tp, "not a controlling terminal\n");
+ return;
+ }
+ if (tp->t_pgrp == NULL) {
+ ttyprintf(tp, "no foreground process group\n");
+ return;
+ }
+ PGRP_LOCK(tp->t_pgrp);
+ if (LIST_EMPTY(&tp->t_pgrp->pg_members)) {
+ PGRP_UNLOCK(tp->t_pgrp);
+ ttyprintf(tp, "empty foreground process group\n");
+ return;
+ }
+
+ /*
+ * Pick the most interesting process and copy some of its
+ * state for printing later. This operation could rely on stale
+ * data as we can't hold the proc slock or thread locks over the
+ * whole list. However, we're guaranteed not to reference an exited
+ * thread or proc since we hold the tty locked.
+ */
+ p = NULL;
+ LIST_FOREACH(ppick, &tp->t_pgrp->pg_members, p_pglist)
+ if (proc_compare(p, ppick))
+ p = ppick;
+
+ PROC_LOCK(p);
+ PGRP_UNLOCK(tp->t_pgrp);
+ td = NULL;
+ FOREACH_THREAD_IN_PROC(p, tdpick)
+ if (thread_compare(td, tdpick))
+ td = tdpick;
+ stateprefix = "";
+ thread_lock(td);
+ if (TD_IS_RUNNING(td))
+ state = "running";
+ else if (TD_ON_RUNQ(td) || TD_CAN_RUN(td))
+ state = "runnable";
+ else if (TD_IS_SLEEPING(td)) {
+ /* XXX: If we're sleeping, are we ever not in a queue? */
+ if (TD_ON_SLEEPQ(td))
+ state = td->td_wmesg;
+ else
+ state = "sleeping without queue";
+ } else if (TD_ON_LOCK(td)) {
+ state = td->td_lockname;
+ stateprefix = "*";
+ } else if (TD_IS_SUSPENDED(td))
+ state = "suspended";
+ else if (TD_AWAITING_INTR(td))
+ state = "intrwait";
+ else if (p->p_state == PRS_ZOMBIE)
+ state = "zombie";
+ else
+ state = "unknown";
+ pctcpu = (sched_pctcpu(td) * 10000 + FSCALE / 2) >> FSHIFT;
+ thread_unlock(td);
+ if (p->p_state == PRS_NEW || p->p_state == PRS_ZOMBIE)
+ rss = 0;
+ else
+ rss = pgtok(vmspace_resident_count(p->p_vmspace));
+ microuptime(&rtime);
+ timevalsub(&rtime, &p->p_stats->p_start);
+ rufetchcalc(p, &ru, &utime, &stime);
+ pid = p->p_pid;
+ strlcpy(comm, p->p_comm, sizeof comm);
+ PROC_UNLOCK(p);
+
+ /* Print command, pid, state, rtime, utime, stime, %cpu, and rss. */
+ ttyprintf(tp,
+ " cmd: %s %d [%s%s] %ld.%02ldr %ld.%02ldu %ld.%02lds %d%% %ldk\n",
+ comm, pid, stateprefix, state,
+ (long)rtime.tv_sec, rtime.tv_usec / 10000,
+ (long)utime.tv_sec, utime.tv_usec / 10000,
+ (long)stime.tv_sec, stime.tv_usec / 10000,
+ pctcpu / 100, rss);
+}
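+
+/*
+ * Illustrative sketch (hypothetical helper, not part of the original
+ * code): the conversions in tty_info() turn FSCALE-scaled fixed-point
+ * values into hundredths by multiplying, adding FSCALE / 2 to round
+ * and shifting right by FSHIFT.  With the usual FSHIFT = 11
+ * (FSCALE = 2048), a load average of 1.50 is stored as 3072 and
+ * yields (3072 * 100 + 1024) >> 11 = 150, printed as "1.50".
+ */
+static __inline int
+fixpt_to_hundredths(fixpt_t f)
+{
+
+ return ((f * 100 + FSCALE / 2) >> FSHIFT);
+}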
diff --git a/sys/kern/tty_inq.c b/sys/kern/tty_inq.c
new file mode 100644
index 0000000..97017ac
--- /dev/null
+++ b/sys/kern/tty_inq.c
@@ -0,0 +1,489 @@
+/*-
+ * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions of this software were developed under sponsorship from Snow
+ * B.V., the Netherlands.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/tty.h>
+#include <sys/uio.h>
+
+#include <vm/uma.h>
+
+/*
+ * TTY input queue buffering.
+ *
+ * Unlike the output queue, the input queue has more features that are
+ * needed to properly implement various features offered by the TTY
+ * interface:
+ *
+ * - Data can be removed from the tail of the queue, which is used to
+ * implement backspace.
+ * - Once in a while, input has to be `canonicalized'. When ICANON is
+ * turned on, this will be done after a CR has been inserted.
+ * Otherwise, it should be done after any character has been inserted.
+ * - The input queue can store one bit per byte, called the quoting bit.
+ * This bit is used by TTYDISC to make backspace work on quoted
+ * characters.
+ *
+ * In most cases, there is probably less input than output, so unlike
+ * the outq, we'll stick to 128-byte blocks here.
+ */
+
+static int ttyinq_flush_secure = 1;
+SYSCTL_INT(_kern, OID_AUTO, tty_inq_flush_secure, CTLFLAG_RW,
+ &ttyinq_flush_secure, 0, "Zero buffers while flushing");
+
+#define TTYINQ_QUOTESIZE (TTYINQ_DATASIZE / BMSIZE)
+#define BMSIZE 32
+#define GETBIT(tib,boff) \
+ ((tib)->tib_quotes[(boff) / BMSIZE] & (1 << ((boff) % BMSIZE)))
+#define SETBIT(tib,boff) \
+ ((tib)->tib_quotes[(boff) / BMSIZE] |= (1 << ((boff) % BMSIZE)))
+#define CLRBIT(tib,boff) \
+ ((tib)->tib_quotes[(boff) / BMSIZE] &= ~(1 << ((boff) % BMSIZE)))
+
+struct ttyinq_block {
+ struct ttyinq_block *tib_prev;
+ struct ttyinq_block *tib_next;
+ uint32_t tib_quotes[TTYINQ_QUOTESIZE];
+ char tib_data[TTYINQ_DATASIZE];
+};
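+
+/*
+ * Illustrative sketch (hypothetical helper, not part of the original
+ * code): the quoting bitmap stores one bit per byte of tib_data.  A
+ * byte offset maps to word "boff / BMSIZE" and bit "boff % BMSIZE",
+ * so offset 70 lives in tib_quotes[2], bit 6.
+ */
+static __inline int
+ttyinq_offset_quoted(struct ttyinq_block *tib, unsigned int boff)
+{
+
+ return (GETBIT(tib, boff) != 0);
+}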
+
+static uma_zone_t ttyinq_zone;
+
+#define TTYINQ_INSERT_TAIL(ti, tib) do { \
+ if (ti->ti_end == 0) { \
+ tib->tib_prev = NULL; \
+ tib->tib_next = ti->ti_firstblock; \
+ ti->ti_firstblock = tib; \
+ } else { \
+ tib->tib_prev = ti->ti_lastblock; \
+ tib->tib_next = ti->ti_lastblock->tib_next; \
+ ti->ti_lastblock->tib_next = tib; \
+ } \
+ if (tib->tib_next != NULL) \
+ tib->tib_next->tib_prev = tib; \
+ ti->ti_nblocks++; \
+} while (0)
+
+#define TTYINQ_REMOVE_HEAD(ti) do { \
+ ti->ti_firstblock = ti->ti_firstblock->tib_next; \
+ if (ti->ti_firstblock != NULL) \
+ ti->ti_firstblock->tib_prev = NULL; \
+ ti->ti_nblocks--; \
+} while (0)
+
+#define TTYINQ_RECYCLE(ti, tib) do { \
+ if (ti->ti_quota <= ti->ti_nblocks) \
+ uma_zfree(ttyinq_zone, tib); \
+ else \
+ TTYINQ_INSERT_TAIL(ti, tib); \
+} while (0)
+
+void
+ttyinq_setsize(struct ttyinq *ti, struct tty *tp, size_t size)
+{
+ struct ttyinq_block *tib;
+
+ ti->ti_quota = howmany(size, TTYINQ_DATASIZE);
+
+ while (ti->ti_quota > ti->ti_nblocks) {
+ /*
+ * List is getting bigger.
+ * Add new blocks to the tail of the list.
+ *
+ * We must unlock the TTY temporarily, because we need
+ * to allocate memory. This won't be a problem, because
+ * in the worst case, another thread ends up here, which
+ * may cause us to allocate too many blocks, but this
+ * will be caught by the loop below.
+ */
+ tty_unlock(tp);
+ tib = uma_zalloc(ttyinq_zone, M_WAITOK);
+ tty_lock(tp);
+
+ TTYINQ_INSERT_TAIL(ti, tib);
+ }
+}
+
+void
+ttyinq_free(struct ttyinq *ti)
+{
+ struct ttyinq_block *tib;
+
+ ttyinq_flush(ti);
+ ti->ti_quota = 0;
+
+ while ((tib = ti->ti_firstblock) != NULL) {
+ TTYINQ_REMOVE_HEAD(ti);
+ uma_zfree(ttyinq_zone, tib);
+ }
+
+ MPASS(ti->ti_nblocks == 0);
+}
+
+int
+ttyinq_read_uio(struct ttyinq *ti, struct tty *tp, struct uio *uio,
+ size_t rlen, size_t flen)
+{
+
+ MPASS(rlen <= uio->uio_resid);
+
+ while (rlen > 0) {
+ int error;
+ struct ttyinq_block *tib;
+ size_t cbegin, cend, clen;
+
+ /* See if there still is data. */
+ if (ti->ti_begin == ti->ti_linestart)
+ return (0);
+ tib = ti->ti_firstblock;
+ if (tib == NULL)
+ return (0);
+
+ /*
+ * The end address should be the lowest of these three:
+ * - The write pointer
+ * - The blocksize - we can't read beyond the block
+ * - The end address if we could perform the full read
+ */
+ cbegin = ti->ti_begin;
+ cend = MIN(MIN(ti->ti_linestart, ti->ti_begin + rlen),
+ TTYINQ_DATASIZE);
+ clen = cend - cbegin;
+ MPASS(clen >= flen);
+ rlen -= clen;
+
+ /*
+ * We can prevent buffering in some cases:
+ * - We need to read the block until the end.
+ * - We don't need to read the block until the end, but
+ * there is no data beyond it, which allows us to move
+ * the write pointer to a new block.
+ */
+ if (cend == TTYINQ_DATASIZE || cend == ti->ti_end) {
+ /*
+ * Fast path: zero copy. Remove the first block,
+ * so we can unlock the TTY temporarily.
+ */
+ TTYINQ_REMOVE_HEAD(ti);
+ ti->ti_begin = 0;
+
+ /*
+ * Because we remove the first block, we must
+ * fix up the block offsets.
+ */
+#define CORRECT_BLOCK(t) do { \
+ if (t <= TTYINQ_DATASIZE) \
+ t = 0; \
+ else \
+ t -= TTYINQ_DATASIZE; \
+} while (0)
+ CORRECT_BLOCK(ti->ti_linestart);
+ CORRECT_BLOCK(ti->ti_reprint);
+ CORRECT_BLOCK(ti->ti_end);
+#undef CORRECT_BLOCK
+
+ /*
+ * Temporarily unlock and copy the data to
+ * userspace. We may need to flush trailing
+ * bytes, like EOF characters.
+ */
+ tty_unlock(tp);
+ error = uiomove(tib->tib_data + cbegin,
+ clen - flen, uio);
+ tty_lock(tp);
+
+ /* Block can now be re-added to the list. */
+ TTYINQ_RECYCLE(ti, tib);
+ } else {
+ char ob[TTYINQ_DATASIZE - 1];
+
+ /*
+ * Slow path: store data in a temporary buffer.
+ */
+ memcpy(ob, tib->tib_data + cbegin, clen - flen);
+ ti->ti_begin += clen;
+ MPASS(ti->ti_begin < TTYINQ_DATASIZE);
+
+ /* Temporarily unlock and copy the data to userspace. */
+ tty_unlock(tp);
+ error = uiomove(ob, clen - flen, uio);
+ tty_lock(tp);
+ }
+
+ if (error != 0)
+ return (error);
+ if (tty_gone(tp))
+ return (ENXIO);
+ }
+
+ return (0);
+}
+
+static __inline void
+ttyinq_set_quotes(struct ttyinq_block *tib, size_t offset,
+ size_t length, int value)
+{
+
+ if (value) {
+ /* Set the bits. */
+ for (; length > 0; length--, offset++)
+ SETBIT(tib, offset);
+ } else {
+ /* Unset the bits. */
+ for (; length > 0; length--, offset++)
+ CLRBIT(tib, offset);
+ }
+}
+
+size_t
+ttyinq_write(struct ttyinq *ti, const void *buf, size_t nbytes, int quote)
+{
+ const char *cbuf = buf;
+ struct ttyinq_block *tib;
+ unsigned int boff;
+ size_t l;
+
+ while (nbytes > 0) {
+ boff = ti->ti_end % TTYINQ_DATASIZE;
+
+ if (ti->ti_end == 0) {
+ /* First time we're being used or drained. */
+ MPASS(ti->ti_begin == 0);
+ tib = ti->ti_firstblock;
+ if (tib == NULL) {
+ /* Queue has no blocks. */
+ break;
+ }
+ ti->ti_lastblock = tib;
+ } else if (boff == 0) {
+ /* We reached the end of this block on last write. */
+ tib = ti->ti_lastblock->tib_next;
+ if (tib == NULL) {
+ /* We've reached the watermark. */
+ break;
+ }
+ ti->ti_lastblock = tib;
+ } else {
+ tib = ti->ti_lastblock;
+ }
+
+ /* Don't copy more than was requested. */
+ l = MIN(nbytes, TTYINQ_DATASIZE - boff);
+ MPASS(l > 0);
+ memcpy(tib->tib_data + boff, cbuf, l);
+
+ /* Set the quoting bits for the proper region. */
+ ttyinq_set_quotes(tib, boff, l, quote);
+
+ cbuf += l;
+ nbytes -= l;
+ ti->ti_end += l;
+ }
+
+ return (cbuf - (const char *)buf);
+}
+
+int
+ttyinq_write_nofrag(struct ttyinq *ti, const void *buf, size_t nbytes, int quote)
+{
+ size_t ret;
+
+ if (ttyinq_bytesleft(ti) < nbytes)
+ return (-1);
+
+ /* We should always be able to write it back. */
+ ret = ttyinq_write(ti, buf, nbytes, quote);
+ MPASS(ret == nbytes);
+
+ return (0);
+}
+
+void
+ttyinq_canonicalize(struct ttyinq *ti)
+{
+
+ ti->ti_linestart = ti->ti_reprint = ti->ti_end;
+ ti->ti_startblock = ti->ti_reprintblock = ti->ti_lastblock;
+}
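+
+/*
+ * Illustrative sketch (hypothetical, not part of the original code):
+ * after canonicalization, the bytes between ti_begin and ti_linestart
+ * form completed lines that readers may consume.  The tty headers
+ * provide an accessor for this (ttyinq_bytescanonicalized(), used by
+ * the line discipline); this sketch restates the presumed arithmetic.
+ */
+static __inline size_t
+ttyinq_canonbytes(struct ttyinq *ti)
+{
+
+ return (ti->ti_linestart - ti->ti_begin);
+}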
+
+size_t
+ttyinq_findchar(struct ttyinq *ti, const char *breakc, size_t maxlen,
+ char *lastc)
+{
+ struct ttyinq_block *tib = ti->ti_firstblock;
+ unsigned int boff = ti->ti_begin;
+ unsigned int bend = MIN(MIN(TTYINQ_DATASIZE, ti->ti_linestart),
+ ti->ti_begin + maxlen);
+
+ MPASS(maxlen > 0);
+
+ if (tib == NULL)
+ return (0);
+
+ while (boff < bend) {
+ if (strchr(breakc, tib->tib_data[boff]) && !GETBIT(tib, boff)) {
+ *lastc = tib->tib_data[boff];
+ return (boff - ti->ti_begin + 1);
+ }
+ boff++;
+ }
+
+ /* Not found - just process the entire block. */
+ return (bend - ti->ti_begin);
+}
+
+void
+ttyinq_flush(struct ttyinq *ti)
+{
+ struct ttyinq_block *tib;
+
+ ti->ti_begin = 0;
+ ti->ti_linestart = 0;
+ ti->ti_reprint = 0;
+ ti->ti_end = 0;
+
+ /* Zero all data in the input queue to get rid of passwords. */
+ if (ttyinq_flush_secure) {
+ for (tib = ti->ti_firstblock; tib != NULL; tib = tib->tib_next)
+ bzero(&tib->tib_data, sizeof tib->tib_data);
+ }
+}
+
+int
+ttyinq_peekchar(struct ttyinq *ti, char *c, int *quote)
+{
+ unsigned int boff;
+ struct ttyinq_block *tib = ti->ti_lastblock;
+
+ if (ti->ti_linestart == ti->ti_end)
+ return (-1);
+
+ MPASS(ti->ti_end > 0);
+ boff = (ti->ti_end - 1) % TTYINQ_DATASIZE;
+
+ *c = tib->tib_data[boff];
+ *quote = GETBIT(tib, boff);
+
+ return (0);
+}
+
+void
+ttyinq_unputchar(struct ttyinq *ti)
+{
+
+ MPASS(ti->ti_linestart < ti->ti_end);
+
+ if (--ti->ti_end % TTYINQ_DATASIZE == 0) {
+ /* Roll back to the previous block. */
+ ti->ti_lastblock = ti->ti_lastblock->tib_prev;
+ /*
+ * This can only fail if we are unputchar()'ing the
+ * first character in the queue.
+ */
+ MPASS((ti->ti_lastblock == NULL) == (ti->ti_end == 0));
+ }
+}
+
+void
+ttyinq_reprintpos_set(struct ttyinq *ti)
+{
+
+ ti->ti_reprint = ti->ti_end;
+ ti->ti_reprintblock = ti->ti_lastblock;
+}
+
+void
+ttyinq_reprintpos_reset(struct ttyinq *ti)
+{
+
+ ti->ti_reprint = ti->ti_linestart;
+ ti->ti_reprintblock = ti->ti_startblock;
+}
+
+static void
+ttyinq_line_iterate(struct ttyinq *ti,
+ ttyinq_line_iterator_t *iterator, void *data,
+ unsigned int offset, struct ttyinq_block *tib)
+{
+ unsigned int boff;
+
+ /* Use the proper block when we're at the queue head. */
+ if (offset == 0)
+ tib = ti->ti_firstblock;
+
+ /* Iterate all characters and call the iterator function. */
+ for (; offset < ti->ti_end; offset++) {
+ boff = offset % TTYINQ_DATASIZE;
+ MPASS(tib != NULL);
+
+ /* Call back the iterator function. */
+ iterator(data, tib->tib_data[boff], GETBIT(tib, boff));
+
+ /* Last byte iterated - go to the next block. */
+ if (boff == TTYINQ_DATASIZE - 1)
+ tib = tib->tib_next;
+ MPASS(tib != NULL);
+ }
+}
+
+void
+ttyinq_line_iterate_from_linestart(struct ttyinq *ti,
+ ttyinq_line_iterator_t *iterator, void *data)
+{
+
+ ttyinq_line_iterate(ti, iterator, data,
+ ti->ti_linestart, ti->ti_startblock);
+}
+
+void
+ttyinq_line_iterate_from_reprintpos(struct ttyinq *ti,
+ ttyinq_line_iterator_t *iterator, void *data)
+{
+
+ ttyinq_line_iterate(ti, iterator, data,
+ ti->ti_reprint, ti->ti_reprintblock);
+}
+
+static void
+ttyinq_startup(void *dummy)
+{
+
+ ttyinq_zone = uma_zcreate("ttyinq", sizeof(struct ttyinq_block),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+
+SYSINIT(ttyinq, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyinq_startup, NULL);
diff --git a/sys/kern/tty_outq.c b/sys/kern/tty_outq.c
new file mode 100644
index 0000000..5d40abe
--- /dev/null
+++ b/sys/kern/tty_outq.c
@@ -0,0 +1,339 @@
+/*-
+ * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions of this software were developed under sponsorship from Snow
+ * B.V., the Netherlands.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/queue.h>
+#include <sys/systm.h>
+#include <sys/tty.h>
+#include <sys/uio.h>
+
+#include <vm/uma.h>
+
+/*
+ * TTY output queue buffering.
+ *
+ * The previous design of the TTY layer offered the so-called clists.
+ * These clists were used for both the input queues and the output
+ * queue. We don't use certain features on the output side, like quoting
+ * bits for parity marking and such. This mechanism is similar to the
+ * old clists, but only contains the features we need to buffer the
+ * output.
+ */
+
+struct ttyoutq_block {
+ struct ttyoutq_block *tob_next;
+ char tob_data[TTYOUTQ_DATASIZE];
+};
+
+static uma_zone_t ttyoutq_zone;
+
+#define TTYOUTQ_INSERT_TAIL(to, tob) do { \
+ if (to->to_end == 0) { \
+ tob->tob_next = to->to_firstblock; \
+ to->to_firstblock = tob; \
+ } else { \
+ tob->tob_next = to->to_lastblock->tob_next; \
+ to->to_lastblock->tob_next = tob; \
+ } \
+ to->to_nblocks++; \
+} while (0)
+
+#define TTYOUTQ_REMOVE_HEAD(to) do { \
+ to->to_firstblock = to->to_firstblock->tob_next; \
+ to->to_nblocks--; \
+} while (0)
+
+#define TTYOUTQ_RECYCLE(to, tob) do { \
+ if (to->to_quota <= to->to_nblocks) \
+ uma_zfree(ttyoutq_zone, tob); \
+ else \
+ TTYOUTQ_INSERT_TAIL(to, tob); \
+} while (0)
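+
+/*
+ * Illustrative sketch (hypothetical helper, not part of the original
+ * code): to_begin and to_end are byte offsets measured from the start
+ * of the first block, so the number of bytes currently queued is
+ * simply their difference; the tty headers expose an equivalent
+ * accessor for this.
+ */
+static __inline size_t
+ttyoutq_bytesqueued(struct ttyoutq *to)
+{
+
+ return (to->to_end - to->to_begin);
+}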
+
+void
+ttyoutq_flush(struct ttyoutq *to)
+{
+
+ to->to_begin = 0;
+ to->to_end = 0;
+}
+
+void
+ttyoutq_setsize(struct ttyoutq *to, struct tty *tp, size_t size)
+{
+ struct ttyoutq_block *tob;
+
+ to->to_quota = howmany(size, TTYOUTQ_DATASIZE);
+
+ while (to->to_quota > to->to_nblocks) {
+ /*
+ * List is getting bigger.
+ * Add new blocks to the tail of the list.
+ *
+ * We must unlock the TTY temporarily, because we need
+ * to allocate memory. This won't be a problem, because
+ * in the worst case, another thread ends up here, which
+ * may cause us to allocate too many blocks, but this
+ * will be caught by the loop below.
+ */
+ tty_unlock(tp);
+ tob = uma_zalloc(ttyoutq_zone, M_WAITOK);
+ tty_lock(tp);
+
+ TTYOUTQ_INSERT_TAIL(to, tob);
+ }
+}
+
+void
+ttyoutq_free(struct ttyoutq *to)
+{
+ struct ttyoutq_block *tob;
+
+ ttyoutq_flush(to);
+ to->to_quota = 0;
+
+ while ((tob = to->to_firstblock) != NULL) {
+ TTYOUTQ_REMOVE_HEAD(to);
+ uma_zfree(ttyoutq_zone, tob);
+ }
+
+ MPASS(to->to_nblocks == 0);
+}
+
+size_t
+ttyoutq_read(struct ttyoutq *to, void *buf, size_t len)
+{
+ char *cbuf = buf;
+
+ while (len > 0) {
+ struct ttyoutq_block *tob;
+ size_t cbegin, cend, clen;
+
+ /* See if there still is data. */
+ if (to->to_begin == to->to_end)
+ break;
+ tob = to->to_firstblock;
+ if (tob == NULL)
+ break;
+
+ /*
+ * The end address should be the lowest of these three:
+ * - The write pointer
+ * - The blocksize - we can't read beyond the block
+ * - The end address if we could perform the full read
+ */
+ cbegin = to->to_begin;
+ cend = MIN(MIN(to->to_end, to->to_begin + len),
+ TTYOUTQ_DATASIZE);
+ clen = cend - cbegin;
+
+ /* Copy the data out of the buffers. */
+ memcpy(cbuf, tob->tob_data + cbegin, clen);
+ cbuf += clen;
+ len -= clen;
+
+ if (cend == to->to_end) {
+ /* Read the complete queue. */
+ to->to_begin = 0;
+ to->to_end = 0;
+ } else if (cend == TTYOUTQ_DATASIZE) {
+ /* Read the block until the end. */
+ TTYOUTQ_REMOVE_HEAD(to);
+ to->to_begin = 0;
+ to->to_end -= TTYOUTQ_DATASIZE;
+ TTYOUTQ_RECYCLE(to, tob);
+ } else {
+ /* Read the block partially. */
+ to->to_begin += clen;
+ }
+ }
+
+ return (cbuf - (char *)buf);
+}
+
+/*
+ * An optimized version of ttyoutq_read() which can be used in pseudo
+ * TTY drivers to directly copy data from the outq to userspace, instead
+ * of buffering it.
+ *
+ * We can only copy data directly if we need to read the entire block
+ * back to the user, because we temporarily remove the block from the
+ * queue. Otherwise we need to copy it to a temporary buffer first, to
+ * make sure data remains in the correct order.
+ */
+int
+ttyoutq_read_uio(struct ttyoutq *to, struct tty *tp, struct uio *uio)
+{
+
+ while (uio->uio_resid > 0) {
+ int error;
+ struct ttyoutq_block *tob;
+ size_t cbegin, cend, clen;
+
+ /* See if there still is data. */
+ if (to->to_begin == to->to_end)
+ return (0);
+ tob = to->to_firstblock;
+ if (tob == NULL)
+ return (0);
+
+ /*
+ * The end address should be the lowest of these three:
+ * - The write pointer
+ * - The blocksize - we can't read beyond the block
+ * - The end address if we could perform the full read
+ */
+ cbegin = to->to_begin;
+ cend = MIN(MIN(to->to_end, to->to_begin + uio->uio_resid),
+ TTYOUTQ_DATASIZE);
+ clen = cend - cbegin;
+
+ /*
+ * We can prevent buffering in some cases:
+ * - We need to read the block until the end.
+ * - We don't need to read the block until the end, but
+ * there is no data beyond it, which allows us to move
+ * the write pointer to a new block.
+ */
+ if (cend == TTYOUTQ_DATASIZE || cend == to->to_end) {
+ /*
+ * Fast path: zero copy. Remove the first block,
+ * so we can unlock the TTY temporarily.
+ */
+ TTYOUTQ_REMOVE_HEAD(to);
+ to->to_begin = 0;
+ if (to->to_end <= TTYOUTQ_DATASIZE)
+ to->to_end = 0;
+ else
+ to->to_end -= TTYOUTQ_DATASIZE;
+
+ /* Temporarily unlock and copy the data to userspace. */
+ tty_unlock(tp);
+ error = uiomove(tob->tob_data + cbegin, clen, uio);
+ tty_lock(tp);
+
+ /* Block can now be re-added to the list. */
+ TTYOUTQ_RECYCLE(to, tob);
+ } else {
+ char ob[TTYOUTQ_DATASIZE - 1];
+
+ /*
+ * Slow path: store data in a temporary buffer.
+ */
+ memcpy(ob, tob->tob_data + cbegin, clen);
+ to->to_begin += clen;
+ MPASS(to->to_begin < TTYOUTQ_DATASIZE);
+
+ /* Temporarily unlock and copy the data to userspace. */
+ tty_unlock(tp);
+ error = uiomove(ob, clen, uio);
+ tty_lock(tp);
+ }
+
+ if (error != 0)
+ return (error);
+ }
+
+ return (0);
+}
+
+size_t
+ttyoutq_write(struct ttyoutq *to, const void *buf, size_t nbytes)
+{
+ const char *cbuf = buf;
+ struct ttyoutq_block *tob;
+ unsigned int boff;
+ size_t l;
+
+ while (nbytes > 0) {
+ boff = to->to_end % TTYOUTQ_DATASIZE;
+
+ if (to->to_end == 0) {
+ /* First time we're being used or drained. */
+ MPASS(to->to_begin == 0);
+ tob = to->to_firstblock;
+ if (tob == NULL) {
+ /* Queue has no blocks. */
+ break;
+ }
+ to->to_lastblock = tob;
+ } else if (boff == 0) {
+ /* We reached the end of this block on last write. */
+ tob = to->to_lastblock->tob_next;
+ if (tob == NULL) {
+ /* We've reached the watermark. */
+ break;
+ }
+ to->to_lastblock = tob;
+ } else {
+ tob = to->to_lastblock;
+ }
+
+ /* Don't copy more than was requested. */
+ l = MIN(nbytes, TTYOUTQ_DATASIZE - boff);
+ MPASS(l > 0);
+ memcpy(tob->tob_data + boff, cbuf, l);
+
+ cbuf += l;
+ nbytes -= l;
+ to->to_end += l;
+ }
+
+ return (cbuf - (const char *)buf);
+}
+
+int
+ttyoutq_write_nofrag(struct ttyoutq *to, const void *buf, size_t nbytes)
+{
+ size_t ret;
+
+ if (ttyoutq_bytesleft(to) < nbytes)
+ return (-1);
+
+ /* We should always be able to write it back. */
+ ret = ttyoutq_write(to, buf, nbytes);
+ MPASS(ret == nbytes);
+
+ return (0);
+}
+
+static void
+ttyoutq_startup(void *dummy)
+{
+
+ ttyoutq_zone = uma_zcreate("ttyoutq", sizeof(struct ttyoutq_block),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+
+SYSINIT(ttyoutq, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyoutq_startup, NULL);
diff --git a/sys/kern/tty_pts.c b/sys/kern/tty_pts.c
new file mode 100644
index 0000000..8d2ac03
--- /dev/null
+++ b/sys/kern/tty_pts.c
@@ -0,0 +1,858 @@
+/*-
+ * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions of this software were developed under sponsorship from Snow
+ * B.V., the Netherlands.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/* Add compatibility bits for FreeBSD. */
+#define PTS_COMPAT
+/* Add pty(4) compat bits. */
+#define PTS_EXTERNAL
+/* Add bits to make Linux binaries work. */
+#define PTS_LINUX
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/condvar.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/malloc.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/serial.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/tty.h>
+#include <sys/ttycom.h>
+
+#include <machine/stdarg.h>
+
+/*
+ * Our utmp(5) format is limited to 8-byte TTY line names. This means
+ * we can at most allocate 1000 pseudo-terminals ("pts/999"). Allow
+ * users to increase this number, assuming they have manually increased
+ * UT_LINESIZE.
+ */
+static struct unrhdr *pts_pool;
+
+static MALLOC_DEFINE(M_PTS, "pts", "pseudo tty device");
+
+/*
+ * Per-PTS structure.
+ *
+ * List of locks
+ * (t) locked by tty_lock()
+ * (c) const until freeing
+ */
+struct pts_softc {
+ int pts_unit; /* (c) Device unit number. */
+ unsigned int pts_flags; /* (t) Device flags. */
+#define PTS_PKT 0x1 /* Packet mode. */
+#define PTS_FINISHED 0x2 /* Return errors on read()/write(). */
+ char pts_pkt; /* (t) Unread packet mode data. */
+
+ struct cv pts_inwait; /* (t) Blocking write() on master. */
+ struct selinfo pts_inpoll; /* (t) Select queue for write(). */
+ struct cv pts_outwait; /* (t) Blocking read() on master. */
+ struct selinfo pts_outpoll; /* (t) Select queue for read(). */
+
+#ifdef PTS_EXTERNAL
+ struct cdev *pts_cdev; /* (c) Master device node. */
+#endif /* PTS_EXTERNAL */
+
+ struct ucred *pts_cred; /* (c) Resource limit. */
+};
+
+/*
+ * Controller-side file operations.
+ */
+
+static int
+ptsdev_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+ int error = 0;
+ char pkt;
+
+ if (uio->uio_resid == 0)
+ return (0);
+
+ tty_lock(tp);
+
+ for (;;) {
+ /*
+ * Implement packet mode. When packet mode is turned on,
+ * the first byte contains a bitmask of events that
+ * occurred (start, stop, flush, window size, etc.).
+ */
+ if (psc->pts_flags & PTS_PKT && psc->pts_pkt) {
+ pkt = psc->pts_pkt;
+ psc->pts_pkt = 0;
+ tty_unlock(tp);
+
+ error = ureadc(pkt, uio);
+ return (error);
+ }
+
+ /*
+ * Transmit regular data.
+ *
+ * XXX: We shouldn't use ttydisc_getc_poll()! Even
+ * though in this implementation, there is likely going
+ * to be data, we should just call ttydisc_getc_uio()
+ * and use its return value to sleep.
+ */
+ if (ttydisc_getc_poll(tp)) {
+ if (psc->pts_flags & PTS_PKT) {
+ /*
+ * XXX: Small race. Fortunately PTY
+ * consumers aren't multithreaded.
+ */
+
+ tty_unlock(tp);
+ error = ureadc(TIOCPKT_DATA, uio);
+ if (error)
+ return (error);
+ tty_lock(tp);
+ }
+
+ error = ttydisc_getc_uio(tp, uio);
+ break;
+ }
+
+ /* Maybe the device isn't used anyway. */
+ if (psc->pts_flags & PTS_FINISHED)
+ break;
+
+ /* Wait for more data. */
+ if (fp->f_flag & O_NONBLOCK) {
+ error = EWOULDBLOCK;
+ break;
+ }
+ error = cv_wait_sig(&psc->pts_outwait, tp->t_mtx);
+ if (error != 0)
+ break;
+ }
+
+ tty_unlock(tp);
+
+ return (error);
+}
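+
+/*
+ * Illustrative sketch (hypothetical helper, not part of the original
+ * code): in packet mode the master must drain a pending status byte
+ * before any terminal data.  This restates the readiness test used by
+ * ptsdev_read() and ptsdev_poll().
+ */
+static __inline int
+pts_pkt_pending(struct pts_softc *psc)
+{
+
+ return ((psc->pts_flags & PTS_PKT) != 0 && psc->pts_pkt != 0);
+}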
+
+static int
+ptsdev_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+ char ib[256], *ibstart;
+ size_t iblen, rintlen;
+ int error = 0;
+
+ if (uio->uio_resid == 0)
+ return (0);
+
+ for (;;) {
+ ibstart = ib;
+ iblen = MIN(uio->uio_resid, sizeof ib);
+ error = uiomove(ib, iblen, uio);
+
+ tty_lock(tp);
+ if (error != 0) {
+ iblen = 0;
+ goto done;
+ }
+
+ /*
+ * When possible, avoid the slow path. rint_bypass()
+ * copies all input to the input queue at once.
+ */
+ MPASS(iblen > 0);
+ do {
+ rintlen = ttydisc_rint_simple(tp, ibstart, iblen);
+ ibstart += rintlen;
+ iblen -= rintlen;
+ if (iblen == 0) {
+ /* All data written. */
+ break;
+ }
+
+ /* Maybe the device isn't used anyway. */
+ if (psc->pts_flags & PTS_FINISHED) {
+ error = EIO;
+ goto done;
+ }
+
+ /* Wait for more data. */
+ if (fp->f_flag & O_NONBLOCK) {
+ error = EWOULDBLOCK;
+ goto done;
+ }
+
+ /* Wake up users on the slave side. */
+ ttydisc_rint_done(tp);
+ error = cv_wait_sig(&psc->pts_inwait, tp->t_mtx);
+ if (error != 0)
+ goto done;
+ } while (iblen > 0);
+
+ if (uio->uio_resid == 0)
+ break;
+ tty_unlock(tp);
+ }
+
+done: ttydisc_rint_done(tp);
+ tty_unlock(tp);
+
+ /*
+ * Don't account for the part of the buffer that we couldn't
+ * pass to the TTY.
+ */
+ uio->uio_resid += iblen;
+ return (error);
+}
+
+static int
+ptsdev_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+static int
+ptsdev_ioctl(struct file *fp, u_long cmd, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+ int error = 0, sig;
+
+ switch (cmd) {
+ case FIONBIO:
+ /* This device supports non-blocking operation. */
+ return (0);
+ case FIONREAD:
+ tty_lock(tp);
+ if (psc->pts_flags & PTS_FINISHED) {
+ /* Force read() to be called. */
+ *(int *)data = 1;
+ } else {
+ *(int *)data = ttydisc_getc_poll(tp);
+ }
+ tty_unlock(tp);
+ return (0);
+ case FIODGNAME: {
+ struct fiodgname_arg *fgn;
+ const char *p;
+ int i;
+
+ /* Reverse device name lookups, for ptsname() and ttyname(). */
+ fgn = data;
+ p = tty_devname(tp);
+ i = strlen(p) + 1;
+ if (i > fgn->len)
+ return (EINVAL);
+ return copyout(p, fgn->buf, i);
+ }
+
+ /*
+ * We need to implement TIOCGPGRP and TIOCGSID here again. When
+ * called on the pseudo-terminal master, it should not check if
+ * the terminal is the foreground terminal of the calling
+ * process.
+ *
+ * TIOCGETA is also implemented here. Various Linux PTY routines
+ * often call isatty(), which is implemented by tcgetattr().
+ */
+#ifdef PTS_LINUX
+ case TIOCGETA:
+ /* Obtain terminal flags through tcgetattr(). */
+ tty_lock(tp);
+ *(struct termios*)data = tp->t_termios;
+ tty_unlock(tp);
+ return (0);
+#endif /* PTS_LINUX */
+ case TIOCSETAF:
+ case TIOCSETAW:
+ /*
+ * We must make sure we turn tcsetattr() calls of TCSAFLUSH and
+ * TCSADRAIN into something different. If an application were to
+ * call TCSAFLUSH or TCSADRAIN on the master descriptor, it could
+ * deadlock waiting for all data to be read.
+ */
+ cmd = TIOCSETA;
+ break;
+#if defined(PTS_COMPAT) || defined(PTS_LINUX)
+ case TIOCGPTN:
+ /*
+ * Get the device unit number.
+ */
+ if (psc->pts_unit < 0)
+ return (ENOTTY);
+ *(unsigned int *)data = psc->pts_unit;
+ return (0);
+#endif /* PTS_COMPAT || PTS_LINUX */
+ case TIOCGPGRP:
+ /* Get the foreground process group ID. */
+ tty_lock(tp);
+ if (tp->t_pgrp != NULL)
+ *(int *)data = tp->t_pgrp->pg_id;
+ else
+ *(int *)data = NO_PID;
+ tty_unlock(tp);
+ return (0);
+ case TIOCGSID:
+ /* Get the session leader process ID. */
+ tty_lock(tp);
+ if (tp->t_session == NULL)
+ error = ENOTTY;
+ else
+ *(int *)data = tp->t_session->s_sid;
+ tty_unlock(tp);
+ return (error);
+ case TIOCPTMASTER:
+ /* Yes, we are a pseudo-terminal master. */
+ return (0);
+ case TIOCSIG:
+ /* Signal the foreground process group. */
+ sig = *(int *)data;
+ if (sig < 1 || sig >= NSIG)
+ return (EINVAL);
+
+ tty_lock(tp);
+ tty_signal_pgrp(tp, sig);
+ tty_unlock(tp);
+ return (0);
+ case TIOCPKT:
+ /* Enable/disable packet mode. */
+ tty_lock(tp);
+ if (*(int *)data)
+ psc->pts_flags |= PTS_PKT;
+ else
+ psc->pts_flags &= ~PTS_PKT;
+ tty_unlock(tp);
+ return (0);
+ }
+
+ /* Just redirect this ioctl to the slave device. */
+ tty_lock(tp);
+ error = tty_ioctl(tp, cmd, data, fp->f_flag, td);
+ tty_unlock(tp);
+ if (error == ENOIOCTL)
+ error = ENOTTY;
+
+ return (error);
+}
+
+static int
+ptsdev_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+ int revents = 0;
+
+ tty_lock(tp);
+
+ if (psc->pts_flags & PTS_FINISHED) {
+ /* Slave device is not opened. */
+ tty_unlock(tp);
+ return ((events & (POLLIN|POLLRDNORM)) | POLLHUP);
+ }
+
+ if (events & (POLLIN|POLLRDNORM)) {
+ /* See if we can getc something. */
+ if (ttydisc_getc_poll(tp) ||
+ (psc->pts_flags & PTS_PKT && psc->pts_pkt))
+ revents |= events & (POLLIN|POLLRDNORM);
+ }
+ if (events & (POLLOUT|POLLWRNORM)) {
+ /* See if we can rint something. */
+ if (ttydisc_rint_poll(tp))
+ revents |= events & (POLLOUT|POLLWRNORM);
+ }
+
+ /*
+ * No need to check for POLLHUP here. This device cannot be used
+ * as a callout device, which means we always have a carrier,
+ * because the master is.
+ */
+
+ if (revents == 0) {
+ /*
+ * This code might look misleading, but the naming of
+ * poll events on this side is the opposite of the slave
+ * device.
+ */
+ if (events & (POLLIN|POLLRDNORM))
+ selrecord(td, &psc->pts_outpoll);
+ if (events & (POLLOUT|POLLWRNORM))
+ selrecord(td, &psc->pts_inpoll);
+ }
+
+ tty_unlock(tp);
+
+ return (revents);
+}
+
+/*
+ * kqueue support.
+ */
+
+static void
+pts_kqops_read_detach(struct knote *kn)
+{
+ struct file *fp = kn->kn_fp;
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+
+ knlist_remove(&psc->pts_outpoll.si_note, kn, 0);
+}
+
+static int
+pts_kqops_read_event(struct knote *kn, long hint)
+{
+ struct file *fp = kn->kn_fp;
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+
+ if (psc->pts_flags & PTS_FINISHED) {
+ kn->kn_flags |= EV_EOF;
+ return (1);
+ } else {
+ kn->kn_data = ttydisc_getc_poll(tp);
+ return (kn->kn_data > 0);
+ }
+}
+
+static void
+pts_kqops_write_detach(struct knote *kn)
+{
+ struct file *fp = kn->kn_fp;
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+
+ knlist_remove(&psc->pts_inpoll.si_note, kn, 0);
+}
+
+static int
+pts_kqops_write_event(struct knote *kn, long hint)
+{
+ struct file *fp = kn->kn_fp;
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+
+ if (psc->pts_flags & PTS_FINISHED) {
+ kn->kn_flags |= EV_EOF;
+ return (1);
+ } else {
+ kn->kn_data = ttydisc_rint_poll(tp);
+ return (kn->kn_data > 0);
+ }
+}
+
+static struct filterops pts_kqops_read = {
+ .f_isfd = 1,
+ .f_detach = pts_kqops_read_detach,
+ .f_event = pts_kqops_read_event,
+};
+static struct filterops pts_kqops_write = {
+ .f_isfd = 1,
+ .f_detach = pts_kqops_write_detach,
+ .f_event = pts_kqops_write_event,
+};
+
+static int
+ptsdev_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct tty *tp = fp->f_data;
+ struct pts_softc *psc = tty_softc(tp);
+ int error = 0;
+
+ tty_lock(tp);
+
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ kn->kn_fop = &pts_kqops_read;
+ knlist_add(&psc->pts_outpoll.si_note, kn, 1);
+ break;
+ case EVFILT_WRITE:
+ kn->kn_fop = &pts_kqops_write;
+ knlist_add(&psc->pts_inpoll.si_note, kn, 1);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ tty_unlock(tp);
+ return (error);
+}
+
+static int
+ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct tty *tp = fp->f_data;
+#ifdef PTS_EXTERNAL
+ struct pts_softc *psc = tty_softc(tp);
+#endif /* PTS_EXTERNAL */
+ struct cdev *dev = tp->t_dev;
+
+ /*
+ * According to POSIX, we must implement an fstat(). This also
+ * makes this implementation compatible with Linux binaries,
+ * because Linux calls fstat() on the pseudo-terminal master to
+ * obtain st_rdev.
+ *
+ * XXX: POSIX also mentions we must fill in st_dev, but how?
+ */
+
+ bzero(sb, sizeof *sb);
+#ifdef PTS_EXTERNAL
+ if (psc->pts_cdev != NULL)
+ sb->st_ino = sb->st_rdev = dev2udev(psc->pts_cdev);
+ else
+#endif /* PTS_EXTERNAL */
+ sb->st_ino = sb->st_rdev = tty_udev(tp);
+
+ sb->st_atim = dev->si_atime;
+ sb->st_ctim = dev->si_ctime;
+ sb->st_mtim = dev->si_mtime;
+ sb->st_uid = dev->si_uid;
+ sb->st_gid = dev->si_gid;
+ sb->st_mode = dev->si_mode | S_IFCHR;
+
+ return (0);
+}
+
+static int
+ptsdev_close(struct file *fp, struct thread *td)
+{
+ struct tty *tp = fp->f_data;
+
+ /* Deallocate TTY device. */
+ tty_lock(tp);
+ tty_rel_gone(tp);
+
+ /*
+ * Open of /dev/ptmx or /dev/ptyXX changes the type of file
+ * from DTYPE_VNODE to DTYPE_PTS. vn_open() increases vnode
+ * use count, we need to decrement it, and possibly do other
+ * required cleanup.
+ */
+ if (fp->f_vnode != NULL)
+ return (vnops.fo_close(fp, td));
+
+ return (0);
+}
+
+static struct fileops ptsdev_ops = {
+ .fo_read = ptsdev_read,
+ .fo_write = ptsdev_write,
+ .fo_truncate = ptsdev_truncate,
+ .fo_ioctl = ptsdev_ioctl,
+ .fo_poll = ptsdev_poll,
+ .fo_kqfilter = ptsdev_kqfilter,
+ .fo_stat = ptsdev_stat,
+ .fo_close = ptsdev_close,
+ .fo_chmod = invfo_chmod,
+ .fo_chown = invfo_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_flags = DFLAG_PASSABLE,
+};
+
+/*
+ * Driver-side hooks.
+ */
+
+static void
+ptsdrv_outwakeup(struct tty *tp)
+{
+ struct pts_softc *psc = tty_softc(tp);
+
+ cv_broadcast(&psc->pts_outwait);
+ selwakeup(&psc->pts_outpoll);
+ KNOTE_LOCKED(&psc->pts_outpoll.si_note, 0);
+}
+
+static void
+ptsdrv_inwakeup(struct tty *tp)
+{
+ struct pts_softc *psc = tty_softc(tp);
+
+ cv_broadcast(&psc->pts_inwait);
+ selwakeup(&psc->pts_inpoll);
+ KNOTE_LOCKED(&psc->pts_inpoll.si_note, 0);
+}
+
+static int
+ptsdrv_open(struct tty *tp)
+{
+ struct pts_softc *psc = tty_softc(tp);
+
+ psc->pts_flags &= ~PTS_FINISHED;
+
+ return (0);
+}
+
+static void
+ptsdrv_close(struct tty *tp)
+{
+ struct pts_softc *psc = tty_softc(tp);
+
+ /* Wake up any blocked readers/writers. */
+ psc->pts_flags |= PTS_FINISHED;
+ ptsdrv_outwakeup(tp);
+ ptsdrv_inwakeup(tp);
+}
+
+static void
+ptsdrv_pktnotify(struct tty *tp, char event)
+{
+ struct pts_softc *psc = tty_softc(tp);
+
+ /*
+ * Clear conflicting flags.
+ */
+
+ switch (event) {
+ case TIOCPKT_STOP:
+ psc->pts_pkt &= ~TIOCPKT_START;
+ break;
+ case TIOCPKT_START:
+ psc->pts_pkt &= ~TIOCPKT_STOP;
+ break;
+ case TIOCPKT_NOSTOP:
+ psc->pts_pkt &= ~TIOCPKT_DOSTOP;
+ break;
+ case TIOCPKT_DOSTOP:
+ psc->pts_pkt &= ~TIOCPKT_NOSTOP;
+ break;
+ }
+
+ psc->pts_pkt |= event;
+ ptsdrv_outwakeup(tp);
+}
+
+static void
+ptsdrv_free(void *softc)
+{
+ struct pts_softc *psc = softc;
+
+ /* Make device number available again. */
+ if (psc->pts_unit >= 0)
+ free_unr(pts_pool, psc->pts_unit);
+
+ chgptscnt(psc->pts_cred->cr_ruidinfo, -1, 0);
+ racct_sub_cred(psc->pts_cred, RACCT_NPTS, 1);
+ crfree(psc->pts_cred);
+
+ seldrain(&psc->pts_inpoll);
+ seldrain(&psc->pts_outpoll);
+ knlist_destroy(&psc->pts_inpoll.si_note);
+ knlist_destroy(&psc->pts_outpoll.si_note);
+
+#ifdef PTS_EXTERNAL
+ /* Destroy master device as well. */
+ if (psc->pts_cdev != NULL)
+ destroy_dev_sched(psc->pts_cdev);
+#endif /* PTS_EXTERNAL */
+
+ free(psc, M_PTS);
+}
+
+static struct ttydevsw pts_class = {
+ .tsw_flags = TF_NOPREFIX,
+ .tsw_outwakeup = ptsdrv_outwakeup,
+ .tsw_inwakeup = ptsdrv_inwakeup,
+ .tsw_open = ptsdrv_open,
+ .tsw_close = ptsdrv_close,
+ .tsw_pktnotify = ptsdrv_pktnotify,
+ .tsw_free = ptsdrv_free,
+};
+
+#ifndef PTS_EXTERNAL
+static
+#endif /* !PTS_EXTERNAL */
+int
+pts_alloc(int fflags, struct thread *td, struct file *fp)
+{
+ int unit, ok, error;
+ struct tty *tp;
+ struct pts_softc *psc;
+ struct proc *p = td->td_proc;
+ struct ucred *cred = td->td_ucred;
+
+ /* Resource limiting. */
+ PROC_LOCK(p);
+ error = racct_add(p, RACCT_NPTS, 1);
+ if (error != 0) {
+ PROC_UNLOCK(p);
+ return (EAGAIN);
+ }
+ ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(p, RLIMIT_NPTS));
+ if (!ok) {
+ racct_sub(p, RACCT_NPTS, 1);
+ PROC_UNLOCK(p);
+ return (EAGAIN);
+ }
+ PROC_UNLOCK(p);
+
+ /* Try to allocate a new pts unit number. */
+ unit = alloc_unr(pts_pool);
+ if (unit < 0) {
+ racct_sub(p, RACCT_NPTS, 1);
+ chgptscnt(cred->cr_ruidinfo, -1, 0);
+ return (EAGAIN);
+ }
+
+ /* Allocate TTY and softc. */
+ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO);
+ cv_init(&psc->pts_inwait, "ptsin");
+ cv_init(&psc->pts_outwait, "ptsout");
+
+ psc->pts_unit = unit;
+ psc->pts_cred = crhold(cred);
+
+ tp = tty_alloc(&pts_class, psc);
+ knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx);
+ knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx);
+
+ /* Expose the slave device as well. */
+ tty_makedev(tp, td->td_ucred, "pts/%u", psc->pts_unit);
+
+ finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops);
+
+ return (0);
+}
+
+#ifdef PTS_EXTERNAL
+int
+pts_alloc_external(int fflags, struct thread *td, struct file *fp,
+ struct cdev *dev, const char *name)
+{
+ int ok, error;
+ struct tty *tp;
+ struct pts_softc *psc;
+ struct proc *p = td->td_proc;
+ struct ucred *cred = td->td_ucred;
+
+ /* Resource limiting. */
+ PROC_LOCK(p);
+ error = racct_add(p, RACCT_NPTS, 1);
+ if (error != 0) {
+ PROC_UNLOCK(p);
+ return (EAGAIN);
+ }
+ ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(p, RLIMIT_NPTS));
+ if (!ok) {
+ racct_sub(p, RACCT_NPTS, 1);
+ PROC_UNLOCK(p);
+ return (EAGAIN);
+ }
+ PROC_UNLOCK(p);
+
+ /* Allocate TTY and softc. */
+ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO);
+ cv_init(&psc->pts_inwait, "ptsin");
+ cv_init(&psc->pts_outwait, "ptsout");
+
+ psc->pts_unit = -1;
+ psc->pts_cdev = dev;
+ psc->pts_cred = crhold(cred);
+
+ tp = tty_alloc(&pts_class, psc);
+ knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx);
+ knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx);
+
+ /* Expose the slave device as well. */
+ tty_makedev(tp, td->td_ucred, "%s", name);
+
+ finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops);
+
+ return (0);
+}
+#endif /* PTS_EXTERNAL */
+
+int
+sys_posix_openpt(struct thread *td, struct posix_openpt_args *uap)
+{
+ int error, fd;
+ struct file *fp;
+
+ /*
+ * POSIX states it's unspecified when other flags are passed. We
+ * don't allow this.
+ */
+ if (uap->flags & ~(O_RDWR|O_NOCTTY|O_CLOEXEC))
+ return (EINVAL);
+
+ error = falloc(td, &fp, &fd, uap->flags);
+ if (error)
+ return (error);
+
+ /* Allocate the actual pseudo-TTY. */
+ error = pts_alloc(FFLAGS(uap->flags & O_ACCMODE), td, fp);
+ if (error != 0) {
+ fdclose(td->td_proc->p_fd, fp, fd, td);
+ fdrop(fp, td);
+ return (error);
+ }
+
+ /* Pass it back to userspace. */
+ td->td_retval[0] = fd;
+ fdrop(fp, td);
+
+ return (0);
+}
+
+static void
+pts_init(void *unused)
+{
+
+ pts_pool = new_unrhdr(0, INT_MAX, NULL);
+}
+
+SYSINIT(pts, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, pts_init, NULL);
diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c
new file mode 100644
index 0000000..07d8358
--- /dev/null
+++ b/sys/kern/tty_tty.c
@@ -0,0 +1,94 @@
+/*-
+ * Copyright (c) 2003 Poul-Henning Kamp. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/sx.h>
+#include <sys/vnode.h>
+
+#include <fs/devfs/devfs.h>
+#include <fs/devfs/devfs_int.h>
+
+static d_open_t cttyopen;
+
+static struct cdevsw ctty_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = cttyopen,
+ .d_name = "ctty",
+};
+
+static struct cdev *ctty;
+
+static int
+cttyopen(struct cdev *dev, int flag, int mode, struct thread *td)
+{
+
+ return (ENXIO);
+}
+
+static void
+ctty_clone(void *arg, struct ucred *cred, char *name, int namelen,
+ struct cdev **dev)
+{
+
+ if (*dev != NULL)
+ return;
+ if (strcmp(name, "tty"))
+ return;
+ sx_sunlock(&clone_drain_lock);
+ sx_slock(&proctree_lock);
+ sx_slock(&clone_drain_lock);
+ dev_lock();
+ if (!(curthread->td_proc->p_flag & P_CONTROLT))
+ *dev = ctty;
+ else if (curthread->td_proc->p_session->s_ttyvp == NULL)
+ *dev = ctty;
+ else if (curthread->td_proc->p_session->s_ttyvp->v_type == VBAD ||
+ curthread->td_proc->p_session->s_ttyvp->v_rdev == NULL) {
+ /* e.g. s_ttyvp was revoked */
+ *dev = ctty;
+ } else
+ *dev = curthread->td_proc->p_session->s_ttyvp->v_rdev;
+ dev_refl(*dev);
+ dev_unlock();
+ sx_sunlock(&proctree_lock);
+}
+
+static void
+ctty_drvinit(void *unused)
+{
+
+ EVENTHANDLER_REGISTER(dev_clone, ctty_clone, 0, 1000);
+ ctty = make_dev_credf(MAKEDEV_ETERNAL, &ctty_cdevsw, 0, NULL, UID_ROOT,
+ GID_WHEEL, 0666, "ctty");
+}
+
+SYSINIT(cttydev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,ctty_drvinit,NULL);
diff --git a/sys/kern/tty_ttydisc.c b/sys/kern/tty_ttydisc.c
new file mode 100644
index 0000000..63b144a
--- /dev/null
+++ b/sys/kern/tty_ttydisc.c
@@ -0,0 +1,1268 @@
+/*-
+ * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions of this software were developed under sponsorship from Snow
+ * B.V., the Netherlands.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/fcntl.h>
+#include <sys/filio.h>
+#include <sys/kernel.h>
+#include <sys/signal.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/tty.h>
+#include <sys/ttycom.h>
+#include <sys/ttydefaults.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+
+/*
+ * Standard TTYDISC `termios' line discipline.
+ */
+
+/* Statistics. */
+static unsigned long tty_nin = 0;
+SYSCTL_ULONG(_kern, OID_AUTO, tty_nin, CTLFLAG_RD,
+ &tty_nin, 0, "Total amount of bytes received");
+static unsigned long tty_nout = 0;
+SYSCTL_ULONG(_kern, OID_AUTO, tty_nout, CTLFLAG_RD,
+ &tty_nout, 0, "Total amount of bytes transmitted");
+
+/* termios comparison macros. */
+#define CMP_CC(v,c) (tp->t_termios.c_cc[v] != _POSIX_VDISABLE && \
+ tp->t_termios.c_cc[v] == (c))
+#define CMP_FLAG(field,opt) (tp->t_termios.c_ ## field ## flag & (opt))
+
+/* Characters that cannot be modified through c_cc. */
+#define CTAB '\t'
+#define CNL '\n'
+#define CCR '\r'
+
+/* Character is a control character. */
+#define CTL_VALID(c) ((c) == 0x7f || (unsigned char)(c) < 0x20)
+/* Control character should be processed on echo. */
+#define CTL_ECHO(c,q) (!(q) && ((c) == CERASE2 || (c) == CTAB || \
+ (c) == CNL || (c) == CCR))
+/* Control character should be printed using ^X notation. */
+#define CTL_PRINT(c,q) ((c) == 0x7f || ((unsigned char)(c) < 0x20 && \
+ ((q) || ((c) != CTAB && (c) != CNL))))
+/* Character is whitespace. */
+#define CTL_WHITE(c) ((c) == ' ' || (c) == CTAB)
+/* Character is alphanumeric. */
+#define CTL_ALNUM(c) (((c) >= '0' && (c) <= '9') || \
+ ((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
+
+#define TTY_STACKBUF 256
+
+void
+ttydisc_open(struct tty *tp)
+{
+ ttydisc_optimize(tp);
+}
+
+void
+ttydisc_close(struct tty *tp)
+{
+
+ /* Clean up our flags when leaving the discipline. */
+ tp->t_flags &= ~(TF_STOPPED|TF_HIWAT|TF_ZOMBIE);
+
+ /* POSIX states we should flush when close() is called. */
+ ttyinq_flush(&tp->t_inq);
+ ttyoutq_flush(&tp->t_outq);
+
+ if (!tty_gone(tp)) {
+ ttydevsw_inwakeup(tp);
+ ttydevsw_outwakeup(tp);
+ }
+
+ if (ttyhook_hashook(tp, close))
+ ttyhook_close(tp);
+}
+
+static int
+ttydisc_read_canonical(struct tty *tp, struct uio *uio, int ioflag)
+{
+ char breakc[4] = { CNL }; /* enough to hold \n, VEOF and VEOL. */
+ int error;
+ size_t clen, flen = 0, n = 1;
+ unsigned char lastc = _POSIX_VDISABLE;
+
+#define BREAK_ADD(c) do { \
+ if (tp->t_termios.c_cc[c] != _POSIX_VDISABLE) \
+ breakc[n++] = tp->t_termios.c_cc[c]; \
+} while (0)
+ /* Determine which characters we should trigger on. */
+ BREAK_ADD(VEOF);
+ BREAK_ADD(VEOL);
+#undef BREAK_ADD
+ breakc[n] = '\0';
+
+ do {
+ error = tty_wait_background(tp, curthread, SIGTTIN);
+ if (error)
+ return (error);
+
+ /*
+ * Quite a tricky case: unlike the old TTY
+ * implementation, this implementation copies data back
+ * to userspace in large chunks. Unfortunately, we can't
+ * calculate the line length on beforehand if it crosses
+ * ttyinq_block boundaries, because multiple reads could
+ * then make this code read beyond the newline.
+ *
+ * This is why we limit the read to:
+ * - The size the user has requested
+ * - The blocksize (done in tty_inq.c)
+ * - The amount of bytes until the newline
+ *
+ * This causes the line length to be recalculated after
+ * each block has been copied to userspace. This will
+ * cause the TTY layer to return data in chunks using
+ * the blocksize (except the first and last blocks).
+ */
+ clen = ttyinq_findchar(&tp->t_inq, breakc, uio->uio_resid,
+ &lastc);
+
+ /* No more data. */
+ if (clen == 0) {
+ if (tp->t_flags & TF_ZOMBIE)
+ return (0);
+ else if (ioflag & IO_NDELAY)
+ return (EWOULDBLOCK);
+
+ error = tty_wait(tp, &tp->t_inwait);
+ if (error)
+ return (error);
+ continue;
+ }
+
+ /* Don't send the EOF char back to userspace. */
+ if (CMP_CC(VEOF, lastc))
+ flen = 1;
+
+ MPASS(flen <= clen);
+
+ /* Read and throw away the EOF character. */
+ error = ttyinq_read_uio(&tp->t_inq, tp, uio, clen, flen);
+ if (error)
+ return (error);
+
+ } while (uio->uio_resid > 0 && lastc == _POSIX_VDISABLE);
+
+ return (0);
+}
+
+static int
+ttydisc_read_raw_no_timer(struct tty *tp, struct uio *uio, int ioflag)
+{
+ size_t vmin = tp->t_termios.c_cc[VMIN];
+ ssize_t oresid = uio->uio_resid;
+ int error;
+
+ MPASS(tp->t_termios.c_cc[VTIME] == 0);
+
+ /*
+ * This routine implements the easy cases of read()s while in
+ * non-canonical mode, namely case B and D, where we don't have
+ * any timers at all.
+ */
+
+ for (;;) {
+ error = tty_wait_background(tp, curthread, SIGTTIN);
+ if (error)
+ return (error);
+
+ error = ttyinq_read_uio(&tp->t_inq, tp, uio,
+ uio->uio_resid, 0);
+ if (error)
+ return (error);
+ if (uio->uio_resid == 0 || (oresid - uio->uio_resid) >= vmin)
+ return (0);
+
+ /* We have to wait for more. */
+ if (tp->t_flags & TF_ZOMBIE)
+ return (0);
+ else if (ioflag & IO_NDELAY)
+ return (EWOULDBLOCK);
+
+ error = tty_wait(tp, &tp->t_inwait);
+ if (error)
+ return (error);
+ }
+}
+
+static int
+ttydisc_read_raw_read_timer(struct tty *tp, struct uio *uio, int ioflag,
+ int oresid)
+{
+ size_t vmin = MAX(tp->t_termios.c_cc[VMIN], 1);
+ unsigned int vtime = tp->t_termios.c_cc[VTIME];
+ struct timeval end, now, left;
+ int error, hz;
+
+ MPASS(tp->t_termios.c_cc[VTIME] != 0);
+
+ /* Determine when the read should be expired. */
+ end.tv_sec = vtime / 10;
+ end.tv_usec = (vtime % 10) * 100000;
+ getmicrotime(&now);
+ timevaladd(&end, &now);
+
+ for (;;) {
+ error = tty_wait_background(tp, curthread, SIGTTIN);
+ if (error)
+ return (error);
+
+ error = ttyinq_read_uio(&tp->t_inq, tp, uio,
+ uio->uio_resid, 0);
+ if (error)
+ return (error);
+ if (uio->uio_resid == 0 || (oresid - uio->uio_resid) >= vmin)
+ return (0);
+
+ /* Calculate how long we should wait. */
+ getmicrotime(&now);
+ if (timevalcmp(&now, &end, >))
+ return (0);
+ left = end;
+ timevalsub(&left, &now);
+ hz = tvtohz(&left);
+
+ /*
+ * We have to wait for more. If the timer expires, we
+ * should return a 0-byte read.
+ */
+ if (tp->t_flags & TF_ZOMBIE)
+ return (0);
+ else if (ioflag & IO_NDELAY)
+ return (EWOULDBLOCK);
+
+ error = tty_timedwait(tp, &tp->t_inwait, hz);
+ if (error)
+ return (error == EWOULDBLOCK ? 0 : error);
+ }
+
+ return (0);
+}
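+
+/*
+ * Illustrative sketch (hypothetical helper, not part of the original
+ * code): VTIME is expressed in tenths of a second, so the deadline
+ * computed at the top of ttydisc_read_raw_read_timer() splits it into
+ * whole seconds and microseconds.  VTIME = 25 means 2.5 seconds, i.e.
+ * tv_sec = 2 and tv_usec = 500000.
+ */
+static __inline void
+ttydisc_vtime_to_timeval(unsigned int vtime, struct timeval *tv)
+{
+
+ tv->tv_sec = vtime / 10;
+ tv->tv_usec = (vtime % 10) * 100000;
+}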
+
+static int
+ttydisc_read_raw_interbyte_timer(struct tty *tp, struct uio *uio, int ioflag)
+{
+ size_t vmin = tp->t_termios.c_cc[VMIN];
+ ssize_t oresid = uio->uio_resid;
+ int error;
+
+ MPASS(tp->t_termios.c_cc[VMIN] != 0);
+ MPASS(tp->t_termios.c_cc[VTIME] != 0);
+
+ /*
+ * When using the interbyte timer, the timer should be started
+ * after the first byte has been received. We just call into the
+ * generic read timer code after we've received the first byte.
+ */
+
+ for (;;) {
+ error = tty_wait_background(tp, curthread, SIGTTIN);
+ if (error)
+ return (error);
+
+ error = ttyinq_read_uio(&tp->t_inq, tp, uio,
+ uio->uio_resid, 0);
+ if (error)
+ return (error);
+ if (uio->uio_resid == 0 || (oresid - uio->uio_resid) >= vmin)
+ return (0);
+
+ /*
+ * Not enough data, but we did receive some, which means
+ * we'll now start using the interbyte timer.
+ */
+ if (oresid != uio->uio_resid)
+ break;
+
+ /* We have to wait for more. */
+ if (tp->t_flags & TF_ZOMBIE)
+ return (0);
+ else if (ioflag & IO_NDELAY)
+ return (EWOULDBLOCK);
+
+ error = tty_wait(tp, &tp->t_inwait);
+ if (error)
+ return (error);
+ }
+
+ return ttydisc_read_raw_read_timer(tp, uio, ioflag, oresid);
+}
+
+int
+ttydisc_read(struct tty *tp, struct uio *uio, int ioflag)
+{
+ int error;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (uio->uio_resid == 0)
+ return (0);
+
+ if (CMP_FLAG(l, ICANON))
+ error = ttydisc_read_canonical(tp, uio, ioflag);
+ else if (tp->t_termios.c_cc[VTIME] == 0)
+ error = ttydisc_read_raw_no_timer(tp, uio, ioflag);
+ else if (tp->t_termios.c_cc[VMIN] == 0)
+ error = ttydisc_read_raw_read_timer(tp, uio, ioflag,
+ uio->uio_resid);
+ else
+ error = ttydisc_read_raw_interbyte_timer(tp, uio, ioflag);
+
+ if (ttyinq_bytesleft(&tp->t_inq) >= tp->t_inlow ||
+ ttyinq_bytescanonicalized(&tp->t_inq) == 0) {
+ /* Unset the input watermark when we've got enough space. */
+ tty_hiwat_in_unblock(tp);
+ }
+
+ return (error);
+}
+
+static __inline unsigned int
+ttydisc_findchar(const char *obstart, unsigned int oblen)
+{
+ const char *c = obstart;
+
+ while (oblen--) {
+ if (CTL_VALID(*c))
+ break;
+ c++;
+ }
+
+ return (c - obstart);
+}
+
+static int
+ttydisc_write_oproc(struct tty *tp, char c)
+{
+ unsigned int scnt, error;
+
+ MPASS(CMP_FLAG(o, OPOST));
+ MPASS(CTL_VALID(c));
+
+#define PRINT_NORMAL() ttyoutq_write_nofrag(&tp->t_outq, &c, 1)
+ switch (c) {
+ case CEOF:
+ /* End-of-text dropping. */
+ if (CMP_FLAG(o, ONOEOT))
+ return (0);
+ return PRINT_NORMAL();
+
+ case CERASE2:
+ /* Handle backspace to fix tab expansion. */
+ if (PRINT_NORMAL() != 0)
+ return (-1);
+ if (tp->t_column > 0)
+ tp->t_column--;
+ return (0);
+
+ case CTAB:
+ /* Tab expansion. */
+ scnt = 8 - (tp->t_column & 7);
+ if (CMP_FLAG(o, TAB3)) {
+ error = ttyoutq_write_nofrag(&tp->t_outq,
+ " ", scnt);
+ } else {
+ error = PRINT_NORMAL();
+ }
+ if (error)
+ return (-1);
+
+ tp->t_column += scnt;
+ MPASS((tp->t_column % 8) == 0);
+ return (0);
+
+ case CNL:
+ /* Newline conversion. */
+ if (CMP_FLAG(o, ONLCR)) {
+ /* Convert \n to \r\n. */
+ error = ttyoutq_write_nofrag(&tp->t_outq, "\r\n", 2);
+ } else {
+ error = PRINT_NORMAL();
+ }
+ if (error)
+ return (-1);
+
+ if (CMP_FLAG(o, ONLCR|ONLRET)) {
+ tp->t_column = tp->t_writepos = 0;
+ ttyinq_reprintpos_set(&tp->t_inq);
+ }
+ return (0);
+
+ case CCR:
+ /* Carriage return to newline conversion. */
+ if (CMP_FLAG(o, OCRNL))
+ c = CNL;
+ /* Omit carriage returns on column 0. */
+ if (CMP_FLAG(o, ONOCR) && tp->t_column == 0)
+ return (0);
+ if (PRINT_NORMAL() != 0)
+ return (-1);
+
+ tp->t_column = tp->t_writepos = 0;
+ ttyinq_reprintpos_set(&tp->t_inq);
+ return (0);
+ }
+
+ /*
+ * Invisible control character. Print it, but don't
+ * increase the column count.
+ */
+ return PRINT_NORMAL();
+#undef PRINT_NORMAL
+}
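To make the column bookkeeping above concrete, a tiny stand-alone illustration (not part of this change) of the tab-stop arithmetic used in the CTAB case: a tab at column col expands to 8 - (col & 7) positions, which is also why a single input byte can require up to eight bytes of output under TAB3. The function name is an illustrative assumption.

/* Illustration only: distance from column "col" to the next 8-column tab stop. */
static unsigned int
tab_width(unsigned int col)
{

	return (8 - (col & 7));		/* always in the range 1..8 */
}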
+
+/*
+ * Just like the old TTY implementation, we need to copy data in chunks
+ * into a temporary buffer. One of the reasons we need to do this is
+ * that output processing (only TAB3, though) may expand the output to
+ * eight times the input size.
+ */
+int
+ttydisc_write(struct tty *tp, struct uio *uio, int ioflag)
+{
+ char ob[TTY_STACKBUF];
+ char *obstart;
+ int error = 0;
+ unsigned int oblen = 0;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tp->t_flags & TF_ZOMBIE)
+ return (EIO);
+
+ /*
+	 * We don't need to check whether the process is in the foreground
+	 * process group or whether we have a carrier. This is already done
+ * in ttydev_write().
+ */
+
+ while (uio->uio_resid > 0) {
+ unsigned int nlen;
+
+ MPASS(oblen == 0);
+
+ /* Step 1: read data. */
+ obstart = ob;
+ nlen = MIN(uio->uio_resid, sizeof ob);
+ tty_unlock(tp);
+ error = uiomove(ob, nlen, uio);
+ tty_lock(tp);
+ if (error != 0)
+ break;
+ oblen = nlen;
+
+ if (tty_gone(tp)) {
+ error = ENXIO;
+ break;
+ }
+
+ MPASS(oblen > 0);
+
+ /* Step 2: process data. */
+ do {
+ unsigned int plen, wlen;
+
+ /* Search for special characters for post processing. */
+ if (CMP_FLAG(o, OPOST)) {
+ plen = ttydisc_findchar(obstart, oblen);
+ } else {
+ plen = oblen;
+ }
+
+ if (plen == 0) {
+ /*
+				 * We're about to process a character
+				 * that needs output processing.
+ */
+ if (ttydisc_write_oproc(tp, *obstart) == 0) {
+ obstart++;
+ oblen--;
+
+ tp->t_writepos = tp->t_column;
+ ttyinq_reprintpos_set(&tp->t_inq);
+ continue;
+ }
+ } else {
+ /* We're going to write regular data. */
+ wlen = ttyoutq_write(&tp->t_outq, obstart, plen);
+ obstart += wlen;
+ oblen -= wlen;
+ tp->t_column += wlen;
+
+ tp->t_writepos = tp->t_column;
+ ttyinq_reprintpos_set(&tp->t_inq);
+
+ if (wlen == plen)
+ continue;
+ }
+
+ /* Watermark reached. Try to sleep. */
+ tp->t_flags |= TF_HIWAT_OUT;
+
+ if (ioflag & IO_NDELAY) {
+ error = EWOULDBLOCK;
+ goto done;
+ }
+
+ /*
+ * The driver may write back the data
+ * synchronously. Be sure to check the high
+ * water mark before going to sleep.
+ */
+ ttydevsw_outwakeup(tp);
+ if ((tp->t_flags & TF_HIWAT_OUT) == 0)
+ continue;
+
+ error = tty_wait(tp, &tp->t_outwait);
+ if (error)
+ goto done;
+
+ if (tp->t_flags & TF_ZOMBIE) {
+ error = EIO;
+ goto done;
+ }
+ } while (oblen > 0);
+ }
+
+done:
+ if (!tty_gone(tp))
+ ttydevsw_outwakeup(tp);
+
+ /*
+	 * Add the number of bytes that we didn't process back to the
+ * uio counters. We need to do this to make sure write() doesn't
+ * count the bytes we didn't store in the queue.
+ */
+ uio->uio_resid += oblen;
+ return (error);
+}
+
+void
+ttydisc_optimize(struct tty *tp)
+{
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (ttyhook_hashook(tp, rint_bypass)) {
+ tp->t_flags |= TF_BYPASS;
+ } else if (ttyhook_hashook(tp, rint)) {
+ tp->t_flags &= ~TF_BYPASS;
+ } else if (!CMP_FLAG(i, ICRNL|IGNCR|IMAXBEL|INLCR|ISTRIP|IXON) &&
+ (!CMP_FLAG(i, BRKINT) || CMP_FLAG(i, IGNBRK)) &&
+ (!CMP_FLAG(i, PARMRK) ||
+ CMP_FLAG(i, IGNPAR|IGNBRK) == (IGNPAR|IGNBRK)) &&
+ !CMP_FLAG(l, ECHO|ICANON|IEXTEN|ISIG|PENDIN)) {
+ tp->t_flags |= TF_BYPASS;
+ } else {
+ tp->t_flags &= ~TF_BYPASS;
+ }
+}
+
+void
+ttydisc_modem(struct tty *tp, int open)
+{
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (open)
+ cv_broadcast(&tp->t_dcdwait);
+
+ /*
+ * Ignore modem status lines when CLOCAL is turned on, but don't
+ * enter the zombie state when the TTY isn't opened, because
+ * that would cause the TTY to be in zombie state after being
+ * opened.
+ */
+ if (!tty_opened(tp) || CMP_FLAG(c, CLOCAL))
+ return;
+
+ if (open == 0) {
+ /*
+ * Lost carrier.
+ */
+ tp->t_flags |= TF_ZOMBIE;
+
+ tty_signal_sessleader(tp, SIGHUP);
+ tty_flush(tp, FREAD|FWRITE);
+ } else {
+ /*
+ * Carrier is back again.
+ */
+
+ /* XXX: what should we do here? */
+ }
+}
+
+static int
+ttydisc_echo_force(struct tty *tp, char c, int quote)
+{
+
+ if (CMP_FLAG(o, OPOST) && CTL_ECHO(c, quote)) {
+ /*
+ * Only perform postprocessing when OPOST is turned on
+ * and the character is an unquoted BS/TB/NL/CR.
+ */
+ return ttydisc_write_oproc(tp, c);
+ } else if (CMP_FLAG(l, ECHOCTL) && CTL_PRINT(c, quote)) {
+ /*
+ * Only use ^X notation when ECHOCTL is turned on and
+		 * we've got a quoted control character.
+ *
+ * Print backspaces when echoing an end-of-file.
+ */
+ char ob[4] = "^?\b\b";
+
+ /* Print ^X notation. */
+ if (c != 0x7f)
+ ob[1] = c + 'A' - 1;
+
+ if (!quote && CMP_CC(VEOF, c)) {
+ return ttyoutq_write_nofrag(&tp->t_outq, ob, 4);
+ } else {
+ tp->t_column += 2;
+ return ttyoutq_write_nofrag(&tp->t_outq, ob, 2);
+ }
+ } else {
+ /* Can just be printed. */
+ tp->t_column++;
+ return ttyoutq_write_nofrag(&tp->t_outq, &c, 1);
+ }
+}
+
+static int
+ttydisc_echo(struct tty *tp, char c, int quote)
+{
+
+ /*
+ * Only echo characters when ECHO is turned on, or ECHONL when
+ * the character is an unquoted newline.
+ */
+ if (!CMP_FLAG(l, ECHO) &&
+ (!CMP_FLAG(l, ECHONL) || c != CNL || quote))
+ return (0);
+
+ return ttydisc_echo_force(tp, c, quote);
+}
+
+static void
+ttydisc_reprint_char(void *d, char c, int quote)
+{
+ struct tty *tp = d;
+
+ ttydisc_echo(tp, c, quote);
+}
+
+static void
+ttydisc_reprint(struct tty *tp)
+{
+ cc_t c;
+
+ /* Print ^R\n, followed by the line. */
+ c = tp->t_termios.c_cc[VREPRINT];
+ if (c != _POSIX_VDISABLE)
+ ttydisc_echo(tp, c, 0);
+ ttydisc_echo(tp, CNL, 0);
+ ttyinq_reprintpos_reset(&tp->t_inq);
+
+ ttyinq_line_iterate_from_linestart(&tp->t_inq, ttydisc_reprint_char, tp);
+}
+
+struct ttydisc_recalc_length {
+ struct tty *tp;
+ unsigned int curlen;
+};
+
+static void
+ttydisc_recalc_charlength(void *d, char c, int quote)
+{
+ struct ttydisc_recalc_length *data = d;
+ struct tty *tp = data->tp;
+
+ if (CTL_PRINT(c, quote)) {
+ if (CMP_FLAG(l, ECHOCTL))
+ data->curlen += 2;
+ } else if (c == CTAB) {
+ data->curlen += 8 - (data->curlen & 7);
+ } else {
+ data->curlen++;
+ }
+}
+
+static unsigned int
+ttydisc_recalc_linelength(struct tty *tp)
+{
+ struct ttydisc_recalc_length data = { tp, tp->t_writepos };
+
+ ttyinq_line_iterate_from_reprintpos(&tp->t_inq,
+ ttydisc_recalc_charlength, &data);
+ return (data.curlen);
+}
+
+static int
+ttydisc_rubchar(struct tty *tp)
+{
+ char c;
+ int quote;
+ unsigned int prevpos, tablen;
+
+ if (ttyinq_peekchar(&tp->t_inq, &c, &quote) != 0)
+ return (-1);
+ ttyinq_unputchar(&tp->t_inq);
+
+ if (CMP_FLAG(l, ECHO)) {
+ /*
+ * Remove the character from the screen. This is even
+		 * safe for characters that span multiple columns
+		 * (tabs, quoted characters, etc.).
+ */
+ if (tp->t_writepos >= tp->t_column) {
+ /* Retype the sentence. */
+ ttydisc_reprint(tp);
+ } else if (CMP_FLAG(l, ECHOE)) {
+ if (CTL_PRINT(c, quote)) {
+ /* Remove ^X formatted chars. */
+ if (CMP_FLAG(l, ECHOCTL)) {
+ tp->t_column -= 2;
+ ttyoutq_write_nofrag(&tp->t_outq,
+ "\b\b \b\b", 6);
+ }
+ } else if (c == ' ') {
+ /* Space character needs no rubbing. */
+ tp->t_column -= 1;
+ ttyoutq_write_nofrag(&tp->t_outq, "\b", 1);
+ } else if (c == CTAB) {
+ /*
+ * Making backspace work with tabs is
+ * quite hard. Recalculate the length of
+ * this character and remove it.
+ *
+ * Because terminal settings could be
+ * changed while the line is being
+				 * inserted, the calculations may not be
+				 * correct. Make sure we keep the tab
+				 * length within proper bounds.
+ */
+ prevpos = ttydisc_recalc_linelength(tp);
+ if (prevpos >= tp->t_column)
+ tablen = 1;
+ else
+ tablen = tp->t_column - prevpos;
+ if (tablen > 8)
+ tablen = 8;
+
+ tp->t_column = prevpos;
+ ttyoutq_write_nofrag(&tp->t_outq,
+ "\b\b\b\b\b\b\b\b", tablen);
+ return (0);
+ } else {
+ /*
+ * Remove a regular character by
+ * punching a space over it.
+ */
+ tp->t_column -= 1;
+ ttyoutq_write_nofrag(&tp->t_outq, "\b \b", 3);
+ }
+ } else {
+ /* Don't print spaces. */
+ ttydisc_echo(tp, tp->t_termios.c_cc[VERASE], 0);
+ }
+ }
+
+ return (0);
+}
+
+static void
+ttydisc_rubword(struct tty *tp)
+{
+ char c;
+ int quote, alnum;
+
+ /* Strip whitespace first. */
+ for (;;) {
+ if (ttyinq_peekchar(&tp->t_inq, &c, &quote) != 0)
+ return;
+ if (!CTL_WHITE(c))
+ break;
+ ttydisc_rubchar(tp);
+ }
+
+ /*
+ * Record whether the last character from the previous iteration
+ * was alphanumeric or not. We need this to implement ALTWERASE.
+ */
+ alnum = CTL_ALNUM(c);
+ for (;;) {
+ ttydisc_rubchar(tp);
+
+ if (ttyinq_peekchar(&tp->t_inq, &c, &quote) != 0)
+ return;
+ if (CTL_WHITE(c))
+ return;
+ if (CMP_FLAG(l, ALTWERASE) && CTL_ALNUM(c) != alnum)
+ return;
+ }
+}
+
+int
+ttydisc_rint(struct tty *tp, char c, int flags)
+{
+ int signal, quote = 0;
+ char ob[3] = { 0xff, 0x00 };
+ size_t ol;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ atomic_add_long(&tty_nin, 1);
+
+ if (ttyhook_hashook(tp, rint))
+ return ttyhook_rint(tp, c, flags);
+
+ if (tp->t_flags & TF_BYPASS)
+ goto processed;
+
+ if (flags) {
+ if (flags & TRE_BREAK) {
+ if (CMP_FLAG(i, IGNBRK)) {
+ /* Ignore break characters. */
+ return (0);
+ } else if (CMP_FLAG(i, BRKINT)) {
+ /* Generate SIGINT on break. */
+ tty_flush(tp, FREAD|FWRITE);
+ tty_signal_pgrp(tp, SIGINT);
+ return (0);
+ } else {
+ /* Just print it. */
+ goto parmrk;
+ }
+ } else if (flags & TRE_FRAMING ||
+ (flags & TRE_PARITY && CMP_FLAG(i, INPCK))) {
+ if (CMP_FLAG(i, IGNPAR)) {
+ /* Ignore bad characters. */
+ return (0);
+ } else {
+ /* Just print it. */
+ goto parmrk;
+ }
+ }
+ }
+
+ /* Allow any character to perform a wakeup. */
+ if (CMP_FLAG(i, IXANY))
+ tp->t_flags &= ~TF_STOPPED;
+
+ /* Remove the top bit. */
+ if (CMP_FLAG(i, ISTRIP))
+ c &= ~0x80;
+
+ /* Skip input processing when we want to print it literally. */
+ if (tp->t_flags & TF_LITERAL) {
+ tp->t_flags &= ~TF_LITERAL;
+ quote = 1;
+ goto processed;
+ }
+
+ /* Special control characters that are implementation dependent. */
+ if (CMP_FLAG(l, IEXTEN)) {
+ /* Accept the next character as literal. */
+ if (CMP_CC(VLNEXT, c)) {
+ if (CMP_FLAG(l, ECHO)) {
+ if (CMP_FLAG(l, ECHOE))
+ ttyoutq_write_nofrag(&tp->t_outq, "^\b", 2);
+ else
+ ttydisc_echo(tp, c, 0);
+ }
+ tp->t_flags |= TF_LITERAL;
+ return (0);
+ }
+ }
+
+ /*
+ * Handle signal processing.
+ */
+ if (CMP_FLAG(l, ISIG)) {
+ if (CMP_FLAG(l, ICANON|IEXTEN) == (ICANON|IEXTEN)) {
+ if (CMP_CC(VSTATUS, c)) {
+ tty_signal_pgrp(tp, SIGINFO);
+ return (0);
+ }
+ }
+
+ /*
+		 * Compared to the old implementation, this
+		 * implementation also flushes the output queue. POSIX
+		 * is really brief about this, but does make us assume
+ * we have to do so.
+ */
+ signal = 0;
+ if (CMP_CC(VINTR, c)) {
+ signal = SIGINT;
+ } else if (CMP_CC(VQUIT, c)) {
+ signal = SIGQUIT;
+ } else if (CMP_CC(VSUSP, c)) {
+ signal = SIGTSTP;
+ }
+
+ if (signal != 0) {
+ /*
+ * Echo the character before signalling the
+ * processes.
+ */
+ if (!CMP_FLAG(l, NOFLSH))
+ tty_flush(tp, FREAD|FWRITE);
+ ttydisc_echo(tp, c, 0);
+ tty_signal_pgrp(tp, signal);
+ return (0);
+ }
+ }
+
+ /*
+ * Handle start/stop characters.
+ */
+ if (CMP_FLAG(i, IXON)) {
+ if (CMP_CC(VSTOP, c)) {
+ /* Stop it if we aren't stopped yet. */
+ if ((tp->t_flags & TF_STOPPED) == 0) {
+ tp->t_flags |= TF_STOPPED;
+ return (0);
+ }
+ /*
+ * Fallthrough:
+ * When VSTART == VSTOP, we should make this key
+ * toggle it.
+ */
+ if (!CMP_CC(VSTART, c))
+ return (0);
+ }
+ if (CMP_CC(VSTART, c)) {
+ tp->t_flags &= ~TF_STOPPED;
+ return (0);
+ }
+ }
+
+ /* Conversion of CR and NL. */
+ switch (c) {
+ case CCR:
+ if (CMP_FLAG(i, IGNCR))
+ return (0);
+ if (CMP_FLAG(i, ICRNL))
+ c = CNL;
+ break;
+ case CNL:
+ if (CMP_FLAG(i, INLCR))
+ c = CCR;
+ break;
+ }
+
+ /* Canonical line editing. */
+ if (CMP_FLAG(l, ICANON)) {
+ if (CMP_CC(VERASE, c) || CMP_CC(VERASE2, c)) {
+ ttydisc_rubchar(tp);
+ return (0);
+ } else if (CMP_CC(VKILL, c)) {
+ while (ttydisc_rubchar(tp) == 0);
+ return (0);
+ } else if (CMP_FLAG(l, IEXTEN)) {
+ if (CMP_CC(VWERASE, c)) {
+ ttydisc_rubword(tp);
+ return (0);
+ } else if (CMP_CC(VREPRINT, c)) {
+ ttydisc_reprint(tp);
+ return (0);
+ }
+ }
+ }
+
+processed:
+ if (CMP_FLAG(i, PARMRK) && (unsigned char)c == 0xff) {
+ /* Print 0xff 0xff. */
+ ob[1] = 0xff;
+ ol = 2;
+ quote = 1;
+ } else {
+ ob[0] = c;
+ ol = 1;
+ }
+
+ goto print;
+
+parmrk:
+ if (CMP_FLAG(i, PARMRK)) {
+ /* Prepend 0xff 0x00 0x.. */
+ ob[2] = c;
+ ol = 3;
+ quote = 1;
+ } else {
+ ob[0] = c;
+ ol = 1;
+ }
+
+print:
+ /* See if we can store this on the input queue. */
+ if (ttyinq_write_nofrag(&tp->t_inq, ob, ol, quote) != 0) {
+ if (CMP_FLAG(i, IMAXBEL))
+ ttyoutq_write_nofrag(&tp->t_outq, "\a", 1);
+
+ /*
+ * Prevent a deadlock here. It may be possible that a
+		 * user has entered so much data that there is no data
+		 * available to read(), even though the buffers are full.
+ *
+ * Only enter the high watermark if the device driver
+ * can actually transmit something.
+ */
+ if (ttyinq_bytescanonicalized(&tp->t_inq) == 0)
+ return (0);
+
+ tty_hiwat_in_block(tp);
+ return (-1);
+ }
+
+ /*
+ * In raw mode, we canonicalize after receiving a single
+ * character. Otherwise, we canonicalize when we receive a
+ * newline, VEOL or VEOF, but only when it isn't quoted.
+ */
+ if (!CMP_FLAG(l, ICANON) ||
+ (!quote && (c == CNL || CMP_CC(VEOL, c) || CMP_CC(VEOF, c)))) {
+ ttyinq_canonicalize(&tp->t_inq);
+ }
+
+ ttydisc_echo(tp, c, quote);
+
+ return (0);
+}
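For readers of the PARMRK paths above, a hedged user-space sketch (not part of this change) of how the 0xff escape sequences produced here are decoded on the read() side; the function name is an illustrative assumption and the caller is assumed to pass len > 0.

#include <stddef.h>

/*
 * Decode one PARMRK-escaped byte: 0xff 0xff is a literal 0xff, and
 * 0xff 0x00 c is byte c received with a break/framing/parity error.
 * Returns the number of input bytes consumed.
 */
static size_t
parmrk_decode(const unsigned char *in, size_t len, unsigned char *out, int *err)
{

	*err = 0;
	if (len >= 2 && in[0] == 0xff && in[1] == 0xff) {
		*out = 0xff;
		return (2);
	}
	if (len >= 3 && in[0] == 0xff && in[1] == 0x00) {
		*out = in[2];
		*err = 1;
		return (3);
	}
	*out = in[0];
	return (1);
}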
+
+size_t
+ttydisc_rint_simple(struct tty *tp, const void *buf, size_t len)
+{
+ const char *cbuf;
+
+ if (ttydisc_can_bypass(tp))
+ return (ttydisc_rint_bypass(tp, buf, len));
+
+ for (cbuf = buf; len-- > 0; cbuf++) {
+ if (ttydisc_rint(tp, *cbuf, 0) != 0)
+ break;
+ }
+
+ return (cbuf - (const char *)buf);
+}
+
+size_t
+ttydisc_rint_bypass(struct tty *tp, const void *buf, size_t len)
+{
+ size_t ret;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ MPASS(tp->t_flags & TF_BYPASS);
+
+ atomic_add_long(&tty_nin, len);
+
+ if (ttyhook_hashook(tp, rint_bypass))
+ return ttyhook_rint_bypass(tp, buf, len);
+
+ ret = ttyinq_write(&tp->t_inq, buf, len, 0);
+ ttyinq_canonicalize(&tp->t_inq);
+ if (ret < len)
+ tty_hiwat_in_block(tp);
+
+ return (ret);
+}
+
+void
+ttydisc_rint_done(struct tty *tp)
+{
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (ttyhook_hashook(tp, rint_done))
+ ttyhook_rint_done(tp);
+
+ /* Wake up readers. */
+ tty_wakeup(tp, FREAD);
+ /* Wake up driver for echo. */
+ ttydevsw_outwakeup(tp);
+}
+
+size_t
+ttydisc_rint_poll(struct tty *tp)
+{
+ size_t l;
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (ttyhook_hashook(tp, rint_poll))
+ return ttyhook_rint_poll(tp);
+
+ /*
+ * XXX: Still allow character input when there's no space in the
+	 * buffers, as long as we haven't entered the high watermark. This is
+ * to allow backspace characters to be inserted when in
+ * canonical mode.
+ */
+ l = ttyinq_bytesleft(&tp->t_inq);
+ if (l == 0 && (tp->t_flags & TF_HIWAT_IN) == 0)
+ return (1);
+
+ return (l);
+}
+
+static void
+ttydisc_wakeup_watermark(struct tty *tp)
+{
+ size_t c;
+
+ c = ttyoutq_bytesleft(&tp->t_outq);
+ if (tp->t_flags & TF_HIWAT_OUT) {
+ /* Only allow us to run when we're below the watermark. */
+ if (c < tp->t_outlow)
+ return;
+
+ /* Reset the watermark. */
+ tp->t_flags &= ~TF_HIWAT_OUT;
+ } else {
+ /* Only run when we have data at all. */
+ if (c == 0)
+ return;
+ }
+ tty_wakeup(tp, FWRITE);
+}
+
+size_t
+ttydisc_getc(struct tty *tp, void *buf, size_t len)
+{
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tp->t_flags & TF_STOPPED)
+ return (0);
+
+ if (ttyhook_hashook(tp, getc_inject))
+ return ttyhook_getc_inject(tp, buf, len);
+
+ len = ttyoutq_read(&tp->t_outq, buf, len);
+
+ if (ttyhook_hashook(tp, getc_capture))
+ ttyhook_getc_capture(tp, buf, len);
+
+ ttydisc_wakeup_watermark(tp);
+ atomic_add_long(&tty_nout, len);
+
+ return (len);
+}
+
+int
+ttydisc_getc_uio(struct tty *tp, struct uio *uio)
+{
+ int error = 0;
+ ssize_t obytes = uio->uio_resid;
+ size_t len;
+ char buf[TTY_STACKBUF];
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tp->t_flags & TF_STOPPED)
+ return (0);
+
+ /*
+ * When a TTY hook is attached, we cannot perform unbuffered
+ * copying to userspace. Just call ttydisc_getc() and
+ * temporarily store data in a shadow buffer.
+ */
+ if (ttyhook_hashook(tp, getc_capture) ||
+ ttyhook_hashook(tp, getc_inject)) {
+ while (uio->uio_resid > 0) {
+ /* Read to shadow buffer. */
+ len = ttydisc_getc(tp, buf,
+ MIN(uio->uio_resid, sizeof buf));
+ if (len == 0)
+ break;
+
+ /* Copy to userspace. */
+ tty_unlock(tp);
+ error = uiomove(buf, len, uio);
+ tty_lock(tp);
+
+ if (error != 0)
+ break;
+ }
+ } else {
+ error = ttyoutq_read_uio(&tp->t_outq, tp, uio);
+
+ ttydisc_wakeup_watermark(tp);
+ atomic_add_long(&tty_nout, obytes - uio->uio_resid);
+ }
+
+ return (error);
+}
+
+size_t
+ttydisc_getc_poll(struct tty *tp)
+{
+
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tp->t_flags & TF_STOPPED)
+ return (0);
+
+ if (ttyhook_hashook(tp, getc_poll))
+ return ttyhook_getc_poll(tp);
+
+ return ttyoutq_bytesused(&tp->t_outq);
+}
+
+/*
+ * XXX: not really related to the TTYDISC, but we'd better put
+ * tty_putchar() here, because we need to perform proper output
+ * processing.
+ */
+
+int
+tty_putchar(struct tty *tp, char c)
+{
+ tty_lock_assert(tp, MA_OWNED);
+
+ if (tty_gone(tp))
+ return (-1);
+
+ ttydisc_echo_force(tp, c, 0);
+ tp->t_writepos = tp->t_column;
+ ttyinq_reprintpos_set(&tp->t_inq);
+
+ ttydevsw_outwakeup(tp);
+ return (0);
+}
diff --git a/sys/kern/uipc_accf.c b/sys/kern/uipc_accf.c
new file mode 100644
index 0000000..236b60d
--- /dev/null
+++ b/sys/kern/uipc_accf.c
@@ -0,0 +1,298 @@
+/*-
+ * Copyright (c) 2000 Paycounter, Inc.
+ * Copyright (c) 2005 Robert N. M. Watson
+ * Author: Alfred Perlstein <alfred@paycounter.com>, <alfred@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define ACCEPT_FILTER_MOD
+
+#include "opt_param.h"
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/domain.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/protosw.h>
+#include <sys/sysctl.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/queue.h>
+
+static struct mtx accept_filter_mtx;
+MTX_SYSINIT(accept_filter, &accept_filter_mtx, "accept_filter_mtx",
+ MTX_DEF);
+#define ACCEPT_FILTER_LOCK() mtx_lock(&accept_filter_mtx)
+#define ACCEPT_FILTER_UNLOCK() mtx_unlock(&accept_filter_mtx)
+
+static SLIST_HEAD(, accept_filter) accept_filtlsthd =
+ SLIST_HEAD_INITIALIZER(accept_filtlsthd);
+
+MALLOC_DEFINE(M_ACCF, "accf", "accept filter data");
+
+static int unloadable = 0;
+
+SYSCTL_DECL(_net_inet); /* XXX: some header should do this for me */
+SYSCTL_NODE(_net_inet, OID_AUTO, accf, CTLFLAG_RW, 0, "Accept filters");
+SYSCTL_INT(_net_inet_accf, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
+ "Allow unload of accept filters (not recommended)");
+
+/*
+ * Must be passed a malloc'd structure so we don't explode if the kld is
+ * unloaded. We leak the struct on deallocation to deal with this, but if a
+ * filter is loaded with the same name as a leaked one, we re-use the entry.
+ */
+int
+accept_filt_add(struct accept_filter *filt)
+{
+ struct accept_filter *p;
+
+ ACCEPT_FILTER_LOCK();
+ SLIST_FOREACH(p, &accept_filtlsthd, accf_next)
+ if (strcmp(p->accf_name, filt->accf_name) == 0) {
+ if (p->accf_callback != NULL) {
+ ACCEPT_FILTER_UNLOCK();
+ return (EEXIST);
+ } else {
+ p->accf_callback = filt->accf_callback;
+ ACCEPT_FILTER_UNLOCK();
+ free(filt, M_ACCF);
+ return (0);
+ }
+ }
+
+ if (p == NULL)
+ SLIST_INSERT_HEAD(&accept_filtlsthd, filt, accf_next);
+ ACCEPT_FILTER_UNLOCK();
+ return (0);
+}
+
+int
+accept_filt_del(char *name)
+{
+ struct accept_filter *p;
+
+ p = accept_filt_get(name);
+ if (p == NULL)
+ return (ENOENT);
+
+ p->accf_callback = NULL;
+ return (0);
+}
+
+struct accept_filter *
+accept_filt_get(char *name)
+{
+ struct accept_filter *p;
+
+ ACCEPT_FILTER_LOCK();
+ SLIST_FOREACH(p, &accept_filtlsthd, accf_next)
+ if (strcmp(p->accf_name, name) == 0)
+ break;
+ ACCEPT_FILTER_UNLOCK();
+
+ return (p);
+}
+
+int
+accept_filt_generic_mod_event(module_t mod, int event, void *data)
+{
+ struct accept_filter *p;
+ struct accept_filter *accfp = (struct accept_filter *) data;
+ int error;
+
+ switch (event) {
+ case MOD_LOAD:
+ p = malloc(sizeof(*p), M_ACCF,
+ M_WAITOK);
+ bcopy(accfp, p, sizeof(*p));
+ error = accept_filt_add(p);
+ break;
+
+ case MOD_UNLOAD:
+ /*
+		 * Do not support unloading yet. We don't keep track of
+		 * refcounts, and unloading an accept filter callback and then
+ * having it called is a bad thing. A simple fix would be to
+ * track the refcount in the struct accept_filter.
+ */
+ if (unloadable != 0) {
+ error = accept_filt_del(accfp->accf_name);
+ } else
+ error = EOPNOTSUPP;
+ break;
+
+ case MOD_SHUTDOWN:
+ error = 0;
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return (error);
+}
+
+int
+do_getopt_accept_filter(struct socket *so, struct sockopt *sopt)
+{
+ struct accept_filter_arg *afap;
+ int error;
+
+ error = 0;
+ afap = malloc(sizeof(*afap), M_TEMP,
+ M_WAITOK | M_ZERO);
+ SOCK_LOCK(so);
+ if ((so->so_options & SO_ACCEPTCONN) == 0) {
+ error = EINVAL;
+ goto out;
+ }
+ if ((so->so_options & SO_ACCEPTFILTER) == 0) {
+ error = EINVAL;
+ goto out;
+ }
+ strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
+ if (so->so_accf->so_accept_filter_str != NULL)
+ strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
+out:
+ SOCK_UNLOCK(so);
+ if (error == 0)
+ error = sooptcopyout(sopt, afap, sizeof(*afap));
+ free(afap, M_TEMP);
+ return (error);
+}
+
+int
+do_setopt_accept_filter(struct socket *so, struct sockopt *sopt)
+{
+ struct accept_filter_arg *afap;
+ struct accept_filter *afp;
+ struct so_accf *newaf;
+ int error = 0;
+
+ /*
+ * Handle the simple delete case first.
+ */
+ if (sopt == NULL || sopt->sopt_val == NULL) {
+ SOCK_LOCK(so);
+ if ((so->so_options & SO_ACCEPTCONN) == 0) {
+ SOCK_UNLOCK(so);
+ return (EINVAL);
+ }
+ if (so->so_accf != NULL) {
+ struct so_accf *af = so->so_accf;
+ if (af->so_accept_filter != NULL &&
+ af->so_accept_filter->accf_destroy != NULL) {
+ af->so_accept_filter->accf_destroy(so);
+ }
+ if (af->so_accept_filter_str != NULL)
+ free(af->so_accept_filter_str, M_ACCF);
+ free(af, M_ACCF);
+ so->so_accf = NULL;
+ }
+ so->so_options &= ~SO_ACCEPTFILTER;
+ SOCK_UNLOCK(so);
+ return (0);
+ }
+
+ /*
+ * Pre-allocate any memory we may need later to avoid blocking at
+ * untimely moments. This does not optimize for invalid arguments.
+ */
+ afap = malloc(sizeof(*afap), M_TEMP,
+ M_WAITOK);
+ error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
+ afap->af_name[sizeof(afap->af_name)-1] = '\0';
+ afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
+ if (error) {
+ free(afap, M_TEMP);
+ return (error);
+ }
+ afp = accept_filt_get(afap->af_name);
+ if (afp == NULL) {
+ free(afap, M_TEMP);
+ return (ENOENT);
+ }
+ /*
+ * Allocate the new accept filter instance storage. We may
+ * have to free it again later if we fail to attach it. If
+ * attached properly, 'newaf' is NULLed to avoid a free()
+ * while in use.
+ */
+ newaf = malloc(sizeof(*newaf), M_ACCF, M_WAITOK |
+ M_ZERO);
+ if (afp->accf_create != NULL && afap->af_name[0] != '\0') {
+ int len = strlen(afap->af_name) + 1;
+ newaf->so_accept_filter_str = malloc(len, M_ACCF,
+ M_WAITOK);
+ strcpy(newaf->so_accept_filter_str, afap->af_name);
+ }
+
+ /*
+ * Require a listen socket; don't try to replace an existing filter
+ * without first removing it.
+ */
+ SOCK_LOCK(so);
+ if (((so->so_options & SO_ACCEPTCONN) == 0) ||
+ (so->so_accf != NULL)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * Invoke the accf_create() method of the filter if required. The
+ * socket mutex is held over this call, so create methods for filters
+ * can't block.
+ */
+ if (afp->accf_create != NULL) {
+ newaf->so_accept_filter_arg =
+ afp->accf_create(so, afap->af_arg);
+ if (newaf->so_accept_filter_arg == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+ }
+ newaf->so_accept_filter = afp;
+ so->so_accf = newaf;
+ so->so_options |= SO_ACCEPTFILTER;
+ newaf = NULL;
+out:
+ SOCK_UNLOCK(so);
+ if (newaf != NULL) {
+ if (newaf->so_accept_filter_str != NULL)
+ free(newaf->so_accept_filter_str, M_ACCF);
+ free(newaf, M_ACCF);
+ }
+ if (afap != NULL)
+ free(afap, M_TEMP);
+ return (error);
+}
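As a usage note for the setsockopt(2) path handled above, a minimal user-space sketch (not part of this change) that installs an accept filter on a socket that is already listening; "httpready" assumes the accf_http module has been loaded, and the helper name is an illustrative assumption.

#include <sys/types.h>
#include <sys/socket.h>
#include <string.h>

static int
install_httpready(int s)
{
	struct accept_filter_arg afa;

	memset(&afa, 0, sizeof(afa));
	strcpy(afa.af_name, "httpready");	/* filter registered by accf_http */
	return (setsockopt(s, SOL_SOCKET, SO_ACCEPTFILTER, &afa, sizeof(afa)));
}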
diff --git a/sys/kern/uipc_cow.c b/sys/kern/uipc_cow.c
new file mode 100644
index 0000000..8a3a5ff
--- /dev/null
+++ b/sys/kern/uipc_cow.c
@@ -0,0 +1,182 @@
+/*--
+ * Copyright (c) 1997, Duke University
+ * All rights reserved.
+ *
+ * Author:
+ * Andrew Gallatin <gallatin@cs.duke.edu>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of Duke University may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY DUKE UNIVERSITY ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DUKE UNIVERSITY BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This is a set of routines for enabling and disabling copy on write
+ * protection for data written into sockets.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/mbuf.h>
+#include <sys/sf_buf.h>
+#include <sys/socketvar.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+
+FEATURE(zero_copy_sockets, "Zero copy sockets support");
+
+struct netsend_cow_stats {
+ int attempted;
+ int fail_not_mapped;
+ int fail_sf_buf;
+ int success;
+ int iodone;
+};
+
+static struct netsend_cow_stats socow_stats;
+
+static int socow_iodone(struct mbuf *m, void *addr, void *args);
+
+static int
+socow_iodone(struct mbuf *m, void *addr, void *args)
+{
+ struct sf_buf *sf;
+ vm_page_t pp;
+
+ sf = args;
+ pp = sf_buf_page(sf);
+ sf_buf_free(sf);
+ /* remove COW mapping */
+ vm_page_lock(pp);
+ vm_page_cowclear(pp);
+ vm_page_unwire(pp, 0);
+ /*
+ * Check for the object going away on us. This can
+ * happen since we don't hold a reference to it.
+ * If so, we're responsible for freeing the page.
+ */
+ if (pp->wire_count == 0 && pp->object == NULL)
+ vm_page_free(pp);
+ vm_page_unlock(pp);
+ socow_stats.iodone++;
+ return (EXT_FREE_OK);
+}
+
+int
+socow_setup(struct mbuf *m0, struct uio *uio)
+{
+ struct sf_buf *sf;
+ vm_page_t pp;
+ struct iovec *iov;
+ struct vmspace *vmspace;
+ struct vm_map *map;
+ vm_offset_t offset, uva;
+ vm_size_t len;
+
+ socow_stats.attempted++;
+ vmspace = curproc->p_vmspace;
+ map = &vmspace->vm_map;
+ uva = (vm_offset_t) uio->uio_iov->iov_base;
+ offset = uva & PAGE_MASK;
+ len = PAGE_SIZE - offset;
+
+ /*
+ * Verify that access to the given address is allowed from user-space.
+ */
+ if (vm_fault_quick_hold_pages(map, uva, len, VM_PROT_READ, &pp, 1) <
+ 0) {
+ socow_stats.fail_not_mapped++;
+ return(0);
+ }
+
+ /*
+ * set up COW
+ */
+ vm_page_lock(pp);
+ if (vm_page_cowsetup(pp) != 0) {
+ vm_page_unhold(pp);
+ vm_page_unlock(pp);
+ return (0);
+ }
+
+ /*
+ * wire the page for I/O
+ */
+ vm_page_wire(pp);
+ vm_page_unhold(pp);
+ vm_page_unlock(pp);
+ /*
+ * Allocate an sf buf
+ */
+ sf = sf_buf_alloc(pp, SFB_CATCH);
+ if (sf == NULL) {
+ vm_page_lock(pp);
+ vm_page_cowclear(pp);
+ vm_page_unwire(pp, 0);
+ /*
+ * Check for the object going away on us. This can
+ * happen since we don't hold a reference to it.
+ * If so, we're responsible for freeing the page.
+ */
+ if (pp->wire_count == 0 && pp->object == NULL)
+ vm_page_free(pp);
+ vm_page_unlock(pp);
+ socow_stats.fail_sf_buf++;
+ return(0);
+ }
+ /*
+ * attach to mbuf
+ */
+ MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, socow_iodone,
+ (void*)sf_buf_kva(sf), sf, M_RDONLY, EXT_SFBUF);
+ m0->m_len = len;
+ m0->m_data = (caddr_t)sf_buf_kva(sf) + offset;
+ socow_stats.success++;
+
+ iov = uio->uio_iov;
+ iov->iov_base = (char *)iov->iov_base + m0->m_len;
+ iov->iov_len -= m0->m_len;
+ uio->uio_resid -= m0->m_len;
+ uio->uio_offset += m0->m_len;
+ if (iov->iov_len == 0) {
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ }
+
+ return(m0->m_len);
+}
diff --git a/sys/kern/uipc_debug.c b/sys/kern/uipc_debug.c
new file mode 100644
index 0000000..128c64b
--- /dev/null
+++ b/sys/kern/uipc_debug.c
@@ -0,0 +1,531 @@
+/*-
+ * Copyright (c) 2007 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Debugger routines relating to sockets, protocols, etc, for use in DDB.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/domain.h>
+#include <sys/kernel.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+
+static void
+db_print_sotype(short so_type)
+{
+
+ switch (so_type) {
+ case SOCK_STREAM:
+ db_printf("SOCK_STREAM");
+ break;
+
+ case SOCK_DGRAM:
+ db_printf("SOCK_DGRAM");
+ break;
+
+ case SOCK_RAW:
+ db_printf("SOCK_RAW");
+ break;
+
+ case SOCK_RDM:
+ db_printf("SOCK_RDM");
+ break;
+
+ case SOCK_SEQPACKET:
+ db_printf("SOCK_SEQPACKET");
+ break;
+
+ default:
+ db_printf("unknown");
+ break;
+ }
+}
+
+static void
+db_print_sooptions(short so_options)
+{
+ int comma;
+
+ comma = 0;
+ if (so_options & SO_DEBUG) {
+ db_printf("%sSO_DEBUG", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_ACCEPTCONN) {
+ db_printf("%sSO_ACCEPTCONN", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_REUSEADDR) {
+ db_printf("%sSO_REUSEADDR", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_KEEPALIVE) {
+ db_printf("%sSO_KEEPALIVE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_DONTROUTE) {
+ db_printf("%sSO_DONTROUTE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_BROADCAST) {
+ db_printf("%sSO_BROADCAST", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_USELOOPBACK) {
+ db_printf("%sSO_USELOOPBACK", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_LINGER) {
+ db_printf("%sSO_LINGER", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_OOBINLINE) {
+ db_printf("%sSO_OOBINLINE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_REUSEPORT) {
+ db_printf("%sSO_REUSEPORT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_TIMESTAMP) {
+ db_printf("%sSO_TIMESTAMP", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_NOSIGPIPE) {
+ db_printf("%sSO_NOSIGPIPE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_ACCEPTFILTER) {
+ db_printf("%sSO_ACCEPTFILTER", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_BINTIME) {
+ db_printf("%sSO_BINTIME", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_NO_OFFLOAD) {
+ db_printf("%sSO_NO_OFFLOAD", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_options & SO_NO_DDP) {
+ db_printf("%sSO_NO_DDP", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_sostate(short so_state)
+{
+ int comma;
+
+ comma = 0;
+ if (so_state & SS_NOFDREF) {
+ db_printf("%sSS_NOFDREF", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_ISCONNECTED) {
+ db_printf("%sSS_ISCONNECTED", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_ISCONNECTING) {
+ db_printf("%sSS_ISCONNECTING", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_ISDISCONNECTING) {
+ db_printf("%sSS_ISDISCONNECTING", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_NBIO) {
+ db_printf("%sSS_NBIO", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_ASYNC) {
+ db_printf("%sSS_ASYNC", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_ISCONFIRMING) {
+ db_printf("%sSS_ISCONFIRMING", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_state & SS_PROTOREF) {
+ db_printf("%sSS_PROTOREF", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_soqstate(int so_qstate)
+{
+ int comma;
+
+ comma = 0;
+ if (so_qstate & SQ_INCOMP) {
+ db_printf("%sSQ_INCOMP", comma ? ", " : "");
+ comma = 1;
+ }
+ if (so_qstate & SQ_COMP) {
+ db_printf("%sSQ_COMP", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_sbstate(short sb_state)
+{
+ int comma;
+
+ comma = 0;
+ if (sb_state & SBS_CANTSENDMORE) {
+ db_printf("%sSS_CANTSENDMORE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_state & SBS_CANTRCVMORE) {
+ db_printf("%sSS_CANTRCVMORE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_state & SBS_RCVATMARK) {
+ db_printf("%sSS_RCVATMARK", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_indent(int indent)
+{
+ int i;
+
+ for (i = 0; i < indent; i++)
+ db_printf(" ");
+}
+
+static void
+db_print_domain(struct domain *d, const char *domain_name, int indent)
+{
+
+ db_print_indent(indent);
+ db_printf("%s at %p\n", domain_name, d);
+
+ indent += 2;
+
+ db_print_indent(indent);
+ db_printf("dom_family: %d ", d->dom_family);
+ db_printf("dom_name: %s\n", d->dom_name);
+
+ db_print_indent(indent);
+ db_printf("dom_init: %p ", d->dom_init);
+ db_printf("dom_externalize: %p ", d->dom_externalize);
+ db_printf("dom_dispose: %p\n", d->dom_dispose);
+
+ db_print_indent(indent);
+ db_printf("dom_protosw: %p ", d->dom_protosw);
+ db_printf("dom_next: %p\n", d->dom_next);
+
+ db_print_indent(indent);
+ db_printf("dom_rtattach: %p ", d->dom_rtattach);
+ db_printf("dom_rtoffset: %d ", d->dom_rtoffset);
+ db_printf("dom_maxrtkey: %d\n", d->dom_maxrtkey);
+
+ db_print_indent(indent);
+ db_printf("dom_ifattach: %p ", d->dom_ifattach);
+ db_printf("dom_ifdetach: %p\n", d->dom_ifdetach);
+}
+
+static void
+db_print_prflags(short pr_flags)
+{
+ int comma;
+
+ comma = 0;
+ if (pr_flags & PR_ATOMIC) {
+ db_printf("%sPR_ATOMIC", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_ADDR) {
+ db_printf("%sPR_ADDR", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_CONNREQUIRED) {
+ db_printf("%sPR_CONNREQUIRED", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_WANTRCVD) {
+ db_printf("%sPR_WANTRCVD", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_RIGHTS) {
+ db_printf("%sPR_RIGHTS", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_IMPLOPCL) {
+ db_printf("%sPR_IMPLOPCL", comma ? ", " : "");
+ comma = 1;
+ }
+ if (pr_flags & PR_LASTHDR) {
+ db_printf("%sPR_LASTHDR", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_protosw(struct protosw *pr, const char *prname, int indent)
+{
+
+ db_print_indent(indent);
+ db_printf("%s at %p\n", prname, pr);
+
+ indent += 2;
+
+ db_print_indent(indent);
+ db_printf("pr_type: %d ", pr->pr_type);
+ db_printf("pr_domain: %p\n", pr->pr_domain);
+ if (pr->pr_domain != NULL)
+ db_print_domain(pr->pr_domain, "pr_domain", indent);
+
+ db_print_indent(indent);
+ db_printf("pr_protocol: %d\n", pr->pr_protocol);
+
+ db_print_indent(indent);
+ db_printf("pr_flags: %d (", pr->pr_flags);
+ db_print_prflags(pr->pr_flags);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("pr_input: %p ", pr->pr_input);
+ db_printf("pr_output: %p ", pr->pr_output);
+ db_printf("pr_ctlinput: %p\n", pr->pr_ctlinput);
+
+ db_print_indent(indent);
+ db_printf("pr_ctloutput: %p ", pr->pr_ctloutput);
+ db_printf("pr_init: %p\n", pr->pr_init);
+
+ db_print_indent(indent);
+ db_printf("pr_fasttimo: %p ", pr->pr_fasttimo);
+ db_printf("pr_slowtimo: %p ", pr->pr_slowtimo);
+ db_printf("pr_drain: %p\n", pr->pr_drain);
+
+ db_print_indent(indent);
+}
+
+static void
+db_print_sbflags(short sb_flags)
+{
+ int comma;
+
+ comma = 0;
+ if (sb_flags & SB_WAIT) {
+ db_printf("%sSB_WAIT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_SEL) {
+ db_printf("%sSB_SEL", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_ASYNC) {
+ db_printf("%sSB_ASYNC", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_UPCALL) {
+ db_printf("%sSB_UPCALL", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_NOINTR) {
+ db_printf("%sSB_NOINTR", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_AIO) {
+ db_printf("%sSB_AIO", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_KNOTE) {
+ db_printf("%sSB_KNOTE", comma ? ", " : "");
+ comma = 1;
+ }
+ if (sb_flags & SB_AUTOSIZE) {
+ db_printf("%sSB_AUTOSIZE", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_sockbuf(struct sockbuf *sb, const char *sockbufname, int indent)
+{
+
+ db_print_indent(indent);
+ db_printf("%s at %p\n", sockbufname, sb);
+
+ indent += 2;
+
+ db_print_indent(indent);
+ db_printf("sb_state: 0x%x (", sb->sb_state);
+ db_print_sbstate(sb->sb_state);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("sb_mb: %p ", sb->sb_mb);
+ db_printf("sb_mbtail: %p ", sb->sb_mbtail);
+ db_printf("sb_lastrecord: %p\n", sb->sb_lastrecord);
+
+ db_print_indent(indent);
+ db_printf("sb_sndptr: %p ", sb->sb_sndptr);
+ db_printf("sb_sndptroff: %u\n", sb->sb_sndptroff);
+
+ db_print_indent(indent);
+ db_printf("sb_cc: %u ", sb->sb_cc);
+ db_printf("sb_hiwat: %u ", sb->sb_hiwat);
+ db_printf("sb_mbcnt: %u ", sb->sb_mbcnt);
+ db_printf("sb_mbmax: %u\n", sb->sb_mbmax);
+
+ db_print_indent(indent);
+ db_printf("sb_ctl: %u ", sb->sb_ctl);
+ db_printf("sb_lowat: %d ", sb->sb_lowat);
+ db_printf("sb_timeo: %jd\n", sb->sb_timeo);
+
+ db_print_indent(indent);
+ db_printf("sb_flags: 0x%x (", sb->sb_flags);
+ db_print_sbflags(sb->sb_flags);
+ db_printf(")\n");
+}
+
+static void
+db_print_socket(struct socket *so, const char *socketname, int indent)
+{
+
+ db_print_indent(indent);
+ db_printf("%s at %p\n", socketname, so);
+
+ indent += 2;
+
+ db_print_indent(indent);
+ db_printf("so_count: %d ", so->so_count);
+ db_printf("so_type: %d (", so->so_type);
+ db_print_sotype(so->so_type);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("so_options: 0x%x (", so->so_options);
+ db_print_sooptions(so->so_options);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("so_linger: %d ", so->so_linger);
+ db_printf("so_state: 0x%x (", so->so_state);
+ db_print_sostate(so->so_state);
+ db_printf(")\n");
+
+ db_print_indent(indent);
+ db_printf("so_qstate: 0x%x (", so->so_qstate);
+ db_print_soqstate(so->so_qstate);
+ db_printf(") ");
+ db_printf("so_pcb: %p ", so->so_pcb);
+ db_printf("so_proto: %p\n", so->so_proto);
+
+ if (so->so_proto != NULL)
+ db_print_protosw(so->so_proto, "so_proto", indent);
+
+ db_print_indent(indent);
+ db_printf("so_head: %p ", so->so_head);
+ db_printf("so_incomp first: %p ", TAILQ_FIRST(&so->so_incomp));
+ db_printf("so_comp first: %p\n", TAILQ_FIRST(&so->so_comp));
+
+ db_print_indent(indent);
+ /* so_list skipped */
+ db_printf("so_qlen: %d ", so->so_qlen);
+ db_printf("so_incqlen: %d ", so->so_incqlen);
+ db_printf("so_qlimit: %d ", so->so_qlimit);
+ db_printf("so_timeo: %d ", so->so_timeo);
+ db_printf("so_error: %d\n", so->so_error);
+
+ db_print_indent(indent);
+ db_printf("so_sigio: %p ", so->so_sigio);
+ db_printf("so_oobmark: %lu ", so->so_oobmark);
+ db_printf("so_aiojobq first: %p\n", TAILQ_FIRST(&so->so_aiojobq));
+
+ db_print_sockbuf(&so->so_rcv, "so_rcv", indent);
+ db_print_sockbuf(&so->so_snd, "so_snd", indent);
+}
+
+DB_SHOW_COMMAND(socket, db_show_socket)
+{
+ struct socket *so;
+
+ if (!have_addr) {
+ db_printf("usage: show socket <addr>\n");
+ return;
+ }
+ so = (struct socket *)addr;
+
+ db_print_socket(so, "socket", 0);
+}
+
+DB_SHOW_COMMAND(sockbuf, db_show_sockbuf)
+{
+ struct sockbuf *sb;
+
+ if (!have_addr) {
+ db_printf("usage: show sockbuf <addr>\n");
+ return;
+ }
+ sb = (struct sockbuf *)addr;
+
+ db_print_sockbuf(sb, "sockbuf", 0);
+}
+
+DB_SHOW_COMMAND(protosw, db_show_protosw)
+{
+ struct protosw *pr;
+
+ if (!have_addr) {
+ db_printf("usage: show protosw <addr>\n");
+ return;
+ }
+ pr = (struct protosw *)addr;
+
+ db_print_protosw(pr, "protosw", 0);
+}
+
+DB_SHOW_COMMAND(domain, db_show_domain)
+{
+ struct domain *d;
+
+ if (!have_addr) {
+		db_printf("usage: show domain <addr>\n");
+ return;
+ }
+ d = (struct domain *)addr;
+
+ db_print_domain(d, "domain", 0);
+}
+#endif
diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c
new file mode 100644
index 0000000..709cc0e
--- /dev/null
+++ b/sys/kern/uipc_domain.c
@@ -0,0 +1,523 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/protosw.h>
+#include <sys/domain.h>
+#include <sys/eventhandler.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/socketvar.h>
+#include <sys/systm.h>
+
+#include <net/vnet.h>
+
+/*
+ * System initialization
+ *
+ * Note: domain initialization takes place on a per domain basis
+ * as a result of traversing a SYSINIT linker set. Most likely,
+ * each domain would want to call DOMAIN_SET(9) itself, which
+ * would cause the domain to be added just after domaininit()
+ * is called during startup.
+ *
+ * See DOMAIN_SET(9) for details on its use.
+ */
+
+static void domaininit(void *);
+SYSINIT(domain, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, domaininit, NULL);
+
+static void domainfinalize(void *);
+SYSINIT(domainfin, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, domainfinalize,
+ NULL);
+
+static struct callout pffast_callout;
+static struct callout pfslow_callout;
+
+static void pffasttimo(void *);
+static void pfslowtimo(void *);
+
+struct domain *domains; /* registered protocol domains */
+int domain_init_status = 0;
+static struct mtx dom_mtx; /* domain list lock */
+MTX_SYSINIT(domain, &dom_mtx, "domain list", MTX_DEF);
+
+/*
+ * Dummy protocol specific user requests function pointer array.
+ * All functions return EOPNOTSUPP.
+ */
+struct pr_usrreqs nousrreqs = {
+ .pru_accept = pru_accept_notsupp,
+ .pru_attach = pru_attach_notsupp,
+ .pru_bind = pru_bind_notsupp,
+ .pru_connect = pru_connect_notsupp,
+ .pru_connect2 = pru_connect2_notsupp,
+ .pru_control = pru_control_notsupp,
+ .pru_disconnect = pru_disconnect_notsupp,
+ .pru_listen = pru_listen_notsupp,
+ .pru_peeraddr = pru_peeraddr_notsupp,
+ .pru_rcvd = pru_rcvd_notsupp,
+ .pru_rcvoob = pru_rcvoob_notsupp,
+ .pru_send = pru_send_notsupp,
+ .pru_sense = pru_sense_null,
+ .pru_shutdown = pru_shutdown_notsupp,
+ .pru_sockaddr = pru_sockaddr_notsupp,
+ .pru_sosend = pru_sosend_notsupp,
+ .pru_soreceive = pru_soreceive_notsupp,
+ .pru_sopoll = pru_sopoll_notsupp,
+};
+
+static void
+protosw_init(struct protosw *pr)
+{
+ struct pr_usrreqs *pu;
+
+ pu = pr->pr_usrreqs;
+ KASSERT(pu != NULL, ("protosw_init: %ssw[%d] has no usrreqs!",
+ pr->pr_domain->dom_name,
+ (int)(pr - pr->pr_domain->dom_protosw)));
+
+ /*
+ * Protocol switch methods fall into three categories: mandatory,
+ * mandatory but protosw_init() provides a default, and optional.
+ *
+ * For true protocols (i.e., pru_attach != NULL), KASSERT truly
+ * mandatory methods with no defaults, and initialize defaults for
+ * other mandatory methods if the protocol hasn't defined an
+ * implementation (NULL function pointer).
+ */
+#if 0
+ if (pu->pru_attach != NULL) {
+ KASSERT(pu->pru_abort != NULL,
+ ("protosw_init: %ssw[%d] pru_abort NULL",
+ pr->pr_domain->dom_name,
+ (int)(pr - pr->pr_domain->dom_protosw)));
+ KASSERT(pu->pru_send != NULL,
+ ("protosw_init: %ssw[%d] pru_send NULL",
+ pr->pr_domain->dom_name,
+ (int)(pr - pr->pr_domain->dom_protosw)));
+ }
+#endif
+
+#define DEFAULT(foo, bar) if ((foo) == NULL) (foo) = (bar)
+ DEFAULT(pu->pru_accept, pru_accept_notsupp);
+ DEFAULT(pu->pru_bind, pru_bind_notsupp);
+ DEFAULT(pu->pru_bindat, pru_bindat_notsupp);
+ DEFAULT(pu->pru_connect, pru_connect_notsupp);
+ DEFAULT(pu->pru_connect2, pru_connect2_notsupp);
+ DEFAULT(pu->pru_connectat, pru_connectat_notsupp);
+ DEFAULT(pu->pru_control, pru_control_notsupp);
+ DEFAULT(pu->pru_disconnect, pru_disconnect_notsupp);
+ DEFAULT(pu->pru_listen, pru_listen_notsupp);
+ DEFAULT(pu->pru_peeraddr, pru_peeraddr_notsupp);
+ DEFAULT(pu->pru_rcvd, pru_rcvd_notsupp);
+ DEFAULT(pu->pru_rcvoob, pru_rcvoob_notsupp);
+ DEFAULT(pu->pru_sense, pru_sense_null);
+ DEFAULT(pu->pru_shutdown, pru_shutdown_notsupp);
+ DEFAULT(pu->pru_sockaddr, pru_sockaddr_notsupp);
+ DEFAULT(pu->pru_sosend, sosend_generic);
+ DEFAULT(pu->pru_soreceive, soreceive_generic);
+ DEFAULT(pu->pru_sopoll, sopoll_generic);
+#undef DEFAULT
+ if (pr->pr_init)
+ (*pr->pr_init)();
+}
+
+/*
+ * Add a new protocol domain to the list of supported domains.
+ * Note: you can't unload it again because a socket may be using it.
+ * XXX can't fail at this time.
+ */
+void
+domain_init(void *arg)
+{
+ struct domain *dp = arg;
+ struct protosw *pr;
+
+ if (dp->dom_init)
+ (*dp->dom_init)();
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ protosw_init(pr);
+ /*
+ * update global information about maximums
+ */
+ max_hdr = max_linkhdr + max_protohdr;
+ max_datalen = MHLEN - max_hdr;
+ if (max_datalen < 1)
+ panic("%s: max_datalen < 1", __func__);
+}
+
+#ifdef VIMAGE
+void
+vnet_domain_init(void *arg)
+{
+
+ /* Virtualized case is no different -- call init functions. */
+ domain_init(arg);
+}
+
+void
+vnet_domain_uninit(void *arg)
+{
+ struct domain *dp = arg;
+ struct protosw *pr;
+
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_destroy)
+ (*pr->pr_destroy)();
+ if (dp->dom_destroy)
+ (*dp->dom_destroy)();
+}
+#endif
+
+/*
+ * Add a new protocol domain to the list of supported domains.
+ * Note: you can't unload it again because a socket may be using it.
+ * XXX can't fail at this time.
+ */
+void
+domain_add(void *data)
+{
+ struct domain *dp;
+
+ dp = (struct domain *)data;
+ mtx_lock(&dom_mtx);
+ dp->dom_next = domains;
+ domains = dp;
+
+ KASSERT(domain_init_status >= 1,
+ ("attempt to domain_add(%s) before domaininit()",
+ dp->dom_name));
+#ifndef INVARIANTS
+ if (domain_init_status < 1)
+ printf("WARNING: attempt to domain_add(%s) before "
+ "domaininit()\n", dp->dom_name);
+#endif
+#ifdef notyet
+ KASSERT(domain_init_status < 2,
+ ("attempt to domain_add(%s) after domainfinalize()",
+ dp->dom_name));
+#else
+ if (domain_init_status >= 2)
+ printf("WARNING: attempt to domain_add(%s) after "
+ "domainfinalize()\n", dp->dom_name);
+#endif
+ mtx_unlock(&dom_mtx);
+}
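domain_add() is normally reached through the DOMAIN_SET(9) linker-set hook mentioned in the comment near the top of this file; below is a hedged sketch (not part of this change) of what a protocol family provides. "foo", AF_FOO, and the single protosw entry are hypothetical placeholders, and a real family supplies its own pr_usrreqs instead of nousrreqs.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/domain.h>

static struct domain foodomain;			/* defined below */

static struct protosw foosw[] = {
	{
		.pr_type =	SOCK_DGRAM,
		.pr_domain =	&foodomain,
		.pr_protocol =	1,		/* hypothetical */
		.pr_flags =	PR_ATOMIC|PR_ADDR,
		.pr_usrreqs =	&nousrreqs,	/* placeholder only */
	},
};

static struct domain foodomain = {
	.dom_family =	AF_FOO,			/* hypothetical family number */
	.dom_name =	"foo",
	.dom_protosw =	foosw,
	.dom_protoswNPROTOSW = &foosw[sizeof(foosw) / sizeof(foosw[0])],
};

DOMAIN_SET(foo);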
+
+/* ARGSUSED*/
+static void
+domaininit(void *dummy)
+{
+
+ if (max_linkhdr < 16) /* XXX */
+ max_linkhdr = 16;
+
+ callout_init(&pffast_callout, CALLOUT_MPSAFE);
+ callout_init(&pfslow_callout, CALLOUT_MPSAFE);
+
+ mtx_lock(&dom_mtx);
+ KASSERT(domain_init_status == 0, ("domaininit called too late!"));
+ domain_init_status = 1;
+ mtx_unlock(&dom_mtx);
+}
+
+/* ARGSUSED*/
+static void
+domainfinalize(void *dummy)
+{
+
+ mtx_lock(&dom_mtx);
+ KASSERT(domain_init_status == 1, ("domainfinalize called too late!"));
+ domain_init_status = 2;
+ mtx_unlock(&dom_mtx);
+
+ callout_reset(&pffast_callout, 1, pffasttimo, NULL);
+ callout_reset(&pfslow_callout, 1, pfslowtimo, NULL);
+}
+
+struct domain *
+pffinddomain(int family)
+{
+ struct domain *dp;
+
+ for (dp = domains; dp != NULL; dp = dp->dom_next)
+ if (dp->dom_family == family)
+ return (dp);
+ return (NULL);
+}
+
+struct protosw *
+pffindtype(int family, int type)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+ dp = pffinddomain(family);
+ if (dp == NULL)
+ return (NULL);
+
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_type && pr->pr_type == type)
+ return (pr);
+ return (NULL);
+}
+
+struct protosw *
+pffindproto(int family, int protocol, int type)
+{
+ struct domain *dp;
+ struct protosw *pr;
+ struct protosw *maybe;
+
+ maybe = NULL;
+ if (family == 0)
+ return (NULL);
+
+ dp = pffinddomain(family);
+ if (dp == NULL)
+ return (NULL);
+
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
+ if ((pr->pr_protocol == protocol) && (pr->pr_type == type))
+ return (pr);
+
+ if (type == SOCK_RAW && pr->pr_type == SOCK_RAW &&
+ pr->pr_protocol == 0 && maybe == NULL)
+ maybe = pr;
+ }
+ return (maybe);
+}
+
+/*
+ * The caller must make sure that the new protocol is fully set up and ready to
+ * accept requests before it is registered.
+ */
+int
+pf_proto_register(int family, struct protosw *npr)
+{
+ VNET_ITERATOR_DECL(vnet_iter);
+ struct domain *dp;
+ struct protosw *pr, *fpr;
+
+ /* Sanity checks. */
+ if (family == 0)
+ return (EPFNOSUPPORT);
+ if (npr->pr_type == 0)
+ return (EPROTOTYPE);
+ if (npr->pr_protocol == 0)
+ return (EPROTONOSUPPORT);
+ if (npr->pr_usrreqs == NULL)
+ return (ENXIO);
+
+ /* Try to find the specified domain based on the family. */
+ dp = pffinddomain(family);
+ if (dp == NULL)
+ return (EPFNOSUPPORT);
+
+ /* Initialize backpointer to struct domain. */
+ npr->pr_domain = dp;
+ fpr = NULL;
+
+ /*
+ * Protect us against races when two protocol registrations for
+ * the same protocol happen at the same time.
+ */
+ mtx_lock(&dom_mtx);
+
+ /* The new protocol must not yet exist. */
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
+ if ((pr->pr_type == npr->pr_type) &&
+ (pr->pr_protocol == npr->pr_protocol)) {
+ mtx_unlock(&dom_mtx);
+ return (EEXIST); /* XXX: Check only protocol? */
+ }
+ /* While here, remember the first free spacer. */
+ if ((fpr == NULL) && (pr->pr_protocol == PROTO_SPACER))
+ fpr = pr;
+ }
+
+ /* If no free spacer is found we can't add the new protocol. */
+ if (fpr == NULL) {
+ mtx_unlock(&dom_mtx);
+ return (ENOMEM);
+ }
+
+ /* Copy the new struct protosw over the spacer. */
+ bcopy(npr, fpr, sizeof(*fpr));
+
+ /* Job is done, no more protection required. */
+ mtx_unlock(&dom_mtx);
+
+ /* Initialize and activate the protocol. */
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET_QUIET(vnet_iter);
+ protosw_init(fpr);
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
+
+ return (0);
+}
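A hedged sketch (not part of this change) of the intended call pattern: a kernel module registers its protosw into a spacer slot of an existing family at MOD_LOAD and removes it again at MOD_UNLOAD. The names, the protocol number 253, and the externally provided bar_usrreqs are hypothetical placeholders; the module would then be hooked up with DECLARE_MODULE(9).

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/protosw.h>

extern struct pr_usrreqs bar_usrreqs;	/* hypothetical, provided by the protocol */

static struct protosw bar_protosw = {
	.pr_type =	SOCK_DGRAM,
	.pr_protocol =	253,		/* hypothetical, otherwise unused */
	.pr_flags =	PR_ATOMIC|PR_ADDR,
	.pr_usrreqs =	&bar_usrreqs,
};

static int
bar_modevent(module_t mod, int event, void *arg)
{

	switch (event) {
	case MOD_LOAD:
		return (pf_proto_register(PF_INET, &bar_protosw));
	case MOD_UNLOAD:
		return (pf_proto_unregister(PF_INET, 253, SOCK_DGRAM));
	default:
		return (EOPNOTSUPP);
	}
}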
+
+/*
+ * The caller must make sure the protocol and its functions correctly shut down
+ * all sockets and release all locks and memory references.
+ */
+int
+pf_proto_unregister(int family, int protocol, int type)
+{
+ struct domain *dp;
+ struct protosw *pr, *dpr;
+
+ /* Sanity checks. */
+ if (family == 0)
+ return (EPFNOSUPPORT);
+ if (protocol == 0)
+ return (EPROTONOSUPPORT);
+ if (type == 0)
+ return (EPROTOTYPE);
+
+ /* Try to find the specified domain based on the family type. */
+ dp = pffinddomain(family);
+ if (dp == NULL)
+ return (EPFNOSUPPORT);
+
+ dpr = NULL;
+
+ /* Lock out everyone else while we are manipulating the protosw. */
+ mtx_lock(&dom_mtx);
+
+	/* The protocol must exist, and exactly once. */
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
+ if ((pr->pr_type == type) && (pr->pr_protocol == protocol)) {
+ if (dpr != NULL) {
+ mtx_unlock(&dom_mtx);
+ return (EMLINK); /* Should not happen! */
+ } else
+ dpr = pr;
+ }
+ }
+
+ /* Protocol does not exist. */
+ if (dpr == NULL) {
+ mtx_unlock(&dom_mtx);
+ return (EPROTONOSUPPORT);
+ }
+
+ /* De-orbit the protocol and make the slot available again. */
+ dpr->pr_type = 0;
+ dpr->pr_domain = dp;
+ dpr->pr_protocol = PROTO_SPACER;
+ dpr->pr_flags = 0;
+ dpr->pr_input = NULL;
+ dpr->pr_output = NULL;
+ dpr->pr_ctlinput = NULL;
+ dpr->pr_ctloutput = NULL;
+ dpr->pr_init = NULL;
+ dpr->pr_fasttimo = NULL;
+ dpr->pr_slowtimo = NULL;
+ dpr->pr_drain = NULL;
+ dpr->pr_usrreqs = &nousrreqs;
+
+ /* Job is done, no more protection required. */
+ mtx_unlock(&dom_mtx);
+
+ return (0);
+}
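+
+/*
+ * Illustrative usage sketch (not part of this change): a loadable module
+ * implementing a hypothetical "foo" datagram protocol on top of PF_INET,
+ * supplying its own foo_input() and foo_usrreqs and an unused protocol
+ * number FOO_PROTO, might register and later unregister itself like this:
+ *
+ * static struct protosw foo_protosw = {
+ * .pr_type = SOCK_DGRAM,
+ * .pr_protocol = FOO_PROTO,
+ * .pr_flags = PR_ATOMIC | PR_ADDR,
+ * .pr_input = foo_input,
+ * .pr_usrreqs = &foo_usrreqs,
+ * };
+ *
+ * error = pf_proto_register(PF_INET, &foo_protosw);
+ * ...
+ * error = pf_proto_unregister(PF_INET, FOO_PROTO, SOCK_DGRAM);
+ *
+ * pr_domain need not be set; pf_proto_register() fills it in.
+ */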
+
+void
+pfctlinput(int cmd, struct sockaddr *sa)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+ for (dp = domains; dp; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_ctlinput)
+ (*pr->pr_ctlinput)(cmd, sa, (void *)0);
+}
+
+void
+pfctlinput2(int cmd, struct sockaddr *sa, void *ctlparam)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+ if (!sa)
+ return;
+ for (dp = domains; dp; dp = dp->dom_next) {
+ /*
+ * The check must be made by xx_ctlinput() anyway, to
+ * make sure the data item pointed to by ctlparam is used
+ * correctly. The following check is made just for safety.
+ */
+ if (dp->dom_family != sa->sa_family)
+ continue;
+
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_ctlinput)
+ (*pr->pr_ctlinput)(cmd, sa, ctlparam);
+ }
+}
+
+static void
+pfslowtimo(void *arg)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+ for (dp = domains; dp; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_slowtimo)
+ (*pr->pr_slowtimo)();
+ callout_reset(&pfslow_callout, hz/2, pfslowtimo, NULL);
+}
+
+static void
+pffasttimo(void *arg)
+{
+ struct domain *dp;
+ struct protosw *pr;
+
+ for (dp = domains; dp; dp = dp->dom_next)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ if (pr->pr_fasttimo)
+ (*pr->pr_fasttimo)();
+ callout_reset(&pffast_callout, hz/5, pffasttimo, NULL);
+}
diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c
new file mode 100644
index 0000000..8e278a4
--- /dev/null
+++ b/sys/kern/uipc_mbuf.c
@@ -0,0 +1,2182 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_param.h"
+#include "opt_mbuf_stress_test.h"
+#include "opt_mbuf_profiling.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/sysctl.h>
+#include <sys/domain.h>
+#include <sys/protosw.h>
+#include <sys/uio.h>
+
+int max_linkhdr;
+int max_protohdr;
+int max_hdr;
+int max_datalen;
+#ifdef MBUF_STRESS_TEST
+int m_defragpackets;
+int m_defragbytes;
+int m_defraguseless;
+int m_defragfailure;
+int m_defragrandomfailures;
+#endif
+
+/*
+ * sysctl(8) exported objects
+ */
+SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD,
+ &max_linkhdr, 0, "Size of largest link layer header");
+SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD,
+ &max_protohdr, 0, "Size of largest protocol layer header");
+SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD,
+ &max_hdr, 0, "Size of largest link plus protocol header");
+SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD,
+ &max_datalen, 0, "Minimum space left in mbuf after max_hdr");
+#ifdef MBUF_STRESS_TEST
+SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD,
+ &m_defragpackets, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD,
+ &m_defragbytes, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD,
+ &m_defraguseless, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD,
+ &m_defragfailure, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW,
+ &m_defragrandomfailures, 0, "");
+#endif
+
+/*
+ * Ensure the correct size of various mbuf parameters. It could be off due
+ * to compiler-induced padding and alignment artifacts.
+ */
+CTASSERT(sizeof(struct mbuf) == MSIZE);
+CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN);
+CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN);
+
+/*
+ * m_get2() allocates the minimum mbuf that will fit the "size" argument.
+ */
+struct mbuf *
+m_get2(int size, int how, short type, int flags)
+{
+ struct mb_args args;
+ struct mbuf *m, *n;
+
+ args.flags = flags;
+ args.type = type;
+
+ if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0))
+ return (uma_zalloc_arg(zone_mbuf, &args, how));
+ if (size <= MCLBYTES)
+ return (uma_zalloc_arg(zone_pack, &args, how));
+
+ if (size > MJUMPAGESIZE)
+ return (NULL);
+
+ m = uma_zalloc_arg(zone_mbuf, &args, how);
+ if (m == NULL)
+ return (NULL);
+
+ n = uma_zalloc_arg(zone_jumbop, m, how);
+ if (n == NULL) {
+ uma_zfree(zone_mbuf, m);
+ return (NULL);
+ }
+
+ return (m);
+}
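+
+/*
+ * Usage sketch (illustrative): a caller needing room for a full Ethernet
+ * frame can let m_get2() pick the backing storage:
+ *
+ * m = m_get2(ETHER_MAX_LEN, M_NOWAIT, MT_DATA, M_PKTHDR);
+ * if (m == NULL)
+ * return (ENOBUFS);
+ *
+ * Small sizes come from the plain mbuf zone, sizes up to MCLBYTES from
+ * the packet zone and up to MJUMPAGESIZE from the page-sized jumbo
+ * zone; anything larger fails with NULL.
+ */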
+
+/*
+ * m_getjcl() returns an mbuf with a cluster of the specified size attached.
+ * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
+ */
+struct mbuf *
+m_getjcl(int how, short type, int flags, int size)
+{
+ struct mb_args args;
+ struct mbuf *m, *n;
+ uma_zone_t zone;
+
+ if (size == MCLBYTES)
+ return m_getcl(how, type, flags);
+
+ args.flags = flags;
+ args.type = type;
+
+ m = uma_zalloc_arg(zone_mbuf, &args, how);
+ if (m == NULL)
+ return (NULL);
+
+ zone = m_getzone(size);
+ n = uma_zalloc_arg(zone, m, how);
+ if (n == NULL) {
+ uma_zfree(zone_mbuf, m);
+ return (NULL);
+ }
+ return (m);
+}
+
+/*
+ * Allocate a given length worth of mbufs and/or clusters (whatever fits
+ * best) and return a pointer to the top of the allocated chain. If an
+ * existing mbuf chain is provided, then we will append the new chain
+ * to the existing one but still return the top of the newly allocated
+ * chain.
+ */
+struct mbuf *
+m_getm2(struct mbuf *m, int len, int how, short type, int flags)
+{
+ struct mbuf *mb, *nm = NULL, *mtail = NULL;
+
+ KASSERT(len >= 0, ("%s: len is < 0", __func__));
+
+ /* Validate flags. */
+ flags &= (M_PKTHDR | M_EOR);
+
+ /* Packet header mbuf must be first in chain. */
+ if ((flags & M_PKTHDR) && m != NULL)
+ flags &= ~M_PKTHDR;
+
+ /* Loop and append maximum sized mbufs to the chain tail. */
+ while (len > 0) {
+ if (len > MCLBYTES)
+ mb = m_getjcl(how, type, (flags & M_PKTHDR),
+ MJUMPAGESIZE);
+ else if (len >= MINCLSIZE)
+ mb = m_getcl(how, type, (flags & M_PKTHDR));
+ else if (flags & M_PKTHDR)
+ mb = m_gethdr(how, type);
+ else
+ mb = m_get(how, type);
+
+ /* Fail the whole operation if one mbuf can't be allocated. */
+ if (mb == NULL) {
+ if (nm != NULL)
+ m_freem(nm);
+ return (NULL);
+ }
+
+ /* Book keeping. */
+ len -= (mb->m_flags & M_EXT) ? mb->m_ext.ext_size :
+ ((mb->m_flags & M_PKTHDR) ? MHLEN : MLEN);
+ if (mtail != NULL)
+ mtail->m_next = mb;
+ else
+ nm = mb;
+ mtail = mb;
+ flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */
+ }
+ if (flags & M_EOR)
+ mtail->m_flags |= M_EOR; /* Only valid on the last mbuf. */
+
+ /* If mbuf was supplied, append new chain to the end of it. */
+ if (m != NULL) {
+ for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
+ ;
+ mtail->m_next = nm;
+ mtail->m_flags &= ~M_EOR;
+ } else
+ m = nm;
+
+ return (m);
+}
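+
+/*
+ * Usage sketch (illustrative): allocate a chain able to hold "total"
+ * bytes and fill each mbuf up to its trailing space, which is the
+ * pattern m_uiotombuf() later in this file uses:
+ *
+ * m = m_getm2(NULL, total, M_WAITOK, MT_DATA, M_PKTHDR);
+ * for (mb = m; mb != NULL; mb = mb->m_next) {
+ * mb->m_len = min(M_TRAILINGSPACE(mb), total - progress);
+ * ... fill mtod(mb, void *) with mb->m_len bytes ...
+ * progress += mb->m_len;
+ * m->m_pkthdr.len += mb->m_len;
+ * }
+ *
+ * With M_WAITOK the allocation does not fail; with M_NOWAIT the return
+ * value must be checked for NULL.
+ */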
+
+/*
+ * Free an entire chain of mbufs and associated external buffers, if
+ * applicable.
+ */
+void
+m_freem(struct mbuf *mb)
+{
+
+ while (mb != NULL)
+ mb = m_free(mb);
+}
+
+/*-
+ * Configure a provided mbuf to refer to the provided external storage
+ * buffer and set up a reference count for said buffer. If the setting
+ * up of the reference count fails, the M_EXT bit will not be set. If
+ * successful, the M_EXT bit is set in the mbuf's flags.
+ *
+ * Arguments:
+ * mb The existing mbuf to which to attach the provided buffer.
+ * buf The address of the provided external storage buffer.
+ * size The size of the provided buffer.
+ * freef A pointer to a routine that is responsible for freeing the
+ * provided external storage buffer.
+ * arg1, arg2 Argument pointers (of any type) to be passed to the
+ * provided freef routine (may be NULL).
+ * flags Any other flags to be passed to the provided mbuf.
+ * type The type that the external storage buffer should be
+ * labeled with.
+ * wait M_WAITOK or M_NOWAIT for allocating the reference count.
+ *
+ * Returns:
+ * 0 on success, or ENOMEM if the reference count could not be
+ * allocated.
+ */
+int
+m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
+ int (*freef)(struct mbuf *, void *, void *), void *arg1, void *arg2,
+ int flags, int type, int wait)
+{
+ KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__));
+
+ if (type != EXT_EXTREF)
+ mb->m_ext.ref_cnt = uma_zalloc(zone_ext_refcnt, wait);
+
+ if (mb->m_ext.ref_cnt == NULL)
+ return (ENOMEM);
+
+ *(mb->m_ext.ref_cnt) = 1;
+ mb->m_flags |= (M_EXT | flags);
+ mb->m_ext.ext_buf = buf;
+ mb->m_data = mb->m_ext.ext_buf;
+ mb->m_ext.ext_size = size;
+ mb->m_ext.ext_free = freef;
+ mb->m_ext.ext_arg1 = arg1;
+ mb->m_ext.ext_arg2 = arg2;
+ mb->m_ext.ext_type = type;
+ mb->m_ext.ext_flags = 0;
+
+ return (0);
+}
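+
+/*
+ * Usage sketch (illustrative, hypothetical driver buffer): attach a
+ * driver-owned buffer "buf" of FOO_BUFSZ bytes to a fresh mbuf, with
+ * foo_ext_free() (matching the freef signature above) invoked when the
+ * last reference is dropped:
+ *
+ * m = m_gethdr(M_NOWAIT, MT_DATA);
+ * if (m == NULL)
+ * return (ENOBUFS);
+ * if (m_extadd(m, (caddr_t)buf, FOO_BUFSZ, foo_ext_free, buf, NULL,
+ * 0, EXT_NET_DRV, M_NOWAIT) != 0) {
+ * m_free(m);
+ * return (ENOBUFS);
+ * }
+ */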
+
+/*
+ * Non-directly-exported function to clean up after mbufs with M_EXT
+ * storage attached to them if the reference count hits 1.
+ */
+void
+mb_free_ext(struct mbuf *m)
+{
+ int skipmbuf;
+
+ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
+ KASSERT(m->m_ext.ref_cnt != NULL, ("%s: ref_cnt not set", __func__));
+
+ /*
+ * check if the header is embedded in the cluster
+ */
+ skipmbuf = (m->m_flags & M_NOFREE);
+
+ /* Free attached storage if this mbuf is the only reference to it. */
+ if (*(m->m_ext.ref_cnt) == 1 ||
+ atomic_fetchadd_int(m->m_ext.ref_cnt, -1) == 1) {
+ switch (m->m_ext.ext_type) {
+ case EXT_PACKET: /* The packet zone is special. */
+ if (*(m->m_ext.ref_cnt) == 0)
+ *(m->m_ext.ref_cnt) = 1;
+ uma_zfree(zone_pack, m);
+ return; /* Job done. */
+ case EXT_CLUSTER:
+ uma_zfree(zone_clust, m->m_ext.ext_buf);
+ break;
+ case EXT_JUMBOP:
+ uma_zfree(zone_jumbop, m->m_ext.ext_buf);
+ break;
+ case EXT_JUMBO9:
+ uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
+ break;
+ case EXT_JUMBO16:
+ uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
+ break;
+ case EXT_SFBUF:
+ case EXT_NET_DRV:
+ case EXT_MOD_TYPE:
+ case EXT_DISPOSABLE:
+ *(m->m_ext.ref_cnt) = 0;
+ uma_zfree(zone_ext_refcnt, __DEVOLATILE(u_int *,
+ m->m_ext.ref_cnt));
+ /* FALLTHROUGH */
+ case EXT_EXTREF:
+ KASSERT(m->m_ext.ext_free != NULL,
+ ("%s: ext_free not set", __func__));
+ (void)(*(m->m_ext.ext_free))(m, m->m_ext.ext_arg1,
+ m->m_ext.ext_arg2);
+ break;
+ default:
+ KASSERT(m->m_ext.ext_type == 0,
+ ("%s: unknown ext_type", __func__));
+ }
+ }
+ if (skipmbuf)
+ return;
+
+ /*
+ * Free this mbuf back to the mbuf zone with all m_ext
+ * information purged.
+ */
+ m->m_ext.ext_buf = NULL;
+ m->m_ext.ext_free = NULL;
+ m->m_ext.ext_arg1 = NULL;
+ m->m_ext.ext_arg2 = NULL;
+ m->m_ext.ref_cnt = NULL;
+ m->m_ext.ext_size = 0;
+ m->m_ext.ext_type = 0;
+ m->m_ext.ext_flags = 0;
+ m->m_flags &= ~M_EXT;
+ uma_zfree(zone_mbuf, m);
+}
+
+/*
+ * Attach the cluster from *m to *n, set up m_ext in *n
+ * and bump the refcount of the cluster.
+ */
+static void
+mb_dupcl(struct mbuf *n, struct mbuf *m)
+{
+ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
+ KASSERT(m->m_ext.ref_cnt != NULL, ("%s: ref_cnt not set", __func__));
+ KASSERT((n->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__));
+
+ if (*(m->m_ext.ref_cnt) == 1)
+ *(m->m_ext.ref_cnt) += 1;
+ else
+ atomic_add_int(m->m_ext.ref_cnt, 1);
+ n->m_ext.ext_buf = m->m_ext.ext_buf;
+ n->m_ext.ext_free = m->m_ext.ext_free;
+ n->m_ext.ext_arg1 = m->m_ext.ext_arg1;
+ n->m_ext.ext_arg2 = m->m_ext.ext_arg2;
+ n->m_ext.ext_size = m->m_ext.ext_size;
+ n->m_ext.ref_cnt = m->m_ext.ref_cnt;
+ n->m_ext.ext_type = m->m_ext.ext_type;
+ n->m_ext.ext_flags = m->m_ext.ext_flags;
+ n->m_flags |= M_EXT;
+ n->m_flags |= m->m_flags & M_RDONLY;
+}
+
+/*
+ * Clean up mbuf (chain) from any tags and packet headers.
+ * If "all" is set then the first mbuf in the chain will be
+ * cleaned too.
+ */
+void
+m_demote(struct mbuf *m0, int all)
+{
+ struct mbuf *m;
+
+ for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) {
+ if (m->m_flags & M_PKTHDR) {
+ m_tag_delete_chain(m, NULL);
+ m->m_flags &= ~M_PKTHDR;
+ bzero(&m->m_pkthdr, sizeof(struct pkthdr));
+ }
+ if (m != m0 && m->m_nextpkt != NULL) {
+ KASSERT(m->m_nextpkt == NULL,
+ ("%s: m_nextpkt not NULL", __func__));
+ m_freem(m->m_nextpkt);
+ m->m_nextpkt = NULL;
+ }
+ m->m_flags = m->m_flags & (M_EXT|M_RDONLY|M_NOFREE);
+ }
+}
+
+/*
+ * Sanity checks on mbuf (chain) for use in KASSERT() and general
+ * debugging.
+ * Returns 0 (or panics) when the chain is bad and 1 when all tests pass.
+ * The sanitize argument selects the failure action: 0 runs
+ * M_SANITY_ACTION, 1 scrubs the offending fields (garbling them so that
+ * incorrect use blows up later).
+ */
+int
+m_sanity(struct mbuf *m0, int sanitize)
+{
+ struct mbuf *m;
+ caddr_t a, b;
+ int pktlen = 0;
+
+#ifdef INVARIANTS
+#define M_SANITY_ACTION(s) panic("mbuf %p: " s, m)
+#else
+#define M_SANITY_ACTION(s) printf("mbuf %p: " s, m)
+#endif
+
+ for (m = m0; m != NULL; m = m->m_next) {
+ /*
+ * Basic pointer checks. If any of these fails then some
+ * unrelated kernel memory before or after us is trashed.
+ * No way to recover from that.
+ */
+ a = ((m->m_flags & M_EXT) ? m->m_ext.ext_buf :
+ ((m->m_flags & M_PKTHDR) ? (caddr_t)(&m->m_pktdat) :
+ (caddr_t)(&m->m_dat)) );
+ b = (caddr_t)(a + (m->m_flags & M_EXT ? m->m_ext.ext_size :
+ ((m->m_flags & M_PKTHDR) ? MHLEN : MLEN)));
+ if ((caddr_t)m->m_data < a)
+ M_SANITY_ACTION("m_data outside mbuf data range left");
+ if ((caddr_t)m->m_data > b)
+ M_SANITY_ACTION("m_data outside mbuf data range right");
+ if ((caddr_t)m->m_data + m->m_len > b)
+ M_SANITY_ACTION("m_data + m_len exeeds mbuf space");
+
+ /* m->m_nextpkt may only be set on first mbuf in chain. */
+ if (m != m0 && m->m_nextpkt != NULL) {
+ if (sanitize) {
+ m_freem(m->m_nextpkt);
+ m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
+ } else
+ M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
+ }
+
+ /* packet length (not mbuf length!) calculation */
+ if (m0->m_flags & M_PKTHDR)
+ pktlen += m->m_len;
+
+ /* m_tags may only be attached to first mbuf in chain. */
+ if (m != m0 && m->m_flags & M_PKTHDR &&
+ !SLIST_EMPTY(&m->m_pkthdr.tags)) {
+ if (sanitize) {
+ m_tag_delete_chain(m, NULL);
+ /* put in 0xDEADC0DE perhaps? */
+ } else
+ M_SANITY_ACTION("m_tags on in-chain mbuf");
+ }
+
+ /* M_PKTHDR may only be set on first mbuf in chain */
+ if (m != m0 && m->m_flags & M_PKTHDR) {
+ if (sanitize) {
+ bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
+ m->m_flags &= ~M_PKTHDR;
+ /* put in 0xDEADCODE and leave hdr flag in */
+ } else
+ M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
+ }
+ }
+ m = m0;
+ if (pktlen && pktlen != m->m_pkthdr.len) {
+ if (sanitize)
+ m->m_pkthdr.len = 0;
+ else
+ M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
+ }
+ return 1;
+
+#undef M_SANITY_ACTION
+}
+
+
+/*
+ * "Move" mbuf pkthdr from "from" to "to".
+ * "from" must have M_PKTHDR set, and "to" must be empty.
+ */
+void
+m_move_pkthdr(struct mbuf *to, struct mbuf *from)
+{
+
+#if 0
+ /* see below for why these are not enabled */
+ M_ASSERTPKTHDR(to);
+ /* Note: with MAC, this may not be a good assertion. */
+ KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags),
+ ("m_move_pkthdr: to has tags"));
+#endif
+#ifdef MAC
+ /*
+ * XXXMAC: It could be this should also occur for non-MAC?
+ */
+ if (to->m_flags & M_PKTHDR)
+ m_tag_delete_chain(to, NULL);
+#endif
+ to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
+ if ((to->m_flags & M_EXT) == 0)
+ to->m_data = to->m_pktdat;
+ to->m_pkthdr = from->m_pkthdr; /* especially tags */
+ SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */
+ from->m_flags &= ~M_PKTHDR;
+}
+
+/*
+ * Duplicate "from"'s mbuf pkthdr in "to".
+ * "from" must have M_PKTHDR set, and "to" must be empty.
+ * In particular, this does a deep copy of the packet tags.
+ */
+int
+m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
+{
+
+#if 0
+ /*
+ * The mbuf allocator only initializes the pkthdr
+ * when the mbuf is allocated with m_gethdr(). Many users
+ * (e.g. m_copy*, m_prepend) use m_get() and then
+ * smash the pkthdr as needed causing these
+ * assertions to trip. For now just disable them.
+ */
+ M_ASSERTPKTHDR(to);
+ /* Note: with MAC, this may not be a good assertion. */
+ KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags"));
+#endif
+ MBUF_CHECKSLEEP(how);
+#ifdef MAC
+ if (to->m_flags & M_PKTHDR)
+ m_tag_delete_chain(to, NULL);
+#endif
+ to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
+ if ((to->m_flags & M_EXT) == 0)
+ to->m_data = to->m_pktdat;
+ to->m_pkthdr = from->m_pkthdr;
+ SLIST_INIT(&to->m_pkthdr.tags);
+ return (m_tag_copy_chain(to, from, MBTOM(how)));
+}
+
+/*
+ * Lesser-used path for M_PREPEND:
+ * allocate new mbuf to prepend to chain,
+ * copy junk along.
+ */
+struct mbuf *
+m_prepend(struct mbuf *m, int len, int how)
+{
+ struct mbuf *mn;
+
+ if (m->m_flags & M_PKTHDR)
+ mn = m_gethdr(how, m->m_type);
+ else
+ mn = m_get(how, m->m_type);
+ if (mn == NULL) {
+ m_freem(m);
+ return (NULL);
+ }
+ if (m->m_flags & M_PKTHDR)
+ m_move_pkthdr(mn, m);
+ mn->m_next = m;
+ m = mn;
+ if(m->m_flags & M_PKTHDR) {
+ if (len < MHLEN)
+ MH_ALIGN(m, len);
+ } else {
+ if (len < MLEN)
+ M_ALIGN(m, len);
+ }
+ m->m_len = len;
+ return (m);
+}
+
+/*
+ * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
+ * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
+ * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller.
+ * Note that the copy is read-only, because clusters are not copied,
+ * only their reference counts are incremented.
+ */
+struct mbuf *
+m_copym(struct mbuf *m, int off0, int len, int wait)
+{
+ struct mbuf *n, **np;
+ int off = off0;
+ struct mbuf *top;
+ int copyhdr = 0;
+
+ KASSERT(off >= 0, ("m_copym, negative off %d", off));
+ KASSERT(len >= 0, ("m_copym, negative len %d", len));
+ MBUF_CHECKSLEEP(wait);
+ if (off == 0 && m->m_flags & M_PKTHDR)
+ copyhdr = 1;
+ while (off > 0) {
+ KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain"));
+ if (off < m->m_len)
+ break;
+ off -= m->m_len;
+ m = m->m_next;
+ }
+ np = &top;
+ top = 0;
+ while (len > 0) {
+ if (m == NULL) {
+ KASSERT(len == M_COPYALL,
+ ("m_copym, length > size of mbuf chain"));
+ break;
+ }
+ if (copyhdr)
+ n = m_gethdr(wait, m->m_type);
+ else
+ n = m_get(wait, m->m_type);
+ *np = n;
+ if (n == NULL)
+ goto nospace;
+ if (copyhdr) {
+ if (!m_dup_pkthdr(n, m, wait))
+ goto nospace;
+ if (len == M_COPYALL)
+ n->m_pkthdr.len -= off0;
+ else
+ n->m_pkthdr.len = len;
+ copyhdr = 0;
+ }
+ n->m_len = min(len, m->m_len - off);
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data + off;
+ mb_dupcl(n, m);
+ } else
+ bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
+ (u_int)n->m_len);
+ if (len != M_COPYALL)
+ len -= n->m_len;
+ off = 0;
+ m = m->m_next;
+ np = &n->m_next;
+ }
+
+ return (top);
+nospace:
+ m_freem(top);
+ return (NULL);
+}
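+
+/*
+ * Usage sketch (illustrative): take a reference-counted copy of a whole
+ * packet, e.g. to keep one copy queued while another is transmitted:
+ *
+ * n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
+ * if (n == NULL)
+ * ... drop or retry later ...
+ *
+ * Because clusters are shared rather than copied, the copy is read-only;
+ * check M_WRITABLE() (or use m_dup()) before modifying it.
+ */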
+
+/*
+ * Returns the mbuf chain with a new head for the prepending case.
+ * Copies len bytes starting at off from mbuf (chain) n to mbuf (chain) m,
+ * either prepending or appending the data.
+ * The resulting mbuf (chain) m is fully writable.
+ * m is the destination (made writable), n is the source,
+ * off is the offset into the source, len is the length from that offset,
+ * prep selects the direction (0 to append, 1 to prepend), and
+ * how is M_WAITOK or M_NOWAIT.
+ */
+
+static int
+m_bcopyxxx(void *s, void *t, u_int len)
+{
+ bcopy(s, t, (size_t)len);
+ return 0;
+}
+
+struct mbuf *
+m_copymdata(struct mbuf *m, struct mbuf *n, int off, int len,
+ int prep, int how)
+{
+ struct mbuf *mm, *x, *z, *prev = NULL;
+ caddr_t p;
+ int i, nlen = 0;
+ caddr_t buf[MLEN];
+
+ KASSERT(m != NULL && n != NULL, ("m_copymdata, no target or source"));
+ KASSERT(off >= 0, ("m_copymdata, negative off %d", off));
+ KASSERT(len >= 0, ("m_copymdata, negative len %d", len));
+ KASSERT(prep == 0 || prep == 1, ("m_copymdata, unknown direction %d", prep));
+
+ mm = m;
+ if (!prep) {
+ while(mm->m_next) {
+ prev = mm;
+ mm = mm->m_next;
+ }
+ }
+ for (z = n; z != NULL; z = z->m_next)
+ nlen += z->m_len;
+ if (len == M_COPYALL)
+ len = nlen - off;
+ if (off + len > nlen || len < 1)
+ return NULL;
+
+ if (!M_WRITABLE(mm)) {
+ /* XXX: Use proper m_xxx function instead. */
+ x = m_getcl(how, MT_DATA, mm->m_flags);
+ if (x == NULL)
+ return NULL;
+ bcopy(mm->m_ext.ext_buf, x->m_ext.ext_buf, x->m_ext.ext_size);
+ p = x->m_ext.ext_buf + (mm->m_data - mm->m_ext.ext_buf);
+ x->m_data = p;
+ mm->m_next = NULL;
+ if (mm != m)
+ prev->m_next = x;
+ m_free(mm);
+ mm = x;
+ }
+
+ /*
+ * Append/prepend the data, allocating mbufs as necessary.
+ */
+ /* Shortcut if enough free space in first/last mbuf. */
+ if (!prep && M_TRAILINGSPACE(mm) >= len) {
+ m_apply(n, off, len, m_bcopyxxx, mtod(mm, caddr_t) +
+ mm->m_len);
+ mm->m_len += len;
+ mm->m_pkthdr.len += len;
+ return m;
+ }
+ if (prep && M_LEADINGSPACE(mm) >= len) {
+ mm->m_data = mtod(mm, caddr_t) - len;
+ m_apply(n, off, len, m_bcopyxxx, mtod(mm, caddr_t));
+ mm->m_len += len;
+ mm->m_pkthdr.len += len;
+ return mm;
+ }
+
+ /* Expand first/last mbuf to cluster if possible. */
+ if (!prep && !(mm->m_flags & M_EXT) && len > M_TRAILINGSPACE(mm)) {
+ bcopy(mm->m_data, &buf, mm->m_len);
+ m_clget(mm, how);
+ if (!(mm->m_flags & M_EXT))
+ return NULL;
+ bcopy(&buf, mm->m_ext.ext_buf, mm->m_len);
+ mm->m_data = mm->m_ext.ext_buf;
+ }
+ if (prep && !(mm->m_flags & M_EXT) && len > M_LEADINGSPACE(mm)) {
+ bcopy(mm->m_data, &buf, mm->m_len);
+ m_clget(mm, how);
+ if (!(mm->m_flags & M_EXT))
+ return NULL;
+ bcopy(&buf, (caddr_t *)mm->m_ext.ext_buf +
+ mm->m_ext.ext_size - mm->m_len, mm->m_len);
+ mm->m_data = (caddr_t)mm->m_ext.ext_buf +
+ mm->m_ext.ext_size - mm->m_len;
+ }
+
+ /* Append/prepend as many mbuf (clusters) as necessary to fit len. */
+ if (!prep && len > M_TRAILINGSPACE(mm)) {
+ if (!m_getm(mm, len - M_TRAILINGSPACE(mm), how, MT_DATA))
+ return NULL;
+ }
+ if (prep && len > M_LEADINGSPACE(mm)) {
+ if (!(z = m_getm(NULL, len - M_LEADINGSPACE(mm), how, MT_DATA)))
+ return NULL;
+ i = 0;
+ for (x = z; x != NULL; x = x->m_next) {
+ i += x->m_flags & M_EXT ? x->m_ext.ext_size :
+ (x->m_flags & M_PKTHDR ? MHLEN : MLEN);
+ if (!x->m_next)
+ break;
+ }
+ z->m_data += i - len;
+ m_move_pkthdr(mm, z);
+ x->m_next = mm;
+ mm = z;
+ }
+
+ /* Seek to start position in source mbuf. Optimization for long chains. */
+ while (off > 0) {
+ if (off < n->m_len)
+ break;
+ off -= n->m_len;
+ n = n->m_next;
+ }
+
+ /* Copy data into target mbuf. */
+ z = mm;
+ while (len > 0) {
+ KASSERT(z != NULL, ("m_copymdata, falling off target edge"));
+ i = M_TRAILINGSPACE(z);
+ m_apply(n, off, i, m_bcopyxxx, mtod(z, caddr_t) + z->m_len);
+ z->m_len += i;
+ /* fixup pkthdr.len if necessary */
+ if ((prep ? mm : m)->m_flags & M_PKTHDR)
+ (prep ? mm : m)->m_pkthdr.len += i;
+ off += i;
+ len -= i;
+ z = z->m_next;
+ }
+ return (prep ? mm : m);
+}
+
+/*
+ * Copy an entire packet, including header (which must be present).
+ * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
+ * Note that the copy is read-only, because clusters are not copied,
+ * only their reference counts are incremented.
+ * Preserve alignment of the first mbuf so if the creator has left
+ * some room at the beginning (e.g. for inserting protocol headers)
+ * the copies still have the room available.
+ */
+struct mbuf *
+m_copypacket(struct mbuf *m, int how)
+{
+ struct mbuf *top, *n, *o;
+
+ MBUF_CHECKSLEEP(how);
+ n = m_get(how, m->m_type);
+ top = n;
+ if (n == NULL)
+ goto nospace;
+
+ if (!m_dup_pkthdr(n, m, how))
+ goto nospace;
+ n->m_len = m->m_len;
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data;
+ mb_dupcl(n, m);
+ } else {
+ n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
+ bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
+ }
+
+ m = m->m_next;
+ while (m) {
+ o = m_get(how, m->m_type);
+ if (o == NULL)
+ goto nospace;
+
+ n->m_next = o;
+ n = n->m_next;
+
+ n->m_len = m->m_len;
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data;
+ mb_dupcl(n, m);
+ } else {
+ bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
+ }
+
+ m = m->m_next;
+ }
+ return top;
+nospace:
+ m_freem(top);
+ return (NULL);
+}
+
+/*
+ * Copy data from an mbuf chain starting "off" bytes from the beginning,
+ * continuing for "len" bytes, into the indicated buffer.
+ */
+void
+m_copydata(const struct mbuf *m, int off, int len, caddr_t cp)
+{
+ u_int count;
+
+ KASSERT(off >= 0, ("m_copydata, negative off %d", off));
+ KASSERT(len >= 0, ("m_copydata, negative len %d", len));
+ while (off > 0) {
+ KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain"));
+ if (off < m->m_len)
+ break;
+ off -= m->m_len;
+ m = m->m_next;
+ }
+ while (len > 0) {
+ KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
+ count = min(m->m_len - off, len);
+ bcopy(mtod(m, caddr_t) + off, cp, count);
+ len -= count;
+ cp += count;
+ off = 0;
+ m = m->m_next;
+ }
+}
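+
+/*
+ * Usage sketch (illustrative): copy a fixed-size header that may span
+ * several mbufs into a local variable without touching the chain,
+ * assuming a hypothetical struct foo_hdr:
+ *
+ * struct foo_hdr fh;
+ *
+ * if (m->m_pkthdr.len < sizeof(fh))
+ * ... too short, drop ...
+ * m_copydata(m, 0, sizeof(fh), (caddr_t)&fh);
+ */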
+
+/*
+ * Copy a packet header mbuf chain into a completely new chain, including
+ * copying any mbuf clusters. Use this instead of m_copypacket() when
+ * you need a writable copy of an mbuf chain.
+ */
+struct mbuf *
+m_dup(struct mbuf *m, int how)
+{
+ struct mbuf **p, *top = NULL;
+ int remain, moff, nsize;
+
+ MBUF_CHECKSLEEP(how);
+ /* Sanity check */
+ if (m == NULL)
+ return (NULL);
+ M_ASSERTPKTHDR(m);
+
+ /* While there's more data, get a new mbuf, tack it on, and fill it */
+ remain = m->m_pkthdr.len;
+ moff = 0;
+ p = &top;
+ while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */
+ struct mbuf *n;
+
+ /* Get the next new mbuf */
+ if (remain >= MINCLSIZE) {
+ n = m_getcl(how, m->m_type, 0);
+ nsize = MCLBYTES;
+ } else {
+ n = m_get(how, m->m_type);
+ nsize = MLEN;
+ }
+ if (n == NULL)
+ goto nospace;
+
+ if (top == NULL) { /* First one, must be PKTHDR */
+ if (!m_dup_pkthdr(n, m, how)) {
+ m_free(n);
+ goto nospace;
+ }
+ if ((n->m_flags & M_EXT) == 0)
+ nsize = MHLEN;
+ }
+ n->m_len = 0;
+
+ /* Link it into the new chain */
+ *p = n;
+ p = &n->m_next;
+
+ /* Copy data from original mbuf(s) into new mbuf */
+ while (n->m_len < nsize && m != NULL) {
+ int chunk = min(nsize - n->m_len, m->m_len - moff);
+
+ bcopy(m->m_data + moff, n->m_data + n->m_len, chunk);
+ moff += chunk;
+ n->m_len += chunk;
+ remain -= chunk;
+ if (moff == m->m_len) {
+ m = m->m_next;
+ moff = 0;
+ }
+ }
+
+ /* Check correct total mbuf length */
+ KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL),
+ ("%s: bogus m_pkthdr.len", __func__));
+ }
+ return (top);
+
+nospace:
+ m_freem(top);
+ return (NULL);
+}
+
+/*
+ * Concatenate mbuf chain n to m.
+ * Both chains must be of the same type (e.g. MT_DATA).
+ * The m_pkthdr, if present, is not updated.
+ */
+void
+m_cat(struct mbuf *m, struct mbuf *n)
+{
+ while (m->m_next)
+ m = m->m_next;
+ while (n) {
+ if (!M_WRITABLE(m) ||
+ M_TRAILINGSPACE(m) < n->m_len) {
+ /* just join the two chains */
+ m->m_next = n;
+ return;
+ }
+ /* splat the data from one into the other */
+ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
+ (u_int)n->m_len);
+ m->m_len += n->m_len;
+ n = m_free(n);
+ }
+}
+
+void
+m_adj(struct mbuf *mp, int req_len)
+{
+ int len = req_len;
+ struct mbuf *m;
+ int count;
+
+ if ((m = mp) == NULL)
+ return;
+ if (len >= 0) {
+ /*
+ * Trim from head.
+ */
+ while (m != NULL && len > 0) {
+ if (m->m_len <= len) {
+ len -= m->m_len;
+ m->m_len = 0;
+ m = m->m_next;
+ } else {
+ m->m_len -= len;
+ m->m_data += len;
+ len = 0;
+ }
+ }
+ if (mp->m_flags & M_PKTHDR)
+ mp->m_pkthdr.len -= (req_len - len);
+ } else {
+ /*
+ * Trim from tail. Scan the mbuf chain,
+ * calculating its length and finding the last mbuf.
+ * If the adjustment only affects this mbuf, then just
+ * adjust and return. Otherwise, rescan and truncate
+ * after the remaining size.
+ */
+ len = -len;
+ count = 0;
+ for (;;) {
+ count += m->m_len;
+ if (m->m_next == (struct mbuf *)0)
+ break;
+ m = m->m_next;
+ }
+ if (m->m_len >= len) {
+ m->m_len -= len;
+ if (mp->m_flags & M_PKTHDR)
+ mp->m_pkthdr.len -= len;
+ return;
+ }
+ count -= len;
+ if (count < 0)
+ count = 0;
+ /*
+ * Correct length for chain is "count".
+ * Find the mbuf with last data, adjust its length,
+ * and toss data from remaining mbufs on chain.
+ */
+ m = mp;
+ if (m->m_flags & M_PKTHDR)
+ m->m_pkthdr.len = count;
+ for (; m; m = m->m_next) {
+ if (m->m_len >= count) {
+ m->m_len = count;
+ if (m->m_next != NULL) {
+ m_freem(m->m_next);
+ m->m_next = NULL;
+ }
+ break;
+ }
+ count -= m->m_len;
+ }
+ }
+}
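+
+/*
+ * Usage sketch (illustrative): m_adj() trims req_len bytes from the head
+ * of the chain when positive and from the tail when negative, e.g.
+ * stripping an Ethernet header and a trailing CRC:
+ *
+ * m_adj(m, ETHER_HDR_LEN); (14 bytes off the front)
+ * m_adj(m, -ETHER_CRC_LEN); (4 bytes off the end)
+ */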
+
+/*
+ * Rearrange an mbuf chain so that len bytes are contiguous
+ * and in the data area of an mbuf (so that mtod will work
+ * for a structure of size len). Returns the resulting
+ * mbuf chain on success, frees it and returns NULL on failure.
+ * If there is room, it will add up to max_protohdr-len extra bytes to the
+ * contiguous region in an attempt to avoid being called next time.
+ */
+struct mbuf *
+m_pullup(struct mbuf *n, int len)
+{
+ struct mbuf *m;
+ int count;
+ int space;
+
+ /*
+ * If first mbuf has no cluster, and has room for len bytes
+ * without shifting current data, pullup into it,
+ * otherwise allocate a new mbuf to prepend to the chain.
+ */
+ if ((n->m_flags & M_EXT) == 0 &&
+ n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
+ if (n->m_len >= len)
+ return (n);
+ m = n;
+ n = n->m_next;
+ len -= m->m_len;
+ } else {
+ if (len > MHLEN)
+ goto bad;
+ m = m_get(M_NOWAIT, n->m_type);
+ if (m == NULL)
+ goto bad;
+ if (n->m_flags & M_PKTHDR)
+ m_move_pkthdr(m, n);
+ }
+ space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
+ do {
+ count = min(min(max(len, max_protohdr), space), n->m_len);
+ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len,
+ (u_int)count);
+ len -= count;
+ m->m_len += count;
+ n->m_len -= count;
+ space -= count;
+ if (n->m_len)
+ n->m_data += count;
+ else
+ n = m_free(n);
+ } while (len > 0 && n);
+ if (len > 0) {
+ (void) m_free(m);
+ goto bad;
+ }
+ m->m_next = n;
+ return (m);
+bad:
+ m_freem(n);
+ return (NULL);
+}
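+
+/*
+ * Usage sketch (illustrative): the classic protocol-input pattern, making
+ * sure the first mbuf holds a contiguous struct ip before using mtod():
+ *
+ * if (m->m_len < sizeof(struct ip) &&
+ * (m = m_pullup(m, sizeof(struct ip))) == NULL)
+ * return; (m_pullup already freed the chain)
+ * ip = mtod(m, struct ip *);
+ */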
+
+/*
+ * Like m_pullup(), except a new mbuf is always allocated, and we allow
+ * the amount of empty space before the data in the new mbuf to be specified
+ * (in the event that the caller expects to prepend later).
+ */
+int MSFail;
+
+struct mbuf *
+m_copyup(struct mbuf *n, int len, int dstoff)
+{
+ struct mbuf *m;
+ int count, space;
+
+ if (len > (MHLEN - dstoff))
+ goto bad;
+ m = m_get(M_NOWAIT, n->m_type);
+ if (m == NULL)
+ goto bad;
+ if (n->m_flags & M_PKTHDR)
+ m_move_pkthdr(m, n);
+ m->m_data += dstoff;
+ space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
+ do {
+ count = min(min(max(len, max_protohdr), space), n->m_len);
+ memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
+ (unsigned)count);
+ len -= count;
+ m->m_len += count;
+ n->m_len -= count;
+ space -= count;
+ if (n->m_len)
+ n->m_data += count;
+ else
+ n = m_free(n);
+ } while (len > 0 && n);
+ if (len > 0) {
+ (void) m_free(m);
+ goto bad;
+ }
+ m->m_next = n;
+ return (m);
+ bad:
+ m_freem(n);
+ MSFail++;
+ return (NULL);
+}
+
+/*
+ * Partition an mbuf chain in two pieces, returning the tail --
+ * all but the first len0 bytes. In case of failure, it returns NULL and
+ * attempts to restore the chain to its original state.
+ *
+ * Note that the resulting mbufs might be read-only, because the new
+ * mbuf can end up sharing an mbuf cluster with the original mbuf if
+ * the "breaking point" happens to lie within a cluster mbuf. Use the
+ * M_WRITABLE() macro to check for this case.
+ */
+struct mbuf *
+m_split(struct mbuf *m0, int len0, int wait)
+{
+ struct mbuf *m, *n;
+ u_int len = len0, remain;
+
+ MBUF_CHECKSLEEP(wait);
+ for (m = m0; m && len > m->m_len; m = m->m_next)
+ len -= m->m_len;
+ if (m == NULL)
+ return (NULL);
+ remain = m->m_len - len;
+ if (m0->m_flags & M_PKTHDR && remain == 0) {
+ n = m_gethdr(wait, m0->m_type);
+ if (n == NULL)
+ return (NULL);
+ n->m_next = m->m_next;
+ m->m_next = NULL;
+ n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
+ n->m_pkthdr.len = m0->m_pkthdr.len - len0;
+ m0->m_pkthdr.len = len0;
+ return (n);
+ } else if (m0->m_flags & M_PKTHDR) {
+ n = m_gethdr(wait, m0->m_type);
+ if (n == NULL)
+ return (NULL);
+ n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
+ n->m_pkthdr.len = m0->m_pkthdr.len - len0;
+ m0->m_pkthdr.len = len0;
+ if (m->m_flags & M_EXT)
+ goto extpacket;
+ if (remain > MHLEN) {
+ /* m can't be the lead packet */
+ MH_ALIGN(n, 0);
+ n->m_next = m_split(m, len, wait);
+ if (n->m_next == NULL) {
+ (void) m_free(n);
+ return (NULL);
+ } else {
+ n->m_len = 0;
+ return (n);
+ }
+ } else
+ MH_ALIGN(n, remain);
+ } else if (remain == 0) {
+ n = m->m_next;
+ m->m_next = NULL;
+ return (n);
+ } else {
+ n = m_get(wait, m->m_type);
+ if (n == NULL)
+ return (NULL);
+ M_ALIGN(n, remain);
+ }
+extpacket:
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data + len;
+ mb_dupcl(n, m);
+ } else {
+ bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
+ }
+ n->m_len = remain;
+ m->m_len = len;
+ n->m_next = m->m_next;
+ m->m_next = NULL;
+ return (n);
+}
+/*
+ * Routine to copy from device local memory into mbufs.
+ * Note that the `off' argument is the offset into the first mbuf of the
+ * target chain at which the copied data is placed.
+ */
+struct mbuf *
+m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
+ void (*copy)(char *from, caddr_t to, u_int len))
+{
+ struct mbuf *m;
+ struct mbuf *top = NULL, **mp = &top;
+ int len;
+
+ if (off < 0 || off > MHLEN)
+ return (NULL);
+
+ while (totlen > 0) {
+ if (top == NULL) { /* First one, must be PKTHDR */
+ if (totlen + off >= MINCLSIZE) {
+ m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
+ len = MCLBYTES;
+ } else {
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+ len = MHLEN;
+
+ /* Place initial small packet/header at end of mbuf */
+ if (m && totlen + off + max_linkhdr <= MLEN) {
+ m->m_data += max_linkhdr;
+ len -= max_linkhdr;
+ }
+ }
+ if (m == NULL)
+ return NULL;
+ m->m_pkthdr.rcvif = ifp;
+ m->m_pkthdr.len = totlen;
+ } else {
+ if (totlen + off >= MINCLSIZE) {
+ m = m_getcl(M_NOWAIT, MT_DATA, 0);
+ len = MCLBYTES;
+ } else {
+ m = m_get(M_NOWAIT, MT_DATA);
+ len = MLEN;
+ }
+ if (m == NULL) {
+ m_freem(top);
+ return NULL;
+ }
+ }
+ if (off) {
+ m->m_data += off;
+ len -= off;
+ off = 0;
+ }
+ m->m_len = len = min(totlen, len);
+ if (copy)
+ copy(buf, mtod(m, caddr_t), (u_int)len);
+ else
+ bcopy(buf, mtod(m, caddr_t), (u_int)len);
+ buf += len;
+ *mp = m;
+ mp = &m->m_next;
+ totlen -= len;
+ }
+ return (top);
+}
+
+/*
+ * Copy data from a buffer back into the indicated mbuf chain,
+ * starting "off" bytes from the beginning, extending the mbuf
+ * chain if necessary.
+ */
+void
+m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp)
+{
+ int mlen;
+ struct mbuf *m = m0, *n;
+ int totlen = 0;
+
+ if (m0 == NULL)
+ return;
+ while (off > (mlen = m->m_len)) {
+ off -= mlen;
+ totlen += mlen;
+ if (m->m_next == NULL) {
+ n = m_get(M_NOWAIT, m->m_type);
+ if (n == NULL)
+ goto out;
+ bzero(mtod(n, caddr_t), MLEN);
+ n->m_len = min(MLEN, len + off);
+ m->m_next = n;
+ }
+ m = m->m_next;
+ }
+ while (len > 0) {
+ if (m->m_next == NULL && (len > m->m_len - off)) {
+ m->m_len += min(len - (m->m_len - off),
+ M_TRAILINGSPACE(m));
+ }
+ mlen = min (m->m_len - off, len);
+ bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen);
+ cp += mlen;
+ len -= mlen;
+ mlen += off;
+ off = 0;
+ totlen += mlen;
+ if (len == 0)
+ break;
+ if (m->m_next == NULL) {
+ n = m_get(M_NOWAIT, m->m_type);
+ if (n == NULL)
+ break;
+ n->m_len = min(MLEN, len);
+ m->m_next = n;
+ }
+ m = m->m_next;
+ }
+out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
+ m->m_pkthdr.len = totlen;
+}
+
+/*
+ * Append the specified data to the indicated mbuf chain,
+ * extending the mbuf chain if the new data does not fit in
+ * existing space.
+ *
+ * Return 1 if able to complete the job; otherwise 0.
+ */
+int
+m_append(struct mbuf *m0, int len, c_caddr_t cp)
+{
+ struct mbuf *m, *n;
+ int remainder, space;
+
+ for (m = m0; m->m_next != NULL; m = m->m_next)
+ ;
+ remainder = len;
+ space = M_TRAILINGSPACE(m);
+ if (space > 0) {
+ /*
+ * Copy into available space.
+ */
+ if (space > remainder)
+ space = remainder;
+ bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
+ m->m_len += space;
+ cp += space, remainder -= space;
+ }
+ while (remainder > 0) {
+ /*
+ * Allocate a new mbuf; could check space
+ * and allocate a cluster instead.
+ */
+ n = m_get(M_NOWAIT, m->m_type);
+ if (n == NULL)
+ break;
+ n->m_len = min(MLEN, remainder);
+ bcopy(cp, mtod(n, caddr_t), n->m_len);
+ cp += n->m_len, remainder -= n->m_len;
+ m->m_next = n;
+ m = n;
+ }
+ if (m0->m_flags & M_PKTHDR)
+ m0->m_pkthdr.len += len - remainder;
+ return (remainder == 0);
+}
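+
+/*
+ * Usage sketch (illustrative): append a small trailer structure
+ * (hypothetical) to an existing packet, letting m_append() grow the
+ * chain if the last mbuf has no room left:
+ *
+ * if (m_append(m, sizeof(trailer), (c_caddr_t)&trailer) == 0)
+ * ... allocation failed, drop the packet ...
+ */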
+
+/*
+ * Apply function f to the data in an mbuf chain starting "off" bytes from
+ * the beginning, continuing for "len" bytes.
+ */
+int
+m_apply(struct mbuf *m, int off, int len,
+ int (*f)(void *, void *, u_int), void *arg)
+{
+ u_int count;
+ int rval;
+
+ KASSERT(off >= 0, ("m_apply, negative off %d", off));
+ KASSERT(len >= 0, ("m_apply, negative len %d", len));
+ while (off > 0) {
+ KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
+ if (off < m->m_len)
+ break;
+ off -= m->m_len;
+ m = m->m_next;
+ }
+ while (len > 0) {
+ KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain"));
+ count = min(m->m_len - off, len);
+ rval = (*f)(arg, mtod(m, caddr_t) + off, count);
+ if (rval)
+ return (rval);
+ len -= count;
+ off = 0;
+ m = m->m_next;
+ }
+ return (0);
+}
+
+/*
+ * Return a pointer to mbuf/offset of location in mbuf chain.
+ */
+struct mbuf *
+m_getptr(struct mbuf *m, int loc, int *off)
+{
+
+ while (loc >= 0) {
+ /* Normal end of search. */
+ if (m->m_len > loc) {
+ *off = loc;
+ return (m);
+ } else {
+ loc -= m->m_len;
+ if (m->m_next == NULL) {
+ if (loc == 0) {
+ /* Point at the end of valid data. */
+ *off = m->m_len;
+ return (m);
+ }
+ return (NULL);
+ }
+ m = m->m_next;
+ }
+ }
+ return (NULL);
+}
+
+void
+m_print(const struct mbuf *m, int maxlen)
+{
+ int len;
+ int pdata;
+ const struct mbuf *m2;
+
+ if (m == NULL) {
+ printf("mbuf: %p\n", m);
+ return;
+ }
+
+ if (m->m_flags & M_PKTHDR)
+ len = m->m_pkthdr.len;
+ else
+ len = -1;
+ m2 = m;
+ while (m2 != NULL && (len == -1 || len)) {
+ pdata = m2->m_len;
+ if (maxlen != -1 && pdata > maxlen)
+ pdata = maxlen;
+ printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len,
+ m2->m_next, m2->m_flags, "\20\20freelist\17skipfw"
+ "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
+ "\3eor\2pkthdr\1ext", pdata ? "" : "\n");
+ if (pdata)
+ printf(", %*D\n", pdata, (u_char *)m2->m_data, "-");
+ if (len != -1)
+ len -= m2->m_len;
+ m2 = m2->m_next;
+ }
+ if (len > 0)
+ printf("%d bytes unaccounted for.\n", len);
+ return;
+}
+
+u_int
+m_fixhdr(struct mbuf *m0)
+{
+ u_int len;
+
+ len = m_length(m0, NULL);
+ m0->m_pkthdr.len = len;
+ return (len);
+}
+
+u_int
+m_length(struct mbuf *m0, struct mbuf **last)
+{
+ struct mbuf *m;
+ u_int len;
+
+ len = 0;
+ for (m = m0; m != NULL; m = m->m_next) {
+ len += m->m_len;
+ if (m->m_next == NULL)
+ break;
+ }
+ if (last != NULL)
+ *last = m;
+ return (len);
+}
+
+/*
+ * Defragment an mbuf chain, returning the shortest possible
+ * chain of mbufs and clusters. If allocation fails and
+ * this cannot be completed, NULL will be returned, but
+ * the passed in chain will be unchanged. Upon success,
+ * the original chain will be freed, and the new chain
+ * will be returned.
+ *
+ * If a non-packet header is passed in, the original
+ * mbuf (chain?) will be returned unharmed.
+ */
+struct mbuf *
+m_defrag(struct mbuf *m0, int how)
+{
+ struct mbuf *m_new = NULL, *m_final = NULL;
+ int progress = 0, length;
+
+ MBUF_CHECKSLEEP(how);
+ if (!(m0->m_flags & M_PKTHDR))
+ return (m0);
+
+ m_fixhdr(m0); /* Needed sanity check */
+
+#ifdef MBUF_STRESS_TEST
+ if (m_defragrandomfailures) {
+ int temp = arc4random() & 0xff;
+ if (temp == 0xba)
+ goto nospace;
+ }
+#endif
+
+ if (m0->m_pkthdr.len > MHLEN)
+ m_final = m_getcl(how, MT_DATA, M_PKTHDR);
+ else
+ m_final = m_gethdr(how, MT_DATA);
+
+ if (m_final == NULL)
+ goto nospace;
+
+ if (m_dup_pkthdr(m_final, m0, how) == 0)
+ goto nospace;
+
+ m_new = m_final;
+
+ while (progress < m0->m_pkthdr.len) {
+ length = m0->m_pkthdr.len - progress;
+ if (length > MCLBYTES)
+ length = MCLBYTES;
+
+ if (m_new == NULL) {
+ if (length > MLEN)
+ m_new = m_getcl(how, MT_DATA, 0);
+ else
+ m_new = m_get(how, MT_DATA);
+ if (m_new == NULL)
+ goto nospace;
+ }
+
+ m_copydata(m0, progress, length, mtod(m_new, caddr_t));
+ progress += length;
+ m_new->m_len = length;
+ if (m_new != m_final)
+ m_cat(m_final, m_new);
+ m_new = NULL;
+ }
+#ifdef MBUF_STRESS_TEST
+ if (m0->m_next == NULL)
+ m_defraguseless++;
+#endif
+ m_freem(m0);
+ m0 = m_final;
+#ifdef MBUF_STRESS_TEST
+ m_defragpackets++;
+ m_defragbytes += m0->m_pkthdr.len;
+#endif
+ return (m0);
+nospace:
+#ifdef MBUF_STRESS_TEST
+ m_defragfailure++;
+#endif
+ if (m_final)
+ m_freem(m_final);
+ return (NULL);
+}
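+
+/*
+ * Usage sketch (illustrative): a transmit path that exceeded its DMA
+ * segment limit can linearize the chain and retry; on failure the
+ * original chain is untouched, as noted above:
+ *
+ * n = m_defrag(m, M_NOWAIT);
+ * if (n == NULL)
+ * ... drop, m is still valid ...
+ * else
+ * m = n;
+ */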
+
+/*
+ * Defragment an mbuf chain, returning at most maxfrags separate
+ * mbufs+clusters. If this is not possible NULL is returned and
+ * the original mbuf chain is left in its present (potentially
+ * modified) state. We use two techniques: collapsing consecutive
+ * mbufs and replacing consecutive mbufs by a cluster.
+ *
+ * NB: this should really be named m_defrag but that name is taken
+ */
+struct mbuf *
+m_collapse(struct mbuf *m0, int how, int maxfrags)
+{
+ struct mbuf *m, *n, *n2, **prev;
+ u_int curfrags;
+
+ /*
+ * Calculate the current number of frags.
+ */
+ curfrags = 0;
+ for (m = m0; m != NULL; m = m->m_next)
+ curfrags++;
+ /*
+ * First, try to collapse mbufs. Note that we always collapse
+ * towards the front so we don't need to deal with moving the
+ * pkthdr. This may be suboptimal if the first mbuf has much
+ * less data than the following.
+ */
+ m = m0;
+again:
+ for (;;) {
+ n = m->m_next;
+ if (n == NULL)
+ break;
+ if (M_WRITABLE(m) &&
+ n->m_len < M_TRAILINGSPACE(m)) {
+ bcopy(mtod(n, void *), mtod(m, char *) + m->m_len,
+ n->m_len);
+ m->m_len += n->m_len;
+ m->m_next = n->m_next;
+ m_free(n);
+ if (--curfrags <= maxfrags)
+ return m0;
+ } else
+ m = n;
+ }
+ KASSERT(maxfrags > 1,
+ ("maxfrags %u, but normal collapse failed", maxfrags));
+ /*
+ * Collapse consecutive mbufs to a cluster.
+ */
+ prev = &m0->m_next; /* NB: not the first mbuf */
+ while ((n = *prev) != NULL) {
+ if ((n2 = n->m_next) != NULL &&
+ n->m_len + n2->m_len < MCLBYTES) {
+ m = m_getcl(how, MT_DATA, 0);
+ if (m == NULL)
+ goto bad;
+ bcopy(mtod(n, void *), mtod(m, void *), n->m_len);
+ bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len,
+ n2->m_len);
+ m->m_len = n->m_len + n2->m_len;
+ m->m_next = n2->m_next;
+ *prev = m;
+ m_free(n);
+ m_free(n2);
+ if (--curfrags <= maxfrags) /* +1 cl -2 mbufs */
+ return m0;
+ /*
+ * Still not there, try the normal collapse
+ * again before we allocate another cluster.
+ */
+ goto again;
+ }
+ prev = &n->m_next;
+ }
+ /*
+ * No place where we can collapse to a cluster; punt.
+ * This can occur if, for example, you request 2 frags
+ * but the packet requires that both be clusters (we
+ * never reallocate the first mbuf to avoid moving the
+ * packet header).
+ */
+bad:
+ return NULL;
+}
+
+#ifdef MBUF_STRESS_TEST
+
+/*
+ * Fragment an mbuf chain. There's no reason you'd ever want to do
+ * this in normal usage, but it's great for stress testing various
+ * mbuf consumers.
+ *
+ * If fragmentation is not possible, the original chain will be
+ * returned.
+ *
+ * Possible length values:
+ * 0 no fragmentation will occur
+ * > 0 each fragment will be of the specified length
+ * -1 each fragment will be the same random value in length
+ * -2 each fragment's length will be entirely random
+ * (Random values range from 1 to 256)
+ */
+struct mbuf *
+m_fragment(struct mbuf *m0, int how, int length)
+{
+ struct mbuf *m_new = NULL, *m_final = NULL;
+ int progress = 0;
+
+ if (!(m0->m_flags & M_PKTHDR))
+ return (m0);
+
+ if ((length == 0) || (length < -2))
+ return (m0);
+
+ m_fixhdr(m0); /* Needed sanity check */
+
+ m_final = m_getcl(how, MT_DATA, M_PKTHDR);
+
+ if (m_final == NULL)
+ goto nospace;
+
+ if (m_dup_pkthdr(m_final, m0, how) == 0)
+ goto nospace;
+
+ m_new = m_final;
+
+ if (length == -1)
+ length = 1 + (arc4random() & 255);
+
+ while (progress < m0->m_pkthdr.len) {
+ int fraglen;
+
+ if (length > 0)
+ fraglen = length;
+ else
+ fraglen = 1 + (arc4random() & 255);
+ if (fraglen > m0->m_pkthdr.len - progress)
+ fraglen = m0->m_pkthdr.len - progress;
+
+ if (fraglen > MCLBYTES)
+ fraglen = MCLBYTES;
+
+ if (m_new == NULL) {
+ m_new = m_getcl(how, MT_DATA, 0);
+ if (m_new == NULL)
+ goto nospace;
+ }
+
+ m_copydata(m0, progress, fraglen, mtod(m_new, caddr_t));
+ progress += fraglen;
+ m_new->m_len = fraglen;
+ if (m_new != m_final)
+ m_cat(m_final, m_new);
+ m_new = NULL;
+ }
+ m_freem(m0);
+ m0 = m_final;
+ return (m0);
+nospace:
+ if (m_final)
+ m_freem(m_final);
+ /* Return the original chain on failure */
+ return (m0);
+}
+
+#endif
+
+/*
+ * Copy the contents of uio into a properly sized mbuf chain.
+ */
+struct mbuf *
+m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
+{
+ struct mbuf *m, *mb;
+ int error, length;
+ ssize_t total;
+ int progress = 0;
+
+ /*
+ * len can be zero or an arbitrary large value bound by
+ * the total data supplied by the uio.
+ */
+ if (len > 0)
+ total = min(uio->uio_resid, len);
+ else
+ total = uio->uio_resid;
+
+ /*
+ * The smallest unit returned by m_getm2() is a single mbuf
+ * with pkthdr. We can't align past it.
+ */
+ if (align >= MHLEN)
+ return (NULL);
+
+ /*
+ * Give us the full allocation or nothing.
+ * If len is zero return the smallest empty mbuf.
+ */
+ m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags);
+ if (m == NULL)
+ return (NULL);
+ m->m_data += align;
+
+ /* Fill all mbufs with uio data and update header information. */
+ for (mb = m; mb != NULL; mb = mb->m_next) {
+ length = min(M_TRAILINGSPACE(mb), total - progress);
+
+ error = uiomove(mtod(mb, void *), length, uio);
+ if (error) {
+ m_freem(m);
+ return (NULL);
+ }
+
+ mb->m_len = length;
+ progress += length;
+ if (flags & M_PKTHDR)
+ m->m_pkthdr.len += length;
+ }
+ KASSERT(progress == total, ("%s: progress != total", __func__));
+
+ return (m);
+}
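+
+/*
+ * Usage sketch (illustrative): copy up to "len" bytes of user data from a
+ * uio into a new packet, reserving room for a link-layer header:
+ *
+ * m = m_uiotombuf(uio, M_WAITOK, len, max_linkhdr, M_PKTHDR);
+ * if (m == NULL)
+ * ... handle failure (e.g. fault while copying in) ...
+ */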
+
+/*
+ * Copy an mbuf chain into a uio limited by len if set.
+ */
+int
+m_mbuftouio(struct uio *uio, struct mbuf *m, int len)
+{
+ int error, length, total;
+ int progress = 0;
+
+ if (len > 0)
+ total = min(uio->uio_resid, len);
+ else
+ total = uio->uio_resid;
+
+ /* Fill the uio with data from the mbufs. */
+ for (; m != NULL; m = m->m_next) {
+ length = min(m->m_len, total - progress);
+
+ error = uiomove(mtod(m, void *), length, uio);
+ if (error)
+ return (error);
+
+ progress += length;
+ }
+
+ return (0);
+}
+
+/*
+ * Set the m_data pointer of a newly-allocated mbuf
+ * to place an object of the specified size at the
+ * end of the mbuf, longword aligned.
+ */
+void
+m_align(struct mbuf *m, int len)
+{
+#ifdef INVARIANTS
+ const char *msg = "%s: not a virgin mbuf";
+#endif
+ int adjust;
+
+ if (m->m_flags & M_EXT) {
+ KASSERT(m->m_data == m->m_ext.ext_buf, (msg, __func__));
+ adjust = m->m_ext.ext_size - len;
+ } else if (m->m_flags & M_PKTHDR) {
+ KASSERT(m->m_data == m->m_pktdat, (msg, __func__));
+ adjust = MHLEN - len;
+ } else {
+ KASSERT(m->m_data == m->m_dat, (msg, __func__));
+ adjust = MLEN - len;
+ }
+
+ m->m_data += adjust &~ (sizeof(long)-1);
+}
+
+/*
+ * Create a writable copy of the mbuf chain. While doing this
+ * we compact the chain with a goal of producing a chain with
+ * at most two mbufs. The second mbuf in this chain is likely
+ * to be a cluster. The primary purpose of this work is to create
+ * a writable packet for encryption, compression, etc. The
+ * secondary goal is to linearize the data so the data can be
+ * passed to crypto hardware in the most efficient manner possible.
+ */
+struct mbuf *
+m_unshare(struct mbuf *m0, int how)
+{
+ struct mbuf *m, *mprev;
+ struct mbuf *n, *mfirst, *mlast;
+ int len, off;
+
+ mprev = NULL;
+ for (m = m0; m != NULL; m = mprev->m_next) {
+ /*
+ * Regular mbufs are ignored unless there's a cluster
+ * in front of them that we can use to coalesce. We do
+ * the latter mainly so later clusters can be coalesced
+ * also w/o having to handle them specially (i.e. convert
+ * mbuf+cluster -> cluster). This optimization is heavily
+ * influenced by the assumption that we're running over
+ * Ethernet where MCLBYTES is large enough that the max
+ * packet size will permit lots of coalescing into a
+ * single cluster. This in turn permits efficient
+ * crypto operations, especially when using hardware.
+ */
+ if ((m->m_flags & M_EXT) == 0) {
+ if (mprev && (mprev->m_flags & M_EXT) &&
+ m->m_len <= M_TRAILINGSPACE(mprev)) {
+ /* XXX: this ignores mbuf types */
+ memcpy(mtod(mprev, caddr_t) + mprev->m_len,
+ mtod(m, caddr_t), m->m_len);
+ mprev->m_len += m->m_len;
+ mprev->m_next = m->m_next; /* unlink from chain */
+ m_free(m); /* reclaim mbuf */
+#if 0
+ newipsecstat.ips_mbcoalesced++;
+#endif
+ } else {
+ mprev = m;
+ }
+ continue;
+ }
+ /*
+ * Writable mbufs are left alone (for now).
+ */
+ if (M_WRITABLE(m)) {
+ mprev = m;
+ continue;
+ }
+
+ /*
+ * Not writable, replace with a copy or coalesce with
+ * the previous mbuf if possible (since we have to copy
+ * it anyway, we try to reduce the number of mbufs and
+ * clusters so that future work is easier).
+ */
+ KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags));
+ /* NB: we only coalesce into a cluster or larger */
+ if (mprev != NULL && (mprev->m_flags & M_EXT) &&
+ m->m_len <= M_TRAILINGSPACE(mprev)) {
+ /* XXX: this ignores mbuf types */
+ memcpy(mtod(mprev, caddr_t) + mprev->m_len,
+ mtod(m, caddr_t), m->m_len);
+ mprev->m_len += m->m_len;
+ mprev->m_next = m->m_next; /* unlink from chain */
+ m_free(m); /* reclaim mbuf */
+#if 0
+ newipsecstat.ips_clcoalesced++;
+#endif
+ continue;
+ }
+
+ /*
+ * Allocate new space to hold the copy and copy the data.
+ * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
+ * splitting them into clusters. We could just malloc a
+ * buffer and make it external but too many device drivers
+ * don't know how to break up the non-contiguous memory when
+ * doing DMA.
+ */
+ n = m_getcl(how, m->m_type, m->m_flags);
+ if (n == NULL) {
+ m_freem(m0);
+ return (NULL);
+ }
+ len = m->m_len;
+ off = 0;
+ mfirst = n;
+ mlast = NULL;
+ for (;;) {
+ int cc = min(len, MCLBYTES);
+ memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc);
+ n->m_len = cc;
+ if (mlast != NULL)
+ mlast->m_next = n;
+ mlast = n;
+#if 0
+ newipsecstat.ips_clcopied++;
+#endif
+
+ len -= cc;
+ if (len <= 0)
+ break;
+ off += cc;
+
+ n = m_getcl(how, m->m_type, m->m_flags);
+ if (n == NULL) {
+ m_freem(mfirst);
+ m_freem(m0);
+ return (NULL);
+ }
+ }
+ n->m_next = m->m_next;
+ if (mprev == NULL)
+ m0 = mfirst; /* new head of chain */
+ else
+ mprev->m_next = mfirst; /* replace old mbuf */
+ m_free(m); /* release old mbuf */
+ mprev = mfirst;
+ }
+ return (m0);
+}
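+
+/*
+ * Usage sketch (illustrative): a transform that must modify packet data
+ * in place (e.g. encryption) first obtains a writable, compacted chain:
+ *
+ * m = m_unshare(m, M_NOWAIT);
+ * if (m == NULL)
+ * return (ENOBUFS); (the original chain has been freed)
+ */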
+
+#ifdef MBUF_PROFILING
+
+#define MP_BUCKETS 32 /* don't just change this as things may overflow.*/
+struct mbufprofile {
+ uintmax_t wasted[MP_BUCKETS];
+ uintmax_t used[MP_BUCKETS];
+ uintmax_t segments[MP_BUCKETS];
+} mbprof;
+
+#define MP_MAXDIGITS 21 /* strlen("16,000,000,000,000,000,000") == 21 */
+#define MP_NUMLINES 6
+#define MP_NUMSPERLINE 16
+#define MP_EXTRABYTES 64 /* > strlen("used:\nwasted:\nsegments:\n") */
+/* work out max space needed and add a bit of spare space too */
+#define MP_MAXLINE ((MP_MAXDIGITS+1) * MP_NUMSPERLINE)
+#define MP_BUFSIZE ((MP_MAXLINE * MP_NUMLINES) + 1 + MP_EXTRABYTES)
+
+char mbprofbuf[MP_BUFSIZE];
+
+void
+m_profile(struct mbuf *m)
+{
+ int segments = 0;
+ int used = 0;
+ int wasted = 0;
+
+ while (m) {
+ segments++;
+ used += m->m_len;
+ if (m->m_flags & M_EXT) {
+ wasted += MHLEN - sizeof(m->m_ext) +
+ m->m_ext.ext_size - m->m_len;
+ } else {
+ if (m->m_flags & M_PKTHDR)
+ wasted += MHLEN - m->m_len;
+ else
+ wasted += MLEN - m->m_len;
+ }
+ m = m->m_next;
+ }
+ /* be paranoid.. it helps */
+ if (segments > MP_BUCKETS - 1)
+ segments = MP_BUCKETS - 1;
+ if (used > 100000)
+ used = 100000;
+ if (wasted > 100000)
+ wasted = 100000;
+ /* store in the appropriate bucket */
+ /* don't bother locking. if it's slightly off, so what? */
+ mbprof.segments[segments]++;
+ mbprof.used[fls(used)]++;
+ mbprof.wasted[fls(wasted)]++;
+}
+
+static void
+mbprof_textify(void)
+{
+ int offset;
+ char *c;
+ uint64_t *p;
+
+
+ p = &mbprof.wasted[0];
+ c = mbprofbuf;
+ offset = snprintf(c, MP_MAXLINE + 10,
+ "wasted:\n"
+ "%ju %ju %ju %ju %ju %ju %ju %ju "
+ "%ju %ju %ju %ju %ju %ju %ju %ju\n",
+ p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+ p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
+#ifdef BIG_ARRAY
+ p = &mbprof.wasted[16];
+ c += offset;
+ offset = snprintf(c, MP_MAXLINE,
+ "%ju %ju %ju %ju %ju %ju %ju %ju "
+ "%ju %ju %ju %ju %ju %ju %ju %ju\n",
+ p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+ p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
+#endif
+ p = &mbprof.used[0];
+ c += offset;
+ offset = snprintf(c, MP_MAXLINE + 10,
+ "used:\n"
+ "%ju %ju %ju %ju %ju %ju %ju %ju "
+ "%ju %ju %ju %ju %ju %ju %ju %ju\n",
+ p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+ p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
+#ifdef BIG_ARRAY
+ p = &mbprof.used[16];
+ c += offset;
+ offset = snprintf(c, MP_MAXLINE,
+ "%ju %ju %ju %ju %ju %ju %ju %ju "
+ "%ju %ju %ju %ju %ju %ju %ju %ju\n",
+ p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+ p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
+#endif
+ p = &mbprof.segments[0];
+ c += offset;
+ offset = snprintf(c, MP_MAXLINE + 10,
+ "segments:\n"
+ "%ju %ju %ju %ju %ju %ju %ju %ju "
+ "%ju %ju %ju %ju %ju %ju %ju %ju\n",
+ p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+ p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
+#ifdef BIG_ARRAY
+ p = &mbprof.segments[16];
+ c += offset;
+ offset = snprintf(c, MP_MAXLINE,
+ "%ju %ju %ju %ju %ju %ju %ju %ju "
+ "%ju %ju %ju %ju %ju %ju %ju %jju",
+ p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
+ p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
+#endif
+}
+
+static int
+mbprof_handler(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+
+ mbprof_textify();
+ error = SYSCTL_OUT(req, mbprofbuf, strlen(mbprofbuf) + 1);
+ return (error);
+}
+
+static int
+mbprof_clr_handler(SYSCTL_HANDLER_ARGS)
+{
+ int clear, error;
+
+ clear = 0;
+ error = sysctl_handle_int(oidp, &clear, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ if (clear) {
+ bzero(&mbprof, sizeof(mbprof));
+ }
+
+ return (error);
+}
+
+
+SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile, CTLTYPE_STRING|CTLFLAG_RD,
+ NULL, 0, mbprof_handler, "A", "mbuf profiling statistics");
+
+SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr, CTLTYPE_INT|CTLFLAG_RW,
+ NULL, 0, mbprof_clr_handler, "I", "clear mbuf profiling statistics");
+#endif
+
diff --git a/sys/kern/uipc_mbuf2.c b/sys/kern/uipc_mbuf2.c
new file mode 100644
index 0000000..e32e2a1
--- /dev/null
+++ b/sys/kern/uipc_mbuf2.c
@@ -0,0 +1,453 @@
+/* $KAME: uipc_mbuf2.c,v 1.31 2001/11/28 11:08:53 itojun Exp $ */
+/* $NetBSD: uipc_mbuf.c,v 1.40 1999/04/01 00:23:25 thorpej Exp $ */
+
+/*-
+ * Copyright (C) 1999 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_mbuf.c 8.4 (Berkeley) 2/14/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*#define PULLDOWN_DEBUG*/
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+
+#include <security/mac/mac_framework.h>
+
+static MALLOC_DEFINE(M_PACKET_TAGS, MBUF_TAG_MEM_NAME,
+ "packet-attached information");
+
+/* can't call it m_dup(), as freebsd[34] uses m_dup() with different arg */
+static struct mbuf *m_dup1(struct mbuf *, int, int, int);
+
+/*
+ * ensure that [off, off + len) is contiguous on the mbuf chain "m".
+ * packet chain before "off" is kept untouched.
+ * if offp == NULL, the target will start at <retval, 0> on resulting chain.
+ * if offp != NULL, the target will start at <retval, *offp> on resulting chain.
+ *
+ * on error return (NULL return value), original "m" will be freed.
+ *
+ * XXX: M_TRAILINGSPACE/M_LEADINGSPACE only permitted on writable ext_buf.
+ */
+struct mbuf *
+m_pulldown(struct mbuf *m, int off, int len, int *offp)
+{
+ struct mbuf *n, *o;
+ int hlen, tlen, olen;
+ int writable;
+
+ /* check invalid arguments. */
+ if (m == NULL)
+ panic("m == NULL in m_pulldown()");
+ if (len > MCLBYTES) {
+ m_freem(m);
+ return NULL; /* impossible */
+ }
+
+#ifdef PULLDOWN_DEBUG
+ {
+ struct mbuf *t;
+ printf("before:");
+ for (t = m; t; t = t->m_next)
+ printf(" %d", t->m_len);
+ printf("\n");
+ }
+#endif
+ n = m;
+ while (n != NULL && off > 0) {
+ if (n->m_len > off)
+ break;
+ off -= n->m_len;
+ n = n->m_next;
+ }
+	/* be sure to point to a non-empty mbuf */
+ while (n != NULL && n->m_len == 0)
+ n = n->m_next;
+ if (!n) {
+ m_freem(m);
+ return NULL; /* mbuf chain too short */
+ }
+
+ /*
+ * XXX: This code is flawed because it considers a "writable" mbuf
+ * data region to require all of the following:
+ * (i) mbuf _has_ to have M_EXT set; if it is just a regular
+ * mbuf, it is still not considered "writable."
+ * (ii) since mbuf has M_EXT, the ext_type _has_ to be
+ * EXT_CLUSTER. Anything else makes it non-writable.
+ * (iii) M_WRITABLE() must evaluate true.
+ * Ideally, the requirement should only be (iii).
+ *
+ * If we're writable, we're sure we're writable, because the ref. count
+	 * cannot increase from 1, as that would require possession of mbuf
+	 * n by someone else (which is impossible). However, if we're _not_
+	 * writable, we may eventually become writable (if the ref. count drops
+ * to 1), but we'll fail to notice it unless we re-evaluate
+ * M_WRITABLE(). For now, we only evaluate once at the beginning and
+ * live with this.
+ */
+ /*
+ * XXX: This is dumb. If we're just a regular mbuf with no M_EXT,
+ * then we're not "writable," according to this code.
+ */
+ writable = 0;
+ if ((n->m_flags & M_EXT) == 0 ||
+ (n->m_ext.ext_type == EXT_CLUSTER && M_WRITABLE(n)))
+ writable = 1;
+
+ /*
+ * the target data is on <n, off>.
+ * if we got enough data on the mbuf "n", we're done.
+ */
+ if ((off == 0 || offp) && len <= n->m_len - off && writable)
+ goto ok;
+
+ /*
+ * when len <= n->m_len - off and off != 0, it is a special case.
+ * len bytes from <n, off> sits in single mbuf, but the caller does
+ * not like the starting position (off).
+ * chop the current mbuf into two pieces, set off to 0.
+ */
+ if (len <= n->m_len - off) {
+ o = m_dup1(n, off, n->m_len - off, M_NOWAIT);
+ if (o == NULL) {
+ m_freem(m);
+ return NULL; /* ENOBUFS */
+ }
+ n->m_len = off;
+ o->m_next = n->m_next;
+ n->m_next = o;
+ n = n->m_next;
+ off = 0;
+ goto ok;
+ }
+
+ /*
+ * we need to take hlen from <n, off> and tlen from <n->m_next, 0>,
+ * and construct contiguous mbuf with m_len == len.
+ * note that hlen + tlen == len, and tlen > 0.
+ */
+ hlen = n->m_len - off;
+ tlen = len - hlen;
+
+ /*
+ * ensure that we have enough trailing data on mbuf chain.
+ * if not, we can do nothing about the chain.
+ */
+ olen = 0;
+ for (o = n->m_next; o != NULL; o = o->m_next)
+ olen += o->m_len;
+ if (hlen + olen < len) {
+ m_freem(m);
+ return NULL; /* mbuf chain too short */
+ }
+
+ /*
+ * easy cases first.
+ * we need to use m_copydata() to get data from <n->m_next, 0>.
+ */
+ if ((off == 0 || offp) && M_TRAILINGSPACE(n) >= tlen
+ && writable) {
+ m_copydata(n->m_next, 0, tlen, mtod(n, caddr_t) + n->m_len);
+ n->m_len += tlen;
+ m_adj(n->m_next, tlen);
+ goto ok;
+ }
+ if ((off == 0 || offp) && M_LEADINGSPACE(n->m_next) >= hlen
+ && writable) {
+ n->m_next->m_data -= hlen;
+ n->m_next->m_len += hlen;
+ bcopy(mtod(n, caddr_t) + off, mtod(n->m_next, caddr_t), hlen);
+ n->m_len -= hlen;
+ n = n->m_next;
+ off = 0;
+ goto ok;
+ }
+
+ /*
+	 * now, we need to do it the hard way.  don't m_copy as there's no
+	 * room on either end.
+ */
+ if (len > MLEN)
+ o = m_getcl(M_NOWAIT, m->m_type, 0);
+ else
+ o = m_get(M_NOWAIT, m->m_type);
+ if (!o) {
+ m_freem(m);
+ return NULL; /* ENOBUFS */
+ }
+ /* get hlen from <n, off> into <o, 0> */
+ o->m_len = hlen;
+ bcopy(mtod(n, caddr_t) + off, mtod(o, caddr_t), hlen);
+ n->m_len -= hlen;
+ /* get tlen from <n->m_next, 0> into <o, hlen> */
+ m_copydata(n->m_next, 0, tlen, mtod(o, caddr_t) + o->m_len);
+ o->m_len += tlen;
+ m_adj(n->m_next, tlen);
+ o->m_next = n->m_next;
+ n->m_next = o;
+ n = o;
+ off = 0;
+
+ok:
+#ifdef PULLDOWN_DEBUG
+ {
+ struct mbuf *t;
+ printf("after:");
+ for (t = m; t; t = t->m_next)
+ printf("%c%d", t == n ? '*' : ' ', t->m_len);
+ printf(" (off=%d)\n", off);
+ }
+#endif
+ if (offp)
+ *offp = off;
+ return n;
+}
+
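+/*
+ * Illustrative sketch (not part of the original code): a typical caller
+ * uses m_pulldown() to make a protocol header contiguous before casting.
+ * "struct foo_hdr", "off" and the error handling below are hypothetical
+ * placeholders.
+ *
+ *	struct foo_hdr *fh;
+ *	struct mbuf *n;
+ *	int newoff;
+ *
+ *	n = m_pulldown(m, off, sizeof(*fh), &newoff);
+ *	if (n == NULL)
+ *		return (ENOBUFS);	(the whole chain "m" was already freed)
+ *	fh = (struct foo_hdr *)(mtod(n, caddr_t) + newoff);
+ */
+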
+static struct mbuf *
+m_dup1(struct mbuf *m, int off, int len, int wait)
+{
+ struct mbuf *n;
+ int copyhdr;
+
+ if (len > MCLBYTES)
+ return NULL;
+ if (off == 0 && (m->m_flags & M_PKTHDR) != 0)
+ copyhdr = 1;
+ else
+ copyhdr = 0;
+ if (len >= MINCLSIZE) {
+ if (copyhdr == 1)
+ n = m_getcl(wait, m->m_type, M_PKTHDR);
+ else
+ n = m_getcl(wait, m->m_type, 0);
+ } else {
+ if (copyhdr == 1)
+ n = m_gethdr(wait, m->m_type);
+ else
+ n = m_get(wait, m->m_type);
+ }
+ if (!n)
+ return NULL; /* ENOBUFS */
+
+ if (copyhdr && !m_dup_pkthdr(n, m, wait)) {
+ m_free(n);
+ return NULL;
+ }
+ m_copydata(m, off, len, mtod(n, caddr_t));
+ n->m_len = len;
+ return n;
+}
+
+/* Free a packet tag. */
+void
+m_tag_free_default(struct m_tag *t)
+{
+#ifdef MAC
+ if (t->m_tag_id == PACKET_TAG_MACLABEL)
+ mac_mbuf_tag_destroy(t);
+#endif
+ free(t, M_PACKET_TAGS);
+}
+
+/* Get a packet tag structure along with specified data following. */
+struct m_tag *
+m_tag_alloc(uint32_t cookie, int type, int len, int wait)
+{
+ struct m_tag *t;
+
+ MBUF_CHECKSLEEP(wait);
+ if (len < 0)
+ return NULL;
+ t = malloc(len + sizeof(struct m_tag), M_PACKET_TAGS, wait);
+ if (t == NULL)
+ return NULL;
+ m_tag_setup(t, cookie, type, len);
+ t->m_tag_free = m_tag_free_default;
+ return t;
+}
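+
+/*
+ * Illustrative sketch (not part of the original code): allocating a tag,
+ * attaching it to an mbuf and looking it up later.  MTAG_EXAMPLE,
+ * EXAMPLE_TYPE and struct example_data are hypothetical; m_tag_prepend()
+ * is the usual inline from sys/mbuf.h for linking a tag into
+ * m->m_pkthdr.tags.
+ *
+ *	struct example_data ed;
+ *	struct m_tag *mtag;
+ *
+ *	mtag = m_tag_alloc(MTAG_EXAMPLE, EXAMPLE_TYPE,
+ *	    sizeof(struct example_data), M_NOWAIT);
+ *	if (mtag == NULL)
+ *		return (ENOMEM);
+ *	bcopy(&ed, mtag + 1, sizeof(struct example_data));
+ *	m_tag_prepend(m, mtag);
+ *
+ *	Later, another code path can retrieve it:
+ *
+ *	mtag = m_tag_locate(m, MTAG_EXAMPLE, EXAMPLE_TYPE, NULL);
+ *	if (mtag != NULL)
+ *		ed = *(struct example_data *)(mtag + 1);
+ */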
+
+/* Unlink and free a packet tag. */
+void
+m_tag_delete(struct mbuf *m, struct m_tag *t)
+{
+
+ KASSERT(m && t, ("m_tag_delete: null argument, m %p t %p", m, t));
+ m_tag_unlink(m, t);
+ m_tag_free(t);
+}
+
+/* Unlink and free a packet tag chain, starting from given tag. */
+void
+m_tag_delete_chain(struct mbuf *m, struct m_tag *t)
+{
+ struct m_tag *p, *q;
+
+ KASSERT(m, ("m_tag_delete_chain: null mbuf"));
+ if (t != NULL)
+ p = t;
+ else
+ p = SLIST_FIRST(&m->m_pkthdr.tags);
+ if (p == NULL)
+ return;
+ while ((q = SLIST_NEXT(p, m_tag_link)) != NULL)
+ m_tag_delete(m, q);
+ m_tag_delete(m, p);
+}
+
+/*
+ * Strip off all tags that would normally vanish when
+ * passing through a network interface. Only persistent
+ * tags will exist after this; these are expected to remain
+ * so long as the mbuf chain exists, regardless of the
+ * path the mbufs take.
+ */
+void
+m_tag_delete_nonpersistent(struct mbuf *m)
+{
+ struct m_tag *p, *q;
+
+ SLIST_FOREACH_SAFE(p, &m->m_pkthdr.tags, m_tag_link, q)
+ if ((p->m_tag_id & MTAG_PERSISTENT) == 0)
+ m_tag_delete(m, p);
+}
+
+/* Find a tag, starting from a given position. */
+struct m_tag *
+m_tag_locate(struct mbuf *m, uint32_t cookie, int type, struct m_tag *t)
+{
+ struct m_tag *p;
+
+ KASSERT(m, ("m_tag_locate: null mbuf"));
+ if (t == NULL)
+ p = SLIST_FIRST(&m->m_pkthdr.tags);
+ else
+ p = SLIST_NEXT(t, m_tag_link);
+ while (p != NULL) {
+ if (p->m_tag_cookie == cookie && p->m_tag_id == type)
+ return p;
+ p = SLIST_NEXT(p, m_tag_link);
+ }
+ return NULL;
+}
+
+/* Copy a single tag. */
+struct m_tag *
+m_tag_copy(struct m_tag *t, int how)
+{
+ struct m_tag *p;
+
+ MBUF_CHECKSLEEP(how);
+ KASSERT(t, ("m_tag_copy: null tag"));
+ p = m_tag_alloc(t->m_tag_cookie, t->m_tag_id, t->m_tag_len, how);
+ if (p == NULL)
+ return (NULL);
+#ifdef MAC
+ /*
+ * XXXMAC: we should probably pass off the initialization, and
+ * copying here? can we hide that PACKET_TAG_MACLABEL is
+ * special from the mbuf code?
+ */
+ if (t->m_tag_id == PACKET_TAG_MACLABEL) {
+ if (mac_mbuf_tag_init(p, how) != 0) {
+ m_tag_free(p);
+ return (NULL);
+ }
+ mac_mbuf_tag_copy(t, p);
+ } else
+#endif
+ bcopy(t + 1, p + 1, t->m_tag_len); /* Copy the data */
+ return p;
+}
+
+/*
+ * Copy a packet tag chain from one mbuf to another.  The destination
+ * mbuf (to) loses any attached tags even if the operation fails.  This
+ * should not be a problem, as
+ * m_tag_copy_chain() is typically called with a newly-allocated
+ * destination mbuf.
+ */
+int
+m_tag_copy_chain(struct mbuf *to, struct mbuf *from, int how)
+{
+ struct m_tag *p, *t, *tprev = NULL;
+
+ MBUF_CHECKSLEEP(how);
+ KASSERT(to && from,
+ ("m_tag_copy_chain: null argument, to %p from %p", to, from));
+ m_tag_delete_chain(to, NULL);
+ SLIST_FOREACH(p, &from->m_pkthdr.tags, m_tag_link) {
+ t = m_tag_copy(p, how);
+ if (t == NULL) {
+ m_tag_delete_chain(to, NULL);
+ return 0;
+ }
+ if (tprev == NULL)
+ SLIST_INSERT_HEAD(&to->m_pkthdr.tags, t, m_tag_link);
+ else
+ SLIST_INSERT_AFTER(tprev, t, m_tag_link);
+ tprev = t;
+ }
+ return 1;
+}
diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c
new file mode 100644
index 0000000..fe7e886
--- /dev/null
+++ b/sys/kern/uipc_mqueue.c
@@ -0,0 +1,2883 @@
+/*-
+ * Copyright (c) 2005 David Xu <davidxu@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/*
+ * POSIX message queue implementation.
+ *
+ * 1) A mqueue filesystem can be mounted; each message queue appears
+ *    in the mounted directory, and the user can change a queue's
+ *    permissions and ownership or remove a queue.  Manually creating a
+ *    file in the directory causes a message queue to be created in the
+ *    kernel with the default message queue attributes applied and the
+ *    same name used; this method is not recommended, since the mq_open
+ *    syscall allows the user to specify different attributes.  The file
+ *    system can also be mounted multiple times at different mount
+ *    points, but shows the same contents.
+ *
+ * 2) Standard POSIX message queue API.  The syscalls do not use the
+ *    vfs layer but operate directly on internal data structures; this
+ *    allows the user to use the IPC facility without having to mount
+ *    the mqueue filesystem.
+ */
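+
+/*
+ * Illustrative userland sketch (not part of this file): the syscalls
+ * below back the standard POSIX mq_*() interface (typically linked with
+ * -lrt), roughly used as follows.  "/myqueue" and the sizes are
+ * arbitrary example values.
+ *
+ *	#include <mqueue.h>
+ *	#include <fcntl.h>
+ *
+ *	struct mq_attr attr = { .mq_maxmsg = 10, .mq_msgsize = 128 };
+ *	mqd_t mqd = mq_open("/myqueue", O_RDWR | O_CREAT, 0600, &attr);
+ *	char buf[128];
+ *
+ *	mq_send(mqd, "hello", 5, 0);
+ *	mq_receive(mqd, buf, sizeof(buf), NULL);   (buf must hold mq_msgsize)
+ *	mq_close(mqd);
+ *	mq_unlink("/myqueue");
+ */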
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/buf.h>
+#include <sys/capability.h>
+#include <sys/dirent.h>
+#include <sys/event.h>
+#include <sys/eventhandler.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mount.h>
+#include <sys/mqueue.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/posix4.h>
+#include <sys/poll.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sysproto.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <machine/atomic.h>
+
+FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support");
+
+/*
+ * Limits and constants
+ */
+#define MQFS_NAMELEN NAME_MAX
+#define MQFS_DELEN (8 + MQFS_NAMELEN)
+
+/* node types */
+typedef enum {
+ mqfstype_none = 0,
+ mqfstype_root,
+ mqfstype_dir,
+ mqfstype_this,
+ mqfstype_parent,
+ mqfstype_file,
+ mqfstype_symlink,
+} mqfs_type_t;
+
+struct mqfs_node;
+
+/*
+ * mqfs_info: describes a mqfs instance
+ */
+struct mqfs_info {
+ struct sx mi_lock;
+ struct mqfs_node *mi_root;
+ struct unrhdr *mi_unrhdr;
+};
+
+struct mqfs_vdata {
+ LIST_ENTRY(mqfs_vdata) mv_link;
+ struct mqfs_node *mv_node;
+ struct vnode *mv_vnode;
+ struct task mv_task;
+};
+
+/*
+ * mqfs_node: describes a node (file or directory) within a mqfs
+ */
+struct mqfs_node {
+ char mn_name[MQFS_NAMELEN+1];
+ struct mqfs_info *mn_info;
+ struct mqfs_node *mn_parent;
+ LIST_HEAD(,mqfs_node) mn_children;
+ LIST_ENTRY(mqfs_node) mn_sibling;
+ LIST_HEAD(,mqfs_vdata) mn_vnodes;
+ int mn_refcount;
+ mqfs_type_t mn_type;
+ int mn_deleted;
+ uint32_t mn_fileno;
+ void *mn_data;
+ struct timespec mn_birth;
+ struct timespec mn_ctime;
+ struct timespec mn_atime;
+ struct timespec mn_mtime;
+ uid_t mn_uid;
+ gid_t mn_gid;
+ int mn_mode;
+};
+
+#define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node)
+#define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data))
+#define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data))
+#define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \
+ (fp)->f_data)->mn_data))
+
+TAILQ_HEAD(msgq, mqueue_msg);
+
+struct mqueue;
+
+struct mqueue_notifier {
+ LIST_ENTRY(mqueue_notifier) nt_link;
+ struct sigevent nt_sigev;
+ ksiginfo_t nt_ksi;
+ struct proc *nt_proc;
+};
+
+struct mqueue {
+ struct mtx mq_mutex;
+ int mq_flags;
+ long mq_maxmsg;
+ long mq_msgsize;
+ long mq_curmsgs;
+ long mq_totalbytes;
+ struct msgq mq_msgq;
+ int mq_receivers;
+ int mq_senders;
+ struct selinfo mq_rsel;
+ struct selinfo mq_wsel;
+ struct mqueue_notifier *mq_notifier;
+};
+
+#define MQ_RSEL 0x01
+#define MQ_WSEL 0x02
+
+struct mqueue_msg {
+ TAILQ_ENTRY(mqueue_msg) msg_link;
+ unsigned int msg_prio;
+ unsigned int msg_size;
+ /* following real data... */
+};
+
+static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0,
+ "POSIX real time message queue");
+
+static int default_maxmsg = 10;
+static int default_msgsize = 1024;
+
+static int maxmsg = 100;
+SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW,
+ &maxmsg, 0, "Default maximum messages in queue");
+static int maxmsgsize = 16384;
+SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW,
+ &maxmsgsize, 0, "Default maximum message size");
+static int maxmq = 100;
+SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW,
+ &maxmq, 0, "maximum message queues");
+static int curmq = 0;
+SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW,
+ &curmq, 0, "current message queue number");
+static int unloadable = 0;
+static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data");
+
+static eventhandler_tag exit_tag;
+
+/* Only one instance per-system */
+static struct mqfs_info mqfs_data;
+static uma_zone_t mqnode_zone;
+static uma_zone_t mqueue_zone;
+static uma_zone_t mvdata_zone;
+static uma_zone_t mqnoti_zone;
+static struct vop_vector mqfs_vnodeops;
+static struct fileops mqueueops;
+
+/*
+ * Directory structure construction and manipulation
+ */
+#ifdef notyet
+static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent,
+ const char *name, int namelen, struct ucred *cred, int mode);
+static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent,
+ const char *name, int namelen, struct ucred *cred, int mode);
+#endif
+
+static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent,
+ const char *name, int namelen, struct ucred *cred, int mode);
+static int mqfs_destroy(struct mqfs_node *mn);
+static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn);
+static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn);
+static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn);
+
+/*
+ * Message queue construction and manipulation
+ */
+static struct mqueue *mqueue_alloc(const struct mq_attr *attr);
+static void mqueue_free(struct mqueue *mq);
+static int mqueue_send(struct mqueue *mq, const char *msg_ptr,
+ size_t msg_len, unsigned msg_prio, int waitok,
+ const struct timespec *abs_timeout);
+static int mqueue_receive(struct mqueue *mq, char *msg_ptr,
+ size_t msg_len, unsigned *msg_prio, int waitok,
+ const struct timespec *abs_timeout);
+static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg,
+ int timo);
+static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg,
+ int timo);
+static void mqueue_send_notification(struct mqueue *mq);
+static void mqueue_fdclose(struct thread *td, int fd, struct file *fp);
+static void mq_proc_exit(void *arg, struct proc *p);
+
+/*
+ * kqueue filters
+ */
+static void filt_mqdetach(struct knote *kn);
+static int filt_mqread(struct knote *kn, long hint);
+static int filt_mqwrite(struct knote *kn, long hint);
+
+struct filterops mq_rfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_mqdetach,
+ .f_event = filt_mqread,
+};
+struct filterops mq_wfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_mqdetach,
+ .f_event = filt_mqwrite,
+};
+
+/*
+ * Initialize fileno bitmap
+ */
+static void
+mqfs_fileno_init(struct mqfs_info *mi)
+{
+ struct unrhdr *up;
+
+ up = new_unrhdr(1, INT_MAX, NULL);
+ mi->mi_unrhdr = up;
+}
+
+/*
+ * Tear down fileno bitmap
+ */
+static void
+mqfs_fileno_uninit(struct mqfs_info *mi)
+{
+ struct unrhdr *up;
+
+ up = mi->mi_unrhdr;
+ mi->mi_unrhdr = NULL;
+ delete_unrhdr(up);
+}
+
+/*
+ * Allocate a file number
+ */
+static void
+mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn)
+{
+ /* make sure our parent has a file number */
+ if (mn->mn_parent && !mn->mn_parent->mn_fileno)
+ mqfs_fileno_alloc(mi, mn->mn_parent);
+
+ switch (mn->mn_type) {
+ case mqfstype_root:
+ case mqfstype_dir:
+ case mqfstype_file:
+ case mqfstype_symlink:
+ mn->mn_fileno = alloc_unr(mi->mi_unrhdr);
+ break;
+ case mqfstype_this:
+ KASSERT(mn->mn_parent != NULL,
+ ("mqfstype_this node has no parent"));
+ mn->mn_fileno = mn->mn_parent->mn_fileno;
+ break;
+ case mqfstype_parent:
+ KASSERT(mn->mn_parent != NULL,
+ ("mqfstype_parent node has no parent"));
+ if (mn->mn_parent == mi->mi_root) {
+ mn->mn_fileno = mn->mn_parent->mn_fileno;
+ break;
+ }
+ KASSERT(mn->mn_parent->mn_parent != NULL,
+ ("mqfstype_parent node has no grandparent"));
+ mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno;
+ break;
+ default:
+ KASSERT(0,
+ ("mqfs_fileno_alloc() called for unknown type node: %d",
+ mn->mn_type));
+ break;
+ }
+}
+
+/*
+ * Release a file number
+ */
+static void
+mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn)
+{
+ switch (mn->mn_type) {
+ case mqfstype_root:
+ case mqfstype_dir:
+ case mqfstype_file:
+ case mqfstype_symlink:
+ free_unr(mi->mi_unrhdr, mn->mn_fileno);
+ break;
+ case mqfstype_this:
+ case mqfstype_parent:
+ /* ignore these, as they don't "own" their file number */
+ break;
+ default:
+ KASSERT(0,
+ ("mqfs_fileno_free() called for unknown type node: %d",
+ mn->mn_type));
+ break;
+ }
+}
+
+static __inline struct mqfs_node *
+mqnode_alloc(void)
+{
+ return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO);
+}
+
+static __inline void
+mqnode_free(struct mqfs_node *node)
+{
+ uma_zfree(mqnode_zone, node);
+}
+
+static __inline void
+mqnode_addref(struct mqfs_node *node)
+{
+ atomic_fetchadd_int(&node->mn_refcount, 1);
+}
+
+static __inline void
+mqnode_release(struct mqfs_node *node)
+{
+ struct mqfs_info *mqfs;
+ int old, exp;
+
+ mqfs = node->mn_info;
+ old = atomic_fetchadd_int(&node->mn_refcount, -1);
+ if (node->mn_type == mqfstype_dir ||
+ node->mn_type == mqfstype_root)
+ exp = 3; /* include . and .. */
+ else
+ exp = 1;
+ if (old == exp) {
+ int locked = sx_xlocked(&mqfs->mi_lock);
+ if (!locked)
+ sx_xlock(&mqfs->mi_lock);
+ mqfs_destroy(node);
+ if (!locked)
+ sx_xunlock(&mqfs->mi_lock);
+ }
+}
+
+/*
+ * Add a node to a directory
+ */
+static int
+mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node)
+{
+ KASSERT(parent != NULL, ("%s(): parent is NULL", __func__));
+ KASSERT(parent->mn_info != NULL,
+ ("%s(): parent has no mn_info", __func__));
+ KASSERT(parent->mn_type == mqfstype_dir ||
+ parent->mn_type == mqfstype_root,
+ ("%s(): parent is not a directory", __func__));
+
+ node->mn_info = parent->mn_info;
+ node->mn_parent = parent;
+ LIST_INIT(&node->mn_children);
+ LIST_INIT(&node->mn_vnodes);
+ LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling);
+ mqnode_addref(parent);
+ return (0);
+}
+
+static struct mqfs_node *
+mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode,
+ int nodetype)
+{
+ struct mqfs_node *node;
+
+ node = mqnode_alloc();
+ strncpy(node->mn_name, name, namelen);
+ node->mn_type = nodetype;
+ node->mn_refcount = 1;
+ vfs_timestamp(&node->mn_birth);
+ node->mn_ctime = node->mn_atime = node->mn_mtime
+ = node->mn_birth;
+ node->mn_uid = cred->cr_uid;
+ node->mn_gid = cred->cr_gid;
+ node->mn_mode = mode;
+ return (node);
+}
+
+/*
+ * Create a file
+ */
+static struct mqfs_node *
+mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen,
+ struct ucred *cred, int mode)
+{
+ struct mqfs_node *node;
+
+ node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file);
+ if (mqfs_add_node(parent, node) != 0) {
+ mqnode_free(node);
+ return (NULL);
+ }
+ return (node);
+}
+
+/*
+ * Add . and .. to a directory
+ */
+static int
+mqfs_fixup_dir(struct mqfs_node *parent)
+{
+ struct mqfs_node *dir;
+
+ dir = mqnode_alloc();
+ dir->mn_name[0] = '.';
+ dir->mn_type = mqfstype_this;
+ dir->mn_refcount = 1;
+ if (mqfs_add_node(parent, dir) != 0) {
+ mqnode_free(dir);
+ return (-1);
+ }
+
+ dir = mqnode_alloc();
+ dir->mn_name[0] = dir->mn_name[1] = '.';
+ dir->mn_type = mqfstype_parent;
+ dir->mn_refcount = 1;
+
+ if (mqfs_add_node(parent, dir) != 0) {
+ mqnode_free(dir);
+ return (-1);
+ }
+
+ return (0);
+}
+
+#ifdef notyet
+
+/*
+ * Create a directory
+ */
+static struct mqfs_node *
+mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen,
+ struct ucred *cred, int mode)
+{
+ struct mqfs_node *node;
+
+ node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir);
+ if (mqfs_add_node(parent, node) != 0) {
+ mqnode_free(node);
+ return (NULL);
+ }
+
+ if (mqfs_fixup_dir(node) != 0) {
+ mqfs_destroy(node);
+ return (NULL);
+ }
+ return (node);
+}
+
+/*
+ * Create a symlink
+ */
+static struct mqfs_node *
+mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen,
+ struct ucred *cred, int mode)
+{
+ struct mqfs_node *node;
+
+ node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink);
+ if (mqfs_add_node(parent, node) != 0) {
+ mqnode_free(node);
+ return (NULL);
+ }
+ return (node);
+}
+
+#endif
+
+/*
+ * Destroy a node or a tree of nodes
+ */
+static int
+mqfs_destroy(struct mqfs_node *node)
+{
+ struct mqfs_node *parent;
+
+ KASSERT(node != NULL,
+ ("%s(): node is NULL", __func__));
+ KASSERT(node->mn_info != NULL,
+ ("%s(): node has no mn_info", __func__));
+
+ /* destroy children */
+ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root)
+ while (! LIST_EMPTY(&node->mn_children))
+ mqfs_destroy(LIST_FIRST(&node->mn_children));
+
+ /* unlink from parent */
+ if ((parent = node->mn_parent) != NULL) {
+ KASSERT(parent->mn_info == node->mn_info,
+ ("%s(): parent has different mn_info", __func__));
+ LIST_REMOVE(node, mn_sibling);
+ }
+
+ if (node->mn_fileno != 0)
+ mqfs_fileno_free(node->mn_info, node);
+ if (node->mn_data != NULL)
+ mqueue_free(node->mn_data);
+ mqnode_free(node);
+ return (0);
+}
+
+/*
+ * Mount a mqfs instance
+ */
+static int
+mqfs_mount(struct mount *mp)
+{
+ struct statfs *sbp;
+
+ if (mp->mnt_flag & MNT_UPDATE)
+ return (EOPNOTSUPP);
+
+ mp->mnt_data = &mqfs_data;
+ MNT_ILOCK(mp);
+ mp->mnt_flag |= MNT_LOCAL;
+ MNT_IUNLOCK(mp);
+ vfs_getnewfsid(mp);
+
+ sbp = &mp->mnt_stat;
+ vfs_mountedfrom(mp, "mqueue");
+ sbp->f_bsize = PAGE_SIZE;
+ sbp->f_iosize = PAGE_SIZE;
+ sbp->f_blocks = 1;
+ sbp->f_bfree = 0;
+ sbp->f_bavail = 0;
+ sbp->f_files = 1;
+ sbp->f_ffree = 0;
+ return (0);
+}
+
+/*
+ * Unmount a mqfs instance
+ */
+static int
+mqfs_unmount(struct mount *mp, int mntflags)
+{
+ int error;
+
+ error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0,
+ curthread);
+ return (error);
+}
+
+/*
+ * Return a root vnode
+ */
+static int
+mqfs_root(struct mount *mp, int flags, struct vnode **vpp)
+{
+ struct mqfs_info *mqfs;
+ int ret;
+
+ mqfs = VFSTOMQFS(mp);
+ ret = mqfs_allocv(mp, vpp, mqfs->mi_root);
+ return (ret);
+}
+
+/*
+ * Return filesystem stats
+ */
+static int
+mqfs_statfs(struct mount *mp, struct statfs *sbp)
+{
+ /* XXX update statistics */
+ return (0);
+}
+
+/*
+ * Initialize a mqfs instance
+ */
+static int
+mqfs_init(struct vfsconf *vfc)
+{
+ struct mqfs_node *root;
+ struct mqfs_info *mi;
+
+ mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ mvdata_zone = uma_zcreate("mvdata",
+ sizeof(struct mqfs_vdata), NULL, NULL, NULL,
+ NULL, UMA_ALIGN_PTR, 0);
+ mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ mi = &mqfs_data;
+ sx_init(&mi->mi_lock, "mqfs lock");
+	/* set up the root directory */
+ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777,
+ mqfstype_root);
+ root->mn_info = mi;
+ LIST_INIT(&root->mn_children);
+ LIST_INIT(&root->mn_vnodes);
+ mi->mi_root = root;
+ mqfs_fileno_init(mi);
+ mqfs_fileno_alloc(mi, root);
+ mqfs_fixup_dir(root);
+ exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL,
+ EVENTHANDLER_PRI_ANY);
+ mq_fdclose = mqueue_fdclose;
+ p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING);
+ return (0);
+}
+
+/*
+ * Destroy a mqfs instance
+ */
+static int
+mqfs_uninit(struct vfsconf *vfc)
+{
+ struct mqfs_info *mi;
+
+ if (!unloadable)
+ return (EOPNOTSUPP);
+ EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
+ mi = &mqfs_data;
+ mqfs_destroy(mi->mi_root);
+ mi->mi_root = NULL;
+ mqfs_fileno_uninit(mi);
+ sx_destroy(&mi->mi_lock);
+ uma_zdestroy(mqnode_zone);
+ uma_zdestroy(mqueue_zone);
+ uma_zdestroy(mvdata_zone);
+ uma_zdestroy(mqnoti_zone);
+ return (0);
+}
+
+/*
+ * task routine
+ */
+static void
+do_recycle(void *context, int pending __unused)
+{
+ struct vnode *vp = (struct vnode *)context;
+
+ vrecycle(vp);
+ vdrop(vp);
+}
+
+/*
+ * Allocate a vnode
+ */
+static int
+mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn)
+{
+ struct mqfs_vdata *vd;
+ struct mqfs_info *mqfs;
+ struct vnode *newvpp;
+ int error;
+
+ mqfs = pn->mn_info;
+ *vpp = NULL;
+ sx_xlock(&mqfs->mi_lock);
+ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
+ if (vd->mv_vnode->v_mount == mp) {
+ vhold(vd->mv_vnode);
+ break;
+ }
+ }
+
+ if (vd != NULL) {
+found:
+ *vpp = vd->mv_vnode;
+ sx_xunlock(&mqfs->mi_lock);
+ error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread);
+ vdrop(*vpp);
+ return (error);
+ }
+ sx_xunlock(&mqfs->mi_lock);
+
+ error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp);
+ if (error)
+ return (error);
+ vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY);
+ error = insmntque(newvpp, mp);
+ if (error != 0)
+ return (error);
+
+ sx_xlock(&mqfs->mi_lock);
+ /*
+ * Check if it has already been allocated
+ * while we were blocked.
+ */
+ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
+ if (vd->mv_vnode->v_mount == mp) {
+ vhold(vd->mv_vnode);
+ sx_xunlock(&mqfs->mi_lock);
+
+ vgone(newvpp);
+ vput(newvpp);
+ goto found;
+ }
+ }
+
+ *vpp = newvpp;
+
+ vd = uma_zalloc(mvdata_zone, M_WAITOK);
+ (*vpp)->v_data = vd;
+ vd->mv_vnode = *vpp;
+ vd->mv_node = pn;
+ TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp);
+ LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link);
+ mqnode_addref(pn);
+ switch (pn->mn_type) {
+ case mqfstype_root:
+ (*vpp)->v_vflag = VV_ROOT;
+ /* fall through */
+ case mqfstype_dir:
+ case mqfstype_this:
+ case mqfstype_parent:
+ (*vpp)->v_type = VDIR;
+ break;
+ case mqfstype_file:
+ (*vpp)->v_type = VREG;
+ break;
+ case mqfstype_symlink:
+ (*vpp)->v_type = VLNK;
+ break;
+ case mqfstype_none:
+		KASSERT(0, ("mqfs_allocv called for null node\n"));
+ default:
+ panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type);
+ }
+ sx_xunlock(&mqfs->mi_lock);
+ return (0);
+}
+
+/*
+ * Search a directory entry
+ */
+static struct mqfs_node *
+mqfs_search(struct mqfs_node *pd, const char *name, int len)
+{
+ struct mqfs_node *pn;
+
+ sx_assert(&pd->mn_info->mi_lock, SX_LOCKED);
+ LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
+ if (strncmp(pn->mn_name, name, len) == 0 &&
+ pn->mn_name[len] == '\0')
+ return (pn);
+ }
+ return (NULL);
+}
+
+/*
+ * Look up a file or directory.
+ */
+static int
+mqfs_lookupx(struct vop_cachedlookup_args *ap)
+{
+ struct componentname *cnp;
+ struct vnode *dvp, **vpp;
+ struct mqfs_node *pd;
+ struct mqfs_node *pn;
+ struct mqfs_info *mqfs;
+ int nameiop, flags, error, namelen;
+ char *pname;
+ struct thread *td;
+
+ cnp = ap->a_cnp;
+ vpp = ap->a_vpp;
+ dvp = ap->a_dvp;
+ pname = cnp->cn_nameptr;
+ namelen = cnp->cn_namelen;
+ td = cnp->cn_thread;
+ flags = cnp->cn_flags;
+ nameiop = cnp->cn_nameiop;
+ pd = VTON(dvp);
+ pn = NULL;
+ mqfs = pd->mn_info;
+ *vpp = NULLVP;
+
+ if (dvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread);
+ if (error)
+ return (error);
+
+ /* shortcut: check if the name is too long */
+ if (cnp->cn_namelen >= MQFS_NAMELEN)
+ return (ENOENT);
+
+ /* self */
+ if (namelen == 1 && pname[0] == '.') {
+ if ((flags & ISLASTCN) && nameiop != LOOKUP)
+ return (EINVAL);
+ pn = pd;
+ *vpp = dvp;
+ VREF(dvp);
+ return (0);
+ }
+
+ /* parent */
+ if (cnp->cn_flags & ISDOTDOT) {
+ if (dvp->v_vflag & VV_ROOT)
+ return (EIO);
+ if ((flags & ISLASTCN) && nameiop != LOOKUP)
+ return (EINVAL);
+ VOP_UNLOCK(dvp, 0);
+ KASSERT(pd->mn_parent, ("non-root directory has no parent"));
+ pn = pd->mn_parent;
+ error = mqfs_allocv(dvp->v_mount, vpp, pn);
+ vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
+ return (error);
+ }
+
+ /* named node */
+ sx_xlock(&mqfs->mi_lock);
+ pn = mqfs_search(pd, pname, namelen);
+ if (pn != NULL)
+ mqnode_addref(pn);
+ sx_xunlock(&mqfs->mi_lock);
+
+ /* found */
+ if (pn != NULL) {
+ /* DELETE */
+ if (nameiop == DELETE && (flags & ISLASTCN)) {
+ error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
+ if (error) {
+ mqnode_release(pn);
+ return (error);
+ }
+ if (*vpp == dvp) {
+ VREF(dvp);
+ *vpp = dvp;
+ mqnode_release(pn);
+ return (0);
+ }
+ }
+
+ /* allocate vnode */
+ error = mqfs_allocv(dvp->v_mount, vpp, pn);
+ mqnode_release(pn);
+ if (error == 0 && cnp->cn_flags & MAKEENTRY)
+ cache_enter(dvp, *vpp, cnp);
+ return (error);
+ }
+
+ /* not found */
+
+ /* will create a new entry in the directory ? */
+ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT)
+ && (flags & ISLASTCN)) {
+ error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
+ if (error)
+ return (error);
+ cnp->cn_flags |= SAVENAME;
+ return (EJUSTRETURN);
+ }
+ return (ENOENT);
+}
+
+#if 0
+struct vop_lookup_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * vnode lookup operation
+ */
+static int
+mqfs_lookup(struct vop_cachedlookup_args *ap)
+{
+ int rc;
+
+ rc = mqfs_lookupx(ap);
+ return (rc);
+}
+
+#if 0
+struct vop_create_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+};
+#endif
+
+/*
+ * vnode creation operation
+ */
+static int
+mqfs_create(struct vop_create_args *ap)
+{
+ struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+ struct componentname *cnp = ap->a_cnp;
+ struct mqfs_node *pd;
+ struct mqfs_node *pn;
+ struct mqueue *mq;
+ int error;
+
+ pd = VTON(ap->a_dvp);
+ if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
+ return (ENOTDIR);
+ mq = mqueue_alloc(NULL);
+ if (mq == NULL)
+ return (EAGAIN);
+ sx_xlock(&mqfs->mi_lock);
+ if ((cnp->cn_flags & HASBUF) == 0)
+ panic("%s: no name", __func__);
+ pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen,
+ cnp->cn_cred, ap->a_vap->va_mode);
+ if (pn == NULL) {
+ sx_xunlock(&mqfs->mi_lock);
+ error = ENOSPC;
+ } else {
+ mqnode_addref(pn);
+ sx_xunlock(&mqfs->mi_lock);
+ error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
+ mqnode_release(pn);
+ if (error)
+ mqfs_destroy(pn);
+ else
+ pn->mn_data = mq;
+ }
+ if (error)
+ mqueue_free(mq);
+ return (error);
+}
+
+/*
+ * Remove an entry
+ */
+static int
+do_unlink(struct mqfs_node *pn, struct ucred *ucred)
+{
+ struct mqfs_node *parent;
+ struct mqfs_vdata *vd;
+ int error = 0;
+
+ sx_assert(&pn->mn_info->mi_lock, SX_LOCKED);
+
+ if (ucred->cr_uid != pn->mn_uid &&
+ (error = priv_check_cred(ucred, PRIV_MQ_ADMIN, 0)) != 0)
+ error = EACCES;
+ else if (!pn->mn_deleted) {
+ parent = pn->mn_parent;
+ pn->mn_parent = NULL;
+ pn->mn_deleted = 1;
+ LIST_REMOVE(pn, mn_sibling);
+ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
+ cache_purge(vd->mv_vnode);
+ vhold(vd->mv_vnode);
+ taskqueue_enqueue(taskqueue_thread, &vd->mv_task);
+ }
+ mqnode_release(pn);
+ mqnode_release(parent);
+ } else
+ error = ENOENT;
+ return (error);
+}
+
+#if 0
+struct vop_remove_args {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * vnode removal operation
+ */
+static int
+mqfs_remove(struct vop_remove_args *ap)
+{
+ struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+ struct mqfs_node *pn;
+ int error;
+
+ if (ap->a_vp->v_type == VDIR)
+ return (EPERM);
+ pn = VTON(ap->a_vp);
+ sx_xlock(&mqfs->mi_lock);
+ error = do_unlink(pn, ap->a_cnp->cn_cred);
+ sx_xunlock(&mqfs->mi_lock);
+ return (error);
+}
+
+#if 0
+struct vop_inactive_args {
+ struct vnode *a_vp;
+ struct thread *a_td;
+};
+#endif
+
+static int
+mqfs_inactive(struct vop_inactive_args *ap)
+{
+ struct mqfs_node *pn = VTON(ap->a_vp);
+
+ if (pn->mn_deleted)
+ vrecycle(ap->a_vp);
+ return (0);
+}
+
+#if 0
+struct vop_reclaim_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ struct thread *a_td;
+};
+#endif
+
+static int
+mqfs_reclaim(struct vop_reclaim_args *ap)
+{
+ struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount);
+ struct vnode *vp = ap->a_vp;
+ struct mqfs_node *pn;
+ struct mqfs_vdata *vd;
+
+ vd = vp->v_data;
+ pn = vd->mv_node;
+ sx_xlock(&mqfs->mi_lock);
+ vp->v_data = NULL;
+ LIST_REMOVE(vd, mv_link);
+ uma_zfree(mvdata_zone, vd);
+ mqnode_release(pn);
+ sx_xunlock(&mqfs->mi_lock);
+ return (0);
+}
+
+#if 0
+struct vop_open_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ int a_mode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ struct file *a_fp;
+};
+#endif
+
+static int
+mqfs_open(struct vop_open_args *ap)
+{
+ return (0);
+}
+
+#if 0
+struct vop_close_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ int a_fflag;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+mqfs_close(struct vop_close_args *ap)
+{
+ return (0);
+}
+
+#if 0
+struct vop_access_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ accmode_t a_accmode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+/*
+ * Verify permissions
+ */
+static int
+mqfs_access(struct vop_access_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct vattr vattr;
+ int error;
+
+ error = VOP_GETATTR(vp, &vattr, ap->a_cred);
+ if (error)
+ return (error);
+ error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid,
+ vattr.va_gid, ap->a_accmode, ap->a_cred, NULL);
+ return (error);
+}
+
+#if 0
+struct vop_getattr_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+};
+#endif
+
+/*
+ * Get file attributes
+ */
+static int
+mqfs_getattr(struct vop_getattr_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct mqfs_node *pn = VTON(vp);
+ struct vattr *vap = ap->a_vap;
+ int error = 0;
+
+ vap->va_type = vp->v_type;
+ vap->va_mode = pn->mn_mode;
+ vap->va_nlink = 1;
+ vap->va_uid = pn->mn_uid;
+ vap->va_gid = pn->mn_gid;
+ vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
+ vap->va_fileid = pn->mn_fileno;
+ vap->va_size = 0;
+ vap->va_blocksize = PAGE_SIZE;
+ vap->va_bytes = vap->va_size = 0;
+ vap->va_atime = pn->mn_atime;
+ vap->va_mtime = pn->mn_mtime;
+ vap->va_ctime = pn->mn_ctime;
+ vap->va_birthtime = pn->mn_birth;
+ vap->va_gen = 0;
+ vap->va_flags = 0;
+ vap->va_rdev = NODEV;
+ vap->va_bytes = 0;
+ vap->va_filerev = 0;
+ return (error);
+}
+
+#if 0
+struct vop_setattr_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+};
+#endif
+/*
+ * Set attributes
+ */
+static int
+mqfs_setattr(struct vop_setattr_args *ap)
+{
+ struct mqfs_node *pn;
+ struct vattr *vap;
+ struct vnode *vp;
+ struct thread *td;
+ int c, error;
+ uid_t uid;
+ gid_t gid;
+
+ td = curthread;
+ vap = ap->a_vap;
+ vp = ap->a_vp;
+ if ((vap->va_type != VNON) ||
+ (vap->va_nlink != VNOVAL) ||
+ (vap->va_fsid != VNOVAL) ||
+ (vap->va_fileid != VNOVAL) ||
+ (vap->va_blocksize != VNOVAL) ||
+ (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
+ (vap->va_rdev != VNOVAL) ||
+ ((int)vap->va_bytes != VNOVAL) ||
+ (vap->va_gen != VNOVAL)) {
+ return (EINVAL);
+ }
+
+ pn = VTON(vp);
+
+ error = c = 0;
+ if (vap->va_uid == (uid_t)VNOVAL)
+ uid = pn->mn_uid;
+ else
+ uid = vap->va_uid;
+ if (vap->va_gid == (gid_t)VNOVAL)
+ gid = pn->mn_gid;
+ else
+ gid = vap->va_gid;
+
+ if (uid != pn->mn_uid || gid != pn->mn_gid) {
+ /*
+ * To modify the ownership of a file, must possess VADMIN
+ * for that file.
+ */
+ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)))
+ return (error);
+
+ /*
+ * XXXRW: Why is there a privilege check here: shouldn't the
+ * check in VOP_ACCESS() be enough? Also, are the group bits
+ * below definitely right?
+ */
+ if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid ||
+ (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) &&
+ (error = priv_check(td, PRIV_MQ_ADMIN)) != 0)
+ return (error);
+ pn->mn_uid = uid;
+ pn->mn_gid = gid;
+ c = 1;
+ }
+
+ if (vap->va_mode != (mode_t)VNOVAL) {
+ if ((ap->a_cred->cr_uid != pn->mn_uid) &&
+ (error = priv_check(td, PRIV_MQ_ADMIN)))
+ return (error);
+ pn->mn_mode = vap->va_mode;
+ c = 1;
+ }
+
+ if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
+ /* See the comment in ufs_vnops::ufs_setattr(). */
+ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) &&
+ ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
+ (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td))))
+ return (error);
+ if (vap->va_atime.tv_sec != VNOVAL) {
+ pn->mn_atime = vap->va_atime;
+ }
+ if (vap->va_mtime.tv_sec != VNOVAL) {
+ pn->mn_mtime = vap->va_mtime;
+ }
+ c = 1;
+ }
+ if (c) {
+ vfs_timestamp(&pn->mn_ctime);
+ }
+ return (0);
+}
+
+#if 0
+struct vop_read_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ struct ucred *a_cred;
+};
+#endif
+
+/*
+ * Read from a file
+ */
+static int
+mqfs_read(struct vop_read_args *ap)
+{
+ char buf[80];
+ struct vnode *vp = ap->a_vp;
+ struct uio *uio = ap->a_uio;
+ struct mqfs_node *pn;
+ struct mqueue *mq;
+ int len, error;
+
+ if (vp->v_type != VREG)
+ return (EINVAL);
+
+ pn = VTON(vp);
+ mq = VTOMQ(vp);
+ snprintf(buf, sizeof(buf),
+ "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n",
+ mq->mq_totalbytes,
+ mq->mq_maxmsg,
+ mq->mq_curmsgs,
+ mq->mq_msgsize);
+ buf[sizeof(buf)-1] = '\0';
+ len = strlen(buf);
+ error = uiomove_frombuf(buf, len, uio);
+ return (error);
+}
+
+#if 0
+struct vop_readdir_args {
+ struct vop_generic_args a_gen;
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+ int *a_eofflag;
+ int *a_ncookies;
+ u_long **a_cookies;
+};
+#endif
+
+/*
+ * Return directory entries.
+ */
+static int
+mqfs_readdir(struct vop_readdir_args *ap)
+{
+ struct vnode *vp;
+ struct mqfs_info *mi;
+ struct mqfs_node *pd;
+ struct mqfs_node *pn;
+ struct dirent entry;
+ struct uio *uio;
+ int *tmp_ncookies = NULL;
+ off_t offset;
+ int error, i;
+
+ vp = ap->a_vp;
+ mi = VFSTOMQFS(vp->v_mount);
+ pd = VTON(vp);
+ uio = ap->a_uio;
+
+ if (vp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if (uio->uio_offset < 0)
+ return (EINVAL);
+
+ if (ap->a_ncookies != NULL) {
+ tmp_ncookies = ap->a_ncookies;
+ *ap->a_ncookies = 0;
+ ap->a_ncookies = NULL;
+ }
+
+ error = 0;
+ offset = 0;
+
+ sx_xlock(&mi->mi_lock);
+
+ LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
+ entry.d_reclen = sizeof(entry);
+ if (!pn->mn_fileno)
+ mqfs_fileno_alloc(mi, pn);
+ entry.d_fileno = pn->mn_fileno;
+ for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i)
+ entry.d_name[i] = pn->mn_name[i];
+ entry.d_name[i] = 0;
+ entry.d_namlen = i;
+ switch (pn->mn_type) {
+ case mqfstype_root:
+ case mqfstype_dir:
+ case mqfstype_this:
+ case mqfstype_parent:
+ entry.d_type = DT_DIR;
+ break;
+ case mqfstype_file:
+ entry.d_type = DT_REG;
+ break;
+ case mqfstype_symlink:
+ entry.d_type = DT_LNK;
+ break;
+ default:
+ panic("%s has unexpected node type: %d", pn->mn_name,
+ pn->mn_type);
+ }
+ if (entry.d_reclen > uio->uio_resid)
+ break;
+ if (offset >= uio->uio_offset) {
+ error = vfs_read_dirent(ap, &entry, offset);
+ if (error)
+ break;
+ }
+ offset += entry.d_reclen;
+ }
+ sx_xunlock(&mi->mi_lock);
+
+ uio->uio_offset = offset;
+
+ if (tmp_ncookies != NULL)
+ ap->a_ncookies = tmp_ncookies;
+
+ return (error);
+}
+
+#ifdef notyet
+
+#if 0
+struct vop_mkdir_args {
+ struct vnode *a_dvp;
+	struct vnode **a_vpp;
+	struct componentname *a_cnp;
+ struct vattr *a_vap;
+};
+#endif
+
+/*
+ * Create a directory.
+ */
+static int
+mqfs_mkdir(struct vop_mkdir_args *ap)
+{
+ struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+ struct componentname *cnp = ap->a_cnp;
+ struct mqfs_node *pd = VTON(ap->a_dvp);
+ struct mqfs_node *pn;
+ int error;
+
+ if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
+ return (ENOTDIR);
+ sx_xlock(&mqfs->mi_lock);
+ if ((cnp->cn_flags & HASBUF) == 0)
+ panic("%s: no name", __func__);
+ pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen,
+		cnp->cn_cred, ap->a_vap->va_mode);
+ if (pn != NULL)
+ mqnode_addref(pn);
+ sx_xunlock(&mqfs->mi_lock);
+ if (pn == NULL) {
+ error = ENOSPC;
+ } else {
+ error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
+ mqnode_release(pn);
+ }
+ return (error);
+}
+
+#if 0
+struct vop_rmdir_args {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * Remove a directory.
+ */
+static int
+mqfs_rmdir(struct vop_rmdir_args *ap)
+{
+ struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+ struct mqfs_node *pn = VTON(ap->a_vp);
+ struct mqfs_node *pt;
+
+ if (pn->mn_type != mqfstype_dir)
+ return (ENOTDIR);
+
+ sx_xlock(&mqfs->mi_lock);
+ if (pn->mn_deleted) {
+ sx_xunlock(&mqfs->mi_lock);
+ return (ENOENT);
+ }
+
+ pt = LIST_FIRST(&pn->mn_children);
+ pt = LIST_NEXT(pt, mn_sibling);
+ pt = LIST_NEXT(pt, mn_sibling);
+ if (pt != NULL) {
+ sx_xunlock(&mqfs->mi_lock);
+ return (ENOTEMPTY);
+ }
+ pt = pn->mn_parent;
+ pn->mn_parent = NULL;
+ pn->mn_deleted = 1;
+ LIST_REMOVE(pn, mn_sibling);
+ mqnode_release(pn);
+ mqnode_release(pt);
+ sx_xunlock(&mqfs->mi_lock);
+ cache_purge(ap->a_vp);
+ return (0);
+}
+
+#endif /* notyet */
+
+/*
+ * Allocate a message queue
+ */
+static struct mqueue *
+mqueue_alloc(const struct mq_attr *attr)
+{
+ struct mqueue *mq;
+
+ if (curmq >= maxmq)
+ return (NULL);
+ mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO);
+ TAILQ_INIT(&mq->mq_msgq);
+ if (attr != NULL) {
+ mq->mq_maxmsg = attr->mq_maxmsg;
+ mq->mq_msgsize = attr->mq_msgsize;
+ } else {
+ mq->mq_maxmsg = default_maxmsg;
+ mq->mq_msgsize = default_msgsize;
+ }
+ mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF);
+ knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex);
+ knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex);
+ atomic_add_int(&curmq, 1);
+ return (mq);
+}
+
+/*
+ * Destroy a message queue
+ */
+static void
+mqueue_free(struct mqueue *mq)
+{
+ struct mqueue_msg *msg;
+
+ while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) {
+ TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link);
+ free(msg, M_MQUEUEDATA);
+ }
+
+ mtx_destroy(&mq->mq_mutex);
+ seldrain(&mq->mq_rsel);
+ seldrain(&mq->mq_wsel);
+ knlist_destroy(&mq->mq_rsel.si_note);
+ knlist_destroy(&mq->mq_wsel.si_note);
+ uma_zfree(mqueue_zone, mq);
+ atomic_add_int(&curmq, -1);
+}
+
+/*
+ * Load a message from user space
+ */
+static struct mqueue_msg *
+mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio)
+{
+ struct mqueue_msg *msg;
+ size_t len;
+ int error;
+
+ len = sizeof(struct mqueue_msg) + msg_size;
+ msg = malloc(len, M_MQUEUEDATA, M_WAITOK);
+ error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg),
+ msg_size);
+ if (error) {
+ free(msg, M_MQUEUEDATA);
+ msg = NULL;
+ } else {
+ msg->msg_size = msg_size;
+ msg->msg_prio = msg_prio;
+ }
+ return (msg);
+}
+
+/*
+ * Save a message to user space
+ */
+static int
+mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio)
+{
+ int error;
+
+ error = copyout(((char *)msg) + sizeof(*msg), msg_ptr,
+ msg->msg_size);
+ if (error == 0 && msg_prio != NULL)
+ error = copyout(&msg->msg_prio, msg_prio, sizeof(int));
+ return (error);
+}
+
+/*
+ * Free a message's memory
+ */
+static __inline void
+mqueue_freemsg(struct mqueue_msg *msg)
+{
+ free(msg, M_MQUEUEDATA);
+}
+
+/*
+ * Send a message.  If waitok is false, the thread will not be
+ * blocked when the queue is full; otherwise the absolute
+ * timeout is checked.
+ */
+int
+mqueue_send(struct mqueue *mq, const char *msg_ptr,
+ size_t msg_len, unsigned msg_prio, int waitok,
+ const struct timespec *abs_timeout)
+{
+ struct mqueue_msg *msg;
+ struct timespec ts, ts2;
+ struct timeval tv;
+ int error;
+
+ if (msg_prio >= MQ_PRIO_MAX)
+ return (EINVAL);
+ if (msg_len > mq->mq_msgsize)
+ return (EMSGSIZE);
+ msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio);
+ if (msg == NULL)
+ return (EFAULT);
+
+ /* O_NONBLOCK case */
+ if (!waitok) {
+ error = _mqueue_send(mq, msg, -1);
+ if (error)
+ goto bad;
+ return (0);
+ }
+
+ /* we allow a null timeout (wait forever) */
+ if (abs_timeout == NULL) {
+ error = _mqueue_send(mq, msg, 0);
+ if (error)
+ goto bad;
+ return (0);
+ }
+
+ /* send it before checking time */
+ error = _mqueue_send(mq, msg, -1);
+ if (error == 0)
+ return (0);
+
+ if (error != EAGAIN)
+ goto bad;
+
+ if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
+ error = EINVAL;
+ goto bad;
+ }
+ for (;;) {
+ ts2 = *abs_timeout;
+ getnanotime(&ts);
+ timespecsub(&ts2, &ts);
+ if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
+ error = ETIMEDOUT;
+ break;
+ }
+ TIMESPEC_TO_TIMEVAL(&tv, &ts2);
+ error = _mqueue_send(mq, msg, tvtohz(&tv));
+ if (error != ETIMEDOUT)
+ break;
+ }
+ if (error == 0)
+ return (0);
+bad:
+ mqueue_freemsg(msg);
+ return (error);
+}
+
+/*
+ * Common routine to send a message
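+ * (timo < 0 means do not block, timo == 0 means block indefinitely,
+ * timo > 0 is a timeout in ticks)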
+ */
+static int
+_mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo)
+{
+ struct mqueue_msg *msg2;
+ int error = 0;
+
+ mtx_lock(&mq->mq_mutex);
+ while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) {
+ if (timo < 0) {
+ mtx_unlock(&mq->mq_mutex);
+ return (EAGAIN);
+ }
+ mq->mq_senders++;
+ error = msleep(&mq->mq_senders, &mq->mq_mutex,
+ PCATCH, "mqsend", timo);
+ mq->mq_senders--;
+ if (error == EAGAIN)
+ error = ETIMEDOUT;
+ }
+ if (mq->mq_curmsgs >= mq->mq_maxmsg) {
+ mtx_unlock(&mq->mq_mutex);
+ return (error);
+ }
+ error = 0;
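+	/*
+	 * Keep the queue sorted by descending priority, FIFO among equal
+	 * priorities: a new message is placed after all existing messages
+	 * of greater or equal priority.
+	 */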
+ if (TAILQ_EMPTY(&mq->mq_msgq)) {
+ TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link);
+ } else {
+ if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) {
+ TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link);
+ } else {
+ TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) {
+ if (msg2->msg_prio < msg->msg_prio)
+ break;
+ }
+ TAILQ_INSERT_BEFORE(msg2, msg, msg_link);
+ }
+ }
+ mq->mq_curmsgs++;
+ mq->mq_totalbytes += msg->msg_size;
+ if (mq->mq_receivers)
+ wakeup_one(&mq->mq_receivers);
+ else if (mq->mq_notifier != NULL)
+ mqueue_send_notification(mq);
+ if (mq->mq_flags & MQ_RSEL) {
+ mq->mq_flags &= ~MQ_RSEL;
+ selwakeup(&mq->mq_rsel);
+ }
+ KNOTE_LOCKED(&mq->mq_rsel.si_note, 0);
+ mtx_unlock(&mq->mq_mutex);
+ return (0);
+}
+
+/*
+ * Send a realtime signal to the process which registered itself
+ * successfully via mq_notify.
+ */
+static void
+mqueue_send_notification(struct mqueue *mq)
+{
+ struct mqueue_notifier *nt;
+ struct thread *td;
+ struct proc *p;
+ int error;
+
+ mtx_assert(&mq->mq_mutex, MA_OWNED);
+ nt = mq->mq_notifier;
+ if (nt->nt_sigev.sigev_notify != SIGEV_NONE) {
+ p = nt->nt_proc;
+ error = sigev_findtd(p, &nt->nt_sigev, &td);
+ if (error) {
+ mq->mq_notifier = NULL;
+ return;
+ }
+ if (!KSI_ONQ(&nt->nt_ksi)) {
+ ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev);
+ tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi);
+ }
+ PROC_UNLOCK(p);
+ }
+ mq->mq_notifier = NULL;
+}
+
+/*
+ * Get a message.  If waitok is false, the thread will not be
+ * blocked when there is no data in the queue; otherwise the
+ * absolute timeout is checked.
+ */
+int
+mqueue_receive(struct mqueue *mq, char *msg_ptr,
+ size_t msg_len, unsigned *msg_prio, int waitok,
+ const struct timespec *abs_timeout)
+{
+ struct mqueue_msg *msg;
+ struct timespec ts, ts2;
+ struct timeval tv;
+ int error;
+
+ if (msg_len < mq->mq_msgsize)
+ return (EMSGSIZE);
+
+ /* O_NONBLOCK case */
+ if (!waitok) {
+ error = _mqueue_recv(mq, &msg, -1);
+ if (error)
+ return (error);
+ goto received;
+ }
+
+ /* we allow a null timeout (wait forever). */
+ if (abs_timeout == NULL) {
+ error = _mqueue_recv(mq, &msg, 0);
+ if (error)
+ return (error);
+ goto received;
+ }
+
+ /* try to get a message before checking time */
+ error = _mqueue_recv(mq, &msg, -1);
+ if (error == 0)
+ goto received;
+
+ if (error != EAGAIN)
+ return (error);
+
+ if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
+ error = EINVAL;
+ return (error);
+ }
+
+ for (;;) {
+ ts2 = *abs_timeout;
+ getnanotime(&ts);
+ timespecsub(&ts2, &ts);
+ if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
+ error = ETIMEDOUT;
+ return (error);
+ }
+ TIMESPEC_TO_TIMEVAL(&tv, &ts2);
+ error = _mqueue_recv(mq, &msg, tvtohz(&tv));
+ if (error == 0)
+ break;
+ if (error != ETIMEDOUT)
+ return (error);
+ }
+
+received:
+ error = mqueue_savemsg(msg, msg_ptr, msg_prio);
+ if (error == 0) {
+ curthread->td_retval[0] = msg->msg_size;
+ curthread->td_retval[1] = 0;
+ }
+ mqueue_freemsg(msg);
+ return (error);
+}
+
+/*
+ * Common routine to receive a message
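+ * (timo < 0 means do not block, timo == 0 means block indefinitely,
+ * timo > 0 is a timeout in ticks)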
+ */
+static int
+_mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo)
+{
+ int error = 0;
+
+ mtx_lock(&mq->mq_mutex);
+ while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) {
+ if (timo < 0) {
+ mtx_unlock(&mq->mq_mutex);
+ return (EAGAIN);
+ }
+ mq->mq_receivers++;
+ error = msleep(&mq->mq_receivers, &mq->mq_mutex,
+ PCATCH, "mqrecv", timo);
+ mq->mq_receivers--;
+ if (error == EAGAIN)
+ error = ETIMEDOUT;
+ }
+ if (*msg != NULL) {
+ error = 0;
+ TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link);
+ mq->mq_curmsgs--;
+ mq->mq_totalbytes -= (*msg)->msg_size;
+ if (mq->mq_senders)
+ wakeup_one(&mq->mq_senders);
+ if (mq->mq_flags & MQ_WSEL) {
+ mq->mq_flags &= ~MQ_WSEL;
+ selwakeup(&mq->mq_wsel);
+ }
+ KNOTE_LOCKED(&mq->mq_wsel.si_note, 0);
+ }
+ if (mq->mq_notifier != NULL && mq->mq_receivers == 0 &&
+ !TAILQ_EMPTY(&mq->mq_msgq)) {
+ mqueue_send_notification(mq);
+ }
+ mtx_unlock(&mq->mq_mutex);
+ return (error);
+}
+
+static __inline struct mqueue_notifier *
+notifier_alloc(void)
+{
+ return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO));
+}
+
+static __inline void
+notifier_free(struct mqueue_notifier *p)
+{
+ uma_zfree(mqnoti_zone, p);
+}
+
+static struct mqueue_notifier *
+notifier_search(struct proc *p, int fd)
+{
+ struct mqueue_notifier *nt;
+
+ LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) {
+ if (nt->nt_ksi.ksi_mqd == fd)
+ break;
+ }
+ return (nt);
+}
+
+static __inline void
+notifier_insert(struct proc *p, struct mqueue_notifier *nt)
+{
+ LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link);
+}
+
+static __inline void
+notifier_delete(struct proc *p, struct mqueue_notifier *nt)
+{
+ LIST_REMOVE(nt, nt_link);
+ notifier_free(nt);
+}
+
+static void
+notifier_remove(struct proc *p, struct mqueue *mq, int fd)
+{
+ struct mqueue_notifier *nt;
+
+ mtx_assert(&mq->mq_mutex, MA_OWNED);
+ PROC_LOCK(p);
+ nt = notifier_search(p, fd);
+ if (nt != NULL) {
+ if (mq->mq_notifier == nt)
+ mq->mq_notifier = NULL;
+ sigqueue_take(&nt->nt_ksi);
+ notifier_delete(p, nt);
+ }
+ PROC_UNLOCK(p);
+}
+
+static int
+kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode,
+ const struct mq_attr *attr)
+{
+ char path[MQFS_NAMELEN + 1];
+ struct mqfs_node *pn;
+ struct filedesc *fdp;
+ struct file *fp;
+ struct mqueue *mq;
+ int fd, error, len, cmode;
+
+ fdp = td->td_proc->p_fd;
+ cmode = (((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT);
+ mq = NULL;
+ if ((flags & O_CREAT) != 0 && attr != NULL) {
+ if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg)
+ return (EINVAL);
+ if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize)
+ return (EINVAL);
+ }
+
+ error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL);
+ if (error)
+ return (error);
+
+ /*
+ * The first character of name must be a slash (/) character
+ * and the remaining characters of name cannot include any slash
+ * characters.
+ */
+ len = strlen(path);
+ if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
+ return (EINVAL);
+
+ error = falloc(td, &fp, &fd, O_CLOEXEC);
+ if (error)
+ return (error);
+
+ sx_xlock(&mqfs_data.mi_lock);
+ pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
+ if (pn == NULL) {
+ if (!(flags & O_CREAT)) {
+ error = ENOENT;
+ } else {
+ mq = mqueue_alloc(attr);
+ if (mq == NULL) {
+ error = ENFILE;
+ } else {
+ pn = mqfs_create_file(mqfs_data.mi_root,
+ path + 1, len - 1, td->td_ucred,
+ cmode);
+ if (pn == NULL) {
+ error = ENOSPC;
+ mqueue_free(mq);
+ }
+ }
+ }
+
+ if (error == 0) {
+ pn->mn_data = mq;
+ }
+ } else {
+ if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) {
+ error = EEXIST;
+ } else {
+ accmode_t accmode = 0;
+
+ if (flags & FREAD)
+ accmode |= VREAD;
+ if (flags & FWRITE)
+ accmode |= VWRITE;
+ error = vaccess(VREG, pn->mn_mode, pn->mn_uid,
+ pn->mn_gid, accmode, td->td_ucred, NULL);
+ }
+ }
+
+ if (error) {
+ sx_xunlock(&mqfs_data.mi_lock);
+ fdclose(fdp, fp, fd, td);
+ fdrop(fp, td);
+ return (error);
+ }
+
+ mqnode_addref(pn);
+ sx_xunlock(&mqfs_data.mi_lock);
+
+ finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn,
+ &mqueueops);
+
+ td->td_retval[0] = fd;
+ fdrop(fp, td);
+ return (0);
+}
+
+/*
+ * Syscall to open a message queue.
+ */
+int
+sys_kmq_open(struct thread *td, struct kmq_open_args *uap)
+{
+ struct mq_attr attr;
+ int flags, error;
+
+ if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC)
+ return (EINVAL);
+ flags = FFLAGS(uap->flags);
+ if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
+ error = copyin(uap->attr, &attr, sizeof(attr));
+ if (error)
+ return (error);
+ }
+ return (kern_kmq_open(td, uap->path, flags, uap->mode,
+ uap->attr != NULL ? &attr : NULL));
+}
+
+/*
+ * Syscall to unlink a message queue.
+ */
+int
+sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap)
+{
+ char path[MQFS_NAMELEN+1];
+ struct mqfs_node *pn;
+ int error, len;
+
+ error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
+ if (error)
+ return (error);
+
+ len = strlen(path);
+ if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
+ return (EINVAL);
+
+ sx_xlock(&mqfs_data.mi_lock);
+ pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
+ if (pn != NULL)
+ error = do_unlink(pn, td->td_ucred);
+ else
+ error = ENOENT;
+ sx_xunlock(&mqfs_data.mi_lock);
+ return (error);
+}
+
+typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **);
+
+/*
+ * Get the message queue associated with a file descriptor.
+ */
+static int
+_getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func,
+ struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq)
+{
+ struct mqfs_node *pn;
+ int error;
+
+ error = func(td, fd, rightsp, fpp);
+ if (error)
+ return (error);
+ if (&mqueueops != (*fpp)->f_ops) {
+ fdrop(*fpp, td);
+ return (EBADF);
+ }
+ pn = (*fpp)->f_data;
+ if (ppn)
+ *ppn = pn;
+ if (pmq)
+ *pmq = pn->mn_data;
+ return (0);
+}
+
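+/*
+ * Wrappers around _getmq() that supply the capability rights used by
+ * their callers: CAP_POLL_EVENT (getmq), CAP_READ (getmq_read) and
+ * CAP_WRITE (getmq_write).
+ */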
+static __inline int
+getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn,
+ struct mqueue **pmq)
+{
+ cap_rights_t rights;
+
+ return _getmq(td, fd, cap_rights_init(&rights, CAP_POLL_EVENT), fget,
+ fpp, ppn, pmq);
+}
+
+static __inline int
+getmq_read(struct thread *td, int fd, struct file **fpp,
+ struct mqfs_node **ppn, struct mqueue **pmq)
+{
+ cap_rights_t rights;
+
+ return _getmq(td, fd, cap_rights_init(&rights, CAP_READ), fget_read,
+ fpp, ppn, pmq);
+}
+
+static __inline int
+getmq_write(struct thread *td, int fd, struct file **fpp,
+ struct mqfs_node **ppn, struct mqueue **pmq)
+{
+ cap_rights_t rights;
+
+ return _getmq(td, fd, cap_rights_init(&rights, CAP_WRITE), fget_write,
+ fpp, ppn, pmq);
+}
+
+static int
+kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr,
+ struct mq_attr *oattr)
+{
+ struct mqueue *mq;
+ struct file *fp;
+ u_int oflag, flag;
+ int error;
+
+ if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0)
+ return (EINVAL);
+ error = getmq(td, mqd, &fp, NULL, &mq);
+ if (error)
+ return (error);
+ oattr->mq_maxmsg = mq->mq_maxmsg;
+ oattr->mq_msgsize = mq->mq_msgsize;
+ oattr->mq_curmsgs = mq->mq_curmsgs;
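+	/*
+	 * If new attributes were supplied, update the O_NONBLOCK bit of
+	 * f_flag with a compare-and-swap loop so that concurrent updates
+	 * to the file flags are not lost.
+	 */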
+ if (attr != NULL) {
+ do {
+ oflag = flag = fp->f_flag;
+ flag &= ~O_NONBLOCK;
+ flag |= (attr->mq_flags & O_NONBLOCK);
+ } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0);
+ } else
+ oflag = fp->f_flag;
+ oattr->mq_flags = (O_NONBLOCK & oflag);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
+{
+ struct mq_attr attr, oattr;
+ int error;
+
+ if (uap->attr != NULL) {
+ error = copyin(uap->attr, &attr, sizeof(attr));
+ if (error != 0)
+ return (error);
+ }
+ error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
+ &oattr);
+ if (error != 0)
+ return (error);
+ if (uap->oattr != NULL)
+ error = copyout(&oattr, uap->oattr, sizeof(oattr));
+ return (error);
+}
+
+int
+sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap)
+{
+ struct mqueue *mq;
+ struct file *fp;
+ struct timespec *abs_timeout, ets;
+ int error;
+ int waitok;
+
+ error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
+ if (error)
+ return (error);
+ if (uap->abs_timeout != NULL) {
+ error = copyin(uap->abs_timeout, &ets, sizeof(ets));
+		if (error != 0) {
+			fdrop(fp, td);
+			return (error);
+		}
+ abs_timeout = &ets;
+ } else
+ abs_timeout = NULL;
+ waitok = !(fp->f_flag & O_NONBLOCK);
+ error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
+ uap->msg_prio, waitok, abs_timeout);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap)
+{
+ struct mqueue *mq;
+ struct file *fp;
+ struct timespec *abs_timeout, ets;
+ int error, waitok;
+
+ error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
+ if (error)
+ return (error);
+ if (uap->abs_timeout != NULL) {
+ error = copyin(uap->abs_timeout, &ets, sizeof(ets));
+		if (error != 0) {
+			fdrop(fp, td);
+			return (error);
+		}
+ abs_timeout = &ets;
+ } else
+ abs_timeout = NULL;
+ waitok = !(fp->f_flag & O_NONBLOCK);
+ error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
+ uap->msg_prio, waitok, abs_timeout);
+ fdrop(fp, td);
+ return (error);
+}
+
+static int
+kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev)
+{
+#ifdef CAPABILITIES
+ cap_rights_t rights;
+#endif
+ struct filedesc *fdp;
+ struct proc *p;
+ struct mqueue *mq;
+ struct file *fp, *fp2;
+ struct mqueue_notifier *nt, *newnt = NULL;
+ int error;
+
+ if (sigev != NULL) {
+ if (sigev->sigev_notify != SIGEV_SIGNAL &&
+ sigev->sigev_notify != SIGEV_THREAD_ID &&
+ sigev->sigev_notify != SIGEV_NONE)
+ return (EINVAL);
+ if ((sigev->sigev_notify == SIGEV_SIGNAL ||
+ sigev->sigev_notify == SIGEV_THREAD_ID) &&
+ !_SIG_VALID(sigev->sigev_signo))
+ return (EINVAL);
+ }
+ p = td->td_proc;
+ fdp = td->td_proc->p_fd;
+ error = getmq(td, mqd, &fp, NULL, &mq);
+ if (error)
+ return (error);
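+	/*
+	 * A new notifier may have to be allocated with M_WAITOK, which can
+	 * sleep, so the allocation is done with no locks held and the
+	 * descriptor lookup is then retried from the top.
+	 */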
+again:
+ FILEDESC_SLOCK(fdp);
+ fp2 = fget_locked(fdp, mqd);
+ if (fp2 == NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ error = EBADF;
+ goto out;
+ }
+#ifdef CAPABILITIES
+ error = cap_check(cap_rights(fdp, mqd),
+ cap_rights_init(&rights, CAP_POLL_EVENT));
+ if (error) {
+ FILEDESC_SUNLOCK(fdp);
+ goto out;
+ }
+#endif
+ if (fp2 != fp) {
+ FILEDESC_SUNLOCK(fdp);
+ error = EBADF;
+ goto out;
+ }
+ mtx_lock(&mq->mq_mutex);
+ FILEDESC_SUNLOCK(fdp);
+ if (sigev != NULL) {
+ if (mq->mq_notifier != NULL) {
+ error = EBUSY;
+ } else {
+ PROC_LOCK(p);
+ nt = notifier_search(p, mqd);
+ if (nt == NULL) {
+ if (newnt == NULL) {
+ PROC_UNLOCK(p);
+ mtx_unlock(&mq->mq_mutex);
+ newnt = notifier_alloc();
+ goto again;
+ }
+ }
+
+ if (nt != NULL) {
+ sigqueue_take(&nt->nt_ksi);
+ if (newnt != NULL) {
+ notifier_free(newnt);
+ newnt = NULL;
+ }
+ } else {
+ nt = newnt;
+ newnt = NULL;
+ ksiginfo_init(&nt->nt_ksi);
+ nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT;
+ nt->nt_ksi.ksi_code = SI_MESGQ;
+ nt->nt_proc = p;
+ nt->nt_ksi.ksi_mqd = mqd;
+ notifier_insert(p, nt);
+ }
+ nt->nt_sigev = *sigev;
+ mq->mq_notifier = nt;
+ PROC_UNLOCK(p);
+ /*
+			 * If there are no receivers and the message queue
+			 * is not empty, send the notification as soon as
+			 * possible.
+ */
+ if (mq->mq_receivers == 0 &&
+ !TAILQ_EMPTY(&mq->mq_msgq))
+ mqueue_send_notification(mq);
+ }
+ } else {
+ notifier_remove(p, mq, mqd);
+ }
+ mtx_unlock(&mq->mq_mutex);
+
+out:
+ fdrop(fp, td);
+ if (newnt != NULL)
+ notifier_free(newnt);
+ return (error);
+}
+
+int
+sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap)
+{
+ struct sigevent ev, *evp;
+ int error;
+
+ if (uap->sigev == NULL) {
+ evp = NULL;
+ } else {
+ error = copyin(uap->sigev, &ev, sizeof(ev));
+ if (error != 0)
+ return (error);
+ evp = &ev;
+ }
+ return (kern_kmq_notify(td, uap->mqd, evp));
+}
+
+static void
+mqueue_fdclose(struct thread *td, int fd, struct file *fp)
+{
+ struct filedesc *fdp;
+ struct mqueue *mq;
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_LOCK_ASSERT(fdp);
+
+ if (fp->f_ops == &mqueueops) {
+ mq = FPTOMQ(fp);
+ mtx_lock(&mq->mq_mutex);
+ notifier_remove(td->td_proc, mq, fd);
+
+		/* Wake up select/poll waiters in the same process. */
+ if (mq->mq_flags & MQ_RSEL) {
+ mq->mq_flags &= ~MQ_RSEL;
+ selwakeup(&mq->mq_rsel);
+ }
+ if (mq->mq_flags & MQ_WSEL) {
+ mq->mq_flags &= ~MQ_WSEL;
+ selwakeup(&mq->mq_wsel);
+ }
+ mtx_unlock(&mq->mq_mutex);
+ }
+}
+
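+/*
+ * On process exit, scan the process's open files and remove any
+ * notifiers it registered on message queue descriptors.
+ */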
+static void
+mq_proc_exit(void *arg __unused, struct proc *p)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+ struct mqueue *mq;
+ int i;
+
+ fdp = p->p_fd;
+ FILEDESC_SLOCK(fdp);
+ for (i = 0; i < fdp->fd_nfiles; ++i) {
+ fp = fget_locked(fdp, i);
+ if (fp != NULL && fp->f_ops == &mqueueops) {
+ mq = FPTOMQ(fp);
+ mtx_lock(&mq->mq_mutex);
+ notifier_remove(p, FPTOMQ(fp), i);
+ mtx_unlock(&mq->mq_mutex);
+ }
+ }
+ FILEDESC_SUNLOCK(fdp);
+ KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left"));
+}
+
+static int
+mqf_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ return (EOPNOTSUPP);
+}
+
+static int
+mqf_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ return (EOPNOTSUPP);
+}
+
+static int
+mqf_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+static int
+mqf_ioctl(struct file *fp, u_long cmd, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+ return (ENOTTY);
+}
+
+static int
+mqf_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct mqueue *mq = FPTOMQ(fp);
+ int revents = 0;
+
+ mtx_lock(&mq->mq_mutex);
+ if (events & (POLLIN | POLLRDNORM)) {
+ if (mq->mq_curmsgs) {
+ revents |= events & (POLLIN | POLLRDNORM);
+ } else {
+ mq->mq_flags |= MQ_RSEL;
+ selrecord(td, &mq->mq_rsel);
+ }
+ }
+ if (events & POLLOUT) {
+ if (mq->mq_curmsgs < mq->mq_maxmsg)
+ revents |= POLLOUT;
+ else {
+ mq->mq_flags |= MQ_WSEL;
+ selrecord(td, &mq->mq_wsel);
+ }
+ }
+ mtx_unlock(&mq->mq_mutex);
+ return (revents);
+}
+
+static int
+mqf_close(struct file *fp, struct thread *td)
+{
+ struct mqfs_node *pn;
+
+ fp->f_ops = &badfileops;
+ pn = fp->f_data;
+ fp->f_data = NULL;
+ sx_xlock(&mqfs_data.mi_lock);
+ mqnode_release(pn);
+ sx_xunlock(&mqfs_data.mi_lock);
+ return (0);
+}
+
+static int
+mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct mqfs_node *pn = fp->f_data;
+
+ bzero(st, sizeof *st);
+ sx_xlock(&mqfs_data.mi_lock);
+ st->st_atim = pn->mn_atime;
+ st->st_mtim = pn->mn_mtime;
+ st->st_ctim = pn->mn_ctime;
+ st->st_birthtim = pn->mn_birth;
+ st->st_uid = pn->mn_uid;
+ st->st_gid = pn->mn_gid;
+ st->st_mode = S_IFIFO | pn->mn_mode;
+ sx_xunlock(&mqfs_data.mi_lock);
+ return (0);
+}
+
+static int
+mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct mqfs_node *pn;
+ int error;
+
+ error = 0;
+ pn = fp->f_data;
+ sx_xlock(&mqfs_data.mi_lock);
+ error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN,
+ active_cred, NULL);
+ if (error != 0)
+ goto out;
+ pn->mn_mode = mode & ACCESSPERMS;
+out:
+ sx_xunlock(&mqfs_data.mi_lock);
+ return (error);
+}
+
+static int
+mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct mqfs_node *pn;
+ int error;
+
+ error = 0;
+ pn = fp->f_data;
+ sx_xlock(&mqfs_data.mi_lock);
+ if (uid == (uid_t)-1)
+ uid = pn->mn_uid;
+ if (gid == (gid_t)-1)
+ gid = pn->mn_gid;
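+	/*
+	 * Changing the owner to a different uid, or the group to one the
+	 * caller is not a member of, requires the PRIV_VFS_CHOWN privilege.
+	 */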
+ if (((uid != pn->mn_uid && uid != active_cred->cr_uid) ||
+ (gid != pn->mn_gid && !groupmember(gid, active_cred))) &&
+ (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
+ goto out;
+ pn->mn_uid = uid;
+ pn->mn_gid = gid;
+out:
+ sx_xunlock(&mqfs_data.mi_lock);
+ return (error);
+}
+
+static int
+mqf_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct mqueue *mq = FPTOMQ(fp);
+ int error = 0;
+
+ if (kn->kn_filter == EVFILT_READ) {
+ kn->kn_fop = &mq_rfiltops;
+ knlist_add(&mq->mq_rsel.si_note, kn, 0);
+ } else if (kn->kn_filter == EVFILT_WRITE) {
+ kn->kn_fop = &mq_wfiltops;
+ knlist_add(&mq->mq_wsel.si_note, kn, 0);
+ } else
+ error = EINVAL;
+ return (error);
+}
+
+static void
+filt_mqdetach(struct knote *kn)
+{
+ struct mqueue *mq = FPTOMQ(kn->kn_fp);
+
+ if (kn->kn_filter == EVFILT_READ)
+ knlist_remove(&mq->mq_rsel.si_note, kn, 0);
+ else if (kn->kn_filter == EVFILT_WRITE)
+ knlist_remove(&mq->mq_wsel.si_note, kn, 0);
+ else
+ panic("filt_mqdetach");
+}
+
+static int
+filt_mqread(struct knote *kn, long hint)
+{
+ struct mqueue *mq = FPTOMQ(kn->kn_fp);
+
+ mtx_assert(&mq->mq_mutex, MA_OWNED);
+ return (mq->mq_curmsgs != 0);
+}
+
+static int
+filt_mqwrite(struct knote *kn, long hint)
+{
+ struct mqueue *mq = FPTOMQ(kn->kn_fp);
+
+ mtx_assert(&mq->mq_mutex, MA_OWNED);
+ return (mq->mq_curmsgs < mq->mq_maxmsg);
+}
+
+static struct fileops mqueueops = {
+ .fo_read = mqf_read,
+ .fo_write = mqf_write,
+ .fo_truncate = mqf_truncate,
+ .fo_ioctl = mqf_ioctl,
+ .fo_poll = mqf_poll,
+ .fo_kqfilter = mqf_kqfilter,
+ .fo_stat = mqf_stat,
+ .fo_chmod = mqf_chmod,
+ .fo_chown = mqf_chown,
+ .fo_close = mqf_close,
+ .fo_sendfile = invfo_sendfile,
+};
+
+static struct vop_vector mqfs_vnodeops = {
+ .vop_default = &default_vnodeops,
+ .vop_access = mqfs_access,
+ .vop_cachedlookup = mqfs_lookup,
+ .vop_lookup = vfs_cache_lookup,
+ .vop_reclaim = mqfs_reclaim,
+ .vop_create = mqfs_create,
+ .vop_remove = mqfs_remove,
+ .vop_inactive = mqfs_inactive,
+ .vop_open = mqfs_open,
+ .vop_close = mqfs_close,
+ .vop_getattr = mqfs_getattr,
+ .vop_setattr = mqfs_setattr,
+ .vop_read = mqfs_read,
+ .vop_write = VOP_EOPNOTSUPP,
+ .vop_readdir = mqfs_readdir,
+ .vop_mkdir = VOP_EOPNOTSUPP,
+ .vop_rmdir = VOP_EOPNOTSUPP
+};
+
+static struct vfsops mqfs_vfsops = {
+ .vfs_init = mqfs_init,
+ .vfs_uninit = mqfs_uninit,
+ .vfs_mount = mqfs_mount,
+ .vfs_unmount = mqfs_unmount,
+ .vfs_root = mqfs_root,
+ .vfs_statfs = mqfs_statfs,
+};
+
+static struct vfsconf mqueuefs_vfsconf = {
+ .vfc_version = VFS_VERSION,
+ .vfc_name = "mqueuefs",
+ .vfc_vfsops = &mqfs_vfsops,
+ .vfc_typenum = -1,
+ .vfc_flags = VFCF_SYNTHETIC
+};
+
+static struct syscall_helper_data mq_syscalls[] = {
+ SYSCALL_INIT_HELPER(kmq_open),
+ SYSCALL_INIT_HELPER(kmq_setattr),
+ SYSCALL_INIT_HELPER(kmq_timedsend),
+ SYSCALL_INIT_HELPER(kmq_timedreceive),
+ SYSCALL_INIT_HELPER(kmq_notify),
+ SYSCALL_INIT_HELPER(kmq_unlink),
+ SYSCALL_INIT_LAST
+};
+
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/freebsd32/freebsd32_util.h>
+
+static void
+mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to)
+{
+
+ to->mq_flags = from->mq_flags;
+ to->mq_maxmsg = from->mq_maxmsg;
+ to->mq_msgsize = from->mq_msgsize;
+ to->mq_curmsgs = from->mq_curmsgs;
+}
+
+static void
+mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to)
+{
+
+ to->mq_flags = from->mq_flags;
+ to->mq_maxmsg = from->mq_maxmsg;
+ to->mq_msgsize = from->mq_msgsize;
+ to->mq_curmsgs = from->mq_curmsgs;
+}
+
+int
+freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap)
+{
+ struct mq_attr attr;
+ struct mq_attr32 attr32;
+ int flags, error;
+
+ if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC)
+ return (EINVAL);
+ flags = FFLAGS(uap->flags);
+ if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
+ error = copyin(uap->attr, &attr32, sizeof(attr32));
+ if (error)
+ return (error);
+ mq_attr_from32(&attr32, &attr);
+ }
+ return (kern_kmq_open(td, uap->path, flags, uap->mode,
+ uap->attr != NULL ? &attr : NULL));
+}
+
+int
+freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap)
+{
+ struct mq_attr attr, oattr;
+ struct mq_attr32 attr32, oattr32;
+ int error;
+
+ if (uap->attr != NULL) {
+ error = copyin(uap->attr, &attr32, sizeof(attr32));
+ if (error != 0)
+ return (error);
+ mq_attr_from32(&attr32, &attr);
+ }
+ error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
+ &oattr);
+ if (error != 0)
+ return (error);
+ if (uap->oattr != NULL) {
+ mq_attr_to32(&oattr, &oattr32);
+ error = copyout(&oattr32, uap->oattr, sizeof(oattr32));
+ }
+ return (error);
+}
+
+int
+freebsd32_kmq_timedsend(struct thread *td,
+ struct freebsd32_kmq_timedsend_args *uap)
+{
+ struct mqueue *mq;
+ struct file *fp;
+ struct timespec32 ets32;
+ struct timespec *abs_timeout, ets;
+ int error;
+ int waitok;
+
+ error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
+ if (error)
+ return (error);
+ if (uap->abs_timeout != NULL) {
+ error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
+		if (error != 0) {
+			fdrop(fp, td);
+			return (error);
+		}
+ CP(ets32, ets, tv_sec);
+ CP(ets32, ets, tv_nsec);
+ abs_timeout = &ets;
+ } else
+ abs_timeout = NULL;
+ waitok = !(fp->f_flag & O_NONBLOCK);
+ error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
+ uap->msg_prio, waitok, abs_timeout);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+freebsd32_kmq_timedreceive(struct thread *td,
+ struct freebsd32_kmq_timedreceive_args *uap)
+{
+ struct mqueue *mq;
+ struct file *fp;
+ struct timespec32 ets32;
+ struct timespec *abs_timeout, ets;
+ int error, waitok;
+
+ error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
+ if (error)
+ return (error);
+ if (uap->abs_timeout != NULL) {
+ error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
+		if (error != 0) {
+			fdrop(fp, td);
+			return (error);
+		}
+ CP(ets32, ets, tv_sec);
+ CP(ets32, ets, tv_nsec);
+ abs_timeout = &ets;
+ } else
+ abs_timeout = NULL;
+ waitok = !(fp->f_flag & O_NONBLOCK);
+ error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
+ uap->msg_prio, waitok, abs_timeout);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap)
+{
+ struct sigevent ev, *evp;
+ struct sigevent32 ev32;
+ int error;
+
+ if (uap->sigev == NULL) {
+ evp = NULL;
+ } else {
+ error = copyin(uap->sigev, &ev32, sizeof(ev32));
+ if (error != 0)
+ return (error);
+ error = convert_sigevent32(&ev32, &ev);
+ if (error != 0)
+ return (error);
+ evp = &ev;
+ }
+ return (kern_kmq_notify(td, uap->mqd, evp));
+}
+
+static struct syscall_helper_data mq32_syscalls[] = {
+ SYSCALL32_INIT_HELPER(freebsd32_kmq_open),
+ SYSCALL32_INIT_HELPER(freebsd32_kmq_setattr),
+ SYSCALL32_INIT_HELPER(freebsd32_kmq_timedsend),
+ SYSCALL32_INIT_HELPER(freebsd32_kmq_timedreceive),
+ SYSCALL32_INIT_HELPER(freebsd32_kmq_notify),
+ SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink),
+ SYSCALL_INIT_LAST
+};
+#endif
+
+static int
+mqinit(void)
+{
+ int error;
+
+ error = syscall_helper_register(mq_syscalls);
+ if (error != 0)
+ return (error);
+#ifdef COMPAT_FREEBSD32
+ error = syscall32_helper_register(mq32_syscalls);
+ if (error != 0)
+ return (error);
+#endif
+ return (0);
+}
+
+static int
+mqunload(void)
+{
+
+#ifdef COMPAT_FREEBSD32
+ syscall32_helper_unregister(mq32_syscalls);
+#endif
+ syscall_helper_unregister(mq_syscalls);
+ return (0);
+}
+
+static int
+mq_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ error = vfs_modevent(module, cmd, arg);
+ if (error != 0)
+ return (error);
+
+ switch (cmd) {
+ case MOD_LOAD:
+ error = mqinit();
+ if (error != 0)
+ mqunload();
+ break;
+ case MOD_UNLOAD:
+ error = mqunload();
+ break;
+ default:
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t mqueuefs_mod = {
+ "mqueuefs",
+ mq_modload,
+ &mqueuefs_vfsconf
+};
+DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE);
+MODULE_VERSION(mqueuefs, 1);
diff --git a/sys/kern/uipc_sem.c b/sys/kern/uipc_sem.c
new file mode 100644
index 0000000..f641654
--- /dev/null
+++ b/sys/kern/uipc_sem.c
@@ -0,0 +1,1111 @@
+/*-
+ * Copyright (c) 2002 Alfred Perlstein <alfred@FreeBSD.org>
+ * Copyright (c) 2003-2005 SPARTA, Inc.
+ * Copyright (c) 2005 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project in part by Network
+ * Associates Laboratories, the Security Research Division of Network
+ * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
+ * as part of the DARPA CHATS research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_posix.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/condvar.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/fnv_hash.h>
+#include <sys/kernel.h>
+#include <sys/ksem.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+#include <sys/_semaphore.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/vnode.h>
+
+#include <security/mac/mac_framework.h>
+
+FEATURE(p1003_1b_semaphores, "POSIX P1003.1B semaphores support");
+/*
+ * TODO
+ *
+ * - Resource limits?
+ * - Replace global sem_lock with mtx_pool locks?
+ * - Add a MAC check_create() hook for creating new named semaphores.
+ */
+
+#ifndef SEM_MAX
+#define SEM_MAX 30
+#endif
+
+#ifdef SEM_DEBUG
+#define DP(x) printf x
+#else
+#define DP(x)
+#endif
+
+struct ksem_mapping {
+ char *km_path;
+ Fnv32_t km_fnv;
+ struct ksem *km_ksem;
+ LIST_ENTRY(ksem_mapping) km_link;
+};
+
+static MALLOC_DEFINE(M_KSEM, "ksem", "semaphore file descriptor");
+static LIST_HEAD(, ksem_mapping) *ksem_dictionary;
+static struct sx ksem_dict_lock;
+static struct mtx ksem_count_lock;
+static struct mtx sem_lock;
+static u_long ksem_hash;
+static int ksem_dead;
+
+#define KSEM_HASH(fnv) (&ksem_dictionary[(fnv) & ksem_hash])
+
+static int nsems = 0;
+SYSCTL_DECL(_p1003_1b);
+SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0,
+ "Number of active kernel POSIX semaphores");
+
+static int kern_sem_wait(struct thread *td, semid_t id, int tryflag,
+ struct timespec *abstime);
+static int ksem_access(struct ksem *ks, struct ucred *ucred);
+static struct ksem *ksem_alloc(struct ucred *ucred, mode_t mode,
+ unsigned int value);
+static int ksem_create(struct thread *td, const char *path,
+ semid_t *semidp, mode_t mode, unsigned int value,
+ int flags, int compat32);
+static void ksem_drop(struct ksem *ks);
+static int ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp,
+ struct file **fpp);
+static struct ksem *ksem_hold(struct ksem *ks);
+static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks);
+static struct ksem *ksem_lookup(char *path, Fnv32_t fnv);
+static void ksem_module_destroy(void);
+static int ksem_module_init(void);
+static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
+static int sem_modload(struct module *module, int cmd, void *arg);
+
+static fo_rdwr_t ksem_read;
+static fo_rdwr_t ksem_write;
+static fo_truncate_t ksem_truncate;
+static fo_ioctl_t ksem_ioctl;
+static fo_poll_t ksem_poll;
+static fo_kqfilter_t ksem_kqfilter;
+static fo_stat_t ksem_stat;
+static fo_close_t ksem_closef;
+static fo_chmod_t ksem_chmod;
+static fo_chown_t ksem_chown;
+
+/* File descriptor operations. */
+static struct fileops ksem_ops = {
+ .fo_read = ksem_read,
+ .fo_write = ksem_write,
+ .fo_truncate = ksem_truncate,
+ .fo_ioctl = ksem_ioctl,
+ .fo_poll = ksem_poll,
+ .fo_kqfilter = ksem_kqfilter,
+ .fo_stat = ksem_stat,
+ .fo_close = ksem_closef,
+ .fo_chmod = ksem_chmod,
+ .fo_chown = ksem_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_flags = DFLAG_PASSABLE
+};
+
+FEATURE(posix_sem, "POSIX semaphores");
+
+static int
+ksem_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+ksem_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+ksem_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EINVAL);
+}
+
+static int
+ksem_ioctl(struct file *fp, u_long com, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+ksem_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+ksem_kqfilter(struct file *fp, struct knote *kn)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct ksem *ks;
+#ifdef MAC
+ int error;
+#endif
+
+ ks = fp->f_data;
+
+#ifdef MAC
+ error = mac_posixsem_check_stat(active_cred, fp->f_cred, ks);
+ if (error)
+ return (error);
+#endif
+
+ /*
+	 * Attempt to return sane-ish values for fstat() on a semaphore
+ * file descriptor.
+ */
+ bzero(sb, sizeof(*sb));
+
+ mtx_lock(&sem_lock);
+ sb->st_atim = ks->ks_atime;
+ sb->st_ctim = ks->ks_ctime;
+ sb->st_mtim = ks->ks_mtime;
+ sb->st_birthtim = ks->ks_birthtime;
+ sb->st_uid = ks->ks_uid;
+ sb->st_gid = ks->ks_gid;
+ sb->st_mode = S_IFREG | ks->ks_mode; /* XXX */
+ mtx_unlock(&sem_lock);
+
+ return (0);
+}
+
+static int
+ksem_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct ksem *ks;
+ int error;
+
+ error = 0;
+ ks = fp->f_data;
+ mtx_lock(&sem_lock);
+#ifdef MAC
+ error = mac_posixsem_check_setmode(active_cred, ks, mode);
+ if (error != 0)
+ goto out;
+#endif
+ error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VADMIN,
+ active_cred, NULL);
+ if (error != 0)
+ goto out;
+ ks->ks_mode = mode & ACCESSPERMS;
+out:
+ mtx_unlock(&sem_lock);
+ return (error);
+}
+
+static int
+ksem_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct ksem *ks;
+ int error;
+
+ error = 0;
+ ks = fp->f_data;
+ mtx_lock(&sem_lock);
+#ifdef MAC
+ error = mac_posixsem_check_setowner(active_cred, ks, uid, gid);
+ if (error != 0)
+ goto out;
+#endif
+ if (uid == (uid_t)-1)
+ uid = ks->ks_uid;
+ if (gid == (gid_t)-1)
+ gid = ks->ks_gid;
+ if (((uid != ks->ks_uid && uid != active_cred->cr_uid) ||
+ (gid != ks->ks_gid && !groupmember(gid, active_cred))) &&
+ (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
+ goto out;
+ ks->ks_uid = uid;
+ ks->ks_gid = gid;
+out:
+ mtx_unlock(&sem_lock);
+ return (error);
+}
+
+static int
+ksem_closef(struct file *fp, struct thread *td)
+{
+ struct ksem *ks;
+
+ ks = fp->f_data;
+ fp->f_data = NULL;
+ ksem_drop(ks);
+
+ return (0);
+}
+
+/*
+ * ksem object management including creation and reference counting
+ * routines.
+ */
+static struct ksem *
+ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value)
+{
+ struct ksem *ks;
+
+ mtx_lock(&ksem_count_lock);
+ if (nsems == p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX) || ksem_dead) {
+ mtx_unlock(&ksem_count_lock);
+ return (NULL);
+ }
+ nsems++;
+ mtx_unlock(&ksem_count_lock);
+ ks = malloc(sizeof(*ks), M_KSEM, M_WAITOK | M_ZERO);
+ ks->ks_uid = ucred->cr_uid;
+ ks->ks_gid = ucred->cr_gid;
+ ks->ks_mode = mode;
+ ks->ks_value = value;
+ cv_init(&ks->ks_cv, "ksem");
+ vfs_timestamp(&ks->ks_birthtime);
+ ks->ks_atime = ks->ks_mtime = ks->ks_ctime = ks->ks_birthtime;
+ refcount_init(&ks->ks_ref, 1);
+#ifdef MAC
+ mac_posixsem_init(ks);
+ mac_posixsem_create(ucred, ks);
+#endif
+
+ return (ks);
+}
+
+static struct ksem *
+ksem_hold(struct ksem *ks)
+{
+
+ refcount_acquire(&ks->ks_ref);
+ return (ks);
+}
+
+static void
+ksem_drop(struct ksem *ks)
+{
+
+ if (refcount_release(&ks->ks_ref)) {
+#ifdef MAC
+ mac_posixsem_destroy(ks);
+#endif
+ cv_destroy(&ks->ks_cv);
+ free(ks, M_KSEM);
+ mtx_lock(&ksem_count_lock);
+ nsems--;
+ mtx_unlock(&ksem_count_lock);
+ }
+}
+
+/*
+ * Determine if the credentials have sufficient permissions for read
+ * and write access.
+ */
+static int
+ksem_access(struct ksem *ks, struct ucred *ucred)
+{
+ int error;
+
+ error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid,
+ VREAD | VWRITE, ucred, NULL);
+ if (error)
+ error = priv_check_cred(ucred, PRIV_SEM_WRITE, 0);
+ return (error);
+}
+
+/*
+ * Dictionary management. We maintain an in-kernel dictionary to map
+ * paths to semaphore objects. We use the FNV hash on the path to
+ * store the mappings in a hash table.
+ */
+static struct ksem *
+ksem_lookup(char *path, Fnv32_t fnv)
+{
+ struct ksem_mapping *map;
+
+ LIST_FOREACH(map, KSEM_HASH(fnv), km_link) {
+ if (map->km_fnv != fnv)
+ continue;
+ if (strcmp(map->km_path, path) == 0)
+ return (map->km_ksem);
+ }
+
+ return (NULL);
+}
+
+static void
+ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks)
+{
+ struct ksem_mapping *map;
+
+ map = malloc(sizeof(struct ksem_mapping), M_KSEM, M_WAITOK);
+ map->km_path = path;
+ map->km_fnv = fnv;
+ map->km_ksem = ksem_hold(ks);
+ ks->ks_path = path;
+ LIST_INSERT_HEAD(KSEM_HASH(fnv), map, km_link);
+}
+
+static int
+ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
+{
+ struct ksem_mapping *map;
+ int error;
+
+ LIST_FOREACH(map, KSEM_HASH(fnv), km_link) {
+ if (map->km_fnv != fnv)
+ continue;
+ if (strcmp(map->km_path, path) == 0) {
+#ifdef MAC
+ error = mac_posixsem_check_unlink(ucred, map->km_ksem);
+ if (error)
+ return (error);
+#endif
+ error = ksem_access(map->km_ksem, ucred);
+ if (error)
+ return (error);
+ map->km_ksem->ks_path = NULL;
+ LIST_REMOVE(map, km_link);
+ ksem_drop(map->km_ksem);
+ free(map->km_path, M_KSEM);
+ free(map, M_KSEM);
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+static void
+ksem_info_impl(struct ksem *ks, char *path, size_t size, uint32_t *value)
+{
+
+ if (ks->ks_path == NULL)
+ return;
+ sx_slock(&ksem_dict_lock);
+ if (ks->ks_path != NULL)
+ strlcpy(path, ks->ks_path, size);
+ if (value != NULL)
+ *value = ks->ks_value;
+ sx_sunlock(&ksem_dict_lock);
+}
+
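+/*
+ * Copy the new semaphore id (which is simply the file descriptor) out
+ * to userland, as a 32-bit value for compat32 callers.
+ */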
+static int
+ksem_create_copyout_semid(struct thread *td, semid_t *semidp, int fd,
+ int compat32)
+{
+ semid_t semid;
+#ifdef COMPAT_FREEBSD32
+ int32_t semid32;
+#endif
+ void *ptr;
+ size_t ptrs;
+
+#ifdef COMPAT_FREEBSD32
+ if (compat32) {
+ semid32 = fd;
+ ptr = &semid32;
+ ptrs = sizeof(semid32);
+ } else {
+#endif
+ semid = fd;
+ ptr = &semid;
+ ptrs = sizeof(semid);
+ compat32 = 0; /* silence gcc */
+#ifdef COMPAT_FREEBSD32
+ }
+#endif
+
+ return (copyout(ptr, semidp, ptrs));
+}
+
+/* Other helper routines. */
+static int
+ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode,
+ unsigned int value, int flags, int compat32)
+{
+ struct filedesc *fdp;
+ struct ksem *ks;
+ struct file *fp;
+ char *path;
+ Fnv32_t fnv;
+ int error, fd;
+
+ if (value > SEM_VALUE_MAX)
+ return (EINVAL);
+
+ fdp = td->td_proc->p_fd;
+ mode = (mode & ~fdp->fd_cmask) & ACCESSPERMS;
+ error = falloc(td, &fp, &fd, O_CLOEXEC);
+ if (error) {
+ if (name == NULL)
+ error = ENOSPC;
+ return (error);
+ }
+
+ /*
+ * Go ahead and copyout the file descriptor now. This is a bit
+	 * premature, but it is a lot easier to handle errors here than
+	 * later, after we have possibly created a new semaphore, etc.
+ */
+ error = ksem_create_copyout_semid(td, semidp, fd, compat32);
+ if (error) {
+ fdclose(fdp, fp, fd, td);
+ fdrop(fp, td);
+ return (error);
+ }
+
+ if (name == NULL) {
+ /* Create an anonymous semaphore. */
+ ks = ksem_alloc(td->td_ucred, mode, value);
+ if (ks == NULL)
+ error = ENOSPC;
+ else
+ ks->ks_flags |= KS_ANONYMOUS;
+ } else {
+ path = malloc(MAXPATHLEN, M_KSEM, M_WAITOK);
+ error = copyinstr(name, path, MAXPATHLEN, NULL);
+
+ /* Require paths to start with a '/' character. */
+ if (error == 0 && path[0] != '/')
+ error = EINVAL;
+ if (error) {
+ fdclose(fdp, fp, fd, td);
+ fdrop(fp, td);
+ free(path, M_KSEM);
+ return (error);
+ }
+
+ fnv = fnv_32_str(path, FNV1_32_INIT);
+ sx_xlock(&ksem_dict_lock);
+ ks = ksem_lookup(path, fnv);
+ if (ks == NULL) {
+ /* Object does not exist, create it if requested. */
+ if (flags & O_CREAT) {
+ ks = ksem_alloc(td->td_ucred, mode, value);
+ if (ks == NULL)
+ error = ENFILE;
+ else {
+ ksem_insert(path, fnv, ks);
+ path = NULL;
+ }
+ } else
+ error = ENOENT;
+ } else {
+ /*
+ * Object already exists, obtain a new
+ * reference if requested and permitted.
+ */
+ if ((flags & (O_CREAT | O_EXCL)) ==
+ (O_CREAT | O_EXCL))
+ error = EEXIST;
+ else {
+#ifdef MAC
+ error = mac_posixsem_check_open(td->td_ucred,
+ ks);
+ if (error == 0)
+#endif
+ error = ksem_access(ks, td->td_ucred);
+ }
+ if (error == 0)
+ ksem_hold(ks);
+#ifdef INVARIANTS
+ else
+ ks = NULL;
+#endif
+ }
+ sx_xunlock(&ksem_dict_lock);
+ if (path)
+ free(path, M_KSEM);
+ }
+
+ if (error) {
+ KASSERT(ks == NULL, ("ksem_create error with a ksem"));
+ fdclose(fdp, fp, fd, td);
+ fdrop(fp, td);
+ return (error);
+ }
+ KASSERT(ks != NULL, ("ksem_create w/o a ksem"));
+
+ finit(fp, FREAD | FWRITE, DTYPE_SEM, ks, &ksem_ops);
+
+ fdrop(fp, td);
+
+ return (0);
+}
+
+static int
+ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp,
+ struct file **fpp)
+{
+ struct ksem *ks;
+ struct file *fp;
+ int error;
+
+ error = fget(td, id, rightsp, &fp);
+ if (error)
+ return (EINVAL);
+ if (fp->f_type != DTYPE_SEM) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ ks = fp->f_data;
+ if (ks->ks_flags & KS_DEAD) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ *fpp = fp;
+ return (0);
+}
+
+/* System calls. */
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_init_args {
+ unsigned int value;
+ semid_t *idp;
+};
+#endif
+int
+sys_ksem_init(struct thread *td, struct ksem_init_args *uap)
+{
+
+ return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value,
+ 0, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_open_args {
+ char *name;
+ int oflag;
+ mode_t mode;
+ unsigned int value;
+ semid_t *idp;
+};
+#endif
+int
+sys_ksem_open(struct thread *td, struct ksem_open_args *uap)
+{
+
+ DP((">>> ksem_open start, pid=%d\n", (int)td->td_proc->p_pid));
+
+ if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0)
+ return (EINVAL);
+ return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value,
+ uap->oflag, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_unlink_args {
+ char *name;
+};
+#endif
+int
+sys_ksem_unlink(struct thread *td, struct ksem_unlink_args *uap)
+{
+ char *path;
+ Fnv32_t fnv;
+ int error;
+
+ path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ error = copyinstr(uap->name, path, MAXPATHLEN, NULL);
+ if (error) {
+ free(path, M_TEMP);
+ return (error);
+ }
+
+ fnv = fnv_32_str(path, FNV1_32_INIT);
+ sx_xlock(&ksem_dict_lock);
+ error = ksem_remove(path, fnv, td->td_ucred);
+ sx_xunlock(&ksem_dict_lock);
+ free(path, M_TEMP);
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_close_args {
+ semid_t id;
+};
+#endif
+int
+sys_ksem_close(struct thread *td, struct ksem_close_args *uap)
+{
+ struct ksem *ks;
+ struct file *fp;
+ int error;
+
+ /* No capability rights required to close a semaphore. */
+ error = ksem_get(td, uap->id, 0, &fp);
+ if (error)
+ return (error);
+ ks = fp->f_data;
+ if (ks->ks_flags & KS_ANONYMOUS) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ error = kern_close(td, uap->id);
+ fdrop(fp, td);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_post_args {
+ semid_t id;
+};
+#endif
+int
+sys_ksem_post(struct thread *td, struct ksem_post_args *uap)
+{
+ cap_rights_t rights;
+ struct file *fp;
+ struct ksem *ks;
+ int error;
+
+ error = ksem_get(td, uap->id,
+ cap_rights_init(&rights, CAP_SEM_POST), &fp);
+ if (error)
+ return (error);
+ ks = fp->f_data;
+
+ mtx_lock(&sem_lock);
+#ifdef MAC
+ error = mac_posixsem_check_post(td->td_ucred, fp->f_cred, ks);
+ if (error)
+ goto err;
+#endif
+ if (ks->ks_value == SEM_VALUE_MAX) {
+ error = EOVERFLOW;
+ goto err;
+ }
+ ++ks->ks_value;
+ if (ks->ks_waiters > 0)
+ cv_signal(&ks->ks_cv);
+ error = 0;
+ vfs_timestamp(&ks->ks_ctime);
+err:
+ mtx_unlock(&sem_lock);
+ fdrop(fp, td);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_wait_args {
+ semid_t id;
+};
+#endif
+int
+sys_ksem_wait(struct thread *td, struct ksem_wait_args *uap)
+{
+
+ return (kern_sem_wait(td, uap->id, 0, NULL));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_timedwait_args {
+ semid_t id;
+ const struct timespec *abstime;
+};
+#endif
+int
+sys_ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap)
+{
+ struct timespec abstime;
+ struct timespec *ts;
+ int error;
+
+ /*
+ * We allow a null timespec (wait forever).
+ */
+ if (uap->abstime == NULL)
+ ts = NULL;
+ else {
+ error = copyin(uap->abstime, &abstime, sizeof(abstime));
+ if (error != 0)
+ return (error);
+ if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0)
+ return (EINVAL);
+ ts = &abstime;
+ }
+ return (kern_sem_wait(td, uap->id, 0, ts));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_trywait_args {
+ semid_t id;
+};
+#endif
+int
+sys_ksem_trywait(struct thread *td, struct ksem_trywait_args *uap)
+{
+
+ return (kern_sem_wait(td, uap->id, 1, NULL));
+}
+
+static int
+kern_sem_wait(struct thread *td, semid_t id, int tryflag,
+ struct timespec *abstime)
+{
+ struct timespec ts1, ts2;
+ struct timeval tv;
+ cap_rights_t rights;
+ struct file *fp;
+ struct ksem *ks;
+ int error;
+
+ DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid));
+ error = ksem_get(td, id, cap_rights_init(&rights, CAP_SEM_WAIT), &fp);
+ if (error)
+ return (error);
+ ks = fp->f_data;
+ mtx_lock(&sem_lock);
+ DP((">>> kern_sem_wait critical section entered! pid=%d\n",
+ (int)td->td_proc->p_pid));
+#ifdef MAC
+ error = mac_posixsem_check_wait(td->td_ucred, fp->f_cred, ks);
+ if (error) {
+ DP(("kern_sem_wait mac failed\n"));
+ goto err;
+ }
+#endif
+ DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag));
+ vfs_timestamp(&ks->ks_atime);
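+	/*
+	 * Sleep until the semaphore value becomes positive.  An absolute
+	 * timeout is re-converted to a relative interval on every pass so
+	 * that each wakeup re-checks the deadline against the current time.
+	 */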
+ while (ks->ks_value == 0) {
+ ks->ks_waiters++;
+ if (tryflag != 0)
+ error = EAGAIN;
+ else if (abstime == NULL)
+ error = cv_wait_sig(&ks->ks_cv, &sem_lock);
+ else {
+ for (;;) {
+ ts1 = *abstime;
+ getnanotime(&ts2);
+ timespecsub(&ts1, &ts2);
+ TIMESPEC_TO_TIMEVAL(&tv, &ts1);
+ if (tv.tv_sec < 0) {
+ error = ETIMEDOUT;
+ break;
+ }
+ error = cv_timedwait_sig(&ks->ks_cv,
+ &sem_lock, tvtohz(&tv));
+ if (error != EWOULDBLOCK)
+ break;
+ }
+ }
+ ks->ks_waiters--;
+ if (error)
+ goto err;
+ }
+ ks->ks_value--;
+ DP(("kern_sem_wait value post-decrement = %d\n", ks->ks_value));
+ error = 0;
+err:
+ mtx_unlock(&sem_lock);
+ fdrop(fp, td);
+ DP(("<<< kern_sem_wait leaving, pid=%d, error = %d\n",
+ (int)td->td_proc->p_pid, error));
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_getvalue_args {
+ semid_t id;
+ int *val;
+};
+#endif
+int
+sys_ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap)
+{
+ cap_rights_t rights;
+ struct file *fp;
+ struct ksem *ks;
+ int error, val;
+
+ error = ksem_get(td, uap->id,
+ cap_rights_init(&rights, CAP_SEM_GETVALUE), &fp);
+ if (error)
+ return (error);
+ ks = fp->f_data;
+
+ mtx_lock(&sem_lock);
+#ifdef MAC
+ error = mac_posixsem_check_getvalue(td->td_ucred, fp->f_cred, ks);
+ if (error) {
+ mtx_unlock(&sem_lock);
+ fdrop(fp, td);
+ return (error);
+ }
+#endif
+ val = ks->ks_value;
+ vfs_timestamp(&ks->ks_atime);
+ mtx_unlock(&sem_lock);
+ fdrop(fp, td);
+ error = copyout(&val, uap->val, sizeof(val));
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ksem_destroy_args {
+ semid_t id;
+};
+#endif
+int
+sys_ksem_destroy(struct thread *td, struct ksem_destroy_args *uap)
+{
+ struct file *fp;
+ struct ksem *ks;
+ int error;
+
+ /* No capability rights required to close a semaphore. */
+ error = ksem_get(td, uap->id, 0, &fp);
+ if (error)
+ return (error);
+ ks = fp->f_data;
+ if (!(ks->ks_flags & KS_ANONYMOUS)) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ mtx_lock(&sem_lock);
+ if (ks->ks_waiters != 0) {
+ mtx_unlock(&sem_lock);
+ error = EBUSY;
+ goto err;
+ }
+ ks->ks_flags |= KS_DEAD;
+ mtx_unlock(&sem_lock);
+
+ error = kern_close(td, uap->id);
+err:
+ fdrop(fp, td);
+ return (error);
+}
+
+static struct syscall_helper_data ksem_syscalls[] = {
+ SYSCALL_INIT_HELPER(ksem_init),
+ SYSCALL_INIT_HELPER(ksem_open),
+ SYSCALL_INIT_HELPER(ksem_unlink),
+ SYSCALL_INIT_HELPER(ksem_close),
+ SYSCALL_INIT_HELPER(ksem_post),
+ SYSCALL_INIT_HELPER(ksem_wait),
+ SYSCALL_INIT_HELPER(ksem_timedwait),
+ SYSCALL_INIT_HELPER(ksem_trywait),
+ SYSCALL_INIT_HELPER(ksem_getvalue),
+ SYSCALL_INIT_HELPER(ksem_destroy),
+ SYSCALL_INIT_LAST
+};
+
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/freebsd32/freebsd32_util.h>
+
+int
+freebsd32_ksem_init(struct thread *td, struct freebsd32_ksem_init_args *uap)
+{
+
+ return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value,
+ 0, 1));
+}
+
+int
+freebsd32_ksem_open(struct thread *td, struct freebsd32_ksem_open_args *uap)
+{
+
+ if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0)
+ return (EINVAL);
+ return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value,
+ uap->oflag, 1));
+}
+
+int
+freebsd32_ksem_timedwait(struct thread *td,
+ struct freebsd32_ksem_timedwait_args *uap)
+{
+ struct timespec32 abstime32;
+ struct timespec *ts, abstime;
+ int error;
+
+ /*
+ * We allow a null timespec (wait forever).
+ */
+ if (uap->abstime == NULL)
+ ts = NULL;
+ else {
+ error = copyin(uap->abstime, &abstime32, sizeof(abstime32));
+ if (error != 0)
+ return (error);
+ CP(abstime32, abstime, tv_sec);
+ CP(abstime32, abstime, tv_nsec);
+ if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0)
+ return (EINVAL);
+ ts = &abstime;
+ }
+ return (kern_sem_wait(td, uap->id, 0, ts));
+}
+
+static struct syscall_helper_data ksem32_syscalls[] = {
+ SYSCALL32_INIT_HELPER(freebsd32_ksem_init),
+ SYSCALL32_INIT_HELPER(freebsd32_ksem_open),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_unlink),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_close),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_post),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_wait),
+ SYSCALL32_INIT_HELPER(freebsd32_ksem_timedwait),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_trywait),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_getvalue),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_destroy),
+ SYSCALL_INIT_LAST
+};
+#endif
+
+static int
+ksem_module_init(void)
+{
+ int error;
+
+ mtx_init(&sem_lock, "sem", NULL, MTX_DEF);
+ mtx_init(&ksem_count_lock, "ksem count", NULL, MTX_DEF);
+ sx_init(&ksem_dict_lock, "ksem dictionary");
+ ksem_dictionary = hashinit(1024, M_KSEM, &ksem_hash);
+ p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 200112L);
+ p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX);
+ p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX);
+ ksem_info = ksem_info_impl;
+
+ error = syscall_helper_register(ksem_syscalls);
+ if (error)
+ return (error);
+#ifdef COMPAT_FREEBSD32
+ error = syscall32_helper_register(ksem32_syscalls);
+ if (error)
+ return (error);
+#endif
+ return (0);
+}
+
+static void
+ksem_module_destroy(void)
+{
+
+#ifdef COMPAT_FREEBSD32
+ syscall32_helper_unregister(ksem32_syscalls);
+#endif
+ syscall_helper_unregister(ksem_syscalls);
+
+ ksem_info = NULL;
+ p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 0);
+ hashdestroy(ksem_dictionary, M_KSEM, ksem_hash);
+ sx_destroy(&ksem_dict_lock);
+ mtx_destroy(&ksem_count_lock);
+ mtx_destroy(&sem_lock);
+ p31b_unsetcfg(CTL_P1003_1B_SEM_VALUE_MAX);
+ p31b_unsetcfg(CTL_P1003_1B_SEM_NSEMS_MAX);
+}
+
+static int
+sem_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ error = ksem_module_init();
+ if (error)
+ ksem_module_destroy();
+ break;
+
+ case MOD_UNLOAD:
+ mtx_lock(&ksem_count_lock);
+ if (nsems != 0) {
+ error = EOPNOTSUPP;
+ mtx_unlock(&ksem_count_lock);
+ break;
+ }
+ ksem_dead = 1;
+ mtx_unlock(&ksem_count_lock);
+ ksem_module_destroy();
+ break;
+
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t sem_mod = {
+ "sem",
+ &sem_modload,
+ NULL
+};
+
+DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST);
+MODULE_VERSION(sem, 1);
diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c
new file mode 100644
index 0000000..54366af
--- /dev/null
+++ b/sys/kern/uipc_shm.c
@@ -0,0 +1,1033 @@
+/*-
+ * Copyright (c) 2006, 2011 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Support for shared swap-backed anonymous memory objects via
+ * shm_open(2) and shm_unlink(2). While most of the implementation is
+ * here, vm_mmap.c contains mapping logic changes.
+ *
+ * TODO:
+ *
+ * (1) Need to export data to a userland tool via a sysctl. Should ipcs(1)
+ * and ipcrm(1) be expanded or should new tools to manage both POSIX
+ * kernel semaphores and POSIX shared memory be written?
+ *
+ * (2) Add support for this file type to fstat(1).
+ *
+ * (3) Resource limits? Does this need its own resource limits or are the
+ * existing limits in mmap(2) sufficient?
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/fnv_hash.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/refcount.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/time.h>
+#include <sys/vnode.h>
+#include <sys/unistd.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
+#include <vm/swap_pager.h>
+
+struct shm_mapping {
+ char *sm_path;
+ Fnv32_t sm_fnv;
+ struct shmfd *sm_shmfd;
+ LIST_ENTRY(shm_mapping) sm_link;
+};
+
+static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
+static LIST_HEAD(, shm_mapping) *shm_dictionary;
+static struct sx shm_dict_lock;
+static struct mtx shm_timestamp_lock;
+static u_long shm_hash;
+
+#define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash])
+
+static int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags);
+static struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode);
+static void shm_dict_init(void *arg);
+static void shm_drop(struct shmfd *shmfd);
+static struct shmfd *shm_hold(struct shmfd *shmfd);
+static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
+static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
+static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
+static int shm_dotruncate(struct shmfd *shmfd, off_t length);
+
+static fo_rdwr_t shm_read;
+static fo_rdwr_t shm_write;
+static fo_truncate_t shm_truncate;
+static fo_ioctl_t shm_ioctl;
+static fo_poll_t shm_poll;
+static fo_kqfilter_t shm_kqfilter;
+static fo_stat_t shm_stat;
+static fo_close_t shm_close;
+static fo_chmod_t shm_chmod;
+static fo_chown_t shm_chown;
+static fo_seek_t shm_seek;
+
+/* File descriptor operations. */
+static struct fileops shm_ops = {
+ .fo_read = shm_read,
+ .fo_write = shm_write,
+ .fo_truncate = shm_truncate,
+ .fo_ioctl = shm_ioctl,
+ .fo_poll = shm_poll,
+ .fo_kqfilter = shm_kqfilter,
+ .fo_stat = shm_stat,
+ .fo_close = shm_close,
+ .fo_chmod = shm_chmod,
+ .fo_chown = shm_chown,
+ .fo_sendfile = invfo_sendfile,
+ .fo_seek = shm_seek,
+ .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
+};
+
+FEATURE(posix_shm, "POSIX shared memory");
+
+static int
+uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
+{
+ vm_page_t m;
+ vm_pindex_t idx;
+ size_t tlen;
+ int error, offset, rv;
+
+ idx = OFF_TO_IDX(uio->uio_offset);
+ offset = uio->uio_offset & PAGE_MASK;
+ tlen = MIN(PAGE_SIZE - offset, len);
+
+ VM_OBJECT_WLOCK(obj);
+
+ /*
+ * Parallel reads of the page content from disk are prevented
+ * by exclusive busy.
+ *
+ * Although the tmpfs vnode lock is held here, it is
+ * nonetheless safe to sleep waiting for a free page. The
+ * pageout daemon does not need to acquire the tmpfs vnode
+	 * lock to page out the object's pages because it is an OBJT_SWAP
+	 * type object.
+ */
+ m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL);
+ if (m->valid != VM_PAGE_BITS_ALL) {
+ if (vm_pager_has_page(obj, idx, NULL, NULL)) {
+ rv = vm_pager_get_pages(obj, &m, 1, 0);
+ m = vm_page_lookup(obj, idx);
+ if (m == NULL) {
+ printf(
+ "uiomove_object: vm_obj %p idx %jd null lookup rv %d\n",
+ obj, idx, rv);
+ VM_OBJECT_WUNLOCK(obj);
+ return (EIO);
+ }
+ if (rv != VM_PAGER_OK) {
+ printf(
+ "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n",
+ obj, idx, m->valid, rv);
+ vm_page_lock(m);
+ vm_page_free(m);
+ vm_page_unlock(m);
+ VM_OBJECT_WUNLOCK(obj);
+ return (EIO);
+ }
+ } else
+ vm_page_zero_invalid(m, TRUE);
+ }
+ vm_page_xunbusy(m);
+ vm_page_lock(m);
+ vm_page_hold(m);
+ vm_page_unlock(m);
+ VM_OBJECT_WUNLOCK(obj);
+ error = uiomove_fromphys(&m, offset, tlen, uio);
+ if (uio->uio_rw == UIO_WRITE && error == 0) {
+ VM_OBJECT_WLOCK(obj);
+ vm_page_dirty(m);
+ VM_OBJECT_WUNLOCK(obj);
+ }
+ vm_page_lock(m);
+ vm_page_unhold(m);
+ if (m->queue == PQ_NONE) {
+ vm_page_deactivate(m);
+ } else {
+ /* Requeue to maintain LRU ordering. */
+ vm_page_requeue(m);
+ }
+ vm_page_unlock(m);
+
+ return (error);
+}
+
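+/*
+ * Transfer data between the uio and the backing VM object one page at a
+ * time, stopping at obj_size or when a pass makes no progress.
+ */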
+int
+uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
+{
+ ssize_t resid;
+ size_t len;
+ int error;
+
+ error = 0;
+ while ((resid = uio->uio_resid) > 0) {
+ if (obj_size <= uio->uio_offset)
+ break;
+ len = MIN(obj_size - uio->uio_offset, resid);
+ if (len == 0)
+ break;
+ error = uiomove_object_page(obj, len, uio);
+ if (error != 0 || resid == uio->uio_resid)
+ break;
+ }
+ return (error);
+}
+
+static int
+shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
+{
+ struct shmfd *shmfd;
+ off_t foffset;
+ int error;
+
+ shmfd = fp->f_data;
+ foffset = foffset_lock(fp, 0);
+ error = 0;
+ switch (whence) {
+ case L_INCR:
+ if (foffset < 0 ||
+ (offset > 0 && foffset > OFF_MAX - offset)) {
+ error = EOVERFLOW;
+ break;
+ }
+ offset += foffset;
+ break;
+ case L_XTND:
+ if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) {
+ error = EOVERFLOW;
+ break;
+ }
+ offset += shmfd->shm_size;
+ break;
+ case L_SET:
+ break;
+ default:
+ error = EINVAL;
+ }
+ if (error == 0) {
+ if (offset < 0 || offset > shmfd->shm_size)
+ error = EINVAL;
+ else
+ *(off_t *)(td->td_retval) = offset;
+ }
+ foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
+ return (error);
+}
+
+static int
+shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ struct shmfd *shmfd;
+ void *rl_cookie;
+ int error;
+
+ shmfd = fp->f_data;
+ foffset_lock_uio(fp, uio, flags);
+ rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
+ uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
+#ifdef MAC
+ error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
+ if (error)
+ return (error);
+#endif
+ error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
+ rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
+ foffset_unlock_uio(fp, uio, flags);
+ return (error);
+}
+
+static int
+shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ struct shmfd *shmfd;
+ void *rl_cookie;
+ int error;
+
+ shmfd = fp->f_data;
+#ifdef MAC
+ error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
+ if (error)
+ return (error);
+#endif
+ foffset_lock_uio(fp, uio, flags);
+ if ((flags & FOF_OFFSET) == 0) {
+ rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
+ &shmfd->shm_mtx);
+ } else {
+ rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
+ uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
+ }
+
+ error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
+ rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
+ foffset_unlock_uio(fp, uio, flags);
+ return (error);
+}
+
+static int
+shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct shmfd *shmfd;
+#ifdef MAC
+ int error;
+#endif
+
+ shmfd = fp->f_data;
+#ifdef MAC
+ error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
+ if (error)
+ return (error);
+#endif
+ return (shm_dotruncate(shmfd, length));
+}
+
+static int
+shm_ioctl(struct file *fp, u_long com, void *data,
+ struct ucred *active_cred, struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+shm_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+shm_kqfilter(struct file *fp, struct knote *kn)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct shmfd *shmfd;
+#ifdef MAC
+ int error;
+#endif
+
+ shmfd = fp->f_data;
+
+#ifdef MAC
+ error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
+ if (error)
+ return (error);
+#endif
+
+ /*
+ * Attempt to return sensible values for fstat() on a memory file
+ * descriptor.
+ */
+ bzero(sb, sizeof(*sb));
+ sb->st_blksize = PAGE_SIZE;
+ sb->st_size = shmfd->shm_size;
+ sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize;
+ mtx_lock(&shm_timestamp_lock);
+ sb->st_atim = shmfd->shm_atime;
+ sb->st_ctim = shmfd->shm_ctime;
+ sb->st_mtim = shmfd->shm_mtime;
+ sb->st_birthtim = shmfd->shm_birthtime;
+ sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */
+ sb->st_uid = shmfd->shm_uid;
+ sb->st_gid = shmfd->shm_gid;
+ mtx_unlock(&shm_timestamp_lock);
+
+ return (0);
+}
+
+static int
+shm_close(struct file *fp, struct thread *td)
+{
+ struct shmfd *shmfd;
+
+ shmfd = fp->f_data;
+ fp->f_data = NULL;
+ shm_drop(shmfd);
+
+ return (0);
+}
+
+static int
+shm_dotruncate(struct shmfd *shmfd, off_t length)
+{
+ vm_object_t object;
+ vm_page_t m, ma[1];
+ vm_pindex_t idx, nobjsize;
+ vm_ooffset_t delta;
+ int base, rv;
+
+ object = shmfd->shm_object;
+ VM_OBJECT_WLOCK(object);
+ if (length == shmfd->shm_size) {
+ VM_OBJECT_WUNLOCK(object);
+ return (0);
+ }
+ nobjsize = OFF_TO_IDX(length + PAGE_MASK);
+
+ /* Are we shrinking? If so, trim the end. */
+ if (length < shmfd->shm_size) {
+ /*
+ * Disallow any requests to shrink the size if this
+ * object is mapped into the kernel.
+ */
+ if (shmfd->shm_kmappings > 0) {
+ VM_OBJECT_WUNLOCK(object);
+ return (EBUSY);
+ }
+
+ /*
+ * Zero the truncated part of the last page.
+ */
+ base = length & PAGE_MASK;
+ if (base != 0) {
+ idx = OFF_TO_IDX(length);
+retry:
+ m = vm_page_lookup(object, idx);
+ if (m != NULL) {
+ if (vm_page_sleep_if_busy(m, "shmtrc"))
+ goto retry;
+ } else if (vm_pager_has_page(object, idx, NULL, NULL)) {
+ m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL);
+ if (m == NULL) {
+ VM_OBJECT_WUNLOCK(object);
+ VM_WAIT;
+ VM_OBJECT_WLOCK(object);
+ goto retry;
+ } else if (m->valid != VM_PAGE_BITS_ALL) {
+ ma[0] = m;
+ rv = vm_pager_get_pages(object, ma, 1,
+ 0);
+ m = vm_page_lookup(object, idx);
+ } else
+ /* A cached page was reactivated. */
+ rv = VM_PAGER_OK;
+ vm_page_lock(m);
+ if (rv == VM_PAGER_OK) {
+ vm_page_deactivate(m);
+ vm_page_unlock(m);
+ vm_page_xunbusy(m);
+ } else {
+ vm_page_free(m);
+ vm_page_unlock(m);
+ VM_OBJECT_WUNLOCK(object);
+ return (EIO);
+ }
+ }
+ if (m != NULL) {
+ pmap_zero_page_area(m, base, PAGE_SIZE - base);
+ KASSERT(m->valid == VM_PAGE_BITS_ALL,
+ ("shm_dotruncate: page %p is invalid", m));
+ vm_page_dirty(m);
+ vm_pager_page_unswapped(m);
+ }
+ }
+ delta = ptoa(object->size - nobjsize);
+
+ /* Toss in memory pages. */
+ if (nobjsize < object->size)
+ vm_object_page_remove(object, nobjsize, object->size,
+ 0);
+
+ /* Toss pages from swap. */
+ if (object->type == OBJT_SWAP)
+ swap_pager_freespace(object, nobjsize, delta);
+
+ /* Release the swap space accounted for the object. */
+ swap_release_by_cred(delta, object->cred);
+ object->charge -= delta;
+ } else {
+ /* Attempt to reserve the swap */
+ delta = ptoa(nobjsize - object->size);
+ if (!swap_reserve_by_cred(delta, object->cred)) {
+ VM_OBJECT_WUNLOCK(object);
+ return (ENOMEM);
+ }
+ object->charge += delta;
+ }
+ shmfd->shm_size = length;
+ mtx_lock(&shm_timestamp_lock);
+ vfs_timestamp(&shmfd->shm_ctime);
+ shmfd->shm_mtime = shmfd->shm_ctime;
+ mtx_unlock(&shm_timestamp_lock);
+ object->size = nobjsize;
+ VM_OBJECT_WUNLOCK(object);
+ return (0);
+}
+
+/*
+ * shmfd object management including creation and reference counting
+ * routines.
+ */
+static struct shmfd *
+shm_alloc(struct ucred *ucred, mode_t mode)
+{
+ struct shmfd *shmfd;
+
+ shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
+ shmfd->shm_size = 0;
+ shmfd->shm_uid = ucred->cr_uid;
+ shmfd->shm_gid = ucred->cr_gid;
+ shmfd->shm_mode = mode;
+ shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL,
+ shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
+ KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
+ VM_OBJECT_WLOCK(shmfd->shm_object);
+ vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING);
+ vm_object_set_flag(shmfd->shm_object, OBJ_NOSPLIT);
+ VM_OBJECT_WUNLOCK(shmfd->shm_object);
+ vfs_timestamp(&shmfd->shm_birthtime);
+ shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
+ shmfd->shm_birthtime;
+ refcount_init(&shmfd->shm_refs, 1);
+ mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF);
+ rangelock_init(&shmfd->shm_rl);
+#ifdef MAC
+ mac_posixshm_init(shmfd);
+ mac_posixshm_create(ucred, shmfd);
+#endif
+
+ return (shmfd);
+}
+
+static struct shmfd *
+shm_hold(struct shmfd *shmfd)
+{
+
+ refcount_acquire(&shmfd->shm_refs);
+ return (shmfd);
+}
+
+static void
+shm_drop(struct shmfd *shmfd)
+{
+
+ if (refcount_release(&shmfd->shm_refs)) {
+#ifdef MAC
+ mac_posixshm_destroy(shmfd);
+#endif
+ rangelock_destroy(&shmfd->shm_rl);
+ mtx_destroy(&shmfd->shm_mtx);
+ vm_object_deallocate(shmfd->shm_object);
+ free(shmfd, M_SHMFD);
+ }
+}
+
+/*
+ * Determine if the credentials have sufficient permissions for a
+ * specified combination of FREAD and FWRITE.
+ */
+static int
+shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
+{
+ accmode_t accmode;
+ int error;
+
+ accmode = 0;
+ if (flags & FREAD)
+ accmode |= VREAD;
+ if (flags & FWRITE)
+ accmode |= VWRITE;
+ mtx_lock(&shm_timestamp_lock);
+ error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
+ accmode, ucred, NULL);
+ mtx_unlock(&shm_timestamp_lock);
+ return (error);
+}
+
+/*
+ * Dictionary management. We maintain an in-kernel dictionary to map
+ * paths to shmfd objects. We use the FNV hash on the path to store
+ * the mappings in a hash table.
+ */
+static void
+shm_dict_init(void *arg)
+{
+
+ mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
+ sx_init(&shm_dict_lock, "shm dictionary");
+ shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
+}
+SYSINIT(shm_dict_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_dict_init, NULL);
+
+static struct shmfd *
+shm_lookup(char *path, Fnv32_t fnv)
+{
+ struct shm_mapping *map;
+
+ LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
+ if (map->sm_fnv != fnv)
+ continue;
+ if (strcmp(map->sm_path, path) == 0)
+ return (map->sm_shmfd);
+ }
+
+ return (NULL);
+}
+
+static void
+shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
+{
+ struct shm_mapping *map;
+
+ map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK);
+ map->sm_path = path;
+ map->sm_fnv = fnv;
+ map->sm_shmfd = shm_hold(shmfd);
+ shmfd->shm_path = path;
+ LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link);
+}
+
+static int
+shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
+{
+ struct shm_mapping *map;
+ int error;
+
+ LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
+ if (map->sm_fnv != fnv)
+ continue;
+ if (strcmp(map->sm_path, path) == 0) {
+#ifdef MAC
+ error = mac_posixshm_check_unlink(ucred, map->sm_shmfd);
+ if (error)
+ return (error);
+#endif
+ error = shm_access(map->sm_shmfd, ucred,
+ FREAD | FWRITE);
+ if (error)
+ return (error);
+ map->sm_shmfd->shm_path = NULL;
+ LIST_REMOVE(map, sm_link);
+ shm_drop(map->sm_shmfd);
+ free(map->sm_path, M_SHMFD);
+ free(map, M_SHMFD);
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+/* System calls. */
+int
+sys_shm_open(struct thread *td, struct shm_open_args *uap)
+{
+ struct filedesc *fdp;
+ struct shmfd *shmfd;
+ struct file *fp;
+ char *path;
+ Fnv32_t fnv;
+ mode_t cmode;
+ int fd, error;
+
+#ifdef CAPABILITY_MODE
+ /*
+ * shm_open(2) is only allowed for anonymous objects.
+ */
+ if (IN_CAPABILITY_MODE(td) && (uap->path != SHM_ANON))
+ return (ECAPMODE);
+#endif
+
+ if ((uap->flags & O_ACCMODE) != O_RDONLY &&
+ (uap->flags & O_ACCMODE) != O_RDWR)
+ return (EINVAL);
+
+ if ((uap->flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC)) != 0)
+ return (EINVAL);
+
+ fdp = td->td_proc->p_fd;
+ cmode = (uap->mode & ~fdp->fd_cmask) & ACCESSPERMS;
+
+ error = falloc(td, &fp, &fd, O_CLOEXEC);
+ if (error)
+ return (error);
+
+ /* A SHM_ANON path pointer creates an anonymous object. */
+ if (uap->path == SHM_ANON) {
+ /* A read-only anonymous object is pointless. */
+ if ((uap->flags & O_ACCMODE) == O_RDONLY) {
+ fdclose(fdp, fp, fd, td);
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ shmfd = shm_alloc(td->td_ucred, cmode);
+ } else {
+ path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
+ error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
+
+ /* Require paths to start with a '/' character. */
+ if (error == 0 && path[0] != '/')
+ error = EINVAL;
+ if (error) {
+ fdclose(fdp, fp, fd, td);
+ fdrop(fp, td);
+ free(path, M_SHMFD);
+ return (error);
+ }
+
+ fnv = fnv_32_str(path, FNV1_32_INIT);
+ sx_xlock(&shm_dict_lock);
+ shmfd = shm_lookup(path, fnv);
+ if (shmfd == NULL) {
+ /* Object does not yet exist, create it if requested. */
+ if (uap->flags & O_CREAT) {
+#ifdef MAC
+ error = mac_posixshm_check_create(td->td_ucred,
+ path);
+ if (error == 0) {
+#endif
+ shmfd = shm_alloc(td->td_ucred, cmode);
+ shm_insert(path, fnv, shmfd);
+#ifdef MAC
+ }
+#endif
+ } else {
+ free(path, M_SHMFD);
+ error = ENOENT;
+ }
+ } else {
+ /*
+ * Object already exists, obtain a new
+ * reference if requested and permitted.
+ */
+ free(path, M_SHMFD);
+ if ((uap->flags & (O_CREAT | O_EXCL)) ==
+ (O_CREAT | O_EXCL))
+ error = EEXIST;
+ else {
+#ifdef MAC
+ error = mac_posixshm_check_open(td->td_ucred,
+ shmfd, FFLAGS(uap->flags & O_ACCMODE));
+ if (error == 0)
+#endif
+ error = shm_access(shmfd, td->td_ucred,
+ FFLAGS(uap->flags & O_ACCMODE));
+ }
+
+ /*
+ * Truncate the file back to zero length if
+ * O_TRUNC was specified and the object was
+ * opened with read/write.
+ */
+ if (error == 0 &&
+ (uap->flags & (O_ACCMODE | O_TRUNC)) ==
+ (O_RDWR | O_TRUNC)) {
+#ifdef MAC
+ error = mac_posixshm_check_truncate(
+ td->td_ucred, fp->f_cred, shmfd);
+ if (error == 0)
+#endif
+ shm_dotruncate(shmfd, 0);
+ }
+ if (error == 0)
+ shm_hold(shmfd);
+ }
+ sx_xunlock(&shm_dict_lock);
+
+ if (error) {
+ fdclose(fdp, fp, fd, td);
+ fdrop(fp, td);
+ return (error);
+ }
+ }
+
+ finit(fp, FFLAGS(uap->flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);
+
+ td->td_retval[0] = fd;
+ fdrop(fp, td);
+
+ return (0);
+}
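+
+/*
+ * For illustration, the usual userland pattern over this interface is to
+ * create an object, size it with ftruncate(2) and map it; a minimal
+ * sketch (the names "len" and "p" are hypothetical, error handling is
+ * elided):
+ *
+ *	fd = shm_open(SHM_ANON, O_RDWR, 0600);
+ *	ftruncate(fd, len);
+ *	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ */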
+
+int
+sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
+{
+ char *path;
+ Fnv32_t fnv;
+ int error;
+
+ path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
+ if (error) {
+ free(path, M_TEMP);
+ return (error);
+ }
+
+ fnv = fnv_32_str(path, FNV1_32_INIT);
+ sx_xlock(&shm_dict_lock);
+ error = shm_remove(path, fnv, td->td_ucred);
+ sx_xunlock(&shm_dict_lock);
+ free(path, M_TEMP);
+
+ return (error);
+}
+
+/*
+ * mmap() helper to validate mmap() requests against shm object state
+ * and give mmap() the vm_object to use for the mapping.
+ */
+int
+shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff,
+ vm_object_t *obj)
+{
+
+ /*
+ * XXXRW: This validation is probably insufficient, and subject to
+ * sign errors. It should be fixed.
+ */
+ if (foff >= shmfd->shm_size ||
+ foff + objsize > round_page(shmfd->shm_size))
+ return (EINVAL);
+
+ mtx_lock(&shm_timestamp_lock);
+ vfs_timestamp(&shmfd->shm_atime);
+ mtx_unlock(&shm_timestamp_lock);
+ vm_object_reference(shmfd->shm_object);
+ *obj = shmfd->shm_object;
+ return (0);
+}
+
+static int
+shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct shmfd *shmfd;
+ int error;
+
+ error = 0;
+ shmfd = fp->f_data;
+ mtx_lock(&shm_timestamp_lock);
+ /*
+ * SUSv4 says that x bits of permission need not be affected.
+ * Be consistent with our shm_open there.
+ */
+#ifdef MAC
+ error = mac_posixshm_check_setmode(active_cred, shmfd, mode);
+ if (error != 0)
+ goto out;
+#endif
+ error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid,
+ shmfd->shm_gid, VADMIN, active_cred, NULL);
+ if (error != 0)
+ goto out;
+ shmfd->shm_mode = mode & ACCESSPERMS;
+out:
+ mtx_unlock(&shm_timestamp_lock);
+ return (error);
+}
+
+static int
+shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct shmfd *shmfd;
+ int error;
+
+ error = 0;
+ shmfd = fp->f_data;
+ mtx_lock(&shm_timestamp_lock);
+#ifdef MAC
+ error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid);
+ if (error != 0)
+ goto out;
+#endif
+ if (uid == (uid_t)-1)
+ uid = shmfd->shm_uid;
+ if (gid == (gid_t)-1)
+ gid = shmfd->shm_gid;
+ if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) ||
+ (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) &&
+ (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
+ goto out;
+ shmfd->shm_uid = uid;
+ shmfd->shm_gid = gid;
+out:
+ mtx_unlock(&shm_timestamp_lock);
+ return (error);
+}
+
+/*
+ * Helper routines to allow the backing object of a shared memory file
+ * descriptor to be mapped in the kernel.
+ */
+int
+shm_map(struct file *fp, size_t size, off_t offset, void **memp)
+{
+ struct shmfd *shmfd;
+ vm_offset_t kva, ofs;
+ vm_object_t obj;
+ int rv;
+
+ if (fp->f_type != DTYPE_SHM)
+ return (EINVAL);
+ shmfd = fp->f_data;
+ obj = shmfd->shm_object;
+ VM_OBJECT_WLOCK(obj);
+ /*
+ * XXXRW: This validation is probably insufficient, and subject to
+ * sign errors. It should be fixed.
+ */
+ if (offset >= shmfd->shm_size ||
+ offset + size > round_page(shmfd->shm_size)) {
+ VM_OBJECT_WUNLOCK(obj);
+ return (EINVAL);
+ }
+
+ shmfd->shm_kmappings++;
+ vm_object_reference_locked(obj);
+ VM_OBJECT_WUNLOCK(obj);
+
+ /* Map the object into the kernel_map and wire it. */
+ kva = vm_map_min(kernel_map);
+ ofs = offset & PAGE_MASK;
+ offset = trunc_page(offset);
+ size = round_page(size + ofs);
+ rv = vm_map_find(kernel_map, obj, offset, &kva, size,
+ VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
+ VM_PROT_READ | VM_PROT_WRITE, 0);
+ if (rv == KERN_SUCCESS) {
+ rv = vm_map_wire(kernel_map, kva, kva + size,
+ VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
+ if (rv == KERN_SUCCESS) {
+ *memp = (void *)(kva + ofs);
+ return (0);
+ }
+ vm_map_remove(kernel_map, kva, kva + size);
+ } else
+ vm_object_deallocate(obj);
+
+ /* On failure, drop our mapping reference. */
+ VM_OBJECT_WLOCK(obj);
+ shmfd->shm_kmappings--;
+ VM_OBJECT_WUNLOCK(obj);
+
+ return (vm_mmap_to_errno(rv));
+}
+
+/*
+ * We require the caller to unmap the entire entry. This allows us to
+ * safely decrement shm_kmappings when a mapping is removed.
+ */
+int
+shm_unmap(struct file *fp, void *mem, size_t size)
+{
+ struct shmfd *shmfd;
+ vm_map_entry_t entry;
+ vm_offset_t kva, ofs;
+ vm_object_t obj;
+ vm_pindex_t pindex;
+ vm_prot_t prot;
+ boolean_t wired;
+ vm_map_t map;
+ int rv;
+
+ if (fp->f_type != DTYPE_SHM)
+ return (EINVAL);
+ shmfd = fp->f_data;
+ kva = (vm_offset_t)mem;
+ ofs = kva & PAGE_MASK;
+ kva = trunc_page(kva);
+ size = round_page(size + ofs);
+ map = kernel_map;
+ rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry,
+ &obj, &pindex, &prot, &wired);
+ if (rv != KERN_SUCCESS)
+ return (EINVAL);
+ if (entry->start != kva || entry->end != kva + size) {
+ vm_map_lookup_done(map, entry);
+ return (EINVAL);
+ }
+ vm_map_lookup_done(map, entry);
+ if (obj != shmfd->shm_object)
+ return (EINVAL);
+ vm_map_remove(map, kva, kva + size);
+ VM_OBJECT_WLOCK(obj);
+ KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped"));
+ shmfd->shm_kmappings--;
+ VM_OBJECT_WUNLOCK(obj);
+ return (0);
+}
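+
+/*
+ * For illustration, a kernel consumer of the two helpers above would be
+ * expected to follow roughly this pattern (a sketch only; "fp", "len" and
+ * "off" are hypothetical and error handling is elided):
+ *
+ *	void *mem;
+ *
+ *	if (shm_map(fp, len, off, &mem) == 0) {
+ *		... access the object through mem ...
+ *		shm_unmap(fp, mem, len);
+ *	}
+ */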
+
+void
+shm_path(struct shmfd *shmfd, char *path, size_t size)
+{
+
+ if (shmfd->shm_path == NULL)
+ return;
+ sx_slock(&shm_dict_lock);
+ if (shmfd->shm_path != NULL)
+ strlcpy(path, shmfd->shm_path, size);
+ sx_sunlock(&shm_dict_lock);
+}
diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c
new file mode 100644
index 0000000..9fa8ae0
--- /dev/null
+++ b/sys/kern/uipc_sockbuf.c
@@ -0,0 +1,1061 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_param.h"
+
+#include <sys/param.h>
+#include <sys/aio.h> /* for aio_swake proto */
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+
+/*
+ * Function pointer set by the AIO routines so that the socket buffer code
+ * can call back into the AIO module if it is loaded.
+ */
+void (*aio_swake)(struct socket *, struct sockbuf *);
+
+/*
+ * Primitive routines for operating on socket buffers
+ */
+
+u_long sb_max = SB_MAX;
+u_long sb_max_adj =
+ (quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */
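+
+/*
+ * A worked example of the adjustment above, assuming the common
+ * historical values MCLBYTES = 2048 and MSIZE = 256 (both are
+ * machine-dependent): sb_max_adj = sb_max * 2048 / 2304, i.e. roughly
+ * 8/9 of sb_max, which accounts for the mbuf header consumed along with
+ * each cluster of data.
+ */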
+
+static u_long sb_efficiency = 8; /* parameter for sbreserve() */
+
+static void sbdrop_internal(struct sockbuf *sb, int len);
+static void sbflush_internal(struct sockbuf *sb);
+
+/*
+ * Socantsendmore indicates that no more data will be sent on the socket; it
+ * is normally applied to a socket by the protocol code when the user
+ * informs the system that no more data is to be sent (the PRU_SHUTDOWN
+ * case). Socantrcvmore indicates that no more data will be
+ * received, and will normally be applied to the socket by a protocol when it
+ * detects that the peer will send no more data. Data queued for reading in
+ * the socket may yet be read.
+ */
+void
+socantsendmore_locked(struct socket *so)
+{
+
+ SOCKBUF_LOCK_ASSERT(&so->so_snd);
+
+ so->so_snd.sb_state |= SBS_CANTSENDMORE;
+ sowwakeup_locked(so);
+ mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
+}
+
+void
+socantsendmore(struct socket *so)
+{
+
+ SOCKBUF_LOCK(&so->so_snd);
+ socantsendmore_locked(so);
+ mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
+}
+
+void
+socantrcvmore_locked(struct socket *so)
+{
+
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+
+ so->so_rcv.sb_state |= SBS_CANTRCVMORE;
+ sorwakeup_locked(so);
+ mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
+}
+
+void
+socantrcvmore(struct socket *so)
+{
+
+ SOCKBUF_LOCK(&so->so_rcv);
+ socantrcvmore_locked(so);
+ mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
+}
+
+/*
+ * Wait for data to arrive at/drain from a socket buffer.
+ */
+int
+sbwait(struct sockbuf *sb)
+{
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ sb->sb_flags |= SB_WAIT;
+ return (msleep_sbt(&sb->sb_cc, &sb->sb_mtx,
+ (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
+ sb->sb_timeo, 0, 0));
+}
+
+int
+sblock(struct sockbuf *sb, int flags)
+{
+
+ KASSERT((flags & SBL_VALID) == flags,
+ ("sblock: flags invalid (0x%x)", flags));
+
+ if (flags & SBL_WAIT) {
+ if ((sb->sb_flags & SB_NOINTR) ||
+ (flags & SBL_NOINTR)) {
+ sx_xlock(&sb->sb_sx);
+ return (0);
+ }
+ return (sx_xlock_sig(&sb->sb_sx));
+ } else {
+ if (sx_try_xlock(&sb->sb_sx) == 0)
+ return (EWOULDBLOCK);
+ return (0);
+ }
+}
+
+void
+sbunlock(struct sockbuf *sb)
+{
+
+ sx_xunlock(&sb->sb_sx);
+}
+
+/*
+ * Wakeup processes waiting on a socket buffer. Do asynchronous notification
+ * via SIGIO if the socket has the SS_ASYNC flag set.
+ *
+ * Called with the socket buffer lock held; will release the lock by the end
+ * of the function. This allows the caller to acquire the socket buffer lock
+ * while testing for the need for various sorts of wakeup and hold it through
+ * to the point where it's no longer required. We currently hold the lock
+ * through calls out to other subsystems (with the exception of kqueue), and
+ * then release it to avoid lock order issues. It's not clear that's
+ * correct.
+ */
+void
+sowakeup(struct socket *so, struct sockbuf *sb)
+{
+ int ret;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ selwakeuppri(&sb->sb_sel, PSOCK);
+ if (!SEL_WAITING(&sb->sb_sel))
+ sb->sb_flags &= ~SB_SEL;
+ if (sb->sb_flags & SB_WAIT) {
+ sb->sb_flags &= ~SB_WAIT;
+ wakeup(&sb->sb_cc);
+ }
+ KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
+ if (sb->sb_upcall != NULL) {
+ ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT);
+ if (ret == SU_ISCONNECTED) {
+ KASSERT(sb == &so->so_rcv,
+ ("SO_SND upcall returned SU_ISCONNECTED"));
+ soupcall_clear(so, SO_RCV);
+ }
+ } else
+ ret = SU_OK;
+ if (sb->sb_flags & SB_AIO)
+ aio_swake(so, sb);
+ SOCKBUF_UNLOCK(sb);
+ if (ret == SU_ISCONNECTED)
+ soisconnected(so);
+ if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
+ pgsigio(&so->so_sigio, SIGIO, 0);
+ mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED);
+}
+
+/*
+ * Socket buffer (struct sockbuf) utility routines.
+ *
+ * Each socket contains two socket buffers: one for sending data and one for
+ * receiving data. Each buffer contains a queue of mbufs, information about
+ * the number of mbufs and amount of data in the queue, and other fields
+ * allowing select() statements and notification on data availability to be
+ * implemented.
+ *
+ * Data stored in a socket buffer is maintained as a list of records. Each
+ * record is a list of mbufs chained together with the m_next field. Records
+ * are chained together with the m_nextpkt field. The upper level routine
+ * soreceive() expects the following conventions to be observed when placing
+ * information in the receive buffer:
+ *
+ * 1. If the protocol requires each message be preceded by the sender's name,
+ * then a record containing that name must be present before any
+ * associated data (mbufs must be of type MT_SONAME).
+ * 2. If the protocol supports the exchange of ``access rights'' (really just
+ * additional data associated with the message), and there are ``rights''
+ * to be received, then a record containing this data should be present
+ * (mbufs must be of type MT_RIGHTS).
+ * 3. If a name or rights record exists, then it must be followed by a data
+ * record, perhaps of zero length.
+ *
+ * Before using a new socket structure it is first necessary to reserve
+ * buffer space to the socket, by calling sbreserve(). This should commit
+ * some of the available buffer space in the system buffer pool for the
+ * socket (currently, it does nothing but enforce limits). The space should
+ * be released by calling sbrelease() when the socket is destroyed.
+ */
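+
+/*
+ * To make the record layout above concrete: records hang off sb_mb and
+ * are chained through m_nextpkt, while the mbufs of a single record are
+ * chained through m_next.  A full traversal therefore looks roughly like
+ * this (an illustrative sketch; "rec" and "m" are local variables, not
+ * sockbuf fields):
+ *
+ *	for (rec = sb->sb_mb; rec != NULL; rec = rec->m_nextpkt)
+ *		for (m = rec; m != NULL; m = m->m_next)
+ *			... one mbuf of the current record ...
+ */
+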
+int
+soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
+{
+ struct thread *td = curthread;
+
+ SOCKBUF_LOCK(&so->so_snd);
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0)
+ goto bad;
+ if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0)
+ goto bad2;
+ if (so->so_rcv.sb_lowat == 0)
+ so->so_rcv.sb_lowat = 1;
+ if (so->so_snd.sb_lowat == 0)
+ so->so_snd.sb_lowat = MCLBYTES;
+ if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
+ so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (0);
+bad2:
+ sbrelease_locked(&so->so_snd, so);
+bad:
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (ENOBUFS);
+}
+
+static int
+sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS)
+{
+ int error = 0;
+ u_long tmp_sb_max = sb_max;
+
+ error = sysctl_handle_long(oidp, &tmp_sb_max, arg2, req);
+ if (error || !req->newptr)
+ return (error);
+ if (tmp_sb_max < MSIZE + MCLBYTES)
+ return (EINVAL);
+ sb_max = tmp_sb_max;
+ sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
+ return (0);
+}
+
+/*
+ * Allot mbufs to a sockbuf. Attempt to scale mbmax so that mbcnt doesn't
+ * become limiting if buffering efficiency is near the normal case.
+ */
+int
+sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
+ struct thread *td)
+{
+ rlim_t sbsize_limit;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ /*
+ * When a thread is passed, we take into account the thread's socket
+ * buffer size limit. The caller will generally pass curthread, but
+ * in the TCP input path, NULL will be passed to indicate that no
+ * appropriate thread resource limits are available. In that case,
+ * we don't apply a process limit.
+ */
+ if (cc > sb_max_adj)
+ return (0);
+ if (td != NULL) {
+ PROC_LOCK(td->td_proc);
+ sbsize_limit = lim_cur(td->td_proc, RLIMIT_SBSIZE);
+ PROC_UNLOCK(td->td_proc);
+ } else
+ sbsize_limit = RLIM_INFINITY;
+ if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc,
+ sbsize_limit))
+ return (0);
+ sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
+ if (sb->sb_lowat > sb->sb_hiwat)
+ sb->sb_lowat = sb->sb_hiwat;
+ return (1);
+}
+
+int
+sbreserve(struct sockbuf *sb, u_long cc, struct socket *so,
+ struct thread *td)
+{
+ int error;
+
+ SOCKBUF_LOCK(sb);
+ error = sbreserve_locked(sb, cc, so, td);
+ SOCKBUF_UNLOCK(sb);
+ return (error);
+}
+
+/*
+ * Free mbufs held by a socket, and reserved mbuf space.
+ */
+void
+sbrelease_internal(struct sockbuf *sb, struct socket *so)
+{
+
+ sbflush_internal(sb);
+ (void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
+ RLIM_INFINITY);
+ sb->sb_mbmax = 0;
+}
+
+void
+sbrelease_locked(struct sockbuf *sb, struct socket *so)
+{
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ sbrelease_internal(sb, so);
+}
+
+void
+sbrelease(struct sockbuf *sb, struct socket *so)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbrelease_locked(sb, so);
+ SOCKBUF_UNLOCK(sb);
+}
+
+void
+sbdestroy(struct sockbuf *sb, struct socket *so)
+{
+
+ sbrelease_internal(sb, so);
+}
+
+/*
+ * Routines to add and remove data from an mbuf queue.
+ *
+ * The routines sbappend() or sbappendrecord() are normally called to append
+ * new mbufs to a socket buffer, after checking that adequate space is
+ * available, comparing the function sbspace() with the amount of data to be
+ * added. sbappendrecord() differs from sbappend() in that data supplied is
+ * treated as the beginning of a new record. To place a sender's address,
+ * optional access rights, and data in a socket receive buffer,
+ * sbappendaddr() should be used. To place access rights and data in a
+ * socket receive buffer, sbappendrights() should be used. In either case,
+ * the new data begins a new record. Note that unlike sbappend() and
+ * sbappendrecord(), these routines check for the caller that there will be
+ * enough space to store the data. Each fails if there is not enough space,
+ * or if it cannot find mbufs to store additional information in.
+ *
+ * Reliable protocols may use the socket send buffer to hold data awaiting
+ * acknowledgement. Data is normally copied from a socket send buffer in a
+ * protocol with m_copy for output to a peer, and then removed from the
+ * socket buffer with sbdrop() or sbdroprecord() when the data is
+ * acknowledged by the peer.
+ */
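+
+/*
+ * Purely as an illustrative sketch of the calling pattern described above
+ * ("so", "sa", "m" and "acked" are hypothetical and locking/error detail
+ * is elided): a datagram protocol delivering a packet would do roughly
+ *
+ *	if (sbappendaddr(&so->so_rcv, sa, m, NULL) != 0)
+ *		sorwakeup(so);
+ *	else
+ *		m_freem(m);
+ *
+ * while a reliable stream protocol queues output with sbappend() or
+ * sbappendstream() and later trims acknowledged data with
+ *
+ *	sbdrop(&so->so_snd, acked);
+ */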
+#ifdef SOCKBUF_DEBUG
+void
+sblastrecordchk(struct sockbuf *sb, const char *file, int line)
+{
+ struct mbuf *m = sb->sb_mb;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ while (m && m->m_nextpkt)
+ m = m->m_nextpkt;
+
+ if (m != sb->sb_lastrecord) {
+ printf("%s: sb_mb %p sb_lastrecord %p last %p\n",
+ __func__, sb->sb_mb, sb->sb_lastrecord, m);
+ printf("packet chain:\n");
+ for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
+ printf("\t%p\n", m);
+ panic("%s from %s:%u", __func__, file, line);
+ }
+}
+
+void
+sblastmbufchk(struct sockbuf *sb, const char *file, int line)
+{
+ struct mbuf *m = sb->sb_mb;
+ struct mbuf *n;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ while (m && m->m_nextpkt)
+ m = m->m_nextpkt;
+
+ while (m && m->m_next)
+ m = m->m_next;
+
+ if (m != sb->sb_mbtail) {
+ printf("%s: sb_mb %p sb_mbtail %p last %p\n",
+ __func__, sb->sb_mb, sb->sb_mbtail, m);
+ printf("packet tree:\n");
+ for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
+ printf("\t");
+ for (n = m; n != NULL; n = n->m_next)
+ printf("%p ", n);
+ printf("\n");
+ }
+ panic("%s from %s:%u", __func__, file, line);
+ }
+}
+#endif /* SOCKBUF_DEBUG */
+
+#define SBLINKRECORD(sb, m0) do { \
+ SOCKBUF_LOCK_ASSERT(sb); \
+ if ((sb)->sb_lastrecord != NULL) \
+ (sb)->sb_lastrecord->m_nextpkt = (m0); \
+ else \
+ (sb)->sb_mb = (m0); \
+ (sb)->sb_lastrecord = (m0); \
+} while (/*CONSTCOND*/0)
+
+/*
+ * Append mbuf chain m to the last record in the socket buffer sb. The
+ * additional space associated with the mbuf chain is recorded in sb. Empty mbufs
+ * are discarded and mbufs are compacted where possible.
+ */
+void
+sbappend_locked(struct sockbuf *sb, struct mbuf *m)
+{
+ struct mbuf *n;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ if (m == 0)
+ return;
+
+ SBLASTRECORDCHK(sb);
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ do {
+ if (n->m_flags & M_EOR) {
+ sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
+ return;
+ }
+ } while (n->m_next && (n = n->m_next));
+ } else {
+ /*
+ * XXX Would like to simply use sb_mbtail here, but
+ * XXX I need to verify that I won't miss an EOR that
+ * XXX way.
+ */
+ if ((n = sb->sb_lastrecord) != NULL) {
+ do {
+ if (n->m_flags & M_EOR) {
+ sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
+ return;
+ }
+ } while (n->m_next && (n = n->m_next));
+ } else {
+ /*
+ * If this is the first record in the socket buffer,
+ * it's also the last record.
+ */
+ sb->sb_lastrecord = m;
+ }
+ }
+ sbcompress(sb, m, n);
+ SBLASTRECORDCHK(sb);
+}
+
+/*
+ * Append mbuf chain m to the last record in the socket buffer sb. The
+ * additional space associated with the mbuf chain is recorded in sb. Empty mbufs
+ * are discarded and mbufs are compacted where possible.
+ */
+void
+sbappend(struct sockbuf *sb, struct mbuf *m)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbappend_locked(sb, m);
+ SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * This version of sbappend() should only be used when the caller absolutely
+ * knows that there will never be more than one record in the socket buffer,
+ * that is, a stream protocol (such as TCP).
+ */
+void
+sbappendstream_locked(struct sockbuf *sb, struct mbuf *m)
+{
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ KASSERT(m->m_nextpkt == NULL,("sbappendstream 0"));
+ KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1"));
+
+ SBLASTMBUFCHK(sb);
+
+ /* Remove all packet headers and mbuf tags to get a pure data chain. */
+ m_demote(m, 1);
+
+ sbcompress(sb, m, sb->sb_mbtail);
+
+ sb->sb_lastrecord = sb->sb_mb;
+ SBLASTRECORDCHK(sb);
+}
+
+/*
+ * This version of sbappend() should only be used when the caller absolutely
+ * knows that there will never be more than one record in the socket buffer,
+ * that is, a stream protocol (such as TCP).
+ */
+void
+sbappendstream(struct sockbuf *sb, struct mbuf *m)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbappendstream_locked(sb, m);
+ SOCKBUF_UNLOCK(sb);
+}
+
+#ifdef SOCKBUF_DEBUG
+void
+sbcheck(struct sockbuf *sb)
+{
+ struct mbuf *m;
+ struct mbuf *n = 0;
+ u_long len = 0, mbcnt = 0;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ for (m = sb->sb_mb; m; m = n) {
+ n = m->m_nextpkt;
+ for (; m; m = m->m_next) {
+ len += m->m_len;
+ mbcnt += MSIZE;
+ if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
+ mbcnt += m->m_ext.ext_size;
+ }
+ }
+ if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
+ printf("cc %ld != %u || mbcnt %ld != %u\n", len, sb->sb_cc,
+ mbcnt, sb->sb_mbcnt);
+ panic("sbcheck");
+ }
+}
+#endif
+
+/*
+ * As above, except the mbuf chain begins a new record.
+ */
+void
+sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0)
+{
+ struct mbuf *m;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ if (m0 == 0)
+ return;
+ /*
+ * Put the first mbuf on the queue. Note this permits zero length
+ * records.
+ */
+ sballoc(sb, m0);
+ SBLASTRECORDCHK(sb);
+ SBLINKRECORD(sb, m0);
+ sb->sb_mbtail = m0;
+ m = m0->m_next;
+ m0->m_next = 0;
+ if (m && (m0->m_flags & M_EOR)) {
+ m0->m_flags &= ~M_EOR;
+ m->m_flags |= M_EOR;
+ }
+ /* always call sbcompress() so it can do SBLASTMBUFCHK() */
+ sbcompress(sb, m, m0);
+}
+
+/*
+ * As above, except the mbuf chain begins a new record.
+ */
+void
+sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbappendrecord_locked(sb, m0);
+ SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data to the
+ * receive queue of a socket. If present, m0 must include a packet header
+ * with total length. Returns 0 if no space in sockbuf or insufficient
+ * mbufs.
+ */
+int
+sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
+ struct mbuf *m0, struct mbuf *control)
+{
+ struct mbuf *m, *n, *nlast;
+ int space = asa->sa_len;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ if (m0 && (m0->m_flags & M_PKTHDR) == 0)
+ panic("sbappendaddr_locked");
+ if (m0)
+ space += m0->m_pkthdr.len;
+ space += m_length(control, &n);
+
+ if (space > sbspace(sb))
+ return (0);
+#if MSIZE <= 256
+ if (asa->sa_len > MLEN)
+ return (0);
+#endif
+ m = m_get(M_NOWAIT, MT_SONAME);
+ if (m == NULL)
+ return (0);
+ m->m_len = asa->sa_len;
+ bcopy(asa, mtod(m, caddr_t), asa->sa_len);
+ if (n)
+ n->m_next = m0; /* concatenate data to control */
+ else
+ control = m0;
+ m->m_next = control;
+ for (n = m; n->m_next != NULL; n = n->m_next)
+ sballoc(sb, n);
+ sballoc(sb, n);
+ nlast = n;
+ SBLINKRECORD(sb, m);
+
+ sb->sb_mbtail = nlast;
+ SBLASTMBUFCHK(sb);
+
+ SBLASTRECORDCHK(sb);
+ return (1);
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data to the
+ * receive queue of a socket. If present, m0 must include a packet header
+ * with total length. Returns 0 if no space in sockbuf or insufficient
+ * mbufs.
+ */
+int
+sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
+ struct mbuf *m0, struct mbuf *control)
+{
+ int retval;
+
+ SOCKBUF_LOCK(sb);
+ retval = sbappendaddr_locked(sb, asa, m0, control);
+ SOCKBUF_UNLOCK(sb);
+ return (retval);
+}
+
+int
+sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
+ struct mbuf *control)
+{
+ struct mbuf *m, *n, *mlast;
+ int space;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ if (control == 0)
+ panic("sbappendcontrol_locked");
+ space = m_length(control, &n) + m_length(m0, NULL);
+
+ if (space > sbspace(sb))
+ return (0);
+ n->m_next = m0; /* concatenate data to control */
+
+ SBLASTRECORDCHK(sb);
+
+ for (m = control; m->m_next; m = m->m_next)
+ sballoc(sb, m);
+ sballoc(sb, m);
+ mlast = m;
+ SBLINKRECORD(sb, control);
+
+ sb->sb_mbtail = mlast;
+ SBLASTMBUFCHK(sb);
+
+ SBLASTRECORDCHK(sb);
+ return (1);
+}
+
+int
+sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
+{
+ int retval;
+
+ SOCKBUF_LOCK(sb);
+ retval = sbappendcontrol_locked(sb, m0, control);
+ SOCKBUF_UNLOCK(sb);
+ return (retval);
+}
+
+/*
+ * Append the data in mbuf chain (m) into the socket buffer sb following mbuf
+ * (n). If (n) is NULL, the buffer is presumed empty.
+ *
+ * When the data is compressed, mbufs in the chain may be handled in one of
+ * three ways:
+ *
+ * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no
+ * record boundary, and no change in data type).
+ *
+ * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into
+ * an mbuf already in the socket buffer. This can occur if an
+ * appropriate mbuf exists, there is room, and no merging of data types
+ * will occur.
+ *
+ * (3) The mbuf may be appended to the end of the existing mbuf chain.
+ *
+ * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as
+ * end-of-record.
+ */
+void
+sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
+{
+ int eor = 0;
+ struct mbuf *o;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ while (m) {
+ eor |= m->m_flags & M_EOR;
+ if (m->m_len == 0 &&
+ (eor == 0 ||
+ (((o = m->m_next) || (o = n)) &&
+ o->m_type == m->m_type))) {
+ if (sb->sb_lastrecord == m)
+ sb->sb_lastrecord = m->m_next;
+ m = m_free(m);
+ continue;
+ }
+ if (n && (n->m_flags & M_EOR) == 0 &&
+ M_WRITABLE(n) &&
+ ((sb->sb_flags & SB_NOCOALESCE) == 0) &&
+ m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
+ m->m_len <= M_TRAILINGSPACE(n) &&
+ n->m_type == m->m_type) {
+ bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
+ (unsigned)m->m_len);
+ n->m_len += m->m_len;
+ sb->sb_cc += m->m_len;
+ if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
+ /* XXX: Probably don't need.*/
+ sb->sb_ctl += m->m_len;
+ m = m_free(m);
+ continue;
+ }
+ if (n)
+ n->m_next = m;
+ else
+ sb->sb_mb = m;
+ sb->sb_mbtail = m;
+ sballoc(sb, m);
+ n = m;
+ m->m_flags &= ~M_EOR;
+ m = m->m_next;
+ n->m_next = 0;
+ }
+ if (eor) {
+ KASSERT(n != NULL, ("sbcompress: eor && n == NULL"));
+ n->m_flags |= eor;
+ }
+ SBLASTMBUFCHK(sb);
+}
+
+/*
+ * Free all mbufs in a sockbuf. Check that all resources are reclaimed.
+ */
+static void
+sbflush_internal(struct sockbuf *sb)
+{
+
+ while (sb->sb_mbcnt) {
+ /*
+ * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
+ * we would loop forever. Panic instead.
+ */
+ if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
+ break;
+ sbdrop_internal(sb, (int)sb->sb_cc);
+ }
+ if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
+ panic("sbflush_internal: cc %u || mb %p || mbcnt %u",
+ sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt);
+}
+
+void
+sbflush_locked(struct sockbuf *sb)
+{
+
+ SOCKBUF_LOCK_ASSERT(sb);
+ sbflush_internal(sb);
+}
+
+void
+sbflush(struct sockbuf *sb)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbflush_locked(sb);
+ SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * Drop data from (the front of) a sockbuf.
+ */
+static void
+sbdrop_internal(struct sockbuf *sb, int len)
+{
+ struct mbuf *m;
+ struct mbuf *next;
+
+ next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
+ while (len > 0) {
+ if (m == 0) {
+ if (next == 0)
+ panic("sbdrop");
+ m = next;
+ next = m->m_nextpkt;
+ continue;
+ }
+ if (m->m_len > len) {
+ m->m_len -= len;
+ m->m_data += len;
+ sb->sb_cc -= len;
+ if (sb->sb_sndptroff != 0)
+ sb->sb_sndptroff -= len;
+ if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
+ sb->sb_ctl -= len;
+ break;
+ }
+ len -= m->m_len;
+ sbfree(sb, m);
+ m = m_free(m);
+ }
+ while (m && m->m_len == 0) {
+ sbfree(sb, m);
+ m = m_free(m);
+ }
+ if (m) {
+ sb->sb_mb = m;
+ m->m_nextpkt = next;
+ } else
+ sb->sb_mb = next;
+ /*
+ * First part is an inline SB_EMPTY_FIXUP(). Second part makes sure
+ * sb_lastrecord is up-to-date if we dropped part of the last record.
+ */
+ m = sb->sb_mb;
+ if (m == NULL) {
+ sb->sb_mbtail = NULL;
+ sb->sb_lastrecord = NULL;
+ } else if (m->m_nextpkt == NULL) {
+ sb->sb_lastrecord = m;
+ }
+}
+
+/*
+ * Drop data from (the front of) a sockbuf.
+ */
+void
+sbdrop_locked(struct sockbuf *sb, int len)
+{
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ sbdrop_internal(sb, len);
+}
+
+void
+sbdrop(struct sockbuf *sb, int len)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbdrop_locked(sb, len);
+ SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * Maintain a pointer and offset pair into the socket buffer mbuf chain to
+ * avoid traversal of the entire socket buffer for larger offsets.
+ */
+struct mbuf *
+sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff)
+{
+ struct mbuf *m, *ret;
+
+ KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
+ KASSERT(off + len <= sb->sb_cc, ("%s: beyond sb", __func__));
+ KASSERT(sb->sb_sndptroff <= sb->sb_cc, ("%s: sndptroff broken", __func__));
+
+ /*
+ * Is off below stored offset? Happens on retransmits.
+ * Just return, we can't help here.
+ */
+ if (sb->sb_sndptroff > off) {
+ *moff = off;
+ return (sb->sb_mb);
+ }
+
+ /* Return closest mbuf in chain for current offset. */
+ *moff = off - sb->sb_sndptroff;
+ m = ret = sb->sb_sndptr ? sb->sb_sndptr : sb->sb_mb;
+ if (*moff == m->m_len) {
+ *moff = 0;
+ sb->sb_sndptroff += m->m_len;
+ m = ret = m->m_next;
+ KASSERT(ret->m_len > 0,
+ ("mbuf %p in sockbuf %p chain has no valid data", ret, sb));
+ }
+
+ /* Advance by len to be as close as possible for the next transmit. */
+ for (off = off - sb->sb_sndptroff + len - 1;
+ off > 0 && m != NULL && off >= m->m_len;
+ m = m->m_next) {
+ sb->sb_sndptroff += m->m_len;
+ off -= m->m_len;
+ }
+ if (off > 0 && m == NULL)
+ panic("%s: sockbuf %p and mbuf %p clashing", __func__, sb, ret);
+ sb->sb_sndptr = m;
+
+ return (ret);
+}
+
+/*
+ * Drop a record off the front of a sockbuf and move the next record to the
+ * front.
+ */
+void
+sbdroprecord_locked(struct sockbuf *sb)
+{
+ struct mbuf *m;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ m = sb->sb_mb;
+ if (m) {
+ sb->sb_mb = m->m_nextpkt;
+ do {
+ sbfree(sb, m);
+ m = m_free(m);
+ } while (m);
+ }
+ SB_EMPTY_FIXUP(sb);
+}
+
+/*
+ * Drop a record off the front of a sockbuf and move the next record to the
+ * front.
+ */
+void
+sbdroprecord(struct sockbuf *sb)
+{
+
+ SOCKBUF_LOCK(sb);
+ sbdroprecord_locked(sb);
+ SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * Create a "control" mbuf containing the specified data with the specified
+ * type for presentation on a socket buffer.
+ */
+struct mbuf *
+sbcreatecontrol(caddr_t p, int size, int type, int level)
+{
+ struct cmsghdr *cp;
+ struct mbuf *m;
+
+ if (CMSG_SPACE((u_int)size) > MCLBYTES)
+ return ((struct mbuf *) NULL);
+ if (CMSG_SPACE((u_int)size) > MLEN)
+ m = m_getcl(M_NOWAIT, MT_CONTROL, 0);
+ else
+ m = m_get(M_NOWAIT, MT_CONTROL);
+ if (m == NULL)
+ return ((struct mbuf *) NULL);
+ cp = mtod(m, struct cmsghdr *);
+ m->m_len = 0;
+ KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
+ ("sbcreatecontrol: short mbuf"));
+ if (p != NULL)
+ (void)memcpy(CMSG_DATA(cp), p, size);
+ m->m_len = CMSG_SPACE(size);
+ cp->cmsg_len = CMSG_LEN(size);
+ cp->cmsg_level = level;
+ cp->cmsg_type = type;
+ return (m);
+}
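+
+/*
+ * Illustrative use of sbcreatecontrol(): a protocol might attach a receive
+ * timestamp to a datagram roughly as follows (a sketch; "tv" is a local
+ * struct timeval and the subsequent append/cleanup is elided):
+ *
+ *	microtime(&tv);
+ *	control = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
+ *	    SCM_TIMESTAMP, SOL_SOCKET);
+ *	if (control != NULL)
+ *		... pass control to sbappendaddr() along with the data ...
+ */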
+
+/*
+ * This does the same for socket buffers that sotoxsocket does for sockets:
+ * generate a user-format data structure describing the socket buffer. Note
+ * that the xsockbuf structure, since it is always embedded in a socket, does
+ * not include a self pointer nor a length. We make this entry point public
+ * in case some other mechanism needs it.
+ */
+void
+sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
+{
+
+ xsb->sb_cc = sb->sb_cc;
+ xsb->sb_hiwat = sb->sb_hiwat;
+ xsb->sb_mbcnt = sb->sb_mbcnt;
+ xsb->sb_mcnt = sb->sb_mcnt;
+ xsb->sb_ccnt = sb->sb_ccnt;
+ xsb->sb_mbmax = sb->sb_mbmax;
+ xsb->sb_lowat = sb->sb_lowat;
+ xsb->sb_flags = sb->sb_flags;
+ xsb->sb_timeo = sb->sb_timeo;
+}
+
+/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
+static int dummy;
+SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
+SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW,
+ &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size");
+SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
+ &sb_efficiency, 0, "Socket buffer size waste factor");
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
new file mode 100644
index 0000000..639d865
--- /dev/null
+++ b/sys/kern/uipc_socket.c
@@ -0,0 +1,3752 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ * The Regents of the University of California.
+ * Copyright (c) 2004 The FreeBSD Foundation
+ * Copyright (c) 2004-2008 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
+ */
+
+/*
+ * Comments on the socket life cycle:
+ *
+ * soalloc() sets up socket layer state for a socket, called only by
+ * socreate() and sonewconn(). Socket layer private.
+ *
+ * sodealloc() tears down socket layer state for a socket, called only by
+ * sofree() and sonewconn(). Socket layer private.
+ *
+ * pru_attach() associates protocol layer state with an allocated socket;
+ * called only once, may fail, aborting socket allocation. This is called
+ * from socreate() and sonewconn(). Socket layer private.
+ *
+ * pru_detach() disassociates protocol layer state from an attached socket,
+ * and will be called exactly once for sockets in which pru_attach() has
+ * been successfully called. If pru_attach() returned an error,
+ * pru_detach() will not be called. Socket layer private.
+ *
+ * pru_abort() and pru_close() notify the protocol layer that the last
+ * consumer of a socket is starting to tear down the socket, and that the
+ * protocol should terminate the connection. Historically, pru_abort() also
+ * detached protocol state from the socket state, but this is no longer the
+ * case.
+ *
+ * socreate() creates a socket and attaches protocol state. This is a public
+ * interface that may be used by socket layer consumers to create new
+ * sockets.
+ *
+ * sonewconn() creates a socket and attaches protocol state. This is a
+ * public interface that may be used by protocols to create new sockets when
+ * a new connection is received and will be available for accept() on a
+ * listen socket.
+ *
+ * soclose() destroys a socket after possibly waiting for it to disconnect.
+ * This is a public interface that socket consumers should use to close and
+ * release a socket when done with it.
+ *
+ * soabort() destroys a socket without waiting for it to disconnect (used
+ * only for incoming connections that are already partially or fully
+ * connected). This is used internally by the socket layer when clearing
+ * listen socket queues (due to overflow or close on the listen socket), but
+ * is also a public interface protocols may use to abort connections in
+ * their incomplete listen queues should they no longer be required. Sockets
+ * placed in completed connection listen queues should not be aborted for
+ * reasons described in the comment above the soclose() implementation. This
+ * is not a general purpose close routine, and except in the specific
+ * circumstances described here, should not be used.
+ *
+ * sofree() will free a socket and its protocol state if all references on
+ * the socket have been released, and is the public interface to attempt to
+ * free a socket when a reference is removed. This is a socket layer private
+ * interface.
+ *
+ * NOTE: In addition to socreate() and soclose(), which provide a single
+ * socket reference to the consumer to be managed as required, there are two
+ * calls to explicitly manage socket references, soref(), and sorele().
+ * Currently, these are generally required only when transitioning a socket
+ * from a listen queue to a file descriptor, in order to prevent garbage
+ * collection of the socket at an untimely moment. For a number of reasons,
+ * these interfaces are not preferred, and should be avoided.
+ *
+ * NOTE: With regard to VNETs the general rule is that callers do not set
+ * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
+ * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
+ * and sorflush(), which are usually called from a pre-set VNET context.
+ * sopoll() currently does not need a VNET context to be set.
+ */
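+
+/*
+ * A minimal sketch of the public create/close pattern described above, as
+ * an in-kernel consumer might use it (the UDP/IPv4 parameters are only an
+ * example and error handling past the create step is elided):
+ *
+ *	struct socket *so;
+ *	int error;
+ *
+ *	error = socreate(AF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
+ *	    curthread->td_ucred, curthread);
+ *	if (error == 0) {
+ *		... sobind()/soconnect()/sosend() as required ...
+ *		soclose(so);
+ *	}
+ */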
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_zero.h"
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mac.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/domain.h>
+#include <sys/file.h> /* for struct knote */
+#include <sys/kernel.h>
+#include <sys/event.h>
+#include <sys/eventhandler.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/resourcevar.h>
+#include <net/route.h>
+#include <sys/signalvar.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <sys/jail.h>
+#include <sys/syslog.h>
+#include <netinet/in.h>
+
+#include <net/vnet.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/uma.h>
+
+#ifdef COMPAT_FREEBSD32
+#include <sys/mount.h>
+#include <sys/sysent.h>
+#include <compat/freebsd32/freebsd32.h>
+#endif
+
+static int soreceive_rcvoob(struct socket *so, struct uio *uio,
+ int flags);
+
+static void filt_sordetach(struct knote *kn);
+static int filt_soread(struct knote *kn, long hint);
+static void filt_sowdetach(struct knote *kn);
+static int filt_sowrite(struct knote *kn, long hint);
+static int filt_solisten(struct knote *kn, long hint);
+
+static struct filterops solisten_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_sordetach,
+ .f_event = filt_solisten,
+};
+static struct filterops soread_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_sordetach,
+ .f_event = filt_soread,
+};
+static struct filterops sowrite_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_sowdetach,
+ .f_event = filt_sowrite,
+};
+
+so_gen_t so_gencnt; /* generation count for sockets */
+
+MALLOC_DEFINE(M_SONAME, "soname", "socket name");
+MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
+
+#define VNET_SO_ASSERT(so) \
+ VNET_ASSERT(curvnet != NULL, \
+ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
+
+/*
+ * Limit on the number of connections in the listen queue waiting
+ * for accept(2).
+ * NB: The original sysctl somaxconn is still available but hidden
+ * to prevent confusion about the actual purpose of this number.
+ */
+static int somaxconn = SOMAXCONN;
+
+static int
+sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ int val;
+
+ val = somaxconn;
+ error = sysctl_handle_int(oidp, &val, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ if (val < 1 || val > USHRT_MAX)
+ return (EINVAL);
+
+ somaxconn = val;
+ return (0);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW,
+ 0, sizeof(int), sysctl_somaxconn, "I",
+ "Maximum listen socket pending connection accept queue size");
+SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
+ CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP,
+ 0, sizeof(int), sysctl_somaxconn, "I",
+ "Maximum listen socket pending connection accept queue size (compat)");
+
+static int numopensockets;
+SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
+ &numopensockets, 0, "Number of open sockets");
+
+#if defined(SOCKET_SEND_COW) || defined(SOCKET_RECV_PFLIP)
+SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
+ "Zero copy controls");
+#ifdef SOCKET_RECV_PFLIP
+int so_zero_copy_receive = 1;
+SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
+ &so_zero_copy_receive, 0, "Enable zero copy receive");
+#endif
+#ifdef SOCKET_SEND_COW
+int so_zero_copy_send = 1;
+SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
+ &so_zero_copy_send, 0, "Enable zero copy send");
+#endif /* SOCKET_SEND_COW */
+#endif /* SOCKET_SEND_COW || SOCKET_RECV_PFLIP */
+
+/*
+ * accept_mtx locks down per-socket fields relating to accept queues. See
+ * socketvar.h for an annotation of the protected fields of struct socket.
+ */
+struct mtx accept_mtx;
+MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
+
+/*
+ * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
+ * so_gencnt field.
+ */
+static struct mtx so_global_mtx;
+MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
+
+/*
+ * General IPC sysctl name space, used by sockets and a variety of other IPC
+ * types.
+ */
+SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
+
+/*
+ * Initialize the socket subsystem and set up the socket
+ * memory allocator.
+ */
+static uma_zone_t socket_zone;
+int maxsockets;
+
+static void
+socket_zone_change(void *tag)
+{
+
+ maxsockets = uma_zone_set_max(socket_zone, maxsockets);
+}
+
+static void
+socket_init(void *tag)
+{
+
+ socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ maxsockets = uma_zone_set_max(socket_zone, maxsockets);
+ uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
+ EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
+ EVENTHANDLER_PRI_FIRST);
+}
+SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
+
+/*
+ * Initialise maxsockets. This SYSINIT must be run after
+ * tunable_mbinit().
+ */
+static void
+init_maxsockets(void *ignored)
+{
+
+ TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
+ maxsockets = imax(maxsockets, maxfiles);
+}
+SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
+
+/*
+ * Sysctl to get and set the maximum global sockets limit. Notify protocols
+ * of the change so that they can update their dependent limits as required.
+ */
+static int
+sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
+{
+ int error, newmaxsockets;
+
+ newmaxsockets = maxsockets;
+ error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
+ if (error == 0 && req->newptr) {
+ if (newmaxsockets > maxsockets &&
+ newmaxsockets <= maxfiles) {
+ maxsockets = newmaxsockets;
+ EVENTHANDLER_INVOKE(maxsockets_change);
+ } else
+ error = EINVAL;
+ }
+ return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
+ &maxsockets, 0, sysctl_maxsockets, "IU",
+ "Maximum number of sockets avaliable");
+
+/*
+ * Socket operation routines. These routines are called by the routines in
+ * sys_socket.c or from a system process, and implement the semantics of
+ * socket operations by switching out to the protocol specific routines.
+ */
+
+/*
+ * Get a socket structure from our zone, and initialize it. Note that it
+ * would probably be better to allocate socket and PCB at the same time, but
+ * I'm not convinced that all the protocols can be easily modified to do
+ * this.
+ *
+ * soalloc() returns a socket with a ref count of 0.
+ */
+static struct socket *
+soalloc(struct vnet *vnet)
+{
+ struct socket *so;
+
+ so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
+ if (so == NULL)
+ return (NULL);
+#ifdef MAC
+ if (mac_socket_init(so, M_NOWAIT) != 0) {
+ uma_zfree(socket_zone, so);
+ return (NULL);
+ }
+#endif
+ SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
+ SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
+ sx_init(&so->so_snd.sb_sx, "so_snd_sx");
+ sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
+ TAILQ_INIT(&so->so_aiojobq);
+ mtx_lock(&so_global_mtx);
+ so->so_gencnt = ++so_gencnt;
+ ++numopensockets;
+#ifdef VIMAGE
+ VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
+ __func__, __LINE__, so));
+ vnet->vnet_sockcnt++;
+ so->so_vnet = vnet;
+#endif
+ mtx_unlock(&so_global_mtx);
+ return (so);
+}
+
+/*
+ * Free the storage associated with a socket at the socket layer, tear down
+ * locks, labels, etc. All protocol state is assumed already to have been
+ * torn down (and possibly never set up) by the caller.
+ */
+static void
+sodealloc(struct socket *so)
+{
+
+ KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
+ KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
+
+ mtx_lock(&so_global_mtx);
+ so->so_gencnt = ++so_gencnt;
+ --numopensockets; /* Could be below, but faster here. */
+#ifdef VIMAGE
+ VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
+ __func__, __LINE__, so));
+ so->so_vnet->vnet_sockcnt--;
+#endif
+ mtx_unlock(&so_global_mtx);
+ if (so->so_rcv.sb_hiwat)
+ (void)chgsbsize(so->so_cred->cr_uidinfo,
+ &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
+ if (so->so_snd.sb_hiwat)
+ (void)chgsbsize(so->so_cred->cr_uidinfo,
+ &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
+#ifdef INET
+ /* Remove accept filter if one is present. */
+ if (so->so_accf != NULL)
+ do_setopt_accept_filter(so, NULL);
+#endif
+#ifdef MAC
+ mac_socket_destroy(so);
+#endif
+ crfree(so->so_cred);
+ sx_destroy(&so->so_snd.sb_sx);
+ sx_destroy(&so->so_rcv.sb_sx);
+ SOCKBUF_LOCK_DESTROY(&so->so_snd);
+ SOCKBUF_LOCK_DESTROY(&so->so_rcv);
+ uma_zfree(socket_zone, so);
+}
+
+/*
+ * socreate returns a socket with a ref count of 1. The socket should be
+ * closed with soclose().
+ */
+int
+socreate(int dom, struct socket **aso, int type, int proto,
+ struct ucred *cred, struct thread *td)
+{
+ struct protosw *prp;
+ struct socket *so;
+ int error;
+
+ if (proto)
+ prp = pffindproto(dom, proto, type);
+ else
+ prp = pffindtype(dom, type);
+
+ if (prp == NULL) {
+ /* No support for domain. */
+ if (pffinddomain(dom) == NULL)
+ return (EAFNOSUPPORT);
+ /* No support for socket type. */
+ if (proto == 0 && type != 0)
+ return (EPROTOTYPE);
+ return (EPROTONOSUPPORT);
+ }
+ if (prp->pr_usrreqs->pru_attach == NULL ||
+ prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
+ return (EPROTONOSUPPORT);
+
+ if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
+ return (EPROTONOSUPPORT);
+
+ if (prp->pr_type != type)
+ return (EPROTOTYPE);
+ so = soalloc(CRED_TO_VNET(cred));
+ if (so == NULL)
+ return (ENOBUFS);
+
+ TAILQ_INIT(&so->so_incomp);
+ TAILQ_INIT(&so->so_comp);
+ so->so_type = type;
+ so->so_cred = crhold(cred);
+ if ((prp->pr_domain->dom_family == PF_INET) ||
+ (prp->pr_domain->dom_family == PF_INET6) ||
+ (prp->pr_domain->dom_family == PF_ROUTE))
+ so->so_fibnum = td->td_proc->p_fibnum;
+ else
+ so->so_fibnum = 0;
+ so->so_proto = prp;
+#ifdef MAC
+ mac_socket_create(cred, so);
+#endif
+ knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
+ knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
+ so->so_count = 1;
+ /*
+ * Auto-sizing of socket buffers is managed by the protocols and
+ * the appropriate flags must be set in the pru_attach function.
+ */
+ CURVNET_SET(so->so_vnet);
+ error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
+ CURVNET_RESTORE();
+ if (error) {
+ KASSERT(so->so_count == 1, ("socreate: so_count %d",
+ so->so_count));
+ so->so_count = 0;
+ sodealloc(so);
+ return (error);
+ }
+ *aso = so;
+ return (0);
+}
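+
+/*
+ * Example (illustrative sketch only): a typical in-kernel consumer pairs
+ * socreate() with soclose(), e.g. to open a UDP socket on behalf of the
+ * current thread 'td':
+ *
+ *	struct socket *so;
+ *	int error;
+ *
+ *	error = socreate(AF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
+ *	    td->td_ucred, td);
+ *	if (error != 0)
+ *		return (error);
+ *	...
+ *	soclose(so);
+ */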
+
+#ifdef REGRESSION
+static int regression_sonewconn_earlytest = 1;
+SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
+ &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
+#endif
+
+/*
+ * When an attempt at a new connection is noted on a socket which accepts
+ * connections, sonewconn is called. If the connection is possible (subject
+ * to space constraints, etc.) then we allocate a new structure, properly
+ * linked into the data structure of the original socket, and return this.
+ * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
+ *
+ * Note: the ref count on the socket is 0 on return.
+ */
+struct socket *
+sonewconn(struct socket *head, int connstatus)
+{
+ struct socket *so;
+ int over;
+
+ ACCEPT_LOCK();
+ over = (head->so_qlen > 3 * head->so_qlimit / 2);
+ ACCEPT_UNLOCK();
+#ifdef REGRESSION
+ if (regression_sonewconn_earlytest && over) {
+#else
+ if (over) {
+#endif
+ log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
+ "%i already in queue awaiting acceptance\n",
+ __func__, head->so_pcb, head->so_qlen);
+ return (NULL);
+ }
+ VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
+ __func__, __LINE__, head));
+ so = soalloc(head->so_vnet);
+ if (so == NULL) {
+ log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
+ "limit reached or out of memory\n",
+ __func__, head->so_pcb);
+ return (NULL);
+ }
+ if ((head->so_options & SO_ACCEPTFILTER) != 0)
+ connstatus = 0;
+ so->so_head = head;
+ so->so_type = head->so_type;
+ so->so_options = head->so_options &~ SO_ACCEPTCONN;
+ so->so_linger = head->so_linger;
+ so->so_state = head->so_state | SS_NOFDREF;
+ so->so_fibnum = head->so_fibnum;
+ so->so_proto = head->so_proto;
+ so->so_cred = crhold(head->so_cred);
+#ifdef MAC
+ mac_socket_newconn(head, so);
+#endif
+ knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
+ knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
+ VNET_SO_ASSERT(head);
+ if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
+ sodealloc(so);
+ log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
+ __func__, head->so_pcb);
+ return (NULL);
+ }
+ if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+ sodealloc(so);
+ log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
+ __func__, head->so_pcb);
+ return (NULL);
+ }
+ so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
+ so->so_snd.sb_lowat = head->so_snd.sb_lowat;
+ so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
+ so->so_snd.sb_timeo = head->so_snd.sb_timeo;
+ so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
+ so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
+ so->so_state |= connstatus;
+ ACCEPT_LOCK();
+ /*
+ * The accept socket may be tearing down but we just
+ * won a race on the ACCEPT_LOCK.
+ * However, if sctp_peeloff() is called on a 1-to-many
+ * style socket, the SO_ACCEPTCONN doesn't need to be set.
+ */
+ if (!(head->so_options & SO_ACCEPTCONN) &&
+ ((head->so_proto->pr_protocol != IPPROTO_SCTP) ||
+ (head->so_type != SOCK_SEQPACKET))) {
+ SOCK_LOCK(so);
+ so->so_head = NULL;
+ sofree(so); /* NB: returns ACCEPT_UNLOCK'ed. */
+ return (NULL);
+ }
+ if (connstatus) {
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ so->so_qstate |= SQ_COMP;
+ head->so_qlen++;
+ } else {
+ /*
+ * Keep removing sockets from the head until there's room for
+ * us to insert on the tail. In pre-locking revisions, this
+ * was a simple if(), but as we could be racing with other
+ * threads and soabort() requires dropping locks, we must
+ * loop waiting for the condition to be true.
+ */
+ while (head->so_incqlen > head->so_qlimit) {
+ struct socket *sp;
+ sp = TAILQ_FIRST(&head->so_incomp);
+ TAILQ_REMOVE(&head->so_incomp, sp, so_list);
+ head->so_incqlen--;
+ sp->so_qstate &= ~SQ_INCOMP;
+ sp->so_head = NULL;
+ ACCEPT_UNLOCK();
+ soabort(sp);
+ ACCEPT_LOCK();
+ }
+ TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
+ so->so_qstate |= SQ_INCOMP;
+ head->so_incqlen++;
+ }
+ ACCEPT_UNLOCK();
+ if (connstatus) {
+ sorwakeup(head);
+ wakeup_one(&head->so_timeo);
+ }
+ return (so);
+}
+
+int
+sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error;
+
+ CURVNET_SET(so->so_vnet);
+ error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
+ CURVNET_RESTORE();
+ return (error);
+}
+
+int
+sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error;
+
+ CURVNET_SET(so->so_vnet);
+ error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);
+ CURVNET_RESTORE();
+ return (error);
+}
+
+/*
+ * solisten() transitions a socket from a non-listening state to a listening
+ * state, but can also be used to update the listen queue depth on an
+ * existing listen socket. The protocol will call back into the sockets
+ * layer using solisten_proto_check() and solisten_proto() to check and set
+ * socket-layer listen state. Call backs are used so that the protocol can
+ * acquire both protocol and socket layer locks in whatever order is required
+ * by the protocol.
+ *
+ * Protocol implementors are advised to hold the socket lock across the
+ * socket-layer test and set to avoid races at the socket layer.
+ */
+int
+solisten(struct socket *so, int backlog, struct thread *td)
+{
+ int error;
+
+ CURVNET_SET(so->so_vnet);
+ error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
+ CURVNET_RESTORE();
+ return (error);
+}
+
+int
+solisten_proto_check(struct socket *so)
+{
+
+ SOCK_LOCK_ASSERT(so);
+
+ if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
+ SS_ISDISCONNECTING))
+ return (EINVAL);
+ return (0);
+}
+
+void
+solisten_proto(struct socket *so, int backlog)
+{
+
+ SOCK_LOCK_ASSERT(so);
+
+ if (backlog < 0 || backlog > somaxconn)
+ backlog = somaxconn;
+ so->so_qlimit = backlog;
+ so->so_options |= SO_ACCEPTCONN;
+}
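+
+/*
+ * Illustrative sketch only (PROTO_LOCK/PROTO_UNLOCK stand in for a
+ * protocol's own locking and are not real macros): a pru_listen
+ * implementation typically brackets the socket-layer calls like this:
+ *
+ *	PROTO_LOCK(pcb);
+ *	SOCK_LOCK(so);
+ *	error = solisten_proto_check(so);
+ *	if (error == 0)
+ *		solisten_proto(so, backlog);
+ *	SOCK_UNLOCK(so);
+ *	PROTO_UNLOCK(pcb);
+ */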
+
+/*
+ * Evaluate the reference count and named references on a socket; if no
+ * references remain, free it. This should be called whenever a reference is
+ * released, such as in sorele(), but also when named reference flags are
+ * cleared in socket or protocol code.
+ *
+ * sofree() will free the socket if:
+ *
+ * - There are no outstanding file descriptor references or related consumers
+ * (so_count == 0).
+ *
+ * - The socket has been closed by user space, if ever open (SS_NOFDREF).
+ *
+ * - The protocol does not have an outstanding strong reference on the socket
+ * (SS_PROTOREF).
+ *
+ * - The socket is not on a completed connection queue, where a process would
+ *   already have been notified of its presence. If it were removed from such
+ *   a queue, the user process could block in accept() despite select() having
+ *   said the socket was ready.
+ */
+void
+sofree(struct socket *so)
+{
+ struct protosw *pr = so->so_proto;
+ struct socket *head;
+
+ ACCEPT_LOCK_ASSERT();
+ SOCK_LOCK_ASSERT(so);
+
+ if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
+ (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
+ SOCK_UNLOCK(so);
+ ACCEPT_UNLOCK();
+ return;
+ }
+
+ head = so->so_head;
+ if (head != NULL) {
+ KASSERT((so->so_qstate & SQ_COMP) != 0 ||
+ (so->so_qstate & SQ_INCOMP) != 0,
+ ("sofree: so_head != NULL, but neither SQ_COMP nor "
+ "SQ_INCOMP"));
+ KASSERT((so->so_qstate & SQ_COMP) == 0 ||
+ (so->so_qstate & SQ_INCOMP) == 0,
+ ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
+ TAILQ_REMOVE(&head->so_incomp, so, so_list);
+ head->so_incqlen--;
+ so->so_qstate &= ~SQ_INCOMP;
+ so->so_head = NULL;
+ }
+ KASSERT((so->so_qstate & SQ_COMP) == 0 &&
+ (so->so_qstate & SQ_INCOMP) == 0,
+ ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
+ so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
+ if (so->so_options & SO_ACCEPTCONN) {
+ KASSERT((TAILQ_EMPTY(&so->so_comp)),
+ ("sofree: so_comp populated"));
+ KASSERT((TAILQ_EMPTY(&so->so_incomp)),
+ ("sofree: so_incomp populated"));
+ }
+ SOCK_UNLOCK(so);
+ ACCEPT_UNLOCK();
+
+ VNET_SO_ASSERT(so);
+ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
+ (*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
+ if (pr->pr_usrreqs->pru_detach != NULL)
+ (*pr->pr_usrreqs->pru_detach)(so);
+
+ /*
+ * From this point on, we assume that no other references to this
+ * socket exist anywhere else in the stack. Therefore, no locks need
+ * to be acquired or held.
+ *
+ * We used to do a lot of socket buffer and socket locking here, as
+ * well as invoke sorflush() and perform wakeups. The direct calls to
+ * dom_dispose() and sbrelease_internal() are an inlining of what was
+ * necessary from sorflush().
+ *
+ * Notice that the socket buffer and kqueue state are torn down
+ * before calling pru_detach. This means that protocols should not
+ * assume they can perform socket wakeups, etc., in their detach code.
+ */
+ sbdestroy(&so->so_snd, so);
+ sbdestroy(&so->so_rcv, so);
+ seldrain(&so->so_snd.sb_sel);
+ seldrain(&so->so_rcv.sb_sel);
+ knlist_destroy(&so->so_rcv.sb_sel.si_note);
+ knlist_destroy(&so->so_snd.sb_sel.si_note);
+ sodealloc(so);
+}
+
+/*
+ * Close a socket on last file table reference removal. Initiate disconnect
+ * if connected. Free socket when disconnect complete.
+ *
+ * This function will sorele() the socket. Note that soclose() may be called
+ * prior to the ref count reaching zero. The actual socket structure will
+ * not be freed until the ref count reaches zero.
+ */
+int
+soclose(struct socket *so)
+{
+ int error = 0;
+
+ KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
+
+ CURVNET_SET(so->so_vnet);
+ funsetown(&so->so_sigio);
+ if (so->so_state & SS_ISCONNECTED) {
+ if ((so->so_state & SS_ISDISCONNECTING) == 0) {
+ error = sodisconnect(so);
+ if (error) {
+ if (error == ENOTCONN)
+ error = 0;
+ goto drop;
+ }
+ }
+ if (so->so_options & SO_LINGER) {
+ if ((so->so_state & SS_ISDISCONNECTING) &&
+ (so->so_state & SS_NBIO))
+ goto drop;
+ while (so->so_state & SS_ISCONNECTED) {
+ error = tsleep(&so->so_timeo,
+ PSOCK | PCATCH, "soclos",
+ so->so_linger * hz);
+ if (error)
+ break;
+ }
+ }
+ }
+
+drop:
+ if (so->so_proto->pr_usrreqs->pru_close != NULL)
+ (*so->so_proto->pr_usrreqs->pru_close)(so);
+ ACCEPT_LOCK();
+ if (so->so_options & SO_ACCEPTCONN) {
+ struct socket *sp;
+ /*
+ * Prevent new additions to the accept queues due
+ * to ACCEPT_LOCK races while we are draining them.
+ */
+ so->so_options &= ~SO_ACCEPTCONN;
+ while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
+ TAILQ_REMOVE(&so->so_incomp, sp, so_list);
+ so->so_incqlen--;
+ sp->so_qstate &= ~SQ_INCOMP;
+ sp->so_head = NULL;
+ ACCEPT_UNLOCK();
+ soabort(sp);
+ ACCEPT_LOCK();
+ }
+ while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
+ TAILQ_REMOVE(&so->so_comp, sp, so_list);
+ so->so_qlen--;
+ sp->so_qstate &= ~SQ_COMP;
+ sp->so_head = NULL;
+ ACCEPT_UNLOCK();
+ soabort(sp);
+ ACCEPT_LOCK();
+ }
+ KASSERT((TAILQ_EMPTY(&so->so_comp)),
+ ("%s: so_comp populated", __func__));
+ KASSERT((TAILQ_EMPTY(&so->so_incomp)),
+ ("%s: so_incomp populated", __func__));
+ }
+ SOCK_LOCK(so);
+ KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
+ so->so_state |= SS_NOFDREF;
+ sorele(so); /* NB: Returns with ACCEPT_UNLOCK(). */
+ CURVNET_RESTORE();
+ return (error);
+}
+
+/*
+ * soabort() is used to abruptly tear down a connection, such as when a
+ * resource limit is reached (listen queue depth exceeded), or if a listen
+ * socket is closed while there are sockets waiting to be accepted.
+ *
+ * This interface is tricky, because it is called on an unreferenced socket,
+ * and must be called only by a thread that has actually removed the socket
+ * from the listen queue it was on, or races with other threads are risked.
+ *
+ * This interface will call into the protocol code, so must not be called
+ * with any socket locks held. Protocols do call it while holding their own
+ * recursible protocol mutexes, but this is something that should be subject
+ * to review in the future.
+ */
+void
+soabort(struct socket *so)
+{
+
+ /*
+ * In as much as is possible, assert that no references to this
+ * socket are held. This is not quite the same as asserting that the
+ * current thread is responsible for arranging for no references, but
+ * is as close as we can get for now.
+ */
+ KASSERT(so->so_count == 0, ("soabort: so_count"));
+ KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
+ KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
+ KASSERT((so->so_qstate & SQ_COMP) == 0, ("soabort: SQ_COMP"));
+ KASSERT((so->so_qstate & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
+ VNET_SO_ASSERT(so);
+
+ if (so->so_proto->pr_usrreqs->pru_abort != NULL)
+ (*so->so_proto->pr_usrreqs->pru_abort)(so);
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ sofree(so);
+}
+
+int
+soaccept(struct socket *so, struct sockaddr **nam)
+{
+ int error;
+
+ SOCK_LOCK(so);
+ KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
+ so->so_state &= ~SS_NOFDREF;
+ SOCK_UNLOCK(so);
+
+ CURVNET_SET(so->so_vnet);
+ error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
+ CURVNET_RESTORE();
+ return (error);
+}
+
+int
+soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+ return (soconnectat(AT_FDCWD, so, nam, td));
+}
+
+int
+soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error;
+
+ if (so->so_options & SO_ACCEPTCONN)
+ return (EOPNOTSUPP);
+
+ CURVNET_SET(so->so_vnet);
+ /*
+ * If protocol is connection-based, can only connect once.
+ * Otherwise, if connected, try to disconnect first. This allows
+ * user to disconnect by connecting to, e.g., a null address.
+ */
+ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
+ ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
+ (error = sodisconnect(so)))) {
+ error = EISCONN;
+ } else {
+ /*
+ * Prevent accumulated error from previous connection from
+ * biting us.
+ */
+ so->so_error = 0;
+ if (fd == AT_FDCWD) {
+ error = (*so->so_proto->pr_usrreqs->pru_connect)(so,
+ nam, td);
+ } else {
+ error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd,
+ so, nam, td);
+ }
+ }
+ CURVNET_RESTORE();
+
+ return (error);
+}
+
+int
+soconnect2(struct socket *so1, struct socket *so2)
+{
+ int error;
+
+ CURVNET_SET(so1->so_vnet);
+ error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
+ CURVNET_RESTORE();
+ return (error);
+}
+
+int
+sodisconnect(struct socket *so)
+{
+ int error;
+
+ if ((so->so_state & SS_ISCONNECTED) == 0)
+ return (ENOTCONN);
+ if (so->so_state & SS_ISDISCONNECTING)
+ return (EALREADY);
+ VNET_SO_ASSERT(so);
+ error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
+ return (error);
+}
+
+#ifdef SOCKET_SEND_COW
+struct so_zerocopy_stats {
+ int size_ok;
+ int align_ok;
+ int found_ifp;
+};
+struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
+
+/*
+ * sosend_copyin() is only used if zero copy sockets are enabled. Otherwise
+ * sosend_dgram() and sosend_generic() use m_uiotombuf().
+ *
+ * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
+ * all of the data referenced by the uio. If desired, it uses zero-copy.
+ * *space will be updated to reflect data copied in.
+ *
+ * NB: If atomic I/O is requested, the caller must already have checked that
+ * space can hold resid bytes.
+ *
+ * NB: In the event of an error, the caller may need to free the partial
+ * chain pointed to by *mpp. The contents of both *uio and *space may be
+ * modified even in the case of an error.
+ */
+static int
+sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
+ int flags)
+{
+ struct mbuf *m, **mp, *top;
+ long len;
+ ssize_t resid;
+ int error;
+ int cow_send;
+
+ *retmp = top = NULL;
+ mp = &top;
+ len = 0;
+ resid = uio->uio_resid;
+ error = 0;
+ do {
+ cow_send = 0;
+ if (resid >= MINCLSIZE) {
+ if (top == NULL) {
+ m = m_gethdr(M_WAITOK, MT_DATA);
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = NULL;
+ } else
+ m = m_get(M_WAITOK, MT_DATA);
+ if (so_zero_copy_send &&
+ resid >= PAGE_SIZE &&
+ *space >= PAGE_SIZE &&
+ uio->uio_iov->iov_len >= PAGE_SIZE) {
+ so_zerocp_stats.size_ok++;
+ so_zerocp_stats.align_ok++;
+ cow_send = socow_setup(m, uio);
+ len = cow_send;
+ }
+ if (!cow_send) {
+ m_clget(m, M_WAITOK);
+ len = min(min(MCLBYTES, resid), *space);
+ }
+ } else {
+ if (top == NULL) {
+ m = m_gethdr(M_WAITOK, MT_DATA);
+ m->m_pkthdr.len = 0;
+ m->m_pkthdr.rcvif = NULL;
+
+ len = min(min(MHLEN, resid), *space);
+ /*
+ * For datagram protocols, leave room
+ * for protocol headers in first mbuf.
+ */
+ if (atomic && m && len < MHLEN)
+ MH_ALIGN(m, len);
+ } else {
+ m = m_get(M_WAITOK, MT_DATA);
+ len = min(min(MLEN, resid), *space);
+ }
+ }
+ if (m == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+
+ *space -= len;
+ if (cow_send)
+ error = 0;
+ else
+ error = uiomove(mtod(m, void *), (int)len, uio);
+ resid = uio->uio_resid;
+ m->m_len = len;
+ *mp = m;
+ top->m_pkthdr.len += len;
+ if (error)
+ goto out;
+ mp = &m->m_next;
+ if (resid <= 0) {
+ if (flags & MSG_EOR)
+ top->m_flags |= M_EOR;
+ break;
+ }
+ } while (*space > 0 && atomic);
+out:
+ *retmp = top;
+ return (error);
+}
+#endif /* SOCKET_SEND_COW */
+
+#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
+
+int
+sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+ long space;
+ ssize_t resid;
+ int clen = 0, error, dontroute;
+#ifdef SOCKET_SEND_COW
+ int atomic = sosendallatonce(so) || top;
+#endif
+
+ KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
+ KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
+ ("sosend_dgram: !PR_ATOMIC"));
+
+ if (uio != NULL)
+ resid = uio->uio_resid;
+ else
+ resid = top->m_pkthdr.len;
+ /*
+ * In theory resid should be unsigned. However, space must be
+ * signed, as it might be less than 0 if we over-committed, and we
+ * must use a signed comparison of space and resid. On the other
+ * hand, a negative resid causes us to loop sending 0-length
+ * segments to the protocol.
+ */
+ if (resid < 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ dontroute =
+ (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
+ if (td != NULL)
+ td->td_ru.ru_msgsnd++;
+ if (control != NULL)
+ clen = control->m_len;
+
+ SOCKBUF_LOCK(&so->so_snd);
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EPIPE;
+ goto out;
+ }
+ if (so->so_error) {
+ error = so->so_error;
+ so->so_error = 0;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto out;
+ }
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ /*
+ * `sendto' and `sendmsg' are allowed on a connection-based
+ * socket if it supports implied connect. Return ENOTCONN if
+ * not connected and no address is supplied.
+ */
+ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
+ if ((so->so_state & SS_ISCONFIRMING) == 0 &&
+ !(resid == 0 && clen != 0)) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = ENOTCONN;
+ goto out;
+ }
+ } else if (addr == NULL) {
+ if (so->so_proto->pr_flags & PR_CONNREQUIRED)
+ error = ENOTCONN;
+ else
+ error = EDESTADDRREQ;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto out;
+ }
+ }
+
+ /*
+ * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a
+ * problem and need fixing.
+ */
+ space = sbspace(&so->so_snd);
+ if (flags & MSG_OOB)
+ space += 1024;
+ space -= clen;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (resid > space) {
+ error = EMSGSIZE;
+ goto out;
+ }
+ if (uio == NULL) {
+ resid = 0;
+ if (flags & MSG_EOR)
+ top->m_flags |= M_EOR;
+ } else {
+#ifdef SOCKET_SEND_COW
+ error = sosend_copyin(uio, &top, atomic, &space, flags);
+ if (error)
+ goto out;
+#else
+ /*
+ * Copy the data from userland into an mbuf chain.
+ * If no data is to be copied in, a single empty mbuf
+ * is returned.
+ */
+ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
+ (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
+ if (top == NULL) {
+ error = EFAULT; /* only possible error */
+ goto out;
+ }
+ space -= resid - uio->uio_resid;
+#endif /* SOCKET_SEND_COW */
+ resid = uio->uio_resid;
+ }
+ KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
+ /*
+ * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
+ * than with.
+ */
+ if (dontroute) {
+ SOCK_LOCK(so);
+ so->so_options |= SO_DONTROUTE;
+ SOCK_UNLOCK(so);
+ }
+ /*
+ * XXX all the SBS_CANTSENDMORE checks previously done could be out
+ * of date. We could have received a reset packet in an interrupt or
+ * maybe we slept while doing page faults in uiomove() etc. We could
+ * probably recheck again inside the locking protection here, but
+ * there are probably other places that this also happens. We must
+ * rethink this.
+ */
+ VNET_SO_ASSERT(so);
+ error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+ (flags & MSG_OOB) ? PRUS_OOB :
+ /*
+ * If the user set MSG_EOF, the protocol understands this flag, and
+ * there is nothing left to send, then use PRU_SEND_EOF instead of
+ * PRU_SEND.
+ */
+ ((flags & MSG_EOF) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+ (resid <= 0)) ?
+ PRUS_EOF :
+ /* If there is more to send set PRUS_MORETOCOME */
+ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+ top, addr, control, td);
+ if (dontroute) {
+ SOCK_LOCK(so);
+ so->so_options &= ~SO_DONTROUTE;
+ SOCK_UNLOCK(so);
+ }
+ clen = 0;
+ control = NULL;
+ top = NULL;
+out:
+ if (top != NULL)
+ m_freem(top);
+ if (control != NULL)
+ m_freem(control);
+ return (error);
+}
+
+/*
+ * Send on a socket. If send must go all at once and message is larger than
+ * send buffering, then hard error. Lock against other senders. If must go
+ * all at once and not enough room now, then inform user that this would
+ * block and do nothing. Otherwise, if nonblocking, send as much as
+ * possible. The data to be sent is described by "uio" if nonzero, otherwise
+ * by the mbuf chain "top" (which must be null if uio is not). Data provided
+ * in mbuf chain must be small enough to send all at once.
+ *
+ * Returns nonzero on error, timeout or signal; callers must check for short
+ * counts if EINTR/ERESTART are returned. Data and control buffers are freed
+ * on return.
+ */
+int
+sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+ long space;
+ ssize_t resid;
+ int clen = 0, error, dontroute;
+ int atomic = sosendallatonce(so) || top;
+
+ if (uio != NULL)
+ resid = uio->uio_resid;
+ else
+ resid = top->m_pkthdr.len;
+ /*
+ * In theory resid should be unsigned. However, space must be
+ * signed, as it might be less than 0 if we over-committed, and we
+ * must use a signed comparison of space and resid. On the other
+ * hand, a negative resid causes us to loop sending 0-length
+ * segments to the protocol.
+ *
+ * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
+ * type sockets since that's an error.
+ */
+ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
+ error = EINVAL;
+ goto out;
+ }
+
+ dontroute =
+ (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
+ (so->so_proto->pr_flags & PR_ATOMIC);
+ if (td != NULL)
+ td->td_ru.ru_msgsnd++;
+ if (control != NULL)
+ clen = control->m_len;
+
+ error = sblock(&so->so_snd, SBLOCKWAIT(flags));
+ if (error)
+ goto out;
+
+restart:
+ do {
+ SOCKBUF_LOCK(&so->so_snd);
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EPIPE;
+ goto release;
+ }
+ if (so->so_error) {
+ error = so->so_error;
+ so->so_error = 0;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto release;
+ }
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ /*
+ * `sendto' and `sendmsg' are allowed on a connection-
+ * based socket if it supports implied connect.
+ * Return ENOTCONN if not connected and no address is
+ * supplied.
+ */
+ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
+ if ((so->so_state & SS_ISCONFIRMING) == 0 &&
+ !(resid == 0 && clen != 0)) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = ENOTCONN;
+ goto release;
+ }
+ } else if (addr == NULL) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (so->so_proto->pr_flags & PR_CONNREQUIRED)
+ error = ENOTCONN;
+ else
+ error = EDESTADDRREQ;
+ goto release;
+ }
+ }
+ space = sbspace(&so->so_snd);
+ if (flags & MSG_OOB)
+ space += 1024;
+ if ((atomic && resid > so->so_snd.sb_hiwat) ||
+ clen > so->so_snd.sb_hiwat) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EMSGSIZE;
+ goto release;
+ }
+ if (space < resid + clen &&
+ (atomic || space < so->so_snd.sb_lowat || space < clen)) {
+ if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EWOULDBLOCK;
+ goto release;
+ }
+ error = sbwait(&so->so_snd);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (error)
+ goto release;
+ goto restart;
+ }
+ SOCKBUF_UNLOCK(&so->so_snd);
+ space -= clen;
+ do {
+ if (uio == NULL) {
+ resid = 0;
+ if (flags & MSG_EOR)
+ top->m_flags |= M_EOR;
+ } else {
+#ifdef SOCKET_SEND_COW
+ error = sosend_copyin(uio, &top, atomic,
+ &space, flags);
+ if (error != 0)
+ goto release;
+#else
+ /*
+ * Copy the data from userland into an mbuf
+ * chain. If no data is to be copied in,
+ * a single empty mbuf is returned.
+ */
+ top = m_uiotombuf(uio, M_WAITOK, space,
+ (atomic ? max_hdr : 0),
+ (atomic ? M_PKTHDR : 0) |
+ ((flags & MSG_EOR) ? M_EOR : 0));
+ if (top == NULL) {
+ error = EFAULT; /* only possible error */
+ goto release;
+ }
+ space -= resid - uio->uio_resid;
+#endif /* SOCKET_SEND_COW */
+ resid = uio->uio_resid;
+ }
+ if (dontroute) {
+ SOCK_LOCK(so);
+ so->so_options |= SO_DONTROUTE;
+ SOCK_UNLOCK(so);
+ }
+ /*
+ * XXX all the SBS_CANTSENDMORE checks previously
+ * done could be out of date. We could have received
+ * a reset packet in an interrupt or maybe we slept
+ * while doing page faults in uiomove() etc. We
+ * could probably recheck again inside the locking
+ * protection here, but there are probably other
+ * places that this also happens. We must rethink
+ * this.
+ */
+ VNET_SO_ASSERT(so);
+ error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+ (flags & MSG_OOB) ? PRUS_OOB :
+ /*
+ * If the user set MSG_EOF, the protocol understands
+ * this flag, and there is nothing left to send, then
+ * use PRU_SEND_EOF instead of PRU_SEND.
+ */
+ ((flags & MSG_EOF) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+ (resid <= 0)) ?
+ PRUS_EOF :
+ /* If there is more to send set PRUS_MORETOCOME. */
+ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+ top, addr, control, td);
+ if (dontroute) {
+ SOCK_LOCK(so);
+ so->so_options &= ~SO_DONTROUTE;
+ SOCK_UNLOCK(so);
+ }
+ clen = 0;
+ control = NULL;
+ top = NULL;
+ if (error)
+ goto release;
+ } while (resid && space > 0);
+ } while (resid);
+
+release:
+ sbunlock(&so->so_snd);
+out:
+ if (top != NULL)
+ m_freem(top);
+ if (control != NULL)
+ m_freem(control);
+ return (error);
+}
+
+int
+sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+ int error;
+
+ CURVNET_SET(so->so_vnet);
+ error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
+ control, flags, td);
+ CURVNET_RESTORE();
+ return (error);
+}
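+
+/*
+ * Example (illustrative only): a kernel caller that already holds a fully
+ * formed mbuf chain 'm' can pass it as 'top' with a NULL uio:
+ *
+ *	error = sosend(so, NULL, NULL, m, NULL, 0, curthread);
+ *
+ * As documented above, data and control mbufs are consumed whether or not
+ * an error is returned.
+ */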
+
+/*
+ * The part of soreceive() that implements reading non-inline out-of-band
+ * data from a socket. For more complete comments, see soreceive(), from
+ * which this code originated.
+ *
+ * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
+ * unable to return an mbuf chain to the caller.
+ */
+static int
+soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
+{
+ struct protosw *pr = so->so_proto;
+ struct mbuf *m;
+ int error;
+
+ KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
+ VNET_SO_ASSERT(so);
+
+ m = m_get(M_WAITOK, MT_DATA);
+ error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
+ if (error)
+ goto bad;
+ do {
+#ifdef SOCKET_RECV_PFLIP
+ if (so_zero_copy_receive) {
+ int disposable;
+
+ if ((m->m_flags & M_EXT)
+ && (m->m_ext.ext_type == EXT_DISPOSABLE))
+ disposable = 1;
+ else
+ disposable = 0;
+
+ error = uiomoveco(mtod(m, void *),
+ min(uio->uio_resid, m->m_len), uio, disposable);
+ } else
+#endif /* SOCKET_RECV_PFLIP */
+ error = uiomove(mtod(m, void *),
+ (int) min(uio->uio_resid, m->m_len), uio);
+ m = m_free(m);
+ } while (uio->uio_resid && error == 0 && m);
+bad:
+ if (m != NULL)
+ m_freem(m);
+ return (error);
+}
+
+/*
+ * Following replacement or removal of the first mbuf on the first mbuf chain
+ * of a socket buffer, push necessary state changes back into the socket
+ * buffer so that other consumers see the values consistently. 'nextrecord'
+ * is the callers locally stored value of the original value of
+ * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
+ * NOTE: 'nextrecord' may be NULL.
+ */
+static __inline void
+sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
+{
+
+ SOCKBUF_LOCK_ASSERT(sb);
+ /*
+ * First, update for the new value of nextrecord. If necessary, make
+ * it the first record.
+ */
+ if (sb->sb_mb != NULL)
+ sb->sb_mb->m_nextpkt = nextrecord;
+ else
+ sb->sb_mb = nextrecord;
+
+ /*
+ * Now update any dependent socket buffer fields to reflect the new
+ * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
+ * addition of a second clause that takes care of the case where
+ * sb_mb has been updated, but remains the last record.
+ */
+ if (sb->sb_mb == NULL) {
+ sb->sb_mbtail = NULL;
+ sb->sb_lastrecord = NULL;
+ } else if (sb->sb_mb->m_nextpkt == NULL)
+ sb->sb_lastrecord = sb->sb_mb;
+}
+
+/*
+ * Implement receive operations on a socket. We depend on the way that
+ * records are added to the sockbuf by sbappend. In particular, each record
+ * (mbufs linked through m_next) must begin with an address if the protocol
+ * so specifies, followed by an optional mbuf or mbufs containing ancillary
+ * data, and then zero or more mbufs of data. In order to allow parallelism
+ * between network receive and copying to user space, as well as avoid
+ * sleeping with a mutex held, we release the socket buffer mutex during the
+ * user space copy. Although the sockbuf remains locked against other readers,
+ * new data may still be appended while the mutex is dropped, and thus we must
+ * maintain consistency of the sockbuf during that time.
+ *
+ * The caller may receive the data as a single mbuf chain by supplying an
+ * mbuf **mp0 for use in returning the chain. The uio is then used only for
+ * the count in uio_resid.
+ */
+int
+soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+ struct mbuf *m, **mp;
+ int flags, error, offset;
+ ssize_t len;
+ struct protosw *pr = so->so_proto;
+ struct mbuf *nextrecord;
+ int moff, type = 0;
+ ssize_t orig_resid = uio->uio_resid;
+
+ mp = mp0;
+ if (psa != NULL)
+ *psa = NULL;
+ if (controlp != NULL)
+ *controlp = NULL;
+ if (flagsp != NULL)
+ flags = *flagsp &~ MSG_EOR;
+ else
+ flags = 0;
+ if (flags & MSG_OOB)
+ return (soreceive_rcvoob(so, uio, flags));
+ if (mp != NULL)
+ *mp = NULL;
+ if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
+ && uio->uio_resid) {
+ VNET_SO_ASSERT(so);
+ (*pr->pr_usrreqs->pru_rcvd)(so, 0);
+ }
+
+ error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
+ if (error)
+ return (error);
+
+restart:
+ SOCKBUF_LOCK(&so->so_rcv);
+ m = so->so_rcv.sb_mb;
+ /*
+ * If we have less data than requested, block awaiting more (subject
+ * to any timeout) if:
+ * 1. the current count is less than the low water mark, or
+ * 2. MSG_DONTWAIT is not set
+ */
+ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
+ so->so_rcv.sb_cc < uio->uio_resid) &&
+ so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
+ m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
+ KASSERT(m != NULL || !so->so_rcv.sb_cc,
+ ("receive: m == %p so->so_rcv.sb_cc == %u",
+ m, so->so_rcv.sb_cc));
+ if (so->so_error) {
+ if (m != NULL)
+ goto dontblock;
+ error = so->so_error;
+ if ((flags & MSG_PEEK) == 0)
+ so->so_error = 0;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ goto release;
+ }
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ if (m == NULL) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ goto release;
+ } else
+ goto dontblock;
+ }
+ for (; m != NULL; m = m->m_next)
+ if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
+ m = so->so_rcv.sb_mb;
+ goto dontblock;
+ }
+ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
+ (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ error = ENOTCONN;
+ goto release;
+ }
+ if (uio->uio_resid == 0) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ goto release;
+ }
+ if ((so->so_state & SS_NBIO) ||
+ (flags & (MSG_DONTWAIT|MSG_NBIO))) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ error = EWOULDBLOCK;
+ goto release;
+ }
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ error = sbwait(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ if (error)
+ goto release;
+ goto restart;
+ }
+dontblock:
+ /*
+ * From this point onward, we maintain 'nextrecord' as a cache of the
+ * pointer to the next record in the socket buffer. We must keep the
+ * various socket buffer pointers and local stack versions of the
+ * pointers in sync, pushing out modifications before dropping the
+ * socket buffer mutex, and re-reading them when picking it up.
+ *
+ * Otherwise, we will race with the network stack appending new data
+ * or records onto the socket buffer by using inconsistent/stale
+ * versions of the field, possibly resulting in socket buffer
+ * corruption.
+ *
+ * By holding the high-level sblock(), we prevent simultaneous
+ * readers from pulling off the front of the socket buffer.
+ */
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (uio->uio_td)
+ uio->uio_td->td_ru.ru_msgrcv++;
+ KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ nextrecord = m->m_nextpkt;
+ if (pr->pr_flags & PR_ADDR) {
+ KASSERT(m->m_type == MT_SONAME,
+ ("m->m_type == %d", m->m_type));
+ orig_resid = 0;
+ if (psa != NULL)
+ *psa = sodupsockaddr(mtod(m, struct sockaddr *),
+ M_NOWAIT);
+ if (flags & MSG_PEEK) {
+ m = m->m_next;
+ } else {
+ sbfree(&so->so_rcv, m);
+ so->so_rcv.sb_mb = m_free(m);
+ m = so->so_rcv.sb_mb;
+ sockbuf_pushsync(&so->so_rcv, nextrecord);
+ }
+ }
+
+ /*
+ * Process one or more MT_CONTROL mbufs present before any data mbufs
+ * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
+ * just copy the data; if !MSG_PEEK, we call into the protocol to
+ * perform externalization (or freeing if controlp == NULL).
+ */
+ if (m != NULL && m->m_type == MT_CONTROL) {
+ struct mbuf *cm = NULL, *cmn;
+ struct mbuf **cme = &cm;
+
+ do {
+ if (flags & MSG_PEEK) {
+ if (controlp != NULL) {
+ *controlp = m_copy(m, 0, m->m_len);
+ controlp = &(*controlp)->m_next;
+ }
+ m = m->m_next;
+ } else {
+ sbfree(&so->so_rcv, m);
+ so->so_rcv.sb_mb = m->m_next;
+ m->m_next = NULL;
+ *cme = m;
+ cme = &(*cme)->m_next;
+ m = so->so_rcv.sb_mb;
+ }
+ } while (m != NULL && m->m_type == MT_CONTROL);
+ if ((flags & MSG_PEEK) == 0)
+ sockbuf_pushsync(&so->so_rcv, nextrecord);
+ while (cm != NULL) {
+ cmn = cm->m_next;
+ cm->m_next = NULL;
+ if (pr->pr_domain->dom_externalize != NULL) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ VNET_SO_ASSERT(so);
+ error = (*pr->pr_domain->dom_externalize)
+ (cm, controlp, flags);
+ SOCKBUF_LOCK(&so->so_rcv);
+ } else if (controlp != NULL)
+ *controlp = cm;
+ else
+ m_freem(cm);
+ if (controlp != NULL) {
+ orig_resid = 0;
+ while (*controlp != NULL)
+ controlp = &(*controlp)->m_next;
+ }
+ cm = cmn;
+ }
+ if (m != NULL)
+ nextrecord = so->so_rcv.sb_mb->m_nextpkt;
+ else
+ nextrecord = so->so_rcv.sb_mb;
+ orig_resid = 0;
+ }
+ if (m != NULL) {
+ if ((flags & MSG_PEEK) == 0) {
+ KASSERT(m->m_nextpkt == nextrecord,
+ ("soreceive: post-control, nextrecord !sync"));
+ if (nextrecord == NULL) {
+ KASSERT(so->so_rcv.sb_mb == m,
+ ("soreceive: post-control, sb_mb!=m"));
+ KASSERT(so->so_rcv.sb_lastrecord == m,
+ ("soreceive: post-control, lastrecord!=m"));
+ }
+ }
+ type = m->m_type;
+ if (type == MT_OOBDATA)
+ flags |= MSG_OOB;
+ } else {
+ if ((flags & MSG_PEEK) == 0) {
+ KASSERT(so->so_rcv.sb_mb == nextrecord,
+ ("soreceive: sb_mb != nextrecord"));
+ if (so->so_rcv.sb_mb == NULL) {
+ KASSERT(so->so_rcv.sb_lastrecord == NULL,
+ ("soreceive: sb_lastrecord != NULL"));
+ }
+ }
+ }
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+
+ /*
+ * Now continue to read any data mbufs off of the head of the socket
+ * buffer until the read request is satisfied. Note that 'type' is
+ * used to store the type of any mbuf reads that have happened so far
+ * such that soreceive() can stop reading if the type changes, which
+ * causes soreceive() to return only one of regular data and inline
+ * out-of-band data in a single socket receive operation.
+ */
+ moff = 0;
+ offset = 0;
+ while (m != NULL && uio->uio_resid > 0 && error == 0) {
+ /*
+ * If the type of mbuf has changed since the last mbuf
+ * examined ('type'), end the receive operation.
+ */
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
+ if (type != m->m_type)
+ break;
+ } else if (type == MT_OOBDATA)
+ break;
+ else
+ KASSERT(m->m_type == MT_DATA,
+ ("m->m_type == %d", m->m_type));
+ so->so_rcv.sb_state &= ~SBS_RCVATMARK;
+ len = uio->uio_resid;
+ if (so->so_oobmark && len > so->so_oobmark - offset)
+ len = so->so_oobmark - offset;
+ if (len > m->m_len - moff)
+ len = m->m_len - moff;
+ /*
+ * If mp is set, just pass back the mbufs. Otherwise copy
+ * them out via the uio, then free. Sockbuf must be
+ * consistent here (points to current mbuf, it points to next
+ * record) when we drop priority; we must note any additions
+ * to the sockbuf when we block interrupts again.
+ */
+ if (mp == NULL) {
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+#ifdef SOCKET_RECV_PFLIP
+ if (so_zero_copy_receive) {
+ int disposable;
+
+ if ((m->m_flags & M_EXT)
+ && (m->m_ext.ext_type == EXT_DISPOSABLE))
+ disposable = 1;
+ else
+ disposable = 0;
+
+ error = uiomoveco(mtod(m, char *) + moff,
+ (int)len, uio, disposable);
+ } else
+#endif /* SOCKET_RECV_PFLIP */
+ error = uiomove(mtod(m, char *) + moff, (int)len, uio);
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (error) {
+ /*
+ * The MT_SONAME mbuf has already been removed
+ * from the record, so it is necessary to
+ * remove the data mbufs, if any, to preserve
+ * the invariant in the case of PR_ADDR that
+ * requires MT_SONAME mbufs at the head of
+ * each record.
+ */
+ if (m && pr->pr_flags & PR_ATOMIC &&
+ ((flags & MSG_PEEK) == 0))
+ (void)sbdroprecord_locked(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ goto release;
+ }
+ } else
+ uio->uio_resid -= len;
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (len == m->m_len - moff) {
+ if (m->m_flags & M_EOR)
+ flags |= MSG_EOR;
+ if (flags & MSG_PEEK) {
+ m = m->m_next;
+ moff = 0;
+ } else {
+ nextrecord = m->m_nextpkt;
+ sbfree(&so->so_rcv, m);
+ if (mp != NULL) {
+ m->m_nextpkt = NULL;
+ *mp = m;
+ mp = &m->m_next;
+ so->so_rcv.sb_mb = m = m->m_next;
+ *mp = NULL;
+ } else {
+ so->so_rcv.sb_mb = m_free(m);
+ m = so->so_rcv.sb_mb;
+ }
+ sockbuf_pushsync(&so->so_rcv, nextrecord);
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ }
+ } else {
+ if (flags & MSG_PEEK)
+ moff += len;
+ else {
+ if (mp != NULL) {
+ int copy_flag;
+
+ if (flags & MSG_DONTWAIT)
+ copy_flag = M_NOWAIT;
+ else
+ copy_flag = M_WAIT;
+ if (copy_flag == M_WAITOK)
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ *mp = m_copym(m, 0, len, copy_flag);
+ if (copy_flag == M_WAITOK)
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (*mp == NULL) {
+ /*
+ * m_copym() couldn't
+ * allocate an mbuf. Adjust
+ * uio_resid back (it was
+ * adjusted down by len
+ * bytes, which we didn't end
+ * up "copying" over).
+ */
+ uio->uio_resid += len;
+ break;
+ }
+ }
+ m->m_data += len;
+ m->m_len -= len;
+ so->so_rcv.sb_cc -= len;
+ }
+ }
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (so->so_oobmark) {
+ if ((flags & MSG_PEEK) == 0) {
+ so->so_oobmark -= len;
+ if (so->so_oobmark == 0) {
+ so->so_rcv.sb_state |= SBS_RCVATMARK;
+ break;
+ }
+ } else {
+ offset += len;
+ if (offset == so->so_oobmark)
+ break;
+ }
+ }
+ if (flags & MSG_EOR)
+ break;
+ /*
+ * If the MSG_WAITALL flag is set (for non-atomic socket), we
+ * must not quit until "uio->uio_resid == 0" or an error
+ * termination. If a signal/timeout occurs, return with a
+ * short count but without error. Keep sockbuf locked
+ * against other readers.
+ */
+ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
+ !sosendallatonce(so) && nextrecord == NULL) {
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (so->so_error ||
+ so->so_rcv.sb_state & SBS_CANTRCVMORE)
+ break;
+ /*
+ * Notify the protocol that some data has been
+ * drained before blocking.
+ */
+ if (pr->pr_flags & PR_WANTRCVD) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ VNET_SO_ASSERT(so);
+ (*pr->pr_usrreqs->pru_rcvd)(so, flags);
+ SOCKBUF_LOCK(&so->so_rcv);
+ }
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ /*
+ * We could have received some data while we were notifying
+ * the protocol. Skip blocking in this case.
+ */
+ if (so->so_rcv.sb_mb == NULL) {
+ error = sbwait(&so->so_rcv);
+ if (error) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ goto release;
+ }
+ }
+ m = so->so_rcv.sb_mb;
+ if (m != NULL)
+ nextrecord = m->m_nextpkt;
+ }
+ }
+
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (m != NULL && pr->pr_flags & PR_ATOMIC) {
+ flags |= MSG_TRUNC;
+ if ((flags & MSG_PEEK) == 0)
+ (void) sbdroprecord_locked(&so->so_rcv);
+ }
+ if ((flags & MSG_PEEK) == 0) {
+ if (m == NULL) {
+ /*
+ * First part is an inline SB_EMPTY_FIXUP(). Second
+ * part makes sure sb_lastrecord is up-to-date if
+ * there is still data in the socket buffer.
+ */
+ so->so_rcv.sb_mb = nextrecord;
+ if (so->so_rcv.sb_mb == NULL) {
+ so->so_rcv.sb_mbtail = NULL;
+ so->so_rcv.sb_lastrecord = NULL;
+ } else if (nextrecord->m_nextpkt == NULL)
+ so->so_rcv.sb_lastrecord = nextrecord;
+ }
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ /*
+ * If soreceive() is being done from the socket callback,
+ * then don't need to generate ACK to peer to update window,
+ * since ACK will be generated on return to TCP.
+ */
+ if (!(flags & MSG_SOCALLBCK) &&
+ (pr->pr_flags & PR_WANTRCVD)) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ VNET_SO_ASSERT(so);
+ (*pr->pr_usrreqs->pru_rcvd)(so, flags);
+ SOCKBUF_LOCK(&so->so_rcv);
+ }
+ }
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ if (orig_resid == uio->uio_resid && orig_resid &&
+ (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ goto restart;
+ }
+ SOCKBUF_UNLOCK(&so->so_rcv);
+
+ if (flagsp != NULL)
+ *flagsp |= flags;
+release:
+ sbunlock(&so->so_rcv);
+ return (error);
+}
+
+/*
+ * Optimized version of soreceive() for stream (TCP) sockets.
+ * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled.
+ */
+int
+soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+ int len = 0, error = 0, flags, oresid;
+ struct sockbuf *sb;
+ struct mbuf *m, *n = NULL;
+
+ /* We only do stream sockets. */
+ if (so->so_type != SOCK_STREAM)
+ return (EINVAL);
+ if (psa != NULL)
+ *psa = NULL;
+ if (controlp != NULL)
+ return (EINVAL);
+ if (flagsp != NULL)
+ flags = *flagsp &~ MSG_EOR;
+ else
+ flags = 0;
+ if (flags & MSG_OOB)
+ return (soreceive_rcvoob(so, uio, flags));
+ if (mp0 != NULL)
+ *mp0 = NULL;
+
+ sb = &so->so_rcv;
+
+ /* Prevent other readers from entering the socket. */
+ error = sblock(sb, SBLOCKWAIT(flags));
+ if (error)
+ goto out;
+ SOCKBUF_LOCK(sb);
+
+ /* Easy one, no space to copyout anything. */
+ if (uio->uio_resid == 0) {
+ error = EINVAL;
+ goto out;
+ }
+ oresid = uio->uio_resid;
+
+ /* We will never ever get anything unless we are or were connected. */
+ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
+ error = ENOTCONN;
+ goto out;
+ }
+
+restart:
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+
+ /* Abort if socket has reported problems. */
+ if (so->so_error) {
+ if (sb->sb_cc > 0)
+ goto deliver;
+ if (oresid > uio->uio_resid)
+ goto out;
+ error = so->so_error;
+ if (!(flags & MSG_PEEK))
+ so->so_error = 0;
+ goto out;
+ }
+
+ /* Door is closed. Deliver what is left, if any. */
+ if (sb->sb_state & SBS_CANTRCVMORE) {
+ if (sb->sb_cc > 0)
+ goto deliver;
+ else
+ goto out;
+ }
+
+ /* Socket buffer is empty and we shall not block. */
+ if (sb->sb_cc == 0 &&
+ ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
+ error = EAGAIN;
+ goto out;
+ }
+
+ /* Socket buffer got some data that we shall deliver now. */
+ if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
+ ((so->so_state & SS_NBIO) ||
+ (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
+ sb->sb_cc >= sb->sb_lowat ||
+ sb->sb_cc >= uio->uio_resid ||
+ sb->sb_cc >= sb->sb_hiwat)) {
+ goto deliver;
+ }
+
+ /* On MSG_WAITALL we must wait until all data or error arrives. */
+ if ((flags & MSG_WAITALL) &&
+ (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat))
+ goto deliver;
+
+ /*
+ * Wait and block until (more) data comes in.
+ * NB: Drops the sockbuf lock during wait.
+ */
+ error = sbwait(sb);
+ if (error)
+ goto out;
+ goto restart;
+
+deliver:
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+ KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
+ KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
+
+ /* Statistics. */
+ if (uio->uio_td)
+ uio->uio_td->td_ru.ru_msgrcv++;
+
+ /* Fill uio until full or current end of socket buffer is reached. */
+ len = min(uio->uio_resid, sb->sb_cc);
+ if (mp0 != NULL) {
+ /* Dequeue as many mbufs as possible. */
+ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
+ if (*mp0 == NULL)
+ *mp0 = sb->sb_mb;
+ else
+ m_cat(*mp0, sb->sb_mb);
+ for (m = sb->sb_mb;
+ m != NULL && m->m_len <= len;
+ m = m->m_next) {
+ len -= m->m_len;
+ uio->uio_resid -= m->m_len;
+ sbfree(sb, m);
+ n = m;
+ }
+ n->m_next = NULL;
+ sb->sb_mb = m;
+ sb->sb_lastrecord = sb->sb_mb;
+ if (sb->sb_mb == NULL)
+ SB_EMPTY_FIXUP(sb);
+ }
+ /* Copy the remainder. */
+ if (len > 0) {
+ KASSERT(sb->sb_mb != NULL,
+ ("%s: len > 0 && sb->sb_mb empty", __func__));
+
+ m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
+ if (m == NULL)
+ len = 0; /* Don't flush data from sockbuf. */
+ else
+ uio->uio_resid -= len;
+ if (*mp0 != NULL)
+ m_cat(*mp0, m);
+ else
+ *mp0 = m;
+ if (*mp0 == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+ }
+ } else {
+ /* NB: Must unlock socket buffer as uiomove may sleep. */
+ SOCKBUF_UNLOCK(sb);
+ error = m_mbuftouio(uio, sb->sb_mb, len);
+ SOCKBUF_LOCK(sb);
+ if (error)
+ goto out;
+ }
+ SBLASTRECORDCHK(sb);
+ SBLASTMBUFCHK(sb);
+
+ /*
+ * Remove the delivered data from the socket buffer unless we
+ * were only peeking.
+ */
+ if (!(flags & MSG_PEEK)) {
+ if (len > 0)
+ sbdrop_locked(sb, len);
+
+ /* Notify protocol that we drained some data. */
+ if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
+ (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
+ !(flags & MSG_SOCALLBCK))) {
+ SOCKBUF_UNLOCK(sb);
+ VNET_SO_ASSERT(so);
+ (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
+ SOCKBUF_LOCK(sb);
+ }
+ }
+
+ /*
+ * For MSG_WAITALL we may have to loop again and wait for
+ * more data to come in.
+ */
+ if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
+ goto restart;
+out:
+ SOCKBUF_LOCK_ASSERT(sb);
+ SBLASTRECORDCHK(sb);
+ SBLASTMBUFCHK(sb);
+ SOCKBUF_UNLOCK(sb);
+ sbunlock(sb);
+ return (error);
+}
+
+/*
+ * Optimized version of soreceive() for simple datagram cases from userspace.
+ * Unlike in the stream case, we're able to drop a datagram if copyout()
+ * fails, and because we handle datagrams atomically, we don't need to use a
+ * sleep lock to prevent I/O interlacing.
+ */
+int
+soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+ struct mbuf *m, *m2;
+ int flags, error;
+ ssize_t len;
+ struct protosw *pr = so->so_proto;
+ struct mbuf *nextrecord;
+
+ if (psa != NULL)
+ *psa = NULL;
+ if (controlp != NULL)
+ *controlp = NULL;
+ if (flagsp != NULL)
+ flags = *flagsp &~ MSG_EOR;
+ else
+ flags = 0;
+
+ /*
+ * For any complicated cases, fall back to the full
+ * soreceive_generic().
+ */
+ if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB))
+ return (soreceive_generic(so, psa, uio, mp0, controlp,
+ flagsp));
+
+ /*
+ * Enforce restrictions on use.
+ */
+ KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
+ ("soreceive_dgram: wantrcvd"));
+ KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
+ KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
+ ("soreceive_dgram: SBS_RCVATMARK"));
+ KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
+ ("soreceive_dgram: PR_CONNREQUIRED"));
+
+ /*
+ * Loop blocking while waiting for a datagram.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ while ((m = so->so_rcv.sb_mb) == NULL) {
+ KASSERT(so->so_rcv.sb_cc == 0,
+ ("soreceive_dgram: sb_mb NULL but sb_cc %u",
+ so->so_rcv.sb_cc));
+ if (so->so_error) {
+ error = so->so_error;
+ so->so_error = 0;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ return (error);
+ }
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
+ uio->uio_resid == 0) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ return (0);
+ }
+ if ((so->so_state & SS_NBIO) ||
+ (flags & (MSG_DONTWAIT|MSG_NBIO))) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ return (EWOULDBLOCK);
+ }
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ error = sbwait(&so->so_rcv);
+ if (error) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ return (error);
+ }
+ }
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+
+ if (uio->uio_td)
+ uio->uio_td->td_ru.ru_msgrcv++;
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ nextrecord = m->m_nextpkt;
+ if (nextrecord == NULL) {
+ KASSERT(so->so_rcv.sb_lastrecord == m,
+ ("soreceive_dgram: lastrecord != m"));
+ }
+
+ KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
+ ("soreceive_dgram: m_nextpkt != nextrecord"));
+
+ /*
+ * Pull 'm' and its chain off the front of the packet queue.
+ */
+ so->so_rcv.sb_mb = NULL;
+ sockbuf_pushsync(&so->so_rcv, nextrecord);
+
+ /*
+ * Walk 'm's chain and free that many bytes from the socket buffer.
+ */
+ for (m2 = m; m2 != NULL; m2 = m2->m_next)
+ sbfree(&so->so_rcv, m2);
+
+ /*
+ * Do a few last checks before we let go of the lock.
+ */
+ SBLASTRECORDCHK(&so->so_rcv);
+ SBLASTMBUFCHK(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+
+ if (pr->pr_flags & PR_ADDR) {
+ KASSERT(m->m_type == MT_SONAME,
+ ("m->m_type == %d", m->m_type));
+ if (psa != NULL)
+ *psa = sodupsockaddr(mtod(m, struct sockaddr *),
+ M_NOWAIT);
+ m = m_free(m);
+ }
+ if (m == NULL) {
+ /* XXXRW: Can this happen? */
+ return (0);
+ }
+
+ /*
+ * Packet to copyout() is now in 'm' and it is disconnected from the
+ * queue.
+ *
+ * Process one or more MT_CONTROL mbufs present before any data mbufs
+ * in the first mbuf chain on the socket buffer. We call into the
+ * protocol to perform externalization (or freeing if controlp ==
+ * NULL).
+ */
+ if (m->m_type == MT_CONTROL) {
+ struct mbuf *cm = NULL, *cmn;
+ struct mbuf **cme = &cm;
+
+ do {
+ m2 = m->m_next;
+ m->m_next = NULL;
+ *cme = m;
+ cme = &(*cme)->m_next;
+ m = m2;
+ } while (m != NULL && m->m_type == MT_CONTROL);
+ while (cm != NULL) {
+ cmn = cm->m_next;
+ cm->m_next = NULL;
+ if (pr->pr_domain->dom_externalize != NULL) {
+ error = (*pr->pr_domain->dom_externalize)
+ (cm, controlp, flags);
+ } else if (controlp != NULL)
+ *controlp = cm;
+ else
+ m_freem(cm);
+ if (controlp != NULL) {
+ while (*controlp != NULL)
+ controlp = &(*controlp)->m_next;
+ }
+ cm = cmn;
+ }
+ }
+ KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
+
+ while (m != NULL && uio->uio_resid > 0) {
+ len = uio->uio_resid;
+ if (len > m->m_len)
+ len = m->m_len;
+ error = uiomove(mtod(m, char *), (int)len, uio);
+ if (error) {
+ m_freem(m);
+ return (error);
+ }
+ if (len == m->m_len)
+ m = m_free(m);
+ else {
+ m->m_data += len;
+ m->m_len -= len;
+ }
+ }
+ if (m != NULL)
+ flags |= MSG_TRUNC;
+ m_freem(m);
+ if (flagsp != NULL)
+ *flagsp |= flags;
+ return (0);
+}
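+
+/*
+ * Illustrative sketch (the structure name and omitted fields are
+ * hypothetical): protocols select one of the soreceive()/sosend()
+ * implementations by wiring it into their pr_usrreqs, e.g.:
+ *
+ *	struct pr_usrreqs foo_usrreqs = {
+ *		...
+ *		.pru_sosend = sosend_dgram,
+ *		.pru_soreceive = soreceive_dgram,
+ *		...
+ *	};
+ */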
+
+int
+soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
+ struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+ int error;
+
+ CURVNET_SET(so->so_vnet);
+ error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
+ controlp, flagsp));
+ CURVNET_RESTORE();
+ return (error);
+}
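+
+/*
+ * Example (illustrative sketch; 'buf' and 'buflen' are an assumed
+ * caller-supplied kernel buffer): receiving into kernel memory is done by
+ * building a UIO_SYSSPACE uio:
+ *
+ *	struct uio auio;
+ *	struct iovec aiov;
+ *
+ *	aiov.iov_base = buf;
+ *	aiov.iov_len = buflen;
+ *	auio.uio_iov = &aiov;
+ *	auio.uio_iovcnt = 1;
+ *	auio.uio_offset = 0;
+ *	auio.uio_resid = buflen;
+ *	auio.uio_segflg = UIO_SYSSPACE;
+ *	auio.uio_rw = UIO_READ;
+ *	auio.uio_td = curthread;
+ *	error = soreceive(so, NULL, &auio, NULL, NULL, NULL);
+ */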
+
+int
+soshutdown(struct socket *so, int how)
+{
+ struct protosw *pr = so->so_proto;
+ int error;
+
+ if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
+ return (EINVAL);
+
+ CURVNET_SET(so->so_vnet);
+ if (pr->pr_usrreqs->pru_flush != NULL)
+ (*pr->pr_usrreqs->pru_flush)(so, how);
+ if (how != SHUT_WR)
+ sorflush(so);
+ if (how != SHUT_RD) {
+ error = (*pr->pr_usrreqs->pru_shutdown)(so);
+ wakeup(&so->so_timeo);
+ CURVNET_RESTORE();
+ return (error);
+ }
+ wakeup(&so->so_timeo);
+ CURVNET_RESTORE();
+ return (0);
+}
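+
+/*
+ * Example (illustrative only): a kernel consumer that is done transmitting
+ * but still expects to receive can half-close with:
+ *
+ *	(void)soshutdown(so, SHUT_WR);
+ */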
+
+void
+sorflush(struct socket *so)
+{
+ struct sockbuf *sb = &so->so_rcv;
+ struct protosw *pr = so->so_proto;
+ struct sockbuf asb;
+
+ VNET_SO_ASSERT(so);
+
+ /*
+ * In order to avoid calling dom_dispose with the socket buffer mutex
+ * held, and in order to generally avoid holding the lock for a long
+ * time, we make a copy of the socket buffer and clear the original
+ * (except locks, state). The new socket buffer copy won't have
+ * initialized locks so we can only call routines that won't use or
+ * assert those locks.
+ *
+ * Dislodge threads currently blocked in receive and wait to acquire
+ * a lock against other simultaneous readers before clearing the
+ * socket buffer. Don't let our acquire be interrupted by a signal
+ * despite any existing socket disposition on interruptible waiting.
+ */
+ socantrcvmore(so);
+ (void) sblock(sb, SBL_WAIT | SBL_NOINTR);
+
+ /*
+ * Invalidate/clear most of the sockbuf structure, but leave selinfo
+ * and mutex data unchanged.
+ */
+ SOCKBUF_LOCK(sb);
+ bzero(&asb, offsetof(struct sockbuf, sb_startzero));
+ bcopy(&sb->sb_startzero, &asb.sb_startzero,
+ sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
+ bzero(&sb->sb_startzero,
+ sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
+ SOCKBUF_UNLOCK(sb);
+ sbunlock(sb);
+
+ /*
+ * Dispose of special rights and flush the socket buffer. Don't call
+ * any unsafe routines (that rely on locks being initialized) on asb.
+ */
+ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
+ (*pr->pr_domain->dom_dispose)(asb.sb_mb);
+ sbrelease_internal(&asb, so);
+}
+
+/*
+ * Perhaps this routine, and sooptcopyout(), below, ought to come in an
+ * additional variant to handle the case where the option value needs to be
+ * some kind of integer, but not a specific size. In addition to their use
+ * here, these functions are also called by the protocol-level pr_ctloutput()
+ * routines.
+ */
+int
+sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
+{
+ size_t valsize;
+
+ /*
+ * If the user gives us more than we wanted, we ignore it, but if we
+ * don't get the minimum length the caller wants, we return EINVAL.
+ * On success, sopt->sopt_valsize is set to however much we actually
+ * retrieved.
+ */
+ if ((valsize = sopt->sopt_valsize) < minlen)
+ return EINVAL;
+ if (valsize > len)
+ sopt->sopt_valsize = valsize = len;
+
+ if (sopt->sopt_td != NULL)
+ return (copyin(sopt->sopt_val, buf, valsize));
+
+ bcopy(sopt->sopt_val, buf, valsize);
+ return (0);
+}
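+
+/*
+ * Example: a minimal sketch of how a protocol-level pr_ctloutput() routine
+ * might use sooptcopyin() on the SOPT_SET side for a fixed-size integer
+ * option; the validation shown is a placeholder and is protocol-specific:
+ *
+ *	int error, optval;
+ *
+ *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
+ *	if (error)
+ *		return (error);
+ *	if (optval < 0)
+ *		return (EINVAL);
+ *	(apply optval to protocol state)
+ */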
+
+/*
+ * Kernel version of setsockopt(2).
+ *
+ * XXX: optlen is size_t, not socklen_t
+ */
+int
+so_setsockopt(struct socket *so, int level, int optname, void *optval,
+ size_t optlen)
+{
+ struct sockopt sopt;
+
+ sopt.sopt_level = level;
+ sopt.sopt_name = optname;
+ sopt.sopt_dir = SOPT_SET;
+ sopt.sopt_val = optval;
+ sopt.sopt_valsize = optlen;
+ sopt.sopt_td = NULL;
+ return (sosetopt(so, &sopt));
+}
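+
+/*
+ * Example: a minimal sketch of an in-kernel caller; `so' is assumed to be a
+ * socket already created and owned by the caller, and `error' an int:
+ *
+ *	int one = 1;
+ *
+ *	error = so_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one));
+ */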
+
+int
+sosetopt(struct socket *so, struct sockopt *sopt)
+{
+ int error, optval;
+ struct linger l;
+ struct timeval tv;
+ sbintime_t val;
+ uint32_t val32;
+#ifdef MAC
+ struct mac extmac;
+#endif
+
+ CURVNET_SET(so->so_vnet);
+ error = 0;
+ if (sopt->sopt_level != SOL_SOCKET) {
+ if (so->so_proto->pr_ctloutput != NULL) {
+ error = (*so->so_proto->pr_ctloutput)(so, sopt);
+ CURVNET_RESTORE();
+ return (error);
+ }
+ error = ENOPROTOOPT;
+ } else {
+ switch (sopt->sopt_name) {
+#ifdef INET
+ case SO_ACCEPTFILTER:
+ error = do_setopt_accept_filter(so, sopt);
+ if (error)
+ goto bad;
+ break;
+#endif
+ case SO_LINGER:
+ error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
+ if (error)
+ goto bad;
+
+ SOCK_LOCK(so);
+ so->so_linger = l.l_linger;
+ if (l.l_onoff)
+ so->so_options |= SO_LINGER;
+ else
+ so->so_options &= ~SO_LINGER;
+ SOCK_UNLOCK(so);
+ break;
+
+ case SO_DEBUG:
+ case SO_KEEPALIVE:
+ case SO_DONTROUTE:
+ case SO_USELOOPBACK:
+ case SO_BROADCAST:
+ case SO_REUSEADDR:
+ case SO_REUSEPORT:
+ case SO_OOBINLINE:
+ case SO_TIMESTAMP:
+ case SO_BINTIME:
+ case SO_NOSIGPIPE:
+ case SO_NO_DDP:
+ case SO_NO_OFFLOAD:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ goto bad;
+ SOCK_LOCK(so);
+ if (optval)
+ so->so_options |= sopt->sopt_name;
+ else
+ so->so_options &= ~sopt->sopt_name;
+ SOCK_UNLOCK(so);
+ break;
+
+ case SO_SETFIB:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ goto bad;
+
+ if (optval < 0 || optval >= rt_numfibs) {
+ error = EINVAL;
+ goto bad;
+ }
+ if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
+ (so->so_proto->pr_domain->dom_family == PF_INET6) ||
+ (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
+ so->so_fibnum = optval;
+ else
+ so->so_fibnum = 0;
+ break;
+
+ case SO_USER_COOKIE:
+ error = sooptcopyin(sopt, &val32, sizeof val32,
+ sizeof val32);
+ if (error)
+ goto bad;
+ so->so_user_cookie = val32;
+ break;
+
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+ case SO_SNDLOWAT:
+ case SO_RCVLOWAT:
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ goto bad;
+
+ /*
+ * Values < 1 make no sense for any of these options,
+ * so disallow them.
+ */
+ if (optval < 1) {
+ error = EINVAL;
+ goto bad;
+ }
+
+ switch (sopt->sopt_name) {
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+ if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
+ &so->so_snd : &so->so_rcv, (u_long)optval,
+ so, curthread) == 0) {
+ error = ENOBUFS;
+ goto bad;
+ }
+ (sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
+ &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
+ break;
+
+ /*
+ * Make sure the low-water is never greater than the
+ * high-water.
+ */
+ case SO_SNDLOWAT:
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_lowat =
+ (optval > so->so_snd.sb_hiwat) ?
+ so->so_snd.sb_hiwat : optval;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ break;
+ case SO_RCVLOWAT:
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_rcv.sb_lowat =
+ (optval > so->so_rcv.sb_hiwat) ?
+ so->so_rcv.sb_hiwat : optval;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ break;
+ }
+ break;
+
+ case SO_SNDTIMEO:
+ case SO_RCVTIMEO:
+#ifdef COMPAT_FREEBSD32
+ if (SV_CURPROC_FLAG(SV_ILP32)) {
+ struct timeval32 tv32;
+
+ error = sooptcopyin(sopt, &tv32, sizeof tv32,
+ sizeof tv32);
+ CP(tv32, tv, tv_sec);
+ CP(tv32, tv, tv_usec);
+ } else
+#endif
+ error = sooptcopyin(sopt, &tv, sizeof tv,
+ sizeof tv);
+ if (error)
+ goto bad;
+ if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
+ tv.tv_usec >= 1000000) {
+ error = EDOM;
+ goto bad;
+ }
+ val = tvtosbt(tv);
+
+ switch (sopt->sopt_name) {
+ case SO_SNDTIMEO:
+ so->so_snd.sb_timeo = val;
+ break;
+ case SO_RCVTIMEO:
+ so->so_rcv.sb_timeo = val;
+ break;
+ }
+ break;
+
+ case SO_LABEL:
+#ifdef MAC
+ error = sooptcopyin(sopt, &extmac, sizeof extmac,
+ sizeof extmac);
+ if (error)
+ goto bad;
+ error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
+ so, &extmac);
+#else
+ error = EOPNOTSUPP;
+#endif
+ break;
+
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ if (error == 0 && so->so_proto->pr_ctloutput != NULL)
+ (void)(*so->so_proto->pr_ctloutput)(so, sopt);
+ }
+bad:
+ CURVNET_RESTORE();
+ return (error);
+}
+
+/*
+ * Helper routine for getsockopt.
+ */
+int
+sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
+{
+ int error;
+ size_t valsize;
+
+ error = 0;
+
+ /*
+ * Documented get behavior is that we always return a value, possibly
+ * truncated to fit in the user's buffer. Traditional behavior is
+ * that we always tell the user precisely how much we copied, rather
+ * than something useful like the total amount we had available for
+ * her. Note that this interface is not idempotent; the entire
+ * answer must be generated ahead of time.
+ */
+ valsize = min(len, sopt->sopt_valsize);
+ sopt->sopt_valsize = valsize;
+ if (sopt->sopt_val != NULL) {
+ if (sopt->sopt_td != NULL)
+ error = copyout(buf, sopt->sopt_val, valsize);
+ else
+ bcopy(buf, sopt->sopt_val, valsize);
+ }
+ return (error);
+}
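+
+/*
+ * Example: a minimal sketch of the SOPT_GET side of a protocol-level
+ * pr_ctloutput() routine returning a fixed-size integer option; where the
+ * value comes from is protocol-specific:
+ *
+ *	int optval;
+ *
+ *	optval = (value read from protocol state);
+ *	return (sooptcopyout(sopt, &optval, sizeof(optval)));
+ */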
+
+int
+sogetopt(struct socket *so, struct sockopt *sopt)
+{
+ int error, optval;
+ struct linger l;
+ struct timeval tv;
+#ifdef MAC
+ struct mac extmac;
+#endif
+
+ CURVNET_SET(so->so_vnet);
+ error = 0;
+ if (sopt->sopt_level != SOL_SOCKET) {
+ if (so->so_proto->pr_ctloutput != NULL)
+ error = (*so->so_proto->pr_ctloutput)(so, sopt);
+ else
+ error = ENOPROTOOPT;
+ CURVNET_RESTORE();
+ return (error);
+ } else {
+ switch (sopt->sopt_name) {
+#ifdef INET
+ case SO_ACCEPTFILTER:
+ error = do_getopt_accept_filter(so, sopt);
+ break;
+#endif
+ case SO_LINGER:
+ SOCK_LOCK(so);
+ l.l_onoff = so->so_options & SO_LINGER;
+ l.l_linger = so->so_linger;
+ SOCK_UNLOCK(so);
+ error = sooptcopyout(sopt, &l, sizeof l);
+ break;
+
+ case SO_USELOOPBACK:
+ case SO_DONTROUTE:
+ case SO_DEBUG:
+ case SO_KEEPALIVE:
+ case SO_REUSEADDR:
+ case SO_REUSEPORT:
+ case SO_BROADCAST:
+ case SO_OOBINLINE:
+ case SO_ACCEPTCONN:
+ case SO_TIMESTAMP:
+ case SO_BINTIME:
+ case SO_NOSIGPIPE:
+ optval = so->so_options & sopt->sopt_name;
+integer:
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+
+ case SO_TYPE:
+ optval = so->so_type;
+ goto integer;
+
+ case SO_PROTOCOL:
+ optval = so->so_proto->pr_protocol;
+ goto integer;
+
+ case SO_ERROR:
+ SOCK_LOCK(so);
+ optval = so->so_error;
+ so->so_error = 0;
+ SOCK_UNLOCK(so);
+ goto integer;
+
+ case SO_SNDBUF:
+ optval = so->so_snd.sb_hiwat;
+ goto integer;
+
+ case SO_RCVBUF:
+ optval = so->so_rcv.sb_hiwat;
+ goto integer;
+
+ case SO_SNDLOWAT:
+ optval = so->so_snd.sb_lowat;
+ goto integer;
+
+ case SO_RCVLOWAT:
+ optval = so->so_rcv.sb_lowat;
+ goto integer;
+
+ case SO_SNDTIMEO:
+ case SO_RCVTIMEO:
+ tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
+     so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
+#ifdef COMPAT_FREEBSD32
+ if (SV_CURPROC_FLAG(SV_ILP32)) {
+ struct timeval32 tv32;
+
+ CP(tv, tv32, tv_sec);
+ CP(tv, tv32, tv_usec);
+ error = sooptcopyout(sopt, &tv32, sizeof tv32);
+ } else
+#endif
+ error = sooptcopyout(sopt, &tv, sizeof tv);
+ break;
+
+ case SO_LABEL:
+#ifdef MAC
+ error = sooptcopyin(sopt, &extmac, sizeof(extmac),
+ sizeof(extmac));
+ if (error)
+ goto bad;
+ error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
+ so, &extmac);
+ if (error)
+ goto bad;
+ error = sooptcopyout(sopt, &extmac, sizeof extmac);
+#else
+ error = EOPNOTSUPP;
+#endif
+ break;
+
+ case SO_PEERLABEL:
+#ifdef MAC
+ error = sooptcopyin(sopt, &extmac, sizeof(extmac),
+ sizeof(extmac));
+ if (error)
+ goto bad;
+ error = mac_getsockopt_peerlabel(
+ sopt->sopt_td->td_ucred, so, &extmac);
+ if (error)
+ goto bad;
+ error = sooptcopyout(sopt, &extmac, sizeof extmac);
+#else
+ error = EOPNOTSUPP;
+#endif
+ break;
+
+ case SO_LISTENQLIMIT:
+ optval = so->so_qlimit;
+ goto integer;
+
+ case SO_LISTENQLEN:
+ optval = so->so_qlen;
+ goto integer;
+
+ case SO_LISTENINCQLEN:
+ optval = so->so_incqlen;
+ goto integer;
+
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ }
+#ifdef MAC
+bad:
+#endif
+ CURVNET_RESTORE();
+ return (error);
+}
+
+int
+soopt_getm(struct sockopt *sopt, struct mbuf **mp)
+{
+ struct mbuf *m, *m_prev;
+ int sopt_size = sopt->sopt_valsize;
+
+ MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
+ if (m == NULL)
+ return ENOBUFS;
+ if (sopt_size > MLEN) {
+ MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
+ if ((m->m_flags & M_EXT) == 0) {
+ m_free(m);
+ return ENOBUFS;
+ }
+ m->m_len = min(MCLBYTES, sopt_size);
+ } else {
+ m->m_len = min(MLEN, sopt_size);
+ }
+ sopt_size -= m->m_len;
+ *mp = m;
+ m_prev = m;
+
+ while (sopt_size) {
+ MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
+ if (m == NULL) {
+ m_freem(*mp);
+ return ENOBUFS;
+ }
+ if (sopt_size > MLEN) {
+ MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
+ M_NOWAIT);
+ if ((m->m_flags & M_EXT) == 0) {
+ m_freem(m);
+ m_freem(*mp);
+ return ENOBUFS;
+ }
+ m->m_len = min(MCLBYTES, sopt_size);
+ } else {
+ m->m_len = min(MLEN, sopt_size);
+ }
+ sopt_size -= m->m_len;
+ m_prev->m_next = m;
+ m_prev = m;
+ }
+ return (0);
+}
+
+int
+soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
+{
+ struct mbuf *m0 = m;
+
+ if (sopt->sopt_val == NULL)
+ return (0);
+ while (m != NULL && sopt->sopt_valsize >= m->m_len) {
+ if (sopt->sopt_td != NULL) {
+ int error;
+
+ error = copyin(sopt->sopt_val, mtod(m, char *),
+ m->m_len);
+ if (error != 0) {
+ m_freem(m0);
+ return(error);
+ }
+ } else
+ bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
+ sopt->sopt_valsize -= m->m_len;
+ sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
+ m = m->m_next;
+ }
+ if (m != NULL) /* enough space should have been allocated at ip6_sooptmcopyin() */
+ panic("ip6_sooptmcopyin");
+ return (0);
+}
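+
+/*
+ * Example: a minimal sketch of how protocol code that wants the option value
+ * in an mbuf chain pairs soopt_getm() with soopt_mcopyin(); note that
+ * soopt_mcopyin() frees the chain itself on a copyin failure:
+ *
+ *	struct mbuf *m;
+ *	int error;
+ *
+ *	error = soopt_getm(sopt, &m);
+ *	if (error == 0)
+ *		error = soopt_mcopyin(sopt, m);
+ *	if (error == 0) {
+ *		(parse the chain, then m_freem() it when done)
+ *	}
+ */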
+
+int
+soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
+{
+ struct mbuf *m0 = m;
+ size_t valsize = 0;
+
+ if (sopt->sopt_val == NULL)
+ return (0);
+ while (m != NULL && sopt->sopt_valsize >= m->m_len) {
+ if (sopt->sopt_td != NULL) {
+ int error;
+
+ error = copyout(mtod(m, char *), sopt->sopt_val,
+ m->m_len);
+ if (error != 0) {
+ m_freem(m0);
+ return(error);
+ }
+ } else
+ bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
+ sopt->sopt_valsize -= m->m_len;
+ sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
+ valsize += m->m_len;
+ m = m->m_next;
+ }
+ if (m != NULL) {
+ /* a sufficiently large soopt buffer should have been provided from userland */
+ m_freem(m0);
+ return(EINVAL);
+ }
+ sopt->sopt_valsize = valsize;
+ return (0);
+}
+
+/*
+ * sohasoutofband(): protocol notifies socket layer of the arrival of new
+ * out-of-band data, which will then notify socket consumers.
+ */
+void
+sohasoutofband(struct socket *so)
+{
+
+ if (so->so_sigio != NULL)
+ pgsigio(&so->so_sigio, SIGURG, 0);
+ selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
+}
+
+int
+sopoll(struct socket *so, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+
+ /*
+ * We do not need to set or assert curvnet as long as everyone uses
+ * sopoll_generic().
+ */
+ return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
+ td));
+}
+
+int
+sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
+ struct thread *td)
+{
+ int revents = 0;
+
+ SOCKBUF_LOCK(&so->so_snd);
+ SOCKBUF_LOCK(&so->so_rcv);
+ if (events & (POLLIN | POLLRDNORM))
+ if (soreadabledata(so))
+ revents |= events & (POLLIN | POLLRDNORM);
+
+ if (events & (POLLOUT | POLLWRNORM))
+ if (sowriteable(so))
+ revents |= events & (POLLOUT | POLLWRNORM);
+
+ if (events & (POLLPRI | POLLRDBAND))
+ if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
+ revents |= events & (POLLPRI | POLLRDBAND);
+
+ if ((events & POLLINIGNEOF) == 0) {
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ revents |= events & (POLLIN | POLLRDNORM);
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE)
+ revents |= POLLHUP;
+ }
+ }
+
+ if (revents == 0) {
+ if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
+ selrecord(td, &so->so_rcv.sb_sel);
+ so->so_rcv.sb_flags |= SB_SEL;
+ }
+
+ if (events & (POLLOUT | POLLWRNORM)) {
+ selrecord(td, &so->so_snd.sb_sel);
+ so->so_snd.sb_flags |= SB_SEL;
+ }
+ }
+
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (revents);
+}
+
+int
+soo_kqfilter(struct file *fp, struct knote *kn)
+{
+ struct socket *so = kn->kn_fp->f_data;
+ struct sockbuf *sb;
+
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ if (so->so_options & SO_ACCEPTCONN)
+ kn->kn_fop = &solisten_filtops;
+ else
+ kn->kn_fop = &soread_filtops;
+ sb = &so->so_rcv;
+ break;
+ case EVFILT_WRITE:
+ kn->kn_fop = &sowrite_filtops;
+ sb = &so->so_snd;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ SOCKBUF_LOCK(sb);
+ knlist_add(&sb->sb_sel.si_note, kn, 1);
+ sb->sb_flags |= SB_KNOTE;
+ SOCKBUF_UNLOCK(sb);
+ return (0);
+}
+
+/*
+ * Some routines that return EOPNOTSUPP for entry points that are not
+ * supported by a protocol. Fill in as needed.
+ */
+int
+pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
+ struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
+ struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_connect2_notsupp(struct socket *so1, struct socket *so2)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
+ struct ifnet *ifp, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_disconnect_notsupp(struct socket *so)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvd_notsupp(struct socket *so, int flags)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
+ struct sockaddr *addr, struct mbuf *control, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+/*
+ * This isn't really a ``null'' operation, but it's the default one and
+ * doesn't do anything destructive.
+ */
+int
+pru_sense_null(struct socket *so, struct stat *sb)
+{
+
+ sb->st_blksize = so->so_snd.sb_hiwat;
+ return 0;
+}
+
+int
+pru_shutdown_notsupp(struct socket *so)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
+ struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
+ struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+
+ return EOPNOTSUPP;
+}
+
+int
+pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
+ struct thread *td)
+{
+
+ return EOPNOTSUPP;
+}
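+
+/*
+ * Example: a minimal sketch of how a protocol plugs the stubs above into its
+ * pr_usrreqs for entry points it does not implement; the structure name and
+ * the selection of members are illustrative only:
+ *
+ *	struct pr_usrreqs example_usrreqs = {
+ *		.pru_accept =	pru_accept_notsupp,
+ *		.pru_connect2 =	pru_connect2_notsupp,
+ *		.pru_rcvoob =	pru_rcvoob_notsupp,
+ *		.pru_sense =	pru_sense_null,
+ *	};
+ */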
+
+static void
+filt_sordetach(struct knote *kn)
+{
+ struct socket *so = kn->kn_fp->f_data;
+
+ SOCKBUF_LOCK(&so->so_rcv);
+ knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
+ if (knlist_empty(&so->so_rcv.sb_sel.si_note))
+ so->so_rcv.sb_flags &= ~SB_KNOTE;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+}
+
+/*ARGSUSED*/
+static int
+filt_soread(struct knote *kn, long hint)
+{
+ struct socket *so;
+
+ so = kn->kn_fp->f_data;
+ SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+
+ kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
+ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ kn->kn_flags |= EV_EOF;
+ kn->kn_fflags = so->so_error;
+ return (1);
+ } else if (so->so_error) /* temporary udp error */
+ return (1);
+ else if (kn->kn_sfflags & NOTE_LOWAT)
+ return (kn->kn_data >= kn->kn_sdata);
+ else
+ return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
+}
+
+static void
+filt_sowdetach(struct knote *kn)
+{
+ struct socket *so = kn->kn_fp->f_data;
+
+ SOCKBUF_LOCK(&so->so_snd);
+ knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
+ if (knlist_empty(&so->so_snd.sb_sel.si_note))
+ so->so_snd.sb_flags &= ~SB_KNOTE;
+ SOCKBUF_UNLOCK(&so->so_snd);
+}
+
+/*ARGSUSED*/
+static int
+filt_sowrite(struct knote *kn, long hint)
+{
+ struct socket *so;
+
+ so = kn->kn_fp->f_data;
+ SOCKBUF_LOCK_ASSERT(&so->so_snd);
+ kn->kn_data = sbspace(&so->so_snd);
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ kn->kn_flags |= EV_EOF;
+ kn->kn_fflags = so->so_error;
+ return (1);
+ } else if (so->so_error) /* temporary udp error */
+ return (1);
+ else if (((so->so_state & SS_ISCONNECTED) == 0) &&
+ (so->so_proto->pr_flags & PR_CONNREQUIRED))
+ return (0);
+ else if (kn->kn_sfflags & NOTE_LOWAT)
+ return (kn->kn_data >= kn->kn_sdata);
+ else
+ return (kn->kn_data >= so->so_snd.sb_lowat);
+}
+
+/*ARGSUSED*/
+static int
+filt_solisten(struct knote *kn, long hint)
+{
+ struct socket *so = kn->kn_fp->f_data;
+
+ kn->kn_data = so->so_qlen;
+ return (!TAILQ_EMPTY(&so->so_comp));
+}
+
+int
+socheckuid(struct socket *so, uid_t uid)
+{
+
+ if (so == NULL)
+ return (EPERM);
+ if (so->so_cred->cr_uid != uid)
+ return (EPERM);
+ return (0);
+}
+
+/*
+ * These functions are used by protocols to notify the socket layer (and its
+ * consumers) of state changes in the sockets driven by protocol-side events.
+ */
+
+/*
+ * Procedures to manipulate state flags of socket and do appropriate wakeups.
+ *
+ * Normal sequence from the active (originating) side is that
+ * soisconnecting() is called during processing of connect() call, resulting
+ * in an eventual call to soisconnected() if/when the connection is
+ * established. When the connection is torn down soisdisconnecting() is
+ * called during processing of disconnect() call, and soisdisconnected() is
+ * called when the connection to the peer is totally severed. The semantics
+ * of these routines are such that connectionless protocols can call
+ * soisconnected() and soisdisconnected() only, bypassing the in-progress
+ * calls when setting up a ``connection'' takes no time.
+ *
+ * From the passive side, a socket is created with two queues of sockets:
+ * so_incomp for connections in progress and so_comp for connections already
+ * made and awaiting user acceptance. As a protocol is preparing incoming
+ * connections, it creates a socket structure queued on so_incomp by calling
+ * sonewconn(). When the connection is established, soisconnected() is
+ * called, and transfers the socket structure to so_comp, making it available
+ * to accept().
+ *
+ * If a socket is closed with sockets on either so_incomp or so_comp, these
+ * sockets are dropped.
+ *
+ * If higher-level protocols are implemented in the kernel, the wakeups done
+ * here will sometimes cause software-interrupt process scheduling.
+ */
+void
+soisconnecting(struct socket *so)
+{
+
+ SOCK_LOCK(so);
+ so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
+ so->so_state |= SS_ISCONNECTING;
+ SOCK_UNLOCK(so);
+}
+
+void
+soisconnected(struct socket *so)
+{
+ struct socket *head;
+ int ret;
+
+restart:
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
+ so->so_state |= SS_ISCONNECTED;
+ head = so->so_head;
+ if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
+ if ((so->so_options & SO_ACCEPTFILTER) == 0) {
+ SOCK_UNLOCK(so);
+ TAILQ_REMOVE(&head->so_incomp, so, so_list);
+ head->so_incqlen--;
+ so->so_qstate &= ~SQ_INCOMP;
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ head->so_qlen++;
+ so->so_qstate |= SQ_COMP;
+ ACCEPT_UNLOCK();
+ sorwakeup(head);
+ wakeup_one(&head->so_timeo);
+ } else {
+ ACCEPT_UNLOCK();
+ soupcall_set(so, SO_RCV,
+ head->so_accf->so_accept_filter->accf_callback,
+ head->so_accf->so_accept_filter_arg);
+ so->so_options &= ~SO_ACCEPTFILTER;
+ ret = head->so_accf->so_accept_filter->accf_callback(so,
+ head->so_accf->so_accept_filter_arg, M_NOWAIT);
+ if (ret == SU_ISCONNECTED)
+ soupcall_clear(so, SO_RCV);
+ SOCK_UNLOCK(so);
+ if (ret == SU_ISCONNECTED)
+ goto restart;
+ }
+ return;
+ }
+ SOCK_UNLOCK(so);
+ ACCEPT_UNLOCK();
+ wakeup(&so->so_timeo);
+ sorwakeup(so);
+ sowwakeup(so);
+}
+
+void
+soisdisconnecting(struct socket *so)
+{
+
+ /*
+ * Note: This code assumes that SOCK_LOCK(so) and
+ * SOCKBUF_LOCK(&so->so_rcv) are the same.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_state &= ~SS_ISCONNECTING;
+ so->so_state |= SS_ISDISCONNECTING;
+ so->so_rcv.sb_state |= SBS_CANTRCVMORE;
+ sorwakeup_locked(so);
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_state |= SBS_CANTSENDMORE;
+ sowwakeup_locked(so);
+ wakeup(&so->so_timeo);
+}
+
+void
+soisdisconnected(struct socket *so)
+{
+
+ /*
+ * Note: This code assumes that SOCK_LOCK(so) and
+ * SOCKBUF_LOCK(&so->so_rcv) are the same.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
+ so->so_state |= SS_ISDISCONNECTED;
+ so->so_rcv.sb_state |= SBS_CANTRCVMORE;
+ sorwakeup_locked(so);
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_state |= SBS_CANTSENDMORE;
+ sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
+ sowwakeup_locked(so);
+ wakeup(&so->so_timeo);
+}
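+
+/*
+ * Example: a minimal sketch of the active-side call sequence a
+ * connection-oriented protocol drives through the routines above; the events
+ * in parentheses are protocol-specific:
+ *
+ *	soisconnecting(so);	(connection request sent)
+ *	soisconnected(so);	(handshake completed)
+ *	soisdisconnecting(so);	(local close initiated)
+ *	soisdisconnected(so);	(connection fully torn down)
+ */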
+
+/*
+ * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
+ */
+struct sockaddr *
+sodupsockaddr(const struct sockaddr *sa, int mflags)
+{
+ struct sockaddr *sa2;
+
+ sa2 = malloc(sa->sa_len, M_SONAME, mflags);
+ if (sa2)
+ bcopy(sa, sa2, sa->sa_len);
+ return sa2;
+}
+
+/*
+ * Register per-socket buffer upcalls.
+ */
+void
+soupcall_set(struct socket *so, int which,
+ int (*func)(struct socket *, void *, int), void *arg)
+{
+ struct sockbuf *sb;
+
+ switch (which) {
+ case SO_RCV:
+ sb = &so->so_rcv;
+ break;
+ case SO_SND:
+ sb = &so->so_snd;
+ break;
+ default:
+ panic("soupcall_set: bad which");
+ }
+ SOCKBUF_LOCK_ASSERT(sb);
+#if 0
+ /* XXX: accf_http actually wants to do this on purpose. */
+ KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall"));
+#endif
+ sb->sb_upcall = func;
+ sb->sb_upcallarg = arg;
+ sb->sb_flags |= SB_UPCALL;
+}
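+
+/*
+ * Example: a minimal sketch of an in-kernel consumer registering a receive
+ * upcall; example_upcall() and `arg' are hypothetical, and the sockbuf lock
+ * must be held across soupcall_set() as asserted above:
+ *
+ *	static int
+ *	example_upcall(struct socket *so, void *arg, int waitflag)
+ *	{
+ *		(note that data is available; do not sleep here)
+ *		return (SU_OK);
+ *	}
+ *
+ *	SOCKBUF_LOCK(&so->so_rcv);
+ *	soupcall_set(so, SO_RCV, example_upcall, arg);
+ *	SOCKBUF_UNLOCK(&so->so_rcv);
+ */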
+
+void
+soupcall_clear(struct socket *so, int which)
+{
+ struct sockbuf *sb;
+
+ switch (which) {
+ case SO_RCV:
+ sb = &so->so_rcv;
+ break;
+ case SO_SND:
+ sb = &so->so_snd;
+ break;
+ default:
+ panic("soupcall_clear: bad which");
+ }
+ SOCKBUF_LOCK_ASSERT(sb);
+ KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear"));
+ sb->sb_upcall = NULL;
+ sb->sb_upcallarg = NULL;
+ sb->sb_flags &= ~SB_UPCALL;
+}
+
+/*
+ * Create an external-format (``xsocket'') structure using the information in
+ * the kernel-format socket structure pointed to by so. This is done to
+ * reduce the spew of irrelevant information over this interface, to isolate
+ * user code from changes in the kernel structure, and potentially to provide
+ * information-hiding if we decide that some of this information should be
+ * hidden from users.
+ */
+void
+sotoxsocket(struct socket *so, struct xsocket *xso)
+{
+
+ xso->xso_len = sizeof *xso;
+ xso->xso_so = so;
+ xso->so_type = so->so_type;
+ xso->so_options = so->so_options;
+ xso->so_linger = so->so_linger;
+ xso->so_state = so->so_state;
+ xso->so_pcb = so->so_pcb;
+ xso->xso_protocol = so->so_proto->pr_protocol;
+ xso->xso_family = so->so_proto->pr_domain->dom_family;
+ xso->so_qlen = so->so_qlen;
+ xso->so_incqlen = so->so_incqlen;
+ xso->so_qlimit = so->so_qlimit;
+ xso->so_timeo = so->so_timeo;
+ xso->so_error = so->so_error;
+ xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
+ xso->so_oobmark = so->so_oobmark;
+ sbtoxsockbuf(&so->so_snd, &xso->so_snd);
+ sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
+ xso->so_uid = so->so_cred->cr_uid;
+}
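+
+/*
+ * Example: a minimal sketch of a sysctl-style exporter; `so' and the sysctl
+ * request handle `req' are assumed to be supplied by the caller:
+ *
+ *	struct xsocket xso;
+ *	int error;
+ *
+ *	sotoxsocket(so, &xso);
+ *	error = SYSCTL_OUT(req, &xso, sizeof(xso));
+ */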
+
+/*
+ * Socket accessor functions to provide external consumers with a safe
+ * interface to socket state.
+ */
+
+void
+so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *),
+ void *arg)
+{
+
+ TAILQ_FOREACH(so, &so->so_comp, so_list)
+ func(so, arg);
+}
+
+struct sockbuf *
+so_sockbuf_rcv(struct socket *so)
+{
+
+ return (&so->so_rcv);
+}
+
+struct sockbuf *
+so_sockbuf_snd(struct socket *so)
+{
+
+ return (&so->so_snd);
+}
+
+int
+so_state_get(const struct socket *so)
+{
+
+ return (so->so_state);
+}
+
+void
+so_state_set(struct socket *so, int val)
+{
+
+ so->so_state = val;
+}
+
+int
+so_options_get(const struct socket *so)
+{
+
+ return (so->so_options);
+}
+
+void
+so_options_set(struct socket *so, int val)
+{
+
+ so->so_options = val;
+}
+
+int
+so_error_get(const struct socket *so)
+{
+
+ return (so->so_error);
+}
+
+void
+so_error_set(struct socket *so, int val)
+{
+
+ so->so_error = val;
+}
+
+int
+so_linger_get(const struct socket *so)
+{
+
+ return (so->so_linger);
+}
+
+void
+so_linger_set(struct socket *so, int val)
+{
+
+ so->so_linger = val;
+}
+
+struct protosw *
+so_protosw_get(const struct socket *so)
+{
+
+ return (so->so_proto);
+}
+
+void
+so_protosw_set(struct socket *so, struct protosw *val)
+{
+
+ so->so_proto = val;
+}
+
+void
+so_sorwakeup(struct socket *so)
+{
+
+ sorwakeup(so);
+}
+
+void
+so_sowwakeup(struct socket *so)
+{
+
+ sowwakeup(so);
+}
+
+void
+so_sorwakeup_locked(struct socket *so)
+{
+
+ sorwakeup_locked(so);
+}
+
+void
+so_sowwakeup_locked(struct socket *so)
+{
+
+ sowwakeup_locked(so);
+}
+
+void
+so_lock(struct socket *so)
+{
+
+ SOCK_LOCK(so);
+}
+
+void
+so_unlock(struct socket *so)
+{
+
+ SOCK_UNLOCK(so);
+}
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
new file mode 100644
index 0000000..8229390
--- /dev/null
+++ b/sys/kern/uipc_syscalls.c
@@ -0,0 +1,2935 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * sendfile(2) and related extensions:
+ * Copyright (c) 1998, David Greenman. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_sctp.h"
+#include "opt_compat.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/capability.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/malloc.h>
+#include <sys/filedesc.h>
+#include <sys/event.h>
+#include <sys/proc.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/jail.h>
+#include <sys/mount.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/rwlock.h>
+#include <sys/sf_buf.h>
+#include <sys/sysent.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/signalvar.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+#ifdef COMPAT_FREEBSD32
+#include <compat/freebsd32/freebsd32_util.h>
+#endif
+
+#include <net/vnet.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+
+#if defined(INET) || defined(INET6)
+#ifdef SCTP
+#include <netinet/sctp.h>
+#include <netinet/sctp_peeloff.h>
+#endif /* SCTP */
+#endif /* INET || INET6 */
+
+/*
+ * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
+ * and SOCK_NONBLOCK.
+ */
+#define ACCEPT4_INHERIT 0x1
+#define ACCEPT4_COMPAT 0x2
+
+static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
+static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
+
+static int accept1(struct thread *td, int s, struct sockaddr *uname,
+ socklen_t *anamelen, int flags);
+static int do_sendfile(struct thread *td, struct sendfile_args *uap,
+ int compat);
+static int getsockname1(struct thread *td, struct getsockname_args *uap,
+ int compat);
+static int getpeername1(struct thread *td, struct getpeername_args *uap,
+ int compat);
+
+counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
+
+/*
+ * sendfile(2)-related variables and associated sysctls
+ */
+int nsfbufs;
+int nsfbufspeak;
+int nsfbufsused;
+static int sfreadahead = 1;
+
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
+ "Maximum number of sendfile(2) sf_bufs available");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
+ "Number of sendfile(2) sf_bufs at peak usage");
+SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
+ "Number of sendfile(2) sf_bufs in use");
+SYSCTL_INT(_kern_ipc, OID_AUTO, sfreadahead, CTLFLAG_RW, &sfreadahead, 0,
+ "Number of sendfile(2) read-ahead MAXBSIZE blocks");
+
+
+static void
+sfstat_init(const void *unused)
+{
+
+ COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
+ M_WAITOK);
+}
+SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
+
+static int
+sfstat_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct sfstat s;
+
+ COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
+ if (req->newptr)
+ COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
+ return (SYSCTL_OUT(req, &s, sizeof(s)));
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
+ NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
+
+/*
+ * Convert a user file descriptor to a kernel file entry and check if required
+ * capability rights are present.
+ * A reference on the file entry is held upon returning.
+ */
+static int
+getsock_cap(struct filedesc *fdp, int fd, cap_rights_t *rightsp,
+ struct file **fpp, u_int *fflagp)
+{
+ struct file *fp;
+ int error;
+
+ error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
+ if (error != 0)
+ return (error);
+ if (fp->f_type != DTYPE_SOCKET) {
+ fdrop(fp, curthread);
+ return (ENOTSOCK);
+ }
+ if (fflagp != NULL)
+ *fflagp = fp->f_flag;
+ *fpp = fp;
+ return (0);
+}
+
+/*
+ * System call interface to the socket abstraction.
+ */
+#if defined(COMPAT_43)
+#define COMPAT_OLDSOCK
+#endif
+
+int
+sys_socket(td, uap)
+ struct thread *td;
+ struct socket_args /* {
+ int domain;
+ int type;
+ int protocol;
+ } */ *uap;
+{
+ struct socket *so;
+ struct file *fp;
+ int fd, error, type, oflag, fflag;
+
+ AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
+
+ type = uap->type;
+ oflag = 0;
+ fflag = 0;
+ if ((type & SOCK_CLOEXEC) != 0) {
+ type &= ~SOCK_CLOEXEC;
+ oflag |= O_CLOEXEC;
+ }
+ if ((type & SOCK_NONBLOCK) != 0) {
+ type &= ~SOCK_NONBLOCK;
+ fflag |= FNONBLOCK;
+ }
+
+#ifdef MAC
+ error = mac_socket_check_create(td->td_ucred, uap->domain, type,
+ uap->protocol);
+ if (error != 0)
+ return (error);
+#endif
+ error = falloc(td, &fp, &fd, oflag);
+ if (error != 0)
+ return (error);
+ /* An extra reference on `fp' has been held for us by falloc(). */
+ error = socreate(uap->domain, &so, type, uap->protocol,
+ td->td_ucred, td);
+ if (error != 0) {
+ fdclose(td->td_proc->p_fd, fp, fd, td);
+ } else {
+ finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
+ if ((fflag & FNONBLOCK) != 0)
+ (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
+ td->td_retval[0] = fd;
+ }
+ fdrop(fp, td);
+ return (error);
+}
+
+/* ARGSUSED */
+int
+sys_bind(td, uap)
+ struct thread *td;
+ struct bind_args /* {
+ int s;
+ caddr_t name;
+ int namelen;
+ } */ *uap;
+{
+ struct sockaddr *sa;
+ int error;
+
+ error = getsockaddr(&sa, uap->name, uap->namelen);
+ if (error == 0) {
+ error = kern_bind(td, uap->s, sa);
+ free(sa, M_SONAME);
+ }
+ return (error);
+}
+
+static int
+kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
+{
+ struct socket *so;
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ AUDIT_ARG_SOCKADDR(td, dirfd, sa);
+ error = getsock_cap(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_BIND), &fp, NULL);
+ if (error != 0)
+ return (error);
+ so = fp->f_data;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrsockaddr(sa);
+#endif
+#ifdef MAC
+ error = mac_socket_check_bind(td->td_ucred, so, sa);
+ if (error == 0) {
+#endif
+ if (dirfd == AT_FDCWD)
+ error = sobind(so, sa, td);
+ else
+ error = sobindat(dirfd, so, sa, td);
+#ifdef MAC
+ }
+#endif
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+kern_bind(struct thread *td, int fd, struct sockaddr *sa)
+{
+
+ return (kern_bindat(td, AT_FDCWD, fd, sa));
+}
+
+/* ARGSUSED */
+int
+sys_bindat(td, uap)
+ struct thread *td;
+ struct bindat_args /* {
+ int fd;
+ int s;
+ caddr_t name;
+ int namelen;
+ } */ *uap;
+{
+ struct sockaddr *sa;
+ int error;
+
+ error = getsockaddr(&sa, uap->name, uap->namelen);
+ if (error == 0) {
+ error = kern_bindat(td, uap->fd, uap->s, sa);
+ free(sa, M_SONAME);
+ }
+ return (error);
+}
+
+/* ARGSUSED */
+int
+sys_listen(td, uap)
+ struct thread *td;
+ struct listen_args /* {
+ int s;
+ int backlog;
+ } */ *uap;
+{
+ struct socket *so;
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->s);
+ error = getsock_cap(td->td_proc->p_fd, uap->s,
+ cap_rights_init(&rights, CAP_LISTEN), &fp, NULL);
+ if (error == 0) {
+ so = fp->f_data;
+#ifdef MAC
+ error = mac_socket_check_listen(td->td_ucred, so);
+ if (error == 0)
+#endif
+ error = solisten(so, uap->backlog, td);
+ fdrop(fp, td);
+ }
+ return(error);
+}
+
+/*
+ * accept1()
+ */
+static int
+accept1(td, s, uname, anamelen, flags)
+ struct thread *td;
+ int s;
+ struct sockaddr *uname;
+ socklen_t *anamelen;
+ int flags;
+{
+ struct sockaddr *name;
+ socklen_t namelen;
+ struct file *fp;
+ int error;
+
+ if (uname == NULL)
+ return (kern_accept4(td, s, NULL, NULL, flags, NULL));
+
+ error = copyin(anamelen, &namelen, sizeof (namelen));
+ if (error != 0)
+ return (error);
+
+ error = kern_accept4(td, s, &name, &namelen, flags, &fp);
+
+ /*
+ * return a namelen of zero for older code which might
+ * ignore the return value from accept.
+ */
+ if (error != 0) {
+ (void) copyout(&namelen, anamelen, sizeof(*anamelen));
+ return (error);
+ }
+
+ if (error == 0 && uname != NULL) {
+#ifdef COMPAT_OLDSOCK
+ if (flags & ACCEPT4_COMPAT)
+ ((struct osockaddr *)name)->sa_family =
+ name->sa_family;
+#endif
+ error = copyout(name, uname, namelen);
+ }
+ if (error == 0)
+ error = copyout(&namelen, anamelen,
+ sizeof(namelen));
+ if (error != 0)
+ fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
+ fdrop(fp, td);
+ free(name, M_SONAME);
+ return (error);
+}
+
+int
+kern_accept(struct thread *td, int s, struct sockaddr **name,
+ socklen_t *namelen, struct file **fp)
+{
+ return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
+}
+
+int
+kern_accept4(struct thread *td, int s, struct sockaddr **name,
+ socklen_t *namelen, int flags, struct file **fp)
+{
+ struct filedesc *fdp;
+ struct file *headfp, *nfp = NULL;
+ struct sockaddr *sa = NULL;
+ struct socket *head, *so;
+ cap_rights_t rights;
+ u_int fflag;
+ pid_t pgid;
+ int error, fd, tmp;
+
+ if (name != NULL)
+ *name = NULL;
+
+ AUDIT_ARG_FD(s);
+ fdp = td->td_proc->p_fd;
+ error = getsock_cap(fdp, s, cap_rights_init(&rights, CAP_ACCEPT),
+ &headfp, &fflag);
+ if (error != 0)
+ return (error);
+ head = headfp->f_data;
+ if ((head->so_options & SO_ACCEPTCONN) == 0) {
+ error = EINVAL;
+ goto done;
+ }
+#ifdef MAC
+ error = mac_socket_check_accept(td->td_ucred, head);
+ if (error != 0)
+ goto done;
+#endif
+ error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
+ if (error != 0)
+ goto done;
+ ACCEPT_LOCK();
+ if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
+ ACCEPT_UNLOCK();
+ error = EWOULDBLOCK;
+ goto noconnection;
+ }
+ while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
+ if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
+ head->so_error = ECONNABORTED;
+ break;
+ }
+ error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
+ "accept", 0);
+ if (error != 0) {
+ ACCEPT_UNLOCK();
+ goto noconnection;
+ }
+ }
+ if (head->so_error) {
+ error = head->so_error;
+ head->so_error = 0;
+ ACCEPT_UNLOCK();
+ goto noconnection;
+ }
+ so = TAILQ_FIRST(&head->so_comp);
+ KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
+ KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
+
+ /*
+ * Before changing the flags on the socket, we have to bump the
+ * reference count. Otherwise, if the protocol calls sofree(),
+ * the socket will be released due to a zero refcount.
+ */
+ SOCK_LOCK(so); /* soref() and so_state update */
+ soref(so); /* file descriptor reference */
+
+ TAILQ_REMOVE(&head->so_comp, so, so_list);
+ head->so_qlen--;
+ if (flags & ACCEPT4_INHERIT)
+ so->so_state |= (head->so_state & SS_NBIO);
+ else
+ so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
+ so->so_qstate &= ~SQ_COMP;
+ so->so_head = NULL;
+
+ SOCK_UNLOCK(so);
+ ACCEPT_UNLOCK();
+
+ /* An extra reference on `nfp' has been held for us by falloc(). */
+ td->td_retval[0] = fd;
+
+ /* connection has been removed from the listen queue */
+ KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
+
+ if (flags & ACCEPT4_INHERIT) {
+ pgid = fgetown(&head->so_sigio);
+ if (pgid != 0)
+ fsetown(pgid, &so->so_sigio);
+ } else {
+ fflag &= ~(FNONBLOCK | FASYNC);
+ if (flags & SOCK_NONBLOCK)
+ fflag |= FNONBLOCK;
+ }
+
+ finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
+ /* Sync socket nonblocking/async state with file flags */
+ tmp = fflag & FNONBLOCK;
+ (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
+ tmp = fflag & FASYNC;
+ (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
+ sa = NULL;
+ error = soaccept(so, &sa);
+ if (error != 0) {
+ /*
+ * return a namelen of zero for older code which might
+ * ignore the return value from accept.
+ */
+ if (name)
+ *namelen = 0;
+ goto noconnection;
+ }
+ if (sa == NULL) {
+ if (name)
+ *namelen = 0;
+ goto done;
+ }
+ AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
+ if (name) {
+ /* check sa_len before it is destroyed */
+ if (*namelen > sa->sa_len)
+ *namelen = sa->sa_len;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrsockaddr(sa);
+#endif
+ *name = sa;
+ sa = NULL;
+ }
+noconnection:
+ free(sa, M_SONAME);
+
+ /*
+ * close the new descriptor, assuming someone hasn't ripped it
+ * out from under us.
+ */
+ if (error != 0)
+ fdclose(fdp, nfp, fd, td);
+
+ /*
+ * Release explicitly held references before returning. We return
+ * a reference on nfp to the caller on success if they request it.
+ */
+done:
+ if (fp != NULL) {
+ if (error == 0) {
+ *fp = nfp;
+ nfp = NULL;
+ } else
+ *fp = NULL;
+ }
+ if (nfp != NULL)
+ fdrop(nfp, td);
+ fdrop(headfp, td);
+ return (error);
+}
+
+int
+sys_accept(td, uap)
+ struct thread *td;
+ struct accept_args *uap;
+{
+
+ return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
+}
+
+int
+sys_accept4(td, uap)
+ struct thread *td;
+ struct accept4_args *uap;
+{
+
+ if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+ return (EINVAL);
+
+ return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+oaccept(td, uap)
+ struct thread *td;
+ struct accept_args *uap;
+{
+
+ return (accept1(td, uap->s, uap->name, uap->anamelen,
+ ACCEPT4_INHERIT | ACCEPT4_COMPAT));
+}
+#endif /* COMPAT_OLDSOCK */
+
+/* ARGSUSED */
+int
+sys_connect(td, uap)
+ struct thread *td;
+ struct connect_args /* {
+ int s;
+ caddr_t name;
+ int namelen;
+ } */ *uap;
+{
+ struct sockaddr *sa;
+ int error;
+
+ error = getsockaddr(&sa, uap->name, uap->namelen);
+ if (error == 0) {
+ error = kern_connect(td, uap->s, sa);
+ free(sa, M_SONAME);
+ }
+ return (error);
+}
+
+static int
+kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
+{
+ struct socket *so;
+ struct file *fp;
+ cap_rights_t rights;
+ int error, interrupted = 0;
+
+ AUDIT_ARG_FD(fd);
+ AUDIT_ARG_SOCKADDR(td, dirfd, sa);
+ error = getsock_cap(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_CONNECT), &fp, NULL);
+ if (error != 0)
+ return (error);
+ so = fp->f_data;
+ if (so->so_state & SS_ISCONNECTING) {
+ error = EALREADY;
+ goto done1;
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrsockaddr(sa);
+#endif
+#ifdef MAC
+ error = mac_socket_check_connect(td->td_ucred, so, sa);
+ if (error != 0)
+ goto bad;
+#endif
+ if (dirfd == AT_FDCWD)
+ error = soconnect(so, sa, td);
+ else
+ error = soconnectat(dirfd, so, sa, td);
+ if (error != 0)
+ goto bad;
+ if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
+ error = EINPROGRESS;
+ goto done1;
+ }
+ SOCK_LOCK(so);
+ while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
+ error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
+ "connec", 0);
+ if (error != 0) {
+ if (error == EINTR || error == ERESTART)
+ interrupted = 1;
+ break;
+ }
+ }
+ if (error == 0) {
+ error = so->so_error;
+ so->so_error = 0;
+ }
+ SOCK_UNLOCK(so);
+bad:
+ if (!interrupted)
+ so->so_state &= ~SS_ISCONNECTING;
+ if (error == ERESTART)
+ error = EINTR;
+done1:
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+kern_connect(struct thread *td, int fd, struct sockaddr *sa)
+{
+
+ return (kern_connectat(td, AT_FDCWD, fd, sa));
+}
+
+/* ARGSUSED */
+int
+sys_connectat(td, uap)
+ struct thread *td;
+ struct connectat_args /* {
+ int fd;
+ int s;
+ caddr_t name;
+ int namelen;
+ } */ *uap;
+{
+ struct sockaddr *sa;
+ int error;
+
+ error = getsockaddr(&sa, uap->name, uap->namelen);
+ if (error == 0) {
+ error = kern_connectat(td, uap->fd, uap->s, sa);
+ free(sa, M_SONAME);
+ }
+ return (error);
+}
+
+int
+kern_socketpair(struct thread *td, int domain, int type, int protocol,
+ int *rsv)
+{
+ struct filedesc *fdp = td->td_proc->p_fd;
+ struct file *fp1, *fp2;
+ struct socket *so1, *so2;
+ int fd, error, oflag, fflag;
+
+ AUDIT_ARG_SOCKET(domain, type, protocol);
+
+ oflag = 0;
+ fflag = 0;
+ if ((type & SOCK_CLOEXEC) != 0) {
+ type &= ~SOCK_CLOEXEC;
+ oflag |= O_CLOEXEC;
+ }
+ if ((type & SOCK_NONBLOCK) != 0) {
+ type &= ~SOCK_NONBLOCK;
+ fflag |= FNONBLOCK;
+ }
+#ifdef MAC
+ /* We might want to have a separate check for socket pairs. */
+ error = mac_socket_check_create(td->td_ucred, domain, type,
+ protocol);
+ if (error != 0)
+ return (error);
+#endif
+ error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
+ if (error != 0)
+ return (error);
+ error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
+ if (error != 0)
+ goto free1;
+ /* On success, extra references on `fp1' and `fp2' are held for us by falloc(). */
+ error = falloc(td, &fp1, &fd, oflag);
+ if (error != 0)
+ goto free2;
+ rsv[0] = fd;
+ fp1->f_data = so1; /* so1 already has ref count */
+ error = falloc(td, &fp2, &fd, oflag);
+ if (error != 0)
+ goto free3;
+ fp2->f_data = so2; /* so2 already has ref count */
+ rsv[1] = fd;
+ error = soconnect2(so1, so2);
+ if (error != 0)
+ goto free4;
+ if (type == SOCK_DGRAM) {
+ /*
+ * Datagram socket connection is asymmetric.
+ */
+ error = soconnect2(so2, so1);
+ if (error != 0)
+ goto free4;
+ }
+ finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
+ &socketops);
+ finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
+ &socketops);
+ if ((fflag & FNONBLOCK) != 0) {
+ (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
+ (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
+ }
+ fdrop(fp1, td);
+ fdrop(fp2, td);
+ return (0);
+free4:
+ fdclose(fdp, fp2, rsv[1], td);
+ fdrop(fp2, td);
+free3:
+ fdclose(fdp, fp1, rsv[0], td);
+ fdrop(fp1, td);
+free2:
+ if (so2 != NULL)
+ (void)soclose(so2);
+free1:
+ if (so1 != NULL)
+ (void)soclose(so1);
+ return (error);
+}
+
+int
+sys_socketpair(struct thread *td, struct socketpair_args *uap)
+{
+ int error, sv[2];
+
+ error = kern_socketpair(td, uap->domain, uap->type,
+ uap->protocol, sv);
+ if (error != 0)
+ return (error);
+ error = copyout(sv, uap->rsv, 2 * sizeof(int));
+ if (error != 0) {
+ (void)kern_close(td, sv[0]);
+ (void)kern_close(td, sv[1]);
+ }
+ return (error);
+}
+
+static int
+sendit(td, s, mp, flags)
+ struct thread *td;
+ int s;
+ struct msghdr *mp;
+ int flags;
+{
+ struct mbuf *control;
+ struct sockaddr *to;
+ int error;
+
+#ifdef CAPABILITY_MODE
+ if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
+ return (ECAPMODE);
+#endif
+
+ if (mp->msg_name != NULL) {
+ error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
+ if (error != 0) {
+ to = NULL;
+ goto bad;
+ }
+ mp->msg_name = to;
+ } else {
+ to = NULL;
+ }
+
+ if (mp->msg_control) {
+ if (mp->msg_controllen < sizeof(struct cmsghdr)
+#ifdef COMPAT_OLDSOCK
+ && mp->msg_flags != MSG_COMPAT
+#endif
+ ) {
+ error = EINVAL;
+ goto bad;
+ }
+ error = sockargs(&control, mp->msg_control,
+ mp->msg_controllen, MT_CONTROL);
+ if (error != 0)
+ goto bad;
+#ifdef COMPAT_OLDSOCK
+ if (mp->msg_flags == MSG_COMPAT) {
+ struct cmsghdr *cm;
+
+ M_PREPEND(control, sizeof(*cm), M_WAITOK);
+ cm = mtod(control, struct cmsghdr *);
+ cm->cmsg_len = control->m_len;
+ cm->cmsg_level = SOL_SOCKET;
+ cm->cmsg_type = SCM_RIGHTS;
+ }
+#endif
+ } else {
+ control = NULL;
+ }
+
+ error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
+
+bad:
+ free(to, M_SONAME);
+ return (error);
+}
+
+int
+kern_sendit(td, s, mp, flags, control, segflg)
+ struct thread *td;
+ int s;
+ struct msghdr *mp;
+ int flags;
+ struct mbuf *control;
+ enum uio_seg segflg;
+{
+ struct file *fp;
+ struct uio auio;
+ struct iovec *iov;
+ struct socket *so;
+ cap_rights_t rights;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+ ssize_t len;
+ int i, error;
+
+ AUDIT_ARG_FD(s);
+ cap_rights_init(&rights, CAP_SEND);
+ if (mp->msg_name != NULL) {
+ AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
+ cap_rights_set(&rights, CAP_CONNECT);
+ }
+ error = getsock_cap(td->td_proc->p_fd, s, &rights, &fp, NULL);
+ if (error != 0)
+ return (error);
+ so = (struct socket *)fp->f_data;
+
+#ifdef KTRACE
+ if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
+ ktrsockaddr(mp->msg_name);
+#endif
+#ifdef MAC
+ if (mp->msg_name != NULL) {
+ error = mac_socket_check_connect(td->td_ucred, so,
+ mp->msg_name);
+ if (error != 0)
+ goto bad;
+ }
+ error = mac_socket_check_send(td->td_ucred, so);
+ if (error != 0)
+ goto bad;
+#endif
+
+ auio.uio_iov = mp->msg_iov;
+ auio.uio_iovcnt = mp->msg_iovlen;
+ auio.uio_segflg = segflg;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ iov = mp->msg_iov;
+ for (i = 0; i < mp->msg_iovlen; i++, iov++) {
+ if ((auio.uio_resid += iov->iov_len) < 0) {
+ error = EINVAL;
+ goto bad;
+ }
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_GENIO))
+ ktruio = cloneuio(&auio);
+#endif
+ len = auio.uio_resid;
+ error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
+ if (error != 0) {
+ if (auio.uio_resid != len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ /* Generation of SIGPIPE can be controlled per socket */
+ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
+ !(flags & MSG_NOSIGNAL)) {
+ PROC_LOCK(td->td_proc);
+ tdsignal(td, SIGPIPE);
+ PROC_UNLOCK(td->td_proc);
+ }
+ }
+ if (error == 0)
+ td->td_retval[0] = len - auio.uio_resid;
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = td->td_retval[0];
+ ktrgenio(s, UIO_WRITE, ktruio, error);
+ }
+#endif
+bad:
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_sendto(td, uap)
+ struct thread *td;
+ struct sendto_args /* {
+ int s;
+ caddr_t buf;
+ size_t len;
+ int flags;
+ caddr_t to;
+ int tolen;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+
+ msg.msg_name = uap->to;
+ msg.msg_namelen = uap->tolen;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = 0;
+#ifdef COMPAT_OLDSOCK
+ msg.msg_flags = 0;
+#endif
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ return (sendit(td, uap->s, &msg, uap->flags));
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+osend(td, uap)
+ struct thread *td;
+ struct osend_args /* {
+ int s;
+ caddr_t buf;
+ int len;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+
+ msg.msg_name = 0;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ msg.msg_control = 0;
+ msg.msg_flags = 0;
+ return (sendit(td, uap->s, &msg, uap->flags));
+}
+
+int
+osendmsg(td, uap)
+ struct thread *td;
+ struct osendmsg_args /* {
+ int s;
+ caddr_t msg;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec *iov;
+ int error;
+
+ error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
+ if (error != 0)
+ return (error);
+ error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
+ if (error != 0)
+ return (error);
+ msg.msg_iov = iov;
+ msg.msg_flags = MSG_COMPAT;
+ error = sendit(td, uap->s, &msg, uap->flags);
+ free(iov, M_IOV);
+ return (error);
+}
+#endif
+
+int
+sys_sendmsg(td, uap)
+ struct thread *td;
+ struct sendmsg_args /* {
+ int s;
+ caddr_t msg;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec *iov;
+ int error;
+
+ error = copyin(uap->msg, &msg, sizeof (msg));
+ if (error != 0)
+ return (error);
+ error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
+ if (error != 0)
+ return (error);
+ msg.msg_iov = iov;
+#ifdef COMPAT_OLDSOCK
+ msg.msg_flags = 0;
+#endif
+ error = sendit(td, uap->s, &msg, uap->flags);
+ free(iov, M_IOV);
+ return (error);
+}
+
+int
+kern_recvit(td, s, mp, fromseg, controlp)
+ struct thread *td;
+ int s;
+ struct msghdr *mp;
+ enum uio_seg fromseg;
+ struct mbuf **controlp;
+{
+ struct uio auio;
+ struct iovec *iov;
+ struct mbuf *m, *control = NULL;
+ caddr_t ctlbuf;
+ struct file *fp;
+ struct socket *so;
+ struct sockaddr *fromsa = NULL;
+ cap_rights_t rights;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+ ssize_t len;
+ int error, i;
+
+ if (controlp != NULL)
+ *controlp = NULL;
+
+ AUDIT_ARG_FD(s);
+ error = getsock_cap(td->td_proc->p_fd, s,
+ cap_rights_init(&rights, CAP_RECV), &fp, NULL);
+ if (error != 0)
+ return (error);
+ so = fp->f_data;
+
+#ifdef MAC
+ error = mac_socket_check_receive(td->td_ucred, so);
+ if (error != 0) {
+ fdrop(fp, td);
+ return (error);
+ }
+#endif
+
+ auio.uio_iov = mp->msg_iov;
+ auio.uio_iovcnt = mp->msg_iovlen;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_rw = UIO_READ;
+ auio.uio_td = td;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ iov = mp->msg_iov;
+ for (i = 0; i < mp->msg_iovlen; i++, iov++) {
+ if ((auio.uio_resid += iov->iov_len) < 0) {
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_GENIO))
+ ktruio = cloneuio(&auio);
+#endif
+ len = auio.uio_resid;
+ error = soreceive(so, &fromsa, &auio, NULL,
+ (mp->msg_control || controlp) ? &control : NULL,
+ &mp->msg_flags);
+ if (error != 0) {
+ if (auio.uio_resid != len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ }
+ if (fromsa != NULL)
+ AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = len - auio.uio_resid;
+ ktrgenio(s, UIO_READ, ktruio, error);
+ }
+#endif
+ if (error != 0)
+ goto out;
+ td->td_retval[0] = len - auio.uio_resid;
+ if (mp->msg_name) {
+ len = mp->msg_namelen;
+ if (len <= 0 || fromsa == NULL)
+ len = 0;
+ else {
+ /* save sa_len before it is destroyed by MSG_COMPAT */
+ len = MIN(len, fromsa->sa_len);
+#ifdef COMPAT_OLDSOCK
+ if (mp->msg_flags & MSG_COMPAT)
+ ((struct osockaddr *)fromsa)->sa_family =
+ fromsa->sa_family;
+#endif
+ if (fromseg == UIO_USERSPACE) {
+ error = copyout(fromsa, mp->msg_name,
+ (unsigned)len);
+ if (error != 0)
+ goto out;
+ } else
+ bcopy(fromsa, mp->msg_name, len);
+ }
+ mp->msg_namelen = len;
+ }
+ if (mp->msg_control && controlp == NULL) {
+#ifdef COMPAT_OLDSOCK
+ /*
+ * We assume that old recvmsg calls won't receive access
+ * rights and other control info, esp. as control info
+ * is always optional and those options didn't exist in 4.3.
+ * If we receive rights, trim the cmsghdr; anything else
+ * is tossed.
+ */
+ if (control && mp->msg_flags & MSG_COMPAT) {
+ if (mtod(control, struct cmsghdr *)->cmsg_level !=
+ SOL_SOCKET ||
+ mtod(control, struct cmsghdr *)->cmsg_type !=
+ SCM_RIGHTS) {
+ mp->msg_controllen = 0;
+ goto out;
+ }
+ control->m_len -= sizeof (struct cmsghdr);
+ control->m_data += sizeof (struct cmsghdr);
+ }
+#endif
+ len = mp->msg_controllen;
+ m = control;
+ mp->msg_controllen = 0;
+ ctlbuf = mp->msg_control;
+
+ while (m && len > 0) {
+ unsigned int tocopy;
+
+ if (len >= m->m_len)
+ tocopy = m->m_len;
+ else {
+ mp->msg_flags |= MSG_CTRUNC;
+ tocopy = len;
+ }
+
+ if ((error = copyout(mtod(m, caddr_t),
+ ctlbuf, tocopy)) != 0)
+ goto out;
+
+ ctlbuf += tocopy;
+ len -= tocopy;
+ m = m->m_next;
+ }
+ mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
+ }
+out:
+ fdrop(fp, td);
+#ifdef KTRACE
+ if (fromsa && KTRPOINT(td, KTR_STRUCT))
+ ktrsockaddr(fromsa);
+#endif
+ free(fromsa, M_SONAME);
+
+ if (error == 0 && controlp != NULL)
+ *controlp = control;
+ else if (control)
+ m_freem(control);
+
+ return (error);
+}
+
+static int
+recvit(td, s, mp, namelenp)
+ struct thread *td;
+ int s;
+ struct msghdr *mp;
+ void *namelenp;
+{
+ int error;
+
+ error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
+ if (error != 0)
+ return (error);
+ if (namelenp != NULL) {
+ error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
+#ifdef COMPAT_OLDSOCK
+ if (mp->msg_flags & MSG_COMPAT)
+ error = 0; /* old recvfrom didn't check */
+#endif
+ }
+ return (error);
+}
+
+int
+sys_recvfrom(td, uap)
+ struct thread *td;
+ struct recvfrom_args /* {
+ int s;
+ caddr_t buf;
+ size_t len;
+ int flags;
+ struct sockaddr * __restrict from;
+ socklen_t * __restrict fromlenaddr;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+ int error;
+
+ if (uap->fromlenaddr) {
+ error = copyin(uap->fromlenaddr,
+ &msg.msg_namelen, sizeof (msg.msg_namelen));
+ if (error != 0)
+ goto done2;
+ } else {
+ msg.msg_namelen = 0;
+ }
+ msg.msg_name = uap->from;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ msg.msg_control = 0;
+ msg.msg_flags = uap->flags;
+ error = recvit(td, uap->s, &msg, uap->fromlenaddr);
+done2:
+ return (error);
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+orecvfrom(td, uap)
+ struct thread *td;
+ struct recvfrom_args *uap;
+{
+
+ uap->flags |= MSG_COMPAT;
+ return (sys_recvfrom(td, uap));
+}
+#endif
+
+#ifdef COMPAT_OLDSOCK
+int
+orecv(td, uap)
+ struct thread *td;
+ struct orecv_args /* {
+ int s;
+ caddr_t buf;
+ int len;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+
+ msg.msg_name = 0;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ msg.msg_control = 0;
+ msg.msg_flags = uap->flags;
+ return (recvit(td, uap->s, &msg, NULL));
+}
+
+/*
+ * Old recvmsg. This code takes advantage of the fact that the old msghdr
+ * overlays the new one, missing only the flags, and with the (old) access
+ * rights where the control fields are now.
+ */
+int
+orecvmsg(td, uap)
+ struct thread *td;
+ struct orecvmsg_args /* {
+ int s;
+ struct omsghdr *msg;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec *iov;
+ int error;
+
+ error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
+ if (error != 0)
+ return (error);
+ error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
+ if (error != 0)
+ return (error);
+ msg.msg_flags = uap->flags | MSG_COMPAT;
+ msg.msg_iov = iov;
+ error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
+ if (msg.msg_controllen && error == 0)
+ error = copyout(&msg.msg_controllen,
+ &uap->msg->msg_accrightslen, sizeof (int));
+ free(iov, M_IOV);
+ return (error);
+}
+#endif
+
+int
+sys_recvmsg(td, uap)
+ struct thread *td;
+ struct recvmsg_args /* {
+ int s;
+ struct msghdr *msg;
+ int flags;
+ } */ *uap;
+{
+ struct msghdr msg;
+ struct iovec *uiov, *iov;
+ int error;
+
+ error = copyin(uap->msg, &msg, sizeof (msg));
+ if (error != 0)
+ return (error);
+ error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
+ if (error != 0)
+ return (error);
+ msg.msg_flags = uap->flags;
+#ifdef COMPAT_OLDSOCK
+ msg.msg_flags &= ~MSG_COMPAT;
+#endif
+ uiov = msg.msg_iov;
+ msg.msg_iov = iov;
+ error = recvit(td, uap->s, &msg, NULL);
+ if (error == 0) {
+ msg.msg_iov = uiov;
+ error = copyout(&msg, uap->msg, sizeof(msg));
+ }
+ free(iov, M_IOV);
+ return (error);
+}
+
+/* ARGSUSED */
+int
+sys_shutdown(td, uap)
+ struct thread *td;
+ struct shutdown_args /* {
+ int s;
+ int how;
+ } */ *uap;
+{
+ struct socket *so;
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->s);
+ error = getsock_cap(td->td_proc->p_fd, uap->s,
+ cap_rights_init(&rights, CAP_SHUTDOWN), &fp, NULL);
+ if (error == 0) {
+ so = fp->f_data;
+ error = soshutdown(so, uap->how);
+ fdrop(fp, td);
+ }
+ return (error);
+}
+
+/* ARGSUSED */
+int
+sys_setsockopt(td, uap)
+ struct thread *td;
+ struct setsockopt_args /* {
+ int s;
+ int level;
+ int name;
+ caddr_t val;
+ int valsize;
+ } */ *uap;
+{
+
+ return (kern_setsockopt(td, uap->s, uap->level, uap->name,
+ uap->val, UIO_USERSPACE, uap->valsize));
+}
+
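+/*
+ * Kernel version of setsockopt.
+ * val may be a userland or kernel address, as selected by valseg.
+ */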
+int
+kern_setsockopt(td, s, level, name, val, valseg, valsize)
+ struct thread *td;
+ int s;
+ int level;
+ int name;
+ void *val;
+ enum uio_seg valseg;
+ socklen_t valsize;
+{
+ struct socket *so;
+ struct file *fp;
+ struct sockopt sopt;
+ cap_rights_t rights;
+ int error;
+
+ if (val == NULL && valsize != 0)
+ return (EFAULT);
+ if ((int)valsize < 0)
+ return (EINVAL);
+
+ sopt.sopt_dir = SOPT_SET;
+ sopt.sopt_level = level;
+ sopt.sopt_name = name;
+ sopt.sopt_val = val;
+ sopt.sopt_valsize = valsize;
+ switch (valseg) {
+ case UIO_USERSPACE:
+ sopt.sopt_td = td;
+ break;
+ case UIO_SYSSPACE:
+ sopt.sopt_td = NULL;
+ break;
+ default:
+ panic("kern_setsockopt called with bad valseg");
+ }
+
+ AUDIT_ARG_FD(s);
+ error = getsock_cap(td->td_proc->p_fd, s,
+ cap_rights_init(&rights, CAP_SETSOCKOPT), &fp, NULL);
+ if (error == 0) {
+ so = fp->f_data;
+ error = sosetopt(so, &sopt);
+ fdrop(fp, td);
+ }
+	return (error);
+}
+
+/* ARGSUSED */
+int
+sys_getsockopt(td, uap)
+ struct thread *td;
+ struct getsockopt_args /* {
+ int s;
+ int level;
+ int name;
+ void * __restrict val;
+ socklen_t * __restrict avalsize;
+ } */ *uap;
+{
+ socklen_t valsize;
+ int error;
+
+ if (uap->val) {
+ error = copyin(uap->avalsize, &valsize, sizeof (valsize));
+ if (error != 0)
+ return (error);
+ }
+
+ error = kern_getsockopt(td, uap->s, uap->level, uap->name,
+ uap->val, UIO_USERSPACE, &valsize);
+
+ if (error == 0)
+ error = copyout(&valsize, uap->avalsize, sizeof (valsize));
+ return (error);
+}
+
+/*
+ * Kernel version of getsockopt.
+ * optval can be either a userland or a kernel address, as selected by
+ * valseg.  optlen is always a kernel pointer.
+ */
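+/*
+ * Illustrative sketch (not part of this change): an in-kernel consumer
+ * holding a hypothetical descriptor number 'fd' on the current thread
+ * 'td' could set and query SO_REUSEADDR entirely from kernel memory by
+ * passing UIO_SYSSPACE, e.g.:
+ *
+ *	int on = 1;
+ *	socklen_t len = sizeof(on);
+ *
+ *	error = kern_setsockopt(td, fd, SOL_SOCKET, SO_REUSEADDR,
+ *	    &on, UIO_SYSSPACE, sizeof(on));
+ *	if (error == 0)
+ *		error = kern_getsockopt(td, fd, SOL_SOCKET, SO_REUSEADDR,
+ *		    &on, UIO_SYSSPACE, &len);
+ */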
+int
+kern_getsockopt(td, s, level, name, val, valseg, valsize)
+ struct thread *td;
+ int s;
+ int level;
+ int name;
+ void *val;
+ enum uio_seg valseg;
+ socklen_t *valsize;
+{
+ struct socket *so;
+ struct file *fp;
+ struct sockopt sopt;
+ cap_rights_t rights;
+ int error;
+
+ if (val == NULL)
+ *valsize = 0;
+ if ((int)*valsize < 0)
+ return (EINVAL);
+
+ sopt.sopt_dir = SOPT_GET;
+ sopt.sopt_level = level;
+ sopt.sopt_name = name;
+ sopt.sopt_val = val;
+ sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
+ switch (valseg) {
+ case UIO_USERSPACE:
+ sopt.sopt_td = td;
+ break;
+ case UIO_SYSSPACE:
+ sopt.sopt_td = NULL;
+ break;
+ default:
+ panic("kern_getsockopt called with bad valseg");
+ }
+
+ AUDIT_ARG_FD(s);
+ error = getsock_cap(td->td_proc->p_fd, s,
+ cap_rights_init(&rights, CAP_GETSOCKOPT), &fp, NULL);
+ if (error == 0) {
+ so = fp->f_data;
+ error = sogetopt(so, &sopt);
+ *valsize = sopt.sopt_valsize;
+ fdrop(fp, td);
+ }
+ return (error);
+}
+
+/*
+ * getsockname1() - Get socket name.
+ */
+/* ARGSUSED */
+static int
+getsockname1(td, uap, compat)
+ struct thread *td;
+ struct getsockname_args /* {
+ int fdes;
+ struct sockaddr * __restrict asa;
+ socklen_t * __restrict alen;
+ } */ *uap;
+ int compat;
+{
+ struct sockaddr *sa;
+ socklen_t len;
+ int error;
+
+ error = copyin(uap->alen, &len, sizeof(len));
+ if (error != 0)
+ return (error);
+
+ error = kern_getsockname(td, uap->fdes, &sa, &len);
+ if (error != 0)
+ return (error);
+
+ if (len != 0) {
+#ifdef COMPAT_OLDSOCK
+ if (compat)
+ ((struct osockaddr *)sa)->sa_family = sa->sa_family;
+#endif
+ error = copyout(sa, uap->asa, (u_int)len);
+ }
+ free(sa, M_SONAME);
+ if (error == 0)
+ error = copyout(&len, uap->alen, sizeof(len));
+ return (error);
+}
+
+int
+kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
+ socklen_t *alen)
+{
+ struct socket *so;
+ struct file *fp;
+ cap_rights_t rights;
+ socklen_t len;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ error = getsock_cap(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_GETSOCKNAME), &fp, NULL);
+ if (error != 0)
+ return (error);
+ so = fp->f_data;
+ *sa = NULL;
+ CURVNET_SET(so->so_vnet);
+ error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
+ CURVNET_RESTORE();
+ if (error != 0)
+ goto bad;
+ if (*sa == NULL)
+ len = 0;
+ else
+ len = MIN(*alen, (*sa)->sa_len);
+ *alen = len;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrsockaddr(*sa);
+#endif
+bad:
+ fdrop(fp, td);
+ if (error != 0 && *sa != NULL) {
+ free(*sa, M_SONAME);
+ *sa = NULL;
+ }
+ return (error);
+}
+
+int
+sys_getsockname(td, uap)
+ struct thread *td;
+ struct getsockname_args *uap;
+{
+
+ return (getsockname1(td, uap, 0));
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+ogetsockname(td, uap)
+ struct thread *td;
+ struct getsockname_args *uap;
+{
+
+ return (getsockname1(td, uap, 1));
+}
+#endif /* COMPAT_OLDSOCK */
+
+/*
+ * getpeername1() - Get name of peer for connected socket.
+ */
+/* ARGSUSED */
+static int
+getpeername1(td, uap, compat)
+ struct thread *td;
+ struct getpeername_args /* {
+ int fdes;
+ struct sockaddr * __restrict asa;
+ socklen_t * __restrict alen;
+ } */ *uap;
+ int compat;
+{
+ struct sockaddr *sa;
+ socklen_t len;
+ int error;
+
+ error = copyin(uap->alen, &len, sizeof (len));
+ if (error != 0)
+ return (error);
+
+ error = kern_getpeername(td, uap->fdes, &sa, &len);
+ if (error != 0)
+ return (error);
+
+ if (len != 0) {
+#ifdef COMPAT_OLDSOCK
+ if (compat)
+ ((struct osockaddr *)sa)->sa_family = sa->sa_family;
+#endif
+ error = copyout(sa, uap->asa, (u_int)len);
+ }
+ free(sa, M_SONAME);
+ if (error == 0)
+ error = copyout(&len, uap->alen, sizeof(len));
+ return (error);
+}
+
+int
+kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
+ socklen_t *alen)
+{
+ struct socket *so;
+ struct file *fp;
+ cap_rights_t rights;
+ socklen_t len;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ error = getsock_cap(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_GETPEERNAME), &fp, NULL);
+ if (error != 0)
+ return (error);
+ so = fp->f_data;
+ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
+ error = ENOTCONN;
+ goto done;
+ }
+ *sa = NULL;
+ CURVNET_SET(so->so_vnet);
+ error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
+ CURVNET_RESTORE();
+ if (error != 0)
+ goto bad;
+ if (*sa == NULL)
+ len = 0;
+ else
+ len = MIN(*alen, (*sa)->sa_len);
+ *alen = len;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrsockaddr(*sa);
+#endif
+bad:
+ if (error != 0 && *sa != NULL) {
+ free(*sa, M_SONAME);
+ *sa = NULL;
+ }
+done:
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_getpeername(td, uap)
+ struct thread *td;
+ struct getpeername_args *uap;
+{
+
+ return (getpeername1(td, uap, 0));
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+ogetpeername(td, uap)
+ struct thread *td;
+ struct ogetpeername_args *uap;
+{
+
+ /* XXX uap should have type `getpeername_args *' to begin with. */
+ return (getpeername1(td, (struct getpeername_args *)uap, 1));
+}
+#endif /* COMPAT_OLDSOCK */
+
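+/*
+ * Copy a user buffer of at most MCLBYTES into a newly allocated mbuf of
+ * the requested type.  For MT_SONAME the copied data is treated as a
+ * sockaddr and its sa_len is forced to the supplied length.
+ */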
+int
+sockargs(mp, buf, buflen, type)
+ struct mbuf **mp;
+ caddr_t buf;
+ int buflen, type;
+{
+ struct sockaddr *sa;
+ struct mbuf *m;
+ int error;
+
+ if (buflen > MLEN) {
+#ifdef COMPAT_OLDSOCK
+ if (type == MT_SONAME && buflen <= 112)
+ buflen = MLEN; /* unix domain compat. hack */
+ else
+#endif
+ if (buflen > MCLBYTES)
+ return (EINVAL);
+ }
+ m = m_get2(buflen, M_WAITOK, type, 0);
+ m->m_len = buflen;
+ error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
+ if (error != 0)
+ (void) m_free(m);
+ else {
+ *mp = m;
+ if (type == MT_SONAME) {
+ sa = mtod(m, struct sockaddr *);
+
+#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
+ if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
+ sa->sa_family = sa->sa_len;
+#endif
+ sa->sa_len = buflen;
+ }
+ }
+ return (error);
+}
+
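+/*
+ * Copy a sockaddr from userland into a freshly malloc'ed buffer,
+ * rejecting lengths that are too small to hold a sockaddr header or
+ * larger than SOCK_MAXADDRLEN.  The caller must free the result with
+ * free(*namp, M_SONAME).
+ */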
+int
+getsockaddr(namp, uaddr, len)
+ struct sockaddr **namp;
+ caddr_t uaddr;
+ size_t len;
+{
+ struct sockaddr *sa;
+ int error;
+
+ if (len > SOCK_MAXADDRLEN)
+ return (ENAMETOOLONG);
+ if (len < offsetof(struct sockaddr, sa_data[0]))
+ return (EINVAL);
+ sa = malloc(len, M_SONAME, M_WAITOK);
+ error = copyin(uaddr, sa, len);
+ if (error != 0) {
+ free(sa, M_SONAME);
+ } else {
+#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
+ if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
+ sa->sa_family = sa->sa_len;
+#endif
+ sa->sa_len = len;
+ *namp = sa;
+ }
+ return (error);
+}
+
+#include <sys/condvar.h>
+
+struct sendfile_sync {
+ struct mtx mtx;
+ struct cv cv;
+ unsigned count;
+};
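+
+/*
+ * When sendfile(2) is invoked with SF_SYNC, a sendfile_sync tracks the
+ * number of sf_buf-backed mbufs still in flight; sf_buf_mext() decrements
+ * the count as each one is freed, and the syscall sleeps on the condition
+ * variable until the count reaches zero.
+ */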
+
+/*
+ * Detach mapped page and release resources back to the system.
+ */
+int
+sf_buf_mext(struct mbuf *mb, void *addr, void *args)
+{
+ vm_page_t m;
+ struct sendfile_sync *sfs;
+
+ m = sf_buf_page(args);
+ sf_buf_free(args);
+ vm_page_lock(m);
+ vm_page_unwire(m, 0);
+ /*
+ * Check for the object going away on us. This can
+ * happen since we don't hold a reference to it.
+ * If so, we're responsible for freeing the page.
+ */
+ if (m->wire_count == 0 && m->object == NULL)
+ vm_page_free(m);
+ vm_page_unlock(m);
+ if (addr == NULL)
+ return (EXT_FREE_OK);
+ sfs = addr;
+ mtx_lock(&sfs->mtx);
+	KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
+ if (--sfs->count == 0)
+ cv_signal(&sfs->cv);
+ mtx_unlock(&sfs->mtx);
+ return (EXT_FREE_OK);
+}
+
+/*
+ * sendfile(2)
+ *
+ * int sendfile(int fd, int s, off_t offset, size_t nbytes,
+ * struct sf_hdtr *hdtr, off_t *sbytes, int flags)
+ *
+ * Send a file specified by 'fd' and starting at 'offset' to a socket
+ * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
+ * 0. Optionally add a header and/or trailer to the socket output. If
+ * specified, write the total number of bytes sent into *sbytes.
+ */
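+/*
+ * Illustrative userland usage (hypothetical descriptors 'filefd' and
+ * 'sock'): send the whole file and learn how much was transmitted even
+ * when the call fails part-way:
+ *
+ *	off_t sbytes;
+ *
+ *	if (sendfile(filefd, sock, 0, 0, NULL, &sbytes, 0) == -1)
+ *		err(1, "sendfile after %jd bytes", (intmax_t)sbytes);
+ */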
+int
+sys_sendfile(struct thread *td, struct sendfile_args *uap)
+{
+
+ return (do_sendfile(td, uap, 0));
+}
+
+static int
+do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
+{
+ struct sf_hdtr hdtr;
+ struct uio *hdr_uio, *trl_uio;
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ if (uap->offset < 0)
+ return (EINVAL);
+
+ hdr_uio = trl_uio = NULL;
+
+ if (uap->hdtr != NULL) {
+ error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
+ if (error != 0)
+ goto out;
+ if (hdtr.headers != NULL) {
+ error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
+ if (error != 0)
+ goto out;
+ }
+ if (hdtr.trailers != NULL) {
+ error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
+ if (error != 0)
+ goto out;
+
+ }
+ }
+
+ AUDIT_ARG_FD(uap->fd);
+
+ /*
+ * sendfile(2) can start at any offset within a file so we require
+ * CAP_READ+CAP_SEEK = CAP_PREAD.
+ */
+ if ((error = fget_read(td, uap->fd,
+ cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
+ goto out;
+ }
+
+ error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
+ uap->nbytes, uap->sbytes, uap->flags, compat ? SFK_COMPAT : 0, td);
+ fdrop(fp, td);
+
+out:
+ free(hdr_uio, M_IOV);
+ free(trl_uio, M_IOV);
+ return (error);
+}
+
+#ifdef COMPAT_FREEBSD4
+int
+freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
+{
+ struct sendfile_args args;
+
+ args.fd = uap->fd;
+ args.s = uap->s;
+ args.offset = uap->offset;
+ args.nbytes = uap->nbytes;
+ args.hdtr = uap->hdtr;
+ args.sbytes = uap->sbytes;
+ args.flags = uap->flags;
+
+ return (do_sendfile(td, &args, 1));
+}
+#endif /* COMPAT_FREEBSD4 */
+
+int
+vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
+ struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
+ int kflags, struct thread *td)
+{
+ struct vnode *vp = fp->f_vnode;
+ struct file *sock_fp;
+ struct vm_object *obj = NULL;
+ struct socket *so = NULL;
+ struct mbuf *m = NULL;
+ struct sf_buf *sf;
+ struct vm_page *pg;
+ struct vattr va;
+ struct sendfile_sync *sfs = NULL;
+ cap_rights_t rights;
+ off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
+ int bsize, error, hdrlen = 0, mnw = 0;
+
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ if (vp->v_type == VREG) {
+ bsize = vp->v_mount->mnt_stat.f_iosize;
+ if (nbytes == 0) {
+ error = VOP_GETATTR(vp, &va, td->td_ucred);
+ if (error != 0) {
+ VOP_UNLOCK(vp, 0);
+ obj = NULL;
+ goto out;
+ }
+ rem = va.va_size;
+ } else
+ rem = nbytes;
+ obj = vp->v_object;
+ if (obj != NULL) {
+ /*
+ * Temporarily increase the backing VM
+ * object's reference count so that a forced
+ * reclamation of its vnode does not
+ * immediately destroy it.
+ */
+ VM_OBJECT_WLOCK(obj);
+ if ((obj->flags & OBJ_DEAD) == 0) {
+ vm_object_reference_locked(obj);
+ VM_OBJECT_WUNLOCK(obj);
+ } else {
+ VM_OBJECT_WUNLOCK(obj);
+ obj = NULL;
+ }
+ }
+ } else
+ bsize = 0; /* silence gcc */
+ VOP_UNLOCK(vp, 0);
+ if (obj == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * The socket must be a stream socket and connected.
+	 * Remember whether it is a blocking or non-blocking socket.
+ */
+ error = getsock_cap(td->td_proc->p_fd, sockfd,
+ cap_rights_init(&rights, CAP_SEND), &sock_fp, NULL);
+ if (error != 0)
+ goto out;
+ so = sock_fp->f_data;
+ if (so->so_type != SOCK_STREAM) {
+ error = EINVAL;
+ goto out;
+ }
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ error = ENOTCONN;
+ goto out;
+ }
+ /*
+ * Do not wait on memory allocations but return ENOMEM for
+ * caller to retry later.
+ * XXX: Experimental.
+ */
+ if (flags & SF_MNOWAIT)
+ mnw = 1;
+
+ if (flags & SF_SYNC) {
+ sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
+ mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
+ cv_init(&sfs->cv, "sendfile");
+ }
+
+#ifdef MAC
+ error = mac_socket_check_send(td->td_ucred, so);
+ if (error != 0)
+ goto out;
+#endif
+
+ /* If headers are specified copy them into mbufs. */
+ if (hdr_uio != NULL) {
+ hdr_uio->uio_td = td;
+ hdr_uio->uio_rw = UIO_WRITE;
+ if (hdr_uio->uio_resid > 0) {
+ /*
+ * In FBSD < 5.0 the nbytes to send also included
+ * the header. If compat is specified subtract the
+ * header size from nbytes.
+ */
+ if (kflags & SFK_COMPAT) {
+ if (nbytes > hdr_uio->uio_resid)
+ nbytes -= hdr_uio->uio_resid;
+ else
+ nbytes = 0;
+ }
+ m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
+ 0, 0, 0);
+ if (m == NULL) {
+ error = mnw ? EAGAIN : ENOBUFS;
+ goto out;
+ }
+ hdrlen = m_length(m, NULL);
+ }
+ }
+
+ /*
+ * Protect against multiple writers to the socket.
+ *
+ * XXXRW: Historically this has assumed non-interruptibility, so now
+ * we implement that, but possibly shouldn't.
+ */
+ (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
+
+ /*
+ * Loop through the pages of the file, starting with the requested
+ * offset. Get a file page (do I/O if necessary), map the file page
+ * into an sf_buf, attach an mbuf header to the sf_buf, and queue
+ * it on the socket.
+	 * This is done in two loops.  The inner loop turns as many pages as
+	 * it can into mbufs, up to the available socket buffer space,
+	 * without blocking, so that they can be bulk delivered into the
+	 * socket send buffer.
+ * The outer loop checks the state and available space of the socket
+ * and takes care of the overall progress.
+ */
+ for (off = offset; ; ) {
+ struct mbuf *mtail;
+ int loopbytes;
+ int space;
+ int done;
+
+ if ((nbytes != 0 && nbytes == fsbytes) ||
+ (nbytes == 0 && va.va_size == fsbytes))
+ break;
+
+ mtail = NULL;
+ loopbytes = 0;
+ space = 0;
+ done = 0;
+
+ /*
+ * Check the socket state for ongoing connection,
+ * no errors and space in socket buffer.
+ * If space is low allow for the remainder of the
+ * file to be processed if it fits the socket buffer.
+ * Otherwise block in waiting for sufficient space
+ * to proceed, or if the socket is nonblocking, return
+ * to userland with EAGAIN while reporting how far
+ * we've come.
+ * We wait until the socket buffer has significant free
+ * space to do bulk sends. This makes good use of file
+ * system read ahead and allows packet segmentation
+ * offloading hardware to take over lots of work. If
+ * we were not careful here we would send off only one
+ * sfbuf at a time.
+ */
+ SOCKBUF_LOCK(&so->so_snd);
+ if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
+ so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
+retry_space:
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ error = EPIPE;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto done;
+ } else if (so->so_error) {
+ error = so->so_error;
+ so->so_error = 0;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto done;
+ }
+ space = sbspace(&so->so_snd);
+ if (space < rem &&
+ (space <= 0 ||
+ space < so->so_snd.sb_lowat)) {
+ if (so->so_state & SS_NBIO) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ error = EAGAIN;
+ goto done;
+ }
+ /*
+ * sbwait drops the lock while sleeping.
+ * When we loop back to retry_space the
+ * state may have changed and we retest
+ * for it.
+ */
+ error = sbwait(&so->so_snd);
+ /*
+ * An error from sbwait usually indicates that we've
+ * been interrupted by a signal. If we've sent anything
+ * then return bytes sent, otherwise return the error.
+ */
+ if (error != 0) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto done;
+ }
+ goto retry_space;
+ }
+ SOCKBUF_UNLOCK(&so->so_snd);
+
+ /*
+ * Reduce space in the socket buffer by the size of
+ * the header mbuf chain.
+ * hdrlen is set to 0 after the first loop.
+ */
+ space -= hdrlen;
+
+ error = vn_lock(vp, LK_SHARED);
+ if (error != 0)
+ goto done;
+ error = VOP_GETATTR(vp, &va, td->td_ucred);
+ if (error != 0 || off >= va.va_size) {
+ VOP_UNLOCK(vp, 0);
+ goto done;
+ }
+
+ /*
+ * Loop and construct maximum sized mbuf chain to be bulk
+ * dumped into socket buffer.
+ */
+ while (space > loopbytes) {
+ vm_pindex_t pindex;
+ vm_offset_t pgoff;
+ struct mbuf *m0;
+
+ /*
+ * Calculate the amount to transfer.
+ * Not to exceed a page, the EOF,
+ * or the passed in nbytes.
+ */
+ pgoff = (vm_offset_t)(off & PAGE_MASK);
+ if (nbytes)
+ rem = (nbytes - fsbytes - loopbytes);
+ else
+ rem = va.va_size -
+ offset - fsbytes - loopbytes;
+ xfsize = omin(PAGE_SIZE - pgoff, rem);
+ xfsize = omin(space - loopbytes, xfsize);
+ if (xfsize <= 0) {
+ done = 1; /* all data sent */
+ break;
+ }
+
+ /*
+ * Attempt to look up the page. Allocate
+ * if not found or wait and loop if busy.
+ */
+ pindex = OFF_TO_IDX(off);
+ VM_OBJECT_WLOCK(obj);
+ pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
+ VM_ALLOC_IGN_SBUSY | VM_ALLOC_NORMAL |
+ VM_ALLOC_WIRED);
+
+ /*
+ * Check if page is valid for what we need,
+ * otherwise initiate I/O.
+ * If we already turned some pages into mbufs,
+ * send them off before we come here again and
+ * block.
+ */
+ if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
+ VM_OBJECT_WUNLOCK(obj);
+ else if (m != NULL)
+ error = EAGAIN; /* send what we already got */
+ else if (flags & SF_NODISKIO)
+ error = EBUSY;
+ else {
+ ssize_t resid;
+ int readahead = sfreadahead * MAXBSIZE;
+
+ VM_OBJECT_WUNLOCK(obj);
+
+ /*
+ * Get the page from backing store.
+ * XXXMAC: Because we don't have fp->f_cred
+ * here, we pass in NOCRED. This is probably
+ * wrong, but is consistent with our original
+ * implementation.
+ */
+ error = vn_rdwr(UIO_READ, vp, NULL, readahead,
+ trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
+ IO_VMIO | ((readahead / bsize) << IO_SEQSHIFT),
+ td->td_ucred, NOCRED, &resid, td);
+ SFSTAT_INC(sf_iocnt);
+ if (error != 0)
+ VM_OBJECT_WLOCK(obj);
+ }
+ if (error != 0) {
+ vm_page_lock(pg);
+ vm_page_unwire(pg, 0);
+ /*
+ * See if anyone else might know about
+ * this page. If not and it is not valid,
+ * then free it.
+ */
+ if (pg->wire_count == 0 && pg->valid == 0 &&
+ !vm_page_busied(pg))
+ vm_page_free(pg);
+ vm_page_unlock(pg);
+ VM_OBJECT_WUNLOCK(obj);
+ if (error == EAGAIN)
+ error = 0; /* not a real error */
+ break;
+ }
+
+ /*
+ * Get a sendfile buf. When allocating the
+ * first buffer for mbuf chain, we usually
+ * wait as long as necessary, but this wait
+ * can be interrupted. For consequent
+ * buffers, do not sleep, since several
+ * threads might exhaust the buffers and then
+ * deadlock.
+ */
+ sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
+ SFB_CATCH);
+ if (sf == NULL) {
+ SFSTAT_INC(sf_allocfail);
+ vm_page_lock(pg);
+ vm_page_unwire(pg, 0);
+ KASSERT(pg->object != NULL,
+ ("%s: object disappeared", __func__));
+ vm_page_unlock(pg);
+ if (m == NULL)
+ error = (mnw ? EAGAIN : EINTR);
+ break;
+ }
+
+ /*
+ * Get an mbuf and set it up as having
+ * external storage.
+ */
+ m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
+ if (m0 == NULL) {
+ error = (mnw ? EAGAIN : ENOBUFS);
+ (void)sf_buf_mext(NULL, NULL, sf);
+ break;
+ }
+ if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
+ sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
+ (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
+ error = (mnw ? EAGAIN : ENOBUFS);
+ (void)sf_buf_mext(NULL, NULL, sf);
+ m_freem(m0);
+ break;
+ }
+ m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
+ m0->m_len = xfsize;
+
+ /* Append to mbuf chain. */
+ if (mtail != NULL)
+ mtail->m_next = m0;
+ else if (m != NULL)
+ m_last(m)->m_next = m0;
+ else
+ m = m0;
+ mtail = m0;
+
+ /* Keep track of bits processed. */
+ loopbytes += xfsize;
+ off += xfsize;
+
+ if (sfs != NULL) {
+ mtx_lock(&sfs->mtx);
+ sfs->count++;
+ mtx_unlock(&sfs->mtx);
+ }
+ }
+
+ VOP_UNLOCK(vp, 0);
+
+ /* Add the buffer chain to the socket buffer. */
+ if (m != NULL) {
+ int mlen, err;
+
+ mlen = m_length(m, NULL);
+ SOCKBUF_LOCK(&so->so_snd);
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ error = EPIPE;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ goto done;
+ }
+ SOCKBUF_UNLOCK(&so->so_snd);
+ CURVNET_SET(so->so_vnet);
+ /* Avoid error aliasing. */
+ err = (*so->so_proto->pr_usrreqs->pru_send)
+ (so, 0, m, NULL, NULL, td);
+ CURVNET_RESTORE();
+ if (err == 0) {
+ /*
+ * We need two counters to get the
+ * file offset and nbytes to send
+ * right:
+ * - sbytes contains the total amount
+ * of bytes sent, including headers.
+ * - fsbytes contains the total amount
+ * of bytes sent from the file.
+ */
+ sbytes += mlen;
+ fsbytes += mlen;
+ if (hdrlen) {
+ fsbytes -= hdrlen;
+ hdrlen = 0;
+ }
+ } else if (error == 0)
+ error = err;
+ m = NULL; /* pru_send always consumes */
+ }
+
+ /* Quit outer loop on error or when we're done. */
+ if (done)
+ break;
+ if (error != 0)
+ goto done;
+ }
+
+ /*
+ * Send trailers. Wimp out and use writev(2).
+ */
+ if (trl_uio != NULL) {
+ sbunlock(&so->so_snd);
+ error = kern_writev(td, sockfd, trl_uio);
+ if (error == 0)
+ sbytes += td->td_retval[0];
+ goto out;
+ }
+
+done:
+ sbunlock(&so->so_snd);
+out:
+ /*
+ * If there was no error we have to clear td->td_retval[0]
+ * because it may have been set by writev.
+ */
+ if (error == 0) {
+ td->td_retval[0] = 0;
+ }
+ if (sent != NULL) {
+ copyout(&sbytes, sent, sizeof(off_t));
+ }
+ if (obj != NULL)
+ vm_object_deallocate(obj);
+ if (so)
+ fdrop(sock_fp, td);
+ if (m)
+ m_freem(m);
+
+ if (sfs != NULL) {
+ mtx_lock(&sfs->mtx);
+ if (sfs->count != 0)
+ cv_wait(&sfs->cv, &sfs->mtx);
+ KASSERT(sfs->count == 0, ("sendfile sync still busy"));
+ cv_destroy(&sfs->cv);
+ mtx_destroy(&sfs->mtx);
+ free(sfs, M_TEMP);
+ }
+
+ if (error == ERESTART)
+ error = EINTR;
+
+ return (error);
+}
+
+/*
+ * SCTP syscalls.
+ * Functionality is only compiled in if SCTP is defined in the kernel
+ * configuration; otherwise they all return EOPNOTSUPP.
+ * XXX: We should make this loadable one day.
+ */
+int
+sys_sctp_peeloff(td, uap)
+ struct thread *td;
+ struct sctp_peeloff_args /* {
+ int sd;
+ caddr_t name;
+ } */ *uap;
+{
+#if (defined(INET) || defined(INET6)) && defined(SCTP)
+ struct file *nfp = NULL;
+ struct socket *head, *so;
+ cap_rights_t rights;
+ u_int fflag;
+ int error, fd;
+
+ AUDIT_ARG_FD(uap->sd);
+ error = fgetsock(td, uap->sd, cap_rights_init(&rights, CAP_PEELOFF),
+ &head, &fflag);
+ if (error != 0)
+ goto done2;
+ if (head->so_proto->pr_protocol != IPPROTO_SCTP) {
+ error = EOPNOTSUPP;
+ goto done;
+ }
+ error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
+ if (error != 0)
+ goto done;
+ /*
+	 * At this point we know we have an association to peel off, so we
+	 * proceed to set up the file descriptor.  This may block, but that
+	 * is OK.
+ */
+
+ error = falloc(td, &nfp, &fd, 0);
+ if (error != 0)
+ goto done;
+ td->td_retval[0] = fd;
+
+ CURVNET_SET(head->so_vnet);
+ so = sonewconn(head, SS_ISCONNECTED);
+ if (so == NULL) {
+ error = ENOMEM;
+ goto noconnection;
+ }
+ /*
+ * Before changing the flags on the socket, we have to bump the
+ * reference count. Otherwise, if the protocol calls sofree(),
+ * the socket will be released due to a zero refcount.
+ */
+ SOCK_LOCK(so);
+ soref(so); /* file descriptor reference */
+ SOCK_UNLOCK(so);
+
+ ACCEPT_LOCK();
+
+ TAILQ_REMOVE(&head->so_comp, so, so_list);
+ head->so_qlen--;
+ so->so_state |= (head->so_state & SS_NBIO);
+ so->so_state &= ~SS_NOFDREF;
+ so->so_qstate &= ~SQ_COMP;
+ so->so_head = NULL;
+ ACCEPT_UNLOCK();
+ finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
+ error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
+ if (error != 0)
+ goto noconnection;
+ if (head->so_sigio != NULL)
+ fsetown(fgetown(&head->so_sigio), &so->so_sigio);
+
+noconnection:
+ /*
+ * close the new descriptor, assuming someone hasn't ripped it
+ * out from under us.
+ */
+ if (error != 0)
+ fdclose(td->td_proc->p_fd, nfp, fd, td);
+
+ /*
+ * Release explicitly held references before returning.
+ */
+ CURVNET_RESTORE();
+done:
+ if (nfp != NULL)
+ fdrop(nfp, td);
+ fputsock(head);
+done2:
+ return (error);
+#else /* SCTP */
+ return (EOPNOTSUPP);
+#endif /* SCTP */
+}
+
+int
+sys_sctp_generic_sendmsg(td, uap)
+ struct thread *td;
+ struct sctp_generic_sendmsg_args /* {
+ int sd,
+ caddr_t msg,
+ int mlen,
+ caddr_t to,
+ __socklen_t tolen,
+ struct sctp_sndrcvinfo *sinfo,
+ int flags
+ } */ *uap;
+{
+#if (defined(INET) || defined(INET6)) && defined(SCTP)
+ struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
+ struct socket *so;
+ struct file *fp = NULL;
+ struct sockaddr *to = NULL;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+ struct uio auio;
+ struct iovec iov[1];
+ cap_rights_t rights;
+ int error = 0, len;
+
+ if (uap->sinfo != NULL) {
+ error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
+ if (error != 0)
+ return (error);
+ u_sinfo = &sinfo;
+ }
+
+ cap_rights_init(&rights, CAP_SEND);
+ if (uap->tolen != 0) {
+ error = getsockaddr(&to, uap->to, uap->tolen);
+ if (error != 0) {
+ to = NULL;
+ goto sctp_bad2;
+ }
+ cap_rights_set(&rights, CAP_CONNECT);
+ }
+
+ AUDIT_ARG_FD(uap->sd);
+ error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
+ if (error != 0)
+ goto sctp_bad;
+#ifdef KTRACE
+ if (to && (KTRPOINT(td, KTR_STRUCT)))
+ ktrsockaddr(to);
+#endif
+
+ iov[0].iov_base = uap->msg;
+ iov[0].iov_len = uap->mlen;
+
+ so = (struct socket *)fp->f_data;
+ if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
+ error = EOPNOTSUPP;
+ goto sctp_bad;
+ }
+#ifdef MAC
+ error = mac_socket_check_send(td->td_ucred, so);
+ if (error != 0)
+ goto sctp_bad;
+#endif /* MAC */
+
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = 1;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ len = auio.uio_resid = uap->mlen;
+ CURVNET_SET(so->so_vnet);
+ error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL,
+ (struct mbuf *)NULL, uap->flags, u_sinfo, td);
+ CURVNET_RESTORE();
+ if (error != 0) {
+ if (auio.uio_resid != len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ /* Generation of SIGPIPE can be controlled per socket. */
+ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
+ !(uap->flags & MSG_NOSIGNAL)) {
+ PROC_LOCK(td->td_proc);
+ tdsignal(td, SIGPIPE);
+ PROC_UNLOCK(td->td_proc);
+ }
+ }
+ if (error == 0)
+ td->td_retval[0] = len - auio.uio_resid;
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = td->td_retval[0];
+ ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
+ }
+#endif /* KTRACE */
+sctp_bad:
+ if (fp != NULL)
+ fdrop(fp, td);
+sctp_bad2:
+ free(to, M_SONAME);
+ return (error);
+#else /* SCTP */
+ return (EOPNOTSUPP);
+#endif /* SCTP */
+}
+
+int
+sys_sctp_generic_sendmsg_iov(td, uap)
+ struct thread *td;
+ struct sctp_generic_sendmsg_iov_args /* {
+ int sd,
+ struct iovec *iov,
+ int iovlen,
+ caddr_t to,
+ __socklen_t tolen,
+ struct sctp_sndrcvinfo *sinfo,
+ int flags
+ } */ *uap;
+{
+#if (defined(INET) || defined(INET6)) && defined(SCTP)
+ struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
+ struct socket *so;
+ struct file *fp = NULL;
+ struct sockaddr *to = NULL;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+ struct uio auio;
+ struct iovec *iov, *tiov;
+ cap_rights_t rights;
+ ssize_t len;
+ int error, i;
+
+ if (uap->sinfo != NULL) {
+ error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
+ if (error != 0)
+ return (error);
+ u_sinfo = &sinfo;
+ }
+ cap_rights_init(&rights, CAP_SEND);
+ if (uap->tolen != 0) {
+ error = getsockaddr(&to, uap->to, uap->tolen);
+ if (error != 0) {
+ to = NULL;
+ goto sctp_bad2;
+ }
+ cap_rights_set(&rights, CAP_CONNECT);
+ }
+
+ AUDIT_ARG_FD(uap->sd);
+ error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
+ if (error != 0)
+ goto sctp_bad1;
+
+#ifdef COMPAT_FREEBSD32
+ if (SV_CURPROC_FLAG(SV_ILP32))
+ error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
+ uap->iovlen, &iov, EMSGSIZE);
+ else
+#endif
+ error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
+ if (error != 0)
+ goto sctp_bad1;
+#ifdef KTRACE
+ if (to && (KTRPOINT(td, KTR_STRUCT)))
+ ktrsockaddr(to);
+#endif
+
+ so = (struct socket *)fp->f_data;
+ if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
+ error = EOPNOTSUPP;
+ goto sctp_bad;
+ }
+#ifdef MAC
+ error = mac_socket_check_send(td->td_ucred, so);
+ if (error != 0)
+ goto sctp_bad;
+#endif /* MAC */
+
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = uap->iovlen;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ tiov = iov;
+	for (i = 0; i < uap->iovlen; i++, tiov++) {
+ if ((auio.uio_resid += tiov->iov_len) < 0) {
+ error = EINVAL;
+ goto sctp_bad;
+ }
+ }
+ len = auio.uio_resid;
+ CURVNET_SET(so->so_vnet);
+ error = sctp_lower_sosend(so, to, &auio,
+ (struct mbuf *)NULL, (struct mbuf *)NULL,
+ uap->flags, u_sinfo, td);
+ CURVNET_RESTORE();
+ if (error != 0) {
+ if (auio.uio_resid != len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ /* Generation of SIGPIPE can be controlled per socket */
+ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
+ !(uap->flags & MSG_NOSIGNAL)) {
+ PROC_LOCK(td->td_proc);
+ tdsignal(td, SIGPIPE);
+ PROC_UNLOCK(td->td_proc);
+ }
+ }
+ if (error == 0)
+ td->td_retval[0] = len - auio.uio_resid;
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = td->td_retval[0];
+ ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
+ }
+#endif /* KTRACE */
+sctp_bad:
+ free(iov, M_IOV);
+sctp_bad1:
+ if (fp != NULL)
+ fdrop(fp, td);
+sctp_bad2:
+ free(to, M_SONAME);
+ return (error);
+#else /* SCTP */
+ return (EOPNOTSUPP);
+#endif /* SCTP */
+}
+
+int
+sys_sctp_generic_recvmsg(td, uap)
+ struct thread *td;
+ struct sctp_generic_recvmsg_args /* {
+ int sd,
+ struct iovec *iov,
+ int iovlen,
+ struct sockaddr *from,
+ __socklen_t *fromlenaddr,
+ struct sctp_sndrcvinfo *sinfo,
+ int *msg_flags
+ } */ *uap;
+{
+#if (defined(INET) || defined(INET6)) && defined(SCTP)
+ uint8_t sockbufstore[256];
+ struct uio auio;
+ struct iovec *iov, *tiov;
+ struct sctp_sndrcvinfo sinfo;
+ struct socket *so;
+ struct file *fp = NULL;
+ struct sockaddr *fromsa;
+ cap_rights_t rights;
+#ifdef KTRACE
+ struct uio *ktruio = NULL;
+#endif
+ ssize_t len;
+ int error, fromlen, i, msg_flags;
+
+ AUDIT_ARG_FD(uap->sd);
+ error = getsock_cap(td->td_proc->p_fd, uap->sd,
+ cap_rights_init(&rights, CAP_RECV), &fp, NULL);
+ if (error != 0)
+ return (error);
+#ifdef COMPAT_FREEBSD32
+ if (SV_CURPROC_FLAG(SV_ILP32))
+ error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
+ uap->iovlen, &iov, EMSGSIZE);
+ else
+#endif
+ error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
+ if (error != 0)
+ goto out1;
+
+ so = fp->f_data;
+ if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
+ error = EOPNOTSUPP;
+ goto out;
+ }
+#ifdef MAC
+ error = mac_socket_check_receive(td->td_ucred, so);
+ if (error != 0)
+ goto out;
+#endif /* MAC */
+
+ if (uap->fromlenaddr != NULL) {
+ error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen));
+ if (error != 0)
+ goto out;
+ } else {
+ fromlen = 0;
+ }
+ if (uap->msg_flags) {
+ error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
+ if (error != 0)
+ goto out;
+ } else {
+ msg_flags = 0;
+ }
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = uap->iovlen;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_rw = UIO_READ;
+ auio.uio_td = td;
+ auio.uio_offset = 0; /* XXX */
+ auio.uio_resid = 0;
+ tiov = iov;
+	for (i = 0; i < uap->iovlen; i++, tiov++) {
+ if ((auio.uio_resid += tiov->iov_len) < 0) {
+ error = EINVAL;
+ goto out;
+ }
+ }
+ len = auio.uio_resid;
+ fromsa = (struct sockaddr *)sockbufstore;
+
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_GENIO))
+ ktruio = cloneuio(&auio);
+#endif /* KTRACE */
+ memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
+ CURVNET_SET(so->so_vnet);
+ error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
+ fromsa, fromlen, &msg_flags,
+ (struct sctp_sndrcvinfo *)&sinfo, 1);
+ CURVNET_RESTORE();
+ if (error != 0) {
+ if (auio.uio_resid != len && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ } else {
+ if (uap->sinfo)
+ error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
+ }
+#ifdef KTRACE
+ if (ktruio != NULL) {
+ ktruio->uio_resid = len - auio.uio_resid;
+ ktrgenio(uap->sd, UIO_READ, ktruio, error);
+ }
+#endif /* KTRACE */
+ if (error != 0)
+ goto out;
+ td->td_retval[0] = len - auio.uio_resid;
+
+ if (fromlen && uap->from) {
+ len = fromlen;
+		if (len <= 0 || fromsa == NULL)
+ len = 0;
+ else {
+ len = MIN(len, fromsa->sa_len);
+ error = copyout(fromsa, uap->from, (size_t)len);
+ if (error != 0)
+ goto out;
+ }
+ error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
+ if (error != 0)
+ goto out;
+ }
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrsockaddr(fromsa);
+#endif
+ if (uap->msg_flags) {
+ error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
+ if (error != 0)
+ goto out;
+ }
+out:
+ free(iov, M_IOV);
+out1:
+ if (fp != NULL)
+ fdrop(fp, td);
+
+ return (error);
+#else /* SCTP */
+ return (EOPNOTSUPP);
+#endif /* SCTP */
+}
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
new file mode 100644
index 0000000..c0a5d2e
--- /dev/null
+++ b/sys/kern/uipc_usrreq.c
@@ -0,0 +1,2505 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1991, 1993
+ * The Regents of the University of California.
+ * Copyright (c) 2004-2009 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94
+ */
+
+/*
+ * UNIX Domain (Local) Sockets
+ *
+ * This is an implementation of UNIX (local) domain sockets. Each socket has
+ * an associated struct unpcb (UNIX protocol control block). Stream sockets
+ * may be connected to 0 or 1 other socket. Datagram sockets may be
+ * connected to 0, 1, or many other sockets. Sockets may be created and
+ * connected in pairs (socketpair(2)), or bound/connected to using the file
+ * system name space. For most purposes, only the receive socket buffer is
+ * used, as sending on one socket delivers directly to the receive socket
+ * buffer of a second socket.
+ *
+ * The implementation is substantially complicated by the fact that
+ * "ancillary data", such as file descriptors or credentials, may be passed
+ * across UNIX domain sockets. The potential for passing UNIX domain sockets
+ * over other UNIX domain sockets requires the implementation of a simple
+ * garbage collector to find and tear down cycles of disconnected sockets.
+ *
+ * TODO:
+ * RDM
+ * distinguish datagram size limits from flow control limits in SEQPACKET
+ * rethink name space problems
+ * need a proper out-of-band
+ */
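+
+/*
+ * Illustrative sketch of the direct-delivery model described above: a
+ * connected pair created with socketpair(2) exchanges data with no
+ * network stack involvement, so a write on one descriptor lands straight
+ * in the peer's receive socket buffer.
+ *
+ *	int fds[2];
+ *
+ *	if (socketpair(PF_LOCAL, SOCK_STREAM, 0, fds) == 0)
+ *		(void)write(fds[0], "hi", 2);	(read(fds[1], ...) sees "hi")
+ */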
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/capability.h>
+#include <sys/domain.h>
+#include <sys/fcntl.h>
+#include <sys/malloc.h> /* XXX must be before <sys/file.h> */
+#include <sys/eventhandler.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/queue.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/signalvar.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/taskqueue.h>
+#include <sys/un.h>
+#include <sys/unpcb.h>
+#include <sys/vnode.h>
+
+#include <net/vnet.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/uma.h>
+
+MALLOC_DECLARE(M_FILECAPS);
+
+/*
+ * Locking key:
+ * (l) Locked using list lock
+ * (g) Locked using linkage lock
+ */
+
+static uma_zone_t unp_zone;
+static unp_gen_t unp_gencnt; /* (l) */
+static u_int unp_count; /* (l) Count of local sockets. */
+static ino_t unp_ino; /* Prototype for fake inode numbers. */
+static int unp_rights; /* (g) File descriptors in flight. */
+static struct unp_head unp_shead; /* (l) List of stream sockets. */
+static struct unp_head unp_dhead; /* (l) List of datagram sockets. */
+static struct unp_head unp_sphead; /* (l) List of seqpacket sockets. */
+
+struct unp_defer {
+ SLIST_ENTRY(unp_defer) ud_link;
+ struct file *ud_fp;
+};
+static SLIST_HEAD(, unp_defer) unp_defers;
+static int unp_defers_count;
+
+static const struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
+
+/*
+ * Garbage collection of cyclic file descriptor/socket references occurs
+ * asynchronously in a taskqueue context in order to avoid recursion and
+ * reentrance in the UNIX domain socket, file descriptor, and socket layer
+ * code. See unp_gc() for a full description.
+ */
+static struct timeout_task unp_gc_task;
+
+/*
+ * The close of UNIX domain sockets attached as SCM_RIGHTS is
+ * postponed to the taskqueue, to avoid arbitrary recursion depth.
+ * The attached sockets might themselves have other sockets attached.
+ */
+static struct task unp_defer_task;
+
+/*
+ * Both send and receive buffers are allocated PIPSIZ bytes of buffering for
+ * stream sockets, although the total for sender and receiver is actually
+ * only PIPSIZ.
+ *
+ * Datagram sockets really use the sendspace as the maximum datagram size,
+ * and don't really want to reserve the sendspace. Their recvspace should be
+ * large enough for at least one max-size datagram plus address.
+ */
+#ifndef PIPSIZ
+#define PIPSIZ 8192
+#endif
+static u_long unpst_sendspace = PIPSIZ;
+static u_long unpst_recvspace = PIPSIZ;
+static u_long unpdg_sendspace = 2*1024; /* really max datagram size */
+static u_long unpdg_recvspace = 4*1024;
+static u_long unpsp_sendspace = PIPSIZ; /* really max datagram size */
+static u_long unpsp_recvspace = PIPSIZ;
+
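+/*
+ * The defaults above are exported as read-write sysctls below, e.g.
+ * net.local.stream.recvspace and net.local.dgram.maxdgram, so they can
+ * be tuned at runtime.
+ */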
+static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain");
+static SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0,
+ "SOCK_STREAM");
+static SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM");
+static SYSCTL_NODE(_net_local, SOCK_SEQPACKET, seqpacket, CTLFLAG_RW, 0,
+ "SOCK_SEQPACKET");
+
+SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
+ &unpst_sendspace, 0, "Default stream send space.");
+SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
+ &unpst_recvspace, 0, "Default stream receive space.");
+SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
+ &unpdg_sendspace, 0, "Default datagram send space.");
+SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
+ &unpdg_recvspace, 0, "Default datagram receive space.");
+SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, maxseqpacket, CTLFLAG_RW,
+ &unpsp_sendspace, 0, "Default seqpacket send space.");
+SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, recvspace, CTLFLAG_RW,
+ &unpsp_recvspace, 0, "Default seqpacket receive space.");
+SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0,
+ "File descriptors in flight.");
+SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
+ &unp_defers_count, 0,
+ "File descriptors deferred to taskqueue for close.");
+
+/*
+ * Locking and synchronization:
+ *
+ * Three types of locks exist in the local domain socket implementation: a
+ * global list mutex, a global linkage rwlock, and per-unpcb mutexes. Of the
+ * global locks, the list lock protects the socket count, global generation
+ * number, and stream/datagram global lists. The linkage lock protects the
+ * interconnection of unpcbs, the v_socket and unp_vnode pointers, and can be
+ * held exclusively over the acquisition of multiple unpcb locks to prevent
+ * deadlock.
+ *
+ * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
+ * allocated in pru_attach() and freed in pru_detach(). The validity of that
+ * pointer is an invariant, so no lock is required to dereference the so_pcb
+ * pointer if a valid socket reference is held by the caller. In practice,
+ * this is always true during operations performed on a socket. Each unpcb
+ * has a back-pointer to its socket, unp_socket, which will be stable under
+ * the same circumstances.
+ *
+ * This pointer may only be safely dereferenced as long as a valid reference
+ * to the unpcb is held. Typically, this reference will be from the socket,
+ * or from another unpcb when the referring unpcb's lock is held (in order
+ * that the reference not be invalidated during use). For example, to follow
+ * unp->unp_conn->unp_socket, you need to hold the lock on unp, not unp_conn,
+ * as unp_socket remains valid as long as the reference to unp_conn is valid.
+ *
+ * Fields of unpcbs are locked using a per-unpcb lock, unp_mtx.  Individual
+ * atomic reads without the lock may be performed "lockless", but more
+ * complex reads and read-modify-writes require the mutex to be held. No
+ * lock order is defined between unpcb locks -- multiple unpcb locks may be
+ * acquired at the same time only when holding the linkage rwlock
+ * exclusively, which prevents deadlocks.
+ *
+ * Blocking with UNIX domain sockets is a tricky issue: unlike most network
+ * protocols, bind() is a non-atomic operation, and connect() requires
+ * potential sleeping in the protocol, due to potentially waiting on local or
+ * distributed file systems. We try to separate "lookup" operations, which
+ * may sleep, and the IPC operations themselves, which typically can occur
+ * with relative atomicity as locks can be held over the entire operation.
+ *
+ * Another tricky issue is simultaneous multi-threaded or multi-process
+ * access to a single UNIX domain socket. These are handled by the flags
+ * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or
+ * binding, both of which involve dropping UNIX domain socket locks in order
+ * to perform namei() and other file system operations.
+ */
+static struct rwlock unp_link_rwlock;
+static struct mtx unp_list_lock;
+static struct mtx unp_defers_lock;
+
+#define UNP_LINK_LOCK_INIT() rw_init(&unp_link_rwlock, \
+ "unp_link_rwlock")
+
+#define UNP_LINK_LOCK_ASSERT() rw_assert(&unp_link_rwlock, \
+ RA_LOCKED)
+#define UNP_LINK_UNLOCK_ASSERT() rw_assert(&unp_link_rwlock, \
+ RA_UNLOCKED)
+
+#define UNP_LINK_RLOCK() rw_rlock(&unp_link_rwlock)
+#define UNP_LINK_RUNLOCK() rw_runlock(&unp_link_rwlock)
+#define UNP_LINK_WLOCK() rw_wlock(&unp_link_rwlock)
+#define UNP_LINK_WUNLOCK() rw_wunlock(&unp_link_rwlock)
+#define UNP_LINK_WLOCK_ASSERT() rw_assert(&unp_link_rwlock, \
+ RA_WLOCKED)
+
+#define UNP_LIST_LOCK_INIT() mtx_init(&unp_list_lock, \
+ "unp_list_lock", NULL, MTX_DEF)
+#define UNP_LIST_LOCK() mtx_lock(&unp_list_lock)
+#define UNP_LIST_UNLOCK() mtx_unlock(&unp_list_lock)
+
+#define UNP_DEFERRED_LOCK_INIT() mtx_init(&unp_defers_lock, \
+ "unp_defer", NULL, MTX_DEF)
+#define UNP_DEFERRED_LOCK() mtx_lock(&unp_defers_lock)
+#define UNP_DEFERRED_UNLOCK() mtx_unlock(&unp_defers_lock)
+
+#define UNP_PCB_LOCK_INIT(unp) mtx_init(&(unp)->unp_mtx, \
+ "unp_mtx", "unp_mtx", \
+ MTX_DUPOK|MTX_DEF|MTX_RECURSE)
+#define UNP_PCB_LOCK_DESTROY(unp) mtx_destroy(&(unp)->unp_mtx)
+#define UNP_PCB_LOCK(unp) mtx_lock(&(unp)->unp_mtx)
+#define UNP_PCB_UNLOCK(unp) mtx_unlock(&(unp)->unp_mtx)
+#define UNP_PCB_LOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_OWNED)
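+
+/*
+ * Illustrative locking pattern (see e.g. uipc_connect2() below): when two
+ * unpcbs must be locked at once, the linkage lock is taken exclusively
+ * first, which serializes against other multi-pcb lockers:
+ *
+ *	UNP_LINK_WLOCK();
+ *	UNP_PCB_LOCK(unp);
+ *	UNP_PCB_LOCK(unp2);
+ *	...
+ *	UNP_PCB_UNLOCK(unp2);
+ *	UNP_PCB_UNLOCK(unp);
+ *	UNP_LINK_WUNLOCK();
+ */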
+
+static int uipc_connect2(struct socket *, struct socket *);
+static int uipc_ctloutput(struct socket *, struct sockopt *);
+static int unp_connect(struct socket *, struct sockaddr *,
+ struct thread *);
+static int unp_connectat(int, struct socket *, struct sockaddr *,
+ struct thread *);
+static int unp_connect2(struct socket *so, struct socket *so2, int);
+static void unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
+static void unp_dispose(struct mbuf *);
+static void unp_shutdown(struct unpcb *);
+static void unp_drop(struct unpcb *, int);
+static void unp_gc(__unused void *, int);
+static void unp_scan(struct mbuf *, void (*)(struct filedescent **, int));
+static void unp_discard(struct file *);
+static void unp_freerights(struct filedescent **, int);
+static void unp_init(void);
+static int unp_internalize(struct mbuf **, struct thread *);
+static void unp_internalize_fp(struct file *);
+static int unp_externalize(struct mbuf *, struct mbuf **, int);
+static int unp_externalize_fp(struct file *);
+static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *);
+static void unp_process_defers(void * __unused, int);
+
+/*
+ * Definitions of protocols supported in the LOCAL domain.
+ */
+static struct domain localdomain;
+static struct pr_usrreqs uipc_usrreqs_dgram, uipc_usrreqs_stream;
+static struct pr_usrreqs uipc_usrreqs_seqpacket;
+static struct protosw localsw[] = {
+{
+ .pr_type = SOCK_STREAM,
+ .pr_domain = &localdomain,
+ .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
+ .pr_ctloutput = &uipc_ctloutput,
+ .pr_usrreqs = &uipc_usrreqs_stream
+},
+{
+ .pr_type = SOCK_DGRAM,
+ .pr_domain = &localdomain,
+ .pr_flags = PR_ATOMIC|PR_ADDR|PR_RIGHTS,
+ .pr_ctloutput = &uipc_ctloutput,
+ .pr_usrreqs = &uipc_usrreqs_dgram
+},
+{
+ .pr_type = SOCK_SEQPACKET,
+ .pr_domain = &localdomain,
+
+ /*
+ * XXXRW: For now, PR_ADDR because soreceive will bump into them
+	 * due to our use of sbappendaddr.  A new sbappend variant is needed
+ * that supports both atomic record writes and control data.
+ */
+ .pr_flags = PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED|PR_WANTRCVD|
+ PR_RIGHTS,
+ .pr_usrreqs = &uipc_usrreqs_seqpacket,
+},
+};
+
+static struct domain localdomain = {
+ .dom_family = AF_LOCAL,
+ .dom_name = "local",
+ .dom_init = unp_init,
+ .dom_externalize = unp_externalize,
+ .dom_dispose = unp_dispose,
+ .dom_protosw = localsw,
+ .dom_protoswNPROTOSW = &localsw[sizeof(localsw)/sizeof(localsw[0])]
+};
+DOMAIN_SET(local);
+
+static void
+uipc_abort(struct socket *so)
+{
+ struct unpcb *unp, *unp2;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
+
+ UNP_LINK_WLOCK();
+ UNP_PCB_LOCK(unp);
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL) {
+ UNP_PCB_LOCK(unp2);
+ unp_drop(unp2, ECONNABORTED);
+ UNP_PCB_UNLOCK(unp2);
+ }
+ UNP_PCB_UNLOCK(unp);
+ UNP_LINK_WUNLOCK();
+}
+
+static int
+uipc_accept(struct socket *so, struct sockaddr **nam)
+{
+ struct unpcb *unp, *unp2;
+ const struct sockaddr *sa;
+
+ /*
+ * Pass back name of connected socket, if it was bound and we are
+ * still connected (our peer may have closed already!).
+ */
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_accept: unp == NULL"));
+
+ *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
+ UNP_LINK_RLOCK();
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL && unp2->unp_addr != NULL) {
+ UNP_PCB_LOCK(unp2);
+ sa = (struct sockaddr *) unp2->unp_addr;
+ bcopy(sa, *nam, sa->sa_len);
+ UNP_PCB_UNLOCK(unp2);
+ } else {
+ sa = &sun_noname;
+ bcopy(sa, *nam, sa->sa_len);
+ }
+ UNP_LINK_RUNLOCK();
+ return (0);
+}
+
+static int
+uipc_attach(struct socket *so, int proto, struct thread *td)
+{
+ u_long sendspace, recvspace;
+ struct unpcb *unp;
+ int error;
+
+ KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
+ if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
+ switch (so->so_type) {
+ case SOCK_STREAM:
+ sendspace = unpst_sendspace;
+ recvspace = unpst_recvspace;
+ break;
+
+ case SOCK_DGRAM:
+ sendspace = unpdg_sendspace;
+ recvspace = unpdg_recvspace;
+ break;
+
+ case SOCK_SEQPACKET:
+ sendspace = unpsp_sendspace;
+ recvspace = unpsp_recvspace;
+ break;
+
+ default:
+ panic("uipc_attach");
+ }
+ error = soreserve(so, sendspace, recvspace);
+ if (error)
+ return (error);
+ }
+ unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO);
+ if (unp == NULL)
+ return (ENOBUFS);
+ LIST_INIT(&unp->unp_refs);
+ UNP_PCB_LOCK_INIT(unp);
+ unp->unp_socket = so;
+ so->so_pcb = unp;
+ unp->unp_refcount = 1;
+
+ UNP_LIST_LOCK();
+ unp->unp_gencnt = ++unp_gencnt;
+ unp_count++;
+ switch (so->so_type) {
+ case SOCK_STREAM:
+ LIST_INSERT_HEAD(&unp_shead, unp, unp_link);
+ break;
+
+ case SOCK_DGRAM:
+ LIST_INSERT_HEAD(&unp_dhead, unp, unp_link);
+ break;
+
+ case SOCK_SEQPACKET:
+ LIST_INSERT_HEAD(&unp_sphead, unp, unp_link);
+ break;
+
+ default:
+ panic("uipc_attach");
+ }
+ UNP_LIST_UNLOCK();
+
+ return (0);
+}
+
+static int
+uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ struct sockaddr_un *soun = (struct sockaddr_un *)nam;
+ struct vattr vattr;
+ int error, namelen;
+ struct nameidata nd;
+ struct unpcb *unp;
+ struct vnode *vp;
+ struct mount *mp;
+ cap_rights_t rights;
+ char *buf;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));
+
+ if (soun->sun_len > sizeof(struct sockaddr_un))
+ return (EINVAL);
+ namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
+ if (namelen <= 0)
+ return (EINVAL);
+
+ /*
+ * We don't allow simultaneous bind() calls on a single UNIX domain
+ * socket, so flag in-progress operations, and return an error if an
+ * operation is already in progress.
+ *
+ * Historically, we have not allowed a socket to be rebound, so this
+ * also returns an error. Not allowing re-binding simplifies the
+ * implementation and avoids a great many possible failure modes.
+ */
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_vnode != NULL) {
+ UNP_PCB_UNLOCK(unp);
+ return (EINVAL);
+ }
+ if (unp->unp_flags & UNP_BINDING) {
+ UNP_PCB_UNLOCK(unp);
+ return (EALREADY);
+ }
+ unp->unp_flags |= UNP_BINDING;
+ UNP_PCB_UNLOCK(unp);
+
+ buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
+ bcopy(soun->sun_path, buf, namelen);
+ buf[namelen] = 0;
+
+restart:
+ NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME,
+ UIO_SYSSPACE, buf, fd, cap_rights_init(&rights, CAP_BINDAT), td);
+/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
+ error = namei(&nd);
+ if (error)
+ goto error;
+ vp = nd.ni_vp;
+ if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (vp != NULL) {
+ vrele(vp);
+ error = EADDRINUSE;
+ goto error;
+ }
+ error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
+ if (error)
+ goto error;
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VSOCK;
+ vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
+#ifdef MAC
+ error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
+ &vattr);
+#endif
+ if (error == 0)
+ error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if (error) {
+ vn_finished_write(mp);
+ goto error;
+ }
+ vp = nd.ni_vp;
+ ASSERT_VOP_ELOCKED(vp, "uipc_bind");
+ soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
+
+ UNP_LINK_WLOCK();
+ UNP_PCB_LOCK(unp);
+ VOP_UNP_BIND(vp, unp->unp_socket);
+ unp->unp_vnode = vp;
+ unp->unp_addr = soun;
+ unp->unp_flags &= ~UNP_BINDING;
+ UNP_PCB_UNLOCK(unp);
+ UNP_LINK_WUNLOCK();
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ free(buf, M_TEMP);
+ return (0);
+
+error:
+ UNP_PCB_LOCK(unp);
+ unp->unp_flags &= ~UNP_BINDING;
+ UNP_PCB_UNLOCK(unp);
+ free(buf, M_TEMP);
+ return (error);
+}
+
+static int
+uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+ return (uipc_bindat(AT_FDCWD, so, nam, td));
+}
+
+static int
+uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error;
+
+ KASSERT(td == curthread, ("uipc_connect: td != curthread"));
+ UNP_LINK_WLOCK();
+ error = unp_connect(so, nam, td);
+ UNP_LINK_WUNLOCK();
+ return (error);
+}
+
+static int
+uipc_connectat(int fd, struct socket *so, struct sockaddr *nam,
+ struct thread *td)
+{
+ int error;
+
+ KASSERT(td == curthread, ("uipc_connectat: td != curthread"));
+ UNP_LINK_WLOCK();
+ error = unp_connectat(fd, so, nam, td);
+ UNP_LINK_WUNLOCK();
+ return (error);
+}
+
+static void
+uipc_close(struct socket *so)
+{
+ struct unpcb *unp, *unp2;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
+
+ UNP_LINK_WLOCK();
+ UNP_PCB_LOCK(unp);
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL) {
+ UNP_PCB_LOCK(unp2);
+ unp_disconnect(unp, unp2);
+ UNP_PCB_UNLOCK(unp2);
+ }
+ UNP_PCB_UNLOCK(unp);
+ UNP_LINK_WUNLOCK();
+}
+
+static int
+uipc_connect2(struct socket *so1, struct socket *so2)
+{
+ struct unpcb *unp, *unp2;
+ int error;
+
+ UNP_LINK_WLOCK();
+ unp = so1->so_pcb;
+ KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
+ UNP_PCB_LOCK(unp);
+ unp2 = so2->so_pcb;
+ KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
+ UNP_PCB_LOCK(unp2);
+ error = unp_connect2(so1, so2, PRU_CONNECT2);
+ UNP_PCB_UNLOCK(unp2);
+ UNP_PCB_UNLOCK(unp);
+ UNP_LINK_WUNLOCK();
+ return (error);
+}
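+
+/*
+ * Illustrative userland sketch (not part of this file): uipc_connect2() is
+ * the path taken by socketpair(2) for PF_LOCAL sockets, which hands back two
+ * pre-connected descriptors without any bind/connect step:
+ *
+ *	int sv[2];
+ *
+ *	if (socketpair(PF_LOCAL, SOCK_STREAM, 0, sv) == -1)
+ *		err(1, "socketpair");
+ *	write(sv[0], "ping", 4);
+ */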
+
+static void
+uipc_detach(struct socket *so)
+{
+ struct unpcb *unp, *unp2;
+ struct sockaddr_un *saved_unp_addr;
+ struct vnode *vp;
+ int freeunp, local_unp_rights;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));
+
+ UNP_LINK_WLOCK();
+ UNP_LIST_LOCK();
+ UNP_PCB_LOCK(unp);
+ LIST_REMOVE(unp, unp_link);
+ unp->unp_gencnt = ++unp_gencnt;
+ --unp_count;
+ UNP_LIST_UNLOCK();
+
+ /*
+ * XXXRW: Should assert vp->v_socket == so.
+ */
+ if ((vp = unp->unp_vnode) != NULL) {
+ VOP_UNP_DETACH(vp);
+ unp->unp_vnode = NULL;
+ }
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL) {
+ UNP_PCB_LOCK(unp2);
+ unp_disconnect(unp, unp2);
+ UNP_PCB_UNLOCK(unp2);
+ }
+
+ /*
+ * We hold the linkage lock exclusively, so it's OK to acquire
+ * multiple pcb locks at a time.
+ */
+ while (!LIST_EMPTY(&unp->unp_refs)) {
+ struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
+
+ UNP_PCB_LOCK(ref);
+ unp_drop(ref, ECONNRESET);
+ UNP_PCB_UNLOCK(ref);
+ }
+ local_unp_rights = unp_rights;
+ UNP_LINK_WUNLOCK();
+ unp->unp_socket->so_pcb = NULL;
+ saved_unp_addr = unp->unp_addr;
+ unp->unp_addr = NULL;
+ unp->unp_refcount--;
+ freeunp = (unp->unp_refcount == 0);
+ if (saved_unp_addr != NULL)
+ free(saved_unp_addr, M_SONAME);
+ if (freeunp) {
+ UNP_PCB_LOCK_DESTROY(unp);
+ uma_zfree(unp_zone, unp);
+ } else
+ UNP_PCB_UNLOCK(unp);
+ if (vp)
+ vrele(vp);
+ if (local_unp_rights)
+ taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
+}
+
+static int
+uipc_disconnect(struct socket *so)
+{
+ struct unpcb *unp, *unp2;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
+
+ UNP_LINK_WLOCK();
+ UNP_PCB_LOCK(unp);
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL) {
+ UNP_PCB_LOCK(unp2);
+ unp_disconnect(unp, unp2);
+ UNP_PCB_UNLOCK(unp2);
+ }
+ UNP_PCB_UNLOCK(unp);
+ UNP_LINK_WUNLOCK();
+ return (0);
+}
+
+static int
+uipc_listen(struct socket *so, int backlog, struct thread *td)
+{
+ struct unpcb *unp;
+ int error;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_listen: unp == NULL"));
+
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_vnode == NULL) {
+ UNP_PCB_UNLOCK(unp);
+ return (EINVAL);
+ }
+
+ SOCK_LOCK(so);
+ error = solisten_proto_check(so);
+ if (error == 0) {
+ cru2x(td->td_ucred, &unp->unp_peercred);
+ unp->unp_flags |= UNP_HAVEPCCACHED;
+ solisten_proto(so, backlog);
+ }
+ SOCK_UNLOCK(so);
+ UNP_PCB_UNLOCK(unp);
+ return (error);
+}
+
+static int
+uipc_peeraddr(struct socket *so, struct sockaddr **nam)
+{
+ struct unpcb *unp, *unp2;
+ const struct sockaddr *sa;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));
+
+ *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
+ UNP_LINK_RLOCK();
+ /*
+	 * XXX: It seems that this test always fails even when the connection
+	 * is established, so the else clause below is a workaround to return
+	 * a PF_LOCAL sockaddr.
+ */
+ unp2 = unp->unp_conn;
+ if (unp2 != NULL) {
+ UNP_PCB_LOCK(unp2);
+ if (unp2->unp_addr != NULL)
+ sa = (struct sockaddr *) unp2->unp_addr;
+ else
+ sa = &sun_noname;
+ bcopy(sa, *nam, sa->sa_len);
+ UNP_PCB_UNLOCK(unp2);
+ } else {
+ sa = &sun_noname;
+ bcopy(sa, *nam, sa->sa_len);
+ }
+ UNP_LINK_RUNLOCK();
+ return (0);
+}
+
+static int
+uipc_rcvd(struct socket *so, int flags)
+{
+ struct unpcb *unp, *unp2;
+ struct socket *so2;
+ u_int mbcnt, sbcc;
+ u_long newhiwat;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL"));
+
+ if (so->so_type != SOCK_STREAM && so->so_type != SOCK_SEQPACKET)
+ panic("uipc_rcvd socktype %d", so->so_type);
+
+ /*
+	 * Adjust backpressure on the sender and wake up any threads waiting
+	 * to write.
+ *
+ * The unp lock is acquired to maintain the validity of the unp_conn
+ * pointer; no lock on unp2 is required as unp2->unp_socket will be
+ * static as long as we don't permit unp2 to disconnect from unp,
+ * which is prevented by the lock on unp. We cache values from
+ * so_rcv to avoid holding the so_rcv lock over the entire
+ * transaction on the remote so_snd.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ mbcnt = so->so_rcv.sb_mbcnt;
+ sbcc = so->so_rcv.sb_cc;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ UNP_PCB_LOCK(unp);
+ unp2 = unp->unp_conn;
+ if (unp2 == NULL) {
+ UNP_PCB_UNLOCK(unp);
+ return (0);
+ }
+ so2 = unp2->unp_socket;
+ SOCKBUF_LOCK(&so2->so_snd);
+ so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt;
+ newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc;
+ (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
+ newhiwat, RLIM_INFINITY);
+ sowwakeup_locked(so2);
+ unp->unp_mbcnt = mbcnt;
+ unp->unp_cc = sbcc;
+ UNP_PCB_UNLOCK(unp);
+ return (0);
+}
+
+static int
+uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
+ struct mbuf *control, struct thread *td)
+{
+ struct unpcb *unp, *unp2;
+ struct socket *so2;
+ u_int mbcnt_delta, sbcc;
+ u_int newhiwat;
+ int error = 0;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_send: unp == NULL"));
+
+ if (flags & PRUS_OOB) {
+ error = EOPNOTSUPP;
+ goto release;
+ }
+ if (control != NULL && (error = unp_internalize(&control, td)))
+ goto release;
+ if ((nam != NULL) || (flags & PRUS_EOF))
+ UNP_LINK_WLOCK();
+ else
+ UNP_LINK_RLOCK();
+ switch (so->so_type) {
+ case SOCK_DGRAM:
+ {
+ const struct sockaddr *from;
+
+ unp2 = unp->unp_conn;
+ if (nam != NULL) {
+ UNP_LINK_WLOCK_ASSERT();
+ if (unp2 != NULL) {
+ error = EISCONN;
+ break;
+ }
+ error = unp_connect(so, nam, td);
+ if (error)
+ break;
+ unp2 = unp->unp_conn;
+ }
+
+ /*
+ * Because connect() and send() are non-atomic in a sendto()
+ * with a target address, it's possible that the socket will
+ * have disconnected before the send() can run. In that case
+ * return the slightly counter-intuitive but otherwise
+ * correct error that the socket is not connected.
+ */
+ if (unp2 == NULL) {
+ error = ENOTCONN;
+ break;
+ }
+ /* Lockless read. */
+ if (unp2->unp_flags & UNP_WANTCRED)
+ control = unp_addsockcred(td, control);
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_addr != NULL)
+ from = (struct sockaddr *)unp->unp_addr;
+ else
+ from = &sun_noname;
+ so2 = unp2->unp_socket;
+ SOCKBUF_LOCK(&so2->so_rcv);
+ if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) {
+ sorwakeup_locked(so2);
+ m = NULL;
+ control = NULL;
+ } else {
+ SOCKBUF_UNLOCK(&so2->so_rcv);
+ error = ENOBUFS;
+ }
+ if (nam != NULL) {
+ UNP_LINK_WLOCK_ASSERT();
+ UNP_PCB_LOCK(unp2);
+ unp_disconnect(unp, unp2);
+ UNP_PCB_UNLOCK(unp2);
+ }
+ UNP_PCB_UNLOCK(unp);
+ break;
+ }
+
+ case SOCK_SEQPACKET:
+ case SOCK_STREAM:
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ if (nam != NULL) {
+ UNP_LINK_WLOCK_ASSERT();
+ error = unp_connect(so, nam, td);
+ if (error)
+ break; /* XXX */
+ } else {
+ error = ENOTCONN;
+ break;
+ }
+ }
+
+ /* Lockless read. */
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ error = EPIPE;
+ break;
+ }
+
+ /*
+ * Because connect() and send() are non-atomic in a sendto()
+ * with a target address, it's possible that the socket will
+ * have disconnected before the send() can run. In that case
+ * return the slightly counter-intuitive but otherwise
+ * correct error that the socket is not connected.
+ *
+ * Locking here must be done carefully: the linkage lock
+ * prevents interconnections between unpcbs from changing, so
+ * we can traverse from unp to unp2 without acquiring unp's
+ * lock. Socket buffer locks follow unpcb locks, so we can
+		 * acquire both remote and local socket buffer locks.
+ */
+ unp2 = unp->unp_conn;
+ if (unp2 == NULL) {
+ error = ENOTCONN;
+ break;
+ }
+ so2 = unp2->unp_socket;
+ UNP_PCB_LOCK(unp2);
+ SOCKBUF_LOCK(&so2->so_rcv);
+ if (unp2->unp_flags & UNP_WANTCRED) {
+ /*
+ * Credentials are passed only once on SOCK_STREAM
+ * and SOCK_SEQPACKET.
+ */
+ unp2->unp_flags &= ~UNP_WANTCRED;
+ control = unp_addsockcred(td, control);
+ }
+ /*
+ * Send to paired receive port, and then reduce send buffer
+ * hiwater marks to maintain backpressure. Wake up readers.
+ */
+ switch (so->so_type) {
+ case SOCK_STREAM:
+ if (control != NULL) {
+ if (sbappendcontrol_locked(&so2->so_rcv, m,
+ control))
+ control = NULL;
+ } else
+ sbappend_locked(&so2->so_rcv, m);
+ break;
+
+ case SOCK_SEQPACKET: {
+ const struct sockaddr *from;
+
+ from = &sun_noname;
+ if (sbappendaddr_locked(&so2->so_rcv, from, m,
+ control))
+ control = NULL;
+ break;
+ }
+ }
+
+ /*
+ * XXXRW: While fine for SOCK_STREAM, this conflates maximum
+ * datagram size and back-pressure for SOCK_SEQPACKET, which
+ * can lead to undesired return of EMSGSIZE on send instead
+ * of more desirable blocking.
+ */
+ mbcnt_delta = so2->so_rcv.sb_mbcnt - unp2->unp_mbcnt;
+ unp2->unp_mbcnt = so2->so_rcv.sb_mbcnt;
+ sbcc = so2->so_rcv.sb_cc;
+ sorwakeup_locked(so2);
+
+ SOCKBUF_LOCK(&so->so_snd);
+ if ((int)so->so_snd.sb_hiwat >= (int)(sbcc - unp2->unp_cc))
+ newhiwat = so->so_snd.sb_hiwat - (sbcc - unp2->unp_cc);
+ else
+ newhiwat = 0;
+ (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
+ newhiwat, RLIM_INFINITY);
+ so->so_snd.sb_mbmax -= mbcnt_delta;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ unp2->unp_cc = sbcc;
+ UNP_PCB_UNLOCK(unp2);
+ m = NULL;
+ break;
+
+ default:
+ panic("uipc_send unknown socktype");
+ }
+
+ /*
+ * PRUS_EOF is equivalent to pru_send followed by pru_shutdown.
+ */
+ if (flags & PRUS_EOF) {
+ UNP_PCB_LOCK(unp);
+ socantsendmore(so);
+ unp_shutdown(unp);
+ UNP_PCB_UNLOCK(unp);
+ }
+
+ if ((nam != NULL) || (flags & PRUS_EOF))
+ UNP_LINK_WUNLOCK();
+ else
+ UNP_LINK_RUNLOCK();
+
+ if (control != NULL && error != 0)
+ unp_dispose(control);
+
+release:
+ if (control != NULL)
+ m_freem(control);
+ if (m != NULL)
+ m_freem(m);
+ return (error);
+}
+
+static int
+uipc_sense(struct socket *so, struct stat *sb)
+{
+ struct unpcb *unp, *unp2;
+ struct socket *so2;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));
+
+ sb->st_blksize = so->so_snd.sb_hiwat;
+ UNP_LINK_RLOCK();
+ UNP_PCB_LOCK(unp);
+ unp2 = unp->unp_conn;
+ if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) &&
+ unp2 != NULL) {
+ so2 = unp2->unp_socket;
+ sb->st_blksize += so2->so_rcv.sb_cc;
+ }
+ sb->st_dev = NODEV;
+ if (unp->unp_ino == 0)
+ unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
+ sb->st_ino = unp->unp_ino;
+ UNP_PCB_UNLOCK(unp);
+ UNP_LINK_RUNLOCK();
+ return (0);
+}
+
+static int
+uipc_shutdown(struct socket *so)
+{
+ struct unpcb *unp;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL"));
+
+ UNP_LINK_WLOCK();
+ UNP_PCB_LOCK(unp);
+ socantsendmore(so);
+ unp_shutdown(unp);
+ UNP_PCB_UNLOCK(unp);
+ UNP_LINK_WUNLOCK();
+ return (0);
+}
+
+static int
+uipc_sockaddr(struct socket *so, struct sockaddr **nam)
+{
+ struct unpcb *unp;
+ const struct sockaddr *sa;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL"));
+
+ *nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_addr != NULL)
+ sa = (struct sockaddr *) unp->unp_addr;
+ else
+ sa = &sun_noname;
+ bcopy(sa, *nam, sa->sa_len);
+ UNP_PCB_UNLOCK(unp);
+ return (0);
+}
+
+static struct pr_usrreqs uipc_usrreqs_dgram = {
+ .pru_abort = uipc_abort,
+ .pru_accept = uipc_accept,
+ .pru_attach = uipc_attach,
+ .pru_bind = uipc_bind,
+ .pru_bindat = uipc_bindat,
+ .pru_connect = uipc_connect,
+ .pru_connectat = uipc_connectat,
+ .pru_connect2 = uipc_connect2,
+ .pru_detach = uipc_detach,
+ .pru_disconnect = uipc_disconnect,
+ .pru_listen = uipc_listen,
+ .pru_peeraddr = uipc_peeraddr,
+ .pru_rcvd = uipc_rcvd,
+ .pru_send = uipc_send,
+ .pru_sense = uipc_sense,
+ .pru_shutdown = uipc_shutdown,
+ .pru_sockaddr = uipc_sockaddr,
+ .pru_soreceive = soreceive_dgram,
+ .pru_close = uipc_close,
+};
+
+static struct pr_usrreqs uipc_usrreqs_seqpacket = {
+ .pru_abort = uipc_abort,
+ .pru_accept = uipc_accept,
+ .pru_attach = uipc_attach,
+ .pru_bind = uipc_bind,
+ .pru_bindat = uipc_bindat,
+ .pru_connect = uipc_connect,
+ .pru_connectat = uipc_connectat,
+ .pru_connect2 = uipc_connect2,
+ .pru_detach = uipc_detach,
+ .pru_disconnect = uipc_disconnect,
+ .pru_listen = uipc_listen,
+ .pru_peeraddr = uipc_peeraddr,
+ .pru_rcvd = uipc_rcvd,
+ .pru_send = uipc_send,
+ .pru_sense = uipc_sense,
+ .pru_shutdown = uipc_shutdown,
+ .pru_sockaddr = uipc_sockaddr,
+ .pru_soreceive = soreceive_generic, /* XXX: or...? */
+ .pru_close = uipc_close,
+};
+
+static struct pr_usrreqs uipc_usrreqs_stream = {
+ .pru_abort = uipc_abort,
+ .pru_accept = uipc_accept,
+ .pru_attach = uipc_attach,
+ .pru_bind = uipc_bind,
+ .pru_bindat = uipc_bindat,
+ .pru_connect = uipc_connect,
+ .pru_connectat = uipc_connectat,
+ .pru_connect2 = uipc_connect2,
+ .pru_detach = uipc_detach,
+ .pru_disconnect = uipc_disconnect,
+ .pru_listen = uipc_listen,
+ .pru_peeraddr = uipc_peeraddr,
+ .pru_rcvd = uipc_rcvd,
+ .pru_send = uipc_send,
+ .pru_sense = uipc_sense,
+ .pru_shutdown = uipc_shutdown,
+ .pru_sockaddr = uipc_sockaddr,
+ .pru_soreceive = soreceive_generic,
+ .pru_close = uipc_close,
+};
+
+static int
+uipc_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+ struct unpcb *unp;
+ struct xucred xu;
+ int error, optval;
+
+ if (sopt->sopt_level != 0)
+ return (EINVAL);
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL"));
+ error = 0;
+ switch (sopt->sopt_dir) {
+ case SOPT_GET:
+ switch (sopt->sopt_name) {
+ case LOCAL_PEERCRED:
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_flags & UNP_HAVEPC)
+ xu = unp->unp_peercred;
+ else {
+ if (so->so_type == SOCK_STREAM)
+ error = ENOTCONN;
+ else
+ error = EINVAL;
+ }
+ UNP_PCB_UNLOCK(unp);
+ if (error == 0)
+ error = sooptcopyout(sopt, &xu, sizeof(xu));
+ break;
+
+ case LOCAL_CREDS:
+ /* Unlocked read. */
+ optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0;
+ error = sooptcopyout(sopt, &optval, sizeof(optval));
+ break;
+
+ case LOCAL_CONNWAIT:
+ /* Unlocked read. */
+ optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
+ error = sooptcopyout(sopt, &optval, sizeof(optval));
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ break;
+
+ case SOPT_SET:
+ switch (sopt->sopt_name) {
+ case LOCAL_CREDS:
+ case LOCAL_CONNWAIT:
+ error = sooptcopyin(sopt, &optval, sizeof(optval),
+ sizeof(optval));
+ if (error)
+ break;
+
+#define OPTSET(bit) do { \
+ UNP_PCB_LOCK(unp); \
+ if (optval) \
+ unp->unp_flags |= bit; \
+ else \
+ unp->unp_flags &= ~bit; \
+ UNP_PCB_UNLOCK(unp); \
+} while (0)
+
+ switch (sopt->sopt_name) {
+ case LOCAL_CREDS:
+ OPTSET(UNP_WANTCRED);
+ break;
+
+ case LOCAL_CONNWAIT:
+ OPTSET(UNP_CONNWAIT);
+ break;
+
+ default:
+ break;
+ }
+ break;
+#undef OPTSET
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ return (error);
+}
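+
+/*
+ * Illustrative userland sketch (not part of this file): the LOCAL_PEERCRED
+ * and LOCAL_CREDS options handled above are driven with get/setsockopt(2) at
+ * level 0 (SOL_LOCAL); LOCAL_PEERCRED fills in a struct xucred describing the
+ * process on the other end of a connected stream or seqpacket socket ("s" is
+ * an assumed connected PF_LOCAL descriptor):
+ *
+ *	struct xucred xu;
+ *	socklen_t len = sizeof(xu);
+ *	int on = 1;
+ *
+ *	if (getsockopt(s, 0, LOCAL_PEERCRED, &xu, &len) == 0)
+ *		printf("peer euid %u\n", (unsigned)xu.cr_uid);
+ *	setsockopt(s, 0, LOCAL_CREDS, &on, sizeof(on));
+ */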
+
+static int
+unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+ return (unp_connectat(AT_FDCWD, so, nam, td));
+}
+
+static int
+unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
+ struct thread *td)
+{
+ struct sockaddr_un *soun = (struct sockaddr_un *)nam;
+ struct vnode *vp;
+ struct socket *so2, *so3;
+ struct unpcb *unp, *unp2, *unp3;
+ struct nameidata nd;
+ char buf[SOCK_MAXADDRLEN];
+ struct sockaddr *sa;
+ cap_rights_t rights;
+ int error, len;
+
+ UNP_LINK_WLOCK_ASSERT();
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
+
+ if (nam->sa_len > sizeof(struct sockaddr_un))
+ return (EINVAL);
+ len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
+ if (len <= 0)
+ return (EINVAL);
+ bcopy(soun->sun_path, buf, len);
+ buf[len] = 0;
+
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_flags & UNP_CONNECTING) {
+ UNP_PCB_UNLOCK(unp);
+ return (EALREADY);
+ }
+ UNP_LINK_WUNLOCK();
+ unp->unp_flags |= UNP_CONNECTING;
+ UNP_PCB_UNLOCK(unp);
+
+ sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
+ NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF,
+ UIO_SYSSPACE, buf, fd, cap_rights_init(&rights, CAP_CONNECTAT), td);
+ error = namei(&nd);
+ if (error)
+ vp = NULL;
+ else
+ vp = nd.ni_vp;
+ ASSERT_VOP_LOCKED(vp, "unp_connect");
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error)
+ goto bad;
+
+ if (vp->v_type != VSOCK) {
+ error = ENOTSOCK;
+ goto bad;
+ }
+#ifdef MAC
+ error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD);
+ if (error)
+ goto bad;
+#endif
+ error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
+ if (error)
+ goto bad;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
+
+ /*
+	 * Lock the linkage lock for two reasons: to make sure v_socket is
+	 * stable, and to protect the simultaneous locking of multiple pcbs.
+ */
+ UNP_LINK_WLOCK();
+ VOP_UNP_CONNECT(vp, &so2);
+ if (so2 == NULL) {
+ error = ECONNREFUSED;
+ goto bad2;
+ }
+ if (so->so_type != so2->so_type) {
+ error = EPROTOTYPE;
+ goto bad2;
+ }
+ if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
+ if (so2->so_options & SO_ACCEPTCONN) {
+ CURVNET_SET(so2->so_vnet);
+ so3 = sonewconn(so2, 0);
+ CURVNET_RESTORE();
+ } else
+ so3 = NULL;
+ if (so3 == NULL) {
+ error = ECONNREFUSED;
+ goto bad2;
+ }
+ unp = sotounpcb(so);
+ unp2 = sotounpcb(so2);
+ unp3 = sotounpcb(so3);
+ UNP_PCB_LOCK(unp);
+ UNP_PCB_LOCK(unp2);
+ UNP_PCB_LOCK(unp3);
+ if (unp2->unp_addr != NULL) {
+ bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
+ unp3->unp_addr = (struct sockaddr_un *) sa;
+ sa = NULL;
+ }
+
+ /*
+ * The connector's (client's) credentials are copied from its
+ * process structure at the time of connect() (which is now).
+ */
+ cru2x(td->td_ucred, &unp3->unp_peercred);
+ unp3->unp_flags |= UNP_HAVEPC;
+
+ /*
+		 * The receiver's (server's) credentials are copied from the
+		 * unp_peercred member of the socket on which listen() was
+		 * called; uipc_listen() cached that process's credentials at
+		 * that time so we can use them now.
+ */
+ KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
+ ("unp_connect: listener without cached peercred"));
+ memcpy(&unp->unp_peercred, &unp2->unp_peercred,
+ sizeof(unp->unp_peercred));
+ unp->unp_flags |= UNP_HAVEPC;
+ if (unp2->unp_flags & UNP_WANTCRED)
+ unp3->unp_flags |= UNP_WANTCRED;
+ UNP_PCB_UNLOCK(unp3);
+ UNP_PCB_UNLOCK(unp2);
+ UNP_PCB_UNLOCK(unp);
+#ifdef MAC
+ mac_socketpeer_set_from_socket(so, so3);
+ mac_socketpeer_set_from_socket(so3, so);
+#endif
+
+ so2 = so3;
+ }
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
+ unp2 = sotounpcb(so2);
+ KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL"));
+ UNP_PCB_LOCK(unp);
+ UNP_PCB_LOCK(unp2);
+ error = unp_connect2(so, so2, PRU_CONNECT);
+ UNP_PCB_UNLOCK(unp2);
+ UNP_PCB_UNLOCK(unp);
+bad2:
+ UNP_LINK_WUNLOCK();
+bad:
+ if (vp != NULL)
+ vput(vp);
+ free(sa, M_SONAME);
+ UNP_LINK_WLOCK();
+ UNP_PCB_LOCK(unp);
+ unp->unp_flags &= ~UNP_CONNECTING;
+ UNP_PCB_UNLOCK(unp);
+ return (error);
+}
+
+static int
+unp_connect2(struct socket *so, struct socket *so2, int req)
+{
+ struct unpcb *unp;
+ struct unpcb *unp2;
+
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("unp_connect2: unp == NULL"));
+ unp2 = sotounpcb(so2);
+ KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));
+
+ UNP_LINK_WLOCK_ASSERT();
+ UNP_PCB_LOCK_ASSERT(unp);
+ UNP_PCB_LOCK_ASSERT(unp2);
+
+ if (so2->so_type != so->so_type)
+ return (EPROTOTYPE);
+ unp->unp_conn = unp2;
+
+ switch (so->so_type) {
+ case SOCK_DGRAM:
+ LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
+ soisconnected(so);
+ break;
+
+ case SOCK_STREAM:
+ case SOCK_SEQPACKET:
+ unp2->unp_conn = unp;
+ if (req == PRU_CONNECT &&
+ ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
+ soisconnecting(so);
+ else
+ soisconnected(so);
+ soisconnected(so2);
+ break;
+
+ default:
+ panic("unp_connect2");
+ }
+ return (0);
+}
+
+static void
+unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
+{
+ struct socket *so;
+
+ KASSERT(unp2 != NULL, ("unp_disconnect: unp2 == NULL"));
+
+ UNP_LINK_WLOCK_ASSERT();
+ UNP_PCB_LOCK_ASSERT(unp);
+ UNP_PCB_LOCK_ASSERT(unp2);
+
+ unp->unp_conn = NULL;
+ switch (unp->unp_socket->so_type) {
+ case SOCK_DGRAM:
+ LIST_REMOVE(unp, unp_reflink);
+ so = unp->unp_socket;
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_ISCONNECTED;
+ SOCK_UNLOCK(so);
+ break;
+
+ case SOCK_STREAM:
+ case SOCK_SEQPACKET:
+ soisdisconnected(unp->unp_socket);
+ unp2->unp_conn = NULL;
+ soisdisconnected(unp2->unp_socket);
+ break;
+ }
+}
+
+/*
+ * unp_pcblist() walks the global list of struct unpcb's to generate a
+ * pointer list, bumping the refcount on each unpcb. It then copies them out
+ * sequentially, validating the generation number on each to see if it has
+ * been detached. All of this is necessary because copyout() may sleep on
+ * disk I/O.
+ */
+static int
+unp_pcblist(SYSCTL_HANDLER_ARGS)
+{
+ int error, i, n;
+ int freeunp;
+ struct unpcb *unp, **unp_list;
+ unp_gen_t gencnt;
+ struct xunpgen *xug;
+ struct unp_head *head;
+ struct xunpcb *xu;
+
+ switch ((intptr_t)arg1) {
+ case SOCK_STREAM:
+ head = &unp_shead;
+ break;
+
+ case SOCK_DGRAM:
+ head = &unp_dhead;
+ break;
+
+ case SOCK_SEQPACKET:
+ head = &unp_sphead;
+ break;
+
+ default:
+ panic("unp_pcblist: arg1 %d", (int)(intptr_t)arg1);
+ }
+
+ /*
+	 * Preparing the PCB list is too time-consuming and resource-intensive
+	 * to perform twice on every request, so when the caller is only
+	 * sizing its buffer, return a generous space estimate instead.
+ */
+ if (req->oldptr == NULL) {
+ n = unp_count;
+ req->oldidx = 2 * (sizeof *xug)
+ + (n + n/8) * sizeof(struct xunpcb);
+ return (0);
+ }
+
+ if (req->newptr != NULL)
+ return (EPERM);
+
+ /*
+ * OK, now we're committed to doing something.
+ */
+ xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
+ UNP_LIST_LOCK();
+ gencnt = unp_gencnt;
+ n = unp_count;
+ UNP_LIST_UNLOCK();
+
+ xug->xug_len = sizeof *xug;
+ xug->xug_count = n;
+ xug->xug_gen = gencnt;
+ xug->xug_sogen = so_gencnt;
+ error = SYSCTL_OUT(req, xug, sizeof *xug);
+ if (error) {
+ free(xug, M_TEMP);
+ return (error);
+ }
+
+ unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
+
+ UNP_LIST_LOCK();
+ for (unp = LIST_FIRST(head), i = 0; unp && i < n;
+ unp = LIST_NEXT(unp, unp_link)) {
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_gencnt <= gencnt) {
+ if (cr_cansee(req->td->td_ucred,
+ unp->unp_socket->so_cred)) {
+ UNP_PCB_UNLOCK(unp);
+ continue;
+ }
+ unp_list[i++] = unp;
+ unp->unp_refcount++;
+ }
+ UNP_PCB_UNLOCK(unp);
+ }
+ UNP_LIST_UNLOCK();
+ n = i; /* In case we lost some during malloc. */
+
+ error = 0;
+ xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO);
+ for (i = 0; i < n; i++) {
+ unp = unp_list[i];
+ UNP_PCB_LOCK(unp);
+ unp->unp_refcount--;
+ if (unp->unp_refcount != 0 && unp->unp_gencnt <= gencnt) {
+ xu->xu_len = sizeof *xu;
+ xu->xu_unpp = unp;
+ /*
+ * XXX - need more locking here to protect against
+ * connect/disconnect races for SMP.
+ */
+ if (unp->unp_addr != NULL)
+ bcopy(unp->unp_addr, &xu->xu_addr,
+ unp->unp_addr->sun_len);
+ if (unp->unp_conn != NULL &&
+ unp->unp_conn->unp_addr != NULL)
+ bcopy(unp->unp_conn->unp_addr,
+ &xu->xu_caddr,
+ unp->unp_conn->unp_addr->sun_len);
+ bcopy(unp, &xu->xu_unp, sizeof *unp);
+ sotoxsocket(unp->unp_socket, &xu->xu_socket);
+ UNP_PCB_UNLOCK(unp);
+ error = SYSCTL_OUT(req, xu, sizeof *xu);
+ } else {
+ freeunp = (unp->unp_refcount == 0);
+ UNP_PCB_UNLOCK(unp);
+ if (freeunp) {
+ UNP_PCB_LOCK_DESTROY(unp);
+ uma_zfree(unp_zone, unp);
+ }
+ }
+ }
+ free(xu, M_TEMP);
+ if (!error) {
+ /*
+ * Give the user an updated idea of our state. If the
+ * generation differs from what we told her before, she knows
+ * that something happened while we were processing this
+ * request, and it might be necessary to retry.
+ */
+ xug->xug_gen = unp_gencnt;
+ xug->xug_sogen = so_gencnt;
+ xug->xug_count = unp_count;
+ error = SYSCTL_OUT(req, xug, sizeof *xug);
+ }
+ free(unp_list, M_TEMP);
+ free(xug, M_TEMP);
+ return (error);
+}
+
+SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ (void *)(intptr_t)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
+ "List of active local datagram sockets");
+SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ (void *)(intptr_t)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
+ "List of active local stream sockets");
+SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist,
+ CTLTYPE_OPAQUE | CTLFLAG_RD,
+ (void *)(intptr_t)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb",
+ "List of active local seqpacket sockets");
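+
+/*
+ * Illustrative userland sketch (not part of this file): these pcblist sysctls
+ * are consumed with sysctl(3), e.g. by netstat(1).  The returned buffer is a
+ * struct xunpgen header, a sequence of struct xunpcb records, and a trailing
+ * struct xunpgen:
+ *
+ *	struct xunpgen xug;
+ *	size_t len = 0;
+ *	char *buf;
+ *
+ *	if (sysctlbyname("net.local.stream.pcblist", NULL, &len, NULL, 0) == 0 &&
+ *	    (buf = malloc(len)) != NULL &&
+ *	    sysctlbyname("net.local.stream.pcblist", buf, &len, NULL, 0) == 0) {
+ *		memcpy(&xug, buf, sizeof(xug));
+ *		printf("%u local stream pcbs\n", (unsigned)xug.xug_count);
+ *	}
+ */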
+
+static void
+unp_shutdown(struct unpcb *unp)
+{
+ struct unpcb *unp2;
+ struct socket *so;
+
+ UNP_LINK_WLOCK_ASSERT();
+ UNP_PCB_LOCK_ASSERT(unp);
+
+ unp2 = unp->unp_conn;
+ if ((unp->unp_socket->so_type == SOCK_STREAM ||
+ (unp->unp_socket->so_type == SOCK_SEQPACKET)) && unp2 != NULL) {
+ so = unp2->unp_socket;
+ if (so != NULL)
+ socantrcvmore(so);
+ }
+}
+
+static void
+unp_drop(struct unpcb *unp, int errno)
+{
+ struct socket *so = unp->unp_socket;
+ struct unpcb *unp2;
+
+ UNP_LINK_WLOCK_ASSERT();
+ UNP_PCB_LOCK_ASSERT(unp);
+
+ so->so_error = errno;
+ unp2 = unp->unp_conn;
+ if (unp2 == NULL)
+ return;
+ UNP_PCB_LOCK(unp2);
+ unp_disconnect(unp, unp2);
+ UNP_PCB_UNLOCK(unp2);
+}
+
+static void
+unp_freerights(struct filedescent **fdep, int fdcount)
+{
+ struct file *fp;
+ int i;
+
+ KASSERT(fdcount > 0, ("%s: fdcount %d", __func__, fdcount));
+
+ for (i = 0; i < fdcount; i++) {
+ fp = fdep[i]->fde_file;
+ filecaps_free(&fdep[i]->fde_caps);
+ unp_discard(fp);
+ }
+ free(fdep[0], M_FILECAPS);
+}
+
+static int
+unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags)
+{
+ struct thread *td = curthread; /* XXX */
+ struct cmsghdr *cm = mtod(control, struct cmsghdr *);
+ int i;
+ int *fdp;
+ struct filedesc *fdesc = td->td_proc->p_fd;
+ struct filedescent *fde, **fdep;
+ void *data;
+ socklen_t clen = control->m_len, datalen;
+ int error, newfds;
+ u_int newlen;
+
+ UNP_LINK_UNLOCK_ASSERT();
+
+ error = 0;
+ if (controlp != NULL) /* controlp == NULL => free control messages */
+ *controlp = NULL;
+ while (cm != NULL) {
+ if (sizeof(*cm) > clen || cm->cmsg_len > clen) {
+ error = EINVAL;
+ break;
+ }
+ data = CMSG_DATA(cm);
+ datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
+ if (cm->cmsg_level == SOL_SOCKET
+ && cm->cmsg_type == SCM_RIGHTS) {
+ newfds = datalen / sizeof(*fdep);
+ if (newfds == 0)
+ goto next;
+ fdep = data;
+
+			/* If we're not outputting the descriptors, free them. */
+ if (error || controlp == NULL) {
+ unp_freerights(fdep, newfds);
+ goto next;
+ }
+ FILEDESC_XLOCK(fdesc);
+
+ /*
+			 * Now change each pointer to a file in the global
+			 * table into an integer index into the receiver's fd
+			 * table, with each new fd table entry set up to point
+			 * to the file being transferred.
+ */
+ newlen = newfds * sizeof(int);
+ *controlp = sbcreatecontrol(NULL, newlen,
+ SCM_RIGHTS, SOL_SOCKET);
+ if (*controlp == NULL) {
+ FILEDESC_XUNLOCK(fdesc);
+ error = E2BIG;
+ unp_freerights(fdep, newfds);
+ goto next;
+ }
+
+ fdp = (int *)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ if (fdallocn(td, 0, fdp, newfds) != 0) {
+ FILEDESC_XUNLOCK(td->td_proc->p_fd);
+ error = EMSGSIZE;
+ unp_freerights(fdep, newfds);
+ m_freem(*controlp);
+ *controlp = NULL;
+ goto next;
+ }
+ for (i = 0; i < newfds; i++, fdp++) {
+ fde = &fdesc->fd_ofiles[*fdp];
+ fde->fde_file = fdep[i]->fde_file;
+ filecaps_move(&fdep[i]->fde_caps,
+ &fde->fde_caps);
+ if ((flags & MSG_CMSG_CLOEXEC) != 0)
+ fde->fde_flags |= UF_EXCLOSE;
+ unp_externalize_fp(fde->fde_file);
+ }
+ FILEDESC_XUNLOCK(fdesc);
+ free(fdep[0], M_FILECAPS);
+ } else {
+ /* We can just copy anything else across. */
+ if (error || controlp == NULL)
+ goto next;
+ *controlp = sbcreatecontrol(NULL, datalen,
+ cm->cmsg_type, cm->cmsg_level);
+ if (*controlp == NULL) {
+ error = ENOBUFS;
+ goto next;
+ }
+ bcopy(data,
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
+ datalen);
+ }
+ controlp = &(*controlp)->m_next;
+
+next:
+ if (CMSG_SPACE(datalen) < clen) {
+ clen -= CMSG_SPACE(datalen);
+ cm = (struct cmsghdr *)
+ ((caddr_t)cm + CMSG_SPACE(datalen));
+ } else {
+ clen = 0;
+ cm = NULL;
+ }
+ }
+
+ m_freem(control);
+ return (error);
+}
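+
+/*
+ * Illustrative userland sketch (not part of this file): unp_externalize()
+ * above is the kernel half of receiving descriptors; from user space the same
+ * SCM_RIGHTS message is picked up with recvmsg(2), where MSG_CMSG_CLOEXEC
+ * maps to the UF_EXCLOSE handling above ("s" is an assumed connected PF_LOCAL
+ * socket):
+ *
+ *	union { struct cmsghdr hdr; char buf[CMSG_SPACE(sizeof(int))]; } cm;
+ *	char c;
+ *	struct iovec iov = { &c, 1 };
+ *	struct msghdr msg;
+ *	struct cmsghdr *cmsg;
+ *	int newfd = -1;
+ *
+ *	memset(&msg, 0, sizeof(msg));
+ *	msg.msg_iov = &iov;
+ *	msg.msg_iovlen = 1;
+ *	msg.msg_control = cm.buf;
+ *	msg.msg_controllen = sizeof(cm.buf);
+ *	if (recvmsg(s, &msg, MSG_CMSG_CLOEXEC) > 0 &&
+ *	    (cmsg = CMSG_FIRSTHDR(&msg)) != NULL &&
+ *	    cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS)
+ *		memcpy(&newfd, CMSG_DATA(cmsg), sizeof(int));
+ */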
+
+static void
+unp_zone_change(void *tag)
+{
+
+ uma_zone_set_max(unp_zone, maxsockets);
+}
+
+static void
+unp_init(void)
+{
+
+#ifdef VIMAGE
+ if (!IS_DEFAULT_VNET(curvnet))
+ return;
+#endif
+ unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, 0);
+ if (unp_zone == NULL)
+ panic("unp_init");
+ uma_zone_set_max(unp_zone, maxsockets);
+ uma_zone_set_warning(unp_zone, "kern.ipc.maxsockets limit reached");
+ EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change,
+ NULL, EVENTHANDLER_PRI_ANY);
+ LIST_INIT(&unp_dhead);
+ LIST_INIT(&unp_shead);
+ LIST_INIT(&unp_sphead);
+ SLIST_INIT(&unp_defers);
+ TIMEOUT_TASK_INIT(taskqueue_thread, &unp_gc_task, 0, unp_gc, NULL);
+ TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL);
+ UNP_LINK_LOCK_INIT();
+ UNP_LIST_LOCK_INIT();
+ UNP_DEFERRED_LOCK_INIT();
+}
+
+static int
+unp_internalize(struct mbuf **controlp, struct thread *td)
+{
+ struct mbuf *control = *controlp;
+ struct proc *p = td->td_proc;
+ struct filedesc *fdesc = p->p_fd;
+ struct bintime *bt;
+ struct cmsghdr *cm = mtod(control, struct cmsghdr *);
+ struct cmsgcred *cmcred;
+ struct filedescent *fde, **fdep, *fdev;
+ struct file *fp;
+ struct timeval *tv;
+ int i, fd, *fdp;
+ void *data;
+ socklen_t clen = control->m_len, datalen;
+ int error, oldfds;
+ u_int newlen;
+
+ UNP_LINK_UNLOCK_ASSERT();
+
+ error = 0;
+ *controlp = NULL;
+ while (cm != NULL) {
+ if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
+ || cm->cmsg_len > clen) {
+ error = EINVAL;
+ goto out;
+ }
+ data = CMSG_DATA(cm);
+ datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
+
+ switch (cm->cmsg_type) {
+ /*
+ * Fill in credential information.
+ */
+ case SCM_CREDS:
+ *controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
+ SCM_CREDS, SOL_SOCKET);
+ if (*controlp == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+ cmcred = (struct cmsgcred *)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ cmcred->cmcred_pid = p->p_pid;
+ cmcred->cmcred_uid = td->td_ucred->cr_ruid;
+ cmcred->cmcred_gid = td->td_ucred->cr_rgid;
+ cmcred->cmcred_euid = td->td_ucred->cr_uid;
+ cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
+ CMGROUP_MAX);
+ for (i = 0; i < cmcred->cmcred_ngroups; i++)
+ cmcred->cmcred_groups[i] =
+ td->td_ucred->cr_groups[i];
+ break;
+
+ case SCM_RIGHTS:
+ oldfds = datalen / sizeof (int);
+ if (oldfds == 0)
+ break;
+ /*
+ * Check that all the FDs passed in refer to legal
+ * files. If not, reject the entire operation.
+ */
+ fdp = data;
+ FILEDESC_SLOCK(fdesc);
+ for (i = 0; i < oldfds; i++) {
+ fd = *fdp++;
+ if (fget_locked(fdesc, fd) == NULL) {
+ FILEDESC_SUNLOCK(fdesc);
+ error = EBADF;
+ goto out;
+ }
+ fp = fdesc->fd_ofiles[fd].fde_file;
+ if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
+ FILEDESC_SUNLOCK(fdesc);
+ error = EOPNOTSUPP;
+ goto out;
+ }
+
+ }
+
+ /*
+ * Now replace the integer FDs with pointers to the
+ * file structure and capability rights.
+ */
+ newlen = oldfds * sizeof(fdep[0]);
+ *controlp = sbcreatecontrol(NULL, newlen,
+ SCM_RIGHTS, SOL_SOCKET);
+ if (*controlp == NULL) {
+ FILEDESC_SUNLOCK(fdesc);
+ error = E2BIG;
+ goto out;
+ }
+ fdp = data;
+ fdep = (struct filedescent **)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS,
+ M_WAITOK);
+ for (i = 0; i < oldfds; i++, fdev++, fdp++) {
+ fde = &fdesc->fd_ofiles[*fdp];
+ fdep[i] = fdev;
+ fdep[i]->fde_file = fde->fde_file;
+ filecaps_copy(&fde->fde_caps,
+ &fdep[i]->fde_caps);
+ unp_internalize_fp(fdep[i]->fde_file);
+ }
+ FILEDESC_SUNLOCK(fdesc);
+ break;
+
+ case SCM_TIMESTAMP:
+ *controlp = sbcreatecontrol(NULL, sizeof(*tv),
+ SCM_TIMESTAMP, SOL_SOCKET);
+ if (*controlp == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+ tv = (struct timeval *)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ microtime(tv);
+ break;
+
+ case SCM_BINTIME:
+ *controlp = sbcreatecontrol(NULL, sizeof(*bt),
+ SCM_BINTIME, SOL_SOCKET);
+ if (*controlp == NULL) {
+ error = ENOBUFS;
+ goto out;
+ }
+ bt = (struct bintime *)
+ CMSG_DATA(mtod(*controlp, struct cmsghdr *));
+ bintime(bt);
+ break;
+
+ default:
+ error = EINVAL;
+ goto out;
+ }
+
+ controlp = &(*controlp)->m_next;
+ if (CMSG_SPACE(datalen) < clen) {
+ clen -= CMSG_SPACE(datalen);
+ cm = (struct cmsghdr *)
+ ((caddr_t)cm + CMSG_SPACE(datalen));
+ } else {
+ clen = 0;
+ cm = NULL;
+ }
+ }
+
+out:
+ m_freem(control);
+ return (error);
+}
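+
+/*
+ * Illustrative userland sketch (not part of this file): the SCM_RIGHTS case
+ * above internalizes descriptors that user space sends with sendmsg(2); a
+ * minimal sender looks like this ("s" and "fd_to_pass" are assumed to be an
+ * existing PF_LOCAL socket and the descriptor being transferred):
+ *
+ *	union { struct cmsghdr hdr; char buf[CMSG_SPACE(sizeof(int))]; } cm;
+ *	char c = 0;
+ *	struct iovec iov = { &c, 1 };
+ *	struct msghdr msg;
+ *	struct cmsghdr *cmsg;
+ *
+ *	memset(&msg, 0, sizeof(msg));
+ *	msg.msg_iov = &iov;
+ *	msg.msg_iovlen = 1;
+ *	msg.msg_control = cm.buf;
+ *	msg.msg_controllen = sizeof(cm.buf);
+ *	cmsg = CMSG_FIRSTHDR(&msg);
+ *	cmsg->cmsg_level = SOL_SOCKET;
+ *	cmsg->cmsg_type = SCM_RIGHTS;
+ *	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+ *	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
+ *	if (sendmsg(s, &msg, 0) == -1)
+ *		err(1, "sendmsg");
+ */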
+
+static struct mbuf *
+unp_addsockcred(struct thread *td, struct mbuf *control)
+{
+ struct mbuf *m, *n, *n_prev;
+ struct sockcred *sc;
+ const struct cmsghdr *cm;
+ int ngroups;
+ int i;
+
+ ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX);
+ m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET);
+ if (m == NULL)
+ return (control);
+
+ sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *));
+ sc->sc_uid = td->td_ucred->cr_ruid;
+ sc->sc_euid = td->td_ucred->cr_uid;
+ sc->sc_gid = td->td_ucred->cr_rgid;
+ sc->sc_egid = td->td_ucred->cr_gid;
+ sc->sc_ngroups = ngroups;
+ for (i = 0; i < sc->sc_ngroups; i++)
+ sc->sc_groups[i] = td->td_ucred->cr_groups[i];
+
+ /*
+	 * Unlink SCM_CREDS control messages (struct cmsgcred), since the
+	 * just-created SCM_CREDS control message (struct sockcred) has a
+	 * different format.
+ */
+ if (control != NULL)
+ for (n = control, n_prev = NULL; n != NULL;) {
+ cm = mtod(n, struct cmsghdr *);
+ if (cm->cmsg_level == SOL_SOCKET &&
+ cm->cmsg_type == SCM_CREDS) {
+ if (n_prev == NULL)
+ control = n->m_next;
+ else
+ n_prev->m_next = n->m_next;
+ n = m_free(n);
+ } else {
+ n_prev = n;
+ n = n->m_next;
+ }
+ }
+
+ /* Prepend it to the head. */
+ m->m_next = control;
+ return (m);
+}
+
+static struct unpcb *
+fptounp(struct file *fp)
+{
+ struct socket *so;
+
+ if (fp->f_type != DTYPE_SOCKET)
+ return (NULL);
+ if ((so = fp->f_data) == NULL)
+ return (NULL);
+ if (so->so_proto->pr_domain != &localdomain)
+ return (NULL);
+ return sotounpcb(so);
+}
+
+static void
+unp_discard(struct file *fp)
+{
+ struct unp_defer *dr;
+
+ if (unp_externalize_fp(fp)) {
+ dr = malloc(sizeof(*dr), M_TEMP, M_WAITOK);
+ dr->ud_fp = fp;
+ UNP_DEFERRED_LOCK();
+ SLIST_INSERT_HEAD(&unp_defers, dr, ud_link);
+ UNP_DEFERRED_UNLOCK();
+ atomic_add_int(&unp_defers_count, 1);
+ taskqueue_enqueue(taskqueue_thread, &unp_defer_task);
+ } else
+ (void) closef(fp, (struct thread *)NULL);
+}
+
+static void
+unp_process_defers(void *arg __unused, int pending)
+{
+ struct unp_defer *dr;
+ SLIST_HEAD(, unp_defer) drl;
+ int count;
+
+ SLIST_INIT(&drl);
+ for (;;) {
+ UNP_DEFERRED_LOCK();
+ if (SLIST_FIRST(&unp_defers) == NULL) {
+ UNP_DEFERRED_UNLOCK();
+ break;
+ }
+ SLIST_SWAP(&unp_defers, &drl, unp_defer);
+ UNP_DEFERRED_UNLOCK();
+ count = 0;
+ while ((dr = SLIST_FIRST(&drl)) != NULL) {
+ SLIST_REMOVE_HEAD(&drl, ud_link);
+ closef(dr->ud_fp, NULL);
+ free(dr, M_TEMP);
+ count++;
+ }
+ atomic_add_int(&unp_defers_count, -count);
+ }
+}
+
+static void
+unp_internalize_fp(struct file *fp)
+{
+ struct unpcb *unp;
+
+ UNP_LINK_WLOCK();
+ if ((unp = fptounp(fp)) != NULL) {
+ unp->unp_file = fp;
+ unp->unp_msgcount++;
+ }
+ fhold(fp);
+ unp_rights++;
+ UNP_LINK_WUNLOCK();
+}
+
+static int
+unp_externalize_fp(struct file *fp)
+{
+ struct unpcb *unp;
+ int ret;
+
+ UNP_LINK_WLOCK();
+ if ((unp = fptounp(fp)) != NULL) {
+ unp->unp_msgcount--;
+ ret = 1;
+ } else
+ ret = 0;
+ unp_rights--;
+ UNP_LINK_WUNLOCK();
+ return (ret);
+}
+
+/*
+ * unp_marked indicates whether additional sockets were newly marked reachable
+ * during the current pass through unp_gc() (so another pass is required), and
+ * unp_unreachable counts sockets that appear to be dead.  Both are used only
+ * by the GC task and do not require explicit synchronization.
+ */
+static int unp_marked;
+static int unp_unreachable;
+
+static void
+unp_accessable(struct filedescent **fdep, int fdcount)
+{
+ struct unpcb *unp;
+ struct file *fp;
+ int i;
+
+ for (i = 0; i < fdcount; i++) {
+ fp = fdep[i]->fde_file;
+ if ((unp = fptounp(fp)) == NULL)
+ continue;
+ if (unp->unp_gcflag & UNPGC_REF)
+ continue;
+ unp->unp_gcflag &= ~UNPGC_DEAD;
+ unp->unp_gcflag |= UNPGC_REF;
+ unp_marked++;
+ }
+}
+
+static void
+unp_gc_process(struct unpcb *unp)
+{
+ struct socket *soa;
+ struct socket *so;
+ struct file *fp;
+
+ /* Already processed. */
+ if (unp->unp_gcflag & UNPGC_SCANNED)
+ return;
+ fp = unp->unp_file;
+
+ /*
+ * Check for a socket potentially in a cycle. It must be in a
+ * queue as indicated by msgcount, and this must equal the file
+ * reference count. Note that when msgcount is 0 the file is NULL.
+ */
+ if ((unp->unp_gcflag & UNPGC_REF) == 0 && fp &&
+ unp->unp_msgcount != 0 && fp->f_count == unp->unp_msgcount) {
+ unp->unp_gcflag |= UNPGC_DEAD;
+ unp_unreachable++;
+ return;
+ }
+
+ /*
+	 * Mark all sockets this socket references via SCM_RIGHTS messages in
+	 * its receive buffer.
+ */
+ so = unp->unp_socket;
+ SOCKBUF_LOCK(&so->so_rcv);
+ unp_scan(so->so_rcv.sb_mb, unp_accessable);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+
+ /*
+ * Mark all sockets in our accept queue.
+ */
+ ACCEPT_LOCK();
+ TAILQ_FOREACH(soa, &so->so_comp, so_list) {
+ SOCKBUF_LOCK(&soa->so_rcv);
+ unp_scan(soa->so_rcv.sb_mb, unp_accessable);
+ SOCKBUF_UNLOCK(&soa->so_rcv);
+ }
+ ACCEPT_UNLOCK();
+ unp->unp_gcflag |= UNPGC_SCANNED;
+}
+
+static int unp_recycled;
+SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0,
+ "Number of unreachable sockets claimed by the garbage collector.");
+
+static int unp_taskcount;
+SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0,
+ "Number of times the garbage collector has run.");
+
+static void
+unp_gc(__unused void *arg, int pending)
+{
+ struct unp_head *heads[] = { &unp_dhead, &unp_shead, &unp_sphead,
+ NULL };
+ struct unp_head **head;
+ struct file *f, **unref;
+ struct unpcb *unp;
+ int i, total;
+
+ unp_taskcount++;
+ UNP_LIST_LOCK();
+ /*
+ * First clear all gc flags from previous runs.
+ */
+ for (head = heads; *head != NULL; head++)
+ LIST_FOREACH(unp, *head, unp_link)
+ unp->unp_gcflag = 0;
+
+ /*
+ * Scan marking all reachable sockets with UNPGC_REF. Once a socket
+ * is reachable all of the sockets it references are reachable.
+ * Stop the scan once we do a complete loop without discovering
+ * a new reachable socket.
+ */
+ do {
+ unp_unreachable = 0;
+ unp_marked = 0;
+ for (head = heads; *head != NULL; head++)
+ LIST_FOREACH(unp, *head, unp_link)
+ unp_gc_process(unp);
+ } while (unp_marked);
+ UNP_LIST_UNLOCK();
+ if (unp_unreachable == 0)
+ return;
+
+ /*
+	 * Allocate space for a local array of file pointers for the dead
+	 * sockets.
+ */
+ unref = malloc(unp_unreachable * sizeof(struct file *),
+ M_TEMP, M_WAITOK);
+
+ /*
+ * Iterate looking for sockets which have been specifically marked
+	 * as unreachable and store them locally.
+ */
+ UNP_LINK_RLOCK();
+ UNP_LIST_LOCK();
+ for (total = 0, head = heads; *head != NULL; head++)
+ LIST_FOREACH(unp, *head, unp_link)
+ if ((unp->unp_gcflag & UNPGC_DEAD) != 0) {
+ f = unp->unp_file;
+ if (unp->unp_msgcount == 0 || f == NULL ||
+ f->f_count != unp->unp_msgcount)
+ continue;
+ unref[total++] = f;
+ fhold(f);
+ KASSERT(total <= unp_unreachable,
+ ("unp_gc: incorrect unreachable count."));
+ }
+ UNP_LIST_UNLOCK();
+ UNP_LINK_RUNLOCK();
+
+ /*
+	 * Now flush all sockets, freeing rights.  This will free the
+	 * struct files associated with these sockets but leave each socket
+	 * with one remaining reference.
+ */
+ for (i = 0; i < total; i++) {
+ struct socket *so;
+
+ so = unref[i]->f_data;
+ CURVNET_SET(so->so_vnet);
+ sorflush(so);
+ CURVNET_RESTORE();
+ }
+
+ /*
+ * And finally release the sockets so they can be reclaimed.
+ */
+ for (i = 0; i < total; i++)
+ fdrop(unref[i], NULL);
+ unp_recycled += total;
+ free(unref, M_TEMP);
+}
+
+static void
+unp_dispose(struct mbuf *m)
+{
+
+ if (m)
+ unp_scan(m, unp_freerights);
+}
+
+static void
+unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int))
+{
+ struct mbuf *m;
+ struct cmsghdr *cm;
+ void *data;
+ socklen_t clen, datalen;
+
+ while (m0 != NULL) {
+ for (m = m0; m; m = m->m_next) {
+ if (m->m_type != MT_CONTROL)
+ continue;
+
+ cm = mtod(m, struct cmsghdr *);
+ clen = m->m_len;
+
+ while (cm != NULL) {
+ if (sizeof(*cm) > clen || cm->cmsg_len > clen)
+ break;
+
+ data = CMSG_DATA(cm);
+ datalen = (caddr_t)cm + cm->cmsg_len
+ - (caddr_t)data;
+
+ if (cm->cmsg_level == SOL_SOCKET &&
+ cm->cmsg_type == SCM_RIGHTS) {
+ (*op)(data, datalen /
+ sizeof(struct filedescent *));
+ }
+
+ if (CMSG_SPACE(datalen) < clen) {
+ clen -= CMSG_SPACE(datalen);
+ cm = (struct cmsghdr *)
+ ((caddr_t)cm + CMSG_SPACE(datalen));
+ } else {
+ clen = 0;
+ cm = NULL;
+ }
+ }
+ }
+ m0 = m0->m_act;
+ }
+}
+
+/*
+ * A helper function called by VFS before socket-type vnode reclamation.
+ * For an active vnode it clears the unp_vnode pointer and drops the vnode's
+ * use count.
+ */
+void
+vfs_unp_reclaim(struct vnode *vp)
+{
+ struct socket *so;
+ struct unpcb *unp;
+ int active;
+
+ ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim");
+ KASSERT(vp->v_type == VSOCK,
+ ("vfs_unp_reclaim: vp->v_type != VSOCK"));
+
+ active = 0;
+ UNP_LINK_WLOCK();
+ VOP_UNP_CONNECT(vp, &so);
+ if (so == NULL)
+ goto done;
+ unp = sotounpcb(so);
+ if (unp == NULL)
+ goto done;
+ UNP_PCB_LOCK(unp);
+ if (unp->unp_vnode == vp) {
+ VOP_UNP_DETACH(vp);
+ unp->unp_vnode = NULL;
+ active = 1;
+ }
+ UNP_PCB_UNLOCK(unp);
+done:
+ UNP_LINK_WUNLOCK();
+ if (active)
+ vunref(vp);
+}
+
+#ifdef DDB
+static void
+db_print_indent(int indent)
+{
+ int i;
+
+ for (i = 0; i < indent; i++)
+ db_printf(" ");
+}
+
+static void
+db_print_unpflags(int unp_flags)
+{
+ int comma;
+
+ comma = 0;
+ if (unp_flags & UNP_HAVEPC) {
+ db_printf("%sUNP_HAVEPC", comma ? ", " : "");
+ comma = 1;
+ }
+ if (unp_flags & UNP_HAVEPCCACHED) {
+ db_printf("%sUNP_HAVEPCCACHED", comma ? ", " : "");
+ comma = 1;
+ }
+ if (unp_flags & UNP_WANTCRED) {
+ db_printf("%sUNP_WANTCRED", comma ? ", " : "");
+ comma = 1;
+ }
+ if (unp_flags & UNP_CONNWAIT) {
+ db_printf("%sUNP_CONNWAIT", comma ? ", " : "");
+ comma = 1;
+ }
+ if (unp_flags & UNP_CONNECTING) {
+ db_printf("%sUNP_CONNECTING", comma ? ", " : "");
+ comma = 1;
+ }
+ if (unp_flags & UNP_BINDING) {
+ db_printf("%sUNP_BINDING", comma ? ", " : "");
+ comma = 1;
+ }
+}
+
+static void
+db_print_xucred(int indent, struct xucred *xu)
+{
+ int comma, i;
+
+ db_print_indent(indent);
+ db_printf("cr_version: %u cr_uid: %u cr_ngroups: %d\n",
+ xu->cr_version, xu->cr_uid, xu->cr_ngroups);
+ db_print_indent(indent);
+ db_printf("cr_groups: ");
+ comma = 0;
+ for (i = 0; i < xu->cr_ngroups; i++) {
+ db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]);
+ comma = 1;
+ }
+ db_printf("\n");
+}
+
+static void
+db_print_unprefs(int indent, struct unp_head *uh)
+{
+ struct unpcb *unp;
+ int counter;
+
+ counter = 0;
+ LIST_FOREACH(unp, uh, unp_reflink) {
+ if (counter % 4 == 0)
+ db_print_indent(indent);
+ db_printf("%p ", unp);
+ if (counter % 4 == 3)
+ db_printf("\n");
+ counter++;
+ }
+ if (counter != 0 && counter % 4 != 0)
+ db_printf("\n");
+}
+
+DB_SHOW_COMMAND(unpcb, db_show_unpcb)
+{
+ struct unpcb *unp;
+
+ if (!have_addr) {
+ db_printf("usage: show unpcb <addr>\n");
+ return;
+ }
+ unp = (struct unpcb *)addr;
+
+ db_printf("unp_socket: %p unp_vnode: %p\n", unp->unp_socket,
+ unp->unp_vnode);
+
+ db_printf("unp_ino: %ju unp_conn: %p\n", (uintmax_t)unp->unp_ino,
+ unp->unp_conn);
+
+ db_printf("unp_refs:\n");
+ db_print_unprefs(2, &unp->unp_refs);
+
+ /* XXXRW: Would be nice to print the full address, if any. */
+ db_printf("unp_addr: %p\n", unp->unp_addr);
+
+ db_printf("unp_cc: %d unp_mbcnt: %d unp_gencnt: %llu\n",
+ unp->unp_cc, unp->unp_mbcnt,
+ (unsigned long long)unp->unp_gencnt);
+
+ db_printf("unp_flags: %x (", unp->unp_flags);
+ db_print_unpflags(unp->unp_flags);
+ db_printf(")\n");
+
+ db_printf("unp_peercred:\n");
+ db_print_xucred(2, &unp->unp_peercred);
+
+ db_printf("unp_refcount: %u\n", unp->unp_refcount);
+}
+#endif
diff --git a/sys/kern/vfs_acl.c b/sys/kern/vfs_acl.c
new file mode 100644
index 0000000..362792b
--- /dev/null
+++ b/sys/kern/vfs_acl.c
@@ -0,0 +1,562 @@
+/*-
+ * Copyright (c) 1999-2006 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Developed by the TrustedBSD Project.
+ *
+ * ACL system calls and other functions common across different ACL types.
+ * Type-specific routines go into subr_acl_<type>.c.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/capability.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/proc.h>
+#include <sys/sysent.h>
+#include <sys/acl.h>
+
+#include <security/mac/mac_framework.h>
+
+CTASSERT(ACL_MAX_ENTRIES >= OLDACL_MAX_ENTRIES);
+
+MALLOC_DEFINE(M_ACL, "acl", "Access Control Lists");
+
+static int vacl_set_acl(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+static int vacl_get_acl(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+static int vacl_aclcheck(struct thread *td, struct vnode *vp,
+ acl_type_t type, struct acl *aclp);
+
+int
+acl_copy_oldacl_into_acl(const struct oldacl *source, struct acl *dest)
+{
+ int i;
+
+ if (source->acl_cnt < 0 || source->acl_cnt > OLDACL_MAX_ENTRIES)
+ return (EINVAL);
+
+ bzero(dest, sizeof(*dest));
+
+ dest->acl_cnt = source->acl_cnt;
+ dest->acl_maxcnt = ACL_MAX_ENTRIES;
+
+ for (i = 0; i < dest->acl_cnt; i++) {
+ dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag;
+ dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id;
+ dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm;
+ }
+
+ return (0);
+}
+
+int
+acl_copy_acl_into_oldacl(const struct acl *source, struct oldacl *dest)
+{
+ int i;
+
+ if (source->acl_cnt > OLDACL_MAX_ENTRIES)
+ return (EINVAL);
+
+ bzero(dest, sizeof(*dest));
+
+ dest->acl_cnt = source->acl_cnt;
+
+ for (i = 0; i < dest->acl_cnt; i++) {
+ dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag;
+ dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id;
+ dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm;
+ }
+
+ return (0);
+}
+
+/*
+ * At one time, "struct acl" was extended in order to add support for NFSv4
+ * ACLs.  Instead of creating compatibility versions of all the ACL-related
+ * syscalls, the existing ones were left intact.  What the code calling these
+ * syscalls (libc) expects can be determined from the "type" argument: if it
+ * is either ACL_TYPE_ACCESS_OLD or ACL_TYPE_DEFAULT_OLD (which previously
+ * were known as ACL_TYPE_ACCESS and ACL_TYPE_DEFAULT), then it is a "struct
+ * oldacl"; anything else means the new "struct acl".  In the latter case the
+ * routines below just copyin/copyout the contents.  In the former case they
+ * copyin a "struct oldacl" and convert it to the new format.
+ */
+static int
+acl_copyin(void *user_acl, struct acl *kernel_acl, acl_type_t type)
+{
+ int error;
+ struct oldacl old;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS_OLD:
+ case ACL_TYPE_DEFAULT_OLD:
+ error = copyin(user_acl, &old, sizeof(old));
+ if (error != 0)
+ break;
+ acl_copy_oldacl_into_acl(&old, kernel_acl);
+ break;
+
+ default:
+ error = copyin(user_acl, kernel_acl, sizeof(*kernel_acl));
+ if (kernel_acl->acl_maxcnt != ACL_MAX_ENTRIES)
+ return (EINVAL);
+ }
+
+ return (error);
+}
+
+static int
+acl_copyout(struct acl *kernel_acl, void *user_acl, acl_type_t type)
+{
+ int error;
+ struct oldacl old;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS_OLD:
+ case ACL_TYPE_DEFAULT_OLD:
+ error = acl_copy_acl_into_oldacl(kernel_acl, &old);
+ if (error != 0)
+ break;
+
+ error = copyout(&old, user_acl, sizeof(old));
+ break;
+
+ default:
+ if (fuword32((char *)user_acl +
+ offsetof(struct acl, acl_maxcnt)) != ACL_MAX_ENTRIES)
+ return (EINVAL);
+
+ error = copyout(kernel_acl, user_acl, sizeof(*kernel_acl));
+ }
+
+ return (error);
+}
+
+/*
+ * Convert an "old" type - ACL_TYPE_{ACCESS,DEFAULT}_OLD - into its "new"
+ * counterpart.  This is required for an old (pre-NFSv4 ACL) libc to work
+ * with a new kernel.  Fixing 'type' for old binaries with a new libc is
+ * done in lib/libc/posix1e/acl_support.c:_acl_type_unold().
+ */
+static int
+acl_type_unold(int type)
+{
+ switch (type) {
+ case ACL_TYPE_ACCESS_OLD:
+ return (ACL_TYPE_ACCESS);
+
+ case ACL_TYPE_DEFAULT_OLD:
+ return (ACL_TYPE_DEFAULT);
+
+ default:
+ return (type);
+ }
+}
+
+/*
+ * These calls wrap the real vnode operations, and are called by the syscall
+ * code once the syscall has converted the path or file descriptor to a vnode
+ * (unlocked). The aclp pointer is assumed still to point to userland, so
+ * this should not be consumed within the kernel except by syscall code.
+ * Other code should directly invoke VOP_{SET,GET}ACL.
+ */
+
+/*
+ * Given a vnode, set its ACL.
+ */
+static int
+vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl *inkernelacl;
+ struct mount *mp;
+ int error;
+
+ inkernelacl = acl_alloc(M_WAITOK);
+ error = acl_copyin(aclp, inkernelacl, type);
+ if (error != 0)
+ goto out;
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
+ goto out;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef MAC
+ error = mac_vnode_check_setacl(td->td_ucred, vp, type, inkernelacl);
+ if (error != 0)
+ goto out_unlock;
+#endif
+ error = VOP_SETACL(vp, acl_type_unold(type), inkernelacl,
+ td->td_ucred, td);
+#ifdef MAC
+out_unlock:
+#endif
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+out:
+ acl_free(inkernelacl);
+ return (error);
+}
+
+/*
+ * Given a vnode, get its ACL.
+ */
+static int
+vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl *inkernelacl;
+ int error;
+
+ inkernelacl = acl_alloc(M_WAITOK | M_ZERO);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef MAC
+ error = mac_vnode_check_getacl(td->td_ucred, vp, type);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_GETACL(vp, acl_type_unold(type), inkernelacl,
+ td->td_ucred, td);
+
+#ifdef MAC
+out:
+#endif
+ VOP_UNLOCK(vp, 0);
+ if (error == 0)
+ error = acl_copyout(inkernelacl, aclp, type);
+ acl_free(inkernelacl);
+ return (error);
+}
+
+/*
+ * Given a vnode, delete its ACL.
+ */
+static int
+vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type)
+{
+ struct mount *mp;
+ int error;
+
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef MAC
+ error = mac_vnode_check_deleteacl(td->td_ucred, vp, type);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_SETACL(vp, acl_type_unold(type), 0, td->td_ucred, td);
+#ifdef MAC
+out:
+#endif
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Given a vnode, check whether an ACL is appropriate for it.
+ */
+static int
+vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type,
+ struct acl *aclp)
+{
+ struct acl *inkernelacl;
+ int error;
+
+ inkernelacl = acl_alloc(M_WAITOK);
+ error = acl_copyin(aclp, inkernelacl, type);
+ if (error != 0)
+ goto out;
+ error = VOP_ACLCHECK(vp, acl_type_unold(type), inkernelacl,
+ td->td_ucred, td);
+out:
+ acl_free(inkernelacl);
+ return (error);
+}
+
+/*
+ * The syscalls below convert the path or file descriptor to a vnode and call
+ * the appropriate vacl_* routine.  They do not need to lock, as the vacl_
+ * code acquires and releases any locks required.
+ */
+
+/*
+ * Given a file path, get an ACL for it.
+ */
+int
+sys___acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, get an ACL for it; don't follow links.
+ */
+int
+sys___acl_get_link(struct thread *td, struct __acl_get_link_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, set an ACL for it.
+ */
+int
+sys___acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, set an ACL for it; don't follow links.
+ */
+int
+sys___acl_set_link(struct thread *td, struct __acl_set_link_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file descriptor, get an ACL for it.
+ */
+int
+sys___acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = getvnode(td->td_proc->p_fd, uap->filedes,
+ cap_rights_init(&rights, CAP_ACL_GET), &fp);
+ if (error == 0) {
+ error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp);
+ fdrop(fp, td);
+ }
+ return (error);
+}
+
+/*
+ * Given a file descriptor, set an ACL for it.
+ */
+int
+sys___acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = getvnode(td->td_proc->p_fd, uap->filedes,
+ cap_rights_init(&rights, CAP_ACL_SET), &fp);
+ if (error == 0) {
+ error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp);
+ fdrop(fp, td);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, delete an ACL from it.
+ */
+int
+sys___acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_delete(td, nd.ni_vp, uap->type);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, delete an ACL from it; don't follow links.
+ */
+int
+sys___acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_delete(td, nd.ni_vp, uap->type);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file descriptor, delete an ACL from it.
+ */
+int
+sys___acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = getvnode(td->td_proc->p_fd, uap->filedes,
+ cap_rights_init(&rights, CAP_ACL_DELETE), &fp);
+ if (error == 0) {
+ error = vacl_delete(td, fp->f_vnode, uap->type);
+ fdrop(fp, td);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, check an ACL for it.
+ */
+int
+sys___acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file path, check an ACL for it; don't follow links.
+ */
+int
+sys___acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error == 0) {
+ error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
+ NDFREE(&nd, 0);
+ }
+ return (error);
+}
+
+/*
+ * Given a file descriptor, check an ACL for it.
+ */
+int
+sys___acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ error = getvnode(td->td_proc->p_fd, uap->filedes,
+ cap_rights_init(&rights, CAP_ACL_CHECK), &fp);
+ if (error == 0) {
+ error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp);
+ fdrop(fp, td);
+ }
+ return (error);
+}
+
+struct acl *
+acl_alloc(int flags)
+{
+ struct acl *aclp;
+
+ aclp = malloc(sizeof(*aclp), M_ACL, flags);
+ if (aclp == NULL)
+ return (NULL);
+
+ aclp->acl_maxcnt = ACL_MAX_ENTRIES;
+
+ return (aclp);
+}
+
+void
+acl_free(struct acl *aclp)
+{
+
+ free(aclp, M_ACL);
+}
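+
+/*
+ * Illustrative sketch only, not part of this file: userland normally reaches
+ * the __acl_* syscalls above through the POSIX.1e library wrappers in libc
+ * (acl_get_file(3), acl_to_text(3), acl_free(3)).  A minimal sketch of
+ * dumping a file's access ACL, assuming those standard interfaces:
+ *
+ *	#include <sys/types.h>
+ *	#include <sys/acl.h>
+ *	#include <err.h>
+ *	#include <stdio.h>
+ *
+ *	int
+ *	main(int argc, char **argv)
+ *	{
+ *		acl_t acl;
+ *		char *text;
+ *
+ *		if (argc != 2)
+ *			errx(1, "usage: showacl path");
+ *		// acl_get_file(3) ends up in sys___acl_get_file() above.
+ *		acl = acl_get_file(argv[1], ACL_TYPE_ACCESS);
+ *		if (acl == NULL)
+ *			err(1, "acl_get_file");
+ *		text = acl_to_text(acl, NULL);	// textual form of the ACL
+ *		printf("%s", text);
+ *		acl_free(text);			// both objects are acl_free(3)-able
+ *		acl_free(acl);
+ *		return (0);
+ *	}
+ */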
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
new file mode 100644
index 0000000..7f9f881
--- /dev/null
+++ b/sys/kern/vfs_aio.c
@@ -0,0 +1,3069 @@
+/*-
+ * Copyright (c) 1997 John S. Dyson. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. John S. Dyson's name may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * DISCLAIMER: This code isn't warranted to do anything useful. Anything
+ * bad that happens because of using this software isn't the responsibility
+ * of the author. This software is distributed AS-IS.
+ */
+
+/*
+ * This file contains support for the POSIX 1003.1B AIO/LIO facility.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/capability.h>
+#include <sys/eventhandler.h>
+#include <sys/sysproto.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/kthread.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/unistd.h>
+#include <sys/posix4.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/protosw.h>
+#include <sys/rwlock.h>
+#include <sys/sema.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/syscall.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/sx.h>
+#include <sys/taskqueue.h>
+#include <sys/vnode.h>
+#include <sys/conf.h>
+#include <sys/event.h>
+#include <sys/mount.h>
+
+#include <machine/atomic.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/uma.h>
+#include <sys/aio.h>
+
+#include "opt_vfs_aio.h"
+
+/*
+ * Counter for allocating reference ids to new jobs. Wrapped to 1 on
+ * overflow. (XXX will be removed soon.)
+ */
+static u_long jobrefid;
+
+/*
+ * Counter for aio_fsync.
+ */
+static uint64_t jobseqno;
+
+#define JOBST_NULL 0
+#define JOBST_JOBQSOCK 1
+#define JOBST_JOBQGLOBAL 2
+#define JOBST_JOBRUNNING 3
+#define JOBST_JOBFINISHED 4
+#define JOBST_JOBQBUF 5
+#define JOBST_JOBQSYNC 6
+
+#ifndef MAX_AIO_PER_PROC
+#define MAX_AIO_PER_PROC 32
+#endif
+
+#ifndef MAX_AIO_QUEUE_PER_PROC
+#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */
+#endif
+
+#ifndef MAX_AIO_PROCS
+#define MAX_AIO_PROCS 32
+#endif
+
+#ifndef MAX_AIO_QUEUE
+#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */
+#endif
+
+#ifndef TARGET_AIO_PROCS
+#define TARGET_AIO_PROCS 4
+#endif
+
+#ifndef MAX_BUF_AIO
+#define MAX_BUF_AIO 16
+#endif
+
+#ifndef AIOD_TIMEOUT_DEFAULT
+#define AIOD_TIMEOUT_DEFAULT (10 * hz)
+#endif
+
+#ifndef AIOD_LIFETIME_DEFAULT
+#define AIOD_LIFETIME_DEFAULT (30 * hz)
+#endif
+
+FEATURE(aio, "Asynchronous I/O");
+
+static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
+
+static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
+
+static int max_aio_procs = MAX_AIO_PROCS;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
+ CTLFLAG_RW, &max_aio_procs, 0,
+ "Maximum number of kernel threads to use for handling async IO ");
+
+static int num_aio_procs = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
+ CTLFLAG_RD, &num_aio_procs, 0,
+ "Number of presently active kernel threads for async IO");
+
+/*
+ * The code will adjust the actual number of AIO processes towards this
+ * number when it gets a chance.
+ */
+static int target_aio_procs = TARGET_AIO_PROCS;
+SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
+ 0, "Preferred number of ready kernel threads for async IO");
+
+static int max_queue_count = MAX_AIO_QUEUE;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
+ "Maximum number of aio requests to queue, globally");
+
+static int num_queue_count = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
+ "Number of queued aio requests");
+
+static int num_buf_aio = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
+ "Number of aio requests presently handled by the buf subsystem");
+
+/* Number of async I/O threads in the process of being started */
+/* XXX This should be local to aio_aqueue() */
+static int num_aio_resv_start = 0;
+
+static int aiod_timeout;
+SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
+ "Timeout value for synchronous aio operations");
+
+static int aiod_lifetime;
+SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
+ "Maximum lifetime for idle aiod");
+
+static int unloadable = 0;
+SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
+ "Allow unload of aio (not recommended)");
+
+
+static int max_aio_per_proc = MAX_AIO_PER_PROC;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
+ 0, "Maximum active aio requests per process (stored in the process)");
+
+static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
+ &max_aio_queue_per_proc, 0,
+ "Maximum queued aio requests per process (stored in the process)");
+
+static int max_buf_aio = MAX_BUF_AIO;
+SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
+ "Maximum buf aio requests per process (stored in the process)");
+
+typedef struct oaiocb {
+ int aio_fildes; /* File descriptor */
+ off_t aio_offset; /* File offset for I/O */
+ volatile void *aio_buf; /* I/O buffer in process space */
+ size_t aio_nbytes; /* Number of bytes for I/O */
+ struct osigevent aio_sigevent; /* Signal to deliver */
+ int aio_lio_opcode; /* LIO opcode */
+ int aio_reqprio; /* Request priority -- ignored */
+ struct __aiocb_private _aiocb_private;
+} oaiocb_t;
+
+/*
+ * Below is a key of the locks used to protect each member of struct
+ * aiocblist, aioliojob and kaioinfo, and any backends.
+ *
+ * * - need not be protected
+ * a - locked by kaioinfo lock
+ * b - locked by backend lock, the backend lock can be null in some cases,
+ * for example, BIO belongs to this type, in this case, proc lock is
+ * reused.
+ * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
+ */
+
+/*
+ * Currently there are only two backends: BIO and generic file I/O.
+ * Socket I/O is served by the generic file I/O backend; this is not a
+ * good idea, since disk file I/O and any other type done without the
+ * O_NONBLOCK flag can block the daemon threads.  If there is no thread
+ * to serve socket I/O, that I/O may be delayed too long or starved.
+ * We should create threads dedicated to sockets that do non-blocking
+ * I/O, and the same goes for pipes and fifos; for these I/O types we
+ * really need a non-blocking interface, and fiddling with O_NONBLOCK
+ * in the file structure is not safe because there is a race between
+ * userland and the aio daemons.
+ */
+
+struct aiocblist {
+ TAILQ_ENTRY(aiocblist) list; /* (b) internal list for backend */
+ TAILQ_ENTRY(aiocblist) plist; /* (a) list of jobs for each backend */
+ TAILQ_ENTRY(aiocblist) allist; /* (a) list of all jobs in proc */
+ int jobflags; /* (a) job flags */
+ int jobstate; /* (b) job state */
+ int inputcharge; /* (*) input blocks */
+ int outputcharge; /* (*) output blocks */
+ struct buf *bp; /* (*) private to BIO backend,
+ * buffer pointer
+ */
+ struct proc *userproc; /* (*) user process */
+ struct ucred *cred; /* (*) active credential when created */
+ struct file *fd_file; /* (*) pointer to file structure */
+ struct aioliojob *lio; /* (*) optional lio job */
+ struct aiocb *uuaiocb; /* (*) pointer in userspace of aiocb */
+ struct knlist klist; /* (a) list of knotes */
+ struct aiocb uaiocb; /* (*) kernel I/O control block */
+ ksiginfo_t ksi; /* (a) realtime signal info */
+ struct task biotask; /* (*) private to BIO backend */
+ uint64_t seqno; /* (*) job number */
+ int pending; /* (a) number of pending I/O, aio_fsync only */
+};
+
+/* jobflags */
+#define AIOCBLIST_DONE 0x01
+#define AIOCBLIST_BUFDONE 0x02
+#define AIOCBLIST_RUNDOWN 0x04
+#define AIOCBLIST_CHECKSYNC 0x08
+
+/*
+ * AIO process info
+ */
+#define AIOP_FREE 0x1 /* proc on free queue */
+
+struct aiothreadlist {
+ int aiothreadflags; /* (c) AIO proc flags */
+ TAILQ_ENTRY(aiothreadlist) list; /* (c) list of processes */
+ struct thread *aiothread; /* (*) the AIO thread */
+};
+
+/*
+ * data-structure for lio signal management
+ */
+struct aioliojob {
+ int lioj_flags; /* (a) listio flags */
+ int lioj_count; /* (a) listio reference count */
+ int lioj_finished_count; /* (a) number of finished jobs */
+ struct sigevent lioj_signal; /* (a) signal on all I/O done */
+ TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */
+ struct knlist klist; /* (a) list of knotes */
+ ksiginfo_t lioj_ksi; /* (a) Realtime signal info */
+};
+
+#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
+#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
+#define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */
+
+/*
+ * per process aio data structure
+ */
+struct kaioinfo {
+ struct mtx kaio_mtx; /* the lock to protect this struct */
+ int kaio_flags; /* (a) per process kaio flags */
+ int kaio_maxactive_count; /* (*) maximum number of AIOs */
+ int kaio_active_count; /* (c) number of currently used AIOs */
+ int kaio_qallowed_count; /* (*) maximum size of AIO queue */
+ int kaio_count; /* (a) size of AIO queue */
+ int kaio_ballowed_count; /* (*) maximum number of buffers */
+ int kaio_buffer_count; /* (a) number of physio buffers */
+ TAILQ_HEAD(,aiocblist) kaio_all; /* (a) all AIOs in the process */
+ TAILQ_HEAD(,aiocblist) kaio_done; /* (a) done queue for process */
+ TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
+ TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* (a) job queue for process */
+ TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* (a) buffer job queue for process */
+ TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* (a) queue for aios waiting on sockets,
+ * NOT USED YET.
+ */
+ TAILQ_HEAD(,aiocblist) kaio_syncqueue; /* (a) queue for aio_fsync */
+ struct task kaio_task; /* (*) task to kick aio threads */
+};
+
+#define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx)
+#define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx)
+#define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f))
+#define AIO_MTX(ki) (&(ki)->kaio_mtx)
+
+#define KAIO_RUNDOWN 0x1 /* process is being run down */
+#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */
+
+/*
+ * Operations used to interact with userland aio control blocks.
+ * Different ABIs provide their own operations.
+ */
+struct aiocb_ops {
+ int (*copyin)(struct aiocb *ujob, struct aiocb *kjob);
+ long (*fetch_status)(struct aiocb *ujob);
+ long (*fetch_error)(struct aiocb *ujob);
+ int (*store_status)(struct aiocb *ujob, long status);
+ int (*store_error)(struct aiocb *ujob, long error);
+ int (*store_kernelinfo)(struct aiocb *ujob, long jobref);
+ int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
+};
+
+static TAILQ_HEAD(,aiothreadlist) aio_freeproc; /* (c) Idle daemons */
+static struct sema aio_newproc_sem;
+static struct mtx aio_job_mtx;
+static struct mtx aio_sock_mtx;
+static TAILQ_HEAD(,aiocblist) aio_jobs; /* (c) Async job list */
+static struct unrhdr *aiod_unr;
+
+void aio_init_aioinfo(struct proc *p);
+static int aio_onceonly(void);
+static int aio_free_entry(struct aiocblist *aiocbe);
+static void aio_process_rw(struct aiocblist *aiocbe);
+static void aio_process_sync(struct aiocblist *aiocbe);
+static void aio_process_mlock(struct aiocblist *aiocbe);
+static int aio_newproc(int *);
+int aio_aqueue(struct thread *td, struct aiocb *job,
+ struct aioliojob *lio, int type, struct aiocb_ops *ops);
+static void aio_physwakeup(struct buf *bp);
+static void aio_proc_rundown(void *arg, struct proc *p);
+static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
+static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
+static void biohelper(void *, int);
+static void aio_daemon(void *param);
+static void aio_swake_cb(struct socket *, struct sockbuf *);
+static int aio_unload(void);
+static void aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type);
+#define DONE_BUF 1
+#define DONE_QUEUE 2
+static int aio_kick(struct proc *userp);
+static void aio_kick_nowait(struct proc *userp);
+static void aio_kick_helper(void *context, int pending);
+static int filt_aioattach(struct knote *kn);
+static void filt_aiodetach(struct knote *kn);
+static int filt_aio(struct knote *kn, long hint);
+static int filt_lioattach(struct knote *kn);
+static void filt_liodetach(struct knote *kn);
+static int filt_lio(struct knote *kn, long hint);
+
+/*
+ * Zones for:
+ * kaio Per process async io info
+ * aiop async io thread data
+ * aiocb async io jobs
+ * aiol list io job pointer - internal to aio_suspend XXX
+ * aiolio list io jobs
+ */
+static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
+
+/* kqueue filters for aio */
+static struct filterops aio_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_aioattach,
+ .f_detach = filt_aiodetach,
+ .f_event = filt_aio,
+};
+static struct filterops lio_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_lioattach,
+ .f_detach = filt_liodetach,
+ .f_event = filt_lio
+};
+
+static eventhandler_tag exit_tag, exec_tag;
+
+TASKQUEUE_DEFINE_THREAD(aiod_bio);
+
+/*
+ * Main operations function for use as a kernel module.
+ */
+static int
+aio_modload(struct module *module, int cmd, void *arg)
+{
+ int error = 0;
+
+ switch (cmd) {
+ case MOD_LOAD:
+ aio_onceonly();
+ break;
+ case MOD_UNLOAD:
+ error = aio_unload();
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t aio_mod = {
+ "aio",
+ &aio_modload,
+ NULL
+};
+
+static struct syscall_helper_data aio_syscalls[] = {
+ SYSCALL_INIT_HELPER(aio_cancel),
+ SYSCALL_INIT_HELPER(aio_error),
+ SYSCALL_INIT_HELPER(aio_fsync),
+ SYSCALL_INIT_HELPER(aio_mlock),
+ SYSCALL_INIT_HELPER(aio_read),
+ SYSCALL_INIT_HELPER(aio_return),
+ SYSCALL_INIT_HELPER(aio_suspend),
+ SYSCALL_INIT_HELPER(aio_waitcomplete),
+ SYSCALL_INIT_HELPER(aio_write),
+ SYSCALL_INIT_HELPER(lio_listio),
+ SYSCALL_INIT_HELPER(oaio_read),
+ SYSCALL_INIT_HELPER(oaio_write),
+ SYSCALL_INIT_HELPER(olio_listio),
+ SYSCALL_INIT_LAST
+};
+
+#ifdef COMPAT_FREEBSD32
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <compat/freebsd32/freebsd32.h>
+#include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_signal.h>
+#include <compat/freebsd32/freebsd32_syscall.h>
+#include <compat/freebsd32/freebsd32_util.h>
+
+static struct syscall_helper_data aio32_syscalls[] = {
+ SYSCALL32_INIT_HELPER(freebsd32_aio_return),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_suspend),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_cancel),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_error),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_fsync),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_mlock),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_read),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_write),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_waitcomplete),
+ SYSCALL32_INIT_HELPER(freebsd32_lio_listio),
+ SYSCALL32_INIT_HELPER(freebsd32_oaio_read),
+ SYSCALL32_INIT_HELPER(freebsd32_oaio_write),
+ SYSCALL32_INIT_HELPER(freebsd32_olio_listio),
+ SYSCALL_INIT_LAST
+};
+#endif
+
+DECLARE_MODULE(aio, aio_mod,
+ SI_SUB_VFS, SI_ORDER_ANY);
+MODULE_VERSION(aio, 1);
+
+/*
+ * Startup initialization
+ */
+static int
+aio_onceonly(void)
+{
+ int error;
+
+ /* XXX: should probably just use so->callback */
+ aio_swake = &aio_swake_cb;
+ exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
+ EVENTHANDLER_PRI_ANY);
+ exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL,
+ EVENTHANDLER_PRI_ANY);
+ kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
+ kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
+ TAILQ_INIT(&aio_freeproc);
+ sema_init(&aio_newproc_sem, 0, "aio_new_proc");
+ mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
+ mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF);
+ TAILQ_INIT(&aio_jobs);
+ aiod_unr = new_unrhdr(1, INT_MAX, NULL);
+ kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
+ NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
+ NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
+ NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ aiod_timeout = AIOD_TIMEOUT_DEFAULT;
+ aiod_lifetime = AIOD_LIFETIME_DEFAULT;
+ jobrefid = 1;
+ async_io_version = _POSIX_VERSION;
+ p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
+ p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
+ p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
+
+ error = syscall_helper_register(aio_syscalls);
+ if (error)
+ return (error);
+#ifdef COMPAT_FREEBSD32
+ error = syscall32_helper_register(aio32_syscalls);
+ if (error)
+ return (error);
+#endif
+ return (0);
+}
+
+/*
+ * Callback for unload of AIO when used as a module.
+ */
+static int
+aio_unload(void)
+{
+ int error;
+
+ /*
+ * XXX: no unloads by default, it's too dangerous.
+ * Perhaps we could do it if we locked out callers and then
+ * did an aio_proc_rundown() on each process.
+ *
+ * jhb: aio_proc_rundown() needs to run on curproc though,
+ * so I don't think that would fly.
+ */
+ if (!unloadable)
+ return (EOPNOTSUPP);
+
+#ifdef COMPAT_FREEBSD32
+ syscall32_helper_unregister(aio32_syscalls);
+#endif
+ syscall_helper_unregister(aio_syscalls);
+
+ error = kqueue_del_filteropts(EVFILT_AIO);
+ if (error)
+ return (error);
+ error = kqueue_del_filteropts(EVFILT_LIO);
+ if (error)
+ return (error);
+ async_io_version = 0;
+ aio_swake = NULL;
+ taskqueue_free(taskqueue_aiod_bio);
+ delete_unrhdr(aiod_unr);
+ uma_zdestroy(kaio_zone);
+ uma_zdestroy(aiop_zone);
+ uma_zdestroy(aiocb_zone);
+ uma_zdestroy(aiol_zone);
+ uma_zdestroy(aiolio_zone);
+ EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
+ EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
+ mtx_destroy(&aio_job_mtx);
+ mtx_destroy(&aio_sock_mtx);
+ sema_destroy(&aio_newproc_sem);
+ p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
+ p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
+ p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
+ return (0);
+}
+
+/*
+ * Init the per-process aioinfo structure. The aioinfo limits are set
+ * per-process for user limit (resource) management.
+ */
+void
+aio_init_aioinfo(struct proc *p)
+{
+ struct kaioinfo *ki;
+
+ ki = uma_zalloc(kaio_zone, M_WAITOK);
+ mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF);
+ ki->kaio_flags = 0;
+ ki->kaio_maxactive_count = max_aio_per_proc;
+ ki->kaio_active_count = 0;
+ ki->kaio_qallowed_count = max_aio_queue_per_proc;
+ ki->kaio_count = 0;
+ ki->kaio_ballowed_count = max_buf_aio;
+ ki->kaio_buffer_count = 0;
+ TAILQ_INIT(&ki->kaio_all);
+ TAILQ_INIT(&ki->kaio_done);
+ TAILQ_INIT(&ki->kaio_jobqueue);
+ TAILQ_INIT(&ki->kaio_bufqueue);
+ TAILQ_INIT(&ki->kaio_liojoblist);
+ TAILQ_INIT(&ki->kaio_sockqueue);
+ TAILQ_INIT(&ki->kaio_syncqueue);
+ TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
+ PROC_LOCK(p);
+ if (p->p_aioinfo == NULL) {
+ p->p_aioinfo = ki;
+ PROC_UNLOCK(p);
+ } else {
+ PROC_UNLOCK(p);
+ mtx_destroy(&ki->kaio_mtx);
+ uma_zfree(kaio_zone, ki);
+ }
+
+ while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
+ aio_newproc(NULL);
+}
+
+static int
+aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
+{
+ struct thread *td;
+ int error;
+
+ error = sigev_findtd(p, sigev, &td);
+ if (error)
+ return (error);
+ if (!KSI_ONQ(ksi)) {
+ ksiginfo_set_sigev(ksi, sigev);
+ ksi->ksi_code = SI_ASYNCIO;
+ ksi->ksi_flags |= KSI_EXT | KSI_INS;
+ tdsendsignal(p, td, ksi->ksi_signo, ksi);
+ }
+ PROC_UNLOCK(p);
+ return (error);
+}
+
+/*
+ * Free a job entry. Wait for completion if it is currently active, but don't
+ * delay forever. If we delay, we return a flag that says that we have to
+ * restart the queue scan.
+ */
+static int
+aio_free_entry(struct aiocblist *aiocbe)
+{
+ struct kaioinfo *ki;
+ struct aioliojob *lj;
+ struct proc *p;
+
+ p = aiocbe->userproc;
+ MPASS(curproc == p);
+ ki = p->p_aioinfo;
+ MPASS(ki != NULL);
+
+ AIO_LOCK_ASSERT(ki, MA_OWNED);
+ MPASS(aiocbe->jobstate == JOBST_JOBFINISHED);
+
+ atomic_subtract_int(&num_queue_count, 1);
+
+ ki->kaio_count--;
+ MPASS(ki->kaio_count >= 0);
+
+ TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist);
+ TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
+
+ lj = aiocbe->lio;
+ if (lj) {
+ lj->lioj_count--;
+ lj->lioj_finished_count--;
+
+ if (lj->lioj_count == 0) {
+ TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+ /* lio is going away, we need to destroy any knotes */
+ knlist_delete(&lj->klist, curthread, 1);
+ PROC_LOCK(p);
+ sigqueue_take(&lj->lioj_ksi);
+ PROC_UNLOCK(p);
+ uma_zfree(aiolio_zone, lj);
+ }
+ }
+
+ /* aiocbe is going away, we need to destroy any knotes */
+ knlist_delete(&aiocbe->klist, curthread, 1);
+ PROC_LOCK(p);
+ sigqueue_take(&aiocbe->ksi);
+ PROC_UNLOCK(p);
+
+ MPASS(aiocbe->bp == NULL);
+ aiocbe->jobstate = JOBST_NULL;
+ AIO_UNLOCK(ki);
+
+ /*
+ * The thread argument here is used to find the owning process
+ * and is also passed to fo_close() which may pass it to various
+ * places such as devsw close() routines. Because of that, we
+ * need a thread pointer from the process owning the job that is
+ * persistent and won't disappear out from under us or move to
+ * another process.
+ *
+ * Currently, all the callers of this function call it to remove
+ * an aiocblist from the current process' job list either via a
+ * syscall or due to the current process calling exit() or
+ * execve(). Thus, we know that p == curproc. We also know that
+ * curthread can't exit since we are curthread.
+ *
+ * Therefore, we use curthread as the thread to pass to
+ * knlist_delete(). This does mean that it is possible for the
+ * thread pointer at close time to differ from the thread pointer
+ * at open time, but this is already true of file descriptors in
+ * a multithreaded process.
+ */
+ if (aiocbe->fd_file)
+ fdrop(aiocbe->fd_file, curthread);
+ crfree(aiocbe->cred);
+ uma_zfree(aiocb_zone, aiocbe);
+ AIO_LOCK(ki);
+
+ return (0);
+}
+
+static void
+aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
+{
+ aio_proc_rundown(arg, p);
+}
+
+/*
+ * Rundown the jobs for a given process.
+ */
+static void
+aio_proc_rundown(void *arg, struct proc *p)
+{
+ struct kaioinfo *ki;
+ struct aioliojob *lj;
+ struct aiocblist *cbe, *cbn;
+ struct file *fp;
+ struct socket *so;
+ int remove;
+
+ KASSERT(curthread->td_proc == p,
+ ("%s: called on non-curproc", __func__));
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return;
+
+ AIO_LOCK(ki);
+ ki->kaio_flags |= KAIO_RUNDOWN;
+
+restart:
+
+ /*
+ * Try to cancel all pending requests. This code simulates
+ * aio_cancel on all pending I/O requests.
+ */
+ TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
+ remove = 0;
+ mtx_lock(&aio_job_mtx);
+ if (cbe->jobstate == JOBST_JOBQGLOBAL) {
+ TAILQ_REMOVE(&aio_jobs, cbe, list);
+ remove = 1;
+ } else if (cbe->jobstate == JOBST_JOBQSOCK) {
+ fp = cbe->fd_file;
+ MPASS(fp->f_type == DTYPE_SOCKET);
+ so = fp->f_data;
+ TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
+ remove = 1;
+ } else if (cbe->jobstate == JOBST_JOBQSYNC) {
+ TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
+ remove = 1;
+ }
+ mtx_unlock(&aio_job_mtx);
+
+ if (remove) {
+ cbe->jobstate = JOBST_JOBFINISHED;
+ cbe->uaiocb._aiocb_private.status = -1;
+ cbe->uaiocb._aiocb_private.error = ECANCELED;
+ TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
+ aio_bio_done_notify(p, cbe, DONE_QUEUE);
+ }
+ }
+
+ /* Wait for all running I/O to be finished */
+ if (TAILQ_FIRST(&ki->kaio_bufqueue) ||
+ TAILQ_FIRST(&ki->kaio_jobqueue)) {
+ ki->kaio_flags |= KAIO_WAKEUP;
+ msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
+ goto restart;
+ }
+
+ /* Free all completed I/O requests. */
+ while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL)
+ aio_free_entry(cbe);
+
+ while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
+ if (lj->lioj_count == 0) {
+ TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+ knlist_delete(&lj->klist, curthread, 1);
+ PROC_LOCK(p);
+ sigqueue_take(&lj->lioj_ksi);
+ PROC_UNLOCK(p);
+ uma_zfree(aiolio_zone, lj);
+ } else {
+ panic("LIO job not cleaned up: C:%d, FC:%d\n",
+ lj->lioj_count, lj->lioj_finished_count);
+ }
+ }
+ AIO_UNLOCK(ki);
+ taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task);
+ mtx_destroy(&ki->kaio_mtx);
+ uma_zfree(kaio_zone, ki);
+ p->p_aioinfo = NULL;
+}
+
+/*
+ * Select a job to run (called by an AIO daemon).
+ */
+static struct aiocblist *
+aio_selectjob(struct aiothreadlist *aiop)
+{
+ struct aiocblist *aiocbe;
+ struct kaioinfo *ki;
+ struct proc *userp;
+
+ mtx_assert(&aio_job_mtx, MA_OWNED);
+ TAILQ_FOREACH(aiocbe, &aio_jobs, list) {
+ userp = aiocbe->userproc;
+ ki = userp->p_aioinfo;
+
+ if (ki->kaio_active_count < ki->kaio_maxactive_count) {
+ TAILQ_REMOVE(&aio_jobs, aiocbe, list);
+ /* Account for currently active jobs. */
+ ki->kaio_active_count++;
+ aiocbe->jobstate = JOBST_JOBRUNNING;
+ break;
+ }
+ }
+ return (aiocbe);
+}
+
+/*
+ * Move all data to a permanent storage device.  This code
+ * simulates the fsync syscall.
+ */
+static int
+aio_fsync_vnode(struct thread *td, struct vnode *vp)
+{
+ struct mount *mp;
+ int error;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ goto drop;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ if (vp->v_object != NULL) {
+ VM_OBJECT_WLOCK(vp->v_object);
+ vm_object_page_clean(vp->v_object, 0, 0, 0);
+ VM_OBJECT_WUNLOCK(vp->v_object);
+ }
+ error = VOP_FSYNC(vp, MNT_WAIT, td);
+
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+drop:
+ return (error);
+}
+
+/*
+ * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that
+ * does the I/O request for the non-physio version of the operations. The
+ * normal vn operations are used, and this code should work in all instances
+ * for every type of file, including pipes, sockets, fifos, and regular files.
+ *
+ * XXX I don't think it works well for sockets, pipes, and fifos.
+ */
+static void
+aio_process_rw(struct aiocblist *aiocbe)
+{
+ struct ucred *td_savedcred;
+ struct thread *td;
+ struct aiocb *cb;
+ struct file *fp;
+ struct socket *so;
+ struct uio auio;
+ struct iovec aiov;
+ int cnt;
+ int error;
+ int oublock_st, oublock_end;
+ int inblock_st, inblock_end;
+
+ KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_READ ||
+ aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE,
+ ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
+
+ td = curthread;
+ td_savedcred = td->td_ucred;
+ td->td_ucred = aiocbe->cred;
+ cb = &aiocbe->uaiocb;
+ fp = aiocbe->fd_file;
+
+ aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
+ aiov.iov_len = cb->aio_nbytes;
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = cb->aio_offset;
+ auio.uio_resid = cb->aio_nbytes;
+ cnt = cb->aio_nbytes;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+
+ inblock_st = td->td_ru.ru_inblock;
+ oublock_st = td->td_ru.ru_oublock;
+ /*
+ * aio_aqueue() acquires a reference to the file that is
+ * released in aio_free_entry().
+ */
+ if (cb->aio_lio_opcode == LIO_READ) {
+ auio.uio_rw = UIO_READ;
+ if (auio.uio_resid == 0)
+ error = 0;
+ else
+ error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
+ } else {
+ if (fp->f_type == DTYPE_VNODE)
+ bwillwrite();
+ auio.uio_rw = UIO_WRITE;
+ error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
+ }
+ inblock_end = td->td_ru.ru_inblock;
+ oublock_end = td->td_ru.ru_oublock;
+
+ aiocbe->inputcharge = inblock_end - inblock_st;
+ aiocbe->outputcharge = oublock_end - oublock_st;
+
+ if ((error) && (auio.uio_resid != cnt)) {
+ if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
+ error = 0;
+ if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
+ int sigpipe = 1;
+ if (fp->f_type == DTYPE_SOCKET) {
+ so = fp->f_data;
+ if (so->so_options & SO_NOSIGPIPE)
+ sigpipe = 0;
+ }
+ if (sigpipe) {
+ PROC_LOCK(aiocbe->userproc);
+ kern_psignal(aiocbe->userproc, SIGPIPE);
+ PROC_UNLOCK(aiocbe->userproc);
+ }
+ }
+ }
+
+ cnt -= auio.uio_resid;
+ cb->_aiocb_private.error = error;
+ cb->_aiocb_private.status = cnt;
+ td->td_ucred = td_savedcred;
+}
+
+static void
+aio_process_sync(struct aiocblist *aiocbe)
+{
+ struct thread *td = curthread;
+ struct ucred *td_savedcred = td->td_ucred;
+ struct aiocb *cb = &aiocbe->uaiocb;
+ struct file *fp = aiocbe->fd_file;
+ int error = 0;
+
+ KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_SYNC,
+ ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
+
+ td->td_ucred = aiocbe->cred;
+ if (fp->f_vnode != NULL)
+ error = aio_fsync_vnode(td, fp->f_vnode);
+ cb->_aiocb_private.error = error;
+ cb->_aiocb_private.status = 0;
+ td->td_ucred = td_savedcred;
+}
+
+static void
+aio_process_mlock(struct aiocblist *aiocbe)
+{
+ struct aiocb *cb = &aiocbe->uaiocb;
+ int error;
+
+ KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_MLOCK,
+ ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
+
+ error = vm_mlock(aiocbe->userproc, aiocbe->cred,
+ __DEVOLATILE(void *, cb->aio_buf), cb->aio_nbytes);
+ cb->_aiocb_private.error = error;
+ cb->_aiocb_private.status = 0;
+}
+
+static void
+aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
+{
+ struct aioliojob *lj;
+ struct kaioinfo *ki;
+ struct aiocblist *scb, *scbn;
+ int lj_done;
+
+ ki = userp->p_aioinfo;
+ AIO_LOCK_ASSERT(ki, MA_OWNED);
+ lj = aiocbe->lio;
+ lj_done = 0;
+ if (lj) {
+ lj->lioj_finished_count++;
+ if (lj->lioj_count == lj->lioj_finished_count)
+ lj_done = 1;
+ }
+ if (type == DONE_QUEUE) {
+ aiocbe->jobflags |= AIOCBLIST_DONE;
+ } else {
+ aiocbe->jobflags |= AIOCBLIST_BUFDONE;
+ }
+ TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist);
+ aiocbe->jobstate = JOBST_JOBFINISHED;
+
+ if (ki->kaio_flags & KAIO_RUNDOWN)
+ goto notification_done;
+
+ if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
+ aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
+ aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi);
+
+ KNOTE_LOCKED(&aiocbe->klist, 1);
+
+ if (lj_done) {
+ if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
+ lj->lioj_flags |= LIOJ_KEVENT_POSTED;
+ KNOTE_LOCKED(&lj->klist, 1);
+ }
+ if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
+ == LIOJ_SIGNAL
+ && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
+ lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
+ aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
+ lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
+ }
+ }
+
+notification_done:
+ if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) {
+ TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) {
+ if (aiocbe->fd_file == scb->fd_file &&
+ aiocbe->seqno < scb->seqno) {
+ if (--scb->pending == 0) {
+ mtx_lock(&aio_job_mtx);
+ scb->jobstate = JOBST_JOBQGLOBAL;
+ TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list);
+ TAILQ_INSERT_TAIL(&aio_jobs, scb, list);
+ aio_kick_nowait(userp);
+ mtx_unlock(&aio_job_mtx);
+ }
+ }
+ }
+ }
+ if (ki->kaio_flags & KAIO_WAKEUP) {
+ ki->kaio_flags &= ~KAIO_WAKEUP;
+ wakeup(&userp->p_aioinfo);
+ }
+}
+
+/*
+ * The AIO daemon: most of the actual work is done in aio_process_*(),
+ * but the setup (and address space management) is done in this routine.
+ */
+static void
+aio_daemon(void *_id)
+{
+ struct aiocblist *aiocbe;
+ struct aiothreadlist *aiop;
+ struct kaioinfo *ki;
+ struct proc *curcp, *mycp, *userp;
+ struct vmspace *myvm, *tmpvm;
+ struct thread *td = curthread;
+ int id = (intptr_t)_id;
+
+ /*
+ * Local copies of curproc (mycp) and vmspace (myvm).
+ */
+ mycp = td->td_proc;
+ myvm = mycp->p_vmspace;
+
+ KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp"));
+
+ /*
+ * Allocate and ready the aio control info. There is one aiop structure
+ * per daemon.
+ */
+ aiop = uma_zalloc(aiop_zone, M_WAITOK);
+ aiop->aiothread = td;
+ aiop->aiothreadflags = 0;
+
+ /* The daemon resides in its own pgrp. */
+ sys_setsid(td, NULL);
+
+ /*
+ * Wakeup parent process. (Parent sleeps to keep from blasting away
+ * and creating too many daemons.)
+ */
+ sema_post(&aio_newproc_sem);
+
+ mtx_lock(&aio_job_mtx);
+ for (;;) {
+ /*
+ * curcp is the current daemon process context.
+ * userp is the current user process context.
+ */
+ curcp = mycp;
+
+ /*
+ * Take the daemon off the free queue.
+ */
+ if (aiop->aiothreadflags & AIOP_FREE) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ aiop->aiothreadflags &= ~AIOP_FREE;
+ }
+
+ /*
+ * Check for jobs.
+ */
+ while ((aiocbe = aio_selectjob(aiop)) != NULL) {
+ mtx_unlock(&aio_job_mtx);
+ userp = aiocbe->userproc;
+
+ /*
+ * Connect to process address space for user program.
+ */
+ if (userp != curcp) {
+ /*
+ * Save the current address space that we are
+ * connected to.
+ */
+ tmpvm = mycp->p_vmspace;
+
+ /*
+ * Point to the new user address space and take a
+ * reference to it.
+ */
+ mycp->p_vmspace = userp->p_vmspace;
+ atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1);
+
+ /* Activate the new mapping. */
+ pmap_activate(FIRST_THREAD_IN_PROC(mycp));
+
+ /*
+ * If the old address space wasn't the daemon's
+ * own address space, then we need to remove the
+ * daemon's reference from the other process
+ * that it was acting on behalf of.
+ */
+ if (tmpvm != myvm) {
+ vmspace_free(tmpvm);
+ }
+ curcp = userp;
+ }
+
+ ki = userp->p_aioinfo;
+
+ /* Do the I/O function. */
+ switch(aiocbe->uaiocb.aio_lio_opcode) {
+ case LIO_READ:
+ case LIO_WRITE:
+ aio_process_rw(aiocbe);
+ break;
+ case LIO_SYNC:
+ aio_process_sync(aiocbe);
+ break;
+ case LIO_MLOCK:
+ aio_process_mlock(aiocbe);
+ break;
+ }
+
+ mtx_lock(&aio_job_mtx);
+ /* Decrement the active job count. */
+ ki->kaio_active_count--;
+ mtx_unlock(&aio_job_mtx);
+
+ AIO_LOCK(ki);
+ TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
+ aio_bio_done_notify(userp, aiocbe, DONE_QUEUE);
+ AIO_UNLOCK(ki);
+
+ mtx_lock(&aio_job_mtx);
+ }
+
+ /*
+ * Disconnect from user address space.
+ */
+ if (curcp != mycp) {
+
+ mtx_unlock(&aio_job_mtx);
+
+ /* Get the user address space to disconnect from. */
+ tmpvm = mycp->p_vmspace;
+
+ /* Get original address space for daemon. */
+ mycp->p_vmspace = myvm;
+
+ /* Activate the daemon's address space. */
+ pmap_activate(FIRST_THREAD_IN_PROC(mycp));
+#ifdef DIAGNOSTIC
+ if (tmpvm == myvm) {
+ printf("AIOD: vmspace problem -- %d\n",
+ mycp->p_pid);
+ }
+#endif
+ /* Remove our vmspace reference. */
+ vmspace_free(tmpvm);
+
+ curcp = mycp;
+
+ mtx_lock(&aio_job_mtx);
+ /*
+ * We have to restart to avoid a race; we only sleep if
+ * no job can be selected, which should mean
+ * curcp == mycp.
+ */
+ continue;
+ }
+
+ mtx_assert(&aio_job_mtx, MA_OWNED);
+
+ TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
+ aiop->aiothreadflags |= AIOP_FREE;
+
+ /*
+ * If the daemon is inactive for a long time, allow it to exit,
+ * thereby freeing resources.
+ */
+ if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy",
+ aiod_lifetime)) {
+ if (TAILQ_EMPTY(&aio_jobs)) {
+ if ((aiop->aiothreadflags & AIOP_FREE) &&
+ (num_aio_procs > target_aio_procs)) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ num_aio_procs--;
+ mtx_unlock(&aio_job_mtx);
+ uma_zfree(aiop_zone, aiop);
+ free_unr(aiod_unr, id);
+#ifdef DIAGNOSTIC
+ if (mycp->p_vmspace->vm_refcnt <= 1) {
+ printf("AIOD: bad vm refcnt for"
+ " exiting daemon: %d\n",
+ mycp->p_vmspace->vm_refcnt);
+ }
+#endif
+ kproc_exit(0);
+ }
+ }
+ }
+ }
+ mtx_unlock(&aio_job_mtx);
+ panic("shouldn't be here\n");
+}
+
+/*
+ * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
+ * AIO daemon modifies its environment itself.
+ */
+static int
+aio_newproc(int *start)
+{
+ int error;
+ struct proc *p;
+ int id;
+
+ id = alloc_unr(aiod_unr);
+ error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
+ RFNOWAIT, 0, "aiod%d", id);
+ if (error == 0) {
+ /*
+ * Wait until daemon is started.
+ */
+ sema_wait(&aio_newproc_sem);
+ mtx_lock(&aio_job_mtx);
+ num_aio_procs++;
+ if (start != NULL)
+ (*start)--;
+ mtx_unlock(&aio_job_mtx);
+ } else {
+ free_unr(aiod_unr, id);
+ }
+ return (error);
+}
+
+/*
+ * Try the high-performance, low-overhead physio method for eligible
+ * VCHR devices. This method doesn't use an aio helper thread, and
+ * thus has very low overhead.
+ *
+ * Assumes that the caller, aio_aqueue(), has incremented the file
+ * structure's reference count, preventing its deallocation for the
+ * duration of this call.
+ */
+static int
+aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
+{
+ struct aiocb *cb;
+ struct file *fp;
+ struct buf *bp;
+ struct vnode *vp;
+ struct cdevsw *csw;
+ struct cdev *dev;
+ struct kaioinfo *ki;
+ struct aioliojob *lj;
+ int error, ref;
+
+ cb = &aiocbe->uaiocb;
+ fp = aiocbe->fd_file;
+
+ if (fp == NULL || fp->f_type != DTYPE_VNODE)
+ return (-1);
+
+ vp = fp->f_vnode;
+
+ /*
+ * If it's not a disk, we don't want to return a positive error;
+ * that would keep the aio code from falling through to try the
+ * thread-based path when the target is a regular file.
+ */
+ if (!vn_isdisk(vp, &error)) {
+ if (error == ENOTBLK)
+ return (-1);
+ else
+ return (error);
+ }
+
+ if (vp->v_bufobj.bo_bsize == 0)
+ return (-1);
+
+ if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
+ return (-1);
+
+ if (cb->aio_nbytes >
+ MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
+ return (-1);
+
+ ki = p->p_aioinfo;
+ if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
+ return (-1);
+
+ ref = 0;
+ csw = devvn_refthread(vp, &dev, &ref);
+ if (csw == NULL)
+ return (ENXIO);
+ if (cb->aio_nbytes > dev->si_iosize_max) {
+ error = -1;
+ goto unref;
+ }
+
+ /* Create and build a buffer header for a transfer. */
+ bp = (struct buf *)getpbuf(NULL);
+ BUF_KERNPROC(bp);
+
+ AIO_LOCK(ki);
+ ki->kaio_count++;
+ ki->kaio_buffer_count++;
+ lj = aiocbe->lio;
+ if (lj)
+ lj->lioj_count++;
+ AIO_UNLOCK(ki);
+
+ /*
+ * Get a copy of the kva from the physical buffer.
+ */
+ error = 0;
+
+ bp->b_bcount = cb->aio_nbytes;
+ bp->b_bufsize = cb->aio_nbytes;
+ bp->b_iodone = aio_physwakeup;
+ bp->b_saveaddr = bp->b_data;
+ bp->b_data = (void *)(uintptr_t)cb->aio_buf;
+ bp->b_offset = cb->aio_offset;
+ bp->b_iooffset = cb->aio_offset;
+ bp->b_blkno = btodb(cb->aio_offset);
+ bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
+
+ /*
+ * Bring buffer into kernel space.
+ */
+ if (vmapbuf(bp, (dev->si_flags & SI_UNMAPPED) == 0) < 0) {
+ error = EFAULT;
+ goto doerror;
+ }
+
+ AIO_LOCK(ki);
+ aiocbe->bp = bp;
+ bp->b_caller1 = (void *)aiocbe;
+ TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
+ TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
+ aiocbe->jobstate = JOBST_JOBQBUF;
+ cb->_aiocb_private.status = cb->aio_nbytes;
+ AIO_UNLOCK(ki);
+
+ atomic_add_int(&num_queue_count, 1);
+ atomic_add_int(&num_buf_aio, 1);
+
+ bp->b_error = 0;
+
+ TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
+
+ /* Perform transfer. */
+ dev_strategy_csw(dev, csw, bp);
+ dev_relthread(dev, ref);
+ return (0);
+
+doerror:
+ AIO_LOCK(ki);
+ ki->kaio_count--;
+ ki->kaio_buffer_count--;
+ if (lj)
+ lj->lioj_count--;
+ aiocbe->bp = NULL;
+ AIO_UNLOCK(ki);
+ relpbuf(bp, NULL);
+unref:
+ dev_relthread(dev, ref);
+ return (error);
+}
+
+/*
+ * Wake up aio requests that may be serviceable now.
+ */
+static void
+aio_swake_cb(struct socket *so, struct sockbuf *sb)
+{
+ struct aiocblist *cb, *cbn;
+ int opcode;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+ if (sb == &so->so_snd)
+ opcode = LIO_WRITE;
+ else
+ opcode = LIO_READ;
+
+ sb->sb_flags &= ~SB_AIO;
+ mtx_lock(&aio_job_mtx);
+ TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) {
+ if (opcode == cb->uaiocb.aio_lio_opcode) {
+ if (cb->jobstate != JOBST_JOBQSOCK)
+ panic("invalid queue value");
+ /* XXX
+ * We don't have an actual socket backend yet,
+ * so we simply move the requests to the generic
+ * file I/O backend.
+ */
+ TAILQ_REMOVE(&so->so_aiojobq, cb, list);
+ TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
+ aio_kick_nowait(cb->userproc);
+ }
+ }
+ mtx_unlock(&aio_job_mtx);
+}
+
+static int
+convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
+{
+
+ /*
+ * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
+ * supported by AIO with the old sigevent structure.
+ */
+ nsig->sigev_notify = osig->sigev_notify;
+ switch (nsig->sigev_notify) {
+ case SIGEV_NONE:
+ break;
+ case SIGEV_SIGNAL:
+ nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
+ break;
+ case SIGEV_KEVENT:
+ nsig->sigev_notify_kqueue =
+ osig->__sigev_u.__sigev_notify_kqueue;
+ nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static int
+aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
+{
+ struct oaiocb *ojob;
+ int error;
+
+ bzero(kjob, sizeof(struct aiocb));
+ error = copyin(ujob, kjob, sizeof(struct oaiocb));
+ if (error)
+ return (error);
+ ojob = (struct oaiocb *)kjob;
+ return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
+}
+
+static int
+aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
+{
+
+ return (copyin(ujob, kjob, sizeof(struct aiocb)));
+}
+
+static long
+aiocb_fetch_status(struct aiocb *ujob)
+{
+
+ return (fuword(&ujob->_aiocb_private.status));
+}
+
+static long
+aiocb_fetch_error(struct aiocb *ujob)
+{
+
+ return (fuword(&ujob->_aiocb_private.error));
+}
+
+static int
+aiocb_store_status(struct aiocb *ujob, long status)
+{
+
+ return (suword(&ujob->_aiocb_private.status, status));
+}
+
+static int
+aiocb_store_error(struct aiocb *ujob, long error)
+{
+
+ return (suword(&ujob->_aiocb_private.error, error));
+}
+
+static int
+aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
+{
+
+ return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
+}
+
+static int
+aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
+{
+
+ return (suword(ujobp, (long)ujob));
+}
+
+static struct aiocb_ops aiocb_ops = {
+ .copyin = aiocb_copyin,
+ .fetch_status = aiocb_fetch_status,
+ .fetch_error = aiocb_fetch_error,
+ .store_status = aiocb_store_status,
+ .store_error = aiocb_store_error,
+ .store_kernelinfo = aiocb_store_kernelinfo,
+ .store_aiocb = aiocb_store_aiocb,
+};
+
+static struct aiocb_ops aiocb_ops_osigevent = {
+ .copyin = aiocb_copyin_old_sigevent,
+ .fetch_status = aiocb_fetch_status,
+ .fetch_error = aiocb_fetch_error,
+ .store_status = aiocb_store_status,
+ .store_error = aiocb_store_error,
+ .store_kernelinfo = aiocb_store_kernelinfo,
+ .store_aiocb = aiocb_store_aiocb,
+};
+
+/*
+ * Queue a new AIO request.  The choice between the threaded and the
+ * direct physio (VCHR) techniques is made in this code.
+ */
+int
+aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
+ int type, struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ cap_rights_t rights;
+ struct file *fp;
+ struct socket *so;
+ struct aiocblist *aiocbe, *cb;
+ struct kaioinfo *ki;
+ struct kevent kev;
+ struct sockbuf *sb;
+ int opcode;
+ int error;
+ int fd, kqfd;
+ int jid;
+ u_short evflags;
+
+ if (p->p_aioinfo == NULL)
+ aio_init_aioinfo(p);
+
+ ki = p->p_aioinfo;
+
+ ops->store_status(job, -1);
+ ops->store_error(job, 0);
+ ops->store_kernelinfo(job, -1);
+
+ if (num_queue_count >= max_queue_count ||
+ ki->kaio_count >= ki->kaio_qallowed_count) {
+ ops->store_error(job, EAGAIN);
+ return (EAGAIN);
+ }
+
+ aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
+ knlist_init_mtx(&aiocbe->klist, AIO_MTX(ki));
+
+ error = ops->copyin(job, &aiocbe->uaiocb);
+ if (error) {
+ ops->store_error(job, error);
+ uma_zfree(aiocb_zone, aiocbe);
+ return (error);
+ }
+
+ /* XXX: aio_nbytes is later cast to signed types. */
+ if (aiocbe->uaiocb.aio_nbytes > INT_MAX) {
+ uma_zfree(aiocb_zone, aiocbe);
+ return (EINVAL);
+ }
+
+ if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
+ aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
+ aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
+ aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
+ ops->store_error(job, EINVAL);
+ uma_zfree(aiocb_zone, aiocbe);
+ return (EINVAL);
+ }
+
+ if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
+ aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
+ !_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
+ uma_zfree(aiocb_zone, aiocbe);
+ return (EINVAL);
+ }
+
+ ksiginfo_init(&aiocbe->ksi);
+
+ /* Save userspace address of the job info. */
+ aiocbe->uuaiocb = job;
+
+ /* Get the opcode. */
+ if (type != LIO_NOP)
+ aiocbe->uaiocb.aio_lio_opcode = type;
+ opcode = aiocbe->uaiocb.aio_lio_opcode;
+
+ /*
+ * Validate the opcode and fetch the file object for the specified
+ * file descriptor.
+ *
+ * XXXRW: Moved the opcode validation up here so that we don't
+ * retrieve a file descriptor without knowing what the capability
+ * should be.
+ */
+ fd = aiocbe->uaiocb.aio_fildes;
+ switch (opcode) {
+ case LIO_WRITE:
+ error = fget_write(td, fd,
+ cap_rights_init(&rights, CAP_PWRITE), &fp);
+ break;
+ case LIO_READ:
+ error = fget_read(td, fd,
+ cap_rights_init(&rights, CAP_PREAD), &fp);
+ break;
+ case LIO_SYNC:
+ error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
+ break;
+ case LIO_MLOCK:
+ fp = NULL;
+ break;
+ case LIO_NOP:
+ error = fget(td, fd, cap_rights_init(&rights), &fp);
+ break;
+ default:
+ error = EINVAL;
+ }
+ if (error) {
+ uma_zfree(aiocb_zone, aiocbe);
+ ops->store_error(job, error);
+ return (error);
+ }
+
+ if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
+ error = EINVAL;
+ goto aqueue_fail;
+ }
+
+ if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) {
+ error = EINVAL;
+ goto aqueue_fail;
+ }
+
+ aiocbe->fd_file = fp;
+
+ mtx_lock(&aio_job_mtx);
+ jid = jobrefid++;
+ aiocbe->seqno = jobseqno++;
+ mtx_unlock(&aio_job_mtx);
+ error = ops->store_kernelinfo(job, jid);
+ if (error) {
+ error = EINVAL;
+ goto aqueue_fail;
+ }
+ aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
+
+ if (opcode == LIO_NOP) {
+ fdrop(fp, td);
+ uma_zfree(aiocb_zone, aiocbe);
+ return (0);
+ }
+
+ if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
+ goto no_kqueue;
+ evflags = aiocbe->uaiocb.aio_sigevent.sigev_notify_kevent_flags;
+ if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) {
+ error = EINVAL;
+ goto aqueue_fail;
+ }
+ kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
+ kev.ident = (uintptr_t)aiocbe->uuaiocb;
+ kev.filter = EVFILT_AIO;
+ kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags;
+ kev.data = (intptr_t)aiocbe;
+ kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
+ error = kqfd_register(kqfd, &kev, td, 1);
+aqueue_fail:
+ if (error) {
+ if (fp)
+ fdrop(fp, td);
+ uma_zfree(aiocb_zone, aiocbe);
+ ops->store_error(job, error);
+ goto done;
+ }
+no_kqueue:
+
+ ops->store_error(job, EINPROGRESS);
+ aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
+ aiocbe->userproc = p;
+ aiocbe->cred = crhold(td->td_ucred);
+ aiocbe->jobflags = 0;
+ aiocbe->lio = lj;
+
+ if (opcode == LIO_SYNC)
+ goto queueit;
+
+ if (fp && fp->f_type == DTYPE_SOCKET) {
+ /*
+ * Alternate queueing for socket ops: Reach down into the
+ * descriptor to get the socket data. Then check to see if the
+ * socket is ready to be read or written (based on the requested
+ * operation).
+ *
+ * If it is not ready for I/O, then queue the aiocbe on the
+ * socket, and set the flags so we get a call when sbnotify()
+ * happens.
+ *
+ * Note that if the opcode is neither LIO_WRITE nor LIO_READ,
+ * we lock and unlock the snd sockbuf for no reason.
+ */
+ so = fp->f_data;
+ sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd;
+ SOCKBUF_LOCK(sb);
+ if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
+ LIO_WRITE) && (!sowriteable(so)))) {
+ sb->sb_flags |= SB_AIO;
+
+ mtx_lock(&aio_job_mtx);
+ TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
+ mtx_unlock(&aio_job_mtx);
+
+ AIO_LOCK(ki);
+ TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
+ TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
+ aiocbe->jobstate = JOBST_JOBQSOCK;
+ ki->kaio_count++;
+ if (lj)
+ lj->lioj_count++;
+ AIO_UNLOCK(ki);
+ SOCKBUF_UNLOCK(sb);
+ atomic_add_int(&num_queue_count, 1);
+ error = 0;
+ goto done;
+ }
+ SOCKBUF_UNLOCK(sb);
+ }
+
+ if ((error = aio_qphysio(p, aiocbe)) == 0)
+ goto done;
+#if 0
+ if (error > 0) {
+ aiocbe->uaiocb._aiocb_private.error = error;
+ ops->store_error(job, error);
+ goto done;
+ }
+#endif
+queueit:
+ /* No buffer for daemon I/O. */
+ aiocbe->bp = NULL;
+ atomic_add_int(&num_queue_count, 1);
+
+ AIO_LOCK(ki);
+ ki->kaio_count++;
+ if (lj)
+ lj->lioj_count++;
+ TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
+ TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
+ if (opcode == LIO_SYNC) {
+ TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) {
+ if (cb->fd_file == aiocbe->fd_file &&
+ cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
+ cb->seqno < aiocbe->seqno) {
+ cb->jobflags |= AIOCBLIST_CHECKSYNC;
+ aiocbe->pending++;
+ }
+ }
+ TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) {
+ if (cb->fd_file == aiocbe->fd_file &&
+ cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
+ cb->seqno < aiocbe->seqno) {
+ cb->jobflags |= AIOCBLIST_CHECKSYNC;
+ aiocbe->pending++;
+ }
+ }
+ if (aiocbe->pending != 0) {
+ TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list);
+ aiocbe->jobstate = JOBST_JOBQSYNC;
+ AIO_UNLOCK(ki);
+ goto done;
+ }
+ }
+ mtx_lock(&aio_job_mtx);
+ TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
+ aiocbe->jobstate = JOBST_JOBQGLOBAL;
+ aio_kick_nowait(p);
+ mtx_unlock(&aio_job_mtx);
+ AIO_UNLOCK(ki);
+ error = 0;
+done:
+ return (error);
+}
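+
+/*
+ * Illustrative sketch only, not part of this file: the SIGEV_KEVENT
+ * registration performed above is what makes the following userland
+ * pattern work.  The helper name and error handling are assumptions
+ * for the example, not kernel interfaces.
+ *
+ *	#include <sys/types.h>
+ *	#include <sys/event.h>
+ *	#include <aio.h>
+ *	#include <err.h>
+ *	#include <string.h>
+ *
+ *	// kq is a kqueue(2) descriptor, fd an open file descriptor.
+ *	static ssize_t
+ *	read_via_kqueue(int kq, int fd, void *buf, size_t len)
+ *	{
+ *		struct aiocb cb;
+ *		struct kevent ev;
+ *
+ *		memset(&cb, 0, sizeof(cb));
+ *		cb.aio_fildes = fd;
+ *		cb.aio_buf = buf;
+ *		cb.aio_nbytes = len;
+ *		cb.aio_offset = 0;
+ *		cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
+ *		cb.aio_sigevent.sigev_notify_kqueue = kq;
+ *		cb.aio_sigevent.sigev_value.sival_ptr = &cb;
+ *		if (aio_read(&cb) != 0)
+ *			err(1, "aio_read");
+ *		// The EVFILT_AIO event's ident is the userland aiocb
+ *		// pointer and udata is sigev_value (see the kev setup above).
+ *		if (kevent(kq, NULL, 0, &ev, 1, NULL) != 1)
+ *			err(1, "kevent");
+ *		return (aio_return((struct aiocb *)ev.ident));
+ *	}
+ */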
+
+static void
+aio_kick_nowait(struct proc *userp)
+{
+ struct kaioinfo *ki = userp->p_aioinfo;
+ struct aiothreadlist *aiop;
+
+ mtx_assert(&aio_job_mtx, MA_OWNED);
+ if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ aiop->aiothreadflags &= ~AIOP_FREE;
+ wakeup(aiop->aiothread);
+ } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
+ ((ki->kaio_active_count + num_aio_resv_start) <
+ ki->kaio_maxactive_count)) {
+ taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task);
+ }
+}
+
+static int
+aio_kick(struct proc *userp)
+{
+ struct kaioinfo *ki = userp->p_aioinfo;
+ struct aiothreadlist *aiop;
+ int error, ret = 0;
+
+ mtx_assert(&aio_job_mtx, MA_OWNED);
+retryproc:
+ if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
+ TAILQ_REMOVE(&aio_freeproc, aiop, list);
+ aiop->aiothreadflags &= ~AIOP_FREE;
+ wakeup(aiop->aiothread);
+ } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
+ ((ki->kaio_active_count + num_aio_resv_start) <
+ ki->kaio_maxactive_count)) {
+ num_aio_resv_start++;
+ mtx_unlock(&aio_job_mtx);
+ error = aio_newproc(&num_aio_resv_start);
+ mtx_lock(&aio_job_mtx);
+ if (error) {
+ num_aio_resv_start--;
+ goto retryproc;
+ }
+ } else {
+ ret = -1;
+ }
+ return (ret);
+}
+
+static void
+aio_kick_helper(void *context, int pending)
+{
+ struct proc *userp = context;
+
+ mtx_lock(&aio_job_mtx);
+ while (--pending >= 0) {
+ if (aio_kick(userp))
+ break;
+ }
+ mtx_unlock(&aio_job_mtx);
+}
+
+/*
+ * Support the aio_return system call; as a side effect, kernel resources are
+ * released.
+ */
+static int
+kern_aio_return(struct thread *td, struct aiocb *uaiocb, struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ struct aiocblist *cb;
+ struct kaioinfo *ki;
+ int status, error;
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return (EINVAL);
+ AIO_LOCK(ki);
+ TAILQ_FOREACH(cb, &ki->kaio_done, plist) {
+ if (cb->uuaiocb == uaiocb)
+ break;
+ }
+ if (cb != NULL) {
+ MPASS(cb->jobstate == JOBST_JOBFINISHED);
+ status = cb->uaiocb._aiocb_private.status;
+ error = cb->uaiocb._aiocb_private.error;
+ td->td_retval[0] = status;
+ if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
+ td->td_ru.ru_oublock += cb->outputcharge;
+ cb->outputcharge = 0;
+ } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
+ td->td_ru.ru_inblock += cb->inputcharge;
+ cb->inputcharge = 0;
+ }
+ aio_free_entry(cb);
+ AIO_UNLOCK(ki);
+ ops->store_error(uaiocb, error);
+ ops->store_status(uaiocb, status);
+ } else {
+ error = EINVAL;
+ AIO_UNLOCK(ki);
+ }
+ return (error);
+}
+
+int
+sys_aio_return(struct thread *td, struct aio_return_args *uap)
+{
+
+ return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
+}
+
+/*
+ * Allow a process to wake up when any of the given I/O requests completes.
+ */
+static int
+kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
+ struct timespec *ts)
+{
+ struct proc *p = td->td_proc;
+ struct timeval atv;
+ struct kaioinfo *ki;
+ struct aiocblist *cb, *cbfirst;
+ int error, i, timo;
+
+ timo = 0;
+ if (ts) {
+ if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
+ return (EINVAL);
+
+ TIMESPEC_TO_TIMEVAL(&atv, ts);
+ if (itimerfix(&atv))
+ return (EINVAL);
+ timo = tvtohz(&atv);
+ }
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ return (EAGAIN);
+
+ if (njoblist == 0)
+ return (0);
+
+ AIO_LOCK(ki);
+ for (;;) {
+ cbfirst = NULL;
+ error = 0;
+ TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
+ for (i = 0; i < njoblist; i++) {
+ if (cb->uuaiocb == ujoblist[i]) {
+ if (cbfirst == NULL)
+ cbfirst = cb;
+ if (cb->jobstate == JOBST_JOBFINISHED)
+ goto RETURN;
+ }
+ }
+ }
+ /* All tasks were finished. */
+ if (cbfirst == NULL)
+ break;
+
+ ki->kaio_flags |= KAIO_WAKEUP;
+ error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
+ "aiospn", timo);
+ if (error == ERESTART)
+ error = EINTR;
+ if (error)
+ break;
+ }
+RETURN:
+ AIO_UNLOCK(ki);
+ return (error);
+}
+
+int
+sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
+{
+ struct timespec ts, *tsp;
+ struct aiocb **ujoblist;
+ int error;
+
+ if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
+ return (EINVAL);
+
+ if (uap->timeout) {
+ /* Get timespec struct. */
+ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
+ return (error);
+ tsp = &ts;
+ } else
+ tsp = NULL;
+
+ ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
+ error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
+ if (error == 0)
+ error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
+ uma_zfree(aiol_zone, ujoblist);
+ return (error);
+}
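+
+/*
+ * Illustrative sketch only, not part of this file: the classic userland
+ * pattern served by the syscalls above (aio_read(2), aio_error(2),
+ * aio_suspend(2), aio_return(2)).  The helper name is made up for the
+ * example.
+ *
+ *	#include <aio.h>
+ *	#include <err.h>
+ *	#include <errno.h>
+ *	#include <string.h>
+ *
+ *	static ssize_t
+ *	read_async(int fd, void *buf, size_t len)
+ *	{
+ *		struct aiocb cb;
+ *		const struct aiocb *list[1];
+ *
+ *		memset(&cb, 0, sizeof(cb));
+ *		cb.aio_fildes = fd;
+ *		cb.aio_buf = buf;
+ *		cb.aio_nbytes = len;
+ *		cb.aio_offset = 0;
+ *		if (aio_read(&cb) != 0)		// enqueued via aio_aqueue()
+ *			err(1, "aio_read");
+ *		list[0] = &cb;
+ *		while (aio_error(&cb) == EINPROGRESS)
+ *			(void)aio_suspend(list, 1, NULL);	// kern_aio_suspend()
+ *		return (aio_return(&cb));	// reaps the kernel job
+ *	}
+ */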
+
+/*
+ * aio_cancel cancels any non-physio aio operations not currently in
+ * progress.
+ */
+int
+sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
+{
+ struct proc *p = td->td_proc;
+ struct kaioinfo *ki;
+ struct aiocblist *cbe, *cbn;
+ struct file *fp;
+ struct socket *so;
+ int error;
+ int remove;
+ int cancelled = 0;
+ int notcancelled = 0;
+ struct vnode *vp;
+
+ /* Lookup file object. */
+ error = fget(td, uap->fd, NULL, &fp);
+ if (error)
+ return (error);
+
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ goto done;
+
+ if (fp->f_type == DTYPE_VNODE) {
+ vp = fp->f_vnode;
+ if (vn_isdisk(vp, &error)) {
+ fdrop(fp, td);
+ td->td_retval[0] = AIO_NOTCANCELED;
+ return (0);
+ }
+ }
+
+ AIO_LOCK(ki);
+ TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
+ if ((uap->fd == cbe->uaiocb.aio_fildes) &&
+ ((uap->aiocbp == NULL) ||
+ (uap->aiocbp == cbe->uuaiocb))) {
+ remove = 0;
+
+ mtx_lock(&aio_job_mtx);
+ if (cbe->jobstate == JOBST_JOBQGLOBAL) {
+ TAILQ_REMOVE(&aio_jobs, cbe, list);
+ remove = 1;
+ } else if (cbe->jobstate == JOBST_JOBQSOCK) {
+ MPASS(fp->f_type == DTYPE_SOCKET);
+ so = fp->f_data;
+ TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
+ remove = 1;
+ } else if (cbe->jobstate == JOBST_JOBQSYNC) {
+ TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
+ remove = 1;
+ }
+ mtx_unlock(&aio_job_mtx);
+
+ if (remove) {
+ TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
+ cbe->uaiocb._aiocb_private.status = -1;
+ cbe->uaiocb._aiocb_private.error = ECANCELED;
+ aio_bio_done_notify(p, cbe, DONE_QUEUE);
+ cancelled++;
+ } else {
+ notcancelled++;
+ }
+ if (uap->aiocbp != NULL)
+ break;
+ }
+ }
+ AIO_UNLOCK(ki);
+
+done:
+ fdrop(fp, td);
+
+ if (uap->aiocbp != NULL) {
+ if (cancelled) {
+ td->td_retval[0] = AIO_CANCELED;
+ return (0);
+ }
+ }
+
+ if (notcancelled) {
+ td->td_retval[0] = AIO_NOTCANCELED;
+ return (0);
+ }
+
+ if (cancelled) {
+ td->td_retval[0] = AIO_CANCELED;
+ return (0);
+ }
+
+ td->td_retval[0] = AIO_ALLDONE;
+
+ return (0);
+}
+
+/*
+ * aio_error is implemented at the kernel level for compatibility purposes
+ * only.  For a user-mode async implementation, it would be best done in
+ * a userland subroutine.
+ */
+static int
+kern_aio_error(struct thread *td, struct aiocb *aiocbp, struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ struct aiocblist *cb;
+ struct kaioinfo *ki;
+ int status;
+
+ ki = p->p_aioinfo;
+ if (ki == NULL) {
+ td->td_retval[0] = EINVAL;
+ return (0);
+ }
+
+ AIO_LOCK(ki);
+ TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
+ if (cb->uuaiocb == aiocbp) {
+ if (cb->jobstate == JOBST_JOBFINISHED)
+ td->td_retval[0] =
+ cb->uaiocb._aiocb_private.error;
+ else
+ td->td_retval[0] = EINPROGRESS;
+ AIO_UNLOCK(ki);
+ return (0);
+ }
+ }
+ AIO_UNLOCK(ki);
+
+ /*
+ * Hack for failure of aio_aqueue.
+ */
+ status = ops->fetch_status(aiocbp);
+ if (status == -1) {
+ td->td_retval[0] = ops->fetch_error(aiocbp);
+ return (0);
+ }
+
+ td->td_retval[0] = EINVAL;
+ return (0);
+}
+
+int
+sys_aio_error(struct thread *td, struct aio_error_args *uap)
+{
+
+ return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
+}
+
+/* syscall - asynchronous read from a file (REALTIME) */
+int
+sys_oaio_read(struct thread *td, struct oaio_read_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
+ &aiocb_ops_osigevent));
+}
+
+int
+sys_aio_read(struct thread *td, struct aio_read_args *uap)
+{
+
+ return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
+}
+
+/* syscall - asynchronous write to a file (REALTIME) */
+int
+sys_oaio_write(struct thread *td, struct oaio_write_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
+ &aiocb_ops_osigevent));
+}
+
+int
+sys_aio_write(struct thread *td, struct aio_write_args *uap)
+{
+
+ return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
+}
+
+int
+sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
+{
+
+ return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
+}
+
+static int
+kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
+ struct aiocb **acb_list, int nent, struct sigevent *sig,
+ struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ struct aiocb *iocb;
+ struct kaioinfo *ki;
+ struct aioliojob *lj;
+ struct kevent kev;
+ int error;
+ int nerror;
+ int i;
+
+ if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
+ return (EINVAL);
+
+ if (nent < 0 || nent > AIO_LISTIO_MAX)
+ return (EINVAL);
+
+ if (p->p_aioinfo == NULL)
+ aio_init_aioinfo(p);
+
+ ki = p->p_aioinfo;
+
+ lj = uma_zalloc(aiolio_zone, M_WAITOK);
+ lj->lioj_flags = 0;
+ lj->lioj_count = 0;
+ lj->lioj_finished_count = 0;
+ knlist_init_mtx(&lj->klist, AIO_MTX(ki));
+ ksiginfo_init(&lj->lioj_ksi);
+
+ /*
+	 * Set up the signal.
+ */
+ if (sig && (mode == LIO_NOWAIT)) {
+ bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
+ if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
+ /* Assume only new style KEVENT */
+ kev.filter = EVFILT_LIO;
+ kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
+ kev.ident = (uintptr_t)uacb_list; /* something unique */
+ kev.data = (intptr_t)lj;
+ /* pass user defined sigval data */
+ kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
+ error = kqfd_register(
+ lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
+ if (error) {
+ uma_zfree(aiolio_zone, lj);
+ return (error);
+ }
+ } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
+ ;
+ } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
+ lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
+ if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
+ uma_zfree(aiolio_zone, lj);
+ return EINVAL;
+ }
+ lj->lioj_flags |= LIOJ_SIGNAL;
+ } else {
+ uma_zfree(aiolio_zone, lj);
+ return EINVAL;
+ }
+ }
+
+ AIO_LOCK(ki);
+ TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
+ /*
+	 * Add an extra aiocb count so the lio cannot be freed by other
+	 * threads doing aio_waitcomplete or aio_return, and to prevent
+	 * the event from being sent until we have queued all tasks.
+ */
+ lj->lioj_count = 1;
+ AIO_UNLOCK(ki);
+
+ /*
+ * Get pointers to the list of I/O requests.
+ */
+ nerror = 0;
+ for (i = 0; i < nent; i++) {
+ iocb = acb_list[i];
+ if (iocb != NULL) {
+ error = aio_aqueue(td, iocb, lj, LIO_NOP, ops);
+ if (error != 0)
+ nerror++;
+ }
+ }
+
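+	/*
+	 * For LIO_WAIT, sleep until every job queued above has finished
+	 * (lioj_count was primed with an extra reference, hence the -1).
+	 * For LIO_NOWAIT, post the kevent and/or signal right away if
+	 * everything already completed while we were queueing.
+	 */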
+ error = 0;
+ AIO_LOCK(ki);
+ if (mode == LIO_WAIT) {
+ while (lj->lioj_count - 1 != lj->lioj_finished_count) {
+ ki->kaio_flags |= KAIO_WAKEUP;
+ error = msleep(&p->p_aioinfo, AIO_MTX(ki),
+ PRIBIO | PCATCH, "aiospn", 0);
+ if (error == ERESTART)
+ error = EINTR;
+ if (error)
+ break;
+ }
+ } else {
+ if (lj->lioj_count - 1 == lj->lioj_finished_count) {
+ if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
+ lj->lioj_flags |= LIOJ_KEVENT_POSTED;
+ KNOTE_LOCKED(&lj->klist, 1);
+ }
+ if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
+ == LIOJ_SIGNAL
+ && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
+ lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
+ aio_sendsig(p, &lj->lioj_signal,
+ &lj->lioj_ksi);
+ lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
+ }
+ }
+ }
+ lj->lioj_count--;
+ if (lj->lioj_count == 0) {
+ TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+ knlist_delete(&lj->klist, curthread, 1);
+ PROC_LOCK(p);
+ sigqueue_take(&lj->lioj_ksi);
+ PROC_UNLOCK(p);
+ AIO_UNLOCK(ki);
+ uma_zfree(aiolio_zone, lj);
+ } else
+ AIO_UNLOCK(ki);
+
+ if (nerror)
+ return (EIO);
+ return (error);
+}
+
+/* syscall - list directed I/O (REALTIME) */
+int
+sys_olio_listio(struct thread *td, struct olio_listio_args *uap)
+{
+ struct aiocb **acb_list;
+ struct sigevent *sigp, sig;
+ struct osigevent osig;
+ int error, nent;
+
+ if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
+ return (EINVAL);
+
+ nent = uap->nent;
+ if (nent < 0 || nent > AIO_LISTIO_MAX)
+ return (EINVAL);
+
+ if (uap->sig && (uap->mode == LIO_NOWAIT)) {
+ error = copyin(uap->sig, &osig, sizeof(osig));
+ if (error)
+ return (error);
+ error = convert_old_sigevent(&osig, &sig);
+ if (error)
+ return (error);
+ sigp = &sig;
+ } else
+ sigp = NULL;
+
+ acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
+ error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
+ if (error == 0)
+ error = kern_lio_listio(td, uap->mode,
+ (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
+ &aiocb_ops_osigevent);
+ free(acb_list, M_LIO);
+ return (error);
+}
+
+/* syscall - list directed I/O (REALTIME) */
+int
+sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
+{
+ struct aiocb **acb_list;
+ struct sigevent *sigp, sig;
+ int error, nent;
+
+ if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
+ return (EINVAL);
+
+ nent = uap->nent;
+ if (nent < 0 || nent > AIO_LISTIO_MAX)
+ return (EINVAL);
+
+ if (uap->sig && (uap->mode == LIO_NOWAIT)) {
+ error = copyin(uap->sig, &sig, sizeof(sig));
+ if (error)
+ return (error);
+ sigp = &sig;
+ } else
+ sigp = NULL;
+
+ acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
+ error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
+ if (error == 0)
+ error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
+ nent, sigp, &aiocb_ops);
+ free(acb_list, M_LIO);
+ return (error);
+}
+
+/*
+ * Called from the interrupt thread for physio; we should return as fast
+ * as possible, so we schedule a biohelper task.
+ */
+static void
+aio_physwakeup(struct buf *bp)
+{
+ struct aiocblist *aiocbe;
+
+ aiocbe = (struct aiocblist *)bp->b_caller1;
+ taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask);
+}
+
+/*
+ * Task routine to perform the heavier work: process wakeups and signal delivery.
+ */
+static void
+biohelper(void *context, int pending)
+{
+ struct aiocblist *aiocbe = context;
+ struct buf *bp;
+ struct proc *userp;
+ struct kaioinfo *ki;
+ int nblks;
+
+ bp = aiocbe->bp;
+ userp = aiocbe->userproc;
+ ki = userp->p_aioinfo;
+ AIO_LOCK(ki);
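+	/*
+	 * Subtract the residual from the recorded status, latch any
+	 * error reported by the bio layer, and accumulate the block
+	 * count so it can be charged to rusage when the job is reaped.
+	 */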
+ aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
+ aiocbe->uaiocb._aiocb_private.error = 0;
+ if (bp->b_ioflags & BIO_ERROR)
+ aiocbe->uaiocb._aiocb_private.error = bp->b_error;
+ nblks = btodb(aiocbe->uaiocb.aio_nbytes);
+ if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
+ aiocbe->outputcharge += nblks;
+ else
+ aiocbe->inputcharge += nblks;
+ aiocbe->bp = NULL;
+ TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
+ ki->kaio_buffer_count--;
+ aio_bio_done_notify(userp, aiocbe, DONE_BUF);
+ AIO_UNLOCK(ki);
+
+ /* Release mapping into kernel space. */
+ vunmapbuf(bp);
+ relpbuf(bp, NULL);
+ atomic_subtract_int(&num_buf_aio, 1);
+}
+
+/* syscall - wait for the next completion of an aio request */
+static int
+kern_aio_waitcomplete(struct thread *td, struct aiocb **aiocbp,
+ struct timespec *ts, struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ struct timeval atv;
+ struct kaioinfo *ki;
+ struct aiocblist *cb;
+ struct aiocb *uuaiocb;
+ int error, status, timo;
+
+ ops->store_aiocb(aiocbp, NULL);
+
+ timo = 0;
+ if (ts) {
+ if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
+ return (EINVAL);
+
+ TIMESPEC_TO_TIMEVAL(&atv, ts);
+ if (itimerfix(&atv))
+ return (EINVAL);
+ timo = tvtohz(&atv);
+ }
+
+ if (p->p_aioinfo == NULL)
+ aio_init_aioinfo(p);
+ ki = p->p_aioinfo;
+
+ error = 0;
+ cb = NULL;
+ AIO_LOCK(ki);
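+	/*
+	 * Block until some job reaches the done queue; a zero timeout
+	 * (no timespec supplied) sleeps indefinitely.  The first job
+	 * found is then reaped just as aio_return() would do.
+	 */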
+ while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
+ ki->kaio_flags |= KAIO_WAKEUP;
+ error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
+ "aiowc", timo);
+ if (timo && error == ERESTART)
+ error = EINTR;
+ if (error)
+ break;
+ }
+
+ if (cb != NULL) {
+ MPASS(cb->jobstate == JOBST_JOBFINISHED);
+ uuaiocb = cb->uuaiocb;
+ status = cb->uaiocb._aiocb_private.status;
+ error = cb->uaiocb._aiocb_private.error;
+ td->td_retval[0] = status;
+ if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
+ td->td_ru.ru_oublock += cb->outputcharge;
+ cb->outputcharge = 0;
+ } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
+ td->td_ru.ru_inblock += cb->inputcharge;
+ cb->inputcharge = 0;
+ }
+ aio_free_entry(cb);
+ AIO_UNLOCK(ki);
+ ops->store_aiocb(aiocbp, uuaiocb);
+ ops->store_error(uuaiocb, error);
+ ops->store_status(uuaiocb, status);
+ } else
+ AIO_UNLOCK(ki);
+
+ return (error);
+}
+
+int
+sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
+{
+ struct timespec ts, *tsp;
+ int error;
+
+ if (uap->timeout) {
+ /* Get timespec struct. */
+ error = copyin(uap->timeout, &ts, sizeof(ts));
+ if (error)
+ return (error);
+ tsp = &ts;
+ } else
+ tsp = NULL;
+
+ return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
+}
+
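+/*
+ * Queue an LIO_SYNC job for the descriptor named in the aiocb.  Only
+ * O_SYNC is accepted for now (see the XXX note about O_DSYNC below).
+ */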
+static int
+kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp,
+ struct aiocb_ops *ops)
+{
+ struct proc *p = td->td_proc;
+ struct kaioinfo *ki;
+
+ if (op != O_SYNC) /* XXX lack of O_DSYNC */
+ return (EINVAL);
+ ki = p->p_aioinfo;
+ if (ki == NULL)
+ aio_init_aioinfo(p);
+ return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops));
+}
+
+int
+sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
+{
+
+ return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
+}
+
+/* kqueue attach function */
+static int
+filt_aioattach(struct knote *kn)
+{
+ struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
+
+ /*
+ * The aiocbe pointer must be validated before using it, so
+ * registration is restricted to the kernel; the user cannot
+ * set EV_FLAG1.
+ */
+ if ((kn->kn_flags & EV_FLAG1) == 0)
+ return (EPERM);
+ kn->kn_ptr.p_aio = aiocbe;
+ kn->kn_flags &= ~EV_FLAG1;
+
+ knlist_add(&aiocbe->klist, kn, 0);
+
+ return (0);
+}
+
+/* kqueue detach function */
+static void
+filt_aiodetach(struct knote *kn)
+{
+ struct knlist *knl;
+
+ knl = &kn->kn_ptr.p_aio->klist;
+ knl->kl_lock(knl->kl_lockarg);
+ if (!knlist_empty(knl))
+ knlist_remove(knl, kn, 1);
+ knl->kl_unlock(knl->kl_lockarg);
+}
+
+/* kqueue filter function */
+/*ARGSUSED*/
+static int
+filt_aio(struct knote *kn, long hint)
+{
+ struct aiocblist *aiocbe = kn->kn_ptr.p_aio;
+
+ kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
+ if (aiocbe->jobstate != JOBST_JOBFINISHED)
+ return (0);
+ kn->kn_flags |= EV_EOF;
+ return (1);
+}
+
+/* kqueue attach function */
+static int
+filt_lioattach(struct knote *kn)
+{
+ struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata;
+
+ /*
+ * The aioliojob pointer must be validated before using it, so
+ * registration is restricted to the kernel; the user cannot
+ * set EV_FLAG1.
+ */
+ if ((kn->kn_flags & EV_FLAG1) == 0)
+ return (EPERM);
+ kn->kn_ptr.p_lio = lj;
+ kn->kn_flags &= ~EV_FLAG1;
+
+ knlist_add(&lj->klist, kn, 0);
+
+ return (0);
+}
+
+/* kqueue detach function */
+static void
+filt_liodetach(struct knote *kn)
+{
+ struct knlist *knl;
+
+ knl = &kn->kn_ptr.p_lio->klist;
+ knl->kl_lock(knl->kl_lockarg);
+ if (!knlist_empty(knl))
+ knlist_remove(knl, kn, 1);
+ knl->kl_unlock(knl->kl_lockarg);
+}
+
+/* kqueue filter function */
+/*ARGSUSED*/
+static int
+filt_lio(struct knote *kn, long hint)
+{
+ struct aioliojob * lj = kn->kn_ptr.p_lio;
+
+ return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
+}
+
+#ifdef COMPAT_FREEBSD32
+
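+/*
+ * 32-bit layouts of the aiocb structures.  Pointers are carried as
+ * uint32_t and converted with PTRIN()/PTRIN_CP() when requests from
+ * 32-bit processes are copied in.
+ */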
+struct __aiocb_private32 {
+ int32_t status;
+ int32_t error;
+ uint32_t kernelinfo;
+};
+
+typedef struct oaiocb32 {
+ int aio_fildes; /* File descriptor */
+ uint64_t aio_offset __packed; /* File offset for I/O */
+ uint32_t aio_buf; /* I/O buffer in process space */
+ uint32_t aio_nbytes; /* Number of bytes for I/O */
+ struct osigevent32 aio_sigevent; /* Signal to deliver */
+ int aio_lio_opcode; /* LIO opcode */
+ int aio_reqprio; /* Request priority -- ignored */
+ struct __aiocb_private32 _aiocb_private;
+} oaiocb32_t;
+
+typedef struct aiocb32 {
+ int32_t aio_fildes; /* File descriptor */
+ uint64_t aio_offset __packed; /* File offset for I/O */
+ uint32_t aio_buf; /* I/O buffer in process space */
+ uint32_t aio_nbytes; /* Number of bytes for I/O */
+ int __spare__[2];
+ uint32_t __spare2__;
+ int aio_lio_opcode; /* LIO opcode */
+ int aio_reqprio; /* Request priority -- ignored */
+ struct __aiocb_private32 _aiocb_private;
+ struct sigevent32 aio_sigevent; /* Signal to deliver */
+} aiocb32_t;
+
+static int
+convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
+{
+
+ /*
+ * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
+ * supported by AIO with the old sigevent structure.
+ */
+ CP(*osig, *nsig, sigev_notify);
+ switch (nsig->sigev_notify) {
+ case SIGEV_NONE:
+ break;
+ case SIGEV_SIGNAL:
+ nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
+ break;
+ case SIGEV_KEVENT:
+ nsig->sigev_notify_kqueue =
+ osig->__sigev_u.__sigev_notify_kqueue;
+ PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static int
+aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
+{
+ struct oaiocb32 job32;
+ int error;
+
+ bzero(kjob, sizeof(struct aiocb));
+ error = copyin(ujob, &job32, sizeof(job32));
+ if (error)
+ return (error);
+
+ CP(job32, *kjob, aio_fildes);
+ CP(job32, *kjob, aio_offset);
+ PTRIN_CP(job32, *kjob, aio_buf);
+ CP(job32, *kjob, aio_nbytes);
+ CP(job32, *kjob, aio_lio_opcode);
+ CP(job32, *kjob, aio_reqprio);
+ CP(job32, *kjob, _aiocb_private.status);
+ CP(job32, *kjob, _aiocb_private.error);
+ PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
+ return (convert_old_sigevent32(&job32.aio_sigevent,
+ &kjob->aio_sigevent));
+}
+
+static int
+aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
+{
+ struct aiocb32 job32;
+ int error;
+
+ error = copyin(ujob, &job32, sizeof(job32));
+ if (error)
+ return (error);
+ CP(job32, *kjob, aio_fildes);
+ CP(job32, *kjob, aio_offset);
+ PTRIN_CP(job32, *kjob, aio_buf);
+ CP(job32, *kjob, aio_nbytes);
+ CP(job32, *kjob, aio_lio_opcode);
+ CP(job32, *kjob, aio_reqprio);
+ CP(job32, *kjob, _aiocb_private.status);
+ CP(job32, *kjob, _aiocb_private.error);
+ PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
+ return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
+}
+
+static long
+aiocb32_fetch_status(struct aiocb *ujob)
+{
+ struct aiocb32 *ujob32;
+
+ ujob32 = (struct aiocb32 *)ujob;
+ return (fuword32(&ujob32->_aiocb_private.status));
+}
+
+static long
+aiocb32_fetch_error(struct aiocb *ujob)
+{
+ struct aiocb32 *ujob32;
+
+ ujob32 = (struct aiocb32 *)ujob;
+ return (fuword32(&ujob32->_aiocb_private.error));
+}
+
+static int
+aiocb32_store_status(struct aiocb *ujob, long status)
+{
+ struct aiocb32 *ujob32;
+
+ ujob32 = (struct aiocb32 *)ujob;
+ return (suword32(&ujob32->_aiocb_private.status, status));
+}
+
+static int
+aiocb32_store_error(struct aiocb *ujob, long error)
+{
+ struct aiocb32 *ujob32;
+
+ ujob32 = (struct aiocb32 *)ujob;
+ return (suword32(&ujob32->_aiocb_private.error, error));
+}
+
+static int
+aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
+{
+ struct aiocb32 *ujob32;
+
+ ujob32 = (struct aiocb32 *)ujob;
+ return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
+}
+
+static int
+aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
+{
+
+ return (suword32(ujobp, (long)ujob));
+}
+
+static struct aiocb_ops aiocb32_ops = {
+ .copyin = aiocb32_copyin,
+ .fetch_status = aiocb32_fetch_status,
+ .fetch_error = aiocb32_fetch_error,
+ .store_status = aiocb32_store_status,
+ .store_error = aiocb32_store_error,
+ .store_kernelinfo = aiocb32_store_kernelinfo,
+ .store_aiocb = aiocb32_store_aiocb,
+};
+
+static struct aiocb_ops aiocb32_ops_osigevent = {
+ .copyin = aiocb32_copyin_old_sigevent,
+ .fetch_status = aiocb32_fetch_status,
+ .fetch_error = aiocb32_fetch_error,
+ .store_status = aiocb32_store_status,
+ .store_error = aiocb32_store_error,
+ .store_kernelinfo = aiocb32_store_kernelinfo,
+ .store_aiocb = aiocb32_store_aiocb,
+};
+
+int
+freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
+{
+
+ return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
+}
+
+int
+freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
+{
+ struct timespec32 ts32;
+ struct timespec ts, *tsp;
+ struct aiocb **ujoblist;
+ uint32_t *ujoblist32;
+ int error, i;
+
+ if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
+ return (EINVAL);
+
+ if (uap->timeout) {
+ /* Get timespec struct. */
+ if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
+ return (error);
+ CP(ts32, ts, tv_sec);
+ CP(ts32, ts, tv_nsec);
+ tsp = &ts;
+ } else
+ tsp = NULL;
+
+ ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
+ ujoblist32 = (uint32_t *)ujoblist;
+ error = copyin(uap->aiocbp, ujoblist32, uap->nent *
+ sizeof(ujoblist32[0]));
+ if (error == 0) {
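+		/*
+		 * Expand the 32-bit pointers in place into the 64-bit
+		 * array; walking from the highest index downward keeps
+		 * each source element intact until it has been read.
+		 */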
+		for (i = uap->nent - 1; i >= 0; i--)
+			ujoblist[i] = PTRIN(ujoblist32[i]);
+
+ error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
+ }
+ uma_zfree(aiol_zone, ujoblist);
+ return (error);
+}
+
+int
+freebsd32_aio_cancel(struct thread *td, struct freebsd32_aio_cancel_args *uap)
+{
+
+ return (sys_aio_cancel(td, (struct aio_cancel_args *)uap));
+}
+
+int
+freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
+{
+
+ return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
+}
+
+int
+freebsd32_oaio_read(struct thread *td, struct freebsd32_oaio_read_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
+ &aiocb32_ops_osigevent));
+}
+
+int
+freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
+ &aiocb32_ops));
+}
+
+int
+freebsd32_oaio_write(struct thread *td, struct freebsd32_oaio_write_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
+ &aiocb32_ops_osigevent));
+}
+
+int
+freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
+ &aiocb32_ops));
+}
+
+int
+freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
+ &aiocb32_ops));
+}
+
+int
+freebsd32_aio_waitcomplete(struct thread *td,
+ struct freebsd32_aio_waitcomplete_args *uap)
+{
+ struct timespec32 ts32;
+ struct timespec ts, *tsp;
+ int error;
+
+ if (uap->timeout) {
+ /* Get timespec struct. */
+ error = copyin(uap->timeout, &ts32, sizeof(ts32));
+ if (error)
+ return (error);
+ CP(ts32, ts, tv_sec);
+ CP(ts32, ts, tv_nsec);
+ tsp = &ts;
+ } else
+ tsp = NULL;
+
+ return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
+ &aiocb32_ops));
+}
+
+int
+freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
+{
+
+ return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
+ &aiocb32_ops));
+}
+
+int
+freebsd32_olio_listio(struct thread *td, struct freebsd32_olio_listio_args *uap)
+{
+ struct aiocb **acb_list;
+ struct sigevent *sigp, sig;
+ struct osigevent32 osig;
+ uint32_t *acb_list32;
+ int error, i, nent;
+
+ if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
+ return (EINVAL);
+
+ nent = uap->nent;
+ if (nent < 0 || nent > AIO_LISTIO_MAX)
+ return (EINVAL);
+
+ if (uap->sig && (uap->mode == LIO_NOWAIT)) {
+ error = copyin(uap->sig, &osig, sizeof(osig));
+ if (error)
+ return (error);
+ error = convert_old_sigevent32(&osig, &sig);
+ if (error)
+ return (error);
+ sigp = &sig;
+ } else
+ sigp = NULL;
+
+ acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
+ error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
+ if (error) {
+ free(acb_list32, M_LIO);
+ return (error);
+ }
+ acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
+ for (i = 0; i < nent; i++)
+ acb_list[i] = PTRIN(acb_list32[i]);
+ free(acb_list32, M_LIO);
+
+ error = kern_lio_listio(td, uap->mode,
+ (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
+ &aiocb32_ops_osigevent);
+ free(acb_list, M_LIO);
+ return (error);
+}
+
+int
+freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
+{
+ struct aiocb **acb_list;
+ struct sigevent *sigp, sig;
+ struct sigevent32 sig32;
+ uint32_t *acb_list32;
+ int error, i, nent;
+
+ if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
+ return (EINVAL);
+
+ nent = uap->nent;
+ if (nent < 0 || nent > AIO_LISTIO_MAX)
+ return (EINVAL);
+
+ if (uap->sig && (uap->mode == LIO_NOWAIT)) {
+ error = copyin(uap->sig, &sig32, sizeof(sig32));
+ if (error)
+ return (error);
+ error = convert_sigevent32(&sig32, &sig);
+ if (error)
+ return (error);
+ sigp = &sig;
+ } else
+ sigp = NULL;
+
+ acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
+ error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
+ if (error) {
+ free(acb_list32, M_LIO);
+ return (error);
+ }
+ acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
+ for (i = 0; i < nent; i++)
+ acb_list[i] = PTRIN(acb_list32[i]);
+ free(acb_list32, M_LIO);
+
+ error = kern_lio_listio(td, uap->mode,
+ (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
+ &aiocb32_ops);
+ free(acb_list, M_LIO);
+ return (error);
+}
+
+#endif
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
new file mode 100644
index 0000000..ea8a002
--- /dev/null
+++ b/sys/kern/vfs_bio.c
@@ -0,0 +1,4602 @@
+/*-
+ * Copyright (c) 2004 Poul-Henning Kamp
+ * Copyright (c) 1994,1997 John S. Dyson
+ * Copyright (c) 2013 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This file contains a new buffer I/O scheme implementing a coherent
+ * VM object and buffer cache scheme.  Pains have been taken to make
+ * sure that the performance degradation associated with schemes such
+ * as this is not realized.
+ *
+ * Author: John S. Dyson
+ * Significant help during the development and debugging phases
+ * was provided by David Greenman, also of the FreeBSD core team.
+ *
+ * See buf(9) for more information.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/conf.h>
+#include <sys/buf.h>
+#include <sys/devicestat.h>
+#include <sys/eventhandler.h>
+#include <sys/fail.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sysctl.h>
+#include <sys/vmem.h>
+#include <sys/vmmeter.h>
+#include <sys/vnode.h>
+#include <geom/geom.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include "opt_compat.h"
+#include "opt_directio.h"
+#include "opt_swap.h"
+
+static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
+
+struct bio_ops bioops; /* I/O operation notification */
+
+struct buf_ops buf_ops_bio = {
+ .bop_name = "buf_ops_bio",
+ .bop_write = bufwrite,
+ .bop_strategy = bufstrategy,
+ .bop_sync = bufsync,
+ .bop_bdflush = bufbdflush,
+};
+
+/*
+ * XXX buf is global because kern_shutdown.c and ffs_checkoverlap have
+ * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c.
+ */
+struct buf *buf; /* buffer header pool */
+caddr_t unmapped_buf;
+
+static struct proc *bufdaemonproc;
+
+static int inmem(struct vnode *vp, daddr_t blkno);
+static void vm_hold_free_pages(struct buf *bp, int newbsize);
+static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
+ vm_offset_t to);
+static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
+static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
+ vm_page_t m);
+static void vfs_clean_pages_dirty_buf(struct buf *bp);
+static void vfs_setdirty_locked_object(struct buf *bp);
+static void vfs_vmio_release(struct buf *bp);
+static int vfs_bio_clcheck(struct vnode *vp, int size,
+ daddr_t lblkno, daddr_t blkno);
+static int buf_flush(struct vnode *vp, int);
+static int flushbufqueues(struct vnode *, int, int);
+static void buf_daemon(void);
+static void bremfreel(struct buf *bp);
+static __inline void bd_wakeup(void);
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
+#endif
+
+int vmiodirenable = TRUE;
+SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
+ "Use the VM system for directory writes");
+long runningbufspace;
+SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
+ "Amount of presently outstanding async buffer io");
+static long bufspace;
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
+SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
+ &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
+#else
+SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
+ "Virtual memory used for buffers");
+#endif
+static long unmapped_bufspace;
+SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD,
+ &unmapped_bufspace, 0,
+    "Amount of unmapped buffer space, included in bufspace");
+static long maxbufspace;
+SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
+ "Maximum allowed value of bufspace (including buf_daemon)");
+static long bufmallocspace;
+SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
+ "Amount of malloced memory for buffers");
+static long maxbufmallocspace;
+SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
+ "Maximum amount of malloced memory for buffers");
+static long lobufspace;
+SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
+ "Minimum amount of buffers we want to have");
+long hibufspace;
+SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
+ "Maximum allowed value of bufspace (excluding buf_daemon)");
+static int bufreusecnt;
+SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
+ "Number of times we have reused a buffer");
+static int buffreekvacnt;
+SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
+ "Number of times we have freed the KVA space from some buffer");
+static int bufdefragcnt;
+SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
+ "Number of times we have had to repeat buffer allocation to defragment");
+static long lorunningspace;
+SYSCTL_LONG(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
+ "Minimum preferred space used for in-progress I/O");
+static long hirunningspace;
+SYSCTL_LONG(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
+ "Maximum amount of space to use for in-progress I/O");
+int dirtybufferflushes;
+SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
+ 0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
+int bdwriteskip;
+SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
+ 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
+int altbufferflushes;
+SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
+ 0, "Number of fsync flushes to limit dirty buffers");
+static int recursiveflushes;
+SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
+ 0, "Number of flushes skipped due to being recursive");
+static int numdirtybuffers;
+SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
+    "Number of buffers that are dirty (have unwritten changes) at the moment");
+static int lodirtybuffers;
+SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
+ "How many buffers we want to have free before bufdaemon can sleep");
+static int hidirtybuffers;
+SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
+ "When the number of dirty buffers is considered severe");
+int dirtybufthresh;
+SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
+ 0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
+static int numfreebuffers;
+SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
+ "Number of free buffers");
+static int lofreebuffers;
+SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
+ "XXX Unused");
+static int hifreebuffers;
+SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
+ "XXX Complicatedly unused");
+static int getnewbufcalls;
+SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
+ "Number of calls to getnewbuf");
+static int getnewbufrestarts;
+SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
+    "Number of times getnewbuf has had to restart a buffer acquisition");
+static int mappingrestarts;
+SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
+ "Number of times getblk has had to restart a buffer mapping for "
+ "unmapped buffer");
+static int flushbufqtarget = 100;
+SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
+ "Amount of work to do in flushbufqueues when helping bufdaemon");
+static long notbufdflushes;
+SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
+ "Number of dirty buffer flushes done by the bufdaemon helpers");
+static long barrierwrites;
+SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
+ "Number of barrier writes");
+SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
+ &unmapped_buf_allowed, 0,
+    "Permit the use of unmapped I/O");
+
+/*
+ * Lock for the non-dirty bufqueues
+ */
+static struct mtx_padalign bqclean;
+
+/*
+ * Lock for the dirty queue.
+ */
+static struct mtx_padalign bqdirty;
+
+/*
+ * This lock synchronizes access to bd_request.
+ */
+static struct mtx_padalign bdlock;
+
+/*
+ * This lock protects the runningbufreq and synchronizes runningbufwakeup and
+ * waitrunningbufspace().
+ */
+static struct mtx_padalign rbreqlock;
+
+/*
+ * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
+ */
+static struct mtx_padalign nblock;
+
+/*
+ * Lock that protects bdirtywait.
+ */
+static struct mtx_padalign bdirtylock;
+
+/*
+ * Wakeup point for bufdaemon, as well as indicator of whether it is already
+ * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it
+ * is idling.
+ */
+static int bd_request;
+
+/*
+ * Request for the buf daemon to write more buffers than is indicated by
+ * lodirtybuffers.  This may be necessary to push out excess dependencies or
+ * defragment the address space where a simple count of the number of dirty
+ * buffers is insufficient to characterize the demand for flushing them.
+ */
+static int bd_speedupreq;
+
+/*
+ * Bogus page -- for I/O to/from partially complete buffers.
+ * This is a temporary solution to the problem, but it is not
+ * really that bad.  It would be better to split the buffer
+ * for input in the case of buffers partially already in memory,
+ * but the code is intricate enough already.
+ */
+vm_page_t bogus_page;
+
+/*
+ * Synchronization (sleep/wakeup) variable for active buffer space requests.
+ * Set when wait starts, cleared prior to wakeup().
+ * Used in runningbufwakeup() and waitrunningbufspace().
+ */
+static int runningbufreq;
+
+/*
+ * Synchronization (sleep/wakeup) variable for buffer requests.
+ * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
+ * by and/or.
+ * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
+ * getnewbuf(), and getblk().
+ */
+static int needsbuffer;
+
+/*
+ * Synchronization for bwillwrite() waiters.
+ */
+static int bdirtywait;
+
+/*
+ * Definitions for the buffer free lists.
+ */
+#define BUFFER_QUEUES 5 /* number of free buffer queues */
+
+#define QUEUE_NONE 0 /* on no queue */
+#define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */
+#define QUEUE_DIRTY 2 /* B_DELWRI buffers */
+#define QUEUE_EMPTYKVA 3 /* empty buffer headers w/KVA assignment */
+#define QUEUE_EMPTY 4 /* empty buffer headers */
+#define QUEUE_SENTINEL	1024	/* not a queue index, but a sentinel marker */
+
+/* Queues for free buffers with various properties */
+static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
+#ifdef INVARIANTS
+static int bq_len[BUFFER_QUEUES];
+#endif
+
+/*
+ * Single global constant for BUF_WMESG, to avoid getting multiple references.
+ * buf_wmesg is referenced from macros.
+ */
+const char *buf_wmesg = BUF_WMESG;
+
+#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
+#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
+#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
+
+#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
+ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
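+/*
+ * Compatibility shim: older binaries expect vfs.bufspace to be an int.
+ * Hand back an int when the caller's buffer is too small for a long,
+ * unless the value no longer fits in an int, in which case a long is
+ * written anyway to trigger the usual ENOMEM from the sysctl code.
+ */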
+static int
+sysctl_bufspace(SYSCTL_HANDLER_ARGS)
+{
+ long lvalue;
+ int ivalue;
+
+ if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
+ return (sysctl_handle_long(oidp, arg1, arg2, req));
+ lvalue = *(long *)arg1;
+ if (lvalue > INT_MAX)
+ /* On overflow, still write out a long to trigger ENOMEM. */
+ return (sysctl_handle_long(oidp, &lvalue, 0, req));
+ ivalue = lvalue;
+ return (sysctl_handle_int(oidp, &ivalue, 0, req));
+}
+#endif
+
+#ifdef DIRECTIO
+extern void ffs_rawread_setup(void);
+#endif /* DIRECTIO */
+
+/*
+ * bqlock:
+ *
+ * Return the appropriate queue lock based on the index.
+ */
+static inline struct mtx *
+bqlock(int qindex)
+{
+
+ if (qindex == QUEUE_DIRTY)
+ return (struct mtx *)(&bqdirty);
+ return (struct mtx *)(&bqclean);
+}
+
+/*
+ * bdirtywakeup:
+ *
+ * Wakeup any bwillwrite() waiters.
+ */
+static void
+bdirtywakeup(void)
+{
+ mtx_lock(&bdirtylock);
+ if (bdirtywait) {
+ bdirtywait = 0;
+ wakeup(&bdirtywait);
+ }
+ mtx_unlock(&bdirtylock);
+}
+
+/*
+ * bdirtysub:
+ *
+ * Decrement the numdirtybuffers count by one and wakeup any
+ * threads blocked in bwillwrite().
+ */
+static void
+bdirtysub(void)
+{
+
+ if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
+ (lodirtybuffers + hidirtybuffers) / 2)
+ bdirtywakeup();
+}
+
+/*
+ * bdirtyadd:
+ *
+ * Increment the numdirtybuffers count by one and wakeup the buf
+ * daemon if needed.
+ */
+static void
+bdirtyadd(void)
+{
+
+ /*
+ * Only do the wakeup once as we cross the boundary. The
+ * buf daemon will keep running until the condition clears.
+ */
+ if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
+ (lodirtybuffers + hidirtybuffers) / 2)
+ bd_wakeup();
+}
+
+/*
+ * bufspacewakeup:
+ *
+ * Called when buffer space is potentially available for recovery.
+ * getnewbuf() will block on this flag when it is unable to free
+ * sufficient buffer space. Buffer space becomes recoverable when
+ * bp's get placed back in the queues.
+ */
+
+static __inline void
+bufspacewakeup(void)
+{
+
+ /*
+ * If someone is waiting for BUF space, wake them up. Even
+ * though we haven't freed the kva space yet, the waiting
+ * process will be able to now.
+ */
+ mtx_lock(&nblock);
+ if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
+ needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
+ wakeup(&needsbuffer);
+ }
+ mtx_unlock(&nblock);
+}
+
+/*
+ * runningwakeup:
+ *
+ * Wake up processes that are waiting on asynchronous writes to fall
+ * below lorunningspace.
+ */
+static void
+runningwakeup(void)
+{
+
+ mtx_lock(&rbreqlock);
+ if (runningbufreq) {
+ runningbufreq = 0;
+ wakeup(&runningbufreq);
+ }
+ mtx_unlock(&rbreqlock);
+}
+
+/*
+ * runningbufwakeup:
+ *
+ * Decrement the outstanding write count accordingly.
+ */
+void
+runningbufwakeup(struct buf *bp)
+{
+ long space, bspace;
+
+ bspace = bp->b_runningbufspace;
+ if (bspace == 0)
+ return;
+ space = atomic_fetchadd_long(&runningbufspace, -bspace);
+ KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
+ space, bspace));
+ bp->b_runningbufspace = 0;
+ /*
+ * Only acquire the lock and wakeup on the transition from exceeding
+ * the threshold to falling below it.
+ */
+ if (space < lorunningspace)
+ return;
+ if (space - bspace > lorunningspace)
+ return;
+ runningwakeup();
+}
+
+/*
+ * bufcountadd:
+ *
+ * Called when a buffer has been added to one of the free queues to
+ * account for the buffer and to wakeup anyone waiting for free buffers.
+ * This typically occurs when large amounts of metadata are being handled
+ * by the buffer cache (else buffer space runs out first, usually).
+ */
+static __inline void
+bufcountadd(struct buf *bp)
+{
+ int old;
+
+ KASSERT((bp->b_flags & B_INFREECNT) == 0,
+ ("buf %p already counted as free", bp));
+ bp->b_flags |= B_INFREECNT;
+ old = atomic_fetchadd_int(&numfreebuffers, 1);
+ KASSERT(old >= 0 && old < nbuf,
+ ("numfreebuffers climbed to %d", old + 1));
+ mtx_lock(&nblock);
+ if (needsbuffer) {
+ needsbuffer &= ~VFS_BIO_NEED_ANY;
+ if (numfreebuffers >= hifreebuffers)
+ needsbuffer &= ~VFS_BIO_NEED_FREE;
+ wakeup(&needsbuffer);
+ }
+ mtx_unlock(&nblock);
+}
+
+/*
+ * bufcountsub:
+ *
+ * Decrement the numfreebuffers count as needed.
+ */
+static void
+bufcountsub(struct buf *bp)
+{
+ int old;
+
+ /*
+ * Fixup numfreebuffers count. If the buffer is invalid or not
+ * delayed-write, the buffer was free and we must decrement
+ * numfreebuffers.
+ */
+ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
+ KASSERT((bp->b_flags & B_INFREECNT) != 0,
+ ("buf %p not counted in numfreebuffers", bp));
+ bp->b_flags &= ~B_INFREECNT;
+ old = atomic_fetchadd_int(&numfreebuffers, -1);
+ KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
+ }
+}
+
+/*
+ * waitrunningbufspace()
+ *
+ * runningbufspace is a measure of the amount of I/O currently
+ * running. This routine is used in async-write situations to
+ * prevent creating huge backups of pending writes to a device.
+ * Only asynchronous writes are governed by this function.
+ *
+ * This does NOT turn an async write into a sync write. It waits
+ * for earlier writes to complete and generally returns before the
+ * caller's write has reached the device.
+ */
+void
+waitrunningbufspace(void)
+{
+
+ mtx_lock(&rbreqlock);
+ while (runningbufspace > hirunningspace) {
+ runningbufreq = 1;
+ msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
+ }
+ mtx_unlock(&rbreqlock);
+}
+
+
+/*
+ * vfs_buf_test_cache:
+ *
+ * Called when a buffer is extended. This function clears the B_CACHE
+ * bit if the newly extended portion of the buffer does not contain
+ * valid data.
+ */
+static __inline
+void
+vfs_buf_test_cache(struct buf *bp,
+ vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
+ vm_page_t m)
+{
+
+ VM_OBJECT_ASSERT_LOCKED(m->object);
+ if (bp->b_flags & B_CACHE) {
+ int base = (foff + off) & PAGE_MASK;
+ if (vm_page_is_valid(m, base, size) == 0)
+ bp->b_flags &= ~B_CACHE;
+ }
+}
+
+/* Wake up the buffer daemon if necessary */
+static __inline void
+bd_wakeup(void)
+{
+
+ mtx_lock(&bdlock);
+ if (bd_request == 0) {
+ bd_request = 1;
+ wakeup(&bd_request);
+ }
+ mtx_unlock(&bdlock);
+}
+
+/*
+ * bd_speedup - speedup the buffer cache flushing code
+ */
+void
+bd_speedup(void)
+{
+ int needwake;
+
+ mtx_lock(&bdlock);
+ needwake = 0;
+ if (bd_speedupreq == 0 || bd_request == 0)
+ needwake = 1;
+ bd_speedupreq = 1;
+ bd_request = 1;
+ if (needwake)
+ wakeup(&bd_request);
+ mtx_unlock(&bdlock);
+}
+
+#ifdef __i386__
+#define TRANSIENT_DENOM 5
+#else
+#define TRANSIENT_DENOM 10
+#endif
+
+/*
+ * Calculate buffer cache scaling values and reserve space for buffer
+ * headers.  This is called during low-level kernel initialization and
+ * may be called more than once.  We CANNOT write to the memory area
+ * being reserved at this time.
+ */
+caddr_t
+kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
+{
+ int tuned_nbuf;
+ long maxbuf, maxbuf_sz, buf_sz, biotmap_sz;
+
+ /*
+ * physmem_est is in pages. Convert it to kilobytes (assumes
+ * PAGE_SIZE is >= 1K)
+ */
+ physmem_est = physmem_est * (PAGE_SIZE / 1024);
+
+ /*
+ * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
+ * For the first 64MB of ram nominally allocate sufficient buffers to
+ * cover 1/4 of our ram. Beyond the first 64MB allocate additional
+ * buffers to cover 1/10 of our ram over 64MB. When auto-sizing
+ * the buffer cache we limit the eventual kva reservation to
+ * maxbcache bytes.
+ *
+ * factor represents the 1/4 x ram conversion.
+ */
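+	/*
+	 * Rough example (assuming the common BKVASIZE of 16 KiB, so
+	 * factor == 64): with physmem_est of 128 MiB (131072 KiB) the
+	 * first term contributes min((131072 - 4096) / 64, 65536 / 64)
+	 * = 1024 buffers and the second min(65536 * 2 / 320, ...) = 409,
+	 * giving nbuf of roughly 50 + 1024 + 409 = 1483.
+	 */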
+ if (nbuf == 0) {
+ int factor = 4 * BKVASIZE / 1024;
+
+ nbuf = 50;
+ if (physmem_est > 4096)
+ nbuf += min((physmem_est - 4096) / factor,
+ 65536 / factor);
+ if (physmem_est > 65536)
+ nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
+ 32 * 1024 * 1024 / (factor * 5));
+
+ if (maxbcache && nbuf > maxbcache / BKVASIZE)
+ nbuf = maxbcache / BKVASIZE;
+ tuned_nbuf = 1;
+ } else
+ tuned_nbuf = 0;
+
+ /* XXX Avoid unsigned long overflows later on with maxbufspace. */
+ maxbuf = (LONG_MAX / 3) / BKVASIZE;
+ if (nbuf > maxbuf) {
+ if (!tuned_nbuf)
+ printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
+ maxbuf);
+ nbuf = maxbuf;
+ }
+
+ /*
+	 * The ideal allocation size for the transient bio submap is 10%
+	 * of the maximal space buffer map.  This roughly corresponds
+	 * to the amount of the buffer mapped for typical UFS load.
+	 *
+	 * Clip the buffer map to reserve space for the transient
+	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
+	 * maximum buffer map extent on the platform.
+	 *
+	 * Falling back to maxbuf when maxbcache is unset avoids
+	 * trimming the buffer KVA on architectures with ample KVA
+	 * space.
+ */
+ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
+ maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
+ buf_sz = (long)nbuf * BKVASIZE;
+ if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
+ (TRANSIENT_DENOM - 1)) {
+ /*
+ * There is more KVA than memory. Do not
+ * adjust buffer map size, and assign the rest
+ * of maxbuf to transient map.
+ */
+ biotmap_sz = maxbuf_sz - buf_sz;
+ } else {
+ /*
+ * Buffer map spans all KVA we could afford on
+ * this platform. Give 10% (20% on i386) of
+ * the buffer map to the transient bio map.
+ */
+ biotmap_sz = buf_sz / TRANSIENT_DENOM;
+ buf_sz -= biotmap_sz;
+ }
+ if (biotmap_sz / INT_MAX > MAXPHYS)
+ bio_transient_maxcnt = INT_MAX;
+ else
+ bio_transient_maxcnt = biotmap_sz / MAXPHYS;
+ /*
+		 * Artificially limit to 1024 simultaneous in-flight I/Os
+ * using the transient mapping.
+ */
+ if (bio_transient_maxcnt > 1024)
+ bio_transient_maxcnt = 1024;
+ if (tuned_nbuf)
+ nbuf = buf_sz / BKVASIZE;
+ }
+
+ /*
+ * swbufs are used as temporary holders for I/O, such as paging I/O.
+	 * We have no fewer than 16 and no more than 256.
+ */
+ nswbuf = max(min(nbuf/4, 256), 16);
+#ifdef NSWBUF_MIN
+ if (nswbuf < NSWBUF_MIN)
+ nswbuf = NSWBUF_MIN;
+#endif
+#ifdef DIRECTIO
+ ffs_rawread_setup();
+#endif
+
+ /*
+ * Reserve space for the buffer cache buffers
+ */
+ swbuf = (void *)v;
+ v = (caddr_t)(swbuf + nswbuf);
+ buf = (void *)v;
+ v = (caddr_t)(buf + nbuf);
+
+ return(v);
+}
+
+/* Initialize the buffer subsystem. Called before use of any buffers. */
+void
+bufinit(void)
+{
+ struct buf *bp;
+ int i;
+
+ mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
+ mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
+ mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
+ mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
+ mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
+ mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
+
+ /* next, make a null set of free lists */
+ for (i = 0; i < BUFFER_QUEUES; i++)
+ TAILQ_INIT(&bufqueues[i]);
+
+ /* finally, initialize each buffer header and stick on empty q */
+ for (i = 0; i < nbuf; i++) {
+ bp = &buf[i];
+ bzero(bp, sizeof *bp);
+ bp->b_flags = B_INVAL | B_INFREECNT;
+ bp->b_rcred = NOCRED;
+ bp->b_wcred = NOCRED;
+ bp->b_qindex = QUEUE_EMPTY;
+ bp->b_xflags = 0;
+ LIST_INIT(&bp->b_dep);
+ BUF_LOCKINIT(bp);
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
+#ifdef INVARIANTS
+ bq_len[QUEUE_EMPTY]++;
+#endif
+ }
+
+ /*
+ * maxbufspace is the absolute maximum amount of buffer space we are
+ * allowed to reserve in KVM and in real terms. The absolute maximum
+ * is nominally used by buf_daemon. hibufspace is the nominal maximum
+ * used by most other processes. The differential is required to
+ * ensure that buf_daemon is able to run when other processes might
+ * be blocked waiting for buffer space.
+ *
+	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
+ * this may result in KVM fragmentation which is not handled optimally
+ * by the system.
+ */
+ maxbufspace = (long)nbuf * BKVASIZE;
+ hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
+ lobufspace = hibufspace - MAXBSIZE;
+
+ /*
+ * Note: The 16 MiB upper limit for hirunningspace was chosen
+ * arbitrarily and may need further tuning. It corresponds to
+ * 128 outstanding write IO requests (if IO size is 128 KiB),
+ * which fits with many RAID controllers' tagged queuing limits.
+ * The lower 1 MiB limit is the historical upper limit for
+ * hirunningspace.
+ */
+ hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBSIZE),
+ 16 * 1024 * 1024), 1024 * 1024);
+ lorunningspace = roundup((hirunningspace * 2) / 3, MAXBSIZE);
+
+/*
+ * Limit the amount of malloc memory since it is wired permanently into
+ * the kernel space. Even though this is accounted for in the buffer
+ * allocation, we don't want the malloced region to grow uncontrolled.
+ * The malloc scheme improves memory utilization significantly for average
+ * (small) directories.
+ */
+ maxbufmallocspace = hibufspace / 20;
+
+/*
+ * Reduce the chance of a deadlock occurring by limiting the number
+ * of delayed-write dirty buffers we allow to stack up.
+ */
+ hidirtybuffers = nbuf / 4 + 20;
+ dirtybufthresh = hidirtybuffers * 9 / 10;
+ numdirtybuffers = 0;
+/*
+ * To support extreme low-memory systems, make sure hidirtybuffers cannot
+ * eat up all available buffer space. This occurs when our minimum cannot
+ * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming
+ * BKVASIZE'd buffers.
+ */
+ while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
+ hidirtybuffers >>= 1;
+ }
+ lodirtybuffers = hidirtybuffers / 2;
+
+/*
+ * Try to keep the number of free buffers in the specified range,
+ * and give special processes (e.g., buf_daemon) access to an
+ * emergency reserve.
+ */
+ lofreebuffers = nbuf / 18 + 5;
+ hifreebuffers = 2 * lofreebuffers;
+ numfreebuffers = nbuf;
+
+ bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
+ VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
+ unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
+}
+
+#ifdef INVARIANTS
+static inline void
+vfs_buf_check_mapped(struct buf *bp)
+{
+
+ KASSERT((bp->b_flags & B_UNMAPPED) == 0,
+ ("mapped buf %p %x", bp, bp->b_flags));
+ KASSERT(bp->b_kvabase != unmapped_buf,
+ ("mapped buf: b_kvabase was not updated %p", bp));
+ KASSERT(bp->b_data != unmapped_buf,
+ ("mapped buf: b_data was not updated %p", bp));
+}
+
+static inline void
+vfs_buf_check_unmapped(struct buf *bp)
+{
+
+ KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED,
+ ("unmapped buf %p %x", bp, bp->b_flags));
+ KASSERT(bp->b_kvabase == unmapped_buf,
+ ("unmapped buf: corrupted b_kvabase %p", bp));
+ KASSERT(bp->b_data == unmapped_buf,
+ ("unmapped buf: corrupted b_data %p", bp));
+}
+
+#define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
+#define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
+#else
+#define BUF_CHECK_MAPPED(bp) do {} while (0)
+#define BUF_CHECK_UNMAPPED(bp) do {} while (0)
+#endif
+
+static void
+bpmap_qenter(struct buf *bp)
+{
+
+ BUF_CHECK_MAPPED(bp);
+
+ /*
+ * bp->b_data is relative to bp->b_offset, but
+ * bp->b_offset may be offset into the first page.
+ */
+ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
+ pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
+ bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
+ (vm_offset_t)(bp->b_offset & PAGE_MASK));
+}
+
+/*
+ * bfreekva() - free the kva allocation for a buffer.
+ *
+ * Since this call frees up buffer space, we call bufspacewakeup().
+ */
+static void
+bfreekva(struct buf *bp)
+{
+
+ if (bp->b_kvasize == 0)
+ return;
+
+ atomic_add_int(&buffreekvacnt, 1);
+ atomic_subtract_long(&bufspace, bp->b_kvasize);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase,
+ bp->b_kvasize);
+ } else {
+ BUF_CHECK_UNMAPPED(bp);
+ if ((bp->b_flags & B_KVAALLOC) != 0) {
+ vmem_free(buffer_arena, (vm_offset_t)bp->b_kvaalloc,
+ bp->b_kvasize);
+ }
+ atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
+ bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
+ }
+ bp->b_kvasize = 0;
+ bufspacewakeup();
+}
+
+/*
+ * binsfree:
+ *
+ * Insert the buffer into the appropriate free list.
+ */
+static void
+binsfree(struct buf *bp, int qindex)
+{
+ struct mtx *olock, *nlock;
+
+ BUF_ASSERT_XLOCKED(bp);
+
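+	/*
+	 * Take the lock of the buffer's old queue first so that any
+	 * delayed bremfree() can be completed before the buffer is
+	 * moved to (and, if needed, relocked under) its new queue.
+	 */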
+ olock = bqlock(bp->b_qindex);
+ nlock = bqlock(qindex);
+ mtx_lock(olock);
+ /* Handle delayed bremfree() processing. */
+ if (bp->b_flags & B_REMFREE)
+ bremfreel(bp);
+
+ if (bp->b_qindex != QUEUE_NONE)
+ panic("binsfree: free buffer onto another queue???");
+
+ bp->b_qindex = qindex;
+ if (olock != nlock) {
+ mtx_unlock(olock);
+ mtx_lock(nlock);
+ }
+ if (bp->b_flags & B_AGE)
+ TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+ else
+ TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+ bq_len[bp->b_qindex]++;
+#endif
+ mtx_unlock(nlock);
+
+ /*
+ * Something we can maybe free or reuse.
+ */
+ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
+ bufspacewakeup();
+
+ if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
+ bufcountadd(bp);
+}
+
+/*
+ * bremfree:
+ *
+ * Mark the buffer for removal from the appropriate free list.
+ *
+ */
+void
+bremfree(struct buf *bp)
+{
+
+ CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ KASSERT((bp->b_flags & B_REMFREE) == 0,
+ ("bremfree: buffer %p already marked for delayed removal.", bp));
+ KASSERT(bp->b_qindex != QUEUE_NONE,
+ ("bremfree: buffer %p not on a queue.", bp));
+ BUF_ASSERT_XLOCKED(bp);
+
+ bp->b_flags |= B_REMFREE;
+ bufcountsub(bp);
+}
+
+/*
+ * bremfreef:
+ *
+ * Force an immediate removal from a free list. Used only in nfs when
+ * it abuses the b_freelist pointer.
+ */
+void
+bremfreef(struct buf *bp)
+{
+ struct mtx *qlock;
+
+ qlock = bqlock(bp->b_qindex);
+ mtx_lock(qlock);
+ bremfreel(bp);
+ mtx_unlock(qlock);
+}
+
+/*
+ * bremfreel:
+ *
+ * Removes a buffer from the free list, must be called with the
+ * correct qlock held.
+ */
+static void
+bremfreel(struct buf *bp)
+{
+
+ CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
+ bp, bp->b_vp, bp->b_flags);
+ KASSERT(bp->b_qindex != QUEUE_NONE,
+ ("bremfreel: buffer %p not on a queue.", bp));
+ BUF_ASSERT_XLOCKED(bp);
+ mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
+
+ TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+ KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
+ bp->b_qindex));
+ bq_len[bp->b_qindex]--;
+#endif
+ bp->b_qindex = QUEUE_NONE;
+ /*
+	 * If this was a delayed bremfree() we only need to remove the buffer
+	 * from the queue and return; the stats are already done.
+ */
+ if (bp->b_flags & B_REMFREE) {
+ bp->b_flags &= ~B_REMFREE;
+ return;
+ }
+ bufcountsub(bp);
+}
+
+/*
+ * Attempt to initiate asynchronous I/O on read-ahead blocks. We must
+ * clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE is set,
+ * the buffer is valid and we do not have to do anything.
+ */
+void
+breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
+ int cnt, struct ucred * cred)
+{
+ struct buf *rabp;
+ int i;
+
+ for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
+ if (inmem(vp, *rablkno))
+ continue;
+ rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
+
+ if ((rabp->b_flags & B_CACHE) == 0) {
+ if (!TD_IS_IDLETHREAD(curthread))
+ curthread->td_ru.ru_inblock++;
+ rabp->b_flags |= B_ASYNC;
+ rabp->b_flags &= ~B_INVAL;
+ rabp->b_ioflags &= ~BIO_ERROR;
+ rabp->b_iocmd = BIO_READ;
+ if (rabp->b_rcred == NOCRED && cred != NOCRED)
+ rabp->b_rcred = crhold(cred);
+ vfs_busy_pages(rabp, 0);
+ BUF_KERNPROC(rabp);
+ rabp->b_iooffset = dbtob(rabp->b_blkno);
+ bstrategy(rabp);
+ } else {
+ brelse(rabp);
+ }
+ }
+}
+
+/*
+ * Entry point for bread() and breadn() via #defines in sys/buf.h.
+ *
+ * Get a buffer with the specified data. Look in the cache first. We
+ * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE
+ * is set, the buffer is valid and we do not have to do anything, see
+ * getblk(). Also starts asynchronous I/O on read-ahead blocks.
+ */
+int
+breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
+ int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp)
+{
+ struct buf *bp;
+ int rv = 0, readwait = 0;
+
+ CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
+ /*
+ * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
+ */
+ *bpp = bp = getblk(vp, blkno, size, 0, 0, flags);
+ if (bp == NULL)
+ return (EBUSY);
+
+ /* if not found in cache, do some I/O */
+ if ((bp->b_flags & B_CACHE) == 0) {
+ if (!TD_IS_IDLETHREAD(curthread))
+ curthread->td_ru.ru_inblock++;
+ bp->b_iocmd = BIO_READ;
+ bp->b_flags &= ~B_INVAL;
+ bp->b_ioflags &= ~BIO_ERROR;
+ if (bp->b_rcred == NOCRED && cred != NOCRED)
+ bp->b_rcred = crhold(cred);
+ vfs_busy_pages(bp, 0);
+ bp->b_iooffset = dbtob(bp->b_blkno);
+ bstrategy(bp);
+ ++readwait;
+ }
+
+ breada(vp, rablkno, rabsize, cnt, cred);
+
+ if (readwait) {
+ rv = bufwait(bp);
+ }
+ return (rv);
+}
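+
+/*
+ * A minimal caller-side sketch of the bread() wrapper from sys/buf.h
+ * (illustrative only; "lbn" and "bsize" stand for values the caller
+ * already has):
+ *
+ *	error = bread(vp, lbn, bsize, NOCRED, &bp);
+ *	if (error != 0) {
+ *		brelse(bp);
+ *		return (error);
+ *	}
+ *	(consume bp->b_data, then give the buffer back)
+ *	bqrelse(bp);
+ */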
+
+/*
+ * Write, release buffer on completion. (Done by iodone
+ * if async). Do not bother writing anything if the buffer
+ * is invalid.
+ *
+ * Note that we set B_CACHE here, indicating that the buffer is
+ * fully valid and thus cacheable. This is true even of NFS
+ * now so we set it generally. This could be set either here
+ * or in biodone() since the I/O is synchronous. We put it
+ * here.
+ */
+int
+bufwrite(struct buf *bp)
+{
+ int oldflags;
+ struct vnode *vp;
+ long space;
+ int vp_md;
+
+ CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return (0);
+ }
+
+ if (bp->b_flags & B_BARRIER)
+ barrierwrites++;
+
+ oldflags = bp->b_flags;
+
+ BUF_ASSERT_HELD(bp);
+
+ if (bp->b_pin_count > 0)
+ bunpin_wait(bp);
+
+ KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
+ ("FFS background buffer should not get here %p", bp));
+
+ vp = bp->b_vp;
+ if (vp)
+ vp_md = vp->v_vflag & VV_MD;
+ else
+ vp_md = 0;
+
+ /*
+ * Mark the buffer clean. Increment the bufobj write count
+ * before bundirty() call, to prevent other thread from seeing
+ * empty dirty list and zero counter for writes in progress,
+ * falsely indicating that the bufobj is clean.
+ */
+ bufobj_wref(bp->b_bufobj);
+ bundirty(bp);
+
+ bp->b_flags &= ~B_DONE;
+ bp->b_ioflags &= ~BIO_ERROR;
+ bp->b_flags |= B_CACHE;
+ bp->b_iocmd = BIO_WRITE;
+
+ vfs_busy_pages(bp, 1);
+
+ /*
+ * Normal bwrites pipeline writes
+ */
+ bp->b_runningbufspace = bp->b_bufsize;
+ space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
+
+ if (!TD_IS_IDLETHREAD(curthread))
+ curthread->td_ru.ru_oublock++;
+ if (oldflags & B_ASYNC)
+ BUF_KERNPROC(bp);
+ bp->b_iooffset = dbtob(bp->b_blkno);
+ bstrategy(bp);
+
+ if ((oldflags & B_ASYNC) == 0) {
+ int rtval = bufwait(bp);
+ brelse(bp);
+ return (rtval);
+ } else if (space > hirunningspace) {
+ /*
+ * don't allow the async write to saturate the I/O
+ * system. We will not deadlock here because
+ * we are blocking waiting for I/O that is already in-progress
+ * to complete. We do not block here if it is the update
+ * or syncer daemon trying to clean up as that can lead
+ * to deadlock.
+ */
+ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
+ waitrunningbufspace();
+ }
+
+ return (0);
+}
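+
+/*
+ * A sketch of the synchronous write path as seen by a caller such as
+ * flushbufqueues() below (illustrative; "bp" is assumed locked and
+ * still on a queue):
+ *
+ *	bremfree(bp);
+ *	error = bwrite(bp);	(waits via bufwait() and releases bp)
+ */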
+
+void
+bufbdflush(struct bufobj *bo, struct buf *bp)
+{
+ struct buf *nbp;
+
+ if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
+ (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
+ altbufferflushes++;
+ } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
+ BO_LOCK(bo);
+ /*
+ * Try to find a buffer to flush.
+ */
+ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
+ if ((nbp->b_vflags & BV_BKGRDINPROG) ||
+ BUF_LOCK(nbp,
+ LK_EXCLUSIVE | LK_NOWAIT, NULL))
+ continue;
+ if (bp == nbp)
+ panic("bdwrite: found ourselves");
+ BO_UNLOCK(bo);
+ /* Don't call buf_countdeps() with the bo lock held. */
+ if (buf_countdeps(nbp, 0)) {
+ BO_LOCK(bo);
+ BUF_UNLOCK(nbp);
+ continue;
+ }
+ if (nbp->b_flags & B_CLUSTEROK) {
+ vfs_bio_awrite(nbp);
+ } else {
+ bremfree(nbp);
+ bawrite(nbp);
+ }
+ dirtybufferflushes++;
+ break;
+ }
+ if (nbp == NULL)
+ BO_UNLOCK(bo);
+ }
+}
+
+/*
+ * Delayed write. (Buffer is marked dirty). Do not bother writing
+ * anything if the buffer is marked invalid.
+ *
+ * Note that since the buffer must be completely valid, we can safely
+ * set B_CACHE. In fact, we have to set B_CACHE here rather than in
+ * biodone() in order to prevent getblk from writing the buffer
+ * out synchronously.
+ */
+void
+bdwrite(struct buf *bp)
+{
+ struct thread *td = curthread;
+ struct vnode *vp;
+ struct bufobj *bo;
+
+ CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
+ KASSERT((bp->b_flags & B_BARRIER) == 0,
+ ("Barrier request in delayed write %p", bp));
+ BUF_ASSERT_HELD(bp);
+
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return;
+ }
+
+ /*
+ * If we have too many dirty buffers, don't create any more.
+ * If we are wildly over our limit, then force a complete
+ * cleanup. Otherwise, just keep the situation from getting
+ * out of control. Note that we have to avoid a recursive
+ * disaster and not try to clean up after our own cleanup!
+ */
+ vp = bp->b_vp;
+ bo = bp->b_bufobj;
+ if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
+ td->td_pflags |= TDP_INBDFLUSH;
+ BO_BDFLUSH(bo, bp);
+ td->td_pflags &= ~TDP_INBDFLUSH;
+ } else
+ recursiveflushes++;
+
+ bdirty(bp);
+ /*
+ * Set B_CACHE, indicating that the buffer is fully valid. This is
+ * true even of NFS now.
+ */
+ bp->b_flags |= B_CACHE;
+
+ /*
+ * This bmap keeps the system from needing to do the bmap later,
+ * perhaps when the system is attempting to do a sync. Since the
+ * indirect block -- or whatever other data structure the filesystem
+ * needs -- is likely still in memory now, this is a good time to do
+ * the bmap. Note also that if the pageout daemon is requesting a
+ * sync, there might not be enough memory to do the bmap then, so
+ * doing it now is important.
+ */
+ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
+ VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
+ }
+
+ /*
+ * Set the *dirty* buffer range based upon the VM system dirty
+ * pages.
+ *
+ * Mark the buffer pages as clean. We need to do this here to
+ * satisfy the vnode_pager and the pageout daemon, so that they
+ * think that the pages have been "cleaned". Note that since
+ * the pages are in a delayed write buffer, the VFS layer will
+ * see that the pages get written out on the next sync, or
+ * perhaps the cluster will be completed.
+ */
+ vfs_clean_pages_dirty_buf(bp);
+ bqrelse(bp);
+
+ /*
+ * note: we cannot initiate I/O from a bdwrite even if we wanted to,
+ * due to the softdep code.
+ */
+}
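+
+/*
+ * A sketch of the usual delayed-write sequence (illustrative only;
+ * "lbn" and "bsize" are assumed caller state):
+ *
+ *	bp = getblk(vp, lbn, bsize, 0, 0, 0);
+ *	(modify bp->b_data)
+ *	bdwrite(bp);		(marks B_DELWRI, requeues and unlocks bp)
+ */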
+
+/*
+ * bdirty:
+ *
+ * Turn buffer into delayed write request. We must clear BIO_READ and
+ * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to
+ * itself to properly update it in the dirty/clean lists. We mark it
+ * B_DONE to ensure that any asynchronization of the buffer properly
+ * clears B_DONE ( else a panic will occur later ).
+ *
+ * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
+ * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty()
+ * should only be called if the buffer is known-good.
+ *
+ * Since the buffer is not on a queue, we do not update the numfreebuffers
+ * count.
+ *
+ * The buffer must be on QUEUE_NONE.
+ */
+void
+bdirty(struct buf *bp)
+{
+
+ CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
+ bp, bp->b_vp, bp->b_flags);
+ KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
+ KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
+ ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
+ BUF_ASSERT_HELD(bp);
+ bp->b_flags &= ~(B_RELBUF);
+ bp->b_iocmd = BIO_WRITE;
+
+ if ((bp->b_flags & B_DELWRI) == 0) {
+ bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
+ reassignbuf(bp);
+ bdirtyadd();
+ }
+}
+
+/*
+ * bundirty:
+ *
+ * Clear B_DELWRI for buffer.
+ *
+ * Since the buffer is not on a queue, we do not update the numfreebuffers
+ * count.
+ *
+ * The buffer must be on QUEUE_NONE.
+ */
+
+void
+bundirty(struct buf *bp)
+{
+
+ CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
+ KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
+ ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
+ BUF_ASSERT_HELD(bp);
+
+ if (bp->b_flags & B_DELWRI) {
+ bp->b_flags &= ~B_DELWRI;
+ reassignbuf(bp);
+ bdirtysub();
+ }
+ /*
+ * Since it is now being written, we can clear its deferred write flag.
+ */
+ bp->b_flags &= ~B_DEFERRED;
+}
+
+/*
+ * bawrite:
+ *
+ * Asynchronous write. Start output on a buffer, but do not wait for
+ * it to complete. The buffer is released when the output completes.
+ *
+ * bwrite() ( or the VOP routine anyway ) is responsible for handling
+ * B_INVAL buffers. Not us.
+ */
+void
+bawrite(struct buf *bp)
+{
+
+ bp->b_flags |= B_ASYNC;
+ (void) bwrite(bp);
+}
+
+/*
+ * babarrierwrite:
+ *
+ * Asynchronous barrier write. Start output on a buffer, but do not
+ * wait for it to complete. Place a write barrier after this write so
+ * that this buffer and all buffers written before it are committed to
+ * the disk before any buffers written after this write are committed
+ * to the disk. The buffer is released when the output completes.
+ */
+void
+babarrierwrite(struct buf *bp)
+{
+
+ bp->b_flags |= B_ASYNC | B_BARRIER;
+ (void) bwrite(bp);
+}
+
+/*
+ * bbarrierwrite:
+ *
+ * Synchronous barrier write. Start output on a buffer and wait for
+ * it to complete. Place a write barrier after this write so that
+ * this buffer and all buffers written before it are committed to
+ * the disk before any buffers written after this write are committed
+ * to the disk. The buffer is released when the output completes.
+ */
+int
+bbarrierwrite(struct buf *bp)
+{
+
+ bp->b_flags |= B_BARRIER;
+ return (bwrite(bp));
+}
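+
+/*
+ * The three wrappers above differ only in the flags they add before
+ * handing the buffer to bwrite(); a compact comparison (illustrative):
+ *
+ *	bawrite(bp);			B_ASYNC
+ *	babarrierwrite(bp);		B_ASYNC | B_BARRIER
+ *	error = bbarrierwrite(bp);	B_BARRIER, waits for completion
+ */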
+
+/*
+ * bwillwrite:
+ *
+ * Called prior to the locking of any vnodes when we are expecting to
+ * write. We do not want to starve the buffer cache with too many
+ * dirty buffers so we block here. By blocking prior to the locking
+ * of any vnodes we attempt to avoid the situation where a locked vnode
+ * prevents the various system daemons from flushing related buffers.
+ */
+void
+bwillwrite(void)
+{
+
+ if (numdirtybuffers >= hidirtybuffers) {
+ mtx_lock(&bdirtylock);
+ while (numdirtybuffers >= hidirtybuffers) {
+ bdirtywait = 1;
+ msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
+ "flswai", 0);
+ }
+ mtx_unlock(&bdirtylock);
+ }
+}
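+
+/*
+ * A sketch of where a write path is expected to call bwillwrite(),
+ * before any vnode locks are taken (illustrative only; the VOP_WRITE()
+ * arguments are assumed caller state):
+ *
+ *	bwillwrite();
+ *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ *	error = VOP_WRITE(vp, uio, ioflag, cred);
+ *	VOP_UNLOCK(vp, 0);
+ */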
+
+/*
+ * Return true if we have too many dirty buffers.
+ */
+int
+buf_dirty_count_severe(void)
+{
+
+ return (numdirtybuffers >= hidirtybuffers);
+}
+
+static __noinline int
+buf_vm_page_count_severe(void)
+{
+
+ KFAIL_POINT_CODE(DEBUG_FP, buf_pressure, return 1);
+
+ return vm_page_count_severe();
+}
+
+/*
+ * brelse:
+ *
+ * Release a busy buffer and, if requested, free its resources. The
+ * buffer will be stashed in the appropriate bufqueue[] allowing it
+ * to be accessed later as a cache entity or reused for other purposes.
+ */
+void
+brelse(struct buf *bp)
+{
+ int qindex;
+
+ CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
+ bp, bp->b_vp, bp->b_flags);
+ KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
+ ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
+
+ if (BUF_LOCKRECURSED(bp)) {
+ /*
+ * Do not process, in particular, do not handle the
+ * B_INVAL/B_RELBUF and do not release to free list.
+ */
+ BUF_UNLOCK(bp);
+ return;
+ }
+
+ if (bp->b_flags & B_MANAGED) {
+ bqrelse(bp);
+ return;
+ }
+
+ if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
+ bp->b_error == EIO && !(bp->b_flags & B_INVAL)) {
+ /*
+ * Failed write, redirty. Must clear BIO_ERROR to prevent
+ * pages from being scrapped. If the error is anything
+ * other than an I/O error (EIO), assume that retrying
+ * is futile.
+ */
+ bp->b_ioflags &= ~BIO_ERROR;
+ bdirty(bp);
+ } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
+ (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
+ /*
+ * Either a failed I/O or we were asked to free or not
+ * cache the buffer.
+ */
+ bp->b_flags |= B_INVAL;
+ if (!LIST_EMPTY(&bp->b_dep))
+ buf_deallocate(bp);
+ if (bp->b_flags & B_DELWRI)
+ bdirtysub();
+ bp->b_flags &= ~(B_DELWRI | B_CACHE);
+ if ((bp->b_flags & B_VMIO) == 0) {
+ if (bp->b_bufsize)
+ allocbuf(bp, 0);
+ if (bp->b_vp)
+ brelvp(bp);
+ }
+ }
+
+ /*
+ * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release()
+ * is called with B_DELWRI set, the underlying pages may wind up
+ * getting freed causing a previous write (bdwrite()) to get 'lost'
+ * because pages associated with a B_DELWRI bp are marked clean.
+ *
+ * We still allow the B_INVAL case to call vfs_vmio_release(), even
+ * if B_DELWRI is set.
+ *
+ * If B_DELWRI is not set we may have to set B_RELBUF if we are low
+ * on pages to return pages to the VM page queues.
+ */
+ if (bp->b_flags & B_DELWRI)
+ bp->b_flags &= ~B_RELBUF;
+ else if (buf_vm_page_count_severe()) {
+ /*
+ * BKGRDINPROG can only be set with the buf and bufobj
+ * locks both held. We tolerate a race to clear it here.
+ */
+ if (!(bp->b_vflags & BV_BKGRDINPROG))
+ bp->b_flags |= B_RELBUF;
+ }
+
+ /*
+ * VMIO buffer rundown. It is generally not necessary to keep a VMIO
+ * buffer constituted, not even NFS buffers now. Two flags affect this. If
+ * B_INVAL, the struct buf is invalidated but the VM object is kept
+ * around ( i.e. so it is trivial to reconstitute the buffer later ).
+ *
+ * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
+ * invalidated. BIO_ERROR cannot be set for a failed write unless the
+ * buffer is also B_INVAL because it hits the re-dirtying code above.
+ *
+ * Normally we can do this whether a buffer is B_DELWRI or not. If
+ * the buffer is an NFS buffer, it is tracking piecemeal writes or
+ * the commit state and we cannot afford to lose the buffer. If the
+ * buffer has a background write in progress, we need to keep it
+ * around to prevent it from being reconstituted and starting a second
+ * background write.
+ */
+ if ((bp->b_flags & B_VMIO)
+ && !(bp->b_vp->v_mount != NULL &&
+ (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
+ !vn_isdisk(bp->b_vp, NULL) &&
+ (bp->b_flags & B_DELWRI))
+ ) {
+
+ int i, j, resid;
+ vm_page_t m;
+ off_t foff;
+ vm_pindex_t poff;
+ vm_object_t obj;
+
+ obj = bp->b_bufobj->bo_object;
+
+ /*
+ * Get the base offset and length of the buffer. Note that
+ * in the VMIO case if the buffer block size is not
+ * page-aligned then b_data pointer may not be page-aligned.
+ * But our b_pages[] array *IS* page aligned.
+ *
+ * block sizes less than DEV_BSIZE (usually 512) are not
+ * supported due to the page granularity bits (m->valid,
+ * m->dirty, etc...).
+ *
+ * See man buf(9) for more information
+ */
+ resid = bp->b_bufsize;
+ foff = bp->b_offset;
+ for (i = 0; i < bp->b_npages; i++) {
+ int had_bogus = 0;
+
+ m = bp->b_pages[i];
+
+ /*
+ * If we hit a bogus page, fixup *all* the bogus pages
+ * now.
+ */
+ if (m == bogus_page) {
+ poff = OFF_TO_IDX(bp->b_offset);
+ had_bogus = 1;
+
+ VM_OBJECT_RLOCK(obj);
+ for (j = i; j < bp->b_npages; j++) {
+ vm_page_t mtmp;
+ mtmp = bp->b_pages[j];
+ if (mtmp == bogus_page) {
+ mtmp = vm_page_lookup(obj, poff + j);
+ if (!mtmp) {
+ panic("brelse: page missing\n");
+ }
+ bp->b_pages[j] = mtmp;
+ }
+ }
+ VM_OBJECT_RUNLOCK(obj);
+
+ if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qenter(
+ trunc_page((vm_offset_t)bp->b_data),
+ bp->b_pages, bp->b_npages);
+ }
+ m = bp->b_pages[i];
+ }
+ if ((bp->b_flags & B_NOCACHE) ||
+ (bp->b_ioflags & BIO_ERROR &&
+ bp->b_iocmd == BIO_READ)) {
+ int poffset = foff & PAGE_MASK;
+ int presid = resid > (PAGE_SIZE - poffset) ?
+ (PAGE_SIZE - poffset) : resid;
+
+ KASSERT(presid >= 0, ("brelse: extra page"));
+ VM_OBJECT_WLOCK(obj);
+ while (vm_page_xbusied(m)) {
+ vm_page_lock(m);
+ VM_OBJECT_WUNLOCK(obj);
+ vm_page_busy_sleep(m, "mbncsh");
+ VM_OBJECT_WLOCK(obj);
+ }
+ if (pmap_page_wired_mappings(m) == 0)
+ vm_page_set_invalid(m, poffset, presid);
+ VM_OBJECT_WUNLOCK(obj);
+ if (had_bogus)
+ printf("avoided corruption bug in bogus_page/brelse code\n");
+ }
+ resid -= PAGE_SIZE - (foff & PAGE_MASK);
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ }
+ if (bp->b_flags & (B_INVAL | B_RELBUF))
+ vfs_vmio_release(bp);
+
+ } else if (bp->b_flags & B_VMIO) {
+
+ if (bp->b_flags & (B_INVAL | B_RELBUF)) {
+ vfs_vmio_release(bp);
+ }
+
+ } else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) {
+ if (bp->b_bufsize != 0)
+ allocbuf(bp, 0);
+ if (bp->b_vp != NULL)
+ brelvp(bp);
+ }
+
+ /*
+ * If the buffer has junk contents, signal it and eventually
+ * clean up B_DELWRI and disassociate the vnode so that gbincore()
+ * doesn't find it.
+ */
+ if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
+ (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
+ bp->b_flags |= B_INVAL;
+ if (bp->b_flags & B_INVAL) {
+ if (bp->b_flags & B_DELWRI)
+ bundirty(bp);
+ if (bp->b_vp)
+ brelvp(bp);
+ }
+
+ /* buffers with no memory */
+ if (bp->b_bufsize == 0) {
+ bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
+ if (bp->b_vflags & BV_BKGRDINPROG)
+ panic("losing buffer 1");
+ if (bp->b_kvasize)
+ qindex = QUEUE_EMPTYKVA;
+ else
+ qindex = QUEUE_EMPTY;
+ bp->b_flags |= B_AGE;
+ /* buffers with junk contents */
+ } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
+ (bp->b_ioflags & BIO_ERROR)) {
+ bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
+ if (bp->b_vflags & BV_BKGRDINPROG)
+ panic("losing buffer 2");
+ qindex = QUEUE_CLEAN;
+ bp->b_flags |= B_AGE;
+ /* remaining buffers */
+ } else if (bp->b_flags & B_DELWRI)
+ qindex = QUEUE_DIRTY;
+ else
+ qindex = QUEUE_CLEAN;
+
+ binsfree(bp, qindex);
+
+ bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
+ if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
+ panic("brelse: not dirty");
+ /* unlock */
+ BUF_UNLOCK(bp);
+}
+
+/*
+ * Release a buffer back to the appropriate queue but do not try to free
+ * it. The buffer is expected to be used again soon.
+ *
+ * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
+ * biodone() to requeue an async I/O on completion. It is also used when
+ * known good buffers need to be requeued but we think we may need the data
+ * again soon.
+ *
+ * XXX we should be able to leave the B_RELBUF hint set on completion.
+ */
+void
+bqrelse(struct buf *bp)
+{
+ int qindex;
+
+ CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
+ ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
+
+ if (BUF_LOCKRECURSED(bp)) {
+ /* do not release to free list */
+ BUF_UNLOCK(bp);
+ return;
+ }
+ bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+
+ if (bp->b_flags & B_MANAGED) {
+ if (bp->b_flags & B_REMFREE)
+ bremfreef(bp);
+ goto out;
+ }
+
+ /* buffers with stale but valid contents */
+ if (bp->b_flags & B_DELWRI) {
+ qindex = QUEUE_DIRTY;
+ } else {
+ if ((bp->b_flags & B_DELWRI) == 0 &&
+ (bp->b_xflags & BX_VNDIRTY))
+ panic("bqrelse: not dirty");
+ /*
+ * BKGRDINPROG can only be set with the buf and bufobj
+ * locks both held. We tolerate a race to clear it here.
+ */
+ if (buf_vm_page_count_severe() &&
+ (bp->b_vflags & BV_BKGRDINPROG) == 0) {
+ /*
+ * We are too low on memory, we have to try to free
+ * the buffer (most importantly: the wired pages
+ * making up its backing store) *now*.
+ */
+ brelse(bp);
+ return;
+ }
+ qindex = QUEUE_CLEAN;
+ }
+ binsfree(bp, qindex);
+
+out:
+ /* unlock */
+ BUF_UNLOCK(bp);
+}
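+
+/*
+ * A sketch of choosing between the two release paths (illustrative;
+ * "keep" is a placeholder for the caller's own policy): keep the
+ * cached contents with bqrelse(), or discard them by marking the
+ * buffer B_INVAL before brelse():
+ *
+ *	if (keep)
+ *		bqrelse(bp);
+ *	else {
+ *		bp->b_flags |= B_INVAL;
+ *		brelse(bp);
+ *	}
+ */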
+
+/* Give pages used by the bp back to the VM system (where possible) */
+static void
+vfs_vmio_release(struct buf *bp)
+{
+ int i;
+ vm_page_t m;
+
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
+ } else
+ BUF_CHECK_UNMAPPED(bp);
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+ bp->b_pages[i] = NULL;
+ /*
+ * In order to keep page LRU ordering consistent, put
+ * everything on the inactive queue.
+ */
+ vm_page_lock(m);
+ vm_page_unwire(m, 0);
+
+ /*
+ * Might as well free the page if we can and it has
+ * no valid data. We also free the page if the
+ * buffer was used for direct I/O.
+ */
+ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid) {
+ if (m->wire_count == 0 && !vm_page_busied(m))
+ vm_page_free(m);
+ } else if (bp->b_flags & B_DIRECT)
+ vm_page_try_to_free(m);
+ else if (buf_vm_page_count_severe())
+ vm_page_try_to_cache(m);
+ vm_page_unlock(m);
+ }
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+
+ if (bp->b_bufsize) {
+ bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
+ bp->b_npages = 0;
+ bp->b_flags &= ~B_VMIO;
+ if (bp->b_vp)
+ brelvp(bp);
+}
+
+/*
+ * Check to see if a block at a particular lbn is available for a clustered
+ * write.
+ */
+static int
+vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
+{
+ struct buf *bpa;
+ int match;
+
+ match = 0;
+
+ /* If the buf isn't in core skip it */
+ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
+ return (0);
+
+ /* If the buf is busy we don't want to wait for it */
+ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+ return (0);
+
+ /* Only cluster with valid clusterable delayed write buffers */
+ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
+ (B_DELWRI | B_CLUSTEROK))
+ goto done;
+
+ if (bpa->b_bufsize != size)
+ goto done;
+
+ /*
+ * Check to see if it is in the expected place on disk and that the
+ * block has been mapped.
+ */
+ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
+ match = 1;
+done:
+ BUF_UNLOCK(bpa);
+ return (match);
+}
+
+/*
+ * vfs_bio_awrite:
+ *
+ * Implement clustered async writes for clearing out B_DELWRI buffers.
+ * This is much better than the old way of writing only one buffer at
+ * a time. Note that we may not be presented with the buffers in the
+ * correct order, so we search for the cluster in both directions.
+ */
+int
+vfs_bio_awrite(struct buf *bp)
+{
+ struct bufobj *bo;
+ int i;
+ int j;
+ daddr_t lblkno = bp->b_lblkno;
+ struct vnode *vp = bp->b_vp;
+ int ncl;
+ int nwritten;
+ int size;
+ int maxcl;
+ int gbflags;
+
+ bo = &vp->v_bufobj;
+ gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
+ /*
+ * Right now we support clustered writing only to regular files. If
+ * we find a clusterable block we could be in the middle of a cluster
+ * rather than at the beginning.
+ */
+ if ((vp->v_type == VREG) &&
+ (vp->v_mount != 0) && /* Only on nodes that have the size info */
+ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
+
+ size = vp->v_mount->mnt_stat.f_iosize;
+ maxcl = MAXPHYS / size;
+
+ BO_RLOCK(bo);
+ for (i = 1; i < maxcl; i++)
+ if (vfs_bio_clcheck(vp, size, lblkno + i,
+ bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
+ break;
+
+ for (j = 1; i + j <= maxcl && j <= lblkno; j++)
+ if (vfs_bio_clcheck(vp, size, lblkno - j,
+ bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
+ break;
+ BO_RUNLOCK(bo);
+ --j;
+ ncl = i + j;
+ /*
+ * this is a possible cluster write
+ */
+ if (ncl != 1) {
+ BUF_UNLOCK(bp);
+ nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
+ gbflags);
+ return (nwritten);
+ }
+ }
+ bremfree(bp);
+ bp->b_flags |= B_ASYNC;
+ /*
+ * default (old) behavior, writing out only one block
+ *
+ * XXX returns b_bufsize instead of b_bcount for nwritten?
+ */
+ nwritten = bp->b_bufsize;
+ (void) bwrite(bp);
+
+ return (nwritten);
+}
+
+static void
+setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags)
+{
+
+ KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
+ bp->b_kvasize == 0, ("call bfreekva(%p)", bp));
+ if ((gbflags & GB_UNMAPPED) == 0) {
+ bp->b_kvabase = (caddr_t)addr;
+ } else if ((gbflags & GB_KVAALLOC) != 0) {
+ KASSERT((gbflags & GB_UNMAPPED) != 0,
+ ("GB_KVAALLOC without GB_UNMAPPED"));
+ bp->b_kvaalloc = (caddr_t)addr;
+ bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
+ atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
+ }
+ bp->b_kvasize = maxsize;
+}
+
+/*
+ * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if
+ * needed.
+ */
+static int
+allocbufkva(struct buf *bp, int maxsize, int gbflags)
+{
+ vm_offset_t addr;
+
+ bfreekva(bp);
+ addr = 0;
+
+ if (vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr)) {
+ /*
+ * Buffer map is too fragmented. Request the caller
+ * to defragment the map.
+ */
+ atomic_add_int(&bufdefragcnt, 1);
+ return (1);
+ }
+ setbufkva(bp, addr, maxsize, gbflags);
+ atomic_add_long(&bufspace, bp->b_kvasize);
+ return (0);
+}
+
+/*
+ * Ask the bufdaemon for help, or act as bufdaemon itself, when a
+ * locked vnode is supplied.
+ */
+static void
+getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
+ int defrag)
+{
+ struct thread *td;
+ char *waitmsg;
+ int fl, flags, norunbuf;
+
+ mtx_assert(&bqclean, MA_OWNED);
+
+ if (defrag) {
+ flags = VFS_BIO_NEED_BUFSPACE;
+ waitmsg = "nbufkv";
+ } else if (bufspace >= hibufspace) {
+ waitmsg = "nbufbs";
+ flags = VFS_BIO_NEED_BUFSPACE;
+ } else {
+ waitmsg = "newbuf";
+ flags = VFS_BIO_NEED_ANY;
+ }
+ mtx_lock(&nblock);
+ needsbuffer |= flags;
+ mtx_unlock(&nblock);
+ mtx_unlock(&bqclean);
+
+ bd_speedup(); /* heeeelp */
+ if ((gbflags & GB_NOWAIT_BD) != 0)
+ return;
+
+ td = curthread;
+ mtx_lock(&nblock);
+ while (needsbuffer & flags) {
+ if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
+ mtx_unlock(&nblock);
+ /*
+ * getblk() is called with the vnode locked, and
+ * a majority of the dirty buffers may well
+ * belong to that vnode. Flushing those
+ * buffers here makes progress that the
+ * buf_daemon, which cannot lock the vnode,
+ * cannot achieve.
+ */
+ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+ (td->td_pflags & TDP_NORUNNINGBUF);
+ /* play bufdaemon */
+ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
+ fl = buf_flush(vp, flushbufqtarget);
+ td->td_pflags &= norunbuf;
+ mtx_lock(&nblock);
+ if (fl != 0)
+ continue;
+ if ((needsbuffer & flags) == 0)
+ break;
+ }
+ if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag,
+ waitmsg, slptimeo))
+ break;
+ }
+ mtx_unlock(&nblock);
+}
+
+static void
+getnewbuf_reuse_bp(struct buf *bp, int qindex)
+{
+
+ CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
+ "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
+ bp->b_kvasize, bp->b_bufsize, qindex);
+ mtx_assert(&bqclean, MA_NOTOWNED);
+
+ /*
+ * Note: we no longer distinguish between VMIO and non-VMIO
+ * buffers.
+ */
+ KASSERT((bp->b_flags & B_DELWRI) == 0,
+ ("delwri buffer %p found in queue %d", bp, qindex));
+
+ if (qindex == QUEUE_CLEAN) {
+ if (bp->b_flags & B_VMIO) {
+ bp->b_flags &= ~B_ASYNC;
+ vfs_vmio_release(bp);
+ }
+ if (bp->b_vp != NULL)
+ brelvp(bp);
+ }
+
+ /*
+ * Get the rest of the buffer freed up. b_kva* is still valid
+ * after this operation.
+ */
+
+ if (bp->b_rcred != NOCRED) {
+ crfree(bp->b_rcred);
+ bp->b_rcred = NOCRED;
+ }
+ if (bp->b_wcred != NOCRED) {
+ crfree(bp->b_wcred);
+ bp->b_wcred = NOCRED;
+ }
+ if (!LIST_EMPTY(&bp->b_dep))
+ buf_deallocate(bp);
+ if (bp->b_vflags & BV_BKGRDINPROG)
+ panic("losing buffer 3");
+ KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d",
+ bp, bp->b_vp, qindex));
+ KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
+ ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
+
+ if (bp->b_bufsize)
+ allocbuf(bp, 0);
+
+ bp->b_flags &= B_UNMAPPED | B_KVAALLOC;
+ bp->b_ioflags = 0;
+ bp->b_xflags = 0;
+ KASSERT((bp->b_flags & B_INFREECNT) == 0,
+ ("buf %p still counted as free?", bp));
+ bp->b_vflags = 0;
+ bp->b_vp = NULL;
+ bp->b_blkno = bp->b_lblkno = 0;
+ bp->b_offset = NOOFFSET;
+ bp->b_iodone = 0;
+ bp->b_error = 0;
+ bp->b_resid = 0;
+ bp->b_bcount = 0;
+ bp->b_npages = 0;
+ bp->b_dirtyoff = bp->b_dirtyend = 0;
+ bp->b_bufobj = NULL;
+ bp->b_pin_count = 0;
+ bp->b_fsprivate1 = NULL;
+ bp->b_fsprivate2 = NULL;
+ bp->b_fsprivate3 = NULL;
+
+ LIST_INIT(&bp->b_dep);
+}
+
+static int flushingbufs;
+
+static struct buf *
+getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
+{
+ struct buf *bp, *nbp;
+ int nqindex, qindex, pass;
+
+ KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));
+
+ pass = 1;
+restart:
+ atomic_add_int(&getnewbufrestarts, 1);
+
+ /*
+ * Set up for the scan. If we do not have enough free buffers,
+ * we set up a degenerate case that immediately fails. Note
+ * that if we are a specially marked process, we are allowed to
+ * dip into our reserves.
+ *
+ * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
+ * for the allocation of a mapped buffer. For an unmapped buffer,
+ * it is easiest to start with EMPTY outright.
+ *
+ * We start with EMPTYKVA. If the list is empty we back up to EMPTY.
+ * However, there are a number of cases (defragging, reusing, ...)
+ * where we cannot back up.
+ */
+ nbp = NULL;
+ mtx_lock(&bqclean);
+ if (!defrag && unmapped) {
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+ }
+ if (nbp == NULL) {
+ nqindex = QUEUE_EMPTYKVA;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
+ }
+
+ /*
+ * If no EMPTYKVA buffers and we are either defragging or
+ * reusing, locate a CLEAN buffer to free or reuse. If
+ * bufspace usage is low, skip this step so we can allocate a
+ * new buffer.
+ */
+ if (nbp == NULL && (defrag || bufspace >= lobufspace)) {
+ nqindex = QUEUE_CLEAN;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+ }
+
+ /*
+ * If we could not find or were not allowed to reuse a CLEAN
+ * buffer, check to see if it is ok to use an EMPTY buffer.
+ * We can only use an EMPTY buffer if allocating its KVA would
+ * not otherwise run us out of buffer space. No KVA is needed
+ * for the unmapped allocation.
+ */
+ if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace ||
+ metadata)) {
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+ }
+
+ /*
+ * All available buffers might be clean, retry ignoring the
+ * lobufspace as the last resort.
+ */
+ if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) {
+ nqindex = QUEUE_CLEAN;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+ }
+
+ /*
+ * Run scan, possibly freeing data and/or kva mappings on the fly
+ * depending.
+ */
+ while ((bp = nbp) != NULL) {
+ qindex = nqindex;
+
+ /*
+ * Calculate next bp (we can only use it if we do not
+ * block or do other fancy things).
+ */
+ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
+ switch (qindex) {
+ case QUEUE_EMPTY:
+ nqindex = QUEUE_EMPTYKVA;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
+ if (nbp != NULL)
+ break;
+ /* FALLTHROUGH */
+ case QUEUE_EMPTYKVA:
+ nqindex = QUEUE_CLEAN;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+ if (nbp != NULL)
+ break;
+ /* FALLTHROUGH */
+ case QUEUE_CLEAN:
+ if (metadata && pass == 1) {
+ pass = 2;
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(
+ &bufqueues[QUEUE_EMPTY]);
+ }
+ /*
+ * nbp is NULL.
+ */
+ break;
+ }
+ }
+ /*
+ * If we are defragging then we need a buffer with
+ * b_kvasize != 0. XXX this situation should no longer
+ * occur, if defrag is non-zero the buffer's b_kvasize
+ * should also be non-zero at this point. XXX
+ */
+ if (defrag && bp->b_kvasize == 0) {
+ printf("Warning: defrag empty buffer %p\n", bp);
+ continue;
+ }
+
+ /*
+ * Start freeing the bp. This is somewhat involved. nbp
+ * remains valid only for QUEUE_EMPTY[KVA] bp's.
+ */
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+ continue;
+ /*
+ * BKGRDINPROG can only be set with the buf and bufobj
+ * locks both held. We tolerate a race to clear it here.
+ */
+ if (bp->b_vflags & BV_BKGRDINPROG) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+
+ KASSERT(bp->b_qindex == qindex,
+ ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
+
+ bremfreel(bp);
+ mtx_unlock(&bqclean);
+ /*
+ * NOTE: nbp is now entirely invalid. We can only restart
+ * the scan from this point on.
+ */
+
+ getnewbuf_reuse_bp(bp, qindex);
+ mtx_assert(&bqclean, MA_NOTOWNED);
+
+ /*
+ * If we are defragging then free the buffer.
+ */
+ if (defrag) {
+ bp->b_flags |= B_INVAL;
+ bfreekva(bp);
+ brelse(bp);
+ defrag = 0;
+ goto restart;
+ }
+
+ /*
+ * Notify any waiters for the buffer lock about
+ * identity change by freeing the buffer.
+ */
+ if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) {
+ bp->b_flags |= B_INVAL;
+ bfreekva(bp);
+ brelse(bp);
+ goto restart;
+ }
+
+ if (metadata)
+ break;
+
+ /*
+ * If we are overcommitted then recover the buffer and its
+ * KVM space. This occurs in rare situations when multiple
+ * processes are blocked in getnewbuf() or allocbuf().
+ */
+ if (bufspace >= hibufspace)
+ flushingbufs = 1;
+ if (flushingbufs && bp->b_kvasize != 0) {
+ bp->b_flags |= B_INVAL;
+ bfreekva(bp);
+ brelse(bp);
+ goto restart;
+ }
+ if (bufspace < lobufspace)
+ flushingbufs = 0;
+ break;
+ }
+ return (bp);
+}
+
+/*
+ * getnewbuf:
+ *
+ * Find and initialize a new buffer header, freeing up existing buffers
+ * in the bufqueues as necessary. The new buffer is returned locked.
+ *
+ * Important: B_INVAL is not set. If the caller wishes to throw the
+ * buffer away, the caller must set B_INVAL prior to calling brelse().
+ *
+ * We block if:
+ * We have insufficient buffer headers
+ * We have insufficient buffer space
+ * buffer_arena is too fragmented ( space reservation fails )
+ * If we have to flush dirty buffers ( but we try to avoid this )
+ */
+static struct buf *
+getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
+ int gbflags)
+{
+ struct buf *bp;
+ int defrag, metadata;
+
+ KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
+ ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
+ if (!unmapped_buf_allowed)
+ gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
+
+ defrag = 0;
+ if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
+ vp->v_type == VCHR)
+ metadata = 1;
+ else
+ metadata = 0;
+ /*
+ * We can't afford to block since we might be holding a vnode lock,
+ * which may prevent system daemons from running. We deal with
+ * low-memory situations by proactively returning memory and running
+ * async I/O rather than sync I/O.
+ */
+ atomic_add_int(&getnewbufcalls, 1);
+ atomic_subtract_int(&getnewbufrestarts, 1);
+restart:
+ bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
+ GB_KVAALLOC)) == GB_UNMAPPED, metadata);
+ if (bp != NULL)
+ defrag = 0;
+
+ /*
+ * If we exhausted our list, sleep as appropriate. We may have to
+ * wake up various daemons and write out some dirty buffers.
+ *
+ * Generally we are sleeping due to insufficient buffer space.
+ */
+ if (bp == NULL) {
+ mtx_assert(&bqclean, MA_OWNED);
+ getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
+ mtx_assert(&bqclean, MA_NOTOWNED);
+ } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
+ mtx_assert(&bqclean, MA_NOTOWNED);
+
+ bfreekva(bp);
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_kvabase = bp->b_data = unmapped_buf;
+ bp->b_kvasize = maxsize;
+ atomic_add_long(&bufspace, bp->b_kvasize);
+ atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
+ atomic_add_int(&bufreusecnt, 1);
+ } else {
+ mtx_assert(&bqclean, MA_NOTOWNED);
+
+ /*
+ * We finally have a valid bp. We aren't quite out of the
+ * woods, we still have to reserve kva space. In order
+ * to keep fragmentation sane we only allocate kva in
+ * BKVASIZE chunks.
+ */
+ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
+
+ if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED |
+ B_KVAALLOC)) == B_UNMAPPED) {
+ if (allocbufkva(bp, maxsize, gbflags)) {
+ defrag = 1;
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ goto restart;
+ }
+ atomic_add_int(&bufreusecnt, 1);
+ } else if ((bp->b_flags & B_KVAALLOC) != 0 &&
+ (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) {
+ /*
+ * If the reused buffer has KVA allocated,
+ * reassign b_kvaalloc to b_kvabase.
+ */
+ bp->b_kvabase = bp->b_kvaalloc;
+ bp->b_flags &= ~B_KVAALLOC;
+ atomic_subtract_long(&unmapped_bufspace,
+ bp->b_kvasize);
+ atomic_add_int(&bufreusecnt, 1);
+ } else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
+ (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED |
+ GB_KVAALLOC)) {
+ /*
+ * The reused buffer already has its KVA
+ * mapped, but the request is for an unmapped
+ * buffer with KVA allocated.
+ */
+ bp->b_kvaalloc = bp->b_kvabase;
+ bp->b_data = bp->b_kvabase = unmapped_buf;
+ bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
+ atomic_add_long(&unmapped_bufspace,
+ bp->b_kvasize);
+ atomic_add_int(&bufreusecnt, 1);
+ }
+ if ((gbflags & GB_UNMAPPED) == 0) {
+ bp->b_saveaddr = bp->b_kvabase;
+ bp->b_data = bp->b_saveaddr;
+ bp->b_flags &= ~B_UNMAPPED;
+ BUF_CHECK_MAPPED(bp);
+ }
+ }
+ return (bp);
+}
+
+/*
+ * buf_daemon:
+ *
+ * buffer flushing daemon. Buffers are normally flushed by the
+ * update daemon but if it cannot keep up this process starts to
+ * take the load in an attempt to prevent getnewbuf() from blocking.
+ */
+
+static struct kproc_desc buf_kp = {
+ "bufdaemon",
+ buf_daemon,
+ &bufdaemonproc
+};
+SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
+
+static int
+buf_flush(struct vnode *vp, int target)
+{
+ int flushed;
+
+ flushed = flushbufqueues(vp, target, 0);
+ if (flushed == 0) {
+ /*
+ * Could not find any buffers without rollback
+ * dependencies, so just write the first one
+ * in the hopes of eventually making progress.
+ */
+ if (vp != NULL && target > 2)
+ target /= 2;
+ flushbufqueues(vp, target, 1);
+ }
+ return (flushed);
+}
+
+static void
+buf_daemon(void)
+{
+ int lodirty;
+
+ /*
+ * This process needs to be suspended prior to shutdown sync.
+ */
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
+ SHUTDOWN_PRI_LAST);
+
+ /*
+ * This process is allowed to take the buffer cache to the limit
+ */
+ curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
+ mtx_lock(&bdlock);
+ for (;;) {
+ bd_request = 0;
+ mtx_unlock(&bdlock);
+
+ kproc_suspend_check(bufdaemonproc);
+ lodirty = lodirtybuffers;
+ if (bd_speedupreq) {
+ lodirty = numdirtybuffers / 2;
+ bd_speedupreq = 0;
+ }
+ /*
+ * Do the flush. Limit the amount of in-transit I/O we
+ * allow to build up, otherwise we would completely saturate
+ * the I/O system.
+ */
+ while (numdirtybuffers > lodirty) {
+ if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
+ break;
+ kern_yield(PRI_USER);
+ }
+
+ /*
+ * Only clear bd_request if we have reached our low water
+ * mark. The buf_daemon normally waits 1 second and
+ * then incrementally flushes any dirty buffers that have
+ * built up, within reason.
+ *
+ * If we were unable to hit our low water mark and couldn't
+ * find any flushable buffers, we sleep for a short period
+ * to avoid endless loops on unlockable buffers.
+ */
+ mtx_lock(&bdlock);
+ if (numdirtybuffers <= lodirtybuffers) {
+ /*
+ * We reached our low water mark, reset the
+ * request and sleep until we are needed again.
+ * The sleep is just so the suspend code works.
+ */
+ bd_request = 0;
+ /*
+ * Do an extra wakeup in case dirty threshold
+ * changed via sysctl and the explicit transition
+ * out of shortfall was missed.
+ */
+ bdirtywakeup();
+ if (runningbufspace <= lorunningspace)
+ runningwakeup();
+ msleep(&bd_request, &bdlock, PVM, "psleep", hz);
+ } else {
+ /*
+ * We couldn't find any flushable dirty buffers but
+ * still have too many dirty buffers, we
+ * have to sleep and try again. (rare)
+ */
+ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
+ }
+ }
+}
+
+/*
+ * flushbufqueues:
+ *
+ * Try to flush a buffer in the dirty queue. We must be careful to
+ * free up B_INVAL buffers instead of writing them, which NFS is
+ * particularly sensitive to.
+ */
+static int flushwithdeps = 0;
+SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
+ 0, "Number of buffers flushed with dependecies that require rollbacks");
+
+static int
+flushbufqueues(struct vnode *lvp, int target, int flushdeps)
+{
+ struct buf *sentinel;
+ struct vnode *vp;
+ struct mount *mp;
+ struct buf *bp;
+ int hasdeps;
+ int flushed;
+ int queue;
+
+ flushed = 0;
+ queue = QUEUE_DIRTY;
+ bp = NULL;
+ sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
+ sentinel->b_qindex = QUEUE_SENTINEL;
+ mtx_lock(&bqdirty);
+ TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
+ while (flushed != target) {
+ bp = TAILQ_NEXT(sentinel, b_freelist);
+ if (bp != NULL) {
+ TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
+ TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
+ b_freelist);
+ } else
+ break;
+ /*
+ * Skip sentinels inserted by other invocations of
+ * flushbufqueues(), taking care not to reorder them.
+ */
+ if (bp->b_qindex == QUEUE_SENTINEL)
+ continue;
+ /*
+ * Only flush the buffers that belong to the
+ * vnode locked by the curthread.
+ */
+ if (lvp != NULL && bp->b_vp != lvp)
+ continue;
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+ continue;
+ if (bp->b_pin_count > 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+ /*
+ * BKGRDINPROG can only be set with the buf and bufobj
+ * locks both held. We tolerate a race to clear it here.
+ */
+ if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
+ (bp->b_flags & B_DELWRI) == 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+ if (bp->b_flags & B_INVAL) {
+ bremfreel(bp);
+ mtx_unlock(&bqdirty);
+ brelse(bp);
+ flushed++;
+ mtx_lock(&bqdirty);
+ continue;
+ }
+
+ if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
+ if (flushdeps == 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+ hasdeps = 1;
+ } else
+ hasdeps = 0;
+ /*
+ * We must hold the lock on a vnode before writing
+ * one of its buffers. Otherwise we may confuse, or
+ * in the case of a snapshot vnode, deadlock the
+ * system.
+ *
+ * The lock order here is the reverse of the normal
+ * order of vnode lock followed by buf lock. This is
+ * ok because the NOWAIT will prevent deadlock.
+ */
+ vp = bp->b_vp;
+ if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+ if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE) == 0) {
+ mtx_unlock(&bqdirty);
+ CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
+ bp, bp->b_vp, bp->b_flags);
+ if (curproc == bufdaemonproc)
+ vfs_bio_awrite(bp);
+ else {
+ bremfree(bp);
+ bwrite(bp);
+ notbufdflushes++;
+ }
+ vn_finished_write(mp);
+ VOP_UNLOCK(vp, 0);
+ flushwithdeps += hasdeps;
+ flushed++;
+
+ /*
+ * Sleeping on runningbufspace while holding
+ * vnode lock leads to deadlock.
+ */
+ if (curproc == bufdaemonproc &&
+ runningbufspace > hirunningspace)
+ waitrunningbufspace();
+ mtx_lock(&bqdirty);
+ continue;
+ }
+ vn_finished_write(mp);
+ BUF_UNLOCK(bp);
+ }
+ TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
+ mtx_unlock(&bqdirty);
+ free(sentinel, M_TEMP);
+ return (flushed);
+}
+
+/*
+ * Check to see if a block is currently memory resident.
+ */
+struct buf *
+incore(struct bufobj *bo, daddr_t blkno)
+{
+ struct buf *bp;
+
+ BO_RLOCK(bo);
+ bp = gbincore(bo, blkno);
+ BO_RUNLOCK(bo);
+ return (bp);
+}
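+
+/*
+ * A sketch of the cheap existence check incore() provides
+ * (illustrative); the returned buffer is not locked, so the result is
+ * only a hint:
+ *
+ *	if (incore(&vp->v_bufobj, lbn) != NULL)
+ *		(the block is at least partially cached)
+ */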
+
+/*
+ * Returns true if no I/O is needed to access the
+ * associated VM object. This is like incore except
+ * it also hunts around in the VM system for the data.
+ */
+
+static int
+inmem(struct vnode * vp, daddr_t blkno)
+{
+ vm_object_t obj;
+ vm_offset_t toff, tinc, size;
+ vm_page_t m;
+ vm_ooffset_t off;
+
+ ASSERT_VOP_LOCKED(vp, "inmem");
+
+ if (incore(&vp->v_bufobj, blkno))
+ return 1;
+ if (vp->v_mount == NULL)
+ return 0;
+ obj = vp->v_object;
+ if (obj == NULL)
+ return (0);
+
+ size = PAGE_SIZE;
+ if (size > vp->v_mount->mnt_stat.f_iosize)
+ size = vp->v_mount->mnt_stat.f_iosize;
+ off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
+
+ VM_OBJECT_RLOCK(obj);
+ for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
+ m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
+ if (!m)
+ goto notinmem;
+ tinc = size;
+ if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
+ tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
+ if (vm_page_is_valid(m,
+ (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
+ goto notinmem;
+ }
+ VM_OBJECT_RUNLOCK(obj);
+ return 1;
+
+notinmem:
+ VM_OBJECT_RUNLOCK(obj);
+ return (0);
+}
+
+/*
+ * Set the dirty range for a buffer based on the status of the dirty
+ * bits in the pages comprising the buffer. The range is limited
+ * to the size of the buffer.
+ *
+ * Tell the VM system that the pages associated with this buffer
+ * are clean. This is used for delayed writes where the data is
+ * going to go to disk eventually without additional VM intervention.
+ *
+ * Note that while we only really need to clean through to b_bcount, we
+ * just go ahead and clean through to b_bufsize.
+ */
+static void
+vfs_clean_pages_dirty_buf(struct buf *bp)
+{
+ vm_ooffset_t foff, noff, eoff;
+ vm_page_t m;
+ int i;
+
+ if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
+ return;
+
+ foff = bp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("vfs_clean_pages_dirty_buf: no buffer offset"));
+
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ vfs_drain_busy_pages(bp);
+ vfs_setdirty_locked_object(bp);
+ for (i = 0; i < bp->b_npages; i++) {
+ noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ eoff = noff;
+ if (eoff > bp->b_offset + bp->b_bufsize)
+ eoff = bp->b_offset + bp->b_bufsize;
+ m = bp->b_pages[i];
+ vfs_page_set_validclean(bp, foff, m);
+ /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
+ foff = noff;
+ }
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+}
+
+static void
+vfs_setdirty_locked_object(struct buf *bp)
+{
+ vm_object_t object;
+ int i;
+
+ object = bp->b_bufobj->bo_object;
+ VM_OBJECT_ASSERT_WLOCKED(object);
+
+ /*
+ * We qualify the scan for modified pages on whether the
+ * object has been flushed yet.
+ */
+ if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
+ vm_offset_t boffset;
+ vm_offset_t eoffset;
+
+ /*
+ * test the pages to see if they have been modified directly
+ * by users through the VM system.
+ */
+ for (i = 0; i < bp->b_npages; i++)
+ vm_page_test_dirty(bp->b_pages[i]);
+
+ /*
+ * Calculate the encompassing dirty range, boffset and eoffset,
+ * (eoffset - boffset) bytes.
+ */
+
+ for (i = 0; i < bp->b_npages; i++) {
+ if (bp->b_pages[i]->dirty)
+ break;
+ }
+ boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
+
+ for (i = bp->b_npages - 1; i >= 0; --i) {
+ if (bp->b_pages[i]->dirty) {
+ break;
+ }
+ }
+ eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
+
+ /*
+ * Fit it to the buffer.
+ */
+
+ if (eoffset > bp->b_bcount)
+ eoffset = bp->b_bcount;
+
+ /*
+ * If we have a good dirty range, merge with the existing
+ * dirty range.
+ */
+
+ if (boffset < eoffset) {
+ if (bp->b_dirtyoff > boffset)
+ bp->b_dirtyoff = boffset;
+ if (bp->b_dirtyend < eoffset)
+ bp->b_dirtyend = eoffset;
+ }
+ }
+}
+
+/*
+ * Allocate the KVA mapping for an existing buffer. It handles both
+ * the case of a B_UNMAPPED buffer and that of a buffer whose KVA was
+ * preallocated but never mapped (B_KVAALLOC).
+ */
+static void
+bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
+{
+ struct buf *scratch_bp;
+ int bsize, maxsize, need_mapping, need_kva;
+ off_t offset;
+
+ need_mapping = (bp->b_flags & B_UNMAPPED) != 0 &&
+ (gbflags & GB_UNMAPPED) == 0;
+ need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED &&
+ (gbflags & GB_KVAALLOC) != 0;
+ if (!need_mapping && !need_kva)
+ return;
+
+ BUF_CHECK_UNMAPPED(bp);
+
+ if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) {
+ /*
+ * Buffer is not mapped, but the KVA was already
+ * reserved at the time of the instantiation. Use the
+ * allocated space.
+ */
+ bp->b_flags &= ~B_KVAALLOC;
+ KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0"));
+ bp->b_kvabase = bp->b_kvaalloc;
+ atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
+ goto has_addr;
+ }
+
+ /*
+ * Calculate the amount of the address space we would reserve
+ * if the buffer was mapped.
+ */
+ bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
+ offset = blkno * bsize;
+ maxsize = size + (offset & PAGE_MASK);
+ maxsize = imax(maxsize, bsize);
+
+mapping_loop:
+ if (allocbufkva(bp, maxsize, gbflags)) {
+ /*
+ * Request defragmentation. getnewbuf() returns us the
+ * allocated space by the scratch buffer KVA.
+ */
+ scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags |
+ (GB_UNMAPPED | GB_KVAALLOC));
+ if (scratch_bp == NULL) {
+ if ((gbflags & GB_NOWAIT_BD) != 0) {
+ /*
+ * XXXKIB: defragmentation cannot
+ * succeed, not sure what else to do.
+ */
+ panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp);
+ }
+ atomic_add_int(&mappingrestarts, 1);
+ goto mapping_loop;
+ }
+ KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0,
+ ("scratch bp !B_KVAALLOC %p", scratch_bp));
+ setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc,
+ scratch_bp->b_kvasize, gbflags);
+
+ /* Get rid of the scratch buffer. */
+ scratch_bp->b_kvasize = 0;
+ scratch_bp->b_flags |= B_INVAL;
+ scratch_bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
+ brelse(scratch_bp);
+ }
+ if (!need_mapping)
+ return;
+
+has_addr:
+ bp->b_saveaddr = bp->b_kvabase;
+ bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */
+ bp->b_flags &= ~B_UNMAPPED;
+ BUF_CHECK_MAPPED(bp);
+ bpmap_qenter(bp);
+}
+
+/*
+ * getblk:
+ *
+ * Get a block given a specified block and offset into a file/device.
+ * The buffers B_DONE bit will be cleared on return, making it almost
+ * ready for an I/O initiation. B_INVAL may or may not be set on
+ * return. The caller should clear B_INVAL prior to initiating a
+ * READ.
+ *
+ * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
+ * an existing buffer.
+ *
+ * For a VMIO buffer, B_CACHE is modified according to the backing VM.
+ * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
+ * and then cleared based on the backing VM. If the previous buffer is
+ * non-0-sized but invalid, B_CACHE will be cleared.
+ *
+ * If getblk() must create a new buffer, the new buffer is returned with
+ * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
+ * case it is returned with B_INVAL clear and B_CACHE set based on the
+ * backing VM.
+ *
+ * getblk() also forces a bwrite() for any B_DELWRI buffer whose
+ * B_CACHE bit is clear.
+ *
+ * What this means, basically, is that the caller should use B_CACHE to
+ * determine whether the buffer is fully valid or not and should clear
+ * B_INVAL prior to issuing a read. If the caller intends to validate
+ * the buffer by loading its data area with something, the caller needs
+ * to clear B_INVAL. If the caller does this without issuing an I/O,
+ * the caller should set B_CACHE ( as an optimization ), else the caller
+ * should issue the I/O and biodone() will set B_CACHE if the I/O was
+ * a write attempt or if it was a successful read. If the caller
+ * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
+ * prior to issuing the READ. biodone() will *not* clear B_INVAL.
+ */
+struct buf *
+getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
+ int flags)
+{
+ struct buf *bp;
+ struct bufobj *bo;
+ int bsize, error, maxsize, vmio;
+ off_t offset;
+
+ CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
+ KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
+ ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
+ ASSERT_VOP_LOCKED(vp, "getblk");
+ if (size > MAXBSIZE)
+ panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
+ if (!unmapped_buf_allowed)
+ flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
+
+ bo = &vp->v_bufobj;
+loop:
+ BO_RLOCK(bo);
+ bp = gbincore(bo, blkno);
+ if (bp != NULL) {
+ int lockflags;
+ /*
+ * Buffer is in-core. If the buffer is neither busy nor managed,
+ * it must be on a queue.
+ */
+ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
+
+ if (flags & GB_LOCK_NOWAIT)
+ lockflags |= LK_NOWAIT;
+
+ error = BUF_TIMELOCK(bp, lockflags,
+ BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
+
+ /*
+ * If we slept and got the lock we have to restart in case
+ * the buffer changed identities.
+ */
+ if (error == ENOLCK)
+ goto loop;
+ /* We timed out or were interrupted. */
+ else if (error)
+ return (NULL);
+ /* If recursed, assume caller knows the rules. */
+ else if (BUF_LOCKRECURSED(bp))
+ goto end;
+
+ /*
+ * The buffer is locked. B_CACHE is cleared if the buffer is
+ * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set
+ * and for a VMIO buffer B_CACHE is adjusted according to the
+ * backing VM cache.
+ */
+ if (bp->b_flags & B_INVAL)
+ bp->b_flags &= ~B_CACHE;
+ else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
+ bp->b_flags |= B_CACHE;
+ if (bp->b_flags & B_MANAGED)
+ MPASS(bp->b_qindex == QUEUE_NONE);
+ else
+ bremfree(bp);
+
+ /*
+ * Check for size inconsistencies for the non-VMIO case.
+ */
+ if (bp->b_bcount != size) {
+ if ((bp->b_flags & B_VMIO) == 0 ||
+ (size > bp->b_kvasize)) {
+ if (bp->b_flags & B_DELWRI) {
+ /*
+ * If the buffer is pinned and the caller does
+ * not want to sleep waiting for it to be
+ * unpinned, bail out.
+ */
+ if (bp->b_pin_count > 0) {
+ if (flags & GB_LOCK_NOWAIT) {
+ bqrelse(bp);
+ return (NULL);
+ } else {
+ bunpin_wait(bp);
+ }
+ }
+ bp->b_flags |= B_NOCACHE;
+ bwrite(bp);
+ } else {
+ if (LIST_EMPTY(&bp->b_dep)) {
+ bp->b_flags |= B_RELBUF;
+ brelse(bp);
+ } else {
+ bp->b_flags |= B_NOCACHE;
+ bwrite(bp);
+ }
+ }
+ goto loop;
+ }
+ }
+
+ /*
+ * Handle the case of unmapped buffer which should
+ * become mapped, or the buffer for which KVA
+ * reservation is requested.
+ */
+ bp_unmapped_get_kva(bp, blkno, size, flags);
+
+ /*
+ * If the size is inconsistent in the VMIO case, we can resize
+ * the buffer. This might lead to B_CACHE getting set or
+ * cleared. If the size has not changed, B_CACHE remains
+ * unchanged from its previous state.
+ */
+ if (bp->b_bcount != size)
+ allocbuf(bp, size);
+
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("getblk: no buffer offset"));
+
+ /*
+ * A buffer with B_DELWRI set and B_CACHE clear must
+ * be committed before we can return the buffer in
+ * order to prevent the caller from issuing a read
+ * ( due to B_CACHE not being set ) and overwriting
+ * it.
+ *
+ * Most callers, including NFS and FFS, need this to
+ * operate properly either because they assume they
+ * can issue a read if B_CACHE is not set, or because
+ * ( for example ) an uncached B_DELWRI might loop due
+ * to softupdates re-dirtying the buffer. In the latter
+ * case, B_CACHE is set after the first write completes,
+ * preventing further loops.
+ * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE
+ * above while extending the buffer, we cannot allow the
+ * buffer to remain with B_CACHE set after the write
+ * completes or it will represent a corrupt state. To
+ * deal with this we set B_NOCACHE to scrap the buffer
+ * after the write.
+ *
+ * We might be able to do something fancy, like setting
+ * B_CACHE in bwrite() except if B_DELWRI is already set,
+ * so the below call doesn't set B_CACHE, but that gets real
+ * confusing. This is much easier.
+ */
+
+ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
+ bp->b_flags |= B_NOCACHE;
+ bwrite(bp);
+ goto loop;
+ }
+ bp->b_flags &= ~B_DONE;
+ } else {
+ /*
+ * Buffer is not in-core, create new buffer. The buffer
+ * returned by getnewbuf() is locked. Note that the returned
+ * buffer is also considered valid (not marked B_INVAL).
+ */
+ BO_RUNLOCK(bo);
+ /*
+ * If the user does not want us to create the buffer, bail out
+ * here.
+ */
+ if (flags & GB_NOCREAT)
+ return NULL;
+ if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
+ return NULL;
+
+ bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
+ offset = blkno * bsize;
+ vmio = vp->v_object != NULL;
+ if (vmio) {
+ maxsize = size + (offset & PAGE_MASK);
+ } else {
+ maxsize = size;
+ /* Do not allow non-VMIO unmapped buffers. */
+ flags &= ~GB_UNMAPPED;
+ }
+ maxsize = imax(maxsize, bsize);
+
+ bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
+ if (bp == NULL) {
+ if (slpflag || slptimeo)
+ return NULL;
+ goto loop;
+ }
+
+ /*
+ * This code is used to make sure that a buffer is not
+ * created while the getnewbuf routine is blocked.
+ * This can be a problem whether the vnode is locked or not.
+ * If the buffer is created out from under us, we have to
+ * throw away the one we just created.
+ *
+ * Note: this must occur before we associate the buffer
+ * with the vp, especially considering limitations in
+ * the splay tree implementation when dealing with duplicate
+ * lblkno's.
+ */
+ BO_LOCK(bo);
+ if (gbincore(bo, blkno)) {
+ BO_UNLOCK(bo);
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ goto loop;
+ }
+
+ /*
+ * Insert the buffer into the hash, so that it can
+ * be found by incore.
+ */
+ bp->b_blkno = bp->b_lblkno = blkno;
+ bp->b_offset = offset;
+ bgetvp(vp, bp);
+ BO_UNLOCK(bo);
+
+ /*
+		 * Set the B_VMIO bit. allocbuf() will grow the buffer. Since
+		 * the buffer size starts out as 0, B_CACHE will be set by
+		 * allocbuf() for the VMIO case prior to it testing the
+		 * backing store for validity.
+ */
+
+ if (vmio) {
+ bp->b_flags |= B_VMIO;
+ KASSERT(vp->v_object == bp->b_bufobj->bo_object,
+ ("ARGH! different b_bufobj->bo_object %p %p %p\n",
+ bp, vp->v_object, bp->b_bufobj->bo_object));
+ } else {
+ bp->b_flags &= ~B_VMIO;
+ KASSERT(bp->b_bufobj->bo_object == NULL,
+ ("ARGH! has b_bufobj->bo_object %p %p\n",
+ bp, bp->b_bufobj->bo_object));
+ BUF_CHECK_MAPPED(bp);
+ }
+
+ allocbuf(bp, size);
+ bp->b_flags &= ~B_DONE;
+ }
+ CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
+ BUF_ASSERT_HELD(bp);
+end:
+ KASSERT(bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
+ return (bp);
+}
+
+/*
+ * Get an empty, disassociated buffer of given size. The buffer is initially
+ * set to B_INVAL.
+ */
+struct buf *
+geteblk(int size, int flags)
+{
+ struct buf *bp;
+ int maxsize;
+
+ maxsize = (size + BKVAMASK) & ~BKVAMASK;
+ while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) {
+ if ((flags & GB_NOWAIT_BD) &&
+ (curthread->td_pflags & TDP_BUFNEED) != 0)
+ return (NULL);
+ }
+ allocbuf(bp, size);
+ bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
+ BUF_ASSERT_HELD(bp);
+ return (bp);
+}
+
+
+/*
+ * This code constitutes the buffer memory from either anonymous system
+ * memory (in the case of non-VMIO operations) or from an associated
+ * VM object (in the case of VMIO operations). This code is able to
+ * resize a buffer up or down.
+ *
+ * Note that this code is tricky, and has many complications to resolve
+ * deadlock or inconsistent data situations. Tread lightly!!!
+ * There are B_CACHE and B_DELWRI interactions that must be dealt with by
+ * the caller. Calling this code willy nilly can result in the loss of data.
+ *
+ * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with
+ * B_CACHE for the non-VMIO case.
+ */
+
+int
+allocbuf(struct buf *bp, int size)
+{
+ int newbsize, mbsize;
+ int i;
+
+ BUF_ASSERT_HELD(bp);
+
+ if (bp->b_kvasize < size)
+ panic("allocbuf: buffer too small");
+
+ if ((bp->b_flags & B_VMIO) == 0) {
+ caddr_t origbuf;
+ int origbufsize;
+ /*
+ * Just get anonymous memory from the kernel. Don't
+ * mess with B_CACHE.
+ */
+ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+ if (bp->b_flags & B_MALLOC)
+ newbsize = mbsize;
+ else
+ newbsize = round_page(size);
+
+ if (newbsize < bp->b_bufsize) {
+ /*
+ * malloced buffers are not shrunk
+ */
+ if (bp->b_flags & B_MALLOC) {
+ if (newbsize) {
+ bp->b_bcount = size;
+ } else {
+ free(bp->b_data, M_BIOBUF);
+ if (bp->b_bufsize) {
+ atomic_subtract_long(
+ &bufmallocspace,
+ bp->b_bufsize);
+ bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
+ bp->b_saveaddr = bp->b_kvabase;
+ bp->b_data = bp->b_saveaddr;
+ bp->b_bcount = 0;
+ bp->b_flags &= ~B_MALLOC;
+ }
+ return 1;
+ }
+ vm_hold_free_pages(bp, newbsize);
+ } else if (newbsize > bp->b_bufsize) {
+ /*
+			 * We only use malloced memory on the first allocation,
+			 * and revert to page-allocated memory when the buffer
+			 * grows.
+ */
+ /*
+ * There is a potential smp race here that could lead
+ * to bufmallocspace slightly passing the max. It
+ * is probably extremely rare and not worth worrying
+ * over.
+ */
+ if ( (bufmallocspace < maxbufmallocspace) &&
+ (bp->b_bufsize == 0) &&
+ (mbsize <= PAGE_SIZE/2)) {
+
+ bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
+ bp->b_bufsize = mbsize;
+ bp->b_bcount = size;
+ bp->b_flags |= B_MALLOC;
+ atomic_add_long(&bufmallocspace, mbsize);
+ return 1;
+ }
+ origbuf = NULL;
+ origbufsize = 0;
+ /*
+ * If the buffer is growing on its other-than-first allocation,
+ * then we revert to the page-allocation scheme.
+ */
+ if (bp->b_flags & B_MALLOC) {
+ origbuf = bp->b_data;
+ origbufsize = bp->b_bufsize;
+ bp->b_data = bp->b_kvabase;
+ if (bp->b_bufsize) {
+ atomic_subtract_long(&bufmallocspace,
+ bp->b_bufsize);
+ bufspacewakeup();
+ bp->b_bufsize = 0;
+ }
+ bp->b_flags &= ~B_MALLOC;
+ newbsize = round_page(newbsize);
+ }
+ vm_hold_load_pages(
+ bp,
+ (vm_offset_t) bp->b_data + bp->b_bufsize,
+ (vm_offset_t) bp->b_data + newbsize);
+ if (origbuf) {
+ bcopy(origbuf, bp->b_data, origbufsize);
+ free(origbuf, M_BIOBUF);
+ }
+ }
+ } else {
+ int desiredpages;
+
+ newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+ desiredpages = (size == 0) ? 0 :
+ num_pages((bp->b_offset & PAGE_MASK) + newbsize);
+
+ if (bp->b_flags & B_MALLOC)
+ panic("allocbuf: VMIO buffer can't be malloced");
+ /*
+ * Set B_CACHE initially if buffer is 0 length or will become
+ * 0-length.
+ */
+ if (size == 0 || bp->b_bufsize == 0)
+ bp->b_flags |= B_CACHE;
+
+ if (newbsize < bp->b_bufsize) {
+ /*
+			 * DEV_BSIZE aligned new buffer size is less than the
+ * DEV_BSIZE aligned existing buffer size. Figure out
+ * if we have to remove any pages.
+ */
+ if (desiredpages < bp->b_npages) {
+ vm_page_t m;
+
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qremove((vm_offset_t)trunc_page(
+ (vm_offset_t)bp->b_data) +
+ (desiredpages << PAGE_SHIFT),
+ (bp->b_npages - desiredpages));
+ } else
+ BUF_CHECK_UNMAPPED(bp);
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ for (i = desiredpages; i < bp->b_npages; i++) {
+ /*
+ * the page is not freed here -- it
+ * is the responsibility of
+ * vnode_pager_setsize
+ */
+ m = bp->b_pages[i];
+ KASSERT(m != bogus_page,
+ ("allocbuf: bogus page found"));
+ while (vm_page_sleep_if_busy(m,
+ "biodep"))
+ continue;
+
+ bp->b_pages[i] = NULL;
+ vm_page_lock(m);
+ vm_page_unwire(m, 0);
+ vm_page_unlock(m);
+ }
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+ bp->b_npages = desiredpages;
+ }
+ } else if (size > bp->b_bcount) {
+ /*
+ * We are growing the buffer, possibly in a
+ * byte-granular fashion.
+ */
+ vm_object_t obj;
+ vm_offset_t toff;
+ vm_offset_t tinc;
+
+ /*
+ * Step 1, bring in the VM pages from the object,
+ * allocating them if necessary. We must clear
+ * B_CACHE if these pages are not valid for the
+ * range covered by the buffer.
+ */
+
+ obj = bp->b_bufobj->bo_object;
+
+ VM_OBJECT_WLOCK(obj);
+ while (bp->b_npages < desiredpages) {
+ vm_page_t m;
+
+ /*
+ * We must allocate system pages since blocking
+ * here could interfere with paging I/O, no
+ * matter which process we are.
+ *
+ * Only exclusive busy can be tested here.
+ * Blocking on shared busy might lead to
+ * deadlocks once allocbuf() is called after
+ * pages are vfs_busy_pages().
+ */
+ m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) +
+ bp->b_npages, VM_ALLOC_NOBUSY |
+ VM_ALLOC_SYSTEM | VM_ALLOC_WIRED |
+ VM_ALLOC_IGN_SBUSY |
+ VM_ALLOC_COUNT(desiredpages - bp->b_npages));
+ if (m->valid == 0)
+ bp->b_flags &= ~B_CACHE;
+ bp->b_pages[bp->b_npages] = m;
+ ++bp->b_npages;
+ }
+
+ /*
+ * Step 2. We've loaded the pages into the buffer,
+ * we have to figure out if we can still have B_CACHE
+ * set. Note that B_CACHE is set according to the
+			 * byte-granular range ( bcount and size ), not the
+ * aligned range ( newbsize ).
+ *
+ * The VM test is against m->valid, which is DEV_BSIZE
+ * aligned. Needless to say, the validity of the data
+ * needs to also be DEV_BSIZE aligned. Note that this
+ * fails with NFS if the server or some other client
+ * extends the file's EOF. If our buffer is resized,
+ * B_CACHE may remain set! XXX
+ */
+
+ toff = bp->b_bcount;
+ tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
+
+ while ((bp->b_flags & B_CACHE) && toff < size) {
+ vm_pindex_t pi;
+
+ if (tinc > (size - toff))
+ tinc = size - toff;
+
+ pi = ((bp->b_offset & PAGE_MASK) + toff) >>
+ PAGE_SHIFT;
+
+ vfs_buf_test_cache(
+ bp,
+ bp->b_offset,
+ toff,
+ tinc,
+ bp->b_pages[pi]
+ );
+ toff += tinc;
+ tinc = PAGE_SIZE;
+ }
+ VM_OBJECT_WUNLOCK(obj);
+
+ /*
+ * Step 3, fixup the KVM pmap.
+ */
+ if ((bp->b_flags & B_UNMAPPED) == 0)
+ bpmap_qenter(bp);
+ else
+ BUF_CHECK_UNMAPPED(bp);
+ }
+ }
+ if (newbsize < bp->b_bufsize)
+ bufspacewakeup();
+ bp->b_bufsize = newbsize; /* actual buffer allocation */
+ bp->b_bcount = size; /* requested buffer size */
+ return 1;
+}
+
+extern int inflight_transient_maps;
+
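+/*
+ * Complete a bio request: mark BIO_DONE under the per-bio pool mutex, invoke
+ * the completion callback if one is set (otherwise wake up a biowait()
+ * sleeper), and tear down any transient KVA mapping used for unmapped I/O.
+ */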
+void
+biodone(struct bio *bp)
+{
+ struct mtx *mtxp;
+ void (*done)(struct bio *);
+ vm_offset_t start, end;
+ int transient;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ bp->bio_flags |= BIO_DONE;
+ if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
+ start = trunc_page((vm_offset_t)bp->bio_data);
+ end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
+ transient = 1;
+ } else {
+ transient = 0;
+ start = end = 0;
+ }
+ done = bp->bio_done;
+ if (done == NULL)
+ wakeup(bp);
+ mtx_unlock(mtxp);
+ if (done != NULL)
+ done(bp);
+ if (transient) {
+ pmap_qremove(start, OFF_TO_IDX(end - start));
+ vmem_free(transient_arena, start, end - start);
+ atomic_add_int(&inflight_transient_maps, -1);
+ }
+}
+
+/*
+ * Wait for a BIO to finish.
+ *
+ * XXX: resort to a timeout for now. The optimal locking (if any) for this
+ * case is not yet clear.
+ */
+int
+biowait(struct bio *bp, const char *wchan)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ while ((bp->bio_flags & BIO_DONE) == 0)
+ msleep(bp, mtxp, PRIBIO, wchan, hz / 10);
+ mtx_unlock(mtxp);
+ if (bp->bio_error != 0)
+ return (bp->bio_error);
+ if (!(bp->bio_flags & BIO_ERROR))
+ return (0);
+ return (EIO);
+}
+
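+/*
+ * Record any error on the bio, finish the devstat transaction if a devstat
+ * was supplied, and complete the bio via biodone().
+ */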
+void
+biofinish(struct bio *bp, struct devstat *stat, int error)
+{
+
+ if (error) {
+ bp->bio_error = error;
+ bp->bio_flags |= BIO_ERROR;
+ }
+ if (stat != NULL)
+ devstat_end_transaction_bio(stat, bp);
+ biodone(bp);
+}
+
+/*
+ * bufwait:
+ *
+ * Wait for buffer I/O completion, returning error status. The buffer
+ * is left locked and B_DONE on return. B_EINTR is converted into an EINTR
+ * error and cleared.
+ */
+int
+bufwait(struct buf *bp)
+{
+ if (bp->b_iocmd == BIO_READ)
+ bwait(bp, PRIBIO, "biord");
+ else
+ bwait(bp, PRIBIO, "biowr");
+ if (bp->b_flags & B_EINTR) {
+ bp->b_flags &= ~B_EINTR;
+ return (EINTR);
+ }
+ if (bp->b_ioflags & BIO_ERROR) {
+ return (bp->b_error ? bp->b_error : EIO);
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * Call back function from struct bio back up to struct buf.
+ */
+static void
+bufdonebio(struct bio *bip)
+{
+ struct buf *bp;
+
+ bp = bip->bio_caller2;
+ bp->b_resid = bp->b_bcount - bip->bio_completed;
+ bp->b_resid = bip->bio_resid; /* XXX: remove */
+ bp->b_ioflags = bip->bio_flags;
+ bp->b_error = bip->bio_error;
+ if (bp->b_error)
+ bp->b_ioflags |= BIO_ERROR;
+ bufdone(bp);
+ g_destroy_bio(bip);
+}
+
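+/*
+ * Send a buffer down to a character device: look up the device's cdevsw with
+ * a thread reference held, hand the buffer to dev_strategy_csw(), and drop
+ * the reference afterwards.
+ */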
+void
+dev_strategy(struct cdev *dev, struct buf *bp)
+{
+ struct cdevsw *csw;
+ int ref;
+
+ KASSERT(dev->si_refcount > 0,
+ ("dev_strategy on un-referenced struct cdev *(%s) %p",
+ devtoname(dev), dev));
+
+ csw = dev_refthread(dev, &ref);
+ dev_strategy_csw(dev, csw, bp);
+ dev_relthread(dev, ref);
+}
+
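+/*
+ * Wrap a struct buf in a struct bio and pass it to the device's d_strategy
+ * routine. If no cdevsw is available, the buffer is failed with ENXIO.
+ */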
+void
+dev_strategy_csw(struct cdev *dev, struct cdevsw *csw, struct buf *bp)
+{
+ struct bio *bip;
+
+ KASSERT(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE,
+ ("b_iocmd botch"));
+ KASSERT(((dev->si_flags & SI_ETERNAL) != 0 && csw != NULL) ||
+ dev->si_threadcount > 0,
+ ("dev_strategy_csw threadcount cdev *(%s) %p", devtoname(dev),
+ dev));
+ if (csw == NULL) {
+ bp->b_error = ENXIO;
+ bp->b_ioflags = BIO_ERROR;
+ bufdone(bp);
+ return;
+ }
+ for (;;) {
+ bip = g_new_bio();
+ if (bip != NULL)
+ break;
+ /* Try again later */
+ tsleep(&bp, PRIBIO, "dev_strat", hz/10);
+ }
+ bip->bio_cmd = bp->b_iocmd;
+ bip->bio_offset = bp->b_iooffset;
+ bip->bio_length = bp->b_bcount;
+ bip->bio_bcount = bp->b_bcount; /* XXX: remove */
+ bdata2bio(bp, bip);
+ bip->bio_done = bufdonebio;
+ bip->bio_caller2 = bp;
+ bip->bio_dev = dev;
+ (*csw->d_strategy)(bip);
+}
+
+/*
+ * bufdone:
+ *
+ * Finish I/O on a buffer, optionally calling a completion function.
+ * This is usually called from an interrupt so process blocking is
+ * not allowed.
+ *
+ * biodone is also responsible for setting B_CACHE in a B_VMIO bp.
+ * In a non-VMIO bp, B_CACHE will be set on the next getblk()
+ * assuming B_INVAL is clear.
+ *
+ * For the VMIO case, we set B_CACHE if the op was a read and no
+ *	read error occurred, or if the op was a write. B_CACHE is never
+ * set if the buffer is invalid or otherwise uncacheable.
+ *
+ * biodone does not mess with B_INVAL, allowing the I/O routine or the
+ *	initiator to leave B_INVAL set to brelse the buffer out of existence
+ * in the biodone routine.
+ */
+void
+bufdone(struct buf *bp)
+{
+ struct bufobj *dropobj;
+ void (*biodone)(struct buf *);
+
+ CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ dropobj = NULL;
+
+ KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
+ BUF_ASSERT_HELD(bp);
+
+ runningbufwakeup(bp);
+ if (bp->b_iocmd == BIO_WRITE)
+ dropobj = bp->b_bufobj;
+ /* call optional completion function if requested */
+ if (bp->b_iodone != NULL) {
+ biodone = bp->b_iodone;
+ bp->b_iodone = NULL;
+ (*biodone) (bp);
+ if (dropobj)
+ bufobj_wdrop(dropobj);
+ return;
+ }
+
+ bufdone_finish(bp);
+
+ if (dropobj)
+ bufobj_wdrop(dropobj);
+}
+
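+/*
+ * Second half of bufdone(): update the VM pages backing a B_VMIO buffer
+ * (validity, busy state, bogus-page replacement), then release the buffer
+ * (async case) or wake up its waiter (sync case).
+ */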
+void
+bufdone_finish(struct buf *bp)
+{
+ BUF_ASSERT_HELD(bp);
+
+ if (!LIST_EMPTY(&bp->b_dep))
+ buf_complete(bp);
+
+ if (bp->b_flags & B_VMIO) {
+ vm_ooffset_t foff;
+ vm_page_t m;
+ vm_object_t obj;
+ struct vnode *vp;
+ int bogus, i, iosize;
+
+ obj = bp->b_bufobj->bo_object;
+ KASSERT(obj->paging_in_progress >= bp->b_npages,
+ ("biodone_finish: paging in progress(%d) < b_npages(%d)",
+ obj->paging_in_progress, bp->b_npages));
+
+ vp = bp->b_vp;
+ KASSERT(vp->v_holdcnt > 0,
+ ("biodone_finish: vnode %p has zero hold count", vp));
+ KASSERT(vp->v_object != NULL,
+ ("biodone_finish: vnode %p has no vm_object", vp));
+
+ foff = bp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("biodone_finish: bp %p has no buffer offset", bp));
+
+ /*
+ * Set B_CACHE if the op was a normal read and no error
+		 * occurred. B_CACHE is set for writes in the b*write()
+ * routines.
+ */
+ iosize = bp->b_bcount - bp->b_resid;
+ if (bp->b_iocmd == BIO_READ &&
+ !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
+ !(bp->b_ioflags & BIO_ERROR)) {
+ bp->b_flags |= B_CACHE;
+ }
+ bogus = 0;
+ VM_OBJECT_WLOCK(obj);
+ for (i = 0; i < bp->b_npages; i++) {
+ int bogusflag = 0;
+ int resid;
+
+ resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
+ if (resid > iosize)
+ resid = iosize;
+
+ /*
+ * cleanup bogus pages, restoring the originals
+ */
+ m = bp->b_pages[i];
+ if (m == bogus_page) {
+ bogus = bogusflag = 1;
+ m = vm_page_lookup(obj, OFF_TO_IDX(foff));
+ if (m == NULL)
+ panic("biodone: page disappeared!");
+ bp->b_pages[i] = m;
+ }
+ KASSERT(OFF_TO_IDX(foff) == m->pindex,
+ ("biodone_finish: foff(%jd)/pindex(%ju) mismatch",
+ (intmax_t)foff, (uintmax_t)m->pindex));
+
+ /*
+ * In the write case, the valid and clean bits are
+ * already changed correctly ( see bdwrite() ), so we
+ * only need to do this here in the read case.
+ */
+ if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
+ KASSERT((m->dirty & vm_page_bits(foff &
+ PAGE_MASK, resid)) == 0, ("bufdone_finish:"
+ " page %p has unexpected dirty bits", m));
+ vfs_page_set_valid(bp, foff, m);
+ }
+
+ vm_page_sunbusy(m);
+ vm_object_pip_subtract(obj, 1);
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ iosize -= resid;
+ }
+ vm_object_pip_wakeupn(obj, 0);
+ VM_OBJECT_WUNLOCK(obj);
+ if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+ bp->b_pages, bp->b_npages);
+ }
+ }
+
+ /*
+ * For asynchronous completions, release the buffer now. The brelse
+ * will do a wakeup there if necessary - so no need to do a wakeup
+ * here in the async case. The sync case always needs to do a wakeup.
+ */
+
+ if (bp->b_flags & B_ASYNC) {
+ if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR))
+ brelse(bp);
+ else
+ bqrelse(bp);
+ } else
+ bdone(bp);
+}
+
+/*
+ * This routine is called in lieu of iodone in the case of
+ * incomplete I/O. This keeps the busy status for pages
+ * consistent.
+ */
+void
+vfs_unbusy_pages(struct buf *bp)
+{
+ int i;
+ vm_object_t obj;
+ vm_page_t m;
+
+ runningbufwakeup(bp);
+ if (!(bp->b_flags & B_VMIO))
+ return;
+
+ obj = bp->b_bufobj->bo_object;
+ VM_OBJECT_WLOCK(obj);
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+ if (m == bogus_page) {
+ m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
+ if (!m)
+ panic("vfs_unbusy_pages: page missing\n");
+ bp->b_pages[i] = m;
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+ bp->b_pages, bp->b_npages);
+ } else
+ BUF_CHECK_UNMAPPED(bp);
+ }
+ vm_object_pip_subtract(obj, 1);
+ vm_page_sunbusy(m);
+ }
+ vm_object_pip_wakeupn(obj, 0);
+ VM_OBJECT_WUNLOCK(obj);
+}
+
+/*
+ * vfs_page_set_valid:
+ *
+ * Set the valid bits in a page based on the supplied offset. The
+ * range is restricted to the buffer's size.
+ *
+ * This routine is typically called after a read completes.
+ */
+static void
+vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
+{
+ vm_ooffset_t eoff;
+
+ /*
+ * Compute the end offset, eoff, such that [off, eoff) does not span a
+ * page boundary and eoff is not greater than the end of the buffer.
+ * The end of the buffer, in this case, is our file EOF, not the
+ * allocation size of the buffer.
+ */
+ eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
+ if (eoff > bp->b_offset + bp->b_bcount)
+ eoff = bp->b_offset + bp->b_bcount;
+
+ /*
+ * Set valid range. This is typically the entire buffer and thus the
+ * entire page.
+ */
+ if (eoff > off)
+ vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
+}
+
+/*
+ * vfs_page_set_validclean:
+ *
+ * Set the valid bits and clear the dirty bits in a page based on the
+ * supplied offset. The range is restricted to the buffer's size.
+ */
+static void
+vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
+{
+ vm_ooffset_t soff, eoff;
+
+ /*
+ * Start and end offsets in buffer. eoff - soff may not cross a
+	 * page boundary or cross the end of the buffer. The end of the
+ * buffer, in this case, is our file EOF, not the allocation size
+ * of the buffer.
+ */
+ soff = off;
+ eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ if (eoff > bp->b_offset + bp->b_bcount)
+ eoff = bp->b_offset + bp->b_bcount;
+
+ /*
+ * Set valid range. This is typically the entire buffer and thus the
+ * entire page.
+ */
+ if (eoff > soff) {
+ vm_page_set_validclean(
+ m,
+ (vm_offset_t) (soff & PAGE_MASK),
+ (vm_offset_t) (eoff - soff)
+ );
+ }
+}
+
+/*
+ * Ensure that all buffer pages are not exclusive busied. If any page is
+ * exclusive busy, drain it.
+ */
+void
+vfs_drain_busy_pages(struct buf *bp)
+{
+ vm_page_t m;
+ int i, last_busied;
+
+ VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
+ last_busied = 0;
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+ if (vm_page_xbusied(m)) {
+ for (; last_busied < i; last_busied++)
+ vm_page_sbusy(bp->b_pages[last_busied]);
+ while (vm_page_xbusied(m)) {
+ vm_page_lock(m);
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+ vm_page_busy_sleep(m, "vbpage");
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ }
+ }
+ }
+ for (i = 0; i < last_busied; i++)
+ vm_page_sunbusy(bp->b_pages[i]);
+}
+
+/*
+ * This routine is called before a device strategy routine.
+ * It is used to tell the VM system that paging I/O is in
+ * progress, and treat the pages associated with the buffer
+ * almost as being exclusive busy. Also the object paging_in_progress
+ * flag is handled to make sure that the object doesn't become
+ * inconsistent.
+ *
+ * Since I/O has not been initiated yet, certain buffer flags
+ * such as BIO_ERROR or B_INVAL may be in an inconsistent state
+ * and should be ignored.
+ */
+void
+vfs_busy_pages(struct buf *bp, int clear_modify)
+{
+ int i, bogus;
+ vm_object_t obj;
+ vm_ooffset_t foff;
+ vm_page_t m;
+
+ if (!(bp->b_flags & B_VMIO))
+ return;
+
+ obj = bp->b_bufobj->bo_object;
+ foff = bp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("vfs_busy_pages: no buffer offset"));
+ VM_OBJECT_WLOCK(obj);
+ vfs_drain_busy_pages(bp);
+ if (bp->b_bufsize != 0)
+ vfs_setdirty_locked_object(bp);
+ bogus = 0;
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+
+ if ((bp->b_flags & B_CLUSTER) == 0) {
+ vm_object_pip_add(obj, 1);
+ vm_page_sbusy(m);
+ }
+ /*
+		 * When readying a buffer for a read ( i.e.
+ * clear_modify == 0 ), it is important to do
+ * bogus_page replacement for valid pages in
+ * partially instantiated buffers. Partially
+ * instantiated buffers can, in turn, occur when
+ * reconstituting a buffer from its VM backing store
+ * base. We only have to do this if B_CACHE is
+ * clear ( which causes the I/O to occur in the
+ * first place ). The replacement prevents the read
+ * I/O from overwriting potentially dirty VM-backed
+ * pages. XXX bogus page replacement is, uh, bogus.
+ * It may not work properly with small-block devices.
+ * We need to find a better way.
+ */
+ if (clear_modify) {
+ pmap_remove_write(m);
+ vfs_page_set_validclean(bp, foff, m);
+ } else if (m->valid == VM_PAGE_BITS_ALL &&
+ (bp->b_flags & B_CACHE) == 0) {
+ bp->b_pages[i] = bogus_page;
+ bogus++;
+ }
+ foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
+ }
+ VM_OBJECT_WUNLOCK(obj);
+ if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+ bp->b_pages, bp->b_npages);
+ }
+}
+
+/*
+ * vfs_bio_set_valid:
+ *
+ * Set the range within the buffer to valid. The range is
+ * relative to the beginning of the buffer, b_offset. Note that
+ * b_offset itself may be offset from the beginning of the first
+ * page.
+ */
+void
+vfs_bio_set_valid(struct buf *bp, int base, int size)
+{
+ int i, n;
+ vm_page_t m;
+
+ if (!(bp->b_flags & B_VMIO))
+ return;
+
+ /*
+ * Fixup base to be relative to beginning of first page.
+ * Set initial n to be the maximum number of bytes in the
+ * first page that can be validated.
+ */
+ base += (bp->b_offset & PAGE_MASK);
+ n = PAGE_SIZE - (base & PAGE_MASK);
+
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
+ m = bp->b_pages[i];
+ if (n > size)
+ n = size;
+ vm_page_set_valid_range(m, base & PAGE_MASK, n);
+ base += n;
+ size -= n;
+ n = PAGE_SIZE;
+ }
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+}
+
+/*
+ * vfs_bio_clrbuf:
+ *
+ * If the specified buffer is a non-VMIO buffer, clear the entire
+ * buffer. If the specified buffer is a VMIO buffer, clear and
+ * validate only the previously invalid portions of the buffer.
+ * This routine essentially fakes an I/O, so we need to clear
+ * BIO_ERROR and B_INVAL.
+ *
+ * Note that while we only theoretically need to clear through b_bcount,
+ * we go ahead and clear through b_bufsize.
+ */
+void
+vfs_bio_clrbuf(struct buf *bp)
+{
+ int i, j, mask, sa, ea, slide;
+
+ if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
+ clrbuf(bp);
+ return;
+ }
+ bp->b_flags &= ~B_INVAL;
+ bp->b_ioflags &= ~BIO_ERROR;
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
+ (bp->b_offset & PAGE_MASK) == 0) {
+ if (bp->b_pages[0] == bogus_page)
+ goto unlock;
+ mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
+ VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
+ if ((bp->b_pages[0]->valid & mask) == mask)
+ goto unlock;
+ if ((bp->b_pages[0]->valid & mask) == 0) {
+ pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
+ bp->b_pages[0]->valid |= mask;
+ goto unlock;
+ }
+ }
+ sa = bp->b_offset & PAGE_MASK;
+ slide = 0;
+ for (i = 0; i < bp->b_npages; i++, sa = 0) {
+ slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
+ ea = slide & PAGE_MASK;
+ if (ea == 0)
+ ea = PAGE_SIZE;
+ if (bp->b_pages[i] == bogus_page)
+ continue;
+ j = sa / DEV_BSIZE;
+ mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
+ VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
+ if ((bp->b_pages[i]->valid & mask) == mask)
+ continue;
+ if ((bp->b_pages[i]->valid & mask) == 0)
+ pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
+ else {
+ for (; sa < ea; sa += DEV_BSIZE, j++) {
+ if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
+ pmap_zero_page_area(bp->b_pages[i],
+ sa, DEV_BSIZE);
+ }
+ }
+ }
+ bp->b_pages[i]->valid |= mask;
+ }
+unlock:
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+ bp->b_resid = 0;
+}
+
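+/*
+ * Zero a byte range of the buffer: bzero() the mapped data for mapped
+ * buffers, or zero the backing pages directly for unmapped ones.
+ */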
+void
+vfs_bio_bzero_buf(struct buf *bp, int base, int size)
+{
+ vm_page_t m;
+ int i, n;
+
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ bzero(bp->b_data + base, size);
+ } else {
+ BUF_CHECK_UNMAPPED(bp);
+ n = PAGE_SIZE - (base & PAGE_MASK);
+ for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
+ m = bp->b_pages[i];
+ if (n > size)
+ n = size;
+ pmap_zero_page_area(m, base & PAGE_MASK, n);
+ base += n;
+ size -= n;
+ n = PAGE_SIZE;
+ }
+ }
+}
+
+/*
+ * vm_hold_load_pages and vm_hold_free_pages get pages into
+ * a buffer's address space. The pages are anonymous and are
+ * not associated with a file object.
+ */
+static void
+vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
+{
+ vm_offset_t pg;
+ vm_page_t p;
+ int index;
+
+ BUF_CHECK_MAPPED(bp);
+
+ to = round_page(to);
+ from = round_page(from);
+ index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
+
+ for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
+tryagain:
+ /*
+ * note: must allocate system pages since blocking here
+ * could interfere with paging I/O, no matter which
+ * process we are.
+ */
+ p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
+ VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT));
+ if (p == NULL) {
+ VM_WAIT;
+ goto tryagain;
+ }
+ pmap_qenter(pg, &p, 1);
+ bp->b_pages[index] = p;
+ }
+ bp->b_npages = index;
+}
+
+/* Return pages associated with this buf to the vm system */
+static void
+vm_hold_free_pages(struct buf *bp, int newbsize)
+{
+ vm_offset_t from;
+ vm_page_t p;
+ int index, newnpages;
+
+ BUF_CHECK_MAPPED(bp);
+
+ from = round_page((vm_offset_t)bp->b_data + newbsize);
+ newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
+ if (bp->b_npages > newnpages)
+ pmap_qremove(from, bp->b_npages - newnpages);
+ for (index = newnpages; index < bp->b_npages; index++) {
+ p = bp->b_pages[index];
+ bp->b_pages[index] = NULL;
+ if (vm_page_sbusied(p))
+ printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
+ (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
+ p->wire_count--;
+ vm_page_free(p);
+ atomic_subtract_int(&cnt.v_wire_count, 1);
+ }
+ bp->b_npages = newnpages;
+}
+
+/*
+ * Map an IO request into kernel virtual address space.
+ *
+ * All requests are (re)mapped into kernel VA space.
+ * Notice that we use b_bufsize for the size of the buffer
+ * to be mapped. b_bcount might be modified by the driver.
+ *
+ * Note that even if the caller determines that the address space should
+ * be valid, a race or a smaller file mapped into a larger space may
+ * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
+ * check the return value.
+ */
+int
+vmapbuf(struct buf *bp, int mapbuf)
+{
+ caddr_t kva;
+ vm_prot_t prot;
+ int pidx;
+
+ if (bp->b_bufsize < 0)
+ return (-1);
+ prot = VM_PROT_READ;
+ if (bp->b_iocmd == BIO_READ)
+ prot |= VM_PROT_WRITE; /* Less backwards than it looks */
+ if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
+ (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
+ btoc(MAXPHYS))) < 0)
+ return (-1);
+ bp->b_npages = pidx;
+ if (mapbuf || !unmapped_buf_allowed) {
+ pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
+ kva = bp->b_saveaddr;
+ bp->b_saveaddr = bp->b_data;
+ bp->b_data = kva + (((vm_offset_t)bp->b_data) & PAGE_MASK);
+ bp->b_flags &= ~B_UNMAPPED;
+ } else {
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
+ bp->b_saveaddr = bp->b_data;
+ bp->b_data = unmapped_buf;
+ }
+ return(0);
+}
+
+/*
+ * Free the io map PTEs associated with this IO operation.
+ * We also invalidate the TLB entries and restore the original b_addr.
+ */
+void
+vunmapbuf(struct buf *bp)
+{
+ int npages;
+
+ npages = bp->b_npages;
+ if (bp->b_flags & B_UNMAPPED)
+ bp->b_flags &= ~B_UNMAPPED;
+ else
+ pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
+ vm_page_unhold_pages(bp->b_pages, npages);
+
+ bp->b_data = bp->b_saveaddr;
+}
+
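+/*
+ * Mark the buffer done and wake up any thread sleeping on it in bwait().
+ */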
+void
+bdone(struct buf *bp)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ bp->b_flags |= B_DONE;
+ wakeup(bp);
+ mtx_unlock(mtxp);
+}
+
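+/*
+ * Sleep until bdone() sets B_DONE on the buffer.
+ */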
+void
+bwait(struct buf *bp, u_char pri, const char *wchan)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ while ((bp->b_flags & B_DONE) == 0)
+ msleep(bp, mtxp, pri, wchan, 0);
+ mtx_unlock(mtxp);
+}
+
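+/* Default bufobj sync method: fsync the vnode backing the buffer object. */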
+int
+bufsync(struct bufobj *bo, int waitfor)
+{
+
+ return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread));
+}
+
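+/*
+ * Default bufobj strategy method: pass the buffer to VOP_STRATEGY() on the
+ * associated vnode.
+ */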
+void
+bufstrategy(struct bufobj *bo, struct buf *bp)
+{
+ int i = 0;
+ struct vnode *vp;
+
+ vp = bp->b_vp;
+ KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
+ KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
+ ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
+ i = VOP_STRATEGY(vp, bp);
+ KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
+}
+
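+/*
+ * Account for an output (write) in flight on the buffer object; the caller
+ * already holds the bufobj lock.
+ */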
+void
+bufobj_wrefl(struct bufobj *bo)
+{
+
+ KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
+ ASSERT_BO_WLOCKED(bo);
+ bo->bo_numoutput++;
+}
+
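+/*
+ * Account for an output in flight, acquiring the bufobj lock internally.
+ */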
+void
+bufobj_wref(struct bufobj *bo)
+{
+
+ KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
+ BO_LOCK(bo);
+ bo->bo_numoutput++;
+ BO_UNLOCK(bo);
+}
+
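+/*
+ * Drop one in-flight output; wake up bufobj_wwait() sleepers when the count
+ * reaches zero.
+ */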
+void
+bufobj_wdrop(struct bufobj *bo)
+{
+
+ KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
+ BO_LOCK(bo);
+ KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
+ if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) {
+ bo->bo_flag &= ~BO_WWAIT;
+ wakeup(&bo->bo_numoutput);
+ }
+ BO_UNLOCK(bo);
+}
+
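+/*
+ * Wait until all outputs in flight on the buffer object have completed.
+ */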
+int
+bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
+{
+ int error;
+
+ KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
+ ASSERT_BO_WLOCKED(bo);
+ error = 0;
+ while (bo->bo_numoutput) {
+ bo->bo_flag |= BO_WWAIT;
+ error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
+ slpflag | (PRIBIO + 1), "bo_wwait", timeo);
+ if (error)
+ break;
+ }
+ return (error);
+}
+
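+/*
+ * Buffer pinning: bpin() and bunpin() adjust b_pin_count under the buffer's
+ * pool mutex, and bunpin_wait() sleeps until the pin count drops to zero.
+ */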
+void
+bpin(struct buf *bp)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ bp->b_pin_count++;
+ mtx_unlock(mtxp);
+}
+
+void
+bunpin(struct buf *bp)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ if (--bp->b_pin_count == 0)
+ wakeup(bp);
+ mtx_unlock(mtxp);
+}
+
+void
+bunpin_wait(struct buf *bp)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ while (bp->b_pin_count > 0)
+ msleep(bp, mtxp, PRIBIO, "bwunpin", 0);
+ mtx_unlock(mtxp);
+}
+
+/*
+ * Set bio_data or bio_ma for struct bio from the struct buf.
+ */
+void
+bdata2bio(struct buf *bp, struct bio *bip)
+{
+
+ if ((bp->b_flags & B_UNMAPPED) != 0) {
+ KASSERT(unmapped_buf_allowed, ("unmapped"));
+ bip->bio_ma = bp->b_pages;
+ bip->bio_ma_n = bp->b_npages;
+ bip->bio_data = unmapped_buf;
+ bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
+ bip->bio_flags |= BIO_UNMAPPED;
+ KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
+ PAGE_SIZE == bp->b_npages,
+ ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
+ (long long)bip->bio_length, bip->bio_ma_n));
+ } else {
+ bip->bio_data = bp->b_data;
+ bip->bio_ma = NULL;
+ }
+}
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+/* DDB command to show buffer data */
+DB_SHOW_COMMAND(buffer, db_show_buffer)
+{
+ /* get args */
+ struct buf *bp = (struct buf *)addr;
+
+ if (!have_addr) {
+ db_printf("usage: show buffer <addr>\n");
+ return;
+ }
+
+ db_printf("buf at %p\n", bp);
+ db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
+ (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
+ PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
+ db_printf(
+ "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
+ "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
+ "b_dep = %p\n",
+ bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
+ bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
+ (intmax_t)bp->b_lblkno, bp->b_dep.lh_first);
+ if (bp->b_npages) {
+ int i;
+ db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_t m;
+ m = bp->b_pages[i];
+ db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
+ (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
+ if ((i + 1) < bp->b_npages)
+ db_printf(",");
+ }
+ db_printf("\n");
+ }
+ db_printf(" ");
+ BUF_LOCKPRINTINFO(bp);
+}
+
+DB_SHOW_COMMAND(lockedbufs, lockedbufs)
+{
+ struct buf *bp;
+ int i;
+
+ for (i = 0; i < nbuf; i++) {
+ bp = &buf[i];
+ if (BUF_ISLOCKED(bp)) {
+ db_show_buffer((uintptr_t)bp, 1, 0, NULL);
+ db_printf("\n");
+ }
+ }
+}
+
+DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
+{
+ struct vnode *vp;
+ struct buf *bp;
+
+ if (!have_addr) {
+ db_printf("usage: show vnodebufs <addr>\n");
+ return;
+ }
+ vp = (struct vnode *)addr;
+ db_printf("Clean buffers:\n");
+ TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
+ db_show_buffer((uintptr_t)bp, 1, 0, NULL);
+ db_printf("\n");
+ }
+ db_printf("Dirty buffers:\n");
+ TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
+ db_show_buffer((uintptr_t)bp, 1, 0, NULL);
+ db_printf("\n");
+ }
+}
+
+DB_COMMAND(countfreebufs, db_coundfreebufs)
+{
+ struct buf *bp;
+ int i, used = 0, nfree = 0;
+
+ if (have_addr) {
+ db_printf("usage: countfreebufs\n");
+ return;
+ }
+
+ for (i = 0; i < nbuf; i++) {
+ bp = &buf[i];
+ if ((bp->b_flags & B_INFREECNT) != 0)
+ nfree++;
+ else
+ used++;
+ }
+
+ db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
+ nfree + used);
+ db_printf("numfreebuffers is %d\n", numfreebuffers);
+}
+#endif /* DDB */
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
new file mode 100644
index 0000000..31ed545
--- /dev/null
+++ b/sys/kern/vfs_cache.c
@@ -0,0 +1,1486 @@
+/*-
+ * Copyright (c) 1989, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Poul-Henning Kamp of the FreeBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/filedesc.h>
+#include <sys/fnv_hash.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/fcntl.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/sdt.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/vnode.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <vm/uma.h>
+
+SDT_PROVIDER_DECLARE(vfs);
+SDT_PROBE_DEFINE3(vfs, namecache, enter, done, done, "struct vnode *", "char *",
+ "struct vnode *");
+SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, done, "struct vnode *",
+ "char *");
+SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, entry, "struct vnode *");
+SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, hit, "struct vnode *",
+ "char *", "struct vnode *");
+SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, miss, "struct vnode *");
+SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, return, "int",
+ "struct vnode *", "char *");
+SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, hit, "struct vnode *", "char *",
+ "struct vnode *");
+SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit_negative, hit-negative,
+ "struct vnode *", "char *");
+SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, miss, "struct vnode *",
+ "char *");
+SDT_PROBE_DEFINE1(vfs, namecache, purge, done, done, "struct vnode *");
+SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, done, "struct vnode *");
+SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, done, "struct mount *");
+SDT_PROBE_DEFINE3(vfs, namecache, zap, done, done, "struct vnode *", "char *",
+ "struct vnode *");
+SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, done, "struct vnode *",
+ "char *");
+
+/*
+ * This structure describes the elements in the cache of recent
+ * names looked up by namei.
+ */
+
+struct namecache {
+ LIST_ENTRY(namecache) nc_hash; /* hash chain */
+ LIST_ENTRY(namecache) nc_src; /* source vnode list */
+ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
+ struct vnode *nc_dvp; /* vnode of parent of name */
+ struct vnode *nc_vp; /* vnode the name refers to */
+ u_char nc_flag; /* flag bits */
+ u_char nc_nlen; /* length of name */
+ char nc_name[0]; /* segment name + nul */
+};
+
+/*
+ * struct namecache_ts repeats struct namecache layout up to the
+ * nc_nlen member.
+ * struct namecache_ts is used in place of struct namecache when time(s) need
+ * to be stored. The nc_dotdottime field is used when a cache entry is mapping
+ * to be stored. The nc_dotdottime field is used when a cache entry maps
+ * both a non-dotdot directory name and dotdot for the directory's
+ */
+struct namecache_ts {
+ LIST_ENTRY(namecache) nc_hash; /* hash chain */
+ LIST_ENTRY(namecache) nc_src; /* source vnode list */
+ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
+ struct vnode *nc_dvp; /* vnode of parent of name */
+ struct vnode *nc_vp; /* vnode the name refers to */
+ u_char nc_flag; /* flag bits */
+ u_char nc_nlen; /* length of name */
+ struct timespec nc_time; /* timespec provided by fs */
+ struct timespec nc_dotdottime; /* dotdot timespec provided by fs */
+ int nc_ticks; /* ticks value when entry was added */
+ char nc_name[0]; /* segment name + nul */
+};
+
+/*
+ * Flags in namecache.nc_flag
+ */
+#define NCF_WHITE 0x01
+#define NCF_ISDOTDOT 0x02
+#define NCF_TS 0x04
+#define NCF_DTS 0x08
+
+/*
+ * Name caching works as follows:
+ *
+ * Names found by directory scans are retained in a cache
+ * for future reference. It is managed LRU, so frequently
+ * used names will hang around. Cache is indexed by hash value
+ * obtained from (vp, name) where vp refers to the directory
+ * containing name.
+ *
+ * If it is a "negative" entry, (i.e. for a name that is known NOT to
+ * exist) the vnode pointer will be NULL.
+ *
+ * Upon reaching the last segment of a path, if the reference
+ * is for DELETE, or NOCACHE is set (rewrite), and the
+ * name is located in the cache, it will be dropped.
+ */
+
+/*
+ * Structures associated with name caching.
+ */
+#define NCHHASH(hash) \
+ (&nchashtbl[(hash) & nchash])
+static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */
+static TAILQ_HEAD(, namecache) ncneg;	/* Negative entry LRU list */
+static u_long nchash; /* size of hash table */
+SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
+ "Size of namecache hash table");
+static u_long ncnegfactor = 16; /* ratio of negative entries */
+SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
+ "Ratio of negative namecache entries");
+static u_long numneg; /* number of negative entries allocated */
+SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
+ "Number of negative entries in namecache");
+static u_long numcache; /* number of cache entries allocated */
+SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
+ "Number of namecache entries");
+static u_long numcachehv; /* number of cache entries with vnodes held */
+SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0,
+ "Number of namecache entries with vnodes held");
+static u_int ncsizefactor = 2;
+SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
+ "Size factor for namecache");
+
+struct nchstats nchstats; /* cache effectiveness statistics */
+
+static struct rwlock cache_lock;
+RW_SYSINIT(vfscache, &cache_lock, "Name Cache");
+
+#define CACHE_UPGRADE_LOCK() rw_try_upgrade(&cache_lock)
+#define CACHE_RLOCK() rw_rlock(&cache_lock)
+#define CACHE_RUNLOCK() rw_runlock(&cache_lock)
+#define CACHE_WLOCK() rw_wlock(&cache_lock)
+#define CACHE_WUNLOCK() rw_wunlock(&cache_lock)
+
+/*
+ * UMA zones for the VFS cache.
+ *
+ * The small cache is used for entries with short names, which are the
+ * most common. The large cache is used for entries which are too big to
+ * fit in the small cache.
+ */
+static uma_zone_t cache_zone_small;
+static uma_zone_t cache_zone_small_ts;
+static uma_zone_t cache_zone_large;
+static uma_zone_t cache_zone_large_ts;
+
+#define CACHE_PATH_CUTOFF 35
+
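+/*
+ * Allocate a namecache entry from the UMA zone matching the name length and
+ * whether timestamps need to be stored.
+ */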
+static struct namecache *
+cache_alloc(int len, int ts)
+{
+
+ if (len > CACHE_PATH_CUTOFF) {
+ if (ts)
+ return (uma_zalloc(cache_zone_large_ts, M_WAITOK));
+ else
+ return (uma_zalloc(cache_zone_large, M_WAITOK));
+ }
+ if (ts)
+ return (uma_zalloc(cache_zone_small_ts, M_WAITOK));
+ else
+ return (uma_zalloc(cache_zone_small, M_WAITOK));
+}
+
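+/*
+ * Return a namecache entry to the UMA zone it was allocated from.
+ */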
+static void
+cache_free(struct namecache *ncp)
+{
+ int ts;
+
+ if (ncp == NULL)
+ return;
+ ts = ncp->nc_flag & NCF_TS;
+ if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) {
+ if (ts)
+ uma_zfree(cache_zone_small_ts, ncp);
+ else
+ uma_zfree(cache_zone_small, ncp);
+ } else if (ts)
+ uma_zfree(cache_zone_large_ts, ncp);
+ else
+ uma_zfree(cache_zone_large, ncp);
+}
+
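+/*
+ * Return a pointer to the name stored in the entry, accounting for the
+ * larger struct namecache_ts layout.
+ */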
+static char *
+nc_get_name(struct namecache *ncp)
+{
+ struct namecache_ts *ncp_ts;
+
+ if ((ncp->nc_flag & NCF_TS) == 0)
+ return (ncp->nc_name);
+ ncp_ts = (struct namecache_ts *)ncp;
+ return (ncp_ts->nc_name);
+}
+
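+/*
+ * Copy the stored timestamp and ticks value out of a timestamped namecache
+ * entry.
+ */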
+static void
+cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
+{
+
+ KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
+ (tsp == NULL && ticksp == NULL),
+ ("No NCF_TS"));
+
+ if (tsp != NULL)
+ *tsp = ((struct namecache_ts *)ncp)->nc_time;
+ if (ticksp != NULL)
+ *ticksp = ((struct namecache_ts *)ncp)->nc_ticks;
+}
+
+static int doingcache = 1; /* 1 => enable the cache */
+SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
+ "VFS namecache enabled");
+
+/* Export size information to userland */
+SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0,
+ sizeof(struct namecache), "sizeof(struct namecache)");
+
+/*
+ * The new name cache statistics
+ */
+static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
+ "Name cache statistics");
+#define STATNODE(mode, name, var, descr) \
+ SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, descr);
+STATNODE(CTLFLAG_RD, numneg, &numneg, "Number of negative cache entries");
+STATNODE(CTLFLAG_RD, numcache, &numcache, "Number of cache entries");
+static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls,
+ "Number of cache lookups");
+static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits,
+ "Number of '.' hits");
+static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits,
+ "Number of '..' hits");
+static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks,
+ "Number of checks in lookup");
+static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss,
+ "Number of cache misses");
+static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap,
+ "Number of cache misses we do not want to cache");
+static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps,
+ "Number of cache hits (positive) we do not want to cache");
+static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits,
+ "Number of cache hits (positive)");
+static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps,
+ "Number of cache hits (negative) we do not want to cache");
+static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits,
+ "Number of cache hits (negative)");
+static u_long numupgrades; STATNODE(CTLFLAG_RD, numupgrades, &numupgrades,
+ "Number of updates of the cache after lookup (write lock + retry)");
+
+SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD | CTLFLAG_MPSAFE,
+ &nchstats, sizeof(nchstats), "LU",
+ "VFS cache effectiveness statistics");
+
+
+
+static void cache_zap(struct namecache *ncp);
+static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
+ u_int *buflen);
+static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
+ char *buf, char **retbuf, u_int buflen);
+
+static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
+
+#ifdef DIAGNOSTIC
+/*
+ * Grab an atomic snapshot of the name cache hash chain lengths
+ */
+static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL,
+ "hash table stats");
+
+static int
+sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct nchashhead *ncpp;
+ struct namecache *ncp;
+ int n_nchash;
+ int count;
+
+ n_nchash = nchash + 1; /* nchash is max index, not count */
+ if (!req->oldptr)
+ return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
+
+ /* Scan hash tables for applicable entries */
+ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
+ CACHE_RLOCK();
+ count = 0;
+ LIST_FOREACH(ncp, ncpp, nc_hash) {
+ count++;
+ }
+ CACHE_RUNLOCK();
+ error = SYSCTL_OUT(req, &count, sizeof(count));
+ if (error)
+ return (error);
+ }
+ return (0);
+}
+SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
+ CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
+ "nchash chain lengths");
+
+static int
+sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct nchashhead *ncpp;
+ struct namecache *ncp;
+ int n_nchash;
+ int count, maxlength, used, pct;
+
+ if (!req->oldptr)
+ return SYSCTL_OUT(req, 0, 4 * sizeof(int));
+
+ n_nchash = nchash + 1; /* nchash is max index, not count */
+ used = 0;
+ maxlength = 0;
+
+ /* Scan hash tables for applicable entries */
+ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
+ count = 0;
+ CACHE_RLOCK();
+ LIST_FOREACH(ncp, ncpp, nc_hash) {
+ count++;
+ }
+ CACHE_RUNLOCK();
+ if (count)
+ used++;
+ if (maxlength < count)
+ maxlength = count;
+ }
+ n_nchash = nchash + 1;
+ pct = (used * 100) / (n_nchash / 100);
+ error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
+ if (error)
+ return (error);
+ error = SYSCTL_OUT(req, &used, sizeof(used));
+ if (error)
+ return (error);
+ error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
+ if (error)
+ return (error);
+ error = SYSCTL_OUT(req, &pct, sizeof(pct));
+ if (error)
+ return (error);
+ return (0);
+}
+SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
+ CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
+ "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
+#endif
+
+/*
+ * cache_zap():
+ *
+ * Removes a namecache entry from cache, whether it contains an actual
+ * pointer to a vnode or is just a negative cache entry.
+ */
+static void
+cache_zap(struct namecache *ncp)
+{
+ struct vnode *vp;
+
+ rw_assert(&cache_lock, RA_WLOCKED);
+ CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
+#ifdef KDTRACE_HOOKS
+ if (ncp->nc_vp != NULL) {
+ SDT_PROBE(vfs, namecache, zap, done, ncp->nc_dvp,
+ nc_get_name(ncp), ncp->nc_vp, 0, 0);
+ } else {
+ SDT_PROBE(vfs, namecache, zap_negative, done, ncp->nc_dvp,
+ nc_get_name(ncp), 0, 0, 0);
+ }
+#endif
+ vp = NULL;
+ LIST_REMOVE(ncp, nc_hash);
+ if (ncp->nc_flag & NCF_ISDOTDOT) {
+ if (ncp == ncp->nc_dvp->v_cache_dd)
+ ncp->nc_dvp->v_cache_dd = NULL;
+ } else {
+ LIST_REMOVE(ncp, nc_src);
+ if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
+ vp = ncp->nc_dvp;
+ numcachehv--;
+ }
+ }
+ if (ncp->nc_vp) {
+ TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
+ if (ncp == ncp->nc_vp->v_cache_dd)
+ ncp->nc_vp->v_cache_dd = NULL;
+ } else {
+ TAILQ_REMOVE(&ncneg, ncp, nc_dst);
+ numneg--;
+ }
+ numcache--;
+ cache_free(ncp);
+ if (vp)
+ vdrop(vp);
+}
+
+/*
+ * Lookup an entry in the cache
+ *
+ * Lookup is called with dvp pointing to the directory to search,
+ * cnp pointing to the name of the entry being sought. If the lookup
+ * succeeds, the vnode is returned in *vpp, and a status of -1 is
+ * returned. If the lookup determines that the name does not exist
+ * (negative caching), a status of ENOENT is returned. If the lookup
+ * fails, a status of zero is returned. If the directory vnode is
+ * recycled out from under us due to a forced unmount, a status of
+ * ENOENT is returned.
+ *
+ * vpp is locked and ref'd on return. If we're looking up DOTDOT, dvp is
+ * unlocked. If we're looking up . an extra ref is taken, but the lock is
+ * not recursively acquired.
+ */
+
+int
+cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
+    struct timespec *tsp, int *ticksp)
+{
+ struct namecache *ncp;
+ uint32_t hash;
+ int error, ltype, wlocked;
+
+ if (!doingcache) {
+ cnp->cn_flags &= ~MAKEENTRY;
+ return (0);
+ }
+retry:
+ CACHE_RLOCK();
+ wlocked = 0;
+ numcalls++;
+ error = 0;
+
+retry_wlocked:
+ if (cnp->cn_nameptr[0] == '.') {
+ if (cnp->cn_namelen == 1) {
+ *vpp = dvp;
+ CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
+ dvp, cnp->cn_nameptr);
+ dothits++;
+ SDT_PROBE(vfs, namecache, lookup, hit, dvp, ".",
+ *vpp, 0, 0);
+ if (tsp != NULL)
+ timespecclear(tsp);
+ if (ticksp != NULL)
+ *ticksp = ticks;
+ goto success;
+ }
+ if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
+ dotdothits++;
+ if (dvp->v_cache_dd == NULL) {
+ SDT_PROBE(vfs, namecache, lookup, miss, dvp,
+ "..", NULL, 0, 0);
+ goto unlock;
+ }
+ if ((cnp->cn_flags & MAKEENTRY) == 0) {
+ if (!wlocked && !CACHE_UPGRADE_LOCK())
+ goto wlock;
+ if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT)
+ cache_zap(dvp->v_cache_dd);
+ dvp->v_cache_dd = NULL;
+ CACHE_WUNLOCK();
+ return (0);
+ }
+ ncp = dvp->v_cache_dd;
+ if (ncp->nc_flag & NCF_ISDOTDOT)
+ *vpp = ncp->nc_vp;
+ else
+ *vpp = ncp->nc_dvp;
+ /* Return failure if negative entry was found. */
+ if (*vpp == NULL)
+ goto negative_success;
+ CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
+ dvp, cnp->cn_nameptr, *vpp);
+ SDT_PROBE(vfs, namecache, lookup, hit, dvp, "..",
+ *vpp, 0, 0);
+ cache_out_ts(ncp, tsp, ticksp);
+ if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
+ NCF_DTS && tsp != NULL)
+ *tsp = ((struct namecache_ts *)ncp)->
+ nc_dotdottime;
+ goto success;
+ }
+ }
+
+ hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
+ hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
+ LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+ numchecks++;
+ if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
+ !bcmp(nc_get_name(ncp), cnp->cn_nameptr, ncp->nc_nlen))
+ break;
+ }
+
+ /* We failed to find an entry */
+ if (ncp == NULL) {
+ SDT_PROBE(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
+ NULL, 0, 0);
+ if ((cnp->cn_flags & MAKEENTRY) == 0) {
+ nummisszap++;
+ } else {
+ nummiss++;
+ }
+ nchstats.ncs_miss++;
+ goto unlock;
+ }
+
+ /* We don't want to have an entry, so dump it */
+ if ((cnp->cn_flags & MAKEENTRY) == 0) {
+ numposzaps++;
+ nchstats.ncs_badhits++;
+ if (!wlocked && !CACHE_UPGRADE_LOCK())
+ goto wlock;
+ cache_zap(ncp);
+ CACHE_WUNLOCK();
+ return (0);
+ }
+
+ /* We found a "positive" match, return the vnode */
+ if (ncp->nc_vp) {
+ numposhits++;
+ nchstats.ncs_goodhits++;
+ *vpp = ncp->nc_vp;
+ CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
+ dvp, cnp->cn_nameptr, *vpp, ncp);
+ SDT_PROBE(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp),
+ *vpp, 0, 0);
+ cache_out_ts(ncp, tsp, ticksp);
+ goto success;
+ }
+
+negative_success:
+ /* We found a negative match, and want to create it, so purge */
+ if (cnp->cn_nameiop == CREATE) {
+ numnegzaps++;
+ nchstats.ncs_badhits++;
+ if (!wlocked && !CACHE_UPGRADE_LOCK())
+ goto wlock;
+ cache_zap(ncp);
+ CACHE_WUNLOCK();
+ return (0);
+ }
+
+ if (!wlocked && !CACHE_UPGRADE_LOCK())
+ goto wlock;
+ numneghits++;
+ /*
+ * We found a "negative" match, so we shift it to the end of
+ * the "negative" cache entries queue to satisfy LRU. Also,
+ * check to see if the entry is a whiteout; indicate this to
+ * the componentname, if so.
+ */
+ TAILQ_REMOVE(&ncneg, ncp, nc_dst);
+ TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
+ nchstats.ncs_neghits++;
+ if (ncp->nc_flag & NCF_WHITE)
+ cnp->cn_flags |= ISWHITEOUT;
+ SDT_PROBE(vfs, namecache, lookup, hit_negative, dvp, nc_get_name(ncp),
+ 0, 0, 0);
+ cache_out_ts(ncp, tsp, ticksp);
+ CACHE_WUNLOCK();
+ return (ENOENT);
+
+wlock:
+ /*
+ * We need to update the cache after our lookup, so upgrade to
+ * a write lock and retry the operation.
+ */
+ CACHE_RUNLOCK();
+ CACHE_WLOCK();
+ numupgrades++;
+ wlocked = 1;
+ goto retry_wlocked;
+
+success:
+ /*
+ * On success we return a locked and ref'd vnode as per the lookup
+ * protocol.
+ */
+ if (dvp == *vpp) { /* lookup on "." */
+ VREF(*vpp);
+ if (wlocked)
+ CACHE_WUNLOCK();
+ else
+ CACHE_RUNLOCK();
+ /*
+ * When we lookup "." we still can be asked to lock it
+ * differently...
+ */
+ ltype = cnp->cn_lkflags & LK_TYPE_MASK;
+ if (ltype != VOP_ISLOCKED(*vpp)) {
+ if (ltype == LK_EXCLUSIVE) {
+ vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
+ if ((*vpp)->v_iflag & VI_DOOMED) {
+ /* forced unmount */
+ vrele(*vpp);
+ *vpp = NULL;
+ return (ENOENT);
+ }
+ } else
+ vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
+ }
+ return (-1);
+ }
+ ltype = 0; /* silence gcc warning */
+ if (cnp->cn_flags & ISDOTDOT) {
+ ltype = VOP_ISLOCKED(dvp);
+ VOP_UNLOCK(dvp, 0);
+ }
+ VI_LOCK(*vpp);
+ if (wlocked)
+ CACHE_WUNLOCK();
+ else
+ CACHE_RUNLOCK();
+ error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, cnp->cn_thread);
+ if (cnp->cn_flags & ISDOTDOT) {
+ vn_lock(dvp, ltype | LK_RETRY);
+ if (dvp->v_iflag & VI_DOOMED) {
+ if (error == 0)
+ vput(*vpp);
+ *vpp = NULL;
+ return (ENOENT);
+ }
+ }
+ if (error) {
+ *vpp = NULL;
+ goto retry;
+ }
+ if ((cnp->cn_flags & ISLASTCN) &&
+ (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
+ ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
+ }
+ return (-1);
+
+unlock:
+ if (wlocked)
+ CACHE_WUNLOCK();
+ else
+ CACHE_RUNLOCK();
+ return (0);
+}
+
+/*
+ * Add an entry to the cache.
+ */
+void
+cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
+    struct timespec *tsp, struct timespec *dtsp)
+{
+ struct namecache *ncp, *n2;
+ struct namecache_ts *n3;
+ struct nchashhead *ncpp;
+ uint32_t hash;
+ int flag;
+ int hold;
+ int zap;
+ int len;
+
+ CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
+ VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
+ ("cache_enter: Adding a doomed vnode"));
+ VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
+ ("cache_enter: Doomed vnode used as src"));
+
+ if (!doingcache)
+ return;
+
+ /*
+ * Avoid blowout in namecache entries.
+ */
+ if (numcache >= desiredvnodes * ncsizefactor)
+ return;
+
+ flag = 0;
+ if (cnp->cn_nameptr[0] == '.') {
+ if (cnp->cn_namelen == 1)
+ return;
+ if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
+ CACHE_WLOCK();
+ /*
+ * If dotdot entry already exists, just retarget it
+ * to new parent vnode, otherwise continue with new
+ * namecache entry allocation.
+ */
+ if ((ncp = dvp->v_cache_dd) != NULL &&
+ ncp->nc_flag & NCF_ISDOTDOT) {
+ KASSERT(ncp->nc_dvp == dvp,
+ ("wrong isdotdot parent"));
+ if (ncp->nc_vp != NULL)
+ TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
+ ncp, nc_dst);
+ else
+ TAILQ_REMOVE(&ncneg, ncp, nc_dst);
+ if (vp != NULL)
+ TAILQ_INSERT_HEAD(&vp->v_cache_dst,
+ ncp, nc_dst);
+ else
+ TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
+ ncp->nc_vp = vp;
+ CACHE_WUNLOCK();
+ return;
+ }
+ dvp->v_cache_dd = NULL;
+ SDT_PROBE(vfs, namecache, enter, done, dvp, "..", vp,
+ 0, 0);
+ CACHE_WUNLOCK();
+ flag = NCF_ISDOTDOT;
+ }
+ }
+
+ hold = 0;
+ zap = 0;
+
+ /*
+ * Calculate the hash key and setup as much of the new
+ * namecache entry as possible before acquiring the lock.
+ */
+ ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
+ ncp->nc_vp = vp;
+ ncp->nc_dvp = dvp;
+ ncp->nc_flag = flag;
+ if (tsp != NULL) {
+ n3 = (struct namecache_ts *)ncp;
+ n3->nc_time = *tsp;
+ n3->nc_ticks = ticks;
+ n3->nc_flag |= NCF_TS;
+ if (dtsp != NULL) {
+ n3->nc_dotdottime = *dtsp;
+ n3->nc_flag |= NCF_DTS;
+ }
+ }
+ len = ncp->nc_nlen = cnp->cn_namelen;
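+ /*
+ * The hash key covers both the component name and the parent
+ * vnode pointer, so the same name under different directories
+ * falls into different hash chains.
+ */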
+ hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
+ strlcpy(nc_get_name(ncp), cnp->cn_nameptr, len + 1);
+ hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
+ CACHE_WLOCK();
+
+ /*
+ * See if this vnode or negative entry is already in the cache
+ * with this name. This can happen with concurrent lookups of
+ * the same path name.
+ */
+ ncpp = NCHHASH(hash);
+ LIST_FOREACH(n2, ncpp, nc_hash) {
+ if (n2->nc_dvp == dvp &&
+ n2->nc_nlen == cnp->cn_namelen &&
+ !bcmp(nc_get_name(n2), cnp->cn_nameptr, n2->nc_nlen)) {
+ if (tsp != NULL) {
+ KASSERT((n2->nc_flag & NCF_TS) != 0,
+ ("no NCF_TS"));
+ n3 = (struct namecache_ts *)n2;
+ n3->nc_time =
+ ((struct namecache_ts *)ncp)->nc_time;
+ n3->nc_ticks =
+ ((struct namecache_ts *)ncp)->nc_ticks;
+ if (dtsp != NULL) {
+ n3->nc_dotdottime =
+ ((struct namecache_ts *)ncp)->
+ nc_dotdottime;
+ n3->nc_flag |= NCF_DTS;
+ }
+ }
+ CACHE_WUNLOCK();
+ cache_free(ncp);
+ return;
+ }
+ }
+
+ if (flag == NCF_ISDOTDOT) {
+ /*
+ * See if we are trying to add a ".." entry, but some other lookup
+ * has already populated the v_cache_dd pointer.
+ */
+ if (dvp->v_cache_dd != NULL) {
+ CACHE_WUNLOCK();
+ cache_free(ncp);
+ return;
+ }
+ KASSERT(vp == NULL || vp->v_type == VDIR,
+ ("wrong vnode type %p", vp));
+ dvp->v_cache_dd = ncp;
+ }
+
+ numcache++;
+ if (!vp) {
+ numneg++;
+ if (cnp->cn_flags & ISWHITEOUT)
+ ncp->nc_flag |= NCF_WHITE;
+ } else if (vp->v_type == VDIR) {
+ if (flag != NCF_ISDOTDOT) {
+ /*
+ * In this case, the cache entry maps both the name of
+ * the directory and the name ".." for the directory's
+ * parent.
+ */
+ if ((n2 = vp->v_cache_dd) != NULL &&
+ (n2->nc_flag & NCF_ISDOTDOT) != 0)
+ cache_zap(n2);
+ vp->v_cache_dd = ncp;
+ }
+ } else {
+ vp->v_cache_dd = NULL;
+ }
+
+ /*
+ * Insert the new namecache entry into the appropriate chain
+ * within the cache entries table.
+ */
+ LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
+ if (flag != NCF_ISDOTDOT) {
+ if (LIST_EMPTY(&dvp->v_cache_src)) {
+ hold = 1;
+ numcachehv++;
+ }
+ LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
+ }
+
+ /*
+ * If the entry is "negative", we place it into the
+ * "negative" cache queue; otherwise, we place it into the
+ * destination vnode's cache entries queue.
+ */
+ if (vp) {
+ TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
+ SDT_PROBE(vfs, namecache, enter, done, dvp, nc_get_name(ncp),
+ vp, 0, 0);
+ } else {
+ TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
+ SDT_PROBE(vfs, namecache, enter_negative, done, dvp,
+ nc_get_name(ncp), 0, 0, 0);
+ }
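+ /*
+ * If negative entries have grown past their allowed share of the
+ * cache (one in ncnegfactor entries), reclaim the least recently
+ * used negative entry from the head of the queue.
+ */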
+ if (numneg * ncnegfactor > numcache) {
+ ncp = TAILQ_FIRST(&ncneg);
+ zap = 1;
+ }
+ if (hold)
+ vhold(dvp);
+ if (zap)
+ cache_zap(ncp);
+ CACHE_WUNLOCK();
+}
+
+/*
+ * Name cache initialization, from vfs_init() when we are booting
+ */
+static void
+nchinit(void *dummy __unused)
+{
+
+ TAILQ_INIT(&ncneg);
+
+ cache_zone_small = uma_zcreate("S VFS Cache",
+ sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+ cache_zone_small_ts = uma_zcreate("STS VFS Cache",
+ sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+ cache_zone_large = uma_zcreate("L VFS Cache",
+ sizeof(struct namecache) + NAME_MAX + 1,
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+ cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
+ sizeof(struct namecache_ts) + NAME_MAX + 1,
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+
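+ /* Size the hash table in proportion to the maximum number of vnodes. */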
+ nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
+
+
+/*
+ * Invalidate all entries to a particular vnode.
+ */
+void
+cache_purge(vp)
+ struct vnode *vp;
+{
+
+ CTR1(KTR_VFS, "cache_purge(%p)", vp);
+ SDT_PROBE(vfs, namecache, purge, done, vp, 0, 0, 0, 0);
+ CACHE_WLOCK();
+ while (!LIST_EMPTY(&vp->v_cache_src))
+ cache_zap(LIST_FIRST(&vp->v_cache_src));
+ while (!TAILQ_EMPTY(&vp->v_cache_dst))
+ cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
+ if (vp->v_cache_dd != NULL) {
+ KASSERT(vp->v_cache_dd->nc_flag & NCF_ISDOTDOT,
+ ("lost dotdot link"));
+ cache_zap(vp->v_cache_dd);
+ }
+ KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
+ CACHE_WUNLOCK();
+}
+
+/*
+ * Invalidate all negative entries for a particular directory vnode.
+ */
+void
+cache_purge_negative(vp)
+ struct vnode *vp;
+{
+ struct namecache *cp, *ncp;
+
+ CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
+ SDT_PROBE(vfs, namecache, purge_negative, done, vp, 0, 0, 0, 0);
+ CACHE_WLOCK();
+ LIST_FOREACH_SAFE(cp, &vp->v_cache_src, nc_src, ncp) {
+ if (cp->nc_vp == NULL)
+ cache_zap(cp);
+ }
+ CACHE_WUNLOCK();
+}
+
+/*
+ * Flush all entries referencing a particular filesystem.
+ */
+void
+cache_purgevfs(mp)
+ struct mount *mp;
+{
+ struct nchashhead *ncpp;
+ struct namecache *ncp, *nnp;
+
+ /* Scan hash tables for applicable entries */
+ SDT_PROBE(vfs, namecache, purgevfs, done, mp, 0, 0, 0, 0);
+ CACHE_WLOCK();
+ for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
+ LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
+ if (ncp->nc_dvp->v_mount == mp)
+ cache_zap(ncp);
+ }
+ }
+ CACHE_WUNLOCK();
+}
+
+/*
+ * Perform canonical checks and cache lookup, and pass on to the
+ * filesystem through vop_cachedlookup only if needed.
+ */
+
+int
+vfs_cache_lookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ struct vnode *dvp;
+ int error;
+ struct vnode **vpp = ap->a_vpp;
+ struct componentname *cnp = ap->a_cnp;
+ struct ucred *cred = cnp->cn_cred;
+ int flags = cnp->cn_flags;
+ struct thread *td = cnp->cn_thread;
+
+ *vpp = NULL;
+ dvp = ap->a_dvp;
+
+ if (dvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
+ return (EROFS);
+
+ error = VOP_ACCESS(dvp, VEXEC, cred, td);
+ if (error)
+ return (error);
+
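+ /*
+ * cache_lookup() returns -1 for a positive hit, ENOENT for a
+ * negative hit and 0 for a miss; only a miss falls through to
+ * the filesystem's own lookup routine.
+ */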
+ error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
+ if (error == 0)
+ return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
+ if (error == -1)
+ return (0);
+ return (error);
+}
+
+
+#ifndef _SYS_SYSPROTO_H_
+struct __getcwd_args {
+ u_char *buf;
+ u_int buflen;
+};
+#endif
+
+/*
+ * XXX All of these sysctls would probably be more productive dead.
+ */
+static int disablecwd;
+SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
+ "Disable the getcwd syscall");
+
+/* Implementation of the getcwd syscall. */
+int
+sys___getcwd(td, uap)
+ struct thread *td;
+ struct __getcwd_args *uap;
+{
+
+ return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
+}
+
+int
+kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
+{
+ char *bp, *tmpbuf;
+ struct filedesc *fdp;
+ struct vnode *cdir, *rdir;
+ int error;
+
+ if (disablecwd)
+ return (ENODEV);
+ if (buflen < 2)
+ return (EINVAL);
+ if (buflen > MAXPATHLEN)
+ buflen = MAXPATHLEN;
+
+ tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
+ fdp = td->td_proc->p_fd;
+ FILEDESC_SLOCK(fdp);
+ cdir = fdp->fd_cdir;
+ VREF(cdir);
+ rdir = fdp->fd_rdir;
+ VREF(rdir);
+ FILEDESC_SUNLOCK(fdp);
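+ /* Resolve the path from the current directory up to the process root. */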
+ error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
+ vrele(rdir);
+ vrele(cdir);
+
+ if (!error) {
+ if (bufseg == UIO_SYSSPACE)
+ bcopy(bp, buf, strlen(bp) + 1);
+ else
+ error = copyout(bp, buf, strlen(bp) + 1);
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_NAMEI))
+ ktrnamei(bp);
+#endif
+ }
+ free(tmpbuf, M_TEMP);
+ return (error);
+}
+
+/*
+ * Thus begins the fullpath magic.
+ */
+
+#undef STATNODE
+#define STATNODE(name, descr) \
+ static u_int name; \
+ SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr)
+
+static int disablefullpath;
+SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
+ "Disable the vn_fullpath function");
+
+/* These count for kern___getcwd(), too. */
+STATNODE(numfullpathcalls, "Number of fullpath search calls");
+STATNODE(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
+STATNODE(numfullpathfail2,
+ "Number of fullpath search errors (VOP_VPTOCNP failures)");
+STATNODE(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
+STATNODE(numfullpathfound, "Number of successful fullpath calls");
+
+/*
+ * Retrieve the full filesystem path that corresponds to a vnode from the name
+ * cache (if available).
+ */
+int
+vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
+{
+ char *buf;
+ struct filedesc *fdp;
+ struct vnode *rdir;
+ int error;
+
+ if (disablefullpath)
+ return (ENODEV);
+ if (vn == NULL)
+ return (EINVAL);
+
+ buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ fdp = td->td_proc->p_fd;
+ FILEDESC_SLOCK(fdp);
+ rdir = fdp->fd_rdir;
+ VREF(rdir);
+ FILEDESC_SUNLOCK(fdp);
+ error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
+ vrele(rdir);
+
+ if (!error)
+ *freebuf = buf;
+ else
+ free(buf, M_TEMP);
+ return (error);
+}
+
+/*
+ * This function is similar to vn_fullpath, but it attempts to look up the
+ * pathname relative to the global root mount point. This is required for the
+ * auditing sub-system, as audited pathnames must be absolute, relative to the
+ * global root mount point.
+ */
+int
+vn_fullpath_global(struct thread *td, struct vnode *vn,
+ char **retbuf, char **freebuf)
+{
+ char *buf;
+ int error;
+
+ if (disablefullpath)
+ return (ENODEV);
+ if (vn == NULL)
+ return (EINVAL);
+ buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
+ if (!error)
+ *freebuf = buf;
+ else
+ free(buf, M_TEMP);
+ return (error);
+}
+
+int
+vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
+{
+ int error;
+
+ CACHE_RLOCK();
+ error = vn_vptocnp_locked(vp, cred, buf, buflen);
+ if (error == 0)
+ CACHE_RUNLOCK();
+ return (error);
+}
+
+static int
+vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
+ u_int *buflen)
+{
+ struct vnode *dvp;
+ struct namecache *ncp;
+ int error;
+
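+ /*
+ * Look for a regular (non-"..") entry naming this vnode; it
+ * provides both the vnode's name and its parent directory.
+ */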
+ TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
+ if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
+ break;
+ }
+ if (ncp != NULL) {
+ if (*buflen < ncp->nc_nlen) {
+ CACHE_RUNLOCK();
+ vrele(*vp);
+ numfullpathfail4++;
+ error = ENOMEM;
+ SDT_PROBE(vfs, namecache, fullpath, return, error,
+ vp, NULL, 0, 0);
+ return (error);
+ }
+ *buflen -= ncp->nc_nlen;
+ memcpy(buf + *buflen, nc_get_name(ncp), ncp->nc_nlen);
+ SDT_PROBE(vfs, namecache, fullpath, hit, ncp->nc_dvp,
+ nc_get_name(ncp), vp, 0, 0);
+ dvp = *vp;
+ *vp = ncp->nc_dvp;
+ vref(*vp);
+ CACHE_RUNLOCK();
+ vrele(dvp);
+ CACHE_RLOCK();
+ return (0);
+ }
+ SDT_PROBE(vfs, namecache, fullpath, miss, vp, 0, 0, 0, 0);
+
+ CACHE_RUNLOCK();
+ vn_lock(*vp, LK_SHARED | LK_RETRY);
+ error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
+ vput(*vp);
+ if (error) {
+ numfullpathfail2++;
+ SDT_PROBE(vfs, namecache, fullpath, return, error, vp,
+ NULL, 0, 0);
+ return (error);
+ }
+
+ *vp = dvp;
+ CACHE_RLOCK();
+ if (dvp->v_iflag & VI_DOOMED) {
+ /* forced unmount */
+ CACHE_RUNLOCK();
+ vrele(dvp);
+ error = ENOENT;
+ SDT_PROBE(vfs, namecache, fullpath, return, error, vp,
+ NULL, 0, 0);
+ return (error);
+ }
+ /*
+ * *vp has its use count incremented still.
+ */
+
+ return (0);
+}
+
+/*
+ * The magic behind kern___getcwd() and vn_fullpath().
+ */
+static int
+vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
+ char *buf, char **retbuf, u_int buflen)
+{
+ int error, slash_prefixed;
+#ifdef KDTRACE_HOOKS
+ struct vnode *startvp = vp;
+#endif
+ struct vnode *vp1;
+
+ buflen--;
+ buf[buflen] = '\0';
+ error = 0;
+ slash_prefixed = 0;
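+ /*
+ * The path is assembled backwards, one component at a time,
+ * from the end of the buffer towards its beginning.
+ */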
+
+ SDT_PROBE(vfs, namecache, fullpath, entry, vp, 0, 0, 0, 0);
+ numfullpathcalls++;
+ vref(vp);
+ CACHE_RLOCK();
+ if (vp->v_type != VDIR) {
+ error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
+ if (error)
+ return (error);
+ if (buflen == 0) {
+ CACHE_RUNLOCK();
+ vrele(vp);
+ return (ENOMEM);
+ }
+ buf[--buflen] = '/';
+ slash_prefixed = 1;
+ }
+ while (vp != rdir && vp != rootvnode) {
+ if (vp->v_vflag & VV_ROOT) {
+ if (vp->v_iflag & VI_DOOMED) { /* forced unmount */
+ CACHE_RUNLOCK();
+ vrele(vp);
+ error = ENOENT;
+ SDT_PROBE(vfs, namecache, fullpath, return,
+ error, vp, NULL, 0, 0);
+ break;
+ }
+ vp1 = vp->v_mount->mnt_vnodecovered;
+ vref(vp1);
+ CACHE_RUNLOCK();
+ vrele(vp);
+ vp = vp1;
+ CACHE_RLOCK();
+ continue;
+ }
+ if (vp->v_type != VDIR) {
+ CACHE_RUNLOCK();
+ vrele(vp);
+ numfullpathfail1++;
+ error = ENOTDIR;
+ SDT_PROBE(vfs, namecache, fullpath, return,
+ error, vp, NULL, 0, 0);
+ break;
+ }
+ error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
+ if (error)
+ break;
+ if (buflen == 0) {
+ CACHE_RUNLOCK();
+ vrele(vp);
+ error = ENOMEM;
+ SDT_PROBE(vfs, namecache, fullpath, return, error,
+ startvp, NULL, 0, 0);
+ break;
+ }
+ buf[--buflen] = '/';
+ slash_prefixed = 1;
+ }
+ if (error)
+ return (error);
+ if (!slash_prefixed) {
+ if (buflen == 0) {
+ CACHE_RUNLOCK();
+ vrele(vp);
+ numfullpathfail4++;
+ SDT_PROBE(vfs, namecache, fullpath, return, ENOMEM,
+ startvp, NULL, 0, 0);
+ return (ENOMEM);
+ }
+ buf[--buflen] = '/';
+ }
+ numfullpathfound++;
+ CACHE_RUNLOCK();
+ vrele(vp);
+
+ SDT_PROBE(vfs, namecache, fullpath, return, 0, startvp, buf + buflen,
+ 0, 0);
+ *retbuf = buf + buflen;
+ return (0);
+}
+
+struct vnode *
+vn_dir_dd_ino(struct vnode *vp)
+{
+ struct namecache *ncp;
+ struct vnode *ddvp;
+
+ ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
+ CACHE_RLOCK();
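+ /*
+ * Any regular (non-"..") entry naming this directory identifies
+ * its parent via nc_dvp.
+ */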
+ TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
+ if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
+ continue;
+ ddvp = ncp->nc_dvp;
+ VI_LOCK(ddvp);
+ CACHE_RUNLOCK();
+ if (vget(ddvp, LK_INTERLOCK | LK_SHARED | LK_NOWAIT, curthread))
+ return (NULL);
+ return (ddvp);
+ }
+ CACHE_RUNLOCK();
+ return (NULL);
+}
+
+int
+vn_commname(struct vnode *vp, char *buf, u_int buflen)
+{
+ struct namecache *ncp;
+ int l;
+
+ CACHE_RLOCK();
+ TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
+ if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
+ break;
+ if (ncp == NULL) {
+ CACHE_RUNLOCK();
+ return (ENOENT);
+ }
+ l = min(ncp->nc_nlen, buflen - 1);
+ memcpy(buf, nc_get_name(ncp), l);
+ CACHE_RUNLOCK();
+ buf[l] = '\0';
+ return (0);
+}
+
+/* ABI compat shims for old kernel modules. */
+#undef cache_enter
+
+void cache_enter(struct vnode *dvp, struct vnode *vp,
+ struct componentname *cnp);
+
+void
+cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
+{
+
+ cache_enter_time(dvp, vp, cnp, NULL, NULL);
+}
+
+/*
+ * This function updates the path string to the vnode's full global path
+ * and checks the size of the new path string against the pathlen argument.
+ *
+ * Requires a locked, referenced vnode and GIANT lock held.
+ * The vnode is re-locked on success or ENODEV; otherwise it is left unlocked.
+ *
+ * If the sysctl debug.disablefullpath is set, ENODEV is returned, the
+ * vnode is left locked and the path remains untouched.
+ *
+ * If vp is a directory, the call to vn_fullpath_global() always succeeds
+ * because it falls back to the ".." lookup if the namecache lookup fails.
+ */
+int
+vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
+ u_int pathlen)
+{
+ struct nameidata nd;
+ struct vnode *vp1;
+ char *rpath, *fbuf;
+ int error;
+
+ ASSERT_VOP_ELOCKED(vp, __func__);
+
+ /* Return ENODEV if sysctl debug.disablefullpath==1 */
+ if (disablefullpath)
+ return (ENODEV);
+
+ /* Construct global filesystem path from vp. */
+ VOP_UNLOCK(vp, 0);
+ error = vn_fullpath_global(td, vp, &rpath, &fbuf);
+
+ if (error != 0) {
+ vrele(vp);
+ return (error);
+ }
+
+ if (strlen(rpath) >= pathlen) {
+ vrele(vp);
+ error = ENAMETOOLONG;
+ goto out;
+ }
+
+ /*
+ * Re-lookup the vnode by path to detect a possible rename.
+ * As a side effect, the vnode is relocked.
+ * If vnode was renamed, return ENOENT.
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
+ UIO_SYSSPACE, path, td);
+ error = namei(&nd);
+ if (error != 0) {
+ vrele(vp);
+ goto out;
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp1 = nd.ni_vp;
+ vrele(vp);
+ if (vp1 == vp)
+ strcpy(path, rpath);
+ else {
+ vput(vp1);
+ error = ENOENT;
+ }
+
+out:
+ free(fbuf, M_TEMP);
+ return (error);
+}
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
new file mode 100644
index 0000000..9601082
--- /dev/null
+++ b/sys/kern/vfs_cluster.c
@@ -0,0 +1,1058 @@
+/*-
+ * Copyright (c) 1993
+ * The Regents of the University of California. All rights reserved.
+ * Modifications/enhancements:
+ * Copyright (c) 1995 John S. Dyson. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_debug_cluster.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/vmmeter.h>
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <sys/sysctl.h>
+
+#if defined(CLUSTERDEBUG)
+static int rcluster = 0;
+SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0,
+ "Debug VFS clustering code");
+#endif
+
+static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
+
+static struct cluster_save *cluster_collectbufs(struct vnode *vp,
+ struct buf *last_bp, int gbflags);
+static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize,
+ daddr_t lbn, daddr_t blkno, long size, int run, int gbflags,
+ struct buf *fbp);
+static void cluster_callback(struct buf *);
+
+static int write_behind = 1;
+SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
+ "Cluster write-behind; 0: disable, 1: enable, 2: backed off");
+
+static int read_max = 64;
+SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0,
+ "Cluster read-ahead max block count");
+
+static int read_min = 1;
+SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0,
+ "Cluster read min block count");
+
+/* Page expended to mark partially backed buffers */
+extern vm_page_t bogus_page;
+
+/*
+ * Read data to a buf, including read-ahead if we find this to be beneficial.
+ * cluster_read replaces bread.
+ */
+int
+cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
+ struct ucred *cred, long totread, int seqcount, int gbflags,
+ struct buf **bpp)
+{
+ struct buf *bp, *rbp, *reqbp;
+ struct bufobj *bo;
+ daddr_t blkno, origblkno;
+ int maxra, racluster;
+ int error, ncontig;
+ int i;
+
+ error = 0;
+ bo = &vp->v_bufobj;
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
+
+ /*
+ * Try to limit the amount of read-ahead by a few
+ * ad-hoc parameters. This needs work!!!
+ */
+ racluster = vp->v_mount->mnt_iosize_max / size;
+ maxra = seqcount;
+ maxra = min(read_max, maxra);
+ maxra = min(nbuf/8, maxra);
+ if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize)
+ maxra = (filesize / size) - lblkno;
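+ /*
+ * maxra is now bounded by the sequential heuristic, the
+ * vfs.read_max sysctl, a fraction of the buffer cache and the
+ * end of the file.
+ */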
+
+ /*
+ * get the requested block
+ */
+ *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags);
+ origblkno = lblkno;
+
+ /*
+ * if it is in the cache, then check to see if the reads have been
+ * sequential. If they have, then try some read-ahead, otherwise
+ * back-off on prospective read-aheads.
+ */
+ if (bp->b_flags & B_CACHE) {
+ if (!seqcount) {
+ return (0);
+ } else if ((bp->b_flags & B_RAM) == 0) {
+ return (0);
+ } else {
+ bp->b_flags &= ~B_RAM;
+ BO_RLOCK(bo);
+ for (i = 1; i < maxra; i++) {
+ /*
+ * Stop if the buffer does not exist or it
+ * is invalid (about to go away?)
+ */
+ rbp = gbincore(&vp->v_bufobj, lblkno+i);
+ if (rbp == NULL || (rbp->b_flags & B_INVAL))
+ break;
+
+ /*
+ * Set another read-ahead mark so we know
+ * to check again (if we can lock the
+ * buffer without waiting).
+ */
+ if ((((i % racluster) == (racluster - 1)) ||
+ (i == (maxra - 1)))
+ && (0 == BUF_LOCK(rbp,
+ LK_EXCLUSIVE | LK_NOWAIT, NULL))) {
+ rbp->b_flags |= B_RAM;
+ BUF_UNLOCK(rbp);
+ }
+ }
+ BO_RUNLOCK(bo);
+ if (i >= maxra) {
+ return (0);
+ }
+ lblkno += i;
+ }
+ reqbp = bp = NULL;
+ /*
+ * If it isn't in the cache, then get a chunk from
+ * disk if sequential, otherwise just get the block.
+ */
+ } else {
+ off_t firstread = bp->b_offset;
+ int nblks;
+ long minread;
+
+ KASSERT(bp->b_offset != NOOFFSET,
+ ("cluster_read: no buffer offset"));
+
+ ncontig = 0;
+
+ /*
+ * Adjust totread if needed
+ */
+ minread = read_min * size;
+ if (minread > totread)
+ totread = minread;
+
+ /*
+ * Compute the total number of blocks that we should read
+ * synchronously.
+ */
+ if (firstread + totread > filesize)
+ totread = filesize - firstread;
+ nblks = howmany(totread, size);
+ if (nblks > racluster)
+ nblks = racluster;
+
+ /*
+ * Now compute the number of contiguous blocks.
+ */
+ if (nblks > 1) {
+ error = VOP_BMAP(vp, lblkno, NULL,
+ &blkno, &ncontig, NULL);
+ /*
+ * If this failed to map just do the original block.
+ */
+ if (error || blkno == -1)
+ ncontig = 0;
+ }
+
+ /*
+ * If we have contiguous data available do a cluster
+ * otherwise just read the requested block.
+ */
+ if (ncontig) {
+ /* Account for our first block. */
+ ncontig = min(ncontig + 1, nblks);
+ if (ncontig < nblks)
+ nblks = ncontig;
+ bp = cluster_rbuild(vp, filesize, lblkno,
+ blkno, size, nblks, gbflags, bp);
+ lblkno += (bp->b_bufsize / size);
+ } else {
+ bp->b_flags |= B_RAM;
+ bp->b_iocmd = BIO_READ;
+ lblkno += 1;
+ }
+ }
+
+ /*
+ * handle the synchronous read so that it is available ASAP.
+ */
+ if (bp) {
+ if ((bp->b_flags & B_CLUSTER) == 0) {
+ vfs_busy_pages(bp, 0);
+ }
+ bp->b_flags &= ~B_INVAL;
+ bp->b_ioflags &= ~BIO_ERROR;
+ if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
+ BUF_KERNPROC(bp);
+ bp->b_iooffset = dbtob(bp->b_blkno);
+ bstrategy(bp);
+ curthread->td_ru.ru_inblock++;
+ }
+
+ /*
+ * If we have been doing sequential I/O, then do some read-ahead.
+ */
+ while (lblkno < (origblkno + maxra)) {
+ error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL);
+ if (error)
+ break;
+
+ if (blkno == -1)
+ break;
+
+ /*
+ * We could throttle ncontig here by maxra but we might as
+ * well read the data if it is contiguous. We're throttled
+ * by racluster anyway.
+ */
+ if (ncontig) {
+ ncontig = min(ncontig + 1, racluster);
+ rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
+ size, ncontig, gbflags, NULL);
+ lblkno += (rbp->b_bufsize / size);
+ if (rbp->b_flags & B_DELWRI) {
+ bqrelse(rbp);
+ continue;
+ }
+ } else {
+ rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
+ lblkno += 1;
+ if (rbp->b_flags & B_DELWRI) {
+ bqrelse(rbp);
+ continue;
+ }
+ rbp->b_flags |= B_ASYNC | B_RAM;
+ rbp->b_iocmd = BIO_READ;
+ rbp->b_blkno = blkno;
+ }
+ if (rbp->b_flags & B_CACHE) {
+ rbp->b_flags &= ~B_ASYNC;
+ bqrelse(rbp);
+ continue;
+ }
+ if ((rbp->b_flags & B_CLUSTER) == 0) {
+ vfs_busy_pages(rbp, 0);
+ }
+ rbp->b_flags &= ~B_INVAL;
+ rbp->b_ioflags &= ~BIO_ERROR;
+ if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
+ BUF_KERNPROC(rbp);
+ rbp->b_iooffset = dbtob(rbp->b_blkno);
+ bstrategy(rbp);
+ curthread->td_ru.ru_inblock++;
+ }
+
+ if (reqbp)
+ return (bufwait(reqbp));
+ else
+ return (error);
+}
+
+/*
+ * If blocks are contiguous on disk, use this to provide clustered
+ * read ahead. We will read as many blocks as possible sequentially
+ * and then parcel them up into logical blocks in the buffer hash table.
+ */
+static struct buf *
+cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
+ daddr_t blkno, long size, int run, int gbflags, struct buf *fbp)
+{
+ struct bufobj *bo;
+ struct buf *bp, *tbp;
+ daddr_t bn;
+ off_t off;
+ long tinc, tsize;
+ int i, inc, j, k, toff;
+
+ KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
+ ("cluster_rbuild: size %ld != f_iosize %jd\n",
+ size, (intmax_t)vp->v_mount->mnt_stat.f_iosize));
+
+ /*
+ * avoid a division
+ */
+ while ((u_quad_t) size * (lbn + run) > filesize) {
+ --run;
+ }
+
+ if (fbp) {
+ tbp = fbp;
+ tbp->b_iocmd = BIO_READ;
+ } else {
+ tbp = getblk(vp, lbn, size, 0, 0, gbflags);
+ if (tbp->b_flags & B_CACHE)
+ return (tbp);
+ tbp->b_flags |= B_ASYNC | B_RAM;
+ tbp->b_iocmd = BIO_READ;
+ }
+ tbp->b_blkno = blkno;
+ if ((tbp->b_flags & B_MALLOC) ||
+ ((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
+ return (tbp);
+
+ bp = trypbuf(&cluster_pbuf_freecnt);
+ if (bp == NULL)
+ return (tbp);
+
+ /*
+ * We are synthesizing a buffer out of vm_page_t's, but
+ * if the block size is not page aligned then the starting
+ * address may not be either. Inherit the b_data offset
+ * from the original buffer.
+ */
+ bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
+ if ((gbflags & GB_UNMAPPED) != 0) {
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_data = unmapped_buf;
+ } else {
+ bp->b_data = (char *)((vm_offset_t)bp->b_data |
+ ((vm_offset_t)tbp->b_data & PAGE_MASK));
+ }
+ bp->b_iocmd = BIO_READ;
+ bp->b_iodone = cluster_callback;
+ bp->b_blkno = blkno;
+ bp->b_lblkno = lbn;
+ bp->b_offset = tbp->b_offset;
+ KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));
+ pbgetvp(vp, bp);
+
+ TAILQ_INIT(&bp->b_cluster.cluster_head);
+
+ bp->b_bcount = 0;
+ bp->b_bufsize = 0;
+ bp->b_npages = 0;
+
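+ /* Disk address increment, in DEV_BSIZE units, per logical block. */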
+ inc = btodb(size);
+ bo = &vp->v_bufobj;
+ for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
+ if (i == 0) {
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ vfs_drain_busy_pages(tbp);
+ vm_object_pip_add(tbp->b_bufobj->bo_object,
+ tbp->b_npages);
+ for (k = 0; k < tbp->b_npages; k++)
+ vm_page_sbusy(tbp->b_pages[k]);
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+ } else {
+ if ((bp->b_npages * PAGE_SIZE) +
+ round_page(size) > vp->v_mount->mnt_iosize_max) {
+ break;
+ }
+
+ tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT |
+ (gbflags & GB_UNMAPPED));
+
+ /* Don't wait around for locked bufs. */
+ if (tbp == NULL)
+ break;
+
+ /*
+ * Stop scanning if the buffer is fully valid
+ * (marked B_CACHE), or locked (may be doing a
+ * background write), or if the buffer is not
+ * VMIO backed. The clustering code can only deal
+ * with VMIO-backed buffers. The bo lock is not
+ * required for the BKGRDINPROG check since it
+ * can not be set without the buf lock.
+ */
+ if ((tbp->b_vflags & BV_BKGRDINPROG) ||
+ (tbp->b_flags & B_CACHE) ||
+ (tbp->b_flags & B_VMIO) == 0) {
+ bqrelse(tbp);
+ break;
+ }
+
+ /*
+ * The buffer must be completely invalid in order to
+ * take part in the cluster. If it is partially valid
+ * then we stop.
+ */
+ off = tbp->b_offset;
+ tsize = size;
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ for (j = 0; tsize > 0; j++) {
+ toff = off & PAGE_MASK;
+ tinc = tsize;
+ if (toff + tinc > PAGE_SIZE)
+ tinc = PAGE_SIZE - toff;
+ VM_OBJECT_ASSERT_WLOCKED(tbp->b_pages[j]->object);
+ if ((tbp->b_pages[j]->valid &
+ vm_page_bits(toff, tinc)) != 0)
+ break;
+ if (vm_page_xbusied(tbp->b_pages[j]))
+ break;
+ vm_object_pip_add(tbp->b_bufobj->bo_object, 1);
+ vm_page_sbusy(tbp->b_pages[j]);
+ off += tinc;
+ tsize -= tinc;
+ }
+ if (tsize > 0) {
+clean_sbusy:
+ vm_object_pip_add(tbp->b_bufobj->bo_object, -j);
+ for (k = 0; k < j; k++)
+ vm_page_sunbusy(tbp->b_pages[k]);
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+ bqrelse(tbp);
+ break;
+ }
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+
+ /*
+ * Set a read-ahead mark as appropriate
+ */
+ if ((fbp && (i == 1)) || (i == (run - 1)))
+ tbp->b_flags |= B_RAM;
+
+ /*
+ * Set the buffer up for an async read (XXX should
+ * we do this only if we do not wind up brelse()ing?).
+ * Set the block number if it isn't set, otherwise
+ * if it is make sure it matches the block number we
+ * expect.
+ */
+ tbp->b_flags |= B_ASYNC;
+ tbp->b_iocmd = BIO_READ;
+ if (tbp->b_blkno == tbp->b_lblkno) {
+ tbp->b_blkno = bn;
+ } else if (tbp->b_blkno != bn) {
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ goto clean_sbusy;
+ }
+ }
+ /*
+ * XXX fbp from caller may not be B_ASYNC, but we are going
+ * to biodone() it in cluster_callback() anyway
+ */
+ BUF_KERNPROC(tbp);
+ TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
+ tbp, b_cluster.cluster_entry);
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ vm_page_t m;
+ m = tbp->b_pages[j];
+ if ((bp->b_npages == 0) ||
+ (bp->b_pages[bp->b_npages-1] != m)) {
+ bp->b_pages[bp->b_npages] = m;
+ bp->b_npages++;
+ }
+ if (m->valid == VM_PAGE_BITS_ALL)
+ tbp->b_pages[j] = bogus_page;
+ }
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+ /*
+ * Don't inherit tbp->b_bufsize as it may be larger due to
+ * a non-page-aligned size. Instead just aggregate using
+ * 'size'.
+ */
+ if (tbp->b_bcount != size)
+ printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size);
+ if (tbp->b_bufsize != size)
+ printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size);
+ bp->b_bcount += size;
+ bp->b_bufsize += size;
+ }
+
+ /*
+ * Fully valid pages in the cluster are already good and do not need
+ * to be re-read from disk. Replace the page with bogus_page
+ */
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ for (j = 0; j < bp->b_npages; j++) {
+ VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[j]->object);
+ if (bp->b_pages[j]->valid == VM_PAGE_BITS_ALL)
+ bp->b_pages[j] = bogus_page;
+ }
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+ if (bp->b_bufsize > bp->b_kvasize)
+ panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
+ bp->b_bufsize, bp->b_kvasize);
+ bp->b_kvasize = bp->b_bufsize;
+
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *)bp->b_pages, bp->b_npages);
+ }
+ return (bp);
+}
+
+/*
+ * Cleanup after a clustered read or write.
+ * This is complicated by the fact that any of the buffers might have
+ * extra memory (if there were no empty buffer headers at allocbuf time)
+ * that we will need to shift around.
+ */
+static void
+cluster_callback(bp)
+ struct buf *bp;
+{
+ struct buf *nbp, *tbp;
+ int error = 0;
+
+ /*
+ * Must propagate errors to all the components.
+ */
+ if (bp->b_ioflags & BIO_ERROR)
+ error = bp->b_error;
+
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
+ bp->b_npages);
+ }
+ /*
+ * Move memory from the large cluster buffer into the component
+ * buffers and mark IO as done on these.
+ */
+ for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
+ tbp; tbp = nbp) {
+ nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
+ if (error) {
+ tbp->b_ioflags |= BIO_ERROR;
+ tbp->b_error = error;
+ } else {
+ tbp->b_dirtyoff = tbp->b_dirtyend = 0;
+ tbp->b_flags &= ~B_INVAL;
+ tbp->b_ioflags &= ~BIO_ERROR;
+ /*
+ * XXX the bdwrite()/bqrelse() issued during
+ * cluster building clears B_RELBUF (see bqrelse()
+ * comment). If direct I/O was specified, we have
+ * to restore it here to allow the buffer and VM
+ * to be freed.
+ */
+ if (tbp->b_flags & B_DIRECT)
+ tbp->b_flags |= B_RELBUF;
+ }
+ bufdone(tbp);
+ }
+ pbrelvp(bp);
+ relpbuf(bp, &cluster_pbuf_freecnt);
+}
+
+/*
+ * cluster_wbuild_wb:
+ *
+ * Implement modified write build for cluster.
+ *
+ * write_behind = 0 write behind disabled
+ * write_behind = 1 write behind normal (default)
+ * write_behind = 2 write behind backed-off
+ */
+
+static __inline int
+cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len,
+ int gbflags)
+{
+ int r = 0;
+
+ switch (write_behind) {
+ case 2:
+ if (start_lbn < len)
+ break;
+ start_lbn -= len;
+ /* FALLTHROUGH */
+ case 1:
+ r = cluster_wbuild(vp, size, start_lbn, len, gbflags);
+ /* FALLTHROUGH */
+ default:
+ /* FALLTHROUGH */
+ break;
+ }
+ return (r);
+}
+
+/*
+ * Do clustered write for FFS.
+ *
+ * Four cases:
+ * 1. Write is not sequential (write asynchronously)
+ * Write is sequential:
+ * 2. beginning of cluster - begin cluster
+ * 3. middle of a cluster - add to cluster
+ * 4. end of a cluster - asynchronously write cluster
+ */
+void
+cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
+ int gbflags)
+{
+ daddr_t lbn;
+ int maxclen, cursize;
+ int lblocksize;
+ int async;
+
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
+
+ if (vp->v_type == VREG) {
+ async = DOINGASYNC(vp);
+ lblocksize = vp->v_mount->mnt_stat.f_iosize;
+ } else {
+ async = 0;
+ lblocksize = bp->b_bufsize;
+ }
+ lbn = bp->b_lblkno;
+ KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset"));
+
+ /* Initialize vnode to beginning of file. */
+ if (lbn == 0)
+ vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
+
+ if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
+ (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
+ maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1;
+ if (vp->v_clen != 0) {
+ /*
+ * Next block is not sequential.
+ *
+ * If we are not writing at end of file, the process
+ * seeked to another point in the file since its last
+ * write, or we have reached our maximum cluster size,
+ * then push the previous cluster. Otherwise try
+ * reallocating to make it sequential.
+ *
+ * Change to algorithm: only push previous cluster if
+ * it was sequential from the point of view of the
+ * seqcount heuristic, otherwise leave the buffer
+ * intact so we can potentially optimize the I/O
+ * later on in the buf_daemon or update daemon
+ * flush.
+ */
+ cursize = vp->v_lastw - vp->v_cstart + 1;
+ if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
+ lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
+ if (!async && seqcount > 0) {
+ cluster_wbuild_wb(vp, lblocksize,
+ vp->v_cstart, cursize, gbflags);
+ }
+ } else {
+ struct buf **bpp, **endbp;
+ struct cluster_save *buflist;
+
+ buflist = cluster_collectbufs(vp, bp, gbflags);
+ endbp = &buflist->bs_children
+ [buflist->bs_nchildren - 1];
+ if (VOP_REALLOCBLKS(vp, buflist)) {
+ /*
+ * Failed, push the previous cluster
+ * if *really* writing sequentially
+ * in the logical file (seqcount > 1),
+ * otherwise delay it in the hopes that
+ * the low level disk driver can
+ * optimize the write ordering.
+ */
+ for (bpp = buflist->bs_children;
+ bpp < endbp; bpp++)
+ brelse(*bpp);
+ free(buflist, M_SEGMENT);
+ if (seqcount > 1) {
+ cluster_wbuild_wb(vp,
+ lblocksize, vp->v_cstart,
+ cursize, gbflags);
+ }
+ } else {
+ /*
+ * Succeeded, keep building cluster.
+ */
+ for (bpp = buflist->bs_children;
+ bpp <= endbp; bpp++)
+ bdwrite(*bpp);
+ free(buflist, M_SEGMENT);
+ vp->v_lastw = lbn;
+ vp->v_lasta = bp->b_blkno;
+ return;
+ }
+ }
+ }
+ /*
+ * Consider beginning a cluster. If at end of file, make
+ * cluster as large as possible, otherwise find size of
+ * existing cluster.
+ */
+ if ((vp->v_type == VREG) &&
+ ((u_quad_t) bp->b_offset + lblocksize) != filesize &&
+ (bp->b_blkno == bp->b_lblkno) &&
+ (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
+ bp->b_blkno == -1)) {
+ bawrite(bp);
+ vp->v_clen = 0;
+ vp->v_lasta = bp->b_blkno;
+ vp->v_cstart = lbn + 1;
+ vp->v_lastw = lbn;
+ return;
+ }
+ vp->v_clen = maxclen;
+ if (!async && maxclen == 0) { /* I/O not contiguous */
+ vp->v_cstart = lbn + 1;
+ bawrite(bp);
+ } else { /* Wait for rest of cluster */
+ vp->v_cstart = lbn;
+ bdwrite(bp);
+ }
+ } else if (lbn == vp->v_cstart + vp->v_clen) {
+ /*
+ * At end of cluster, write it out if seqcount tells us we
+ * are operating sequentially, otherwise let the buf or
+ * update daemon handle it.
+ */
+ bdwrite(bp);
+ if (seqcount > 1) {
+ cluster_wbuild_wb(vp, lblocksize, vp->v_cstart,
+ vp->v_clen + 1, gbflags);
+ }
+ vp->v_clen = 0;
+ vp->v_cstart = lbn + 1;
+ } else if (vm_page_count_severe()) {
+ /*
+ * We are low on memory, get it going NOW
+ */
+ bawrite(bp);
+ } else {
+ /*
+ * In the middle of a cluster, so just delay the I/O for now.
+ */
+ bdwrite(bp);
+ }
+ vp->v_lastw = lbn;
+ vp->v_lasta = bp->b_blkno;
+}
+
+
+/*
+ * This is an awful lot like cluster_rbuild...wish they could be combined.
+ * The last lbn argument is the current block on which I/O is being
+ * performed. Check to see that it doesn't fall in the middle of
+ * the current block (if last_bp == NULL).
+ */
+int
+cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
+ int gbflags)
+{
+ struct buf *bp, *tbp;
+ struct bufobj *bo;
+ int i, j;
+ int totalwritten = 0;
+ int dbsize = btodb(size);
+
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
+
+ bo = &vp->v_bufobj;
+ while (len > 0) {
+ /*
+ * If the buffer is not delayed-write (i.e. dirty), or it
+ * is delayed-write but either locked or inval, it cannot
+ * partake in the clustered write.
+ */
+ BO_LOCK(bo);
+ if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL ||
+ (tbp->b_vflags & BV_BKGRDINPROG)) {
+ BO_UNLOCK(bo);
+ ++start_lbn;
+ --len;
+ continue;
+ }
+ if (BUF_LOCK(tbp,
+ LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) {
+ ++start_lbn;
+ --len;
+ continue;
+ }
+ if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) {
+ BUF_UNLOCK(tbp);
+ ++start_lbn;
+ --len;
+ continue;
+ }
+ if (tbp->b_pin_count > 0) {
+ BUF_UNLOCK(tbp);
+ ++start_lbn;
+ --len;
+ continue;
+ }
+ bremfree(tbp);
+ tbp->b_flags &= ~B_DONE;
+
+ /*
+ * Extra memory in the buffer, punt on this buffer.
+ * XXX we could handle this in most cases, but we would
+ * have to push the extra memory down to after our max
+ * possible cluster size and then potentially pull it back
+ * up if the cluster was terminated prematurely--too much
+ * hassle.
+ */
+ if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) !=
+ (B_CLUSTEROK | B_VMIO)) ||
+ (tbp->b_bcount != tbp->b_bufsize) ||
+ (tbp->b_bcount != size) ||
+ (len == 1) ||
+ ((bp = (vp->v_vflag & VV_MD) != 0 ?
+ trypbuf(&cluster_pbuf_freecnt) :
+ getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
+ totalwritten += tbp->b_bufsize;
+ bawrite(tbp);
+ ++start_lbn;
+ --len;
+ continue;
+ }
+
+ /*
+ * We got a pbuf to make the cluster in,
+ * so initialise it.
+ */
+ TAILQ_INIT(&bp->b_cluster.cluster_head);
+ bp->b_bcount = 0;
+ bp->b_bufsize = 0;
+ bp->b_npages = 0;
+ if (tbp->b_wcred != NOCRED)
+ bp->b_wcred = crhold(tbp->b_wcred);
+
+ bp->b_blkno = tbp->b_blkno;
+ bp->b_lblkno = tbp->b_lblkno;
+ bp->b_offset = tbp->b_offset;
+
+ /*
+ * We are synthesizing a buffer out of vm_page_t's, but
+ * if the block size is not page aligned then the starting
+ * address may not be either. Inherit the b_data offset
+ * from the original buffer.
+ */
+ if ((gbflags & GB_UNMAPPED) == 0 ||
+ (tbp->b_flags & B_VMIO) == 0) {
+ bp->b_data = (char *)((vm_offset_t)bp->b_data |
+ ((vm_offset_t)tbp->b_data & PAGE_MASK));
+ } else {
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_data = unmapped_buf;
+ }
+ bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO |
+ B_NEEDCOMMIT));
+ bp->b_iodone = cluster_callback;
+ pbgetvp(vp, bp);
+ /*
+ * From this location in the file, scan forward to see
+ * if there are buffers with adjacent data that need to
+ * be written as well.
+ */
+ for (i = 0; i < len; ++i, ++start_lbn) {
+ if (i != 0) { /* If not the first buffer */
+ /*
+ * If the adjacent data is not even in core it
+ * can't need to be written.
+ */
+ BO_LOCK(bo);
+ if ((tbp = gbincore(bo, start_lbn)) == NULL ||
+ (tbp->b_vflags & BV_BKGRDINPROG)) {
+ BO_UNLOCK(bo);
+ break;
+ }
+
+ /*
+ * If it IS in core, but has different
+ * characteristics, or is locked (which
+ * means it could be undergoing a background
+ * I/O or be in a weird state), then don't
+ * cluster with it.
+ */
+ if (BUF_LOCK(tbp,
+ LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
+ BO_LOCKPTR(bo)))
+ break;
+
+ if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
+ B_INVAL | B_DELWRI | B_NEEDCOMMIT))
+ != (B_DELWRI | B_CLUSTEROK |
+ (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
+ tbp->b_wcred != bp->b_wcred) {
+ BUF_UNLOCK(tbp);
+ break;
+ }
+
+ /*
+ * Check that the combined cluster
+ * would make sense with regard to pages
+ * and would not be too large
+ */
+ if ((tbp->b_bcount != size) ||
+ ((bp->b_blkno + (dbsize * i)) !=
+ tbp->b_blkno) ||
+ ((tbp->b_npages + bp->b_npages) >
+ (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) {
+ BUF_UNLOCK(tbp);
+ break;
+ }
+
+ /*
+ * Do not pull in pinned buffers.
+ */
+ if (tbp->b_pin_count > 0) {
+ BUF_UNLOCK(tbp);
+ break;
+ }
+
+ /*
+ * Ok, it's passed all the tests,
+ * so remove it from the free list
+ * and mark it busy. We will use it.
+ */
+ bremfree(tbp);
+ tbp->b_flags &= ~B_DONE;
+ } /* end of code for non-first buffers only */
+ /*
+ * If the IO is via the VM then we do some
+ * special VM hackery (yuck). Since the buffer's
+ * block size may not be page-aligned it is possible
+ * for a page to be shared between two buffers. We
+ * have to get rid of the duplication when building
+ * the cluster.
+ */
+ if (tbp->b_flags & B_VMIO) {
+ vm_page_t m;
+
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ if (i == 0) {
+ vfs_drain_busy_pages(tbp);
+ } else { /* if not first buffer */
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ m = tbp->b_pages[j];
+ if (vm_page_xbusied(m)) {
+ VM_OBJECT_WUNLOCK(
+ tbp->b_bufobj->bo_object);
+ bqrelse(tbp);
+ goto finishcluster;
+ }
+ }
+ }
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ m = tbp->b_pages[j];
+ vm_page_sbusy(m);
+ vm_object_pip_add(m->object, 1);
+ if ((bp->b_npages == 0) ||
+ (bp->b_pages[bp->b_npages - 1] != m)) {
+ bp->b_pages[bp->b_npages] = m;
+ bp->b_npages++;
+ }
+ }
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+ }
+ bp->b_bcount += size;
+ bp->b_bufsize += size;
+ /*
+ * If any of the clustered buffers have their
+ * B_BARRIER flag set, transfer that request to
+ * the cluster.
+ */
+ bp->b_flags |= (tbp->b_flags & B_BARRIER);
+ tbp->b_flags &= ~(B_DONE | B_BARRIER);
+ tbp->b_flags |= B_ASYNC;
+ tbp->b_ioflags &= ~BIO_ERROR;
+ tbp->b_iocmd = BIO_WRITE;
+ bundirty(tbp);
+ reassignbuf(tbp); /* put on clean list */
+ bufobj_wref(tbp->b_bufobj);
+ BUF_KERNPROC(tbp);
+ TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
+ tbp, b_cluster.cluster_entry);
+ }
+ finishcluster:
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *)bp->b_pages, bp->b_npages);
+ }
+ if (bp->b_bufsize > bp->b_kvasize)
+ panic(
+ "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
+ bp->b_bufsize, bp->b_kvasize);
+ bp->b_kvasize = bp->b_bufsize;
+ totalwritten += bp->b_bufsize;
+ bp->b_dirtyoff = 0;
+ bp->b_dirtyend = bp->b_bufsize;
+ bawrite(bp);
+
+ len -= i;
+ }
+ return (totalwritten);
+}
+
+/*
+ * Collect together all the buffers in a cluster,
+ * plus one additional buffer.
+ */
+static struct cluster_save *
+cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags)
+{
+ struct cluster_save *buflist;
+ struct buf *bp;
+ daddr_t lbn;
+ int i, len;
+
+ len = vp->v_lastw - vp->v_cstart + 1;
+ buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
+ M_SEGMENT, M_WAITOK);
+ buflist->bs_nchildren = 0;
+ buflist->bs_children = (struct buf **) (buflist + 1);
+ for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
+ (void)bread_gb(vp, lbn, last_bp->b_bcount, NOCRED,
+ gbflags, &bp);
+ buflist->bs_children[i] = bp;
+ if (bp->b_blkno == bp->b_lblkno)
+ VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
+ NULL, NULL);
+ }
+ buflist->bs_children[i] = bp = last_bp;
+ if (bp->b_blkno == bp->b_lblkno)
+ VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
+ buflist->bs_nchildren = i + 1;
+ return (buflist);
+}
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
new file mode 100644
index 0000000..2c01117
--- /dev/null
+++ b/sys/kern/vfs_default.c
@@ -0,0 +1,1269 @@
+/*-
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed
+ * to Berkeley by John Heidemann of the UCLA Ficus project.
+ *
+ * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/event.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/lockf.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/rwlock.h>
+#include <sys/fcntl.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/dirent.h>
+#include <sys/poll.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/vnode_pager.h>
+
+static int vop_nolookup(struct vop_lookup_args *);
+static int vop_norename(struct vop_rename_args *);
+static int vop_nostrategy(struct vop_strategy_args *);
+static int get_next_dirent(struct vnode *vp, struct dirent **dpp,
+ char *dirbuf, int dirbuflen, off_t *off,
+ char **cpos, int *len, int *eofflag,
+ struct thread *td);
+static int dirent_exists(struct vnode *vp, const char *dirname,
+ struct thread *td);
+
+#define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4)
+
+static int vop_stdis_text(struct vop_is_text_args *ap);
+static int vop_stdset_text(struct vop_set_text_args *ap);
+static int vop_stdunset_text(struct vop_unset_text_args *ap);
+static int vop_stdget_writecount(struct vop_get_writecount_args *ap);
+static int vop_stdadd_writecount(struct vop_add_writecount_args *ap);
+
+/*
+ * This vnode table stores what we want to do if the filesystem doesn't
+ * implement a particular VOP.
+ *
+ * If there is no specific entry here, we will return EOPNOTSUPP.
+ *
+ * Note that every filesystem has to implement either vop_access
+ * or vop_accessx; failing to do so will result in an immediate crash
+ * due to stack overflow, as vop_stdaccess() calls vop_stdaccessx(),
+ * which calls vop_stdaccess() etc.
+ */
+
+struct vop_vector default_vnodeops = {
+ .vop_default = NULL,
+ .vop_bypass = VOP_EOPNOTSUPP,
+
+ .vop_access = vop_stdaccess,
+ .vop_accessx = vop_stdaccessx,
+ .vop_advise = vop_stdadvise,
+ .vop_advlock = vop_stdadvlock,
+ .vop_advlockasync = vop_stdadvlockasync,
+ .vop_advlockpurge = vop_stdadvlockpurge,
+ .vop_allocate = vop_stdallocate,
+ .vop_bmap = vop_stdbmap,
+ .vop_close = VOP_NULL,
+ .vop_fsync = VOP_NULL,
+ .vop_getpages = vop_stdgetpages,
+ .vop_getwritemount = vop_stdgetwritemount,
+ .vop_inactive = VOP_NULL,
+ .vop_ioctl = VOP_ENOTTY,
+ .vop_kqfilter = vop_stdkqfilter,
+ .vop_islocked = vop_stdislocked,
+ .vop_lock1 = vop_stdlock,
+ .vop_lookup = vop_nolookup,
+ .vop_open = VOP_NULL,
+ .vop_pathconf = VOP_EINVAL,
+ .vop_poll = vop_nopoll,
+ .vop_putpages = vop_stdputpages,
+ .vop_readlink = VOP_EINVAL,
+ .vop_rename = vop_norename,
+ .vop_revoke = VOP_PANIC,
+ .vop_strategy = vop_nostrategy,
+ .vop_unlock = vop_stdunlock,
+ .vop_vptocnp = vop_stdvptocnp,
+ .vop_vptofh = vop_stdvptofh,
+ .vop_unp_bind = vop_stdunp_bind,
+ .vop_unp_connect = vop_stdunp_connect,
+ .vop_unp_detach = vop_stdunp_detach,
+ .vop_is_text = vop_stdis_text,
+ .vop_set_text = vop_stdset_text,
+ .vop_unset_text = vop_stdunset_text,
+ .vop_get_writecount = vop_stdget_writecount,
+ .vop_add_writecount = vop_stdadd_writecount,
+};
+
+/*
+ * Series of placeholder functions for various error returns for
+ * VOPs.
+ */
+
+int
+vop_eopnotsupp(struct vop_generic_args *ap)
+{
+ /*
+ printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name);
+ */
+
+ return (EOPNOTSUPP);
+}
+
+int
+vop_ebadf(struct vop_generic_args *ap)
+{
+
+ return (EBADF);
+}
+
+int
+vop_enotty(struct vop_generic_args *ap)
+{
+
+ return (ENOTTY);
+}
+
+int
+vop_einval(struct vop_generic_args *ap)
+{
+
+ return (EINVAL);
+}
+
+int
+vop_enoent(struct vop_generic_args *ap)
+{
+
+ return (ENOENT);
+}
+
+int
+vop_null(struct vop_generic_args *ap)
+{
+
+ return (0);
+}
+
+/*
+ * Helper function to panic on some bad VOPs in some filesystems.
+ */
+int
+vop_panic(struct vop_generic_args *ap)
+{
+
+ panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name);
+}
+
+/*
+ * vop_std<something> and vop_no<something> are default functions for use by
+ * filesystems that need the "default reasonable" implementation for a
+ * particular operation.
+ *
+ * The documentation for the operations they implement exists (if it exists)
+ * in the VOP_<SOMETHING>(9) manpage (all uppercase).
+ */
+
+/*
+ * Default vop for filesystems that do not support name lookup
+ */
+static int
+vop_nolookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+
+ *ap->a_vpp = NULL;
+ return (ENOTDIR);
+}
+
+/*
+ * vop_norename:
+ *
+ * Handle unlock and reference counting for arguments of vop_rename
+ * for filesystems that do not implement rename operation.
+ */
+static int
+vop_norename(struct vop_rename_args *ap)
+{
+
+ vop_rename_fail(ap);
+ return (EOPNOTSUPP);
+}
+
+/*
+ * vop_nostrategy:
+ *
+ * Strategy routine for VFS devices that have none.
+ *
+ * BIO_ERROR and B_INVAL must be cleared prior to calling any strategy
+ * routine. Typically this is done for a BIO_READ strategy call.
+ * Typically B_INVAL is assumed to already be clear prior to a write
+ * and should not be cleared manually unless you just made the buffer
+ * invalid. BIO_ERROR should be cleared either way.
+ */
+
+static int
+vop_nostrategy (struct vop_strategy_args *ap)
+{
+ printf("No strategy for buffer at %p\n", ap->a_bp);
+ vprint("vnode", ap->a_vp);
+ ap->a_bp->b_ioflags |= BIO_ERROR;
+ ap->a_bp->b_error = EOPNOTSUPP;
+ bufdone(ap->a_bp);
+ return (EOPNOTSUPP);
+}
+
+static int
+get_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf,
+ int dirbuflen, off_t *off, char **cpos, int *len,
+ int *eofflag, struct thread *td)
+{
+ int error, reclen;
+ struct uio uio;
+ struct iovec iov;
+ struct dirent *dp;
+
+ KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp));
+ KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp));
+
+ if (*len == 0) {
+ iov.iov_base = dirbuf;
+ iov.iov_len = dirbuflen;
+
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = *off;
+ uio.uio_resid = dirbuflen;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_rw = UIO_READ;
+ uio.uio_td = td;
+
+ *eofflag = 0;
+
+#ifdef MAC
+ error = mac_vnode_check_readdir(td->td_ucred, vp);
+ if (error == 0)
+#endif
+ error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag,
+ NULL, NULL);
+ if (error)
+ return (error);
+
+ *off = uio.uio_offset;
+
+ *cpos = dirbuf;
+ *len = (dirbuflen - uio.uio_resid);
+
+ if (*len == 0)
+ return (ENOENT);
+ }
+
+ dp = (struct dirent *)(*cpos);
+ reclen = dp->d_reclen;
+ *dpp = dp;
+
+ /* Check for a malformed directory entry. */
+ if (reclen < DIRENT_MINSIZE)
+ return (EINVAL);
+
+ *cpos += reclen;
+ *len -= reclen;
+
+ return (0);
+}
+
+/*
+ * Check if a named file exists in a given directory vnode.
+ */
+static int
+dirent_exists(struct vnode *vp, const char *dirname, struct thread *td)
+{
+ char *dirbuf, *cpos;
+ int error, eofflag, dirbuflen, len, found;
+ off_t off;
+ struct dirent *dp;
+ struct vattr va;
+
+ KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp));
+ KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp));
+
+ found = 0;
+
+ error = VOP_GETATTR(vp, &va, td->td_ucred);
+ if (error)
+ return (found);
+
+ dirbuflen = DEV_BSIZE;
+ if (dirbuflen < va.va_blocksize)
+ dirbuflen = va.va_blocksize;
+ dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK);
+
+ off = 0;
+ len = 0;
+ do {
+ error = get_next_dirent(vp, &dp, dirbuf, dirbuflen, &off,
+ &cpos, &len, &eofflag, td);
+ if (error)
+ goto out;
+
+ if (dp->d_type != DT_WHT && dp->d_fileno != 0 &&
+ strcmp(dp->d_name, dirname) == 0) {
+ found = 1;
+ goto out;
+ }
+ } while (len > 0 || !eofflag);
+
+out:
+ free(dirbuf, M_TEMP);
+ return (found);
+}
+
+int
+vop_stdaccess(struct vop_access_args *ap)
+{
+
+ KASSERT((ap->a_accmode & ~(VEXEC | VWRITE | VREAD | VADMIN |
+ VAPPEND)) == 0, ("invalid bit in accmode"));
+
+ return (VOP_ACCESSX(ap->a_vp, ap->a_accmode, ap->a_cred, ap->a_td));
+}
+
+int
+vop_stdaccessx(struct vop_accessx_args *ap)
+{
+ int error;
+ accmode_t accmode = ap->a_accmode;
+
+ error = vfs_unixify_accmode(&accmode);
+ if (error != 0)
+ return (error);
+
+ if (accmode == 0)
+ return (0);
+
+ return (VOP_ACCESS(ap->a_vp, accmode, ap->a_cred, ap->a_td));
+}
+
+/*
+ * Advisory record locking support
+ */
+int
+vop_stdadvlock(struct vop_advlock_args *ap)
+{
+ struct vnode *vp;
+ struct ucred *cred;
+ struct vattr vattr;
+ int error;
+
+ vp = ap->a_vp;
+ cred = curthread->td_ucred;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp, 0);
+ if (error)
+ return (error);
+
+ return (lf_advlock(ap, &(vp->v_lockf), vattr.va_size));
+}
+
+int
+vop_stdadvlockasync(struct vop_advlockasync_args *ap)
+{
+ struct vnode *vp;
+ struct ucred *cred;
+ struct vattr vattr;
+ int error;
+
+ vp = ap->a_vp;
+ cred = curthread->td_ucred;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp, 0);
+ if (error)
+ return (error);
+
+ return (lf_advlockasync(ap, &(vp->v_lockf), vattr.va_size));
+}
+
+int
+vop_stdadvlockpurge(struct vop_advlockpurge_args *ap)
+{
+ struct vnode *vp;
+
+ vp = ap->a_vp;
+ lf_purgelocks(vp, &vp->v_lockf);
+ return (0);
+}
+
+/*
+ * vop_stdpathconf:
+ *
+ * Standard implementation of POSIX pathconf, to get information about limits
+ * for a filesystem.
+ * Override per filesystem for the case where the filesystem has smaller
+ * limits.
+ */
+int
+vop_stdpathconf(ap)
+ struct vop_pathconf_args /* {
+ struct vnode *a_vp;
+ int a_name;
+ int *a_retval;
+ } */ *ap;
+{
+
+ switch (ap->a_name) {
+ case _PC_NAME_MAX:
+ *ap->a_retval = NAME_MAX;
+ return (0);
+ case _PC_PATH_MAX:
+ *ap->a_retval = PATH_MAX;
+ return (0);
+ case _PC_LINK_MAX:
+ *ap->a_retval = LINK_MAX;
+ return (0);
+ case _PC_MAX_CANON:
+ *ap->a_retval = MAX_CANON;
+ return (0);
+ case _PC_MAX_INPUT:
+ *ap->a_retval = MAX_INPUT;
+ return (0);
+ case _PC_PIPE_BUF:
+ *ap->a_retval = PIPE_BUF;
+ return (0);
+ case _PC_CHOWN_RESTRICTED:
+ *ap->a_retval = 1;
+ return (0);
+ case _PC_VDISABLE:
+ *ap->a_retval = _POSIX_VDISABLE;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * Standard lock, unlock and islocked functions.
+ */
+int
+vop_stdlock(ap)
+ struct vop_lock1_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ char *file;
+ int line;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+
+ return (_lockmgr_args(vp->v_vnlock, ap->a_flags, VI_MTX(vp),
+ LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, ap->a_file,
+ ap->a_line));
+}
+
+/* See above. */
+int
+vop_stdunlock(ap)
+ struct vop_unlock_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+
+ return (lockmgr(vp->v_vnlock, ap->a_flags | LK_RELEASE, VI_MTX(vp)));
+}
+
+/* See above. */
+int
+vop_stdislocked(ap)
+ struct vop_islocked_args /* {
+ struct vnode *a_vp;
+ } */ *ap;
+{
+
+ return (lockstatus(ap->a_vp->v_vnlock));
+}
+
+/*
+ * Return true for select/poll.
+ */
+int
+vop_nopoll(ap)
+ struct vop_poll_args /* {
+ struct vnode *a_vp;
+ int a_events;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+
+ return (poll_no_poll(ap->a_events));
+}
+
+/*
+ * Implement poll for local filesystems that support it.
+ */
+int
+vop_stdpoll(ap)
+ struct vop_poll_args /* {
+ struct vnode *a_vp;
+ int a_events;
+ struct ucred *a_cred;
+ struct thread *a_td;
+ } */ *ap;
+{
+ if (ap->a_events & ~POLLSTANDARD)
+ return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
+ return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
+}
+
+/*
+ * Return our mount point, as we will take charge of the writes.
+ */
+int
+vop_stdgetwritemount(ap)
+ struct vop_getwritemount_args /* {
+ struct vnode *a_vp;
+ struct mount **a_mpp;
+ } */ *ap;
+{
+ struct mount *mp;
+
+ /*
+ * XXX Since this is called unlocked we may be recycled while
+	 * attempting to ref the mount.  If this is the case our mountpoint
+ * will be set to NULL. We only have to prevent this call from
+ * returning with a ref to an incorrect mountpoint. It is not
+ * harmful to return with a ref to our previous mountpoint.
+ */
+ mp = ap->a_vp->v_mount;
+ if (mp != NULL) {
+ vfs_ref(mp);
+ if (mp != ap->a_vp->v_mount) {
+ vfs_rel(mp);
+ mp = NULL;
+ }
+ }
+ *(ap->a_mpp) = mp;
+ return (0);
+}
+
+/*
+ * Default bmap: the block lives in the vnode's own buffer object, the
+ * logical block number is converted to DEV_BSIZE units using the
+ * filesystem's f_iosize, and no clustering before or after the block is
+ * reported.  XXX The VOP_BMAP(9) manpage still needs more detail.
+ */
+int
+vop_stdbmap(ap)
+ struct vop_bmap_args /* {
+ struct vnode *a_vp;
+ daddr_t a_bn;
+ struct bufobj **a_bop;
+ daddr_t *a_bnp;
+ int *a_runp;
+ int *a_runb;
+ } */ *ap;
+{
+
+ if (ap->a_bop != NULL)
+ *ap->a_bop = &ap->a_vp->v_bufobj;
+ if (ap->a_bnp != NULL)
+ *ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize);
+ if (ap->a_runp != NULL)
+ *ap->a_runp = 0;
+ if (ap->a_runb != NULL)
+ *ap->a_runb = 0;
+ return (0);
+}
+
+int
+vop_stdfsync(ap)
+ struct vop_fsync_args /* {
+ struct vnode *a_vp;
+ struct ucred *a_cred;
+ int a_waitfor;
+ struct thread *a_td;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+ struct buf *bp;
+ struct bufobj *bo;
+ struct buf *nbp;
+ int error = 0;
+ int maxretry = 1000; /* large, arbitrarily chosen */
+
+ bo = &vp->v_bufobj;
+ BO_LOCK(bo);
+loop1:
+ /*
+ * MARK/SCAN initialization to avoid infinite loops.
+ */
+ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
+ bp->b_vflags &= ~BV_SCANNED;
+ bp->b_error = 0;
+ }
+
+ /*
+ * Flush all dirty buffers associated with a vnode.
+ */
+loop2:
+ TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
+ if ((bp->b_vflags & BV_SCANNED) != 0)
+ continue;
+ bp->b_vflags |= BV_SCANNED;
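+		/*
+		 * Try a non-blocking lock first; if the buffer is busy and
+		 * the caller asked for a synchronous fsync, sleep for it
+		 * (LK_SLEEPFAIL drops the bufobj lock and fails the attempt
+		 * after the sleep) and rescan from the top, since the dirty
+		 * list may have changed while we slept.
+		 */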
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
+ if (ap->a_waitfor != MNT_WAIT)
+ continue;
+ if (BUF_LOCK(bp,
+ LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
+ BO_LOCKPTR(bo)) != 0) {
+ BO_LOCK(bo);
+ goto loop1;
+ }
+ BO_LOCK(bo);
+ }
+ BO_UNLOCK(bo);
+ KASSERT(bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p",
+ bp, bp->b_bufobj, bo));
+ if ((bp->b_flags & B_DELWRI) == 0)
+ panic("fsync: not dirty");
+ if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) {
+ vfs_bio_awrite(bp);
+ } else {
+ bremfree(bp);
+ bawrite(bp);
+ }
+ BO_LOCK(bo);
+ goto loop2;
+ }
+
+ /*
+ * If synchronous the caller expects us to completely resolve all
+ * dirty buffers in the system. Wait for in-progress I/O to
+ * complete (which could include background bitmap writes), then
+ * retry if dirty blocks still exist.
+ */
+ if (ap->a_waitfor == MNT_WAIT) {
+ bufobj_wwait(bo, 0, 0);
+ if (bo->bo_dirty.bv_cnt > 0) {
+ /*
+ * If we are unable to write any of these buffers
+ * then we fail now rather than trying endlessly
+ * to write them out.
+ */
+ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
+ if ((error = bp->b_error) == 0)
+ continue;
+ if (error == 0 && --maxretry >= 0)
+ goto loop1;
+ error = EAGAIN;
+ }
+ }
+ BO_UNLOCK(bo);
+ if (error == EAGAIN)
+ vprint("fsync: giving up on dirty", vp);
+
+ return (error);
+}
+
+/* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */
+int
+vop_stdgetpages(ap)
+ struct vop_getpages_args /* {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int a_reqpage;
+ vm_ooffset_t a_offset;
+ } */ *ap;
+{
+
+ return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
+ ap->a_count, ap->a_reqpage);
+}
+
+int
+vop_stdkqfilter(struct vop_kqfilter_args *ap)
+{
+ return vfs_kqfilter(ap);
+}
+
+/* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). */
+int
+vop_stdputpages(ap)
+ struct vop_putpages_args /* {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int a_sync;
+ int *a_rtvals;
+ vm_ooffset_t a_offset;
+ } */ *ap;
+{
+
+ return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count,
+ ap->a_sync, ap->a_rtvals);
+}
+
+int
+vop_stdvptofh(struct vop_vptofh_args *ap)
+{
+ return (EOPNOTSUPP);
+}
+
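+/*
+ * Default vnode-to-component-name translation: open this directory's parent
+ * via "..", scan the parent with VOP_READDIR() for an entry whose inode
+ * number matches this vnode, and copy that name into the tail of the
+ * caller's buffer.  Only directories can be translated this way; union
+ * mounts get special handling so that a name shadowed by the upper layer
+ * is not returned.
+ */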
+int
+vop_stdvptocnp(struct vop_vptocnp_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct vnode **dvp = ap->a_vpp;
+ struct ucred *cred = ap->a_cred;
+ char *buf = ap->a_buf;
+ int *buflen = ap->a_buflen;
+ char *dirbuf, *cpos;
+ int i, error, eofflag, dirbuflen, flags, locked, len, covered;
+ off_t off;
+ ino_t fileno;
+ struct vattr va;
+ struct nameidata nd;
+ struct thread *td;
+ struct dirent *dp;
+ struct vnode *mvp;
+
+ i = *buflen;
+ error = 0;
+ covered = 0;
+ td = curthread;
+
+ if (vp->v_type != VDIR)
+ return (ENOENT);
+
+ error = VOP_GETATTR(vp, &va, cred);
+ if (error)
+ return (error);
+
+ VREF(vp);
+ locked = VOP_ISLOCKED(vp);
+ VOP_UNLOCK(vp, 0);
+ NDINIT_ATVP(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
+ "..", vp, td);
+ flags = FREAD;
+ error = vn_open_cred(&nd, &flags, 0, VN_OPEN_NOAUDIT, cred, NULL);
+ if (error) {
+ vn_lock(vp, locked | LK_RETRY);
+ return (error);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ mvp = *dvp = nd.ni_vp;
+
+ if (vp->v_mount != (*dvp)->v_mount &&
+ ((*dvp)->v_vflag & VV_ROOT) &&
+ ((*dvp)->v_mount->mnt_flag & MNT_UNION)) {
+ *dvp = (*dvp)->v_mount->mnt_vnodecovered;
+ VREF(mvp);
+ VOP_UNLOCK(mvp, 0);
+ vn_close(mvp, FREAD, cred, td);
+ VREF(*dvp);
+ vn_lock(*dvp, LK_EXCLUSIVE | LK_RETRY);
+ covered = 1;
+ }
+
+ fileno = va.va_fileid;
+
+ dirbuflen = DEV_BSIZE;
+ if (dirbuflen < va.va_blocksize)
+ dirbuflen = va.va_blocksize;
+ dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK);
+
+ if ((*dvp)->v_type != VDIR) {
+ error = ENOENT;
+ goto out;
+ }
+
+ off = 0;
+ len = 0;
+ do {
+ /* call VOP_READDIR of parent */
+ error = get_next_dirent(*dvp, &dp, dirbuf, dirbuflen, &off,
+ &cpos, &len, &eofflag, td);
+ if (error)
+ goto out;
+
+ if ((dp->d_type != DT_WHT) &&
+ (dp->d_fileno == fileno)) {
+ if (covered) {
+ VOP_UNLOCK(*dvp, 0);
+ vn_lock(mvp, LK_EXCLUSIVE | LK_RETRY);
+ if (dirent_exists(mvp, dp->d_name, td)) {
+ error = ENOENT;
+ VOP_UNLOCK(mvp, 0);
+ vn_lock(*dvp, LK_EXCLUSIVE | LK_RETRY);
+ goto out;
+ }
+ VOP_UNLOCK(mvp, 0);
+ vn_lock(*dvp, LK_EXCLUSIVE | LK_RETRY);
+ }
+ i -= dp->d_namlen;
+
+ if (i < 0) {
+ error = ENOMEM;
+ goto out;
+ }
+ if (dp->d_namlen == 1 && dp->d_name[0] == '.') {
+ error = ENOENT;
+ } else {
+ bcopy(dp->d_name, buf + i, dp->d_namlen);
+ error = 0;
+ }
+ goto out;
+ }
+ } while (len > 0 || !eofflag);
+ error = ENOENT;
+
+out:
+ free(dirbuf, M_TEMP);
+ if (!error) {
+ *buflen = i;
+ vref(*dvp);
+ }
+ if (covered) {
+ vput(*dvp);
+ vrele(mvp);
+ } else {
+ VOP_UNLOCK(mvp, 0);
+ vn_close(mvp, FREAD, cred, td);
+ }
+ vn_lock(vp, locked | LK_RETRY);
+ return (error);
+}
+
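+/*
+ * Default VOP_ALLOCATE() (the backend for posix_fallocate(2)): most
+ * filesystems have no native preallocation primitive, so force backing
+ * store to be allocated by reading each block in the range (zero-filling
+ * beyond EOF) and writing it straight back.  The loop returns early when it
+ * should yield the CPU; the unprocessed remainder of the range is returned
+ * through a_offset/a_len so the caller can restart the operation.
+ */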
+int
+vop_stdallocate(struct vop_allocate_args *ap)
+{
+#ifdef __notyet__
+ struct statfs sfs;
+#endif
+ struct iovec aiov;
+ struct vattr vattr, *vap;
+ struct uio auio;
+ off_t fsize, len, cur, offset;
+ uint8_t *buf;
+ struct thread *td;
+ struct vnode *vp;
+ size_t iosize;
+ int error;
+
+ buf = NULL;
+ error = 0;
+ td = curthread;
+ vap = &vattr;
+ vp = ap->a_vp;
+ len = *ap->a_len;
+ offset = *ap->a_offset;
+
+ error = VOP_GETATTR(vp, vap, td->td_ucred);
+ if (error != 0)
+ goto out;
+ fsize = vap->va_size;
+ iosize = vap->va_blocksize;
+ if (iosize == 0)
+ iosize = BLKDEV_IOSIZE;
+ if (iosize > MAXPHYS)
+ iosize = MAXPHYS;
+ buf = malloc(iosize, M_TEMP, M_WAITOK);
+
+#ifdef __notyet__
+ /*
+ * Check if the filesystem sets f_maxfilesize; if not use
+ * VOP_SETATTR to perform the check.
+ */
+ error = VFS_STATFS(vp->v_mount, &sfs, td);
+ if (error != 0)
+ goto out;
+ if (sfs.f_maxfilesize) {
+ if (offset > sfs.f_maxfilesize || len > sfs.f_maxfilesize ||
+ offset + len > sfs.f_maxfilesize) {
+ error = EFBIG;
+ goto out;
+ }
+ } else
+#endif
+ if (offset + len > vap->va_size) {
+ /*
+ * Test offset + len against the filesystem's maxfilesize.
+ */
+ VATTR_NULL(vap);
+ vap->va_size = offset + len;
+ error = VOP_SETATTR(vp, vap, td->td_ucred);
+ if (error != 0)
+ goto out;
+ VATTR_NULL(vap);
+ vap->va_size = fsize;
+ error = VOP_SETATTR(vp, vap, td->td_ucred);
+ if (error != 0)
+ goto out;
+ }
+
+ for (;;) {
+ /*
+ * Read and write back anything below the nominal file
+ * size. There's currently no way outside the filesystem
+ * to know whether this area is sparse or not.
+ */
+ cur = iosize;
+ if ((offset % iosize) != 0)
+ cur -= (offset % iosize);
+ if (cur > len)
+ cur = len;
+ if (offset < fsize) {
+ aiov.iov_base = buf;
+ aiov.iov_len = cur;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = offset;
+ auio.uio_resid = cur;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_READ;
+ auio.uio_td = td;
+ error = VOP_READ(vp, &auio, 0, td->td_ucred);
+ if (error != 0)
+ break;
+ if (auio.uio_resid > 0) {
+ bzero(buf + cur - auio.uio_resid,
+ auio.uio_resid);
+ }
+ } else {
+ bzero(buf, cur);
+ }
+
+ aiov.iov_base = buf;
+ aiov.iov_len = cur;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = offset;
+ auio.uio_resid = cur;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+
+ error = VOP_WRITE(vp, &auio, 0, td->td_ucred);
+ if (error != 0)
+ break;
+
+ len -= cur;
+ offset += cur;
+ if (len == 0)
+ break;
+ if (should_yield())
+ break;
+ }
+
+ out:
+ *ap->a_len = len;
+ *ap->a_offset = offset;
+ free(buf, M_TEMP);
+ return (error);
+}
+
+int
+vop_stdadvise(struct vop_advise_args *ap)
+{
+ struct vnode *vp;
+ off_t start, end;
+ int error;
+
+ vp = ap->a_vp;
+ switch (ap->a_advice) {
+ case POSIX_FADV_WILLNEED:
+ /*
+ * Do nothing for now. Filesystems should provide a
+ * custom method which starts an asynchronous read of
+ * the requested region.
+ */
+ error = 0;
+ break;
+ case POSIX_FADV_DONTNEED:
+ /*
+ * Flush any open FS buffers and then remove pages
+ * from the backing VM object. Using vinvalbuf() here
+ * is a bit heavy-handed as it flushes all buffers for
+ * the given vnode, not just the buffers covering the
+ * requested range.
+ */
+ error = 0;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ if (vp->v_iflag & VI_DOOMED) {
+ VOP_UNLOCK(vp, 0);
+ break;
+ }
+ vinvalbuf(vp, V_CLEANONLY, 0, 0);
+ if (vp->v_object != NULL) {
+ start = trunc_page(ap->a_start);
+ end = round_page(ap->a_end);
+ VM_OBJECT_WLOCK(vp->v_object);
+ vm_object_page_cache(vp->v_object, OFF_TO_IDX(start),
+ OFF_TO_IDX(end));
+ VM_OBJECT_WUNLOCK(vp->v_object);
+ }
+ VOP_UNLOCK(vp, 0);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+int
+vop_stdunp_bind(struct vop_unp_bind_args *ap)
+{
+
+ ap->a_vp->v_socket = ap->a_socket;
+ return (0);
+}
+
+int
+vop_stdunp_connect(struct vop_unp_connect_args *ap)
+{
+
+ *ap->a_socket = ap->a_vp->v_socket;
+ return (0);
+}
+
+int
+vop_stdunp_detach(struct vop_unp_detach_args *ap)
+{
+
+ ap->a_vp->v_socket = NULL;
+ return (0);
+}
+
+static int
+vop_stdis_text(struct vop_is_text_args *ap)
+{
+
+ return ((ap->a_vp->v_vflag & VV_TEXT) != 0);
+}
+
+static int
+vop_stdset_text(struct vop_set_text_args *ap)
+{
+
+ ap->a_vp->v_vflag |= VV_TEXT;
+ return (0);
+}
+
+static int
+vop_stdunset_text(struct vop_unset_text_args *ap)
+{
+
+ ap->a_vp->v_vflag &= ~VV_TEXT;
+ return (0);
+}
+
+static int
+vop_stdget_writecount(struct vop_get_writecount_args *ap)
+{
+
+ *ap->a_writecount = ap->a_vp->v_writecount;
+ return (0);
+}
+
+static int
+vop_stdadd_writecount(struct vop_add_writecount_args *ap)
+{
+
+ ap->a_vp->v_writecount += ap->a_inc;
+ return (0);
+}
+
+/*
+ * vfs default ops
+ * used to fill the vfs function table to get reasonable default return values.
+ */
+int
+vfs_stdroot (mp, flags, vpp)
+ struct mount *mp;
+ int flags;
+ struct vnode **vpp;
+{
+
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdstatfs (mp, sbp)
+ struct mount *mp;
+ struct statfs *sbp;
+{
+
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdquotactl (mp, cmds, uid, arg)
+ struct mount *mp;
+ int cmds;
+ uid_t uid;
+ void *arg;
+{
+
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdsync(mp, waitfor)
+ struct mount *mp;
+ int waitfor;
+{
+ struct vnode *vp, *mvp;
+ struct thread *td;
+ int error, lockreq, allerror = 0;
+
+ td = curthread;
+ lockreq = LK_EXCLUSIVE | LK_INTERLOCK;
+ if (waitfor != MNT_WAIT)
+ lockreq |= LK_NOWAIT;
+ /*
+ * Force stale buffer cache information to be flushed.
+ */
+loop:
+ MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
+ if (vp->v_bufobj.bo_dirty.bv_cnt == 0) {
+ VI_UNLOCK(vp);
+ continue;
+ }
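+		/*
+		 * vget() returns ENOENT if the vnode was reclaimed while we
+		 * slept waiting for its lock; in that case abort the
+		 * iteration and restart the scan from the head of the list.
+		 */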
+ if ((error = vget(vp, lockreq, td)) != 0) {
+ if (error == ENOENT) {
+ MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
+ goto loop;
+ }
+ continue;
+ }
+ error = VOP_FSYNC(vp, waitfor, td);
+ if (error)
+ allerror = error;
+ vput(vp);
+ }
+ return (allerror);
+}
+
+int
+vfs_stdnosync (mp, waitfor)
+ struct mount *mp;
+ int waitfor;
+{
+
+ return (0);
+}
+
+int
+vfs_stdvget (mp, ino, flags, vpp)
+ struct mount *mp;
+ ino_t ino;
+ int flags;
+ struct vnode **vpp;
+{
+
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdfhtovp (mp, fhp, flags, vpp)
+ struct mount *mp;
+ struct fid *fhp;
+ int flags;
+ struct vnode **vpp;
+{
+
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdinit (vfsp)
+ struct vfsconf *vfsp;
+{
+
+ return (0);
+}
+
+int
+vfs_stduninit (vfsp)
+ struct vfsconf *vfsp;
+{
+
+ return(0);
+}
+
+int
+vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname)
+ struct mount *mp;
+ int cmd;
+ struct vnode *filename_vp;
+ int attrnamespace;
+ const char *attrname;
+{
+
+ if (filename_vp != NULL)
+ VOP_UNLOCK(filename_vp, 0);
+ return (EOPNOTSUPP);
+}
+
+int
+vfs_stdsysctl(mp, op, req)
+ struct mount *mp;
+ fsctlop_t op;
+ struct sysctl_req *req;
+{
+
+ return (EOPNOTSUPP);
+}
+
+/* end of vfs default ops */
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
new file mode 100644
index 0000000..6a3f291
--- /dev/null
+++ b/sys/kern/vfs_export.c
@@ -0,0 +1,493 @@
+/*-
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/dirent.h>
+#include <sys/domain.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/refcount.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+
+#include <net/radix.h>
+
+static MALLOC_DEFINE(M_NETADDR, "export_host", "Export host address structure");
+
+static void vfs_free_addrlist(struct netexport *nep);
+static int vfs_free_netcred(struct radix_node *rn, void *w);
+static int vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
+ struct export_args *argp);
+static struct netcred *vfs_export_lookup(struct mount *, struct sockaddr *);
+
+/*
+ * Network address lookup element
+ */
+struct netcred {
+ struct radix_node netc_rnodes[2];
+ int netc_exflags;
+ struct ucred *netc_anon;
+ int netc_numsecflavors;
+ int netc_secflavors[MAXSECFLAVORS];
+};
+
+/*
+ * Network export information
+ */
+struct netexport {
+ struct netcred ne_defexported; /* Default export */
+ struct radix_node_head *ne_rtable[AF_MAX+1]; /* Individual exports */
+};
+
+/*
+ * Build hash lists of net addresses and hang them off the mount point.
+ * Called by vfs_export() to set up the lists of export addresses.
+ */
+static int
+vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
+ struct export_args *argp)
+{
+ register struct netcred *np;
+ register struct radix_node_head *rnh;
+ register int i;
+ struct radix_node *rn;
+ struct sockaddr *saddr, *smask = 0;
+ struct domain *dom;
+ int error;
+
+ /*
+ * XXX: This routine converts from a `struct xucred'
+ * (argp->ex_anon) to a `struct ucred' (np->netc_anon). This
+ * operation is questionable; for example, what should be done
+ * with fields like cr_uidinfo and cr_prison? Currently, this
+ * routine does not touch them (leaves them as NULL).
+ */
+ if (argp->ex_anon.cr_version != XUCRED_VERSION) {
+ vfs_mount_error(mp, "ex_anon.cr_version: %d != %d",
+ argp->ex_anon.cr_version, XUCRED_VERSION);
+ return (EINVAL);
+ }
+
+ if (argp->ex_addrlen == 0) {
+ if (mp->mnt_flag & MNT_DEFEXPORTED) {
+ vfs_mount_error(mp,
+ "MNT_DEFEXPORTED already set for mount %p", mp);
+ return (EPERM);
+ }
+ np = &nep->ne_defexported;
+ np->netc_exflags = argp->ex_flags;
+ np->netc_anon = crget();
+ np->netc_anon->cr_uid = argp->ex_anon.cr_uid;
+ crsetgroups(np->netc_anon, argp->ex_anon.cr_ngroups,
+ argp->ex_anon.cr_groups);
+ np->netc_anon->cr_prison = &prison0;
+ prison_hold(np->netc_anon->cr_prison);
+ np->netc_numsecflavors = argp->ex_numsecflavors;
+ bcopy(argp->ex_secflavors, np->netc_secflavors,
+ sizeof(np->netc_secflavors));
+ MNT_ILOCK(mp);
+ mp->mnt_flag |= MNT_DEFEXPORTED;
+ MNT_IUNLOCK(mp);
+ return (0);
+ }
+
+#if MSIZE <= 256
+ if (argp->ex_addrlen > MLEN) {
+ vfs_mount_error(mp, "ex_addrlen %d is greater than %d",
+ argp->ex_addrlen, MLEN);
+ return (EINVAL);
+ }
+#endif
+
+ i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
+ np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO);
+ saddr = (struct sockaddr *) (np + 1);
+ if ((error = copyin(argp->ex_addr, saddr, argp->ex_addrlen)))
+ goto out;
+ if (saddr->sa_family == AF_UNSPEC || saddr->sa_family > AF_MAX) {
+ error = EINVAL;
+		vfs_mount_error(mp, "Invalid saddr->sa_family: %d",
+		    saddr->sa_family);
+ goto out;
+ }
+ if (saddr->sa_len > argp->ex_addrlen)
+ saddr->sa_len = argp->ex_addrlen;
+ if (argp->ex_masklen) {
+ smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
+ error = copyin(argp->ex_mask, smask, argp->ex_masklen);
+ if (error)
+ goto out;
+ if (smask->sa_len > argp->ex_masklen)
+ smask->sa_len = argp->ex_masklen;
+ }
+ i = saddr->sa_family;
+ if ((rnh = nep->ne_rtable[i]) == NULL) {
+ /*
+ * Seems silly to initialize every AF when most are not used,
+ * do so on demand here
+ */
+ for (dom = domains; dom; dom = dom->dom_next) {
+ KASSERT(((i == AF_INET) || (i == AF_INET6)),
+ ("unexpected protocol in vfs_hang_addrlist"));
+ if (dom->dom_family == i && dom->dom_rtattach) {
+ /*
+ * XXX MRT
+ * The INET and INET6 domains know the
+ * offset already. We don't need to send it
+ * So we just use it as a flag to say that
+ * we are or are not setting up a real routing
+ * table. Only IP and IPV6 need have this
+ * be 0 so all other protocols can stay the
+ * same (ABI compatible).
+ */
+ dom->dom_rtattach(
+ (void **) &nep->ne_rtable[i], 0);
+ break;
+ }
+ }
+ if ((rnh = nep->ne_rtable[i]) == NULL) {
+ error = ENOBUFS;
+ vfs_mount_error(mp, "%s %s %d",
+ "Unable to initialize radix node head ",
+ "for address family", i);
+ goto out;
+ }
+ }
+ RADIX_NODE_HEAD_LOCK(rnh);
+ rn = (*rnh->rnh_addaddr)(saddr, smask, rnh, np->netc_rnodes);
+ RADIX_NODE_HEAD_UNLOCK(rnh);
+ if (rn == NULL || np != (struct netcred *)rn) { /* already exists */
+ error = EPERM;
+ vfs_mount_error(mp, "Invalid radix node head, rn: %p %p",
+ rn, np);
+ goto out;
+ }
+ np->netc_exflags = argp->ex_flags;
+ np->netc_anon = crget();
+ np->netc_anon->cr_uid = argp->ex_anon.cr_uid;
+ crsetgroups(np->netc_anon, argp->ex_anon.cr_ngroups,
+ argp->ex_anon.cr_groups);
+ np->netc_anon->cr_prison = &prison0;
+ prison_hold(np->netc_anon->cr_prison);
+ np->netc_numsecflavors = argp->ex_numsecflavors;
+ bcopy(argp->ex_secflavors, np->netc_secflavors,
+ sizeof(np->netc_secflavors));
+ return (0);
+out:
+ free(np, M_NETADDR);
+ return (error);
+}
+
+/* Helper for vfs_free_addrlist. */
+/* ARGSUSED */
+static int
+vfs_free_netcred(struct radix_node *rn, void *w)
+{
+ struct radix_node_head *rnh = (struct radix_node_head *) w;
+ struct ucred *cred;
+
+ (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
+ cred = ((struct netcred *)rn)->netc_anon;
+ if (cred != NULL)
+ crfree(cred);
+ free(rn, M_NETADDR);
+ return (0);
+}
+
+/*
+ * Free the net address hash lists that are hanging off the mount points.
+ */
+static void
+vfs_free_addrlist(struct netexport *nep)
+{
+ int i;
+ struct radix_node_head *rnh;
+ struct ucred *cred;
+
+ for (i = 0; i <= AF_MAX; i++) {
+ if ((rnh = nep->ne_rtable[i])) {
+ RADIX_NODE_HEAD_LOCK(rnh);
+ (*rnh->rnh_walktree) (rnh, vfs_free_netcred, rnh);
+ RADIX_NODE_HEAD_UNLOCK(rnh);
+ RADIX_NODE_HEAD_DESTROY(rnh);
+ free(rnh, M_RTABLE);
+ nep->ne_rtable[i] = NULL; /* not SMP safe XXX */
+ }
+ }
+ cred = nep->ne_defexported.netc_anon;
+ if (cred != NULL)
+ crfree(cred);
+}
+
+/*
+ * High level function to manipulate export options on a mount point
+ * and the passed in netexport.
+ * Struct export_args *argp is the variable used to twiddle options,
+ * the structure is described in sys/mount.h
+ */
+int
+vfs_export(struct mount *mp, struct export_args *argp)
+{
+ struct netexport *nep;
+ int error;
+
+ if (argp->ex_numsecflavors < 0
+ || argp->ex_numsecflavors >= MAXSECFLAVORS)
+ return (EINVAL);
+
+ error = 0;
+ lockmgr(&mp->mnt_explock, LK_EXCLUSIVE, NULL);
+ nep = mp->mnt_export;
+ if (argp->ex_flags & MNT_DELEXPORT) {
+ if (nep == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ if (mp->mnt_flag & MNT_EXPUBLIC) {
+ vfs_setpublicfs(NULL, NULL, NULL);
+ MNT_ILOCK(mp);
+ mp->mnt_flag &= ~MNT_EXPUBLIC;
+ MNT_IUNLOCK(mp);
+ }
+ vfs_free_addrlist(nep);
+ mp->mnt_export = NULL;
+ free(nep, M_MOUNT);
+ nep = NULL;
+ MNT_ILOCK(mp);
+ mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
+ MNT_IUNLOCK(mp);
+ }
+ if (argp->ex_flags & MNT_EXPORTED) {
+ if (nep == NULL) {
+			nep = malloc(sizeof(struct netexport), M_MOUNT,
+			    M_WAITOK | M_ZERO);
+ mp->mnt_export = nep;
+ }
+ if (argp->ex_flags & MNT_EXPUBLIC) {
+ if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
+ goto out;
+ MNT_ILOCK(mp);
+ mp->mnt_flag |= MNT_EXPUBLIC;
+ MNT_IUNLOCK(mp);
+ }
+ if ((error = vfs_hang_addrlist(mp, nep, argp)))
+ goto out;
+ MNT_ILOCK(mp);
+ mp->mnt_flag |= MNT_EXPORTED;
+ MNT_IUNLOCK(mp);
+ }
+
+out:
+ lockmgr(&mp->mnt_explock, LK_RELEASE, NULL);
+ /*
+ * Once we have executed the vfs_export() command, we do
+ * not want to keep the "export" option around in the
+ * options list, since that will cause subsequent MNT_UPDATE
+ * calls to fail. The export information is saved in
+ * mp->mnt_export, so we can safely delete the "export" mount option
+ * here.
+ */
+ vfs_deleteopt(mp->mnt_optnew, "export");
+ vfs_deleteopt(mp->mnt_opt, "export");
+ return (error);
+}
+
+/*
+ * Set the publicly exported filesystem (WebNFS). Currently, only
+ * one public filesystem is possible in the spec (RFC 2054 and 2055)
+ */
+int
+vfs_setpublicfs(struct mount *mp, struct netexport *nep,
+ struct export_args *argp)
+{
+ int error;
+ struct vnode *rvp;
+ char *cp;
+
+ /*
+ * mp == NULL -> invalidate the current info, the FS is
+ * no longer exported. May be called from either vfs_export
+ * or unmount, so check if it hasn't already been done.
+ */
+ if (mp == NULL) {
+ if (nfs_pub.np_valid) {
+ nfs_pub.np_valid = 0;
+ if (nfs_pub.np_index != NULL) {
+ free(nfs_pub.np_index, M_TEMP);
+ nfs_pub.np_index = NULL;
+ }
+ }
+ return (0);
+ }
+
+ /*
+ * Only one allowed at a time.
+ */
+ if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
+ return (EBUSY);
+
+ /*
+ * Get real filehandle for root of exported FS.
+ */
+ bzero(&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
+ nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
+
+ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp)))
+ return (error);
+
+ if ((error = VOP_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
+ return (error);
+
+ vput(rvp);
+
+ /*
+ * If an indexfile was specified, pull it in.
+ */
+ if (argp->ex_indexfile != NULL) {
+		if (nfs_pub.np_index == NULL)
+ nfs_pub.np_index = malloc(MAXNAMLEN + 1, M_TEMP,
+ M_WAITOK);
+ error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
+ MAXNAMLEN, (size_t *)0);
+ if (!error) {
+ /*
+ * Check for illegal filenames.
+ */
+ for (cp = nfs_pub.np_index; *cp; cp++) {
+ if (*cp == '/') {
+ error = EINVAL;
+ break;
+ }
+ }
+ }
+ if (error) {
+ free(nfs_pub.np_index, M_TEMP);
+ nfs_pub.np_index = NULL;
+ return (error);
+ }
+ }
+
+ nfs_pub.np_mount = mp;
+ nfs_pub.np_valid = 1;
+ return (0);
+}
+
+/*
+ * Used by the filesystems to determine if a given network address
+ * (passed in 'nam') is present in their exports list, returns a pointer
+ * to struct netcred so that the filesystem can examine it for
+ * access rights (read/write/etc).
+ */
+static struct netcred *
+vfs_export_lookup(struct mount *mp, struct sockaddr *nam)
+{
+ struct netexport *nep;
+ register struct netcred *np;
+ register struct radix_node_head *rnh;
+ struct sockaddr *saddr;
+
+ nep = mp->mnt_export;
+ if (nep == NULL)
+ return (NULL);
+ np = NULL;
+ if (mp->mnt_flag & MNT_EXPORTED) {
+ /*
+ * Lookup in the export list first.
+ */
+ if (nam != NULL) {
+ saddr = nam;
+ rnh = nep->ne_rtable[saddr->sa_family];
+ if (rnh != NULL) {
+ RADIX_NODE_HEAD_RLOCK(rnh);
+ np = (struct netcred *)
+ (*rnh->rnh_matchaddr)(saddr, rnh);
+ RADIX_NODE_HEAD_RUNLOCK(rnh);
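+				/*
+				 * The radix tree's internal root nodes
+				 * carry RNF_ROOT and are not real exports;
+				 * treat a match on one as no match.
+				 */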
+ if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
+ np = NULL;
+ }
+ }
+ /*
+ * If no address match, use the default if it exists.
+ */
+ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
+ np = &nep->ne_defexported;
+ }
+ return (np);
+}
+
+/*
+ * XXX: This comment comes from the deprecated ufs_check_export()
+ * XXX: and may not entirely apply, but lacking something better:
+ * This is the generic part of fhtovp called after the underlying
+ * filesystem has validated the file handle.
+ *
+ * Verify that a host should have access to a filesystem.
+ */
+
+int
+vfs_stdcheckexp(struct mount *mp, struct sockaddr *nam, int *extflagsp,
+ struct ucred **credanonp, int *numsecflavors, int **secflavors)
+{
+ struct netcred *np;
+
+ lockmgr(&mp->mnt_explock, LK_SHARED, NULL);
+ np = vfs_export_lookup(mp, nam);
+ if (np == NULL) {
+ lockmgr(&mp->mnt_explock, LK_RELEASE, NULL);
+ *credanonp = NULL;
+ return (EACCES);
+ }
+ *extflagsp = np->netc_exflags;
+ if ((*credanonp = np->netc_anon) != NULL)
+ crhold(*credanonp);
+ if (numsecflavors)
+ *numsecflavors = np->netc_numsecflavors;
+ if (secflavors)
+ *secflavors = np->netc_secflavors;
+ lockmgr(&mp->mnt_explock, LK_RELEASE, NULL);
+ return (0);
+}
+
diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c
new file mode 100644
index 0000000..bc7b942
--- /dev/null
+++ b/sys/kern/vfs_extattr.c
@@ -0,0 +1,765 @@
+/*-
+ * Copyright (c) 1999-2001 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/capability.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/fcntl.h>
+#include <sys/namei.h>
+#include <sys/filedesc.h>
+#include <sys/limits.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/extattr.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+/*
+ * Syscall to push extended attribute configuration information into the VFS.
+ * Accepts a path, which it converts to a mountpoint, as well as a command
+ * (int cmd), and attribute name and misc data.
+ *
+ * Currently this is used only by UFS1 extended attributes.
+ */
+int
+sys_extattrctl(td, uap)
+ struct thread *td;
+ struct extattrctl_args /* {
+ const char *path;
+ int cmd;
+ const char *filename;
+ int attrnamespace;
+ const char *attrname;
+ } */ *uap;
+{
+ struct vnode *filename_vp;
+ struct nameidata nd;
+ struct mount *mp, *mp_writable;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_CMD(uap->cmd);
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ /*
+ * uap->attrname is not always defined. We check again later when we
+ * invoke the VFS call so as to pass in NULL there if needed.
+ */
+ if (uap->attrname != NULL) {
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
+ NULL);
+ if (error)
+ return (error);
+ }
+ AUDIT_ARG_TEXT(attrname);
+
+ mp = NULL;
+ filename_vp = NULL;
+ if (uap->filename != NULL) {
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE2,
+ UIO_USERSPACE, uap->filename, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ filename_vp = nd.ni_vp;
+ NDFREE(&nd, NDF_NO_VP_RELE);
+ }
+
+ /* uap->path is always defined. */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
+ UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error)
+ goto out;
+ mp = nd.ni_vp->v_mount;
+ error = vfs_busy(mp, 0);
+ if (error) {
+ NDFREE(&nd, 0);
+ mp = NULL;
+ goto out;
+ }
+ VOP_UNLOCK(nd.ni_vp, 0);
+ error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH);
+ NDFREE(&nd, NDF_NO_VP_UNLOCK);
+ if (error)
+ goto out;
+ if (filename_vp != NULL) {
+ /*
+ * uap->filename is not always defined. If it is,
+ * grab a vnode lock, which VFS_EXTATTRCTL() will
+ * later release.
+ */
+ error = vn_lock(filename_vp, LK_EXCLUSIVE);
+ if (error) {
+ vn_finished_write(mp_writable);
+ goto out;
+ }
+ }
+
+ error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace,
+ uap->attrname != NULL ? attrname : NULL);
+
+ vn_finished_write(mp_writable);
+out:
+ if (mp != NULL)
+ vfs_unbusy(mp);
+
+ /*
+ * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, filename_vp,
+ * so vrele it if it is defined.
+ */
+ if (filename_vp != NULL)
+ vrele(filename_vp);
+ return (error);
+}
+
+/*-
+ * Set a named extended attribute on a file or directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", userspace buffer
+ * pointer "data", buffer length "nbytes", thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ void *data, size_t nbytes, struct thread *td)
+{
+ struct mount *mp;
+ struct uio auio;
+ struct iovec aiov;
+ ssize_t cnt;
+ int error;
+
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+
+ aiov.iov_base = data;
+ aiov.iov_len = nbytes;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ if (nbytes > IOSIZE_MAX) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid = nbytes;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ cnt = nbytes;
+
+#ifdef MAC
+ error = mac_vnode_check_setextattr(td->td_ucred, vp, attrnamespace,
+ attrname);
+ if (error)
+ goto done;
+#endif
+
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio,
+ td->td_ucred, td);
+ cnt -= auio.uio_resid;
+ td->td_retval[0] = cnt;
+
+done:
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+int
+sys_extattr_set_fd(td, uap)
+ struct thread *td;
+ struct extattr_set_fd_args /* {
+ int fd;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct file *fp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_EXTATTR_SET), &fp);
+ if (error)
+ return (error);
+
+ error = extattr_set_vp(fp->f_vnode, uap->attrnamespace,
+ attrname, uap->data, uap->nbytes, td);
+ fdrop(fp, td);
+
+ return (error);
+}
+
+int
+sys_extattr_set_file(td, uap)
+ struct thread *td;
+ struct extattr_set_file_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
+ uap->data, uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+int
+sys_extattr_set_link(td, uap)
+ struct thread *td;
+ struct extattr_set_link_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
+ uap->data, uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*-
+ * Get a named extended attribute on a file or directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", userspace buffer
+ * pointer "data", buffer length "nbytes", thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ void *data, size_t nbytes, struct thread *td)
+{
+ struct uio auio, *auiop;
+ struct iovec aiov;
+ ssize_t cnt;
+ size_t size, *sizep;
+ int error;
+
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+
+ /*
+ * Slightly unusual semantics: if the user provides a NULL data
+ * pointer, they don't want to receive the data, just the maximum
+ * read length.
+ */
+ auiop = NULL;
+ sizep = NULL;
+ cnt = 0;
+ if (data != NULL) {
+ aiov.iov_base = data;
+ aiov.iov_len = nbytes;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ if (nbytes > IOSIZE_MAX) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid = nbytes;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auiop = &auio;
+ cnt = nbytes;
+ } else
+ sizep = &size;
+
+#ifdef MAC
+ error = mac_vnode_check_getextattr(td->td_ucred, vp, attrnamespace,
+ attrname);
+ if (error)
+ goto done;
+#endif
+
+ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep,
+ td->td_ucred, td);
+
+ if (auiop != NULL) {
+ cnt -= auio.uio_resid;
+ td->td_retval[0] = cnt;
+ } else
+ td->td_retval[0] = size;
+
+done:
+ VOP_UNLOCK(vp, 0);
+ return (error);
+}
+
+int
+sys_extattr_get_fd(td, uap)
+ struct thread *td;
+ struct extattr_get_fd_args /* {
+ int fd;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct file *fp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_EXTATTR_GET), &fp);
+ if (error)
+ return (error);
+
+ error = extattr_get_vp(fp->f_vnode, uap->attrnamespace,
+ attrname, uap->data, uap->nbytes, td);
+
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_extattr_get_file(td, uap)
+ struct thread *td;
+ struct extattr_get_file_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
+ uap->data, uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+int
+sys_extattr_get_link(td, uap)
+ struct thread *td;
+ struct extattr_get_link_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path,
+ td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
+ uap->data, uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * extattr_delete_vp(): Delete a named extended attribute on a file or
+ * directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * kernelspace string pointer "attrname", proc "p"
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+ struct thread *td)
+{
+ struct mount *mp;
+ int error;
+
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+
+#ifdef MAC
+ error = mac_vnode_check_deleteextattr(td->td_ucred, vp, attrnamespace,
+ attrname);
+ if (error)
+ goto done;
+#endif
+
+ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred,
+ td);
+ if (error == EOPNOTSUPP)
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
+ td->td_ucred, td);
+#ifdef MAC
+done:
+#endif
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+int
+sys_extattr_delete_fd(td, uap)
+ struct thread *td;
+ struct extattr_delete_fd_args /* {
+ int fd;
+ int attrnamespace;
+ const char *attrname;
+ } */ *uap;
+{
+ struct file *fp;
+ char attrname[EXTATTR_MAXNAMELEN];
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return (error);
+ AUDIT_ARG_TEXT(attrname);
+
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_EXTATTR_DELETE), &fp);
+ if (error)
+ return (error);
+
+ error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace,
+ attrname, td);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_extattr_delete_file(td, uap)
+ struct thread *td;
+ struct extattr_delete_file_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return(error);
+ AUDIT_ARG_TEXT(attrname);
+
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return(error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
+ vrele(nd.ni_vp);
+ return(error);
+}
+
+int
+sys_extattr_delete_link(td, uap)
+ struct thread *td;
+ struct extattr_delete_link_args /* {
+ const char *path;
+ int attrnamespace;
+ const char *attrname;
+ } */ *uap;
+{
+ struct nameidata nd;
+ char attrname[EXTATTR_MAXNAMELEN];
+ int error;
+
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+ if (error)
+ return(error);
+ AUDIT_ARG_TEXT(attrname);
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return(error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
+ vrele(nd.ni_vp);
+ return(error);
+}
+
+/*-
+ * Retrieve a list of extended attributes on a file or directory.
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ * userspace buffer pointer "data", buffer length "nbytes",
+ * thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_list_vp(struct vnode *vp, int attrnamespace, void *data,
+ size_t nbytes, struct thread *td)
+{
+ struct uio auio, *auiop;
+ size_t size, *sizep;
+ struct iovec aiov;
+ ssize_t cnt;
+ int error;
+
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+
+ auiop = NULL;
+ sizep = NULL;
+ cnt = 0;
+ if (data != NULL) {
+ aiov.iov_base = data;
+ aiov.iov_len = nbytes;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ if (nbytes > IOSIZE_MAX) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid = nbytes;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auiop = &auio;
+ cnt = nbytes;
+ } else
+ sizep = &size;
+
+#ifdef MAC
+ error = mac_vnode_check_listextattr(td->td_ucred, vp, attrnamespace);
+ if (error)
+ goto done;
+#endif
+
+ error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep,
+ td->td_ucred, td);
+
+ if (auiop != NULL) {
+ cnt -= auio.uio_resid;
+ td->td_retval[0] = cnt;
+ } else
+ td->td_retval[0] = size;
+
+done:
+ VOP_UNLOCK(vp, 0);
+ return (error);
+}
+
+
+int
+sys_extattr_list_fd(td, uap)
+ struct thread *td;
+ struct extattr_list_fd_args /* {
+ int fd;
+ int attrnamespace;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_EXTATTR_LIST), &fp);
+ if (error)
+ return (error);
+
+ error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data,
+ uap->nbytes, td);
+
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_extattr_list_file(td, uap)
+	struct thread *td;
+ struct extattr_list_file_args /* {
+ const char *path;
+ int attrnamespace;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ int error;
+
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
+ uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+int
+sys_extattr_list_link(td, uap)
+	struct thread *td;
+ struct extattr_list_link_args /* {
+ const char *path;
+ int attrnamespace;
+ void *data;
+ size_t nbytes;
+ } */ *uap;
+{
+ struct nameidata nd;
+ int error;
+
+ AUDIT_ARG_VALUE(uap->attrnamespace);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path,
+ td);
+ error = namei(&nd);
+ if (error)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
+ uap->nbytes, td);
+
+ vrele(nd.ni_vp);
+ return (error);
+}
diff --git a/sys/kern/vfs_hash.c b/sys/kern/vfs_hash.c
new file mode 100644
index 0000000..0271e49
--- /dev/null
+++ b/sys/kern/vfs_hash.c
@@ -0,0 +1,162 @@
+/*-
+ * Copyright (c) 2005 Poul-Henning Kamp
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+
+static MALLOC_DEFINE(M_VFS_HASH, "vfs_hash", "VFS hash table");
+
+static LIST_HEAD(vfs_hash_head, vnode) *vfs_hash_tbl;
+static LIST_HEAD(,vnode) vfs_hash_side;
+static u_long vfs_hash_mask;
+static struct mtx vfs_hash_mtx;
+
+static void
+vfs_hashinit(void *dummy __unused)
+{
+
+ vfs_hash_tbl = hashinit(desiredvnodes, M_VFS_HASH, &vfs_hash_mask);
+ mtx_init(&vfs_hash_mtx, "vfs hash", NULL, MTX_DEF);
+ LIST_INIT(&vfs_hash_side);
+}
+
+/* Must be SI_ORDER_SECOND so desiredvnodes is available */
+SYSINIT(vfs_hash, SI_SUB_VFS, SI_ORDER_SECOND, vfs_hashinit, NULL);
+
+u_int
+vfs_hash_index(struct vnode *vp)
+{
+
+ return (vp->v_hash + vp->v_mount->mnt_hashseed);
+}
+
+static struct vfs_hash_head *
+vfs_hash_bucket(const struct mount *mp, u_int hash)
+{
+
+ return (&vfs_hash_tbl[(hash + mp->mnt_hashseed) & vfs_hash_mask]);
+}
+
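+/*
+ * Look up a vnode by (mount, hash), using an optional comparison callback
+ * to disambiguate hash collisions.  A matching vnode is returned in *vpp,
+ * referenced and locked according to 'flags'; if it is reclaimed while we
+ * wait for its lock (vget() returns ENOENT), the lookup is retried.  *vpp
+ * is set to NULL when no match exists.
+ */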
+int
+vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
+{
+ struct vnode *vp;
+ int error;
+
+ while (1) {
+ mtx_lock(&vfs_hash_mtx);
+ LIST_FOREACH(vp, vfs_hash_bucket(mp, hash), v_hashlist) {
+ if (vp->v_hash != hash)
+ continue;
+ if (vp->v_mount != mp)
+ continue;
+ if (fn != NULL && fn(vp, arg))
+ continue;
+ VI_LOCK(vp);
+ mtx_unlock(&vfs_hash_mtx);
+ error = vget(vp, flags | LK_INTERLOCK, td);
+ if (error == ENOENT && (flags & LK_NOWAIT) == 0)
+ break;
+ if (error)
+ return (error);
+ *vpp = vp;
+ return (0);
+ }
+ if (vp == NULL) {
+ mtx_unlock(&vfs_hash_mtx);
+ *vpp = NULL;
+ return (0);
+ }
+ }
+}
+
+void
+vfs_hash_remove(struct vnode *vp)
+{
+
+ mtx_lock(&vfs_hash_mtx);
+ LIST_REMOVE(vp, v_hashlist);
+ mtx_unlock(&vfs_hash_mtx);
+}
+
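+/*
+ * Insert a vnode into the hash table, unless a vnode with the same
+ * (mount, hash) identity already exists.  On a collision the pre-existing
+ * vnode is acquired with vget() and returned in *vpp, while the caller's
+ * vnode is put on a side list (so vfs_hash_remove() still works for it)
+ * and released with vput(); otherwise the vnode is inserted and *vpp is
+ * left NULL.
+ */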
+int
+vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
+{
+ struct vnode *vp2;
+ int error;
+
+ *vpp = NULL;
+ while (1) {
+ mtx_lock(&vfs_hash_mtx);
+ LIST_FOREACH(vp2,
+ vfs_hash_bucket(vp->v_mount, hash), v_hashlist) {
+ if (vp2->v_hash != hash)
+ continue;
+ if (vp2->v_mount != vp->v_mount)
+ continue;
+ if (fn != NULL && fn(vp2, arg))
+ continue;
+ VI_LOCK(vp2);
+ mtx_unlock(&vfs_hash_mtx);
+ error = vget(vp2, flags | LK_INTERLOCK, td);
+ if (error == ENOENT && (flags & LK_NOWAIT) == 0)
+ break;
+ mtx_lock(&vfs_hash_mtx);
+ LIST_INSERT_HEAD(&vfs_hash_side, vp, v_hashlist);
+ mtx_unlock(&vfs_hash_mtx);
+ vput(vp);
+ if (!error)
+ *vpp = vp2;
+ return (error);
+ }
+ if (vp2 == NULL)
+ break;
+
+ }
+ vp->v_hash = hash;
+ LIST_INSERT_HEAD(vfs_hash_bucket(vp->v_mount, hash), vp, v_hashlist);
+ mtx_unlock(&vfs_hash_mtx);
+ return (0);
+}
+
+void
+vfs_hash_rehash(struct vnode *vp, u_int hash)
+{
+
+ mtx_lock(&vfs_hash_mtx);
+ LIST_REMOVE(vp, v_hashlist);
+ LIST_INSERT_HEAD(vfs_hash_bucket(vp->v_mount, hash), vp, v_hashlist);
+ vp->v_hash = hash;
+ mtx_unlock(&vfs_hash_mtx);
+}
diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c
new file mode 100644
index 0000000..eab48fb
--- /dev/null
+++ b/sys/kern/vfs_init.c
@@ -0,0 +1,344 @@
+/*-
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed
+ * to Berkeley by John Heidemann of the UCLA Ficus project.
+ *
+ * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_init.c 8.3 (Berkeley) 1/4/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fnv_hash.h>
+#include <sys/kernel.h>
+#include <sys/linker.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+
+static int vfs_register(struct vfsconf *);
+static int vfs_unregister(struct vfsconf *);
+
+MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");
+
+/*
+ * The highest defined VFS number.
+ */
+int maxvfsconf = VFS_GENERIC + 1;
+
+/*
+ * Single-linked list of configured VFSes.
+ * New entries are added/deleted by vfs_register()/vfs_unregister()
+ */
+struct vfsconfhead vfsconf = TAILQ_HEAD_INITIALIZER(vfsconf);
+
+/*
+ * Loader.conf variable vfs.typenumhash enables setting vfc_typenum using a hash
+ * calculation on vfc_name, so that it doesn't change when file systems are
+ * loaded in a different order. This keeps the NFS server file handles from
+ * changing for file systems that use vfc_typenum in their fsid.
+ */
+static int vfs_typenumhash = 1;
+TUNABLE_INT("vfs.typenumhash", &vfs_typenumhash);
+SYSCTL_INT(_vfs, OID_AUTO, typenumhash, CTLFLAG_RDTUN, &vfs_typenumhash, 0,
+    "Set vfc_typenum using a hash calculation on vfc_name, so that it does not "
+ "change when file systems are loaded in a different order.");
+
+/*
+ * A Zen vnode attribute structure.
+ *
+ * Initialized when the first filesystem registers by vfs_register().
+ */
+struct vattr va_null;
+
+/*
+ * vfs_init.c
+ *
+ * Allocate and fill in operations vectors.
+ *
+ * An undocumented feature of this approach to defining operations is that
+ * there can be multiple entries in vfs_opv_descs for the same operations
+ * vector. This allows third parties to extend the set of operations
+ * supported by another layer in a binary compatible way. For example,
+ * assume that NFS needed to be modified to support Ficus. NFS has an entry
+ * (probably nfs_vnopdeop_decls) declaring all the operations NFS supports by
+ * default. Ficus could add another entry (ficus_nfs_vnodeop_decl_entensions)
+ * listing those new operations Ficus adds to NFS, all without modifying the
+ * NFS code. (Of couse, the OTW NFS protocol still needs to be munged, but
+ * that is a(whole)nother story.) This is a feature.
+ */
+
+/*
+ * Routines having to do with the management of the vnode table.
+ */
+
+struct vfsconf *
+vfs_byname(const char *name)
+{
+ struct vfsconf *vfsp;
+
+ if (!strcmp(name, "ffs"))
+ name = "ufs";
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
+ if (!strcmp(name, vfsp->vfc_name))
+ return (vfsp);
+ return (NULL);
+}
+
+struct vfsconf *
+vfs_byname_kld(const char *fstype, struct thread *td, int *error)
+{
+ struct vfsconf *vfsp;
+ int fileid, loaded;
+
+ vfsp = vfs_byname(fstype);
+ if (vfsp != NULL)
+ return (vfsp);
+
+ /* Try to load the respective module. */
+ *error = kern_kldload(td, fstype, &fileid);
+ loaded = (*error == 0);
+ if (*error == EEXIST)
+ *error = 0;
+ if (*error)
+ return (NULL);
+
+ /* Look up again to see if the VFS was loaded. */
+ vfsp = vfs_byname(fstype);
+ if (vfsp == NULL) {
+ if (loaded)
+ (void)kern_kldunload(td, fileid, LINKER_UNLOAD_FORCE);
+ *error = ENODEV;
+ return (NULL);
+ }
+ return (vfsp);
+}
+
+
+/* Register a new filesystem type in the global table */
+static int
+vfs_register(struct vfsconf *vfc)
+{
+ struct sysctl_oid *oidp;
+ struct vfsops *vfsops;
+ static int once;
+ struct vfsconf *tvfc;
+ uint32_t hashval;
+ int secondpass;
+
+ if (!once) {
+ vattr_null(&va_null);
+ once = 1;
+ }
+
+ if (vfc->vfc_version != VFS_VERSION) {
+ printf("ERROR: filesystem %s, unsupported ABI version %x\n",
+ vfc->vfc_name, vfc->vfc_version);
+ return (EINVAL);
+ }
+ if (vfs_byname(vfc->vfc_name) != NULL)
+ return (EEXIST);
+
+ if (vfs_typenumhash != 0) {
+ /*
+ * Calculate a hash on vfc_name to use for vfc_typenum. Unless
+ * all values in 1..255 are already assigned, the result is kept
+ * within 8 bits, since that is what ZFS uses from vfc_typenum and
+ * is also the preferred range for vfs_getnewfsid().
+ */
+ hashval = fnv_32_str(vfc->vfc_name, FNV1_32_INIT);
+ hashval &= 0xff;
+ secondpass = 0;
+ do {
+ /* Look for and fix any collision. */
+ TAILQ_FOREACH(tvfc, &vfsconf, vfc_list) {
+ if (hashval == tvfc->vfc_typenum) {
+ if (hashval == 255 && secondpass == 0) {
+ hashval = 1;
+ secondpass = 1;
+ } else
+ hashval++;
+ break;
+ }
+ }
+ } while (tvfc != NULL);
+ vfc->vfc_typenum = hashval;
+ if (vfc->vfc_typenum >= maxvfsconf)
+ maxvfsconf = vfc->vfc_typenum + 1;
+ } else
+ vfc->vfc_typenum = maxvfsconf++;
+ TAILQ_INSERT_TAIL(&vfsconf, vfc, vfc_list);
+
+ /*
+ * If this filesystem has a sysctl node under vfs
+ * (i.e. vfs.xxfs), then change the oid number of that node to
+ * match the filesystem's type number. This allows user code
+ * which uses the type number to read sysctl variables defined
+ * by the filesystem to continue working. Since the oids are
+ * in a sorted list, we need to make sure the order is
+ * preserved by re-registering the oid after modifying its
+ * number.
+ */
+ sysctl_lock();
+ SLIST_FOREACH(oidp, &sysctl__vfs_children, oid_link)
+ if (strcmp(oidp->oid_name, vfc->vfc_name) == 0) {
+ sysctl_unregister_oid(oidp);
+ oidp->oid_number = vfc->vfc_typenum;
+ sysctl_register_oid(oidp);
+ break;
+ }
+ sysctl_unlock();
+
+ /*
+ * Initialise unused ``struct vfsops'' fields, to use
+ * the vfs_std*() functions. Note, we need the mount
+ * and unmount operations, at the least. The check
+ * for vfsops available is just a debugging aid.
+ */
+ KASSERT(vfc->vfc_vfsops != NULL,
+ ("Filesystem %s has no vfsops", vfc->vfc_name));
+ /*
+ * Check the mount and unmount operations.
+ */
+ vfsops = vfc->vfc_vfsops;
+ KASSERT(vfsops->vfs_mount != NULL,
+ ("Filesystem %s has no mount op", vfc->vfc_name));
+ KASSERT(vfsops->vfs_unmount != NULL,
+ ("Filesystem %s has no unmount op", vfc->vfc_name));
+
+ if (vfsops->vfs_root == NULL)
+ /* return file system's root vnode */
+ vfsops->vfs_root = vfs_stdroot;
+ if (vfsops->vfs_quotactl == NULL)
+ /* quota control */
+ vfsops->vfs_quotactl = vfs_stdquotactl;
+ if (vfsops->vfs_statfs == NULL)
+ /* return file system's status */
+ vfsops->vfs_statfs = vfs_stdstatfs;
+ if (vfsops->vfs_sync == NULL)
+ /*
+ * flush unwritten data (nosync)
+ * file systems can use vfs_stdsync
+ * explicitly by setting it in the
+ * vfsop vector.
+ */
+ vfsops->vfs_sync = vfs_stdnosync;
+ if (vfsops->vfs_vget == NULL)
+ /* convert an inode number to a vnode */
+ vfsops->vfs_vget = vfs_stdvget;
+ if (vfsops->vfs_fhtovp == NULL)
+ /* turn an NFS file handle into a vnode */
+ vfsops->vfs_fhtovp = vfs_stdfhtovp;
+ if (vfsops->vfs_checkexp == NULL)
+ /* check if file system is exported */
+ vfsops->vfs_checkexp = vfs_stdcheckexp;
+ if (vfsops->vfs_init == NULL)
+ /* file system specific initialisation */
+ vfsops->vfs_init = vfs_stdinit;
+ if (vfsops->vfs_uninit == NULL)
+ /* file system specific uninitialisation */
+ vfsops->vfs_uninit = vfs_stduninit;
+ if (vfsops->vfs_extattrctl == NULL)
+ /* extended attribute control */
+ vfsops->vfs_extattrctl = vfs_stdextattrctl;
+ if (vfsops->vfs_sysctl == NULL)
+ vfsops->vfs_sysctl = vfs_stdsysctl;
+
+ /*
+ * Call init function for this VFS...
+ */
+ (*(vfc->vfc_vfsops->vfs_init))(vfc);
+
+ return (0);
+}
+
+
+/* Remove registration of a filesystem type */
+static int
+vfs_unregister(struct vfsconf *vfc)
+{
+ struct vfsconf *vfsp;
+ int error, i, maxtypenum;
+
+ i = vfc->vfc_typenum;
+
+ vfsp = vfs_byname(vfc->vfc_name);
+ if (vfsp == NULL)
+ return (EINVAL);
+ if (vfsp->vfc_refcount)
+ return (EBUSY);
+ if (vfc->vfc_vfsops->vfs_uninit != NULL) {
+ error = (*vfc->vfc_vfsops->vfs_uninit)(vfsp);
+ if (error)
+ return (error);
+ }
+ TAILQ_REMOVE(&vfsconf, vfsp, vfc_list);
+ maxtypenum = VFS_GENERIC;
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
+ if (maxtypenum < vfsp->vfc_typenum)
+ maxtypenum = vfsp->vfc_typenum;
+ maxvfsconf = maxtypenum + 1;
+ return (0);
+}
+
+/*
+ * Standard kernel module handling code for filesystem modules.
+ * Referenced from VFS_SET().
+ */
+int
+vfs_modevent(module_t mod, int type, void *data)
+{
+ struct vfsconf *vfc;
+ int error = 0;
+
+ vfc = (struct vfsconf *)data;
+
+ switch (type) {
+ case MOD_LOAD:
+ if (vfc)
+ error = vfs_register(vfc);
+ break;
+
+ case MOD_UNLOAD:
+ if (vfc)
+ error = vfs_unregister(vfc);
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+ return (error);
+}
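+
+/*
+ * A minimal sketch of how a filesystem module typically reaches
+ * vfs_modevent(): it fills in only the vfsops it implements, relies on
+ * vfs_register() to supply the vfs_std*() defaults for the rest, and
+ * announces itself with VFS_SET().  The myfs_* names are hypothetical
+ * (assumed to be defined elsewhere) and the block is not compiled.
+ */
+#if 0
+static struct vfsops myfs_vfsops = {
+ .vfs_mount = myfs_mount, /* required, see the KASSERT in vfs_register() */
+ .vfs_unmount = myfs_unmount, /* required */
+ .vfs_statfs = myfs_statfs, /* optional; defaults to vfs_stdstatfs */
+};
+
+/* Expands to a struct vfsconf plus a DECLARE_MODULE() using vfs_modevent. */
+VFS_SET(myfs_vfsops, myfs, 0);
+#endif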
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
new file mode 100644
index 0000000..d4d0166
--- /dev/null
+++ b/sys/kern/vfs_lookup.c
@@ -0,0 +1,1254 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/capability.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/filedesc.h>
+#include <sys/proc.h>
+#include <sys/sdt.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/uma.h>
+
+#define NAMEI_DIAGNOSTIC 1
+#undef NAMEI_DIAGNOSTIC
+
+SDT_PROVIDER_DECLARE(vfs);
+SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, entry, "struct vnode *", "char *",
+ "unsigned long");
+SDT_PROBE_DEFINE2(vfs, namei, lookup, return, return, "int", "struct vnode *");
+
+/*
+ * Allocation zone for namei
+ */
+uma_zone_t namei_zone;
+/*
+ * Placeholder vnode for mount point traversal
+ */
+static struct vnode *vp_crossmp;
+
+static void
+nameiinit(void *dummy __unused)
+{
+
+ namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+ getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp);
+ vn_lock(vp_crossmp, LK_EXCLUSIVE);
+ VN_LOCK_ASHARE(vp_crossmp);
+ VOP_UNLOCK(vp_crossmp, 0);
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL);
+
+static int lookup_shared = 1;
+SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RW, &lookup_shared, 0,
+ "Enables/Disables shared locks for path name translation");
+TUNABLE_INT("vfs.lookup_shared", &lookup_shared);
+
+/*
+ * Convert a pathname into a pointer to a locked vnode.
+ *
+ * The FOLLOW flag is set when symbolic links are to be followed
+ * when they occur at the end of the name translation process.
+ * Symbolic links are always followed for all pathname
+ * components other than the last.
+ *
+ * The segflg defines whether the name is to be copied from user
+ * space or kernel space.
+ *
+ * Overall outline of namei:
+ *
+ * copy in name
+ * get starting directory
+ * while (!done && !error) {
+ * call lookup to search path.
+ * if symbolic link, massage name in buffer and continue
+ * }
+ */
+int
+namei(struct nameidata *ndp)
+{
+ struct filedesc *fdp; /* pointer to file descriptor state */
+ char *cp; /* pointer into pathname argument */
+ struct vnode *dp; /* the directory we are searching */
+ struct iovec aiov; /* uio for reading symbolic links */
+ struct uio auio;
+ int error, linklen;
+ struct componentname *cnp = &ndp->ni_cnd;
+ struct thread *td = cnp->cn_thread;
+ struct proc *p = td->td_proc;
+
+ ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
+ KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
+ KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
+ ("namei: nameiop contaminated with flags"));
+ KASSERT((cnp->cn_flags & OPMASK) == 0,
+ ("namei: flags contaminated with nameiops"));
+ if (!lookup_shared)
+ cnp->cn_flags &= ~LOCKSHARED;
+ fdp = p->p_fd;
+
+ /* We will set this ourselves if we need it. */
+ cnp->cn_flags &= ~TRAILINGSLASH;
+
+ /*
+ * Get a buffer for the name to be translated, and copy the
+ * name into the buffer.
+ */
+ if ((cnp->cn_flags & HASBUF) == 0)
+ cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
+ if (ndp->ni_segflg == UIO_SYSSPACE)
+ error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
+ MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
+ else
+ error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
+ MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
+
+ /*
+ * Don't allow empty pathnames.
+ */
+ if (!error && *cnp->cn_pnbuf == '\0')
+ error = ENOENT;
+
+#ifdef CAPABILITY_MODE
+ /*
+ * In capability mode, lookups must be "strictly relative" (i.e.
+ * not an absolute path, and not containing '..' components) to
+ * a real file descriptor, not the pseudo-descriptor AT_FDCWD.
+ */
+ if (error == 0 && IN_CAPABILITY_MODE(td) &&
+ (cnp->cn_flags & NOCAPCHECK) == 0) {
+ ndp->ni_strictrelative = 1;
+ if (ndp->ni_dirfd == AT_FDCWD) {
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_LOOKUP, 0, 0);
+#endif
+ error = ECAPMODE;
+ }
+ }
+#endif
+ if (error) {
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+#ifdef DIAGNOSTIC
+ cnp->cn_pnbuf = NULL;
+ cnp->cn_nameptr = NULL;
+#endif
+ ndp->ni_vp = NULL;
+ return (error);
+ }
+ ndp->ni_loopcnt = 0;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_NAMEI)) {
+ KASSERT(cnp->cn_thread == curthread,
+ ("namei not using curthread"));
+ ktrnamei(cnp->cn_pnbuf);
+ }
+#endif
+ /*
+ * Get starting point for the translation.
+ */
+ FILEDESC_SLOCK(fdp);
+ ndp->ni_rootdir = fdp->fd_rdir;
+ ndp->ni_topdir = fdp->fd_jdir;
+
+ /*
+ * If we are auditing the kernel pathname, save the user pathname.
+ */
+ if (cnp->cn_flags & AUDITVNODE1)
+ AUDIT_ARG_UPATH1(td, ndp->ni_dirfd, cnp->cn_pnbuf);
+ if (cnp->cn_flags & AUDITVNODE2)
+ AUDIT_ARG_UPATH2(td, ndp->ni_dirfd, cnp->cn_pnbuf);
+
+ dp = NULL;
+ if (cnp->cn_pnbuf[0] != '/') {
+ if (ndp->ni_startdir != NULL) {
+ dp = ndp->ni_startdir;
+ error = 0;
+ } else if (ndp->ni_dirfd != AT_FDCWD) {
+ cap_rights_t rights;
+
+ rights = ndp->ni_rightsneeded;
+ cap_rights_set(&rights, CAP_LOOKUP);
+
+ if (cnp->cn_flags & AUDITVNODE1)
+ AUDIT_ARG_ATFD1(ndp->ni_dirfd);
+ if (cnp->cn_flags & AUDITVNODE2)
+ AUDIT_ARG_ATFD2(ndp->ni_dirfd);
+ error = fgetvp_rights(td, ndp->ni_dirfd,
+ &rights, &ndp->ni_filecaps, &dp);
+#ifdef CAPABILITIES
+ /*
+ * If file descriptor doesn't have all rights,
+ * all lookups relative to it must also be
+ * strictly relative.
+ */
+ CAP_ALL(&rights);
+ if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights,
+ &rights) ||
+ ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
+ ndp->ni_filecaps.fc_nioctls != -1) {
+ ndp->ni_strictrelative = 1;
+ }
+#endif
+ }
+ if (error != 0 || dp != NULL) {
+ FILEDESC_SUNLOCK(fdp);
+ if (error == 0 && dp->v_type != VDIR) {
+ vrele(dp);
+ error = ENOTDIR;
+ }
+ }
+ if (error) {
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+#ifdef DIAGNOSTIC
+ cnp->cn_pnbuf = NULL;
+ cnp->cn_nameptr = NULL;
+#endif
+ return (error);
+ }
+ }
+ if (dp == NULL) {
+ dp = fdp->fd_cdir;
+ VREF(dp);
+ FILEDESC_SUNLOCK(fdp);
+ if (ndp->ni_startdir != NULL)
+ vrele(ndp->ni_startdir);
+ }
+ SDT_PROBE(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
+ cnp->cn_flags, 0, 0);
+ for (;;) {
+ /*
+ * Check if root directory should replace current directory.
+ * Done at start of translation and after symbolic link.
+ */
+ cnp->cn_nameptr = cnp->cn_pnbuf;
+ if (*(cnp->cn_nameptr) == '/') {
+ vrele(dp);
+ if (ndp->ni_strictrelative != 0) {
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_LOOKUP, 0, 0);
+#endif
+ return (ENOTCAPABLE);
+ }
+ while (*(cnp->cn_nameptr) == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+ dp = ndp->ni_rootdir;
+ VREF(dp);
+ }
+ ndp->ni_startdir = dp;
+ error = lookup(ndp);
+ if (error) {
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+#ifdef DIAGNOSTIC
+ cnp->cn_pnbuf = NULL;
+ cnp->cn_nameptr = NULL;
+#endif
+ SDT_PROBE(vfs, namei, lookup, return, error, NULL, 0,
+ 0, 0);
+ return (error);
+ }
+ /*
+ * If not a symbolic link, we're done.
+ */
+ if ((cnp->cn_flags & ISSYMLINK) == 0) {
+ if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) {
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+#ifdef DIAGNOSTIC
+ cnp->cn_pnbuf = NULL;
+ cnp->cn_nameptr = NULL;
+#endif
+ } else
+ cnp->cn_flags |= HASBUF;
+
+ SDT_PROBE(vfs, namei, lookup, return, 0, ndp->ni_vp,
+ 0, 0, 0);
+ return (0);
+ }
+ if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
+ error = ELOOP;
+ break;
+ }
+#ifdef MAC
+ if ((cnp->cn_flags & NOMACCHECK) == 0) {
+ error = mac_vnode_check_readlink(td->td_ucred,
+ ndp->ni_vp);
+ if (error)
+ break;
+ }
+#endif
+ if (ndp->ni_pathlen > 1)
+ cp = uma_zalloc(namei_zone, M_WAITOK);
+ else
+ cp = cnp->cn_pnbuf;
+ aiov.iov_base = cp;
+ aiov.iov_len = MAXPATHLEN;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_td = td;
+ auio.uio_resid = MAXPATHLEN;
+ error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
+ if (error) {
+ if (ndp->ni_pathlen > 1)
+ uma_zfree(namei_zone, cp);
+ break;
+ }
+ linklen = MAXPATHLEN - auio.uio_resid;
+ if (linklen == 0) {
+ if (ndp->ni_pathlen > 1)
+ uma_zfree(namei_zone, cp);
+ error = ENOENT;
+ break;
+ }
+ if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
+ if (ndp->ni_pathlen > 1)
+ uma_zfree(namei_zone, cp);
+ error = ENAMETOOLONG;
+ break;
+ }
+ if (ndp->ni_pathlen > 1) {
+ bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+ cnp->cn_pnbuf = cp;
+ } else
+ cnp->cn_pnbuf[linklen] = '\0';
+ ndp->ni_pathlen += linklen;
+ vput(ndp->ni_vp);
+ dp = ndp->ni_dvp;
+ }
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+#ifdef DIAGNOSTIC
+ cnp->cn_pnbuf = NULL;
+ cnp->cn_nameptr = NULL;
+#endif
+ vput(ndp->ni_vp);
+ ndp->ni_vp = NULL;
+ vrele(ndp->ni_dvp);
+ SDT_PROBE(vfs, namei, lookup, return, error, NULL, 0, 0, 0);
+ return (error);
+}
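+
+/*
+ * A minimal sketch of the calling convention outlined in the comment above
+ * namei(): translate a kernel-space path into a locked, referenced vnode and
+ * release the pathname buffer.  The helper name is hypothetical and the
+ * block is not compiled.
+ */
+#if 0
+static int
+example_path_to_vnode(struct thread *td, const char *path, struct vnode **vpp)
+{
+ struct nameidata nd;
+ int error;
+
+ /* FOLLOW: follow a trailing symlink; LOCKLEAF: return the vnode locked. */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, td);
+ error = namei(&nd);
+ if (error != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF); /* free cn_pnbuf, keep the vnode */
+ *vpp = nd.ni_vp; /* locked and referenced; the caller vput()s it */
+ return (0);
+}
+#endif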
+
+static int
+compute_cn_lkflags(struct mount *mp, int lkflags, int cnflags)
+{
+
+ if (mp == NULL || ((lkflags & LK_SHARED) &&
+ (!(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED) ||
+ ((cnflags & ISDOTDOT) &&
+ (mp->mnt_kern_flag & MNTK_LOOKUP_EXCL_DOTDOT))))) {
+ lkflags &= ~LK_SHARED;
+ lkflags |= LK_EXCLUSIVE;
+ }
+ return (lkflags);
+}
+
+static __inline int
+needs_exclusive_leaf(struct mount *mp, int flags)
+{
+
+ /*
+ * Intermediate nodes can use shared locks, we only need to
+ * force an exclusive lock for leaf nodes.
+ */
+ if ((flags & (ISLASTCN | LOCKLEAF)) != (ISLASTCN | LOCKLEAF))
+ return (0);
+
+ /* Always use exclusive locks if LOCKSHARED isn't set. */
+ if (!(flags & LOCKSHARED))
+ return (1);
+
+ /*
+ * For lookups during open(), if the mount point supports
+ * extended shared operations, then use a shared lock for the
+ * leaf node, otherwise use an exclusive lock.
+ */
+ if (flags & ISOPEN) {
+ if (mp != NULL &&
+ (mp->mnt_kern_flag & MNTK_EXTENDED_SHARED))
+ return (0);
+ else
+ return (1);
+ }
+
+ /*
+ * Lookup requests outside of open() that specify LOCKSHARED
+ * only need a shared lock on the leaf vnode.
+ */
+ return (0);
+}
+
+/*
+ * Search a pathname.
+ * This is a very central and rather complicated routine.
+ *
+ * The pathname is pointed to by cnp->cn_nameptr and is of length
+ * ni_pathlen. The starting directory is taken from ni_startdir. The
+ * pathname is descended until done, or a symbolic link is encountered.
+ * If a symbolic link needing interpretation is encountered, the
+ * ISSYMLINK flag is set in cn_flags so that the caller (namei()) can
+ * interpret it.
+ *
+ * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
+ * whether the name is to be looked up, created, renamed, or deleted.
+ * When CREATE, RENAME, or DELETE is specified, information usable in
+ * creating, renaming, or deleting a directory entry may be calculated.
+ * If flag has LOCKPARENT or'ed into it, the parent directory is returned
+ * locked. If flag has WANTPARENT or'ed into it, the parent directory is
+ * returned unlocked. Otherwise the parent directory is not returned. If
+ * the target of the pathname exists and LOCKLEAF is or'ed into the flag
+ * the target is returned locked, otherwise it is returned unlocked.
+ * When creating or renaming and LOCKPARENT is specified, the target may not
+ * be ".". When deleting and LOCKPARENT is specified, the target may be ".".
+ *
+ * Overall outline of lookup:
+ *
+ * dirloop:
+ * identify next component of name at ndp->ni_ptr
+ * handle degenerate case where name is null string
+ * if .. and crossing mount points and on mounted filesys, find parent
+ * call VOP_LOOKUP routine for next component name
+ * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
+ * component vnode returned in ni_vp (if it exists), locked.
+ * if result vnode is mounted on and crossing mount points,
+ * find mounted on vnode
+ * if more components of name, do next level at dirloop
+ * return the answer in ni_vp, locked if LOCKLEAF set
+ * if LOCKPARENT set, return locked parent in ni_dvp
+ * if WANTPARENT set, return unlocked parent in ni_dvp
+ */
+int
+lookup(struct nameidata *ndp)
+{
+ char *cp; /* pointer into pathname argument */
+ struct vnode *dp = 0; /* the directory we are searching */
+ struct vnode *tdp; /* saved dp */
+ struct mount *mp; /* mount table entry */
+ struct prison *pr;
+ int docache; /* == 0 do not cache last component */
+ int wantparent; /* 1 => wantparent or lockparent flag */
+ int rdonly; /* lookup read-only flag bit */
+ int error = 0;
+ int dpunlocked = 0; /* dp has already been unlocked */
+ struct componentname *cnp = &ndp->ni_cnd;
+ int lkflags_save;
+ int ni_dvp_unlocked;
+
+ /*
+ * Setup: break out flag bits into variables.
+ */
+ ni_dvp_unlocked = 0;
+ wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
+ KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
+ ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
+ docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
+ if (cnp->cn_nameiop == DELETE ||
+ (wantparent && cnp->cn_nameiop != CREATE &&
+ cnp->cn_nameiop != LOOKUP))
+ docache = 0;
+ rdonly = cnp->cn_flags & RDONLY;
+ cnp->cn_flags &= ~ISSYMLINK;
+ ndp->ni_dvp = NULL;
+ /*
+ * We use shared locks until we hit the parent of the last cn then
+ * we adjust based on the requesting flags.
+ */
+ if (lookup_shared)
+ cnp->cn_lkflags = LK_SHARED;
+ else
+ cnp->cn_lkflags = LK_EXCLUSIVE;
+ dp = ndp->ni_startdir;
+ ndp->ni_startdir = NULLVP;
+ vn_lock(dp,
+ compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY,
+ cnp->cn_flags));
+
+dirloop:
+ /*
+ * Search a new directory.
+ *
+ * The last component of the filename is left accessible via
+ * cnp->cn_nameptr for callers that need the name. Callers needing
+ * the name set the SAVENAME flag. When done, they assume
+ * responsibility for freeing the pathname buffer.
+ */
+ cnp->cn_consume = 0;
+ for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
+ continue;
+ cnp->cn_namelen = cp - cnp->cn_nameptr;
+ if (cnp->cn_namelen > NAME_MAX) {
+ error = ENAMETOOLONG;
+ goto bad;
+ }
+#ifdef NAMEI_DIAGNOSTIC
+ { char c = *cp;
+ *cp = '\0';
+ printf("{%s}: ", cnp->cn_nameptr);
+ *cp = c; }
+#endif
+ ndp->ni_pathlen -= cnp->cn_namelen;
+ ndp->ni_next = cp;
+
+ /*
+ * Replace multiple slashes by a single slash and trailing slashes
+ * by a null. This must be done before VOP_LOOKUP() because some
+ * fs's don't know about trailing slashes. Remember if there were
+ * trailing slashes to handle symlinks, existing non-directories
+ * and non-existing files that won't be directories specially later.
+ */
+ while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
+ cp++;
+ ndp->ni_pathlen--;
+ if (*cp == '\0') {
+ *ndp->ni_next = '\0';
+ cnp->cn_flags |= TRAILINGSLASH;
+ }
+ }
+ ndp->ni_next = cp;
+
+ cnp->cn_flags |= MAKEENTRY;
+ if (*cp == '\0' && docache == 0)
+ cnp->cn_flags &= ~MAKEENTRY;
+ if (cnp->cn_namelen == 2 &&
+ cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
+ cnp->cn_flags |= ISDOTDOT;
+ else
+ cnp->cn_flags &= ~ISDOTDOT;
+ if (*ndp->ni_next == 0)
+ cnp->cn_flags |= ISLASTCN;
+ else
+ cnp->cn_flags &= ~ISLASTCN;
+
+ if ((cnp->cn_flags & ISLASTCN) != 0 &&
+ cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ error = EINVAL;
+ goto bad;
+ }
+
+ /*
+ * Check for degenerate name (e.g. / or "")
+ * which is a way of talking about a directory,
+ * e.g. like "/." or ".".
+ */
+ if (cnp->cn_nameptr[0] == '\0') {
+ if (dp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto bad;
+ }
+ if (cnp->cn_nameiop != LOOKUP) {
+ error = EISDIR;
+ goto bad;
+ }
+ if (wantparent) {
+ ndp->ni_dvp = dp;
+ VREF(dp);
+ }
+ ndp->ni_vp = dp;
+
+ if (cnp->cn_flags & AUDITVNODE1)
+ AUDIT_ARG_VNODE1(dp);
+ else if (cnp->cn_flags & AUDITVNODE2)
+ AUDIT_ARG_VNODE2(dp);
+
+ if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
+ VOP_UNLOCK(dp, 0);
+ /* XXX This should probably move to the top of function. */
+ if (cnp->cn_flags & SAVESTART)
+ panic("lookup: SAVESTART");
+ goto success;
+ }
+
+ /*
+ * Handle "..": five special cases.
+ * 0. If doing a capability lookup, return ENOTCAPABLE (this is a
+ * fairly conservative design choice, but it's the only one that we
+ * are satisfied guarantees the property we're looking for).
+ * 1. Return an error if this is the last component of
+ * the name and the operation is DELETE or RENAME.
+ * 2. If at root directory (e.g. after chroot)
+ * or at absolute root directory
+ * then ignore it so can't get out.
+ * 3. If this vnode is the root of a mounted
+ * filesystem, then replace it with the
+ * vnode which was mounted on so we take the
+ * .. in the other filesystem.
+ * 4. If the vnode is the top directory of
+ * the jail or chroot, don't let them out.
+ */
+ if (cnp->cn_flags & ISDOTDOT) {
+ if (ndp->ni_strictrelative != 0) {
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_LOOKUP, 0, 0);
+#endif
+ error = ENOTCAPABLE;
+ goto bad;
+ }
+ if ((cnp->cn_flags & ISLASTCN) != 0 &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ error = EINVAL;
+ goto bad;
+ }
+ for (;;) {
+ for (pr = cnp->cn_cred->cr_prison; pr != NULL;
+ pr = pr->pr_parent)
+ if (dp == pr->pr_root)
+ break;
+ if (dp == ndp->ni_rootdir ||
+ dp == ndp->ni_topdir ||
+ dp == rootvnode ||
+ pr != NULL ||
+ ((dp->v_vflag & VV_ROOT) != 0 &&
+ (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
+ ndp->ni_dvp = dp;
+ ndp->ni_vp = dp;
+ VREF(dp);
+ goto nextname;
+ }
+ if ((dp->v_vflag & VV_ROOT) == 0)
+ break;
+ if (dp->v_iflag & VI_DOOMED) { /* forced unmount */
+ error = ENOENT;
+ goto bad;
+ }
+ tdp = dp;
+ dp = dp->v_mount->mnt_vnodecovered;
+ VREF(dp);
+ vput(tdp);
+ vn_lock(dp,
+ compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
+ LK_RETRY, ISDOTDOT));
+ }
+ }
+
+ /*
+ * We now have a segment name to search for, and a directory to search.
+ */
+unionlookup:
+#ifdef MAC
+ if ((cnp->cn_flags & NOMACCHECK) == 0) {
+ error = mac_vnode_check_lookup(cnp->cn_thread->td_ucred, dp,
+ cnp);
+ if (error)
+ goto bad;
+ }
+#endif
+ ndp->ni_dvp = dp;
+ ndp->ni_vp = NULL;
+ ASSERT_VOP_LOCKED(dp, "lookup");
+ /*
+ * If we have a shared lock we may need to upgrade the lock for the
+ * last operation.
+ */
+ if (dp != vp_crossmp &&
+ VOP_ISLOCKED(dp) == LK_SHARED &&
+ (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT))
+ vn_lock(dp, LK_UPGRADE|LK_RETRY);
+ if ((dp->v_iflag & VI_DOOMED) != 0) {
+ error = ENOENT;
+ goto bad;
+ }
+ /*
+ * If we're looking up the last component and we need an exclusive
+ * lock, adjust our lkflags.
+ */
+ if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags))
+ cnp->cn_lkflags = LK_EXCLUSIVE;
+#ifdef NAMEI_DIAGNOSTIC
+ vprint("lookup in", dp);
+#endif
+ lkflags_save = cnp->cn_lkflags;
+ cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags,
+ cnp->cn_flags);
+ if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) {
+ cnp->cn_lkflags = lkflags_save;
+ KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
+#ifdef NAMEI_DIAGNOSTIC
+ printf("not found\n");
+#endif
+ if ((error == ENOENT) &&
+ (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
+ (dp->v_mount->mnt_flag & MNT_UNION)) {
+ tdp = dp;
+ dp = dp->v_mount->mnt_vnodecovered;
+ VREF(dp);
+ vput(tdp);
+ vn_lock(dp,
+ compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
+ LK_RETRY, cnp->cn_flags));
+ goto unionlookup;
+ }
+
+ if (error != EJUSTRETURN)
+ goto bad;
+ /*
+ * At this point, we know we're at the end of the
+ * pathname. If creating / renaming, we can consider
+ * allowing the file or directory to be created / renamed,
+ * provided we're not on a read-only filesystem.
+ */
+ if (rdonly) {
+ error = EROFS;
+ goto bad;
+ }
+ /* trailing slash only allowed for directories */
+ if ((cnp->cn_flags & TRAILINGSLASH) &&
+ !(cnp->cn_flags & WILLBEDIR)) {
+ error = ENOENT;
+ goto bad;
+ }
+ if ((cnp->cn_flags & LOCKPARENT) == 0)
+ VOP_UNLOCK(dp, 0);
+ /*
+ * We return with ni_vp NULL to indicate that the entry
+ * doesn't currently exist, leaving a pointer to the
+ * (possibly locked) directory vnode in ndp->ni_dvp.
+ */
+ if (cnp->cn_flags & SAVESTART) {
+ ndp->ni_startdir = ndp->ni_dvp;
+ VREF(ndp->ni_startdir);
+ }
+ goto success;
+ } else
+ cnp->cn_lkflags = lkflags_save;
+#ifdef NAMEI_DIAGNOSTIC
+ printf("found\n");
+#endif
+ /*
+ * Take into account any additional components consumed by
+ * the underlying filesystem.
+ */
+ if (cnp->cn_consume > 0) {
+ cnp->cn_nameptr += cnp->cn_consume;
+ ndp->ni_next += cnp->cn_consume;
+ ndp->ni_pathlen -= cnp->cn_consume;
+ cnp->cn_consume = 0;
+ }
+
+ dp = ndp->ni_vp;
+
+ /*
+ * Check to see if the vnode has been mounted on;
+ * if so find the root of the mounted filesystem.
+ */
+ while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
+ (cnp->cn_flags & NOCROSSMOUNT) == 0) {
+ if (vfs_busy(mp, 0))
+ continue;
+ vput(dp);
+ if (dp != ndp->ni_dvp)
+ vput(ndp->ni_dvp);
+ else
+ vrele(ndp->ni_dvp);
+ vref(vp_crossmp);
+ ndp->ni_dvp = vp_crossmp;
+ error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags,
+ cnp->cn_flags), &tdp);
+ vfs_unbusy(mp);
+ if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT))
+ panic("vp_crossmp exclusively locked or reclaimed");
+ if (error) {
+ dpunlocked = 1;
+ goto bad2;
+ }
+ ndp->ni_vp = dp = tdp;
+ }
+
+ /*
+ * Check for symbolic link
+ */
+ if ((dp->v_type == VLNK) &&
+ ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) ||
+ *ndp->ni_next == '/')) {
+ cnp->cn_flags |= ISSYMLINK;
+ if (dp->v_iflag & VI_DOOMED) {
+ /*
+ * We can't know whether the directory was mounted with
+ * NOSYMFOLLOW, so we can't follow safely.
+ */
+ error = ENOENT;
+ goto bad2;
+ }
+ if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
+ error = EACCES;
+ goto bad2;
+ }
+ /*
+ * Symlink code always expects an unlocked dvp.
+ */
+ if (ndp->ni_dvp != ndp->ni_vp) {
+ VOP_UNLOCK(ndp->ni_dvp, 0);
+ ni_dvp_unlocked = 1;
+ }
+ goto success;
+ }
+
+nextname:
+ /*
+ * Not a symbolic link that we will follow. Continue with the
+ * next component if there is any; otherwise, we're done.
+ */
+ KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
+ ("lookup: invalid path state."));
+ if (*ndp->ni_next == '/') {
+ cnp->cn_nameptr = ndp->ni_next;
+ while (*cnp->cn_nameptr == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+ if (ndp->ni_dvp != dp)
+ vput(ndp->ni_dvp);
+ else
+ vrele(ndp->ni_dvp);
+ goto dirloop;
+ }
+ /*
+ * If we're processing a path with a trailing slash,
+ * check that the end result is a directory.
+ */
+ if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto bad2;
+ }
+ /*
+ * Disallow directory write attempts on read-only filesystems.
+ */
+ if (rdonly &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ error = EROFS;
+ goto bad2;
+ }
+ if (cnp->cn_flags & SAVESTART) {
+ ndp->ni_startdir = ndp->ni_dvp;
+ VREF(ndp->ni_startdir);
+ }
+ if (!wantparent) {
+ ni_dvp_unlocked = 2;
+ if (ndp->ni_dvp != dp)
+ vput(ndp->ni_dvp);
+ else
+ vrele(ndp->ni_dvp);
+ } else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) {
+ VOP_UNLOCK(ndp->ni_dvp, 0);
+ ni_dvp_unlocked = 1;
+ }
+
+ if (cnp->cn_flags & AUDITVNODE1)
+ AUDIT_ARG_VNODE1(dp);
+ else if (cnp->cn_flags & AUDITVNODE2)
+ AUDIT_ARG_VNODE2(dp);
+
+ if ((cnp->cn_flags & LOCKLEAF) == 0)
+ VOP_UNLOCK(dp, 0);
+success:
+ /*
+ * Because of lookup_shared we may have the vnode shared locked, but
+ * the caller may want it to be exclusively locked.
+ */
+ if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) &&
+ VOP_ISLOCKED(dp) != LK_EXCLUSIVE) {
+ vn_lock(dp, LK_UPGRADE | LK_RETRY);
+ if (dp->v_iflag & VI_DOOMED) {
+ error = ENOENT;
+ goto bad2;
+ }
+ }
+ return (0);
+
+bad2:
+ if (ni_dvp_unlocked != 2) {
+ if (dp != ndp->ni_dvp && !ni_dvp_unlocked)
+ vput(ndp->ni_dvp);
+ else
+ vrele(ndp->ni_dvp);
+ }
+bad:
+ if (!dpunlocked)
+ vput(dp);
+ ndp->ni_vp = NULL;
+ return (error);
+}
+
+/*
+ * relookup - lookup a path name component
+ * Used by lookup to re-acquire things.
+ */
+int
+relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
+{
+ struct vnode *dp = 0; /* the directory we are searching */
+ int wantparent; /* 1 => wantparent or lockparent flag */
+ int rdonly; /* lookup read-only flag bit */
+ int error = 0;
+
+ KASSERT(cnp->cn_flags & ISLASTCN,
+ ("relookup: Not given last component."));
+ /*
+ * Setup: break out flag bits into variables.
+ */
+ wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
+ KASSERT(wantparent, ("relookup: parent not wanted."));
+ rdonly = cnp->cn_flags & RDONLY;
+ cnp->cn_flags &= ~ISSYMLINK;
+ dp = dvp;
+ cnp->cn_lkflags = LK_EXCLUSIVE;
+ vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);
+
+ /*
+ * Search a new directory.
+ *
+ * The last component of the filename is left accessible via
+ * cnp->cn_nameptr for callers that need the name. Callers needing
+ * the name set the SAVENAME flag. When done, they assume
+ * responsibility for freeing the pathname buffer.
+ */
+#ifdef NAMEI_DIAGNOSTIC
+ printf("{%s}: ", cnp->cn_nameptr);
+#endif
+
+ /*
+ * Check for "" which represents the root directory after slash
+ * removal.
+ */
+ if (cnp->cn_nameptr[0] == '\0') {
+ /*
+ * Support only LOOKUP for "/" because lookup()
+ * can't succeed for CREATE, DELETE and RENAME.
+ */
+ KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP"));
+ KASSERT(dp->v_type == VDIR, ("dp is not a directory"));
+
+ if (!(cnp->cn_flags & LOCKLEAF))
+ VOP_UNLOCK(dp, 0);
+ *vpp = dp;
+ /* XXX This should probably move to the top of function. */
+ if (cnp->cn_flags & SAVESTART)
+ panic("lookup: SAVESTART");
+ return (0);
+ }
+
+ if (cnp->cn_flags & ISDOTDOT)
+ panic ("relookup: lookup on dot-dot");
+
+ /*
+ * We now have a segment name to search for, and a directory to search.
+ */
+#ifdef NAMEI_DIAGNOSTIC
+ vprint("search in:", dp);
+#endif
+ if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
+ KASSERT(*vpp == NULL, ("leaf should be empty"));
+ if (error != EJUSTRETURN)
+ goto bad;
+ /*
+ * If creating and at end of pathname, then can consider
+ * allowing file to be created.
+ */
+ if (rdonly) {
+ error = EROFS;
+ goto bad;
+ }
+ /* ASSERT(dvp == ndp->ni_startdir) */
+ if (cnp->cn_flags & SAVESTART)
+ VREF(dvp);
+ if ((cnp->cn_flags & LOCKPARENT) == 0)
+ VOP_UNLOCK(dp, 0);
+ /*
+ * We return with ni_vp NULL to indicate that the entry
+ * doesn't currently exist, leaving a pointer to the
+ * (possibly locked) directory vnode in ndp->ni_dvp.
+ */
+ return (0);
+ }
+
+ dp = *vpp;
+
+ /*
+ * Disallow directory write attempts on read-only filesystems.
+ */
+ if (rdonly &&
+ (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
+ if (dvp == dp)
+ vrele(dvp);
+ else
+ vput(dvp);
+ error = EROFS;
+ goto bad;
+ }
+ /*
+ * Set the parent lock/ref state to the requested state.
+ */
+ if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) {
+ if (wantparent)
+ VOP_UNLOCK(dvp, 0);
+ else
+ vput(dvp);
+ } else if (!wantparent)
+ vrele(dvp);
+ /*
+ * Check for symbolic link
+ */
+ KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
+ ("relookup: symlink found.\n"));
+
+ /* ASSERT(dvp == ndp->ni_startdir) */
+ if (cnp->cn_flags & SAVESTART)
+ VREF(dvp);
+
+ if ((cnp->cn_flags & LOCKLEAF) == 0)
+ VOP_UNLOCK(dp, 0);
+ return (0);
+bad:
+ vput(dp);
+ *vpp = NULL;
+ return (error);
+}
+
+void
+NDINIT_ALL(struct nameidata *ndp, u_long op, u_long flags, enum uio_seg segflg,
+ const char *namep, int dirfd, struct vnode *startdir, cap_rights_t *rightsp,
+ struct thread *td)
+{
+
+ ndp->ni_cnd.cn_nameiop = op;
+ ndp->ni_cnd.cn_flags = flags;
+ ndp->ni_segflg = segflg;
+ ndp->ni_dirp = namep;
+ ndp->ni_dirfd = dirfd;
+ ndp->ni_startdir = startdir;
+ ndp->ni_strictrelative = 0;
+ if (rightsp != NULL)
+ ndp->ni_rightsneeded = *rightsp;
+ else
+ cap_rights_init(&ndp->ni_rightsneeded);
+ filecaps_init(&ndp->ni_filecaps);
+ ndp->ni_cnd.cn_thread = td;
+}
+
+/*
+ * Free data allocated by namei(); see namei(9) for details.
+ */
+void
+NDFREE(struct nameidata *ndp, const u_int flags)
+{
+ int unlock_dvp;
+ int unlock_vp;
+
+ unlock_dvp = 0;
+ unlock_vp = 0;
+
+ if (!(flags & NDF_NO_FREE_PNBUF) &&
+ (ndp->ni_cnd.cn_flags & HASBUF)) {
+ uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
+ ndp->ni_cnd.cn_flags &= ~HASBUF;
+ }
+ if (!(flags & NDF_NO_VP_UNLOCK) &&
+ (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
+ unlock_vp = 1;
+ if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) {
+ if (unlock_vp) {
+ vput(ndp->ni_vp);
+ unlock_vp = 0;
+ } else
+ vrele(ndp->ni_vp);
+ ndp->ni_vp = NULL;
+ }
+ if (unlock_vp)
+ VOP_UNLOCK(ndp->ni_vp, 0);
+ if (!(flags & NDF_NO_DVP_UNLOCK) &&
+ (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
+ ndp->ni_dvp != ndp->ni_vp)
+ unlock_dvp = 1;
+ if (!(flags & NDF_NO_DVP_RELE) &&
+ (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
+ if (unlock_dvp) {
+ vput(ndp->ni_dvp);
+ unlock_dvp = 0;
+ } else
+ vrele(ndp->ni_dvp);
+ ndp->ni_dvp = NULL;
+ }
+ if (unlock_dvp)
+ VOP_UNLOCK(ndp->ni_dvp, 0);
+ if (!(flags & NDF_NO_STARTDIR_RELE) &&
+ (ndp->ni_cnd.cn_flags & SAVESTART)) {
+ vrele(ndp->ni_startdir);
+ ndp->ni_startdir = NULL;
+ }
+}
+
+/*
+ * Determine if there is a suitable alternate filename under the specified
+ * prefix for the specified path. If the create flag is set, then the
+ * alternate prefix will be used so long as the parent directory exists.
+ * This is used by the various compatibility ABIs so that, for example, Linux
+ * binaries prefer files under /compat/linux. The chosen path (whether under
+ * the prefix or under /) is returned in a kernel malloc'd buffer pointed
+ * to by pathbuf. The caller is responsible for freeing the buffer from
+ * the M_TEMP bucket if one is returned.
+ */
+int
+kern_alternate_path(struct thread *td, const char *prefix, const char *path,
+ enum uio_seg pathseg, char **pathbuf, int create, int dirfd)
+{
+ struct nameidata nd, ndroot;
+ char *ptr, *buf, *cp;
+ size_t len, sz;
+ int error;
+
+ buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+ *pathbuf = buf;
+
+ /* Copy the prefix into the new pathname as a starting point. */
+ len = strlcpy(buf, prefix, MAXPATHLEN);
+ if (len >= MAXPATHLEN) {
+ *pathbuf = NULL;
+ free(buf, M_TEMP);
+ return (EINVAL);
+ }
+ sz = MAXPATHLEN - len;
+ ptr = buf + len;
+
+ /* Append the filename to the prefix. */
+ if (pathseg == UIO_SYSSPACE)
+ error = copystr(path, ptr, sz, &len);
+ else
+ error = copyinstr(path, ptr, sz, &len);
+
+ if (error) {
+ *pathbuf = NULL;
+ free(buf, M_TEMP);
+ return (error);
+ }
+
+ /* Only use a prefix with absolute pathnames. */
+ if (*ptr != '/') {
+ error = EINVAL;
+ goto keeporig;
+ }
+
+ if (dirfd != AT_FDCWD) {
+ /*
+ * We want the original because the "prefix" is
+ * included in the already opened dirfd.
+ */
+ bcopy(ptr, buf, len);
+ return (0);
+ }
+
+ /*
+ * We know that there is a '/' somewhere in this pathname.
+ * Search backwards for it, to find the file's parent directory
+ * and check whether it exists in the alternate tree. If it does
+ * and we want to create a file (the create flag is set), the
+ * prefixed path is used. We don't need to worry about the root
+ * comparison in this case.
+ */
+
+ if (create) {
+ for (cp = &ptr[len] - 1; *cp != '/'; cp--);
+ *cp = '\0';
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td);
+ error = namei(&nd);
+ *cp = '/';
+ if (error != 0)
+ goto keeporig;
+ } else {
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td);
+
+ error = namei(&nd);
+ if (error != 0)
+ goto keeporig;
+
+ /*
+ * We now compare the vnode of the prefix to the vnode that was
+ * asked for. If they resolve to be the same, then we
+ * ignore the match so that the real root gets used.
+ * This avoids the problem of traversing "../.." to find the
+ * root directory and never finding it, because "/" resolves
+ * to the emulation root directory. This is expensive :-(
+ */
+ NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix,
+ td);
+
+ /* We shouldn't ever get an error from this namei(). */
+ error = namei(&ndroot);
+ if (error == 0) {
+ if (nd.ni_vp == ndroot.ni_vp)
+ error = ENOENT;
+
+ NDFREE(&ndroot, NDF_ONLY_PNBUF);
+ vrele(ndroot.ni_vp);
+ }
+ }
+
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vrele(nd.ni_vp);
+
+keeporig:
+ /* If there was an error, use the original path name. */
+ if (error)
+ bcopy(ptr, buf, len);
+ return (error);
+}
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
new file mode 100644
index 0000000..8f92e10
--- /dev/null
+++ b/sys/kern/vfs_mount.c
@@ -0,0 +1,1949 @@
+/*-
+ * Copyright (c) 1999-2004 Poul-Henning Kamp
+ * Copyright (c) 1999 Michael Smith
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/filedesc.h>
+#include <sys/reboot.h>
+#include <sys/sbuf.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <vm/uma.h>
+
+#include <geom/geom.h>
+
+#include <machine/stdarg.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#define VFS_MOUNTARG_SIZE_MAX (1024 * 64)
+
+static int vfs_domount(struct thread *td, const char *fstype, char *fspath,
+ uint64_t fsflags, struct vfsoptlist **optlist);
+static void free_mntarg(struct mntarg *ma);
+
+static int usermount = 0;
+SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
+ "Unprivileged users may mount and unmount file systems");
+
+MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
+static uma_zone_t mount_zone;
+
+/* List of mounted filesystems. */
+struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
+
+/* For any iteration/modification of mountlist */
+struct mtx mountlist_mtx;
+MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
+
+/*
+ * Global opts, taken by all filesystems
+ */
+static const char *global_opts[] = {
+ "errmsg",
+ "fstype",
+ "fspath",
+ "ro",
+ "rw",
+ "nosuid",
+ "noexec",
+ NULL
+};
+
+static int
+mount_init(void *mem, int size, int flags)
+{
+ struct mount *mp;
+
+ mp = (struct mount *)mem;
+ mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
+ lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
+ return (0);
+}
+
+static void
+mount_fini(void *mem, int size)
+{
+ struct mount *mp;
+
+ mp = (struct mount *)mem;
+ lockdestroy(&mp->mnt_explock);
+ mtx_destroy(&mp->mnt_mtx);
+}
+
+static void
+vfs_mount_init(void *dummy __unused)
+{
+
+ mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL,
+ NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+}
+SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL);
+
+/*
+ * ---------------------------------------------------------------------
+ * Functions for building and sanitizing the mount options
+ */
+
+/* Remove one mount option. */
+static void
+vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
+{
+
+ TAILQ_REMOVE(opts, opt, link);
+ free(opt->name, M_MOUNT);
+ if (opt->value != NULL)
+ free(opt->value, M_MOUNT);
+ free(opt, M_MOUNT);
+}
+
+/* Release all resources related to the mount options. */
+void
+vfs_freeopts(struct vfsoptlist *opts)
+{
+ struct vfsopt *opt;
+
+ while (!TAILQ_EMPTY(opts)) {
+ opt = TAILQ_FIRST(opts);
+ vfs_freeopt(opts, opt);
+ }
+ free(opts, M_MOUNT);
+}
+
+void
+vfs_deleteopt(struct vfsoptlist *opts, const char *name)
+{
+ struct vfsopt *opt, *temp;
+
+ if (opts == NULL)
+ return;
+ TAILQ_FOREACH_SAFE(opt, opts, link, temp) {
+ if (strcmp(opt->name, name) == 0)
+ vfs_freeopt(opts, opt);
+ }
+}
+
+static int
+vfs_isopt_ro(const char *opt)
+{
+
+ if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 ||
+ strcmp(opt, "norw") == 0)
+ return (1);
+ return (0);
+}
+
+static int
+vfs_isopt_rw(const char *opt)
+{
+
+ if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0)
+ return (1);
+ return (0);
+}
+
+/*
+ * Check if options are equal (with or without the "no" prefix).
+ */
+static int
+vfs_equalopts(const char *opt1, const char *opt2)
+{
+ char *p;
+
+ /* "opt" vs. "opt" or "noopt" vs. "noopt" */
+ if (strcmp(opt1, opt2) == 0)
+ return (1);
+ /* "noopt" vs. "opt" */
+ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
+ return (1);
+ /* "opt" vs. "noopt" */
+ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
+ return (1);
+ while ((p = strchr(opt1, '.')) != NULL &&
+ !strncmp(opt1, opt2, ++p - opt1)) {
+ opt2 += p - opt1;
+ opt1 = p;
+ /* "foo.noopt" vs. "foo.opt" */
+ if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
+ return (1);
+ /* "foo.opt" vs. "foo.noopt" */
+ if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
+ return (1);
+ }
+ /* "ro" / "rdonly" / "norw" / "rw" / "noro" */
+ if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) &&
+ (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2)))
+ return (1);
+ return (0);
+}
+
+/*
+ * If a mount option is specified several times
+ * (with or without the "no" prefix), only keep
+ * the last occurrence of it.
+ */
+static void
+vfs_sanitizeopts(struct vfsoptlist *opts)
+{
+ struct vfsopt *opt, *opt2, *tmp;
+
+ TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) {
+ opt2 = TAILQ_PREV(opt, vfsoptlist, link);
+ while (opt2 != NULL) {
+ if (vfs_equalopts(opt->name, opt2->name)) {
+ tmp = TAILQ_PREV(opt2, vfsoptlist, link);
+ vfs_freeopt(opts, opt2);
+ opt2 = tmp;
+ } else {
+ opt2 = TAILQ_PREV(opt2, vfsoptlist, link);
+ }
+ }
+ }
+}
+
+/*
+ * Build a linked list of mount options from a struct uio.
+ */
+int
+vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
+{
+ struct vfsoptlist *opts;
+ struct vfsopt *opt;
+ size_t memused, namelen, optlen;
+ unsigned int i, iovcnt;
+ int error;
+
+ opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
+ TAILQ_INIT(opts);
+ memused = 0;
+ iovcnt = auio->uio_iovcnt;
+ for (i = 0; i < iovcnt; i += 2) {
+ namelen = auio->uio_iov[i].iov_len;
+ optlen = auio->uio_iov[i + 1].iov_len;
+ memused += sizeof(struct vfsopt) + optlen + namelen;
+ /*
+ * Avoid consuming too much memory, and guard against attempts
+ * to overflow memused.
+ */
+ if (memused > VFS_MOUNTARG_SIZE_MAX ||
+ optlen > VFS_MOUNTARG_SIZE_MAX ||
+ namelen > VFS_MOUNTARG_SIZE_MAX) {
+ error = EINVAL;
+ goto bad;
+ }
+
+ opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
+ opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
+ opt->value = NULL;
+ opt->len = 0;
+ opt->pos = i / 2;
+ opt->seen = 0;
+
+ /*
+ * Do this early, so jumps to "bad" will free the current
+ * option.
+ */
+ TAILQ_INSERT_TAIL(opts, opt, link);
+
+ if (auio->uio_segflg == UIO_SYSSPACE) {
+ bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
+ } else {
+ error = copyin(auio->uio_iov[i].iov_base, opt->name,
+ namelen);
+ if (error)
+ goto bad;
+ }
+ /* Ensure names are null-terminated strings. */
+ if (namelen == 0 || opt->name[namelen - 1] != '\0') {
+ error = EINVAL;
+ goto bad;
+ }
+ if (optlen != 0) {
+ opt->len = optlen;
+ opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
+ if (auio->uio_segflg == UIO_SYSSPACE) {
+ bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
+ optlen);
+ } else {
+ error = copyin(auio->uio_iov[i + 1].iov_base,
+ opt->value, optlen);
+ if (error)
+ goto bad;
+ }
+ }
+ }
+ vfs_sanitizeopts(opts);
+ *options = opts;
+ return (0);
+bad:
+ vfs_freeopts(opts);
+ return (error);
+}
+
+/*
+ * Merge the old mount options with the new ones passed
+ * in the MNT_UPDATE case.
+ *
+ * XXX: This function will keep a "nofoo" option in the new
+ * options. E.g., if the option's canonical name is "foo",
+ * "nofoo" ends up in the mount point's active options.
+ */
+static void
+vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts)
+{
+ struct vfsopt *opt, *new;
+
+ TAILQ_FOREACH(opt, oldopts, link) {
+ new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
+ new->name = strdup(opt->name, M_MOUNT);
+ if (opt->len != 0) {
+ new->value = malloc(opt->len, M_MOUNT, M_WAITOK);
+ bcopy(opt->value, new->value, opt->len);
+ } else
+ new->value = NULL;
+ new->len = opt->len;
+ new->seen = opt->seen;
+ TAILQ_INSERT_HEAD(toopts, new, link);
+ }
+ vfs_sanitizeopts(toopts);
+}
+
+/*
+ * Mount a filesystem.
+ */
+int
+sys_nmount(td, uap)
+ struct thread *td;
+ struct nmount_args /* {
+ struct iovec *iovp;
+ unsigned int iovcnt;
+ int flags;
+ } */ *uap;
+{
+ struct uio *auio;
+ int error;
+ u_int iovcnt;
+ uint64_t flags;
+
+ /*
+ * Mount flags are now 64 bits. On 32-bit architectures only
+ * 32 bits are passed in, but from here on everything handles
+ * 64-bit flags correctly.
+ */
+ flags = uap->flags;
+
+ AUDIT_ARG_FFLAGS(flags);
+ CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__,
+ uap->iovp, uap->iovcnt, flags);
+
+ /*
+ * Filter out MNT_ROOTFS. We do not want clients of nmount() in
+ * userspace to set this flag, but we must filter it out if we want
+ * MNT_UPDATE on the root file system to work.
+ * MNT_ROOTFS should only be set by the kernel when mounting its
+ * root file system.
+ */
+ flags &= ~MNT_ROOTFS;
+
+ iovcnt = uap->iovcnt;
+ /*
+ * Check that we have an even number of iovecs
+ * and that we have at least two options.
+ */
+ if ((iovcnt & 1) || (iovcnt < 4)) {
+ CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__,
+ uap->iovcnt);
+ return (EINVAL);
+ }
+
+ error = copyinuio(uap->iovp, iovcnt, &auio);
+ if (error) {
+ CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno",
+ __func__, error);
+ return (error);
+ }
+ error = vfs_donmount(td, flags, auio);
+
+ free(auio, M_IOV);
+ return (error);
+}
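+
+/*
+ * The iovec layout enforced above is a sequence of name/value pairs, and the
+ * option names (plus string values such as fstype and fspath) must include
+ * their terminating NUL in the length.  A minimal userland sketch that
+ * satisfies it (illustrative only, mounting a hypothetical nullfs instance)
+ * would look like:
+ *
+ * struct iovec iov[] = {
+ * { "fstype", sizeof("fstype") }, { "nullfs", sizeof("nullfs") },
+ * { "fspath", sizeof("fspath") }, { "/mnt", sizeof("/mnt") },
+ * { "target", sizeof("target") }, { "/tmp", sizeof("/tmp") },
+ * };
+ * if (nmount(iov, nitems(iov), MNT_RDONLY) == -1)
+ * err(1, "nmount");
+ */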
+
+/*
+ * ---------------------------------------------------------------------
+ * Various utility functions
+ */
+
+void
+vfs_ref(struct mount *mp)
+{
+
+ CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+ MNT_ILOCK(mp);
+ MNT_REF(mp);
+ MNT_IUNLOCK(mp);
+}
+
+void
+vfs_rel(struct mount *mp)
+{
+
+ CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+ MNT_ILOCK(mp);
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+}
+
+/*
+ * Allocate and initialize the mount point struct.
+ */
+struct mount *
+vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
+ struct ucred *cred)
+{
+ struct mount *mp;
+
+ mp = uma_zalloc(mount_zone, M_WAITOK);
+ bzero(&mp->mnt_startzero,
+ __rangeof(struct mount, mnt_startzero, mnt_endzero));
+ TAILQ_INIT(&mp->mnt_nvnodelist);
+ mp->mnt_nvnodelistsize = 0;
+ TAILQ_INIT(&mp->mnt_activevnodelist);
+ mp->mnt_activevnodelistsize = 0;
+ mp->mnt_ref = 0;
+ (void) vfs_busy(mp, MBF_NOWAIT);
+ mp->mnt_op = vfsp->vfc_vfsops;
+ mp->mnt_vfc = vfsp;
+ vfsp->vfc_refcount++; /* XXX Unlocked */
+ mp->mnt_stat.f_type = vfsp->vfc_typenum;
+ mp->mnt_gen++;
+ strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
+ mp->mnt_vnodecovered = vp;
+ mp->mnt_cred = crdup(cred);
+ mp->mnt_stat.f_owner = cred->cr_uid;
+ strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
+ mp->mnt_iosize_max = DFLTPHYS;
+#ifdef MAC
+ mac_mount_init(mp);
+ mac_mount_create(cred, mp);
+#endif
+ arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
+ TAILQ_INIT(&mp->mnt_uppers);
+ return (mp);
+}
+
+/*
+ * Destroy the mount struct previously allocated by vfs_mount_alloc().
+ */
+void
+vfs_mount_destroy(struct mount *mp)
+{
+
+ MNT_ILOCK(mp);
+ mp->mnt_kern_flag |= MNTK_REFEXPIRE;
+ if (mp->mnt_kern_flag & MNTK_MWAIT) {
+ mp->mnt_kern_flag &= ~MNTK_MWAIT;
+ wakeup(mp);
+ }
+ while (mp->mnt_ref)
+ msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0);
+ KASSERT(mp->mnt_ref == 0,
+ ("%s: invalid refcount in the drain path @ %s:%d", __func__,
+ __FILE__, __LINE__));
+ if (mp->mnt_writeopcount != 0)
+ panic("vfs_mount_destroy: nonzero writeopcount");
+ if (mp->mnt_secondary_writes != 0)
+ panic("vfs_mount_destroy: nonzero secondary_writes");
+ mp->mnt_vfc->vfc_refcount--;
+ if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
+ struct vnode *vp;
+
+ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
+ vprint("", vp);
+ panic("unmount: dangling vnode");
+ }
+ KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers"));
+ if (mp->mnt_nvnodelistsize != 0)
+ panic("vfs_mount_destroy: nonzero nvnodelistsize");
+ if (mp->mnt_activevnodelistsize != 0)
+ panic("vfs_mount_destroy: nonzero activevnodelistsize");
+ if (mp->mnt_lockref != 0)
+ panic("vfs_mount_destroy: nonzero lock refcount");
+ MNT_IUNLOCK(mp);
+#ifdef MAC
+ mac_mount_destroy(mp);
+#endif
+ if (mp->mnt_opt != NULL)
+ vfs_freeopts(mp->mnt_opt);
+ crfree(mp->mnt_cred);
+ uma_zfree(mount_zone, mp);
+}
+
+int
+vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions)
+{
+ struct vfsoptlist *optlist;
+ struct vfsopt *opt, *tmp_opt;
+ char *fstype, *fspath, *errmsg;
+ int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
+
+ errmsg = fspath = NULL;
+ errmsg_len = fspathlen = 0;
+ errmsg_pos = -1;
+
+ error = vfs_buildopts(fsoptions, &optlist);
+ if (error)
+ return (error);
+
+ if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
+ errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
+
+ /*
+ * We need these two options before the others,
+ * and they are mandatory for any filesystem.
+ * Ensure they are NUL terminated as well.
+ */
+ fstypelen = 0;
+ error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
+ if (error || fstype[fstypelen - 1] != '\0') {
+ error = EINVAL;
+ if (errmsg != NULL)
+ strncpy(errmsg, "Invalid fstype", errmsg_len);
+ goto bail;
+ }
+ fspathlen = 0;
+ error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
+ if (error || fspath[fspathlen - 1] != '\0') {
+ error = EINVAL;
+ if (errmsg != NULL)
+ strncpy(errmsg, "Invalid fspath", errmsg_len);
+ goto bail;
+ }
+
+ /*
+ * We need to see if we have the "update" option
+ * before we call vfs_domount(), since vfs_domount() has special
+ * logic based on MNT_UPDATE. This is very important
+ * when we want to update the root filesystem.
+ */
+ TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) {
+ if (strcmp(opt->name, "update") == 0) {
+ fsflags |= MNT_UPDATE;
+ vfs_freeopt(optlist, opt);
+ }
+ else if (strcmp(opt->name, "async") == 0)
+ fsflags |= MNT_ASYNC;
+ else if (strcmp(opt->name, "force") == 0) {
+ fsflags |= MNT_FORCE;
+ vfs_freeopt(optlist, opt);
+ }
+ else if (strcmp(opt->name, "reload") == 0) {
+ fsflags |= MNT_RELOAD;
+ vfs_freeopt(optlist, opt);
+ }
+ else if (strcmp(opt->name, "multilabel") == 0)
+ fsflags |= MNT_MULTILABEL;
+ else if (strcmp(opt->name, "noasync") == 0)
+ fsflags &= ~MNT_ASYNC;
+ else if (strcmp(opt->name, "noatime") == 0)
+ fsflags |= MNT_NOATIME;
+ else if (strcmp(opt->name, "atime") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonoatime", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "noclusterr") == 0)
+ fsflags |= MNT_NOCLUSTERR;
+ else if (strcmp(opt->name, "clusterr") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonoclusterr", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "noclusterw") == 0)
+ fsflags |= MNT_NOCLUSTERW;
+ else if (strcmp(opt->name, "clusterw") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonoclusterw", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "noexec") == 0)
+ fsflags |= MNT_NOEXEC;
+ else if (strcmp(opt->name, "exec") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonoexec", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "nosuid") == 0)
+ fsflags |= MNT_NOSUID;
+ else if (strcmp(opt->name, "suid") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonosuid", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "nosymfollow") == 0)
+ fsflags |= MNT_NOSYMFOLLOW;
+ else if (strcmp(opt->name, "symfollow") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("nonosymfollow", M_MOUNT);
+ }
+ else if (strcmp(opt->name, "noro") == 0)
+ fsflags &= ~MNT_RDONLY;
+ else if (strcmp(opt->name, "rw") == 0)
+ fsflags &= ~MNT_RDONLY;
+ else if (strcmp(opt->name, "ro") == 0)
+ fsflags |= MNT_RDONLY;
+ else if (strcmp(opt->name, "rdonly") == 0) {
+ free(opt->name, M_MOUNT);
+ opt->name = strdup("ro", M_MOUNT);
+ fsflags |= MNT_RDONLY;
+ }
+ else if (strcmp(opt->name, "suiddir") == 0)
+ fsflags |= MNT_SUIDDIR;
+ else if (strcmp(opt->name, "sync") == 0)
+ fsflags |= MNT_SYNCHRONOUS;
+ else if (strcmp(opt->name, "union") == 0)
+ fsflags |= MNT_UNION;
+ }
+
+ /*
+ * Be ultra-paranoid about making sure the type and fspath
+ * variables will fit in our mp buffers, including the
+ * terminating NUL.
+ */
+ if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) {
+ error = ENAMETOOLONG;
+ goto bail;
+ }
+
+ error = vfs_domount(td, fstype, fspath, fsflags, &optlist);
+bail:
+ /* copyout the errmsg */
+ if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
+ && errmsg_len > 0 && errmsg != NULL) {
+ if (fsoptions->uio_segflg == UIO_SYSSPACE) {
+ bcopy(errmsg,
+ fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
+ fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
+ } else {
+ copyout(errmsg,
+ fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
+ fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
+ }
+ }
+
+ if (optlist != NULL)
+ vfs_freeopts(optlist);
+ return (error);
+}
+
+/*
+ * Old mount API.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mount_args {
+ char *type;
+ char *path;
+ int flags;
+ caddr_t data;
+};
+#endif
+/* ARGSUSED */
+int
+sys_mount(struct thread *td, struct mount_args *uap)
+{
+ char *fstype;
+ struct vfsconf *vfsp = NULL;
+ struct mntarg *ma = NULL;
+ uint64_t flags;
+ int error;
+
+ /*
+ * Mount flags are now 64-bits. On 32-bit architectures only
+ * 32-bits are passed in, but from here on everything handles
+ * 64-bit flags correctly.
+ */
+ flags = uap->flags;
+
+ AUDIT_ARG_FFLAGS(flags);
+
+ /*
+ * Filter out MNT_ROOTFS. We do not want clients of mount() in
+ * userspace to set this flag, but we must filter it out if we want
+ * MNT_UPDATE on the root file system to work.
+ * MNT_ROOTFS should only be set by the kernel when mounting its
+ * root file system.
+ */
+ flags &= ~MNT_ROOTFS;
+
+ fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
+ error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
+ if (error) {
+ free(fstype, M_TEMP);
+ return (error);
+ }
+
+ AUDIT_ARG_TEXT(fstype);
+ mtx_lock(&Giant);
+ vfsp = vfs_byname_kld(fstype, td, &error);
+ free(fstype, M_TEMP);
+ if (vfsp == NULL) {
+ mtx_unlock(&Giant);
+ return (ENOENT);
+ }
+ if (vfsp->vfc_vfsops->vfs_cmount == NULL) {
+ mtx_unlock(&Giant);
+ return (EOPNOTSUPP);
+ }
+
+ ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN);
+ ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
+ ma = mount_argb(ma, flags & MNT_RDONLY, "noro");
+ ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid");
+ ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec");
+
+ error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * vfs_domount_first(): first file system mount (not update)
+ */
+static int
+vfs_domount_first(
+ struct thread *td, /* Calling thread. */
+ struct vfsconf *vfsp, /* File system type. */
+ char *fspath, /* Mount path. */
+ struct vnode *vp, /* Vnode to be covered. */
+ uint64_t fsflags, /* Flags common to all filesystems. */
+ struct vfsoptlist **optlist /* Options local to the filesystem. */
+ )
+{
+ struct vattr va;
+ struct mount *mp;
+ struct vnode *newdp;
+ int error;
+
+ mtx_assert(&Giant, MA_OWNED);
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here"));
+
+ /*
+ * If the user is not root, ensure that they own the directory
+ * onto which we are attempting to mount.
+ */
+ error = VOP_GETATTR(vp, &va, td->td_ucred);
+ if (error == 0 && va.va_uid != td->td_ucred->cr_uid)
+ error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, 0);
+ if (error == 0)
+ error = vinvalbuf(vp, V_SAVE, 0, 0);
+ if (error == 0 && vp->v_type != VDIR)
+ error = ENOTDIR;
+ if (error == 0) {
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
+ vp->v_iflag |= VI_MOUNT;
+ else
+ error = EBUSY;
+ VI_UNLOCK(vp);
+ }
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+ VOP_UNLOCK(vp, 0);
+
+ /* Allocate and initialize the filesystem. */
+ mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
+ /* XXXMAC: pass to vfs_mount_alloc? */
+ mp->mnt_optnew = *optlist;
+ /* Set the mount level flags. */
+ mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY));
+
+ /*
+ * Mount the filesystem.
+ * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
+ * get. No freeing of cn_pnbuf.
+ */
+ error = VFS_MOUNT(mp);
+ if (error != 0) {
+ vfs_unbusy(mp);
+ vfs_mount_destroy(mp);
+ VI_LOCK(vp);
+ vp->v_iflag &= ~VI_MOUNT;
+ VI_UNLOCK(vp);
+ vrele(vp);
+ return (error);
+ }
+
+ if (mp->mnt_opt != NULL)
+ vfs_freeopts(mp->mnt_opt);
+ mp->mnt_opt = mp->mnt_optnew;
+ *optlist = NULL;
+ (void)VFS_STATFS(mp, &mp->mnt_stat);
+
+ /*
+ * Prevent external consumers of mount options from reading mnt_optnew.
+ */
+ mp->mnt_optnew = NULL;
+
+ MNT_ILOCK(mp);
+ if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
+ (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
+ mp->mnt_kern_flag |= MNTK_ASYNC;
+ else
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
+
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ cache_purge(vp);
+ VI_LOCK(vp);
+ vp->v_iflag &= ~VI_MOUNT;
+ VI_UNLOCK(vp);
+ vp->v_mountedhere = mp;
+ /* Place the new filesystem at the end of the mount list. */
+ mtx_lock(&mountlist_mtx);
+ TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+ vfs_event_signal(NULL, VQ_MOUNT, 0);
+ if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp))
+ panic("mount: lost mount");
+ VOP_UNLOCK(vp, 0);
+ EVENTHANDLER_INVOKE(vfs_mounted, mp, newdp, td);
+ VOP_UNLOCK(newdp, 0);
+ mountcheckdirs(vp, newdp);
+ vrele(newdp);
+ if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ vfs_allocate_syncvnode(mp);
+ vfs_unbusy(mp);
+ return (0);
+}
+
+/*
+ * vfs_domount_update(): update of mounted file system
+ */
+static int
+vfs_domount_update(
+ struct thread *td, /* Calling thread. */
+ struct vnode *vp, /* Mount point vnode. */
+ uint64_t fsflags, /* Flags common to all filesystems. */
+ struct vfsoptlist **optlist /* Options local to the filesystem. */
+ )
+{
+ struct oexport_args oexport;
+ struct export_args export;
+ struct mount *mp;
+ int error, export_error;
+ uint64_t flag;
+
+ mtx_assert(&Giant, MA_OWNED);
+ ASSERT_VOP_ELOCKED(vp, __func__);
+ KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here"));
+
+ if ((vp->v_vflag & VV_ROOT) == 0) {
+ vput(vp);
+ return (EINVAL);
+ }
+ mp = vp->v_mount;
+ /*
+ * We only allow the filesystem to be reloaded if it
+ * is currently mounted read-only.
+ */
+ flag = mp->mnt_flag;
+ if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) {
+ vput(vp);
+ return (EOPNOTSUPP); /* Needs translation */
+ }
+ /*
+ * Only privileged root, or (if MNT_USER is set) the user that
+ * did the original mount is permitted to update it.
+ */
+ error = vfs_suser(mp, td);
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+ if (vfs_busy(mp, MBF_NOWAIT)) {
+ vput(vp);
+ return (EBUSY);
+ }
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) {
+ VI_UNLOCK(vp);
+ vfs_unbusy(mp);
+ vput(vp);
+ return (EBUSY);
+ }
+ vp->v_iflag |= VI_MOUNT;
+ VI_UNLOCK(vp);
+ VOP_UNLOCK(vp, 0);
+
+ MNT_ILOCK(mp);
+ mp->mnt_flag &= ~MNT_UPDATEMASK;
+ mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE |
+ MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY);
+ if ((mp->mnt_flag & MNT_ASYNC) == 0)
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
+ mp->mnt_optnew = *optlist;
+ vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
+
+ /*
+ * Mount the filesystem.
+ * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
+ * get. No freeing of cn_pnbuf.
+ */
+ error = VFS_MOUNT(mp);
+
+ export_error = 0;
+ if (error == 0) {
+ /* Process the export option. */
+ if (vfs_copyopt(mp->mnt_optnew, "export", &export,
+ sizeof(export)) == 0) {
+ export_error = vfs_export(mp, &export);
+ } else if (vfs_copyopt(mp->mnt_optnew, "export", &oexport,
+ sizeof(oexport)) == 0) {
+ export.ex_flags = oexport.ex_flags;
+ export.ex_root = oexport.ex_root;
+ export.ex_anon = oexport.ex_anon;
+ export.ex_addr = oexport.ex_addr;
+ export.ex_addrlen = oexport.ex_addrlen;
+ export.ex_mask = oexport.ex_mask;
+ export.ex_masklen = oexport.ex_masklen;
+ export.ex_indexfile = oexport.ex_indexfile;
+ export.ex_numsecflavors = 0;
+ export_error = vfs_export(mp, &export);
+ }
+ }
+
+ MNT_ILOCK(mp);
+ if (error == 0) {
+ mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE |
+ MNT_SNAPSHOT);
+ } else {
+ /*
+ * If we fail, restore old mount flags. MNT_QUOTA is special,
+ * because it is not part of MNT_UPDATEMASK, but it could have
+ * changed in the meantime if quotactl(2) was called.
+ * All in all we want current value of MNT_QUOTA, not the old
+ * one.
+ */
+ mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
+ }
+ if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
+ (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
+ mp->mnt_kern_flag |= MNTK_ASYNC;
+ else
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
+
+ if (error != 0)
+ goto end;
+
+ if (mp->mnt_opt != NULL)
+ vfs_freeopts(mp->mnt_opt);
+ mp->mnt_opt = mp->mnt_optnew;
+ *optlist = NULL;
+ (void)VFS_STATFS(mp, &mp->mnt_stat);
+ /*
+ * Prevent external consumers of mount options from reading
+ * mnt_optnew.
+ */
+ mp->mnt_optnew = NULL;
+
+ if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ vfs_allocate_syncvnode(mp);
+ else
+ vfs_deallocate_syncvnode(mp);
+end:
+ vfs_unbusy(mp);
+ VI_LOCK(vp);
+ vp->v_iflag &= ~VI_MOUNT;
+ VI_UNLOCK(vp);
+ vrele(vp);
+ return (error != 0 ? error : export_error);
+}
+
+/*
+ * vfs_domount(): actually attempt a filesystem mount.
+ */
+static int
+vfs_domount(
+ struct thread *td, /* Calling thread. */
+ const char *fstype, /* Filesystem type. */
+ char *fspath, /* Mount path. */
+ uint64_t fsflags, /* Flags common to all filesystems. */
+ struct vfsoptlist **optlist /* Options local to the filesystem. */
+ )
+{
+ struct vfsconf *vfsp;
+ struct nameidata nd;
+ struct vnode *vp;
+ char *pathbuf;
+ int error;
+
+ /*
+ * Be ultra-paranoid about making sure the type and fspath
+ * variables will fit in our mp buffers, including the
+ * terminating NUL.
+ */
+ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
+ return (ENAMETOOLONG);
+
+ if (jailed(td->td_ucred) || usermount == 0) {
+ if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
+ return (error);
+ }
+
+ /*
+ * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
+ */
+ if (fsflags & MNT_EXPORTED) {
+ error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
+ if (error)
+ return (error);
+ }
+ if (fsflags & MNT_SUIDDIR) {
+ error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
+ if (error)
+ return (error);
+ }
+ /*
+ * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
+ */
+ if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
+ if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
+ fsflags |= MNT_NOSUID | MNT_USER;
+ }
+
+ /* Load KLDs before we lock the covered vnode to avoid reversals. */
+ vfsp = NULL;
+ if ((fsflags & MNT_UPDATE) == 0) {
+ /* Don't try to load KLDs if we're mounting the root. */
+ if (fsflags & MNT_ROOTFS)
+ vfsp = vfs_byname(fstype);
+ else
+ vfsp = vfs_byname_kld(fstype, td, &error);
+ if (vfsp == NULL)
+ return (ENODEV);
+ if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL))
+ return (EPERM);
+ }
+
+ /*
+ * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE.
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
+ UIO_SYSSPACE, fspath, td);
+ error = namei(&nd);
+ if (error != 0)
+ return (error);
+ mtx_lock(&Giant);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ if ((fsflags & MNT_UPDATE) == 0) {
+ pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
+ strcpy(pathbuf, fspath);
+ error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN);
+ /* debug.disablefullpath == 1 results in ENODEV */
+ if (error == 0 || error == ENODEV) {
+ error = vfs_domount_first(td, vfsp, pathbuf, vp,
+ fsflags, optlist);
+ }
+ free(pathbuf, M_TEMP);
+ } else
+ error = vfs_domount_update(td, vp, fsflags, optlist);
+ mtx_unlock(&Giant);
+
+ ASSERT_VI_UNLOCKED(vp, __func__);
+ ASSERT_VOP_UNLOCKED(vp, __func__);
+
+ return (error);
+}
+
+/*
+ * Unmount a filesystem.
+ *
+ * Note: unmount takes a path to the vnode mounted on as argument, not
+ * the special device file (as before).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct unmount_args {
+ char *path;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+sys_unmount(struct thread *td, struct unmount_args *uap)
+{
+ struct nameidata nd;
+ struct mount *mp;
+ char *pathbuf;
+ int error, id0, id1;
+
+ AUDIT_ARG_VALUE(uap->flags);
+ if (jailed(td->td_ucred) || usermount == 0) {
+ error = priv_check(td, PRIV_VFS_UNMOUNT);
+ if (error)
+ return (error);
+ }
+
+ pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
+ error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL);
+ if (error) {
+ free(pathbuf, M_TEMP);
+ return (error);
+ }
+ mtx_lock(&Giant);
+ if (uap->flags & MNT_BYFSID) {
+ AUDIT_ARG_TEXT(pathbuf);
+ /* Decode the filesystem ID. */
+ if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
+ mtx_unlock(&Giant);
+ free(pathbuf, M_TEMP);
+ return (EINVAL);
+ }
+
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
+ if (mp->mnt_stat.f_fsid.val[0] == id0 &&
+ mp->mnt_stat.f_fsid.val[1] == id1)
+ break;
+ }
+ mtx_unlock(&mountlist_mtx);
+ } else {
+ /*
+ * Try to find global path for path argument.
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
+ UIO_SYSSPACE, pathbuf, td);
+ if (namei(&nd) == 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = vn_path_to_global_path(td, nd.ni_vp, pathbuf,
+ MNAMELEN);
+ if (error == 0 || error == ENODEV)
+ vput(nd.ni_vp);
+ }
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
+ if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0)
+ break;
+ }
+ mtx_unlock(&mountlist_mtx);
+ }
+ free(pathbuf, M_TEMP);
+ if (mp == NULL) {
+ /*
+ * Previously we returned ENOENT for a nonexistent path and
+ * EINVAL for a non-mountpoint. We cannot tell these apart
+ * now, so in the !MNT_BYFSID case return the more likely
+ * EINVAL for compatibility.
+ */
+ mtx_unlock(&Giant);
+ return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
+ }
+
+ /*
+ * Don't allow unmounting the root filesystem.
+ */
+ if (mp->mnt_flag & MNT_ROOTFS) {
+ mtx_unlock(&Giant);
+ return (EINVAL);
+ }
+ error = dounmount(mp, uap->flags, td);
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Do the actual filesystem unmount.
+ */
+int
+dounmount(struct mount *mp, int flags, struct thread *td)
+{
+ struct vnode *coveredvp, *fsrootvp;
+ int error;
+ uint64_t async_flag;
+ int mnt_gen_r;
+
+ mtx_assert(&Giant, MA_OWNED);
+
+ if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
+ mnt_gen_r = mp->mnt_gen;
+ VI_LOCK(coveredvp);
+ vholdl(coveredvp);
+ vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
+ vdrop(coveredvp);
+ /*
+ * Check for mp being unmounted while waiting for the
+ * covered vnode lock.
+ */
+ if (coveredvp->v_mountedhere != mp ||
+ coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
+ VOP_UNLOCK(coveredvp, 0);
+ return (EBUSY);
+ }
+ }
+ /*
+ * Only privileged root, or (if MNT_USER is set) the user that did the
+ * original mount is permitted to unmount this filesystem.
+ */
+ error = vfs_suser(mp, td);
+ if (error) {
+ if (coveredvp)
+ VOP_UNLOCK(coveredvp, 0);
+ return (error);
+ }
+
+ vn_start_write(NULL, &mp, V_WAIT);
+ MNT_ILOCK(mp);
+ if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 ||
+ !TAILQ_EMPTY(&mp->mnt_uppers)) {
+ MNT_IUNLOCK(mp);
+ if (coveredvp)
+ VOP_UNLOCK(coveredvp, 0);
+ vn_finished_write(mp);
+ return (EBUSY);
+ }
+ mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ;
+ /* Allow filesystems to detect that a forced unmount is in progress. */
+ if (flags & MNT_FORCE) {
+ mp->mnt_kern_flag |= MNTK_UNMOUNTF;
+ MNT_IUNLOCK(mp);
+ /*
+ * Must be done after setting MNTK_UNMOUNTF and before
+ * waiting for mnt_lockref to become 0.
+ */
+ VFS_PURGE(mp);
+ MNT_ILOCK(mp);
+ }
+ error = 0;
+ if (mp->mnt_lockref) {
+ mp->mnt_kern_flag |= MNTK_DRAINING;
+ error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
+ "mount drain", 0);
+ }
+ MNT_IUNLOCK(mp);
+ KASSERT(mp->mnt_lockref == 0,
+ ("%s: invalid lock refcount in the drain path @ %s:%d",
+ __func__, __FILE__, __LINE__));
+ KASSERT(error == 0,
+ ("%s: invalid return value for msleep in the drain path @ %s:%d",
+ __func__, __FILE__, __LINE__));
+
+ if (mp->mnt_flag & MNT_EXPUBLIC)
+ vfs_setpublicfs(NULL, NULL, NULL);
+
+ vfs_msync(mp, MNT_WAIT);
+ MNT_ILOCK(mp);
+ async_flag = mp->mnt_flag & MNT_ASYNC;
+ mp->mnt_flag &= ~MNT_ASYNC;
+ mp->mnt_kern_flag &= ~MNTK_ASYNC;
+ MNT_IUNLOCK(mp);
+ cache_purgevfs(mp); /* remove cache entries for this file system */
+ vfs_deallocate_syncvnode(mp);
+ /*
+ * For forced unmounts, move process cdir/rdir refs on the fs root
+ * vnode to the covered vnode. For non-forced unmounts we want
+ * such references to cause an EBUSY error.
+ */
+ if ((flags & MNT_FORCE) &&
+ VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
+ if (mp->mnt_vnodecovered != NULL)
+ mountcheckdirs(fsrootvp, mp->mnt_vnodecovered);
+ if (fsrootvp == rootvnode) {
+ vrele(rootvnode);
+ rootvnode = NULL;
+ }
+ vput(fsrootvp);
+ }
+ if (((mp->mnt_flag & MNT_RDONLY) ||
+ (error = VFS_SYNC(mp, MNT_WAIT)) == 0) || (flags & MNT_FORCE) != 0)
+ error = VFS_UNMOUNT(mp, flags);
+ vn_finished_write(mp);
+ /*
+ * If we failed to flush the dirty blocks for this mount point, undo
+ * all the cdir/rdir and rootvnode changes we made above, unless the
+ * failure occurred because the device reports that it no longer
+ * exists.
+ */
+ if (error && error != ENXIO) {
+ if ((flags & MNT_FORCE) &&
+ VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
+ if (mp->mnt_vnodecovered != NULL)
+ mountcheckdirs(mp->mnt_vnodecovered, fsrootvp);
+ if (rootvnode == NULL) {
+ rootvnode = fsrootvp;
+ vref(rootvnode);
+ }
+ vput(fsrootvp);
+ }
+ MNT_ILOCK(mp);
+ mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ;
+ if ((mp->mnt_flag & MNT_RDONLY) == 0) {
+ MNT_IUNLOCK(mp);
+ vfs_allocate_syncvnode(mp);
+ MNT_ILOCK(mp);
+ }
+ mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
+ mp->mnt_flag |= async_flag;
+ if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
+ (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
+ mp->mnt_kern_flag |= MNTK_ASYNC;
+ if (mp->mnt_kern_flag & MNTK_MWAIT) {
+ mp->mnt_kern_flag &= ~MNTK_MWAIT;
+ wakeup(mp);
+ }
+ MNT_IUNLOCK(mp);
+ if (coveredvp)
+ VOP_UNLOCK(coveredvp, 0);
+ return (error);
+ }
+ mtx_lock(&mountlist_mtx);
+ TAILQ_REMOVE(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+ EVENTHANDLER_INVOKE(vfs_unmounted, mp, td);
+ if (coveredvp != NULL) {
+ coveredvp->v_mountedhere = NULL;
+ vput(coveredvp);
+ }
+ vfs_event_signal(NULL, VQ_UNMOUNT, 0);
+ vfs_mount_destroy(mp);
+ return (0);
+}
+
+/*
+ * Report errors during filesystem mounting.
+ */
+void
+vfs_mount_error(struct mount *mp, const char *fmt, ...)
+{
+ struct vfsoptlist *moptlist = mp->mnt_optnew;
+ va_list ap;
+ int error, len;
+ char *errmsg;
+
+ error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
+ if (error || errmsg == NULL || len <= 0)
+ return;
+
+ va_start(ap, fmt);
+ vsnprintf(errmsg, (size_t)len, fmt, ap);
+ va_end(ap);
+}
+
+void
+vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...)
+{
+ va_list ap;
+ int error, len;
+ char *errmsg;
+
+ error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len);
+ if (error || errmsg == NULL || len <= 0)
+ return;
+
+ va_start(ap, fmt);
+ vsnprintf(errmsg, (size_t)len, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * ---------------------------------------------------------------------
+ * Functions for querying mount options/arguments from filesystems.
+ */
+
+/*
+ * Check that no unknown options are given
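+ *
+ * An option is accepted if its name, or the name with a leading "no"
+ * stripped, appears either in the global option list or in the
+ * filesystem-supplied "legal" list; otherwise EINVAL is returned and a
+ * message is placed in the "errmsg" option, if one is present.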
+ */
+int
+vfs_filteropt(struct vfsoptlist *opts, const char **legal)
+{
+ struct vfsopt *opt;
+ char errmsg[255];
+ const char **t, *p, *q;
+ int ret = 0;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ p = opt->name;
+ q = NULL;
+ if (p[0] == 'n' && p[1] == 'o')
+ q = p + 2;
+ for (t = global_opts; *t != NULL; t++) {
+ if (strcmp(*t, p) == 0)
+ break;
+ if (q != NULL) {
+ if (strcmp(*t, q) == 0)
+ break;
+ }
+ }
+ if (*t != NULL)
+ continue;
+ for (t = legal; *t != NULL; t++) {
+ if (strcmp(*t, p) == 0)
+ break;
+ if (q != NULL) {
+ if (strcmp(*t, q) == 0)
+ break;
+ }
+ }
+ if (*t != NULL)
+ continue;
+ snprintf(errmsg, sizeof(errmsg),
+ "mount option <%s> is unknown", p);
+ ret = EINVAL;
+ }
+ if (ret != 0) {
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(opt->name, "errmsg") == 0) {
+ strncpy((char *)opt->value, errmsg, opt->len);
+ break;
+ }
+ }
+ if (opt == NULL)
+ printf("%s\n", errmsg);
+ }
+ return (ret);
+}
+
+/*
+ * Get a mount option by its name.
+ *
+ * Return 0 if the option was found, ENOENT otherwise.
+ * If len is non-NULL it is filled with the length of the option's value.
+ * If buf is non-NULL it is filled with the address of the option's value.
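+ *
+ * A minimal usage sketch (the option name "from" is only an example):
+ *
+ *	char *from;
+ *	int len;
+ *
+ *	if (vfs_getopt(mp->mnt_optnew, "from", (void **)&from, &len) == 0 &&
+ *	    len > 0 && from[len - 1] == '\0')
+ *		printf("mounted from %s\n", from);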
+ */
+int
+vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len)
+{
+ struct vfsopt *opt;
+
+ KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) == 0) {
+ opt->seen = 1;
+ if (len != NULL)
+ *len = opt->len;
+ if (buf != NULL)
+ *buf = opt->value;
+ return (0);
+ }
+ }
+ return (ENOENT);
+}
+
+int
+vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
+{
+ struct vfsopt *opt;
+
+ if (opts == NULL)
+ return (-1);
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) == 0) {
+ opt->seen = 1;
+ return (opt->pos);
+ }
+ }
+ return (-1);
+}
+
+int
+vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value)
+{
+ char *opt_value, *vtp;
+ quad_t iv;
+ int error, opt_len;
+
+ error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len);
+ if (error != 0)
+ return (error);
+ if (opt_len == 0 || opt_value == NULL)
+ return (EINVAL);
+ if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0')
+ return (EINVAL);
+ iv = strtoq(opt_value, &vtp, 0);
+ if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0'))
+ return (EINVAL);
+ if (iv < 0)
+ return (EINVAL);
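+ /*
+ * The cases below intentionally fall through: each recognized
+ * suffix multiplies by a further factor of 1024, so e.g. "16m"
+ * yields 16 * 1024 * 1024.
+ */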
+ switch (vtp[0]) {
+ case 't':
+ case 'T':
+ iv *= 1024;
+ case 'g':
+ case 'G':
+ iv *= 1024;
+ case 'm':
+ case 'M':
+ iv *= 1024;
+ case 'k':
+ case 'K':
+ iv *= 1024;
+ case '\0':
+ break;
+ default:
+ return (EINVAL);
+ }
+ *value = iv;
+
+ return (0);
+}
+
+char *
+vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
+{
+ struct vfsopt *opt;
+
+ *error = 0;
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) != 0)
+ continue;
+ opt->seen = 1;
+ if (opt->len == 0 ||
+ ((char *)opt->value)[opt->len - 1] != '\0') {
+ *error = EINVAL;
+ return (NULL);
+ }
+ return (opt->value);
+ }
+ *error = ENOENT;
+ return (NULL);
+}
+
+int
+vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w,
+ uint64_t val)
+{
+ struct vfsopt *opt;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) == 0) {
+ opt->seen = 1;
+ if (w != NULL)
+ *w |= val;
+ return (1);
+ }
+ }
+ if (w != NULL)
+ *w &= ~val;
+ return (0);
+}
+
+int
+vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
+{
+ va_list ap;
+ struct vfsopt *opt;
+ int ret;
+
+ KASSERT(opts != NULL, ("vfs_scanopt: caller passed 'opts' as NULL"));
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) != 0)
+ continue;
+ opt->seen = 1;
+ if (opt->len == 0 || opt->value == NULL)
+ return (0);
+ if (((char *)opt->value)[opt->len - 1] != '\0')
+ return (0);
+ va_start(ap, fmt);
+ ret = vsscanf(opt->value, fmt, ap);
+ va_end(ap);
+ return (ret);
+ }
+ return (0);
+}
+
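+/*
+ * Overwrite the value of an existing option. When the option already
+ * carries a value buffer, vfs_setopt() requires the new length to match the
+ * stored length exactly, vfs_setopt_part() accepts a shorter value and
+ * shrinks the stored length, and vfs_setopts() copies a NUL-terminated
+ * string that must fit in the existing buffer. All three return ENOENT
+ * when the option is not present.
+ */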
+int
+vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len)
+{
+ struct vfsopt *opt;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) != 0)
+ continue;
+ opt->seen = 1;
+ if (opt->value == NULL)
+ opt->len = len;
+ else {
+ if (opt->len != len)
+ return (EINVAL);
+ bcopy(value, opt->value, len);
+ }
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len)
+{
+ struct vfsopt *opt;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) != 0)
+ continue;
+ opt->seen = 1;
+ if (opt->value == NULL)
+ opt->len = len;
+ else {
+ if (opt->len < len)
+ return (EINVAL);
+ opt->len = len;
+ bcopy(value, opt->value, len);
+ }
+ return (0);
+ }
+ return (ENOENT);
+}
+
+int
+vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value)
+{
+ struct vfsopt *opt;
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) != 0)
+ continue;
+ opt->seen = 1;
+ if (opt->value == NULL)
+ opt->len = strlen(value) + 1;
+ else if (strlcpy(opt->value, value, opt->len) >= opt->len)
+ return (EINVAL);
+ return (0);
+ }
+ return (ENOENT);
+}
+
+/*
+ * Find and copy a mount option.
+ *
+ * The size of the destination buffer must be specified in len; if it
+ * does not match the length of the mount option exactly, EINVAL is
+ * returned.
+ * Returns ENOENT if the option is not found.
+ */
+int
+vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len)
+{
+ struct vfsopt *opt;
+
+ KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
+
+ TAILQ_FOREACH(opt, opts, link) {
+ if (strcmp(name, opt->name) == 0) {
+ opt->seen = 1;
+ if (len != opt->len)
+ return (EINVAL);
+ bcopy(opt->value, dest, opt->len);
+ return (0);
+ }
+ }
+ return (ENOENT);
+}
+
+int
+__vfs_statfs(struct mount *mp, struct statfs *sbp)
+{
+ int error;
+
+ error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat);
+ if (sbp != &mp->mnt_stat)
+ *sbp = mp->mnt_stat;
+ return (error);
+}
+
+void
+vfs_mountedfrom(struct mount *mp, const char *from)
+{
+
+ bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
+ strlcpy(mp->mnt_stat.f_mntfromname, from,
+ sizeof mp->mnt_stat.f_mntfromname);
+}
+
+/*
+ * ---------------------------------------------------------------------
+ * This is the API for building mount arguments and mounting filesystems
+ * from inside the kernel.
+ *
+ * The API works by accumulating individual arguments; the first error
+ * encountered is latched and eventually returned by kernel_mount().
+ *
+ * XXX: should be documented in new manpage kernel_mount(9)
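+ *
+ * A minimal sketch; the filesystem type, device and mount point below are
+ * illustrative only, not taken from this file:
+ *
+ *	struct mntarg *ma;
+ *	int error;
+ *
+ *	ma = mount_arg(NULL, "fstype", "ufs", -1);
+ *	ma = mount_arg(ma, "fspath", "/mnt", -1);
+ *	ma = mount_arg(ma, "from", "/dev/ada0p2", -1);
+ *	ma = mount_argb(ma, 1, "noro");
+ *	error = kernel_mount(ma, MNT_RDONLY);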
+ */
+
+/* A memory allocation which must be freed when we are done */
+struct mntaarg {
+ SLIST_ENTRY(mntaarg) next;
+};
+
+/* The header for the mount arguments */
+struct mntarg {
+ struct iovec *v;
+ int len;
+ int error;
+ SLIST_HEAD(, mntaarg) list;
+};
+
+/*
+ * Add a boolean argument.
+ *
+ * flag is the boolean value.
+ * name must start with "no".
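+ *
+ * For example, mount_argb(ma, 1, "noro") records the positive option "ro",
+ * while mount_argb(ma, 0, "noro") records "noro".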
+ */
+struct mntarg *
+mount_argb(struct mntarg *ma, int flag, const char *name)
+{
+
+ KASSERT(name[0] == 'n' && name[1] == 'o',
+ ("mount_argb(...,%s): name must start with 'no'", name));
+
+ return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0));
+}
+
+/*
+ * Add an argument whose value is formatted printf-style.
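+ *
+ * For example (illustrative), mount_argf(ma, "from", "/dev/md%d", unit)
+ * records a "from" option whose value is the formatted device name.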
+ */
+struct mntarg *
+mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
+{
+ va_list ap;
+ struct mntaarg *maa;
+ struct sbuf *sb;
+ int len;
+
+ if (ma == NULL) {
+ ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
+ SLIST_INIT(&ma->list);
+ }
+ if (ma->error)
+ return (ma);
+
+ ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
+ M_MOUNT, M_WAITOK);
+ ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
+ ma->v[ma->len].iov_len = strlen(name) + 1;
+ ma->len++;
+
+ sb = sbuf_new_auto();
+ va_start(ap, fmt);
+ sbuf_vprintf(sb, fmt, ap);
+ va_end(ap);
+ sbuf_finish(sb);
+ len = sbuf_len(sb) + 1;
+ maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
+ SLIST_INSERT_HEAD(&ma->list, maa, next);
+ bcopy(sbuf_data(sb), maa + 1, len);
+ sbuf_delete(sb);
+
+ ma->v[ma->len].iov_base = maa + 1;
+ ma->v[ma->len].iov_len = len;
+ ma->len++;
+
+ return (ma);
+}
+
+/*
+ * Add an argument which is a userland string.
+ */
+struct mntarg *
+mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
+{
+ struct mntaarg *maa;
+ char *tbuf;
+
+ if (val == NULL)
+ return (ma);
+ if (ma == NULL) {
+ ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
+ SLIST_INIT(&ma->list);
+ }
+ if (ma->error)
+ return (ma);
+ maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
+ SLIST_INSERT_HEAD(&ma->list, maa, next);
+ tbuf = (void *)(maa + 1);
+ ma->error = copyinstr(val, tbuf, len, NULL);
+ return (mount_arg(ma, name, tbuf, -1));
+}
+
+/*
+ * Plain argument.
+ *
+ * If length is -1, treat value as a C string.
+ */
+struct mntarg *
+mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
+{
+
+ if (ma == NULL) {
+ ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
+ SLIST_INIT(&ma->list);
+ }
+ if (ma->error)
+ return (ma);
+
+ ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
+ M_MOUNT, M_WAITOK);
+ ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
+ ma->v[ma->len].iov_len = strlen(name) + 1;
+ ma->len++;
+
+ ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
+ if (len < 0)
+ ma->v[ma->len].iov_len = strlen(val) + 1;
+ else
+ ma->v[ma->len].iov_len = len;
+ ma->len++;
+ return (ma);
+}
+
+/*
+ * Free a mntarg structure
+ */
+static void
+free_mntarg(struct mntarg *ma)
+{
+ struct mntaarg *maa;
+
+ while (!SLIST_EMPTY(&ma->list)) {
+ maa = SLIST_FIRST(&ma->list);
+ SLIST_REMOVE_HEAD(&ma->list, next);
+ free(maa, M_MOUNT);
+ }
+ free(ma->v, M_MOUNT);
+ free(ma, M_MOUNT);
+}
+
+/*
+ * Mount a filesystem
+ */
+int
+kernel_mount(struct mntarg *ma, uint64_t flags)
+{
+ struct uio auio;
+ int error;
+
+ KASSERT(ma != NULL, ("kernel_mount NULL ma"));
+ KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v"));
+ KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
+
+ auio.uio_iov = ma->v;
+ auio.uio_iovcnt = ma->len;
+ auio.uio_segflg = UIO_SYSSPACE;
+
+ error = ma->error;
+ if (!error)
+ error = vfs_donmount(curthread, flags, &auio);
+ free_mntarg(ma);
+ return (error);
+}
+
+/*
+ * A printflike function to mount a filesystem.
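+ *
+ * Arguments are name/value string pairs terminated by a NULL name, e.g.
+ * (with an illustrative device and mount point):
+ *
+ *	kernel_vmount(MNT_RDONLY,
+ *	    "fstype", "cd9660",
+ *	    "fspath", "/mnt",
+ *	    "from", "/dev/cd0",
+ *	    NULL);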
+ */
+int
+kernel_vmount(int flags, ...)
+{
+ struct mntarg *ma = NULL;
+ va_list ap;
+ const char *cp;
+ const void *vp;
+ int error;
+
+ va_start(ap, flags);
+ for (;;) {
+ cp = va_arg(ap, const char *);
+ if (cp == NULL)
+ break;
+ vp = va_arg(ap, const void *);
+ ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0));
+ }
+ va_end(ap);
+
+ error = kernel_mount(ma, flags);
+ return (error);
+}
+
+void
+vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp)
+{
+
+ bcopy(oexp, exp, sizeof(*oexp));
+ exp->ex_numsecflavors = 0;
+}
diff --git a/sys/kern/vfs_mountroot.c b/sys/kern/vfs_mountroot.c
new file mode 100644
index 0000000..322fc9a
--- /dev/null
+++ b/sys/kern/vfs_mountroot.c
@@ -0,0 +1,1041 @@
+/*-
+ * Copyright (c) 2010 Marcel Moolenaar
+ * Copyright (c) 1999-2004 Poul-Henning Kamp
+ * Copyright (c) 1999 Michael Smith
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_rootdevname.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/cons.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mdioctl.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/filedesc.h>
+#include <sys/reboot.h>
+#include <sys/sbuf.h>
+#include <sys/stat.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+
+#include <geom/geom.h>
+
+/*
+ * The root filesystem is detailed in the kernel environment variable
+ * vfs.root.mountfrom, which is expected to be in the general format
+ *
+ * <vfsname>:[<path>][ <vfsname>:[<path>] ...]
+ * vfsname := the name of a VFS known to the kernel and capable
+ * of being mounted as root
+ * path := disk device name or other data used by the filesystem
+ * to locate its physical store
+ *
+ * If the environment variable vfs.root.mountfrom is a space-separated list,
+ * each list element is tried in turn and the root filesystem will be
+ * mounted from the first one that succeeds.
+ *
+ * The environment variable vfs.root.mountfrom.options is a comma-delimited
+ * set of string mount options. These mount options must be parseable
+ * by nmount() in the kernel.
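+ *
+ * For example (device names are purely illustrative):
+ *
+ *	vfs.root.mountfrom="zfs:tank ufs:/dev/da0s1a"
+ *	vfs.root.mountfrom.options="ro,noatime"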
+ */
+
+static int parse_mount(char **);
+static struct mntarg *parse_mountroot_options(struct mntarg *, const char *);
+
+/*
+ * The vnode of the system's root (/ in the filesystem, without chroot
+ * active).
+ */
+struct vnode *rootvnode;
+
+char *rootdevnames[2] = {NULL, NULL};
+
+struct root_hold_token {
+ const char *who;
+ LIST_ENTRY(root_hold_token) list;
+};
+
+static LIST_HEAD(, root_hold_token) root_holds =
+ LIST_HEAD_INITIALIZER(root_holds);
+
+enum action {
+ A_CONTINUE,
+ A_PANIC,
+ A_REBOOT,
+ A_RETRY
+};
+
+static enum action root_mount_onfail = A_CONTINUE;
+
+static int root_mount_mddev;
+static int root_mount_complete;
+
+/* By default wait up to 3 seconds for devices to appear. */
+static int root_mount_timeout = 3;
+TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout);
+
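+/*
+ * Take a hold that delays mounting of the root filesystem: vfs_mountroot()
+ * waits until every outstanding hold has been released again with
+ * root_mount_rel(). Returns NULL (a no-op token) if the root filesystem
+ * has already been mounted.
+ */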
+struct root_hold_token *
+root_mount_hold(const char *identifier)
+{
+ struct root_hold_token *h;
+
+ if (root_mounted())
+ return (NULL);
+
+ h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK);
+ h->who = identifier;
+ mtx_lock(&mountlist_mtx);
+ LIST_INSERT_HEAD(&root_holds, h, list);
+ mtx_unlock(&mountlist_mtx);
+ return (h);
+}
+
+void
+root_mount_rel(struct root_hold_token *h)
+{
+
+ if (h == NULL)
+ return;
+ mtx_lock(&mountlist_mtx);
+ LIST_REMOVE(h, list);
+ wakeup(&root_holds);
+ mtx_unlock(&mountlist_mtx);
+ free(h, M_DEVBUF);
+}
+
+int
+root_mounted(void)
+{
+
+ /* No mutex is acquired here because int stores are atomic. */
+ return (root_mount_complete);
+}
+
+void
+root_mount_wait(void)
+{
+
+ /*
+ * Panic on an obvious deadlock: this function cannot be called from
+ * the swapper thread, which runs all the SYSINITs during boot.
+ */
+ KASSERT(curthread->td_proc->p_pid != 0,
+ ("root_mount_wait: cannot be called from the swapper thread"));
+ mtx_lock(&mountlist_mtx);
+ while (!root_mount_complete) {
+ msleep(&root_mount_complete, &mountlist_mtx, PZERO, "rootwait",
+ hz);
+ }
+ mtx_unlock(&mountlist_mtx);
+}
+
+static void
+set_rootvnode(void)
+{
+ struct proc *p;
+
+ if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode))
+ panic("Cannot find root vnode");
+
+ VOP_UNLOCK(rootvnode, 0);
+
+ p = curthread->td_proc;
+ FILEDESC_XLOCK(p->p_fd);
+
+ if (p->p_fd->fd_cdir != NULL)
+ vrele(p->p_fd->fd_cdir);
+ p->p_fd->fd_cdir = rootvnode;
+ VREF(rootvnode);
+
+ if (p->p_fd->fd_rdir != NULL)
+ vrele(p->p_fd->fd_rdir);
+ p->p_fd->fd_rdir = rootvnode;
+ VREF(rootvnode);
+
+ FILEDESC_XUNLOCK(p->p_fd);
+}
+
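+/*
+ * Mount devfs and make it the temporary root so that device nodes are
+ * available while the real root filesystem is being located and mounted.
+ */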
+static int
+vfs_mountroot_devfs(struct thread *td, struct mount **mpp)
+{
+ struct vfsoptlist *opts;
+ struct vfsconf *vfsp;
+ struct mount *mp;
+ int error;
+
+ *mpp = NULL;
+
+ vfsp = vfs_byname("devfs");
+ KASSERT(vfsp != NULL, ("Could not find devfs by name"));
+ if (vfsp == NULL)
+ return (ENOENT);
+
+ mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred);
+
+ error = VFS_MOUNT(mp);
+ KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
+ if (error)
+ return (error);
+
+ opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
+ TAILQ_INIT(opts);
+ mp->mnt_opt = opts;
+
+ mtx_lock(&mountlist_mtx);
+ TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+
+ *mpp = mp;
+ set_rootvnode();
+
+ error = kern_symlink(td, "/", "dev", UIO_SYSSPACE);
+ if (error)
+ printf("kern_symlink /dev -> / returns %d\n", error);
+
+ return (error);
+}
+
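+/*
+ * Re-arrange the mount list once the real root has been mounted: the new
+ * root moves to the head of the list, the previous root (when it is not
+ * devfs itself) is remounted under /.mount or /mnt, and devfs is remounted
+ * under /dev.
+ */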
+static int
+vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs)
+{
+ struct nameidata nd;
+ struct mount *mporoot, *mpnroot;
+ struct vnode *vp, *vporoot, *vpdevfs;
+ char *fspath;
+ int error;
+
+ mpnroot = TAILQ_NEXT(mpdevfs, mnt_list);
+
+ /* Shuffle the mountlist. */
+ mtx_lock(&mountlist_mtx);
+ mporoot = TAILQ_FIRST(&mountlist);
+ TAILQ_REMOVE(&mountlist, mpdevfs, mnt_list);
+ if (mporoot != mpdevfs) {
+ TAILQ_REMOVE(&mountlist, mpnroot, mnt_list);
+ TAILQ_INSERT_HEAD(&mountlist, mpnroot, mnt_list);
+ }
+ TAILQ_INSERT_TAIL(&mountlist, mpdevfs, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+
+ cache_purgevfs(mporoot);
+ if (mporoot != mpdevfs)
+ cache_purgevfs(mpdevfs);
+
+ VFS_ROOT(mporoot, LK_EXCLUSIVE, &vporoot);
+
+ VI_LOCK(vporoot);
+ vporoot->v_iflag &= ~VI_MOUNT;
+ VI_UNLOCK(vporoot);
+ vporoot->v_mountedhere = NULL;
+ mporoot->mnt_flag &= ~MNT_ROOTFS;
+ mporoot->mnt_vnodecovered = NULL;
+ vput(vporoot);
+
+ /* Set up the new rootvnode, and purge the cache */
+ mpnroot->mnt_vnodecovered = NULL;
+ set_rootvnode();
+ cache_purgevfs(rootvnode->v_mount);
+
+ if (mporoot != mpdevfs) {
+ /* Remount old root under /.mount or /mnt */
+ fspath = "/.mount";
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
+ fspath, td);
+ error = namei(&nd);
+ if (error) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ fspath = "/mnt";
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
+ fspath, td);
+ error = namei(&nd);
+ }
+ if (!error) {
+ vp = nd.ni_vp;
+ error = (vp->v_type == VDIR) ? 0 : ENOTDIR;
+ if (!error)
+ error = vinvalbuf(vp, V_SAVE, 0, 0);
+ if (!error) {
+ cache_purge(vp);
+ mporoot->mnt_vnodecovered = vp;
+ vp->v_mountedhere = mporoot;
+ strlcpy(mporoot->mnt_stat.f_mntonname,
+ fspath, MNAMELEN);
+ VOP_UNLOCK(vp, 0);
+ } else
+ vput(vp);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ if (error && bootverbose)
+ printf("mountroot: unable to remount previous root "
+ "under /.mount or /mnt (error %d).\n", error);
+ }
+
+ /* Remount devfs under /dev */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td);
+ error = namei(&nd);
+ if (!error) {
+ vp = nd.ni_vp;
+ error = (vp->v_type == VDIR) ? 0 : ENOTDIR;
+ if (!error)
+ error = vinvalbuf(vp, V_SAVE, 0, 0);
+ if (!error) {
+ vpdevfs = mpdevfs->mnt_vnodecovered;
+ if (vpdevfs != NULL) {
+ cache_purge(vpdevfs);
+ vpdevfs->v_mountedhere = NULL;
+ vrele(vpdevfs);
+ }
+ mpdevfs->mnt_vnodecovered = vp;
+ vp->v_mountedhere = mpdevfs;
+ VOP_UNLOCK(vp, 0);
+ } else
+ vput(vp);
+ }
+ if (error && bootverbose)
+ printf("mountroot: unable to remount devfs under /dev "
+ "(error %d).\n", error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ if (mporoot == mpdevfs) {
+ vfs_unbusy(mpdevfs);
+ /* Unlink the no longer needed /dev/dev -> / symlink */
+ error = kern_unlink(td, "/dev/dev", UIO_SYSSPACE);
+ if (error && bootverbose)
+ printf("mountroot: unable to unlink /dev/dev "
+ "(error %d)\n", error);
+ }
+
+ return (0);
+}
+
+/*
+ * Configuration parser.
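+ *
+ * The parser walks the configuration assembled by vfs_mountroot_conf0()
+ * (and later re-read from /.mount.conf), one entry per line: lines starting
+ * with '#' are comments, lines starting with '.' are directives (.ask, .md,
+ * .onfail, .timeout) and any other line is tried as a
+ * "<fstype>:<device> [options]" root specification.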
+ */
+
+/* Parser character classes. */
+#define CC_WHITESPACE -1
+#define CC_NONWHITESPACE -2
+
+/* Parse errors. */
+#define PE_EOF -1
+#define PE_EOL -2
+
+static __inline int
+parse_peek(char **conf)
+{
+
+ return (**conf);
+}
+
+static __inline void
+parse_poke(char **conf, int c)
+{
+
+ **conf = c;
+}
+
+static __inline void
+parse_advance(char **conf)
+{
+
+ (*conf)++;
+}
+
+static __inline int
+parse_isspace(int c)
+{
+
+ return ((c == ' ' || c == '\t' || c == '\n') ? 1 : 0);
+}
+
+static int
+parse_skipto(char **conf, int mc)
+{
+ int c, match;
+
+ while (1) {
+ c = parse_peek(conf);
+ if (c == 0)
+ return (PE_EOF);
+ switch (mc) {
+ case CC_WHITESPACE:
+ match = (c == ' ' || c == '\t' || c == '\n') ? 1 : 0;
+ break;
+ case CC_NONWHITESPACE:
+ if (c == '\n')
+ return (PE_EOL);
+ match = (c != ' ' && c != '\t') ? 1 : 0;
+ break;
+ default:
+ match = (c == mc) ? 1 : 0;
+ break;
+ }
+ if (match)
+ break;
+ parse_advance(conf);
+ }
+ return (0);
+}
+
+static int
+parse_token(char **conf, char **tok)
+{
+ char *p;
+ size_t len;
+ int error;
+
+ *tok = NULL;
+ error = parse_skipto(conf, CC_NONWHITESPACE);
+ if (error)
+ return (error);
+ p = *conf;
+ error = parse_skipto(conf, CC_WHITESPACE);
+ len = *conf - p;
+ *tok = malloc(len + 1, M_TEMP, M_WAITOK | M_ZERO);
+ bcopy(p, *tok, len);
+ return (0);
+}
+
+static void
+parse_dir_ask_printenv(const char *var)
+{
+ char *val;
+
+ val = getenv(var);
+ if (val != NULL) {
+ printf(" %s=%s\n", var, val);
+ freeenv(val);
+ }
+}
+
+static int
+parse_dir_ask(char **conf)
+{
+ char name[80];
+ char *mnt;
+ int error;
+
+ printf("\nLoader variables:\n");
+ parse_dir_ask_printenv("vfs.root.mountfrom");
+ parse_dir_ask_printenv("vfs.root.mountfrom.options");
+
+ printf("\nManual root filesystem specification:\n");
+ printf(" <fstype>:<device> [options]\n");
+ printf(" Mount <device> using filesystem <fstype>\n");
+ printf(" and with the specified (optional) option list.\n");
+ printf("\n");
+ printf(" eg. ufs:/dev/da0s1a\n");
+ printf(" zfs:tank\n");
+ printf(" cd9660:/dev/acd0 ro\n");
+ printf(" (which is equivalent to: ");
+ printf("mount -t cd9660 -o ro /dev/acd0 /)\n");
+ printf("\n");
+ printf(" ? List valid disk boot devices\n");
+ printf(" . Yield 1 second (for background tasks)\n");
+ printf(" <empty line> Abort manual input\n");
+
+ do {
+ error = EINVAL;
+ printf("\nmountroot> ");
+ cngets(name, sizeof(name), GETS_ECHO);
+ if (name[0] == '\0')
+ break;
+ if (name[0] == '?' && name[1] == '\0') {
+ printf("\nList of GEOM managed disk devices:\n ");
+ g_dev_print();
+ continue;
+ }
+ if (name[0] == '.' && name[1] == '\0') {
+ pause("rmask", hz);
+ continue;
+ }
+ mnt = name;
+ error = parse_mount(&mnt);
+ if (error == -1)
+ printf("Invalid file system specification.\n");
+ } while (error != 0);
+
+ return (error);
+}
+
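+/*
+ * Handle the ".md <file>" directive: attach the named file as a read-only,
+ * vnode-backed md(4) device so that a following mount specification can
+ * refer to it as "md#" (parse_mount() substitutes the unit number).
+ */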
+static int
+parse_dir_md(char **conf)
+{
+ struct stat sb;
+ struct thread *td;
+ struct md_ioctl *mdio;
+ char *path, *tok;
+ int error, fd, len;
+
+ td = curthread;
+
+ error = parse_token(conf, &tok);
+ if (error)
+ return (error);
+
+ len = strlen(tok);
+ mdio = malloc(sizeof(*mdio) + len + 1, M_TEMP, M_WAITOK | M_ZERO);
+ path = (void *)(mdio + 1);
+ bcopy(tok, path, len);
+ free(tok, M_TEMP);
+
+ /* Get file status. */
+ error = kern_stat(td, path, UIO_SYSSPACE, &sb);
+ if (error)
+ goto out;
+
+ /* Open /dev/mdctl so that we can attach/detach. */
+ error = kern_open(td, "/dev/" MDCTL_NAME, UIO_SYSSPACE, O_RDWR, 0);
+ if (error)
+ goto out;
+
+ fd = td->td_retval[0];
+ mdio->md_version = MDIOVERSION;
+ mdio->md_type = MD_VNODE;
+
+ if (root_mount_mddev != -1) {
+ mdio->md_unit = root_mount_mddev;
+ DROP_GIANT();
+ error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio);
+ PICKUP_GIANT();
+ /* Ignore errors. We don't care. */
+ root_mount_mddev = -1;
+ }
+
+ mdio->md_file = (void *)(mdio + 1);
+ mdio->md_options = MD_AUTOUNIT | MD_READONLY;
+ mdio->md_mediasize = sb.st_size;
+ mdio->md_unit = 0;
+ DROP_GIANT();
+ error = kern_ioctl(td, fd, MDIOCATTACH, (void *)mdio);
+ PICKUP_GIANT();
+ if (error)
+ goto out;
+
+ if (mdio->md_unit > 9) {
+ printf("rootmount: too many md units\n");
+ mdio->md_file = NULL;
+ mdio->md_options = 0;
+ mdio->md_mediasize = 0;
+ DROP_GIANT();
+ error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio);
+ PICKUP_GIANT();
+ /* Ignore errors. We don't care. */
+ error = ERANGE;
+ goto out;
+ }
+
+ root_mount_mddev = mdio->md_unit;
+ printf(MD_NAME "%u attached to %s\n", root_mount_mddev, mdio->md_file);
+
+ error = kern_close(td, fd);
+
+ out:
+ free(mdio, M_TEMP);
+ return (error);
+}
+
+static int
+parse_dir_onfail(char **conf)
+{
+ char *action;
+ int error;
+
+ error = parse_token(conf, &action);
+ if (error)
+ return (error);
+
+ if (!strcmp(action, "continue"))
+ root_mount_onfail = A_CONTINUE;
+ else if (!strcmp(action, "panic"))
+ root_mount_onfail = A_PANIC;
+ else if (!strcmp(action, "reboot"))
+ root_mount_onfail = A_REBOOT;
+ else if (!strcmp(action, "retry"))
+ root_mount_onfail = A_RETRY;
+ else {
+ printf("rootmount: %s: unknown action\n", action);
+ error = EINVAL;
+ }
+
+ free(action, M_TEMP);
+ return (error);
+}
+
+static int
+parse_dir_timeout(char **conf)
+{
+ char *tok, *endtok;
+ long secs;
+ int error;
+
+ error = parse_token(conf, &tok);
+ if (error)
+ return (error);
+
+ secs = strtol(tok, &endtok, 0);
+ error = (secs < 0 || *endtok != '\0') ? EINVAL : 0;
+ if (!error)
+ root_mount_timeout = secs;
+ free(tok, M_TEMP);
+ return (error);
+}
+
+static int
+parse_directive(char **conf)
+{
+ char *dir;
+ int error;
+
+ error = parse_token(conf, &dir);
+ if (error)
+ return (error);
+
+ if (strcmp(dir, ".ask") == 0)
+ error = parse_dir_ask(conf);
+ else if (strcmp(dir, ".md") == 0)
+ error = parse_dir_md(conf);
+ else if (strcmp(dir, ".onfail") == 0)
+ error = parse_dir_onfail(conf);
+ else if (strcmp(dir, ".timeout") == 0)
+ error = parse_dir_timeout(conf);
+ else {
+ printf("mountroot: invalid directive `%s'\n", dir);
+ /* Ignore the rest of the line. */
+ (void)parse_skipto(conf, '\n');
+ error = EINVAL;
+ }
+ free(dir, M_TEMP);
+ return (error);
+}
+
+static int
+parse_mount_dev_present(const char *dev)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, dev, curthread);
+ error = namei(&nd);
+ if (!error)
+ vput(nd.ni_vp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ return (error != 0) ? 0 : 1;
+}
+
+#define ERRMSGL 255
+static int
+parse_mount(char **conf)
+{
+ char *errmsg;
+ struct mntarg *ma;
+ char *dev, *fs, *opts, *tok;
+ int delay, error, timeout;
+
+ error = parse_token(conf, &tok);
+ if (error)
+ return (error);
+ fs = tok;
+ error = parse_skipto(&tok, ':');
+ if (error) {
+ free(fs, M_TEMP);
+ return (error);
+ }
+ parse_poke(&tok, '\0');
+ parse_advance(&tok);
+ dev = tok;
+
+ if (root_mount_mddev != -1) {
+ /* Handle substitution for the md unit number. */
+ tok = strstr(dev, "md#");
+ if (tok != NULL)
+ tok[2] = '0' + root_mount_mddev;
+ }
+
+ /* Parse options. */
+ error = parse_token(conf, &tok);
+ opts = (error == 0) ? tok : NULL;
+
+ printf("Trying to mount root from %s:%s [%s]...\n", fs, dev,
+ (opts != NULL) ? opts : "");
+
+ errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO);
+
+ if (vfs_byname(fs) == NULL) {
+ strlcpy(errmsg, "unknown file system", ERRMSGL);
+ error = ENOENT;
+ goto out;
+ }
+
+ if (strcmp(fs, "zfs") != 0 && strstr(fs, "nfs") == NULL &&
+ dev[0] != '\0' && !parse_mount_dev_present(dev)) {
+ printf("mountroot: waiting for device %s ...\n", dev);
+ delay = hz / 10;
+ timeout = root_mount_timeout * hz;
+ do {
+ pause("rmdev", delay);
+ timeout -= delay;
+ } while (timeout > 0 && !parse_mount_dev_present(dev));
+ if (timeout <= 0) {
+ error = ENODEV;
+ goto out;
+ }
+ }
+
+ ma = NULL;
+ ma = mount_arg(ma, "fstype", fs, -1);
+ ma = mount_arg(ma, "fspath", "/", -1);
+ ma = mount_arg(ma, "from", dev, -1);
+ ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL);
+ ma = mount_arg(ma, "ro", NULL, 0);
+ ma = parse_mountroot_options(ma, opts);
+ error = kernel_mount(ma, MNT_ROOTFS);
+
+ out:
+ if (error) {
+ printf("Mounting from %s:%s failed with error %d",
+ fs, dev, error);
+ if (errmsg[0] != '\0')
+ printf(": %s", errmsg);
+ printf(".\n");
+ }
+ free(fs, M_TEMP);
+ free(errmsg, M_TEMP);
+ if (opts != NULL)
+ free(opts, M_TEMP);
+ /* kernel_mount can return -1 on error. */
+ return ((error < 0) ? EDOOFUS : error);
+}
+#undef ERRMSGL
+
+static int
+vfs_mountroot_parse(struct sbuf *sb, struct mount *mpdevfs)
+{
+ struct mount *mp;
+ char *conf;
+ int error;
+
+ root_mount_mddev = -1;
+
+retry:
+ conf = sbuf_data(sb);
+ mp = TAILQ_NEXT(mpdevfs, mnt_list);
+ error = (mp == NULL) ? 0 : EDOOFUS;
+ root_mount_onfail = A_CONTINUE;
+ while (mp == NULL) {
+ error = parse_skipto(&conf, CC_NONWHITESPACE);
+ if (error == PE_EOL) {
+ parse_advance(&conf);
+ continue;
+ }
+ if (error < 0)
+ break;
+ switch (parse_peek(&conf)) {
+ case '#':
+ error = parse_skipto(&conf, '\n');
+ break;
+ case '.':
+ error = parse_directive(&conf);
+ break;
+ default:
+ error = parse_mount(&conf);
+ break;
+ }
+ if (error < 0)
+ break;
+ /* Ignore any trailing garbage on the line. */
+ if (parse_peek(&conf) != '\n') {
+ printf("mountroot: advancing to next directive...\n");
+ (void)parse_skipto(&conf, '\n');
+ }
+ mp = TAILQ_NEXT(mpdevfs, mnt_list);
+ }
+ if (mp != NULL)
+ return (0);
+
+ /*
+ * We failed to mount (a new) root.
+ */
+ switch (root_mount_onfail) {
+ case A_CONTINUE:
+ break;
+ case A_PANIC:
+ panic("mountroot: unable to (re-)mount root.");
+ /* NOTREACHED */
+ case A_RETRY:
+ goto retry;
+ case A_REBOOT:
+ kern_reboot(RB_NOSYNC);
+ /* NOTREACHED */
+ }
+
+ return (error);
+}
+
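+/*
+ * Build the default root mount configuration from the compile-time
+ * ROOTDEVNAME option, the boot flags (RB_ASKNAME, RB_DFLTROOT, RB_CDROM),
+ * the vfs.root.mountfrom environment variables and rootdevnames[].
+ */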
+static void
+vfs_mountroot_conf0(struct sbuf *sb)
+{
+ char *s, *tok, *mnt, *opt;
+ int error;
+
+ sbuf_printf(sb, ".onfail panic\n");
+ sbuf_printf(sb, ".timeout %d\n", root_mount_timeout);
+ if (boothowto & RB_ASKNAME)
+ sbuf_printf(sb, ".ask\n");
+#ifdef ROOTDEVNAME
+ if (boothowto & RB_DFLTROOT)
+ sbuf_printf(sb, "%s\n", ROOTDEVNAME);
+#endif
+ if (boothowto & RB_CDROM) {
+ sbuf_printf(sb, "cd9660:/dev/cd0 ro\n");
+ sbuf_printf(sb, ".timeout 0\n");
+ sbuf_printf(sb, "cd9660:/dev/acd0 ro\n");
+ sbuf_printf(sb, ".timeout %d\n", root_mount_timeout);
+ }
+ s = getenv("vfs.root.mountfrom");
+ if (s != NULL) {
+ opt = getenv("vfs.root.mountfrom.options");
+ tok = s;
+ error = parse_token(&tok, &mnt);
+ while (!error) {
+ sbuf_printf(sb, "%s %s\n", mnt,
+ (opt != NULL) ? opt : "");
+ free(mnt, M_TEMP);
+ error = parse_token(&tok, &mnt);
+ }
+ if (opt != NULL)
+ freeenv(opt);
+ freeenv(s);
+ }
+ if (rootdevnames[0] != NULL)
+ sbuf_printf(sb, "%s\n", rootdevnames[0]);
+ if (rootdevnames[1] != NULL)
+ sbuf_printf(sb, "%s\n", rootdevnames[1]);
+#ifdef ROOTDEVNAME
+ if (!(boothowto & RB_DFLTROOT))
+ sbuf_printf(sb, "%s\n", ROOTDEVNAME);
+#endif
+ if (!(boothowto & RB_ASKNAME))
+ sbuf_printf(sb, ".ask\n");
+}
+
+static int
+vfs_mountroot_readconf(struct thread *td, struct sbuf *sb)
+{
+ static char buf[128];
+ struct nameidata nd;
+ off_t ofs;
+ ssize_t resid;
+ int error, flags, len;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf", td);
+ flags = FREAD;
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error)
+ return (error);
+
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ ofs = 0;
+ len = sizeof(buf) - 1;
+ while (1) {
+ error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs,
+ UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
+ NOCRED, &resid, td);
+ if (error)
+ break;
+ if (resid == len)
+ break;
+ buf[len - resid] = 0;
+ sbuf_printf(sb, "%s", buf);
+ ofs += len - resid;
+ }
+
+ VOP_UNLOCK(nd.ni_vp, 0);
+ vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+ return (error);
+}
+
+static void
+vfs_mountroot_wait(void)
+{
+ struct root_hold_token *h;
+ struct timeval lastfail;
+ int curfail;
+
+ curfail = 0;
+ while (1) {
+ DROP_GIANT();
+ g_waitidle();
+ PICKUP_GIANT();
+ mtx_lock(&mountlist_mtx);
+ if (LIST_EMPTY(&root_holds)) {
+ mtx_unlock(&mountlist_mtx);
+ break;
+ }
+ if (ppsratecheck(&lastfail, &curfail, 1)) {
+ printf("Root mount waiting for:");
+ LIST_FOREACH(h, &root_holds, list)
+ printf(" %s", h->who);
+ printf("\n");
+ }
+ msleep(&root_holds, &mountlist_mtx, PZERO | PDROP, "roothold",
+ hz);
+ }
+}
+
+void
+vfs_mountroot(void)
+{
+ struct mount *mp;
+ struct sbuf *sb;
+ struct thread *td;
+ time_t timebase;
+ int error;
+
+ td = curthread;
+
+ vfs_mountroot_wait();
+
+ sb = sbuf_new_auto();
+ vfs_mountroot_conf0(sb);
+ sbuf_finish(sb);
+
+ error = vfs_mountroot_devfs(td, &mp);
+ while (!error) {
+ error = vfs_mountroot_parse(sb, mp);
+ if (!error) {
+ error = vfs_mountroot_shuffle(td, mp);
+ if (!error) {
+ sbuf_clear(sb);
+ error = vfs_mountroot_readconf(td, sb);
+ sbuf_finish(sb);
+ }
+ }
+ }
+
+ sbuf_delete(sb);
+
+ /*
+ * Iterate over all currently mounted file systems and use
+ * the time stamp found to check and/or initialize the RTC.
+ * Call inittodr() only once and pass it the largest of the
+ * timestamps we encounter.
+ */
+ timebase = 0;
+ mtx_lock(&mountlist_mtx);
+ mp = TAILQ_FIRST(&mountlist);
+ while (mp != NULL) {
+ if (mp->mnt_time > timebase)
+ timebase = mp->mnt_time;
+ mp = TAILQ_NEXT(mp, mnt_list);
+ }
+ mtx_unlock(&mountlist_mtx);
+ inittodr(timebase);
+
+ /* Keep prison0's root in sync with the global rootvnode. */
+ mtx_lock(&prison0.pr_mtx);
+ prison0.pr_root = rootvnode;
+ vref(prison0.pr_root);
+ mtx_unlock(&prison0.pr_mtx);
+
+ mtx_lock(&mountlist_mtx);
+ atomic_store_rel_int(&root_mount_complete, 1);
+ wakeup(&root_mount_complete);
+ mtx_unlock(&mountlist_mtx);
+
+ EVENTHANDLER_INVOKE(mountroot);
+}
+
+static struct mntarg *
+parse_mountroot_options(struct mntarg *ma, const char *options)
+{
+ char *p;
+ char *name, *name_arg;
+ char *val, *val_arg;
+ char *opts;
+
+ if (options == NULL || options[0] == '\0')
+ return (ma);
+
+ p = opts = strdup(options, M_MOUNT);
+ if (opts == NULL) {
+ return (ma);
+ }
+
+ while ((name = strsep(&p, ",")) != NULL) {
+ if (name[0] == '\0')
+ break;
+
+ val = strchr(name, '=');
+ if (val != NULL) {
+ *val = '\0';
+ ++val;
+ }
+ if (strcmp(name, "rw") == 0 ||
+ strcmp(name, "noro") == 0) {
+ /*
+ * The first time we mount the root file system
+ * we need to mount it 'ro', so ignore the 'rw'
+ * and 'noro' mount options.
+ */
+ continue;
+ }
+ name_arg = strdup(name, M_MOUNT);
+ val_arg = NULL;
+ if (val != NULL)
+ val_arg = strdup(val, M_MOUNT);
+
+ ma = mount_arg(ma, name_arg, val_arg,
+ (val_arg != NULL ? -1 : 0));
+ }
+ free(opts, M_MOUNT);
+ return (ma);
+}
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
new file mode 100644
index 0000000..3cbc95f
--- /dev/null
+++ b/sys/kern/vfs_subr.c
@@ -0,0 +1,4775 @@
+/*-
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
+ */
+
+/*
+ * External virtual filesystem routines
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_ddb.h"
+#include "opt_watchdog.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/condvar.h>
+#include <sys/conf.h>
+#include <sys/dirent.h>
+#include <sys/event.h>
+#include <sys/eventhandler.h>
+#include <sys/extattr.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/jail.h>
+#include <sys/kdb.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lockf.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/pctrie.h>
+#include <sys/priv.h>
+#include <sys/reboot.h>
+#include <sys/rwlock.h>
+#include <sys/sched.h>
+#include <sys/sleepqueue.h>
+#include <sys/smp.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/vmmeter.h>
+#include <sys/vnode.h>
+#include <sys/watchdog.h>
+
+#include <machine/stdarg.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_kern.h>
+#include <vm/uma.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+static void delmntque(struct vnode *vp);
+static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
+ int slpflag, int slptimeo);
+static void syncer_shutdown(void *arg, int howto);
+static int vtryrecycle(struct vnode *vp);
+static void v_incr_usecount(struct vnode *);
+static void v_decr_usecount(struct vnode *);
+static void v_decr_useonly(struct vnode *);
+static void v_upgrade_usecount(struct vnode *);
+static void vnlru_free(int);
+static void vgonel(struct vnode *);
+static void vfs_knllock(void *arg);
+static void vfs_knlunlock(void *arg);
+static void vfs_knl_assert_locked(void *arg);
+static void vfs_knl_assert_unlocked(void *arg);
+static void destroy_vpollinfo(struct vpollinfo *vi);
+
+/*
+ * Number of vnodes in existence. Increased whenever getnewvnode()
+ * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
+ */
+static unsigned long numvnodes;
+
+SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
+ "Number of vnodes in existence");
+
+/*
+ * Conversion tables for conversion from vnode types to inode formats
+ * and back.
+ */
+enum vtype iftovt_tab[16] = {
+ VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
+ VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
+};
+int vttoif_tab[10] = {
+ 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
+ S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
+};
+
+/*
+ * List of vnodes that are ready for recycling.
+ */
+static TAILQ_HEAD(freelst, vnode) vnode_free_list;
+
+/*
+ * Free vnode target. Free vnodes may simply be files which have been stat'd
+ * but not read. This is somewhat common, and a small cache of such files
+ * should be kept to avoid recreation costs.
+ */
+static u_long wantfreevnodes;
+SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
+/* Number of vnodes in the free list. */
+static u_long freevnodes;
+SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
+ "Number of vnodes in the free list");
+
+static int vlru_allow_cache_src;
+SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
+ &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
+
+/*
+ * Various variables used for debugging the new implementation of
+ * reassignbuf().
+ * XXX these are probably of (very) limited utility now.
+ */
+static int reassignbufcalls;
+SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
+ "Number of calls to reassignbuf");
+
+/*
+ * Cache for the mount type id assigned to NFS. This is used for
+ * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
+ */
+int nfs_mount_type = -1;
+
+/* To keep more than one thread at a time from running vfs_getnewfsid */
+static struct mtx mntid_mtx;
+
+/*
+ * Lock for any access to the following:
+ * vnode_free_list
+ * numvnodes
+ * freevnodes
+ */
+static struct mtx vnode_free_list_mtx;
+
+/* Publicly exported FS */
+struct nfs_public nfs_pub;
+
+static uma_zone_t buf_trie_zone;
+
+/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
+static uma_zone_t vnode_zone;
+static uma_zone_t vnodepoll_zone;
+
+/*
+ * The workitem queue.
+ *
+ * It is useful to delay writes of file data and filesystem metadata
+ * for tens of seconds so that quickly created and deleted files need
+ * not waste disk bandwidth being created and removed. To realize this,
+ * we append vnodes to a "workitem" queue. When running with a soft
+ * updates implementation, most pending metadata dependencies should
+ * not wait for more than a few seconds. Thus, filesystems mounted on
+ * block devices are delayed only about half the time that file data
+ * is delayed.
+ * Similarly, directory updates are more critical, so are only delayed
+ * about a third the time that file data is delayed. Thus, there are
+ * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
+ * one each second (driven off the filesystem syncer process). The
+ * syncer_delayno variable indicates the next queue that is to be processed.
+ * Items that need to be processed soon are placed in this queue:
+ *
+ * syncer_workitem_pending[syncer_delayno]
+ *
+ * A delay of fifteen seconds is done by placing the request fifteen
+ * entries later in the queue:
+ *
+ * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
+ *
+ */
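+/*
+ * For example, with the default SYNCER_MAXDELAY of 32 (so syncer_mask is
+ * 31), a 15 second delay requested while syncer_delayno is 20 lands in
+ * slot (20 + 15) & 31 == 3; the index simply wraps around the ring of
+ * queues.
+ */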
+static int syncer_delayno;
+static long syncer_mask;
+LIST_HEAD(synclist, bufobj);
+static struct synclist *syncer_workitem_pending;
+/*
+ * The sync_mtx protects:
+ * bo->bo_synclist
+ * sync_vnode_count
+ * syncer_delayno
+ * syncer_state
+ * syncer_workitem_pending
+ * syncer_worklist_len
+ * rushjob
+ */
+static struct mtx sync_mtx;
+static struct cv sync_wakeup;
+
+#define SYNCER_MAXDELAY 32
+static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
+static int syncdelay = 30; /* max time to delay syncing data */
+static int filedelay = 30; /* time to delay syncing files */
+SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
+ "Time to delay syncing files (in seconds)");
+static int dirdelay = 29; /* time to delay syncing directories */
+SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
+ "Time to delay syncing directories (in seconds)");
+static int metadelay = 28; /* time to delay syncing metadata */
+SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
+ "Time to delay syncing metadata (in seconds)");
+static int rushjob; /* number of slots to run ASAP */
+static int stat_rush_requests; /* number of times I/O speeded up */
+SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
+ "Number of times I/O speeded up (rush requests)");
+
+/*
+ * When shutting down the syncer, run it at four times normal speed.
+ */
+#define SYNCER_SHUTDOWN_SPEEDUP 4
+static int sync_vnode_count;
+static int syncer_worklist_len;
+static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
+ syncer_state;
+
+/*
+ * Number of vnodes we want to exist at any one time. This is mostly used
+ * to size hash tables in vnode-related code. It is normally not used in
+ * getnewvnode(), as wantfreevnodes is normally nonzero.
+ *
+ * XXX desiredvnodes is historical cruft and should not exist.
+ */
+int desiredvnodes;
+SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
+ &desiredvnodes, 0, "Maximum number of vnodes");
+SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
+ &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
+static int vnlru_nowhere;
+SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
+ &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
+
+/*
+ * Macros to control when a vnode is freed and recycled. All require
+ * the vnode interlock.
+ */
+#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
+#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
+#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
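+
+/*
+ * VCANRECYCLE: on the free list and unheld, so it may be reused.
+ * VSHOULDFREE: unheld but not yet marked free, so it should be placed on
+ * the free list. VSHOULDBUSY: held while still marked free, so it must be
+ * taken off the free list.
+ */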
+
+/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
+static int vnsz2log;
+
+/*
+ * Support for the bufobj clean & dirty pctrie.
+ */
+static void *
+buf_trie_alloc(struct pctrie *ptree)
+{
+
+ return uma_zalloc(buf_trie_zone, M_NOWAIT);
+}
+
+static void
+buf_trie_free(struct pctrie *ptree, void *node)
+{
+
+ uma_zfree(buf_trie_zone, node);
+}
+PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
+
+/*
+ * Initialize the vnode management data structures.
+ *
+ * Reevaluate the following cap on the number of vnodes after the physical
+ * memory size exceeds 512GB. In the limit, as the physical memory size
+ * grows, the ratio of physical pages to vnodes approaches sixteen to one.
+ */
+#ifndef MAXVNODES_MAX
+#define MAXVNODES_MAX (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
+#endif
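+
+/*
+ * With 4KB pages the default MAXVNODES_MAX above works out to
+ * 512 * (1073741824 / 4096 / 16) == 8388608 vnodes, i.e. the cap that
+ * corresponds to the sixteen-pages-per-vnode ratio at 512GB of RAM.
+ */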
+static void
+vntblinit(void *dummy __unused)
+{
+ u_int i;
+ int physvnodes, virtvnodes;
+
+ /*
+ * Desiredvnodes is a function of the physical memory size and the
+ * kernel's heap size. Generally speaking, it scales with the
+ * physical memory size. The ratio of desiredvnodes to physical pages
+ * is one to four until desiredvnodes exceeds 98,304. Thereafter, the
+ * marginal ratio of desiredvnodes to physical pages is one to
+ * sixteen. However, desiredvnodes is limited by the kernel's heap
+ * size. The memory required by desiredvnodes vnodes and vm objects
+ * may not exceed one seventh of the kernel's heap size.
+ */
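+ /*
+ * For example (illustrative figures), on a machine with 4GB of RAM
+ * (about 1048576 4KB pages) the physical-memory estimate works out to
+ * maxproc + 1048576 / 16 + 3 * min(393216, 1048576) / 16, roughly
+ * maxproc + 139264 vnodes, before the kernel-heap limit is applied.
+ */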
+ physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4,
+ cnt.v_page_count) / 16;
+ virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
+ sizeof(struct vnode)));
+ desiredvnodes = min(physvnodes, virtvnodes);
+ if (desiredvnodes > MAXVNODES_MAX) {
+ if (bootverbose)
+ printf("Reducing kern.maxvnodes %d -> %d\n",
+ desiredvnodes, MAXVNODES_MAX);
+ desiredvnodes = MAXVNODES_MAX;
+ }
+ wantfreevnodes = desiredvnodes / 4;
+ mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
+ TAILQ_INIT(&vnode_free_list);
+ mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
+ vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
+ NULL, NULL, UMA_ALIGN_PTR, 0);
+ vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+ /*
+ * Preallocate enough nodes to support one node per buf so that
+ * an insert cannot fail. reassignbuf() callers cannot
+ * tolerate insertion failure.
+ */
+ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
+ NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
+ UMA_ZONE_NOFREE | UMA_ZONE_VM);
+ uma_prealloc(buf_trie_zone, nbuf);
+ /*
+ * Initialize the filesystem syncer.
+ */
+ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
+ &syncer_mask);
+ syncer_maxdelay = syncer_mask + 1;
+ mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
+ cv_init(&sync_wakeup, "syncer");
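+ /*
+ * Compute vnsz2log as floor(log2(sizeof(struct vnode))); getnewvnode()
+ * uses it below to derive v_hash from the vnode address.
+ */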
+ for (i = 1; i <= sizeof(struct vnode); i <<= 1)
+ vnsz2log++;
+ vnsz2log--;
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
+
+
+/*
+ * Mark a mount point as busy. Used to synchronize access and to delay
+ * unmounting. Note that mountlist_mtx is not released on failure.
+ *
+ * vfs_busy() is a custom lock; it can block the caller.
+ * vfs_busy() only sleeps if an unmount is active on the mount point.
+ * For a mountpoint mp, the vfs_busy-enforced lock is ordered before the
+ * lock of any vnode belonging to mp.
+ *
+ * Lookup uses vfs_busy() to traverse mount points.
+ * root fs var fs
+ * / vnode lock A / vnode lock (/var) D
+ * /var vnode lock B /log vnode lock(/var/log) E
+ * vfs_busy lock C vfs_busy lock F
+ *
+ * Within each file system, the lock order is C->A->B and F->D->E.
+ *
+ * When traversing across mounts, the system follows that lock order:
+ *
+ * C->A->B
+ * |
+ * +->F->D->E
+ *
+ * The lookup() process for namei("/var") illustrates the process:
+ * VOP_LOOKUP() obtains B while A is held
+ * vfs_busy() obtains a shared lock on F while A and B are held
+ * vput() releases lock on B
+ * vput() releases lock on A
+ * VFS_ROOT() obtains lock on D while shared lock on F is held
+ * vfs_unbusy() releases shared lock on F
+ * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
+ * An attempt to lock A (instead of vp_crossmp) while D is held would
+ * violate the global order, causing deadlocks.
+ *
+ * dounmount() locks B while F is drained.
+ */
+int
+vfs_busy(struct mount *mp, int flags)
+{
+
+ MPASS((flags & ~MBF_MASK) == 0);
+ CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
+
+ MNT_ILOCK(mp);
+ MNT_REF(mp);
+ /*
+ * If the mount point is currently being unmounted, sleep until the
+ * mount point's fate is decided. If the thread doing the unmounting
+ * fails, it will clear the MNTK_UNMOUNT flag before waking us up,
+ * indicating that this mount point has survived the unmount attempt
+ * and vfs_busy should retry. Otherwise the unmounter thread will set
+ * the MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT, indicating that
+ * the mount point is about to be really destroyed. vfs_busy needs to
+ * release its reference on the mount point in this case and return
+ * with ENOENT, telling the caller that the mount point it tried to
+ * busy is no longer valid.
+ */
+ while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
+ if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ CTR1(KTR_VFS, "%s: failed busying before sleeping",
+ __func__);
+ return (ENOENT);
+ }
+ if (flags & MBF_MNTLSTLOCK)
+ mtx_unlock(&mountlist_mtx);
+ mp->mnt_kern_flag |= MNTK_MWAIT;
+ msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
+ if (flags & MBF_MNTLSTLOCK)
+ mtx_lock(&mountlist_mtx);
+ MNT_ILOCK(mp);
+ }
+ if (flags & MBF_MNTLSTLOCK)
+ mtx_unlock(&mountlist_mtx);
+ mp->mnt_lockref++;
+ MNT_IUNLOCK(mp);
+ return (0);
+}
+
+/*
+ * Free a busy filesystem.
+ */
+void
+vfs_unbusy(struct mount *mp)
+{
+
+ CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+ MNT_ILOCK(mp);
+ MNT_REL(mp);
+ KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
+ mp->mnt_lockref--;
+ if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
+ MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
+ CTR1(KTR_VFS, "%s: waking up waiters", __func__);
+ mp->mnt_kern_flag &= ~MNTK_DRAINING;
+ wakeup(&mp->mnt_lockref);
+ }
+ MNT_IUNLOCK(mp);
+}
+
+/*
+ * Lookup a mount point by filesystem identifier.
+ */
+struct mount *
+vfs_getvfs(fsid_t *fsid)
+{
+ struct mount *mp;
+
+ CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
+ mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
+ vfs_ref(mp);
+ mtx_unlock(&mountlist_mtx);
+ return (mp);
+ }
+ }
+ mtx_unlock(&mountlist_mtx);
+ CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
+ return ((struct mount *) 0);
+}
+
+/*
+ * Lookup a mount point by filesystem identifier, busying it before
+ * returning.
+ */
+struct mount *
+vfs_busyfs(fsid_t *fsid)
+{
+ struct mount *mp;
+ int error;
+
+ CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
+ mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
+ error = vfs_busy(mp, MBF_MNTLSTLOCK);
+ if (error) {
+ mtx_unlock(&mountlist_mtx);
+ return (NULL);
+ }
+ return (mp);
+ }
+ }
+ CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
+ mtx_unlock(&mountlist_mtx);
+ return ((struct mount *) 0);
+}
+
+/*
+ * Check if a user can access privileged mount options.
+ */
+int
+vfs_suser(struct mount *mp, struct thread *td)
+{
+ int error;
+
+ /*
+ * If the thread is jailed, but this is not a jail-friendly file
+ * system, deny immediately.
+ */
+ if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
+ return (EPERM);
+
+ /*
+ * If the file system was mounted outside the jail of the calling
+ * thread, deny immediately.
+ */
+ if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
+ return (EPERM);
+
+ /*
+ * If file system supports delegated administration, we don't check
+ * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
+ * by the file system itself.
+ * If this is not the user that did original mount, we check for
+ * the PRIV_VFS_MOUNT_OWNER privilege.
+ */
+ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
+ mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
+ if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
+ return (error);
+ }
+ return (0);
+}
+
+/*
+ * Get a new unique fsid. Try to make its val[0] unique, since this value
+ * will be used to create fake device numbers for stat(). Also try (but
+ * not so hard) to make its val[0] unique mod 2^16, since some emulators only
+ * support 16-bit device numbers. We end up with unique val[0]'s for the
+ * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
+ *
+ * Keep in mind that several mounts may be running in parallel. Starting
+ * the search one past where the previous search terminated is both a
+ * micro-optimization and a defense against returning the same fsid to
+ * different mounts.
+ */
+void
+vfs_getnewfsid(struct mount *mp)
+{
+ static uint16_t mntid_base;
+ struct mount *nmp;
+ fsid_t tfsid;
+ int mtype;
+
+ CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+ mtx_lock(&mntid_mtx);
+ mtype = mp->mnt_vfc->vfc_typenum;
+ tfsid.val[1] = mtype;
+ mtype = (mtype & 0xFF) << 24;
+ for (;;) {
+ tfsid.val[0] = makedev(255,
+ mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
+ mntid_base++;
+ if ((nmp = vfs_getvfs(&tfsid)) == NULL)
+ break;
+ vfs_rel(nmp);
+ }
+ mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
+ mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
+ mtx_unlock(&mntid_mtx);
+}
+
+/*
+ * Knob to control the precision of file timestamps:
+ *
+ * 0 = seconds only; nanoseconds zeroed.
+ * 1 = seconds and nanoseconds, accurate within 1/HZ.
+ * 2 = seconds and nanoseconds, truncated to microseconds.
+ * >=3 = seconds and nanoseconds, maximum precision.
+ */
+enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
+
+static int timestamp_precision = TSP_SEC;
+SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
+ &timestamp_precision, 0, "File timestamp precision (0: seconds, "
+ "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, "
+ "3+: sec + ns (max. precision))");
+
+/*
+ * Get a current timestamp.
+ */
+void
+vfs_timestamp(struct timespec *tsp)
+{
+ struct timeval tv;
+
+ switch (timestamp_precision) {
+ case TSP_SEC:
+ tsp->tv_sec = time_second;
+ tsp->tv_nsec = 0;
+ break;
+ case TSP_HZ:
+ getnanotime(tsp);
+ break;
+ case TSP_USEC:
+ microtime(&tv);
+ TIMEVAL_TO_TIMESPEC(&tv, tsp);
+ break;
+ case TSP_NSEC:
+ default:
+ nanotime(tsp);
+ break;
+ }
+}
+
+/*
+ * Set vnode attributes to VNOVAL
+ */
+void
+vattr_null(struct vattr *vap)
+{
+
+ vap->va_type = VNON;
+ vap->va_size = VNOVAL;
+ vap->va_bytes = VNOVAL;
+ vap->va_mode = VNOVAL;
+ vap->va_nlink = VNOVAL;
+ vap->va_uid = VNOVAL;
+ vap->va_gid = VNOVAL;
+ vap->va_fsid = VNOVAL;
+ vap->va_fileid = VNOVAL;
+ vap->va_blocksize = VNOVAL;
+ vap->va_rdev = VNOVAL;
+ vap->va_atime.tv_sec = VNOVAL;
+ vap->va_atime.tv_nsec = VNOVAL;
+ vap->va_mtime.tv_sec = VNOVAL;
+ vap->va_mtime.tv_nsec = VNOVAL;
+ vap->va_ctime.tv_sec = VNOVAL;
+ vap->va_ctime.tv_nsec = VNOVAL;
+ vap->va_birthtime.tv_sec = VNOVAL;
+ vap->va_birthtime.tv_nsec = VNOVAL;
+ vap->va_flags = VNOVAL;
+ vap->va_gen = VNOVAL;
+ vap->va_vaflags = 0;
+}
+
+/*
+ * This routine is called when we have too many vnodes. It attempts
+ * to free <count> vnodes and will potentially free vnodes that still
+ * have VM backing store (VM backing store is typically the cause
+ * of a vnode blowout so we want to do this). Therefore, this operation
+ * is not considered cheap.
+ *
+ * A number of conditions may prevent a vnode from being reclaimed.
+ * The buffer cache may have references on the vnode, a directory
+ * vnode may still have references due to the namei cache representing
+ * underlying files, or the vnode may be in active use. It is not
+ * desirable to reuse such vnodes. These conditions may cause the
+ * number of vnodes to reach some minimum value regardless of what
+ * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
+ */
+static int
+vlrureclaim(struct mount *mp)
+{
+ struct vnode *vp;
+ int done;
+ int trigger;
+ int usevnodes;
+ int count;
+
+ /*
+ * Calculate the trigger point, don't allow user
+ * screwups to blow us up. This prevents us from
+ * recycling vnodes with lots of resident pages. We
+ * aren't trying to free memory, we are trying to
+ * free vnodes.
+ */
+ usevnodes = desiredvnodes;
+ if (usevnodes <= 0)
+ usevnodes = 1;
+ trigger = cnt.v_page_count * 2 / usevnodes;
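+ /*
+ * For example (illustrative figures), with desiredvnodes at 100000 and
+ * 1GB of RAM (262144 4KB pages) the trigger is 5, so vnodes holding
+ * more than 5 resident pages are skipped by this pass.
+ */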
+ done = 0;
+ vn_start_write(NULL, &mp, V_WAIT);
+ MNT_ILOCK(mp);
+ count = mp->mnt_nvnodelistsize / 10 + 1;
+ while (count != 0) {
+ vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
+ while (vp != NULL && vp->v_type == VMARKER)
+ vp = TAILQ_NEXT(vp, v_nmntvnodes);
+ if (vp == NULL)
+ break;
+ TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+ TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+ --count;
+ if (!VI_TRYLOCK(vp))
+ goto next_iter;
+ /*
+ * If it's been deconstructed already, it's still
+ * referenced, or it exceeds the trigger, skip it.
+ */
+ if (vp->v_usecount ||
+ (!vlru_allow_cache_src &&
+ !LIST_EMPTY(&(vp)->v_cache_src)) ||
+ (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
+ vp->v_object->resident_page_count > trigger)) {
+ VI_UNLOCK(vp);
+ goto next_iter;
+ }
+ MNT_IUNLOCK(mp);
+ vholdl(vp);
+ if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
+ vdrop(vp);
+ goto next_iter_mntunlocked;
+ }
+ VI_LOCK(vp);
+ /*
+ * v_usecount may have been bumped after VOP_LOCK() dropped
+ * the vnode interlock and before it was locked again.
+ *
+ * It is not necessary to recheck VI_DOOMED because it can
+ * only be set by another thread that holds both the vnode
+ * lock and vnode interlock. If another thread has the
+ * vnode lock before we get to VOP_LOCK() and obtains the
+ * vnode interlock after VOP_LOCK() drops the vnode
+ * interlock, the other thread will be unable to drop the
+ * vnode lock before our VOP_LOCK() call fails.
+ */
+ if (vp->v_usecount ||
+ (!vlru_allow_cache_src &&
+ !LIST_EMPTY(&(vp)->v_cache_src)) ||
+ (vp->v_object != NULL &&
+ vp->v_object->resident_page_count > trigger)) {
+ VOP_UNLOCK(vp, LK_INTERLOCK);
+ vdrop(vp);
+ goto next_iter_mntunlocked;
+ }
+ KASSERT((vp->v_iflag & VI_DOOMED) == 0,
+ ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
+ vgonel(vp);
+ VOP_UNLOCK(vp, 0);
+ vdropl(vp);
+ done++;
+next_iter_mntunlocked:
+ if (!should_yield())
+ goto relock_mnt;
+ goto yield;
+next_iter:
+ if (!should_yield())
+ continue;
+ MNT_IUNLOCK(mp);
+yield:
+ kern_yield(PRI_USER);
+relock_mnt:
+ MNT_ILOCK(mp);
+ }
+ MNT_IUNLOCK(mp);
+ vn_finished_write(mp);
+ return done;
+}
+
+/*
+ * Attempt to keep the free list at wantfreevnodes length.
+ */
+static void
+vnlru_free(int count)
+{
+ struct vnode *vp;
+
+ mtx_assert(&vnode_free_list_mtx, MA_OWNED);
+ for (; count > 0; count--) {
+ vp = TAILQ_FIRST(&vnode_free_list);
+ /*
+ * The list can be modified while the free_list_mtx
+ * has been dropped and vp could be NULL here.
+ */
+ if (!vp)
+ break;
+ VNASSERT(vp->v_op != NULL, vp,
+ ("vnlru_free: vnode already reclaimed."));
+ KASSERT((vp->v_iflag & VI_FREE) != 0,
+ ("Removing vnode not on freelist"));
+ KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
+ ("Mangling active vnode"));
+ TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
+ /*
+ * Don't recycle if we can't get the interlock.
+ */
+ if (!VI_TRYLOCK(vp)) {
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
+ continue;
+ }
+ VNASSERT(VCANRECYCLE(vp), vp,
+ ("vp inconsistent on freelist"));
+ freevnodes--;
+ vp->v_iflag &= ~VI_FREE;
+ vholdl(vp);
+ mtx_unlock(&vnode_free_list_mtx);
+ VI_UNLOCK(vp);
+ vtryrecycle(vp);
+ /*
+ * If the recycle succeeded, this vdrop will actually free
+ * the vnode. If not, it will simply place it back on
+ * the free list.
+ */
+ vdrop(vp);
+ mtx_lock(&vnode_free_list_mtx);
+ }
+}
+
+/*
+ * Attempt to recycle vnodes in a context that is always safe to block.
+ * Calling vlrureclaim() from the bowels of filesystem code has some
+ * interesting deadlock problems.
+ */
+static struct proc *vnlruproc;
+static int vnlruproc_sig;
+
+static void
+vnlru_proc(void)
+{
+ struct mount *mp, *nmp;
+ int done;
+ struct proc *p = vnlruproc;
+
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
+ SHUTDOWN_PRI_FIRST);
+
+ for (;;) {
+ kproc_suspend_check(p);
+ mtx_lock(&vnode_free_list_mtx);
+ if (freevnodes > wantfreevnodes)
+ vnlru_free(freevnodes - wantfreevnodes);
+ if (numvnodes <= desiredvnodes * 9 / 10) {
+ vnlruproc_sig = 0;
+ wakeup(&vnlruproc_sig);
+ msleep(vnlruproc, &vnode_free_list_mtx,
+ PVFS|PDROP, "vlruwt", hz);
+ continue;
+ }
+ mtx_unlock(&vnode_free_list_mtx);
+ done = 0;
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+ done += vlrureclaim(mp);
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp);
+ }
+ mtx_unlock(&mountlist_mtx);
+ if (done == 0) {
+#if 0
+ /* These messages are temporary debugging aids */
+ if (vnlru_nowhere < 5)
+ printf("vnlru process getting nowhere..\n");
+ else if (vnlru_nowhere == 5)
+ printf("vnlru process messages stopped.\n");
+#endif
+ vnlru_nowhere++;
+ tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
+ } else
+ kern_yield(PRI_USER);
+ }
+}
+
+static struct kproc_desc vnlru_kp = {
+ "vnlru",
+ vnlru_proc,
+ &vnlruproc
+};
+SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
+ &vnlru_kp);
+
+/*
+ * Routines having to do with the management of the vnode table.
+ */
+
+/*
+ * Try to recycle a freed vnode. We abort if anyone picks up a reference
+ * before we actually vgone(). This function must be called with the vnode
+ * held to prevent the vnode from being returned to the free list midway
+ * through vgone().
+ */
+static int
+vtryrecycle(struct vnode *vp)
+{
+ struct mount *vnmp;
+
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ VNASSERT(vp->v_holdcnt, vp,
+ ("vtryrecycle: Recycling vp %p without a reference.", vp));
+ /*
+ * This vnode may be found and locked via some other list; if so we
+ * can't recycle it yet.
+ */
+ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
+ CTR2(KTR_VFS,
+ "%s: impossible to recycle, vp %p lock is already held",
+ __func__, vp);
+ return (EWOULDBLOCK);
+ }
+ /*
+ * Don't recycle if its filesystem is being suspended.
+ */
+ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
+ VOP_UNLOCK(vp, 0);
+ CTR2(KTR_VFS,
+ "%s: impossible to recycle, cannot start the write for %p",
+ __func__, vp);
+ return (EBUSY);
+ }
+ /*
+ * If we got this far, we need to acquire the interlock and see if
+ * anyone picked up this vnode from another list. If not, we will
+ * mark it with DOOMED via vgonel() so that anyone who does find it
+ * will skip over it.
+ */
+ VI_LOCK(vp);
+ if (vp->v_usecount) {
+ VOP_UNLOCK(vp, LK_INTERLOCK);
+ vn_finished_write(vnmp);
+ CTR2(KTR_VFS,
+ "%s: impossible to recycle, %p is already referenced",
+ __func__, vp);
+ return (EBUSY);
+ }
+ if ((vp->v_iflag & VI_DOOMED) == 0)
+ vgonel(vp);
+ VOP_UNLOCK(vp, LK_INTERLOCK);
+ vn_finished_write(vnmp);
+ return (0);
+}
+
+/*
+ * Wait for available vnodes.
+ */
+static int
+getnewvnode_wait(int suspended)
+{
+
+ mtx_assert(&vnode_free_list_mtx, MA_OWNED);
+ if (numvnodes > desiredvnodes) {
+ if (suspended) {
+ /*
+ * The file system is being suspended; we cannot risk a
+ * deadlock here, so allocate the new vnode anyway.
+ */
+ if (freevnodes > wantfreevnodes)
+ vnlru_free(freevnodes - wantfreevnodes);
+ return (0);
+ }
+ if (vnlruproc_sig == 0) {
+ vnlruproc_sig = 1; /* avoid unnecessary wakeups */
+ wakeup(vnlruproc);
+ }
+ msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
+ "vlruwk", hz);
+ }
+ return (numvnodes > desiredvnodes ? ENFILE : 0);
+}
+
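+/*
+ * Pre-reserve vnode allocations for the calling thread. Each successful
+ * reservation bumps both the global numvnodes count and the thread's
+ * td_vp_reserv, which getnewvnode() later consumes without taking the
+ * free-list mutex.
+ */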
+void
+getnewvnode_reserve(u_int count)
+{
+ struct thread *td;
+
+ td = curthread;
+ mtx_lock(&vnode_free_list_mtx);
+ while (count > 0) {
+ if (getnewvnode_wait(0) == 0) {
+ count--;
+ td->td_vp_reserv++;
+ numvnodes++;
+ }
+ }
+ mtx_unlock(&vnode_free_list_mtx);
+}
+
+void
+getnewvnode_drop_reserve(void)
+{
+ struct thread *td;
+
+ td = curthread;
+ mtx_lock(&vnode_free_list_mtx);
+ KASSERT(numvnodes >= td->td_vp_reserv, ("reserve too large"));
+ numvnodes -= td->td_vp_reserv;
+ mtx_unlock(&vnode_free_list_mtx);
+ td->td_vp_reserv = 0;
+}
+
+/*
+ * Return the next vnode from the free list.
+ */
+int
+getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
+ struct vnode **vpp)
+{
+ struct vnode *vp;
+ struct bufobj *bo;
+ struct thread *td;
+ int error;
+
+ CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
+ vp = NULL;
+ td = curthread;
+ if (td->td_vp_reserv > 0) {
+ td->td_vp_reserv -= 1;
+ goto alloc;
+ }
+ mtx_lock(&vnode_free_list_mtx);
+ /*
+ * Lend our context to reclaim vnodes if they've exceeded the max.
+ */
+ if (freevnodes > wantfreevnodes)
+ vnlru_free(1);
+ error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
+ MNTK_SUSPEND));
+#if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */
+ if (error != 0) {
+ mtx_unlock(&vnode_free_list_mtx);
+ return (error);
+ }
+#endif
+ numvnodes++;
+ mtx_unlock(&vnode_free_list_mtx);
+alloc:
+ vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
+ /*
+ * Setup locks.
+ */
+ vp->v_vnlock = &vp->v_lock;
+ mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
+ /*
+ * By default, don't allow shared locks unless filesystems
+ * opt-in.
+ */
+ lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE);
+ /*
+ * Initialize bufobj.
+ */
+ bo = &vp->v_bufobj;
+ bo->__bo_vnode = vp;
+ rw_init(BO_LOCKPTR(bo), "bufobj interlock");
+ bo->bo_ops = &buf_ops_bio;
+ bo->bo_private = vp;
+ TAILQ_INIT(&bo->bo_clean.bv_hd);
+ TAILQ_INIT(&bo->bo_dirty.bv_hd);
+ /*
+ * Initialize namecache.
+ */
+ LIST_INIT(&vp->v_cache_src);
+ TAILQ_INIT(&vp->v_cache_dst);
+ /*
+ * Finalize various vnode identity bits.
+ */
+ vp->v_type = VNON;
+ vp->v_tag = tag;
+ vp->v_op = vops;
+ v_incr_usecount(vp);
+ vp->v_data = NULL;
+#ifdef MAC
+ mac_vnode_init(vp);
+ if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
+ mac_vnode_associate_singlelabel(mp, vp);
+ else if (mp == NULL && vops != &dead_vnodeops)
+ printf("NULL mp in getnewvnode()\n");
+#endif
+ if (mp != NULL) {
+ bo->bo_bsize = mp->mnt_stat.f_iosize;
+ if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
+ vp->v_vflag |= VV_NOKNOTE;
+ }
+ rangelock_init(&vp->v_rl);
+
+ /*
+ * For the filesystems which do not use vfs_hash_insert(),
+ * still initialize v_hash to have vfs_hash_index() useful.
+ * E.g., nullfs uses vfs_hash_index() on the lower vnode for
+ * its own hashing.
+ */
+ vp->v_hash = (uintptr_t)vp >> vnsz2log;
+
+ *vpp = vp;
+ return (0);
+}
+
+/*
+ * Delete from old mount point vnode list, if on one.
+ */
+static void
+delmntque(struct vnode *vp)
+{
+ struct mount *mp;
+ int active;
+
+ mp = vp->v_mount;
+ if (mp == NULL)
+ return;
+ MNT_ILOCK(mp);
+ VI_LOCK(vp);
+ KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
+ ("Active vnode list size %d > Vnode list size %d",
+ mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
+ active = vp->v_iflag & VI_ACTIVE;
+ vp->v_iflag &= ~VI_ACTIVE;
+ if (active) {
+ mtx_lock(&vnode_free_list_mtx);
+ TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
+ mp->mnt_activevnodelistsize--;
+ mtx_unlock(&vnode_free_list_mtx);
+ }
+ vp->v_mount = NULL;
+ VI_UNLOCK(vp);
+ VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
+ ("bad mount point vnode list size"));
+ TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+ mp->mnt_nvnodelistsize--;
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+}
+
+static void
+insmntque_stddtr(struct vnode *vp, void *dtr_arg)
+{
+
+ vp->v_data = NULL;
+ vp->v_op = &dead_vnodeops;
+ vgone(vp);
+ vput(vp);
+}
+
+/*
+ * Insert into list of vnodes for the new mount point, if available.
+ */
+int
+insmntque1(struct vnode *vp, struct mount *mp,
+ void (*dtr)(struct vnode *, void *), void *dtr_arg)
+{
+
+ KASSERT(vp->v_mount == NULL,
+ ("insmntque: vnode already on per mount vnode list"));
+ VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
+ ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
+
+ /*
+ * We acquire the vnode interlock early to ensure that the
+ * vnode cannot be recycled by another process releasing a
+ * holdcnt on it before we get it on both the vnode list
+ * and the active vnode list. The mount mutex protects only
+ * manipulation of the vnode list and the vnode freelist
+ * mutex protects only manipulation of the active vnode list.
+ * Hence the need to hold the vnode interlock throughout.
+ */
+ MNT_ILOCK(mp);
+ VI_LOCK(vp);
+ if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
+ ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
+ mp->mnt_nvnodelistsize == 0)) &&
+ (vp->v_vflag & VV_FORCEINSMQ) == 0) {
+ VI_UNLOCK(vp);
+ MNT_IUNLOCK(mp);
+ if (dtr != NULL)
+ dtr(vp, dtr_arg);
+ return (EBUSY);
+ }
+ vp->v_mount = mp;
+ MNT_REF(mp);
+ TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
+ VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
+ ("neg mount point vnode list size"));
+ mp->mnt_nvnodelistsize++;
+ KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
+ ("Activating already active vnode"));
+ vp->v_iflag |= VI_ACTIVE;
+ mtx_lock(&vnode_free_list_mtx);
+ TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
+ mp->mnt_activevnodelistsize++;
+ mtx_unlock(&vnode_free_list_mtx);
+ VI_UNLOCK(vp);
+ MNT_IUNLOCK(mp);
+ return (0);
+}
+
+int
+insmntque(struct vnode *vp, struct mount *mp)
+{
+
+ return (insmntque1(vp, mp, insmntque_stddtr, NULL));
+}
+
+/*
+ * Flush out and invalidate all buffers associated with a bufobj
+ * Called with the underlying object locked.
+ */
+int
+bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
+{
+ int error;
+
+ BO_LOCK(bo);
+ if (flags & V_SAVE) {
+ error = bufobj_wwait(bo, slpflag, slptimeo);
+ if (error) {
+ BO_UNLOCK(bo);
+ return (error);
+ }
+ if (bo->bo_dirty.bv_cnt > 0) {
+ BO_UNLOCK(bo);
+ if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
+ return (error);
+ /*
+ * XXX We could save a lock/unlock if this was only
+ * enabled under INVARIANTS
+ */
+ BO_LOCK(bo);
+ if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
+ panic("vinvalbuf: dirty bufs");
+ }
+ }
+ /*
+ * If you alter this loop please notice that interlock is dropped and
+ * reacquired in flushbuflist. Special care is needed to ensure that
+ * no race conditions occur from this.
+ */
+ do {
+ error = flushbuflist(&bo->bo_clean,
+ flags, bo, slpflag, slptimeo);
+ if (error == 0 && !(flags & V_CLEANONLY))
+ error = flushbuflist(&bo->bo_dirty,
+ flags, bo, slpflag, slptimeo);
+ if (error != 0 && error != EAGAIN) {
+ BO_UNLOCK(bo);
+ return (error);
+ }
+ } while (error != 0);
+
+ /*
+ * Wait for I/O to complete. XXX needs cleaning up. The vnode can
+ * have write I/O in-progress but if there is a VM object then the
+ * VM object can also have read-I/O in-progress.
+ */
+ do {
+ bufobj_wwait(bo, 0, 0);
+ BO_UNLOCK(bo);
+ if (bo->bo_object != NULL) {
+ VM_OBJECT_WLOCK(bo->bo_object);
+ vm_object_pip_wait(bo->bo_object, "bovlbx");
+ VM_OBJECT_WUNLOCK(bo->bo_object);
+ }
+ BO_LOCK(bo);
+ } while (bo->bo_numoutput > 0);
+ BO_UNLOCK(bo);
+
+ /*
+ * Destroy the copy in the VM cache, too.
+ */
+ if (bo->bo_object != NULL &&
+ (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
+ VM_OBJECT_WLOCK(bo->bo_object);
+ vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
+ OBJPR_CLEANONLY : 0);
+ VM_OBJECT_WUNLOCK(bo->bo_object);
+ }
+
+#ifdef INVARIANTS
+ BO_LOCK(bo);
+ if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
+ (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
+ panic("vinvalbuf: flush failed");
+ BO_UNLOCK(bo);
+#endif
+ return (0);
+}
+
+/*
+ * Flush out and invalidate all buffers associated with a vnode.
+ * Called with the underlying object locked.
+ */
+int
+vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
+{
+
+ CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
+ ASSERT_VOP_LOCKED(vp, "vinvalbuf");
+ return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
+}
+
+/*
+ * Flush out buffers on the specified list.
+ */
+static int
+flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
+ int slptimeo)
+{
+ struct buf *bp, *nbp;
+ int retval, error;
+ daddr_t lblkno;
+ b_xflags_t xflags;
+
+ ASSERT_BO_WLOCKED(bo);
+
+ retval = 0;
+ TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
+ if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
+ ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
+ continue;
+ }
+ lblkno = 0;
+ xflags = 0;
+ if (nbp != NULL) {
+ lblkno = nbp->b_lblkno;
+ xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
+ }
+ retval = EAGAIN;
+ error = BUF_TIMELOCK(bp,
+ LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
+ "flushbuf", slpflag, slptimeo);
+ if (error) {
+ BO_LOCK(bo);
+ return (error != ENOLCK ? error : EAGAIN);
+ }
+ KASSERT(bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p",
+ bp, bp->b_bufobj, bo));
+ if (bp->b_bufobj != bo) { /* XXX: necessary ? */
+ BUF_UNLOCK(bp);
+ BO_LOCK(bo);
+ return (EAGAIN);
+ }
+ /*
+ * XXX Since there are no node locks for NFS, I
+ * believe there is a slight chance that a delayed
+ * write will occur while sleeping just above, so
+ * check for it.
+ */
+ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
+ (flags & V_SAVE)) {
+ bremfree(bp);
+ bp->b_flags |= B_ASYNC;
+ bwrite(bp);
+ BO_LOCK(bo);
+ return (EAGAIN); /* XXX: why not loop ? */
+ }
+ bremfree(bp);
+ bp->b_flags |= (B_INVAL | B_RELBUF);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ BO_LOCK(bo);
+ if (nbp != NULL &&
+ (nbp->b_bufobj != bo ||
+ nbp->b_lblkno != lblkno ||
+ (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags))
+ break; /* nbp invalid */
+ }
+ return (retval);
+}
+
+/*
+ * Truncate a file's buffers and pages to a specified length. This
+ * is in lieu of the old vinvalbuf mechanism, which performed unneeded
+ * sync activity.
+ */
+int
+vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
+{
+ struct buf *bp, *nbp;
+ int anyfreed;
+ int trunclbn;
+ struct bufobj *bo;
+
+ CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
+ vp, cred, blksize, (uintmax_t)length);
+
+ /*
+ * Round up to the *next* lbn.
+ */
+ trunclbn = (length + blksize - 1) / blksize;
+
+ ASSERT_VOP_LOCKED(vp, "vtruncbuf");
+restart:
+ bo = &vp->v_bufobj;
+ BO_LOCK(bo);
+ anyfreed = 1;
+ for (;anyfreed;) {
+ anyfreed = 0;
+ TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
+ if (bp->b_lblkno < trunclbn)
+ continue;
+ if (BUF_LOCK(bp,
+ LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
+ BO_LOCKPTR(bo)) == ENOLCK)
+ goto restart;
+
+ bremfree(bp);
+ bp->b_flags |= (B_INVAL | B_RELBUF);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ anyfreed = 1;
+
+ BO_LOCK(bo);
+ if (nbp != NULL &&
+ (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
+ (nbp->b_vp != vp) ||
+ (nbp->b_flags & B_DELWRI))) {
+ BO_UNLOCK(bo);
+ goto restart;
+ }
+ }
+
+ TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
+ if (bp->b_lblkno < trunclbn)
+ continue;
+ if (BUF_LOCK(bp,
+ LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
+ BO_LOCKPTR(bo)) == ENOLCK)
+ goto restart;
+ bremfree(bp);
+ bp->b_flags |= (B_INVAL | B_RELBUF);
+ bp->b_flags &= ~B_ASYNC;
+ brelse(bp);
+ anyfreed = 1;
+
+ BO_LOCK(bo);
+ if (nbp != NULL &&
+ (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
+ (nbp->b_vp != vp) ||
+ (nbp->b_flags & B_DELWRI) == 0)) {
+ BO_UNLOCK(bo);
+ goto restart;
+ }
+ }
+ }
+
+ if (length > 0) {
+restartsync:
+ TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
+ if (bp->b_lblkno > 0)
+ continue;
+ /*
+ * Since we hold the vnode lock this should only
+ * fail if we're racing with the buf daemon.
+ */
+ if (BUF_LOCK(bp,
+ LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
+ BO_LOCKPTR(bo)) == ENOLCK) {
+ goto restart;
+ }
+ VNASSERT((bp->b_flags & B_DELWRI), vp,
+ ("buf(%p) on dirty queue without DELWRI", bp));
+
+ bremfree(bp);
+ bawrite(bp);
+ BO_LOCK(bo);
+ goto restartsync;
+ }
+ }
+
+ bufobj_wwait(bo, 0, 0);
+ BO_UNLOCK(bo);
+ vnode_pager_setsize(vp, length);
+
+ return (0);
+}
+
+static void
+buf_vlist_remove(struct buf *bp)
+{
+ struct bufv *bv;
+
+ KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
+ ASSERT_BO_WLOCKED(bp->b_bufobj);
+ KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
+ (BX_VNDIRTY|BX_VNCLEAN),
+ ("buf_vlist_remove: Buf %p is on two lists", bp));
+ if (bp->b_xflags & BX_VNDIRTY)
+ bv = &bp->b_bufobj->bo_dirty;
+ else
+ bv = &bp->b_bufobj->bo_clean;
+ BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
+ TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
+ bv->bv_cnt--;
+ bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
+}
+
+/*
+ * Add the buffer to the sorted clean or dirty block list.
+ *
+ * NOTE: xflags is passed as a constant, optimizing this inline function!
+ */
+static void
+buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
+{
+ struct bufv *bv;
+ struct buf *n;
+ int error;
+
+ ASSERT_BO_WLOCKED(bo);
+ KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
+ ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
+ bp->b_xflags |= xflags;
+ if (xflags & BX_VNDIRTY)
+ bv = &bo->bo_dirty;
+ else
+ bv = &bo->bo_clean;
+
+ /*
+ * Keep the list ordered. Optimize empty list insertion. Assume
+ * we tend to grow at the tail so lookup_le should usually be cheaper
+ * than _ge.
+ */
+ if (bv->bv_cnt == 0 ||
+ bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
+ TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
+ else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
+ TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
+ else
+ TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
+ error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
+ if (error)
+ panic("buf_vlist_add: Preallocated nodes insufficient.");
+ bv->bv_cnt++;
+}
+
+/*
+ * Look up a buffer in a bufobj's buffer tries. Note that we specifically
+ * avoid shadow buffers used in background bitmap writes.
+ *
+ * This code is not quite as efficient as it could be because we maintain
+ * two sorted lists (clean and dirty) and do not know which list the
+ * block resides in, so both tries may have to be searched.
+ */
+struct buf *
+gbincore(struct bufobj *bo, daddr_t lblkno)
+{
+ struct buf *bp;
+
+ ASSERT_BO_LOCKED(bo);
+ bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
+ if (bp != NULL)
+ return (bp);
+ return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
+}
+
+/*
+ * Associate a buffer with a vnode.
+ */
+void
+bgetvp(struct vnode *vp, struct buf *bp)
+{
+ struct bufobj *bo;
+
+ bo = &vp->v_bufobj;
+ ASSERT_BO_WLOCKED(bo);
+ VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
+
+ CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
+ VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
+ ("bgetvp: bp already attached! %p", bp));
+
+ vhold(vp);
+ bp->b_vp = vp;
+ bp->b_bufobj = bo;
+ /*
+ * Insert onto list for new vnode.
+ */
+ buf_vlist_add(bp, bo, BX_VNCLEAN);
+}
+
+/*
+ * Disassociate a buffer from a vnode.
+ */
+void
+brelvp(struct buf *bp)
+{
+ struct bufobj *bo;
+ struct vnode *vp;
+
+ CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
+
+ /*
+ * Delete from old vnode list, if on one.
+ */
+ vp = bp->b_vp; /* XXX */
+ bo = bp->b_bufobj;
+ BO_LOCK(bo);
+ if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
+ buf_vlist_remove(bp);
+ else
+ panic("brelvp: Buffer %p not on queue.", bp);
+ if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
+ bo->bo_flag &= ~BO_ONWORKLST;
+ mtx_lock(&sync_mtx);
+ LIST_REMOVE(bo, bo_synclist);
+ syncer_worklist_len--;
+ mtx_unlock(&sync_mtx);
+ }
+ bp->b_vp = NULL;
+ bp->b_bufobj = NULL;
+ BO_UNLOCK(bo);
+ vdrop(vp);
+}
+
+/*
+ * Add an item to the syncer work queue.
+ */
+static void
+vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
+{
+ int slot;
+
+ ASSERT_BO_WLOCKED(bo);
+
+ mtx_lock(&sync_mtx);
+ if (bo->bo_flag & BO_ONWORKLST)
+ LIST_REMOVE(bo, bo_synclist);
+ else {
+ bo->bo_flag |= BO_ONWORKLST;
+ syncer_worklist_len++;
+ }
+
+ if (delay > syncer_maxdelay - 2)
+ delay = syncer_maxdelay - 2;
+ slot = (syncer_delayno + delay) & syncer_mask;
+
+ LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
+ mtx_unlock(&sync_mtx);
+}
+
+static int
+sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
+{
+ int error, len;
+
+ mtx_lock(&sync_mtx);
+ len = syncer_worklist_len - sync_vnode_count;
+ mtx_unlock(&sync_mtx);
+ error = SYSCTL_OUT(req, &len, sizeof(len));
+ return (error);
+}
+
+SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
+ sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
+
+static struct proc *updateproc;
+static void sched_sync(void);
+static struct kproc_desc up_kp = {
+ "syncer",
+ sched_sync,
+ &updateproc
+};
+SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
+
+static int
+sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
+{
+ struct vnode *vp;
+ struct mount *mp;
+
+ *bo = LIST_FIRST(slp);
+ if (*bo == NULL)
+ return (0);
+ vp = (*bo)->__bo_vnode; /* XXX */
+ if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
+ return (1);
+ /*
+ * We use vhold in case the vnode does not
+ * successfully sync. vhold prevents the vnode from
+ * going away when we unlock the sync_mtx so that
+ * we can acquire the vnode interlock.
+ */
+ vholdl(vp);
+ mtx_unlock(&sync_mtx);
+ VI_UNLOCK(vp);
+ if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
+ vdrop(vp);
+ mtx_lock(&sync_mtx);
+ return (*bo == LIST_FIRST(slp));
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ (void) VOP_FSYNC(vp, MNT_LAZY, td);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ BO_LOCK(*bo);
+ if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
+ /*
+ * Put us back on the worklist. The worklist
+ * routine will remove us from our current
+ * position and then add us back in at a later
+ * position.
+ */
+ vn_syncer_add_to_worklist(*bo, syncdelay);
+ }
+ BO_UNLOCK(*bo);
+ vdrop(vp);
+ mtx_lock(&sync_mtx);
+ return (0);
+}
+
+/*
+ * System filesystem synchronizer daemon.
+ */
+static void
+sched_sync(void)
+{
+ struct synclist *next, *slp;
+ struct bufobj *bo;
+ long starttime;
+ struct thread *td = curthread;
+ int last_work_seen;
+ int net_worklist_len;
+ int syncer_final_iter;
+ int first_printf;
+ int error;
+
+ last_work_seen = 0;
+ syncer_final_iter = 0;
+ first_printf = 1;
+ syncer_state = SYNCER_RUNNING;
+ starttime = time_uptime;
+ td->td_pflags |= TDP_NORUNNINGBUF;
+
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
+ SHUTDOWN_PRI_LAST);
+
+ mtx_lock(&sync_mtx);
+ for (;;) {
+ if (syncer_state == SYNCER_FINAL_DELAY &&
+ syncer_final_iter == 0) {
+ mtx_unlock(&sync_mtx);
+ kproc_suspend_check(td->td_proc);
+ mtx_lock(&sync_mtx);
+ }
+ net_worklist_len = syncer_worklist_len - sync_vnode_count;
+ if (syncer_state != SYNCER_RUNNING &&
+ starttime != time_uptime) {
+ if (first_printf) {
+ printf("\nSyncing disks, vnodes remaining...");
+ first_printf = 0;
+ }
+ printf("%d ", net_worklist_len);
+ }
+ starttime = time_uptime;
+
+ /*
+ * Push files whose dirty time has expired. Be careful
+ * of interrupt race on slp queue.
+ *
+ * Skip over empty worklist slots when shutting down.
+ */
+ do {
+ slp = &syncer_workitem_pending[syncer_delayno];
+ syncer_delayno += 1;
+ if (syncer_delayno == syncer_maxdelay)
+ syncer_delayno = 0;
+ next = &syncer_workitem_pending[syncer_delayno];
+ /*
+ * If the worklist has wrapped since it
+ * was emptied of all but syncer vnodes,
+ * switch to the FINAL_DELAY state and run
+ * for one more second.
+ */
+ if (syncer_state == SYNCER_SHUTTING_DOWN &&
+ net_worklist_len == 0 &&
+ last_work_seen == syncer_delayno) {
+ syncer_state = SYNCER_FINAL_DELAY;
+ syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
+ }
+ } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
+ syncer_worklist_len > 0);
+
+ /*
+ * Keep track of the last time there was anything
+ * on the worklist other than syncer vnodes.
+ * Return to the SHUTTING_DOWN state if any
+ * new work appears.
+ */
+ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
+ last_work_seen = syncer_delayno;
+ if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
+ syncer_state = SYNCER_SHUTTING_DOWN;
+ while (!LIST_EMPTY(slp)) {
+ error = sync_vnode(slp, &bo, td);
+ if (error == 1) {
+ LIST_REMOVE(bo, bo_synclist);
+ LIST_INSERT_HEAD(next, bo, bo_synclist);
+ continue;
+ }
+
+ if (first_printf == 0)
+ wdog_kern_pat(WD_LASTVAL);
+
+ }
+ if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
+ syncer_final_iter--;
+ /*
+ * The variable rushjob allows the kernel to speed up the
+ * processing of the filesystem syncer process. A rushjob
+ * value of N tells the filesystem syncer to process the next
+ * N seconds worth of work on its queue ASAP. Currently rushjob
+ * is used by the soft update code to speed up the filesystem
+ * syncer process when the incore state is getting so far
+ * ahead of the disk that the kernel memory pool is being
+ * threatened with exhaustion.
+ */
+ if (rushjob > 0) {
+ rushjob -= 1;
+ continue;
+ }
+ /*
+ * Just sleep for a short period of time between
+ * iterations when shutting down to allow some I/O
+ * to happen.
+ *
+ * If it has taken us less than a second to process the
+ * current work, then wait. Otherwise start right over
+ * again. We can still lose time if any single round
+ * takes more than two seconds, but it does not really
+ * matter as we are just trying to generally pace the
+ * filesystem activity.
+ */
+ if (syncer_state != SYNCER_RUNNING ||
+ time_uptime == starttime) {
+ thread_lock(td);
+ sched_prio(td, PPAUSE);
+ thread_unlock(td);
+ }
+ if (syncer_state != SYNCER_RUNNING)
+ cv_timedwait(&sync_wakeup, &sync_mtx,
+ hz / SYNCER_SHUTDOWN_SPEEDUP);
+ else if (time_uptime == starttime)
+ cv_timedwait(&sync_wakeup, &sync_mtx, hz);
+ }
+}
+
+/*
+ * Request the syncer daemon to speed up its work.
+ * We never push it to speed up more than half of its
+ * normal turn time, otherwise it could take over the cpu.
+ */
+int
+speedup_syncer(void)
+{
+ int ret = 0;
+
+ mtx_lock(&sync_mtx);
+ if (rushjob < syncdelay / 2) {
+ rushjob += 1;
+ stat_rush_requests += 1;
+ ret = 1;
+ }
+ mtx_unlock(&sync_mtx);
+ cv_broadcast(&sync_wakeup);
+ return (ret);
+}
+
+/*
+ * Tell the syncer to speed up its work and run through its work
+ * list several times, then tell it to shut down.
+ */
+static void
+syncer_shutdown(void *arg, int howto)
+{
+
+ if (howto & RB_NOSYNC)
+ return;
+ mtx_lock(&sync_mtx);
+ syncer_state = SYNCER_SHUTTING_DOWN;
+ rushjob = 0;
+ mtx_unlock(&sync_mtx);
+ cv_broadcast(&sync_wakeup);
+ kproc_shutdown(arg, howto);
+}
+
+/*
+ * Reassign a buffer from one vnode to another.
+ * Used to assign file specific control information
+ * (indirect blocks) to the vnode to which they belong.
+ */
+void
+reassignbuf(struct buf *bp)
+{
+ struct vnode *vp;
+ struct bufobj *bo;
+ int delay;
+#ifdef INVARIANTS
+ struct bufv *bv;
+#endif
+
+ vp = bp->b_vp;
+ bo = bp->b_bufobj;
+ ++reassignbufcalls;
+
+ CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
+ bp, bp->b_vp, bp->b_flags);
+ /*
+ * B_PAGING flagged buffers cannot be reassigned because their vp
+ * is not fully linked in.
+ */
+ if (bp->b_flags & B_PAGING)
+ panic("cannot reassign paging buffer");
+
+ /*
+ * Delete from old vnode list, if on one.
+ */
+ BO_LOCK(bo);
+ if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
+ buf_vlist_remove(bp);
+ else
+ panic("reassignbuf: Buffer %p not on queue.", bp);
+ /*
+ * If dirty, put on list of dirty buffers; otherwise insert onto list
+ * of clean buffers.
+ */
+ if (bp->b_flags & B_DELWRI) {
+ if ((bo->bo_flag & BO_ONWORKLST) == 0) {
+ switch (vp->v_type) {
+ case VDIR:
+ delay = dirdelay;
+ break;
+ case VCHR:
+ delay = metadelay;
+ break;
+ default:
+ delay = filedelay;
+ }
+ vn_syncer_add_to_worklist(bo, delay);
+ }
+ buf_vlist_add(bp, bo, BX_VNDIRTY);
+ } else {
+ buf_vlist_add(bp, bo, BX_VNCLEAN);
+
+ if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
+ mtx_lock(&sync_mtx);
+ LIST_REMOVE(bo, bo_synclist);
+ syncer_worklist_len--;
+ mtx_unlock(&sync_mtx);
+ bo->bo_flag &= ~BO_ONWORKLST;
+ }
+ }
+#ifdef INVARIANTS
+ bv = &bo->bo_clean;
+ bp = TAILQ_FIRST(&bv->bv_hd);
+ KASSERT(bp == NULL || bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
+ bp = TAILQ_LAST(&bv->bv_hd, buflists);
+ KASSERT(bp == NULL || bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
+ bv = &bo->bo_dirty;
+ bp = TAILQ_FIRST(&bv->bv_hd);
+ KASSERT(bp == NULL || bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
+ bp = TAILQ_LAST(&bv->bv_hd, buflists);
+ KASSERT(bp == NULL || bp->b_bufobj == bo,
+ ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
+#endif
+ BO_UNLOCK(bo);
+}
+
+/*
+ * Increment the use and hold counts on the vnode, taking care to reference
+ * the driver's usecount if this is a chardev. The vholdl() will remove
+ * the vnode from the free list if it is presently free. Requires the
+ * vnode interlock and returns with it held.
+ */
+static void
+v_incr_usecount(struct vnode *vp)
+{
+
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ vp->v_usecount++;
+ if (vp->v_type == VCHR && vp->v_rdev != NULL) {
+ dev_lock();
+ vp->v_rdev->si_usecount++;
+ dev_unlock();
+ }
+ vholdl(vp);
+}
+
+/*
+ * Turn a holdcnt into a use+holdcnt such that only one call to
+ * v_decr_usecount is needed.
+ */
+static void
+v_upgrade_usecount(struct vnode *vp)
+{
+
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ vp->v_usecount++;
+ if (vp->v_type == VCHR && vp->v_rdev != NULL) {
+ dev_lock();
+ vp->v_rdev->si_usecount++;
+ dev_unlock();
+ }
+}
+
+/*
+ * Decrement the vnode use and hold count along with the driver's usecount
+ * if this is a chardev. The vdropl() below releases the vnode interlock
+ * as it may free the vnode.
+ */
+static void
+v_decr_usecount(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __FUNCTION__);
+ VNASSERT(vp->v_usecount > 0, vp,
+ ("v_decr_usecount: negative usecount"));
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ vp->v_usecount--;
+ if (vp->v_type == VCHR && vp->v_rdev != NULL) {
+ dev_lock();
+ vp->v_rdev->si_usecount--;
+ dev_unlock();
+ }
+ vdropl(vp);
+}
+
+/*
+ * Decrement only the use count and driver use count. This is intended to
+ * be paired with a follow on vdropl() to release the remaining hold count.
+ * In this way we may vgone() a vnode with a 0 usecount without risk of
+ * having it end up on a free list because the hold count is kept above 0.
+ */
+static void
+v_decr_useonly(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __FUNCTION__);
+ VNASSERT(vp->v_usecount > 0, vp,
+ ("v_decr_useonly: negative usecount"));
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ vp->v_usecount--;
+ if (vp->v_type == VCHR && vp->v_rdev != NULL) {
+ dev_lock();
+ vp->v_rdev->si_usecount--;
+ dev_unlock();
+ }
+}
+
+/*
+ * Grab a particular vnode from the free list, increment its
+ * reference count and lock it. VI_DOOMED is set if the vnode
+ * is being destroyed. Only callers who specify LK_RETRY will
+ * see doomed vnodes. If inactive processing was delayed in
+ * vput(), try to do it here.
+ */
+int
+vget(struct vnode *vp, int flags, struct thread *td)
+{
+ int error;
+
+ error = 0;
+ VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
+ ("vget: invalid lock operation"));
+ CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
+
+ if ((flags & LK_INTERLOCK) == 0)
+ VI_LOCK(vp);
+ vholdl(vp);
+ if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
+ vdrop(vp);
+ CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
+ vp);
+ return (error);
+ }
+ if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
+ panic("vget: vn_lock failed to return ENOENT\n");
+ VI_LOCK(vp);
+ /* Upgrade our holdcnt to a usecount. */
+ v_upgrade_usecount(vp);
+ /*
+ * We don't guarantee that any particular close will
+ * trigger inactive processing so just make a best effort
+ * here at preventing a reference to a removed file. If
+ * we don't succeed no harm is done.
+ */
+ if (vp->v_iflag & VI_OWEINACT) {
+ if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
+ (flags & LK_NOWAIT) == 0)
+ vinactive(vp, td);
+ vp->v_iflag &= ~VI_OWEINACT;
+ }
+ VI_UNLOCK(vp);
+ return (0);
+}
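+
+/*
+ * Illustrative sketch (not part of the interface itself): a caller that
+ * reaches a vnode through a list walk typically holds only the vnode
+ * interlock, lets vget() consume it, and pairs the reference with
+ * vput() when done:
+ *
+ *	VI_LOCK(vp);
+ *	if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY, curthread) == 0) {
+ *		... work on the locked, referenced vnode ...
+ *		vput(vp);
+ *	}
+ *
+ * Without LK_RETRY, vn_lock() fails for a doomed vnode and vget()
+ * returns the error after dropping its transient hold.
+ */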
+
+/*
+ * Increase the reference count of a vnode.
+ */
+void
+vref(struct vnode *vp)
+{
+
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ VI_LOCK(vp);
+ v_incr_usecount(vp);
+ VI_UNLOCK(vp);
+}
+
+/*
+ * Return reference count of a vnode.
+ *
+ * The results of this call are only guaranteed when some mechanism other
+ * than the VI lock is used to stop other processes from gaining references
+ * to the vnode. This may be the case if the caller holds the only reference.
+ * This is also useful when stale data is acceptable as race conditions may
+ * be accounted for by some other means.
+ */
+int
+vrefcnt(struct vnode *vp)
+{
+ int usecnt;
+
+ VI_LOCK(vp);
+ usecnt = vp->v_usecount;
+ VI_UNLOCK(vp);
+
+ return (usecnt);
+}
+
+#define VPUTX_VRELE 1
+#define VPUTX_VPUT 2
+#define VPUTX_VUNREF 3
+
+static void
+vputx(struct vnode *vp, int func)
+{
+ int error;
+
+ KASSERT(vp != NULL, ("vputx: null vp"));
+ if (func == VPUTX_VUNREF)
+ ASSERT_VOP_LOCKED(vp, "vunref");
+ else if (func == VPUTX_VPUT)
+ ASSERT_VOP_LOCKED(vp, "vput");
+ else
+ KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ VI_LOCK(vp);
+
+ /* Skip this v_writecount check if we're going to panic below. */
+ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
+ ("vputx: missed vn_close"));
+ error = 0;
+
+ if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
+ vp->v_usecount == 1)) {
+ if (func == VPUTX_VPUT)
+ VOP_UNLOCK(vp, 0);
+ v_decr_usecount(vp);
+ return;
+ }
+
+ if (vp->v_usecount != 1) {
+ vprint("vputx: negative ref count", vp);
+ panic("vputx: negative ref cnt");
+ }
+ CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
+ /*
+ * We want to hold the vnode until the inactive finishes to
+ * prevent vgone() races. We drop the use count here and the
+ * hold count below when we're done.
+ */
+ v_decr_useonly(vp);
+ /*
+ * We must call VOP_INACTIVE with the node locked. Mark
+ * as VI_DOINGINACT to avoid recursion.
+ */
+ vp->v_iflag |= VI_OWEINACT;
+ switch (func) {
+ case VPUTX_VRELE:
+ error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
+ VI_LOCK(vp);
+ break;
+ case VPUTX_VPUT:
+ if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
+ error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
+ LK_NOWAIT);
+ VI_LOCK(vp);
+ }
+ break;
+ case VPUTX_VUNREF:
+ if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
+ error = EBUSY;
+ break;
+ }
+ if (vp->v_usecount > 0)
+ vp->v_iflag &= ~VI_OWEINACT;
+ if (error == 0) {
+ if (vp->v_iflag & VI_OWEINACT)
+ vinactive(vp, curthread);
+ if (func != VPUTX_VUNREF)
+ VOP_UNLOCK(vp, 0);
+ }
+ vdropl(vp);
+}
+
+/*
+ * Vnode put/release.
+ * If count drops to zero, call inactive routine and return to freelist.
+ */
+void
+vrele(struct vnode *vp)
+{
+
+ vputx(vp, VPUTX_VRELE);
+}
+
+/*
+ * Release an already locked vnode. This gives the same effect as
+ * unlock+vrele(), but takes less time and avoids releasing and
+ * re-acquiring the lock (as vrele() acquires the lock internally).
+ */
+void
+vput(struct vnode *vp)
+{
+
+ vputx(vp, VPUTX_VPUT);
+}
+
+/*
+ * Release an exclusively locked vnode. Do not unlock the vnode lock.
+ */
+void
+vunref(struct vnode *vp)
+{
+
+ vputx(vp, VPUTX_VUNREF);
+}
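+
+/*
+ * Illustrative sketch of how the three release paths differ for a
+ * caller that holds a locked, referenced vnode:
+ *
+ *	vput(vp);			drops the reference and the lock
+ *	vunref(vp); VOP_UNLOCK(vp, 0);	same result, lock dropped by hand
+ *	VOP_UNLOCK(vp, 0); vrele(vp);	same result, but vrele() may have
+ *					to relock internally for inactive
+ *					processing
+ */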
+
+/*
+ * Somebody doesn't want the vnode recycled.
+ */
+void
+vhold(struct vnode *vp)
+{
+
+ VI_LOCK(vp);
+ vholdl(vp);
+ VI_UNLOCK(vp);
+}
+
+/*
+ * Increase the hold count and activate if this is the first reference.
+ */
+void
+vholdl(struct vnode *vp)
+{
+ struct mount *mp;
+
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ vp->v_holdcnt++;
+ if (!VSHOULDBUSY(vp))
+ return;
+ ASSERT_VI_LOCKED(vp, "vholdl");
+ VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
+ VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed."));
+ /*
+ * Remove a vnode from the free list, mark it as in use,
+ * and put it on the active list.
+ */
+ mtx_lock(&vnode_free_list_mtx);
+ TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
+ freevnodes--;
+ vp->v_iflag &= ~(VI_FREE|VI_AGE);
+ KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
+ ("Activating already active vnode"));
+ vp->v_iflag |= VI_ACTIVE;
+ mp = vp->v_mount;
+ TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
+ mp->mnt_activevnodelistsize++;
+ mtx_unlock(&vnode_free_list_mtx);
+}
+
+/*
+ * Note that there is one less who cares about this vnode.
+ * vdrop() is the opposite of vhold().
+ */
+void
+vdrop(struct vnode *vp)
+{
+
+ VI_LOCK(vp);
+ vdropl(vp);
+}
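+
+/*
+ * Illustrative sketch: vhold()/vdrop() bracket work that must keep a
+ * vnode from being recycled without taking a usecount, for example
+ * while a traversal drops the interlock:
+ *
+ *	vholdl(vp);		interlock held on entry
+ *	VI_UNLOCK(vp);
+ *	... examine the vnode; it cannot be freed ...
+ *	vdrop(vp);		retakes and releases the interlock
+ */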
+
+/*
+ * Drop the hold count of the vnode. If this is the last reference to
+ * the vnode we place it on the free list unless it has been vgone'd
+ * (marked VI_DOOMED) in which case we will free it.
+ */
+void
+vdropl(struct vnode *vp)
+{
+ struct bufobj *bo;
+ struct mount *mp;
+ int active;
+
+ ASSERT_VI_LOCKED(vp, "vdropl");
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ if (vp->v_holdcnt <= 0)
+ panic("vdrop: holdcnt %d", vp->v_holdcnt);
+ vp->v_holdcnt--;
+ if (vp->v_holdcnt > 0) {
+ VI_UNLOCK(vp);
+ return;
+ }
+ if ((vp->v_iflag & VI_DOOMED) == 0) {
+ /*
+ * Mark a vnode as free: remove it from its active list
+ * and put it up for recycling on the freelist.
+ */
+ VNASSERT(vp->v_op != NULL, vp,
+ ("vdropl: vnode already reclaimed."));
+ VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
+ ("vnode already free"));
+ VNASSERT(VSHOULDFREE(vp), vp,
+ ("vdropl: freeing when we shouldn't"));
+ active = vp->v_iflag & VI_ACTIVE;
+ vp->v_iflag &= ~VI_ACTIVE;
+ mp = vp->v_mount;
+ mtx_lock(&vnode_free_list_mtx);
+ if (active) {
+ TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
+ v_actfreelist);
+ mp->mnt_activevnodelistsize--;
+ }
+ if (vp->v_iflag & VI_AGE) {
+ TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_actfreelist);
+ } else {
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
+ }
+ freevnodes++;
+ vp->v_iflag &= ~VI_AGE;
+ vp->v_iflag |= VI_FREE;
+ mtx_unlock(&vnode_free_list_mtx);
+ VI_UNLOCK(vp);
+ return;
+ }
+ /*
+ * The vnode has been marked for destruction, so free it.
+ */
+ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
+ mtx_lock(&vnode_free_list_mtx);
+ numvnodes--;
+ mtx_unlock(&vnode_free_list_mtx);
+ bo = &vp->v_bufobj;
+ VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
+ ("cleaned vnode still on the free list."));
+ VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
+ VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
+ VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
+ VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
+ VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
+ VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
+ VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
+ ("clean blk trie not empty"));
+ VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
+ VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
+ ("dirty blk trie not empty"));
+ VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
+ VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
+ VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
+ VI_UNLOCK(vp);
+#ifdef MAC
+ mac_vnode_destroy(vp);
+#endif
+ if (vp->v_pollinfo != NULL)
+ destroy_vpollinfo(vp->v_pollinfo);
+#ifdef INVARIANTS
+ /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
+ vp->v_op = NULL;
+#endif
+ rangelock_destroy(&vp->v_rl);
+ lockdestroy(vp->v_vnlock);
+ mtx_destroy(&vp->v_interlock);
+ rw_destroy(BO_LOCKPTR(bo));
+ uma_zfree(vnode_zone, vp);
+}
+
+/*
+ * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
+ * flags. DOINGINACT prevents us from recursing in calls to vinactive.
+ * OWEINACT tracks whether a vnode missed a call to inactive due to a
+ * failed lock upgrade.
+ */
+void
+vinactive(struct vnode *vp, struct thread *td)
+{
+ struct vm_object *obj;
+
+ ASSERT_VOP_ELOCKED(vp, "vinactive");
+ ASSERT_VI_LOCKED(vp, "vinactive");
+ VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
+ ("vinactive: recursed on VI_DOINGINACT"));
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ vp->v_iflag |= VI_DOINGINACT;
+ vp->v_iflag &= ~VI_OWEINACT;
+ VI_UNLOCK(vp);
+ /*
+ * Before moving off the active list, we must be sure that any
+ * modified pages are on the vnode's dirty list since these will
+ * no longer be checked once the vnode is on the inactive list.
+ * Because the vnode vm object keeps a hold reference on the vnode
+ * if there is at least one resident non-cached page, the vnode
+ * cannot leave the active list without the page cleanup done.
+ */
+ obj = vp->v_object;
+ if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
+ VM_OBJECT_WLOCK(obj);
+ vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
+ VM_OBJECT_WUNLOCK(obj);
+ }
+ VOP_INACTIVE(vp, td);
+ VI_LOCK(vp);
+ VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
+ ("vinactive: lost VI_DOINGINACT"));
+ vp->v_iflag &= ~VI_DOINGINACT;
+}
+
+/*
+ * Remove any vnodes in the vnode table belonging to mount point mp.
+ *
+ * If FORCECLOSE is not specified, there should not be any active ones,
+ * return error if any are found (nb: this is a user error, not a
+ * system error). If FORCECLOSE is specified, detach any active vnodes
+ * that are found.
+ *
+ * If WRITECLOSE is set, only flush out regular file vnodes open for
+ * writing.
+ *
+ * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
+ *
+ * `rootrefs' specifies the base reference count for the root vnode
+ * of this filesystem. The root vnode is considered busy if its
+ * v_usecount exceeds this value. On a successful return, vflush()
+ * will call vrele() on the root vnode exactly rootrefs times.
+ * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
+ * be zero.
+ */
+#ifdef DIAGNOSTIC
+static int busyprt = 0; /* print out busy vnodes */
+SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
+#endif
+
+int
+vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
+{
+ struct vnode *vp, *mvp, *rootvp = NULL;
+ struct vattr vattr;
+ int busy = 0, error;
+
+ CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
+ rootrefs, flags);
+ if (rootrefs > 0) {
+ KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
+ ("vflush: bad args"));
+ /*
+ * Get the filesystem root vnode. We can vput() it
+ * immediately, since with rootrefs > 0, it won't go away.
+ */
+ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
+ CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
+ __func__, error);
+ return (error);
+ }
+ vput(rootvp);
+ }
+loop:
+ MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
+ vholdl(vp);
+ error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
+ if (error) {
+ vdrop(vp);
+ MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
+ goto loop;
+ }
+ /*
+ * Skip over vnodes marked VV_SYSTEM.
+ */
+ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
+ VOP_UNLOCK(vp, 0);
+ vdrop(vp);
+ continue;
+ }
+ /*
+ * If WRITECLOSE is set, flush out unlinked but still open
+ * files (even if open only for reading) and regular file
+ * vnodes open for writing.
+ */
+ if (flags & WRITECLOSE) {
+ if (vp->v_object != NULL) {
+ VM_OBJECT_WLOCK(vp->v_object);
+ vm_object_page_clean(vp->v_object, 0, 0, 0);
+ VM_OBJECT_WUNLOCK(vp->v_object);
+ }
+ error = VOP_FSYNC(vp, MNT_WAIT, td);
+ if (error != 0) {
+ VOP_UNLOCK(vp, 0);
+ vdrop(vp);
+ MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
+ return (error);
+ }
+ error = VOP_GETATTR(vp, &vattr, td->td_ucred);
+ VI_LOCK(vp);
+
+ if ((vp->v_type == VNON ||
+ (error == 0 && vattr.va_nlink > 0)) &&
+ (vp->v_writecount == 0 || vp->v_type != VREG)) {
+ VOP_UNLOCK(vp, 0);
+ vdropl(vp);
+ continue;
+ }
+ } else
+ VI_LOCK(vp);
+ /*
+ * With v_usecount == 0, all we need to do is clear out the
+ * vnode data structures and we are done.
+ *
+ * If FORCECLOSE is set, forcibly close the vnode.
+ */
+ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
+ VNASSERT(vp->v_usecount == 0 ||
+ (vp->v_type != VCHR && vp->v_type != VBLK), vp,
+ ("device VNODE %p is FORCECLOSED", vp));
+ vgonel(vp);
+ } else {
+ busy++;
+#ifdef DIAGNOSTIC
+ if (busyprt)
+ vprint("vflush: busy vnode", vp);
+#endif
+ }
+ VOP_UNLOCK(vp, 0);
+ vdropl(vp);
+ }
+ if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
+ /*
+ * If just the root vnode is busy, and if its refcount
+ * is equal to `rootrefs', then go ahead and kill it.
+ */
+ VI_LOCK(rootvp);
+ KASSERT(busy > 0, ("vflush: not busy"));
+ VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
+ ("vflush: usecount %d < rootrefs %d",
+ rootvp->v_usecount, rootrefs));
+ if (busy == 1 && rootvp->v_usecount == rootrefs) {
+ VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
+ vgone(rootvp);
+ VOP_UNLOCK(rootvp, 0);
+ busy = 0;
+ } else
+ VI_UNLOCK(rootvp);
+ }
+ if (busy) {
+ CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
+ busy);
+ return (EBUSY);
+ }
+ for (; rootrefs > 0; rootrefs--)
+ vrele(rootvp);
+ return (0);
+}
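+
+/*
+ * Illustrative sketch (hypothetical filesystem): an unmount
+ * implementation typically maps MNT_FORCE onto FORCECLOSE and leaves
+ * rootrefs at zero unless it holds extra references on its root vnode:
+ *
+ *	flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
+ *	error = vflush(mp, 0, flags, td);
+ *	if (error != 0)
+ *		return (error);
+ */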
+
+/*
+ * Recycle an unused vnode to the front of the free list.
+ */
+int
+vrecycle(struct vnode *vp)
+{
+ int recycled;
+
+ ASSERT_VOP_ELOCKED(vp, "vrecycle");
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ recycled = 0;
+ VI_LOCK(vp);
+ if (vp->v_usecount == 0) {
+ recycled = 1;
+ vgonel(vp);
+ }
+ VI_UNLOCK(vp);
+ return (recycled);
+}
+
+/*
+ * Eliminate all activity associated with a vnode
+ * in preparation for reuse.
+ */
+void
+vgone(struct vnode *vp)
+{
+ VI_LOCK(vp);
+ vgonel(vp);
+ VI_UNLOCK(vp);
+}
+
+static void
+notify_lowervp_vfs_dummy(struct mount *mp __unused,
+ struct vnode *lowervp __unused)
+{
+}
+
+/*
+ * Notify upper mounts about reclaimed or unlinked vnode.
+ */
+void
+vfs_notify_upper(struct vnode *vp, int event)
+{
+ static struct vfsops vgonel_vfsops = {
+ .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
+ .vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
+ };
+ struct mount *mp, *ump, *mmp;
+
+ mp = vp->v_mount;
+ if (mp == NULL)
+ return;
+
+ MNT_ILOCK(mp);
+ if (TAILQ_EMPTY(&mp->mnt_uppers))
+ goto unlock;
+ MNT_IUNLOCK(mp);
+ mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
+ mmp->mnt_op = &vgonel_vfsops;
+ mmp->mnt_kern_flag |= MNTK_MARKER;
+ MNT_ILOCK(mp);
+ mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
+ for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
+ if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
+ ump = TAILQ_NEXT(ump, mnt_upper_link);
+ continue;
+ }
+ TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
+ MNT_IUNLOCK(mp);
+ switch (event) {
+ case VFS_NOTIFY_UPPER_RECLAIM:
+ VFS_RECLAIM_LOWERVP(ump, vp);
+ break;
+ case VFS_NOTIFY_UPPER_UNLINK:
+ VFS_UNLINK_LOWERVP(ump, vp);
+ break;
+ default:
+ KASSERT(0, ("invalid event %d", event));
+ break;
+ }
+ MNT_ILOCK(mp);
+ ump = TAILQ_NEXT(mmp, mnt_upper_link);
+ TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
+ }
+ free(mmp, M_TEMP);
+ mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
+ if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
+ mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
+ wakeup(&mp->mnt_uppers);
+ }
+unlock:
+ MNT_IUNLOCK(mp);
+}
+
+/*
+ * vgone, with the vp interlock held.
+ */
+void
+vgonel(struct vnode *vp)
+{
+ struct thread *td;
+ int oweinact;
+ int active;
+ struct mount *mp;
+
+ ASSERT_VOP_ELOCKED(vp, "vgonel");
+ ASSERT_VI_LOCKED(vp, "vgonel");
+ VNASSERT(vp->v_holdcnt, vp,
+ ("vgonel: vp %p has no reference.", vp));
+ CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ td = curthread;
+
+ /*
+ * Don't vgonel if we're already doomed.
+ */
+ if (vp->v_iflag & VI_DOOMED)
+ return;
+ vp->v_iflag |= VI_DOOMED;
+
+ /*
+ * Check to see if the vnode is in use. If so, we have to call
+ * VOP_CLOSE() and VOP_INACTIVE().
+ */
+ active = vp->v_usecount;
+ oweinact = (vp->v_iflag & VI_OWEINACT);
+ VI_UNLOCK(vp);
+ vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
+
+ /*
+ * Clean out any buffers associated with the vnode.
+ * If the flush fails, just toss the buffers.
+ */
+ mp = NULL;
+ if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
+ (void) vn_start_secondary_write(vp, &mp, V_WAIT);
+ if (vinvalbuf(vp, V_SAVE, 0, 0) != 0)
+ vinvalbuf(vp, 0, 0, 0);
+
+ /*
+ * If purging an active vnode, it must be closed and
+ * deactivated before being reclaimed.
+ */
+ if (active)
+ VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
+ if (oweinact || active) {
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_DOINGINACT) == 0)
+ vinactive(vp, td);
+ VI_UNLOCK(vp);
+ }
+ if (vp->v_type == VSOCK)
+ vfs_unp_reclaim(vp);
+ /*
+ * Reclaim the vnode.
+ */
+ if (VOP_RECLAIM(vp, td))
+ panic("vgone: cannot reclaim");
+ if (mp != NULL)
+ vn_finished_secondary_write(mp);
+ VNASSERT(vp->v_object == NULL, vp,
+ ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
+ /*
+ * Clear the advisory locks and wake up waiting threads.
+ */
+ (void)VOP_ADVLOCKPURGE(vp);
+ /*
+ * Delete from old mount point vnode list.
+ */
+ delmntque(vp);
+ cache_purge(vp);
+ /*
+ * Done with purge, reset to the standard lock and invalidate
+ * the vnode.
+ */
+ VI_LOCK(vp);
+ vp->v_vnlock = &vp->v_lock;
+ vp->v_op = &dead_vnodeops;
+ vp->v_tag = "none";
+ vp->v_type = VBAD;
+}
+
+/*
+ * Calculate the total number of references to a special device.
+ */
+int
+vcount(struct vnode *vp)
+{
+ int count;
+
+ dev_lock();
+ count = vp->v_rdev->si_usecount;
+ dev_unlock();
+ return (count);
+}
+
+/*
+ * Same as above, but using the struct cdev *as argument
+ */
+int
+count_dev(struct cdev *dev)
+{
+ int count;
+
+ dev_lock();
+ count = dev->si_usecount;
+ dev_unlock();
+ return (count);
+}
+
+/*
+ * Print out a description of a vnode.
+ */
+static char *typename[] =
+{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
+ "VMARKER"};
+
+void
+vn_printf(struct vnode *vp, const char *fmt, ...)
+{
+ va_list ap;
+ char buf[256], buf2[16];
+ u_long flags;
+
+ va_start(ap, fmt);
+ vprintf(fmt, ap);
+ va_end(ap);
+ printf("%p: ", (void *)vp);
+ printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
+ printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n",
+ vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
+ buf[0] = '\0';
+ buf[1] = '\0';
+ if (vp->v_vflag & VV_ROOT)
+ strlcat(buf, "|VV_ROOT", sizeof(buf));
+ if (vp->v_vflag & VV_ISTTY)
+ strlcat(buf, "|VV_ISTTY", sizeof(buf));
+ if (vp->v_vflag & VV_NOSYNC)
+ strlcat(buf, "|VV_NOSYNC", sizeof(buf));
+ if (vp->v_vflag & VV_ETERNALDEV)
+ strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
+ if (vp->v_vflag & VV_CACHEDLABEL)
+ strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
+ if (vp->v_vflag & VV_TEXT)
+ strlcat(buf, "|VV_TEXT", sizeof(buf));
+ if (vp->v_vflag & VV_COPYONWRITE)
+ strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
+ if (vp->v_vflag & VV_SYSTEM)
+ strlcat(buf, "|VV_SYSTEM", sizeof(buf));
+ if (vp->v_vflag & VV_PROCDEP)
+ strlcat(buf, "|VV_PROCDEP", sizeof(buf));
+ if (vp->v_vflag & VV_NOKNOTE)
+ strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
+ if (vp->v_vflag & VV_DELETED)
+ strlcat(buf, "|VV_DELETED", sizeof(buf));
+ if (vp->v_vflag & VV_MD)
+ strlcat(buf, "|VV_MD", sizeof(buf));
+ if (vp->v_vflag & VV_FORCEINSMQ)
+ strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
+ flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
+ VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
+ VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
+ if (flags != 0) {
+ snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
+ strlcat(buf, buf2, sizeof(buf));
+ }
+ if (vp->v_iflag & VI_MOUNT)
+ strlcat(buf, "|VI_MOUNT", sizeof(buf));
+ if (vp->v_iflag & VI_AGE)
+ strlcat(buf, "|VI_AGE", sizeof(buf));
+ if (vp->v_iflag & VI_DOOMED)
+ strlcat(buf, "|VI_DOOMED", sizeof(buf));
+ if (vp->v_iflag & VI_FREE)
+ strlcat(buf, "|VI_FREE", sizeof(buf));
+ if (vp->v_iflag & VI_ACTIVE)
+ strlcat(buf, "|VI_ACTIVE", sizeof(buf));
+ if (vp->v_iflag & VI_DOINGINACT)
+ strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
+ if (vp->v_iflag & VI_OWEINACT)
+ strlcat(buf, "|VI_OWEINACT", sizeof(buf));
+ flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
+ VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
+ if (flags != 0) {
+ snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
+ strlcat(buf, buf2, sizeof(buf));
+ }
+ printf(" flags (%s)\n", buf + 1);
+ if (mtx_owned(VI_MTX(vp)))
+ printf(" VI_LOCKed");
+ if (vp->v_object != NULL)
+ printf(" v_object %p ref %d pages %d\n",
+ vp->v_object, vp->v_object->ref_count,
+ vp->v_object->resident_page_count);
+ printf(" ");
+ lockmgr_printinfo(vp->v_vnlock);
+ if (vp->v_data != NULL)
+ VOP_PRINT(vp);
+}
+
+#ifdef DDB
+/*
+ * List all of the locked vnodes in the system.
+ * Called when debugging the kernel.
+ */
+DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
+{
+ struct mount *mp;
+ struct vnode *vp;
+
+ /*
+ * Note: because this is DDB, we can't obey the locking semantics
+ * for these structures, which means we could catch an inconsistent
+ * state and dereference a nasty pointer. Not much to be done
+ * about that.
+ */
+ db_printf("Locked vnodes\n");
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
+ if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
+ vprint("", vp);
+ }
+ }
+}
+
+/*
+ * Show details about the given vnode.
+ */
+DB_SHOW_COMMAND(vnode, db_show_vnode)
+{
+ struct vnode *vp;
+
+ if (!have_addr)
+ return;
+ vp = (struct vnode *)addr;
+ vn_printf(vp, "vnode ");
+}
+
+/*
+ * Show details about the given mount point.
+ */
+DB_SHOW_COMMAND(mount, db_show_mount)
+{
+ struct mount *mp;
+ struct vfsopt *opt;
+ struct statfs *sp;
+ struct vnode *vp;
+ char buf[512];
+ uint64_t mflags;
+ u_int flags;
+
+ if (!have_addr) {
+ /* No address given, print short info about all mount points. */
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ db_printf("%p %s on %s (%s)\n", mp,
+ mp->mnt_stat.f_mntfromname,
+ mp->mnt_stat.f_mntonname,
+ mp->mnt_stat.f_fstypename);
+ if (db_pager_quit)
+ break;
+ }
+ db_printf("\nMore info: show mount <addr>\n");
+ return;
+ }
+
+ mp = (struct mount *)addr;
+ db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
+ mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
+
+ buf[0] = '\0';
+ mflags = mp->mnt_flag;
+#define MNT_FLAG(flag) do { \
+ if (mflags & (flag)) { \
+ if (buf[0] != '\0') \
+ strlcat(buf, ", ", sizeof(buf)); \
+ strlcat(buf, (#flag) + 4, sizeof(buf)); \
+ mflags &= ~(flag); \
+ } \
+} while (0)
+ MNT_FLAG(MNT_RDONLY);
+ MNT_FLAG(MNT_SYNCHRONOUS);
+ MNT_FLAG(MNT_NOEXEC);
+ MNT_FLAG(MNT_NOSUID);
+ MNT_FLAG(MNT_NFS4ACLS);
+ MNT_FLAG(MNT_UNION);
+ MNT_FLAG(MNT_ASYNC);
+ MNT_FLAG(MNT_SUIDDIR);
+ MNT_FLAG(MNT_SOFTDEP);
+ MNT_FLAG(MNT_NOSYMFOLLOW);
+ MNT_FLAG(MNT_GJOURNAL);
+ MNT_FLAG(MNT_MULTILABEL);
+ MNT_FLAG(MNT_ACLS);
+ MNT_FLAG(MNT_NOATIME);
+ MNT_FLAG(MNT_NOCLUSTERR);
+ MNT_FLAG(MNT_NOCLUSTERW);
+ MNT_FLAG(MNT_SUJ);
+ MNT_FLAG(MNT_EXRDONLY);
+ MNT_FLAG(MNT_EXPORTED);
+ MNT_FLAG(MNT_DEFEXPORTED);
+ MNT_FLAG(MNT_EXPORTANON);
+ MNT_FLAG(MNT_EXKERB);
+ MNT_FLAG(MNT_EXPUBLIC);
+ MNT_FLAG(MNT_LOCAL);
+ MNT_FLAG(MNT_QUOTA);
+ MNT_FLAG(MNT_ROOTFS);
+ MNT_FLAG(MNT_USER);
+ MNT_FLAG(MNT_IGNORE);
+ MNT_FLAG(MNT_UPDATE);
+ MNT_FLAG(MNT_DELEXPORT);
+ MNT_FLAG(MNT_RELOAD);
+ MNT_FLAG(MNT_FORCE);
+ MNT_FLAG(MNT_SNAPSHOT);
+ MNT_FLAG(MNT_BYFSID);
+#undef MNT_FLAG
+ if (mflags != 0) {
+ if (buf[0] != '\0')
+ strlcat(buf, ", ", sizeof(buf));
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
+ "0x%016jx", mflags);
+ }
+ db_printf(" mnt_flag = %s\n", buf);
+
+ buf[0] = '\0';
+ flags = mp->mnt_kern_flag;
+#define MNT_KERN_FLAG(flag) do { \
+ if (flags & (flag)) { \
+ if (buf[0] != '\0') \
+ strlcat(buf, ", ", sizeof(buf)); \
+ strlcat(buf, (#flag) + 5, sizeof(buf)); \
+ flags &= ~(flag); \
+ } \
+} while (0)
+ MNT_KERN_FLAG(MNTK_UNMOUNTF);
+ MNT_KERN_FLAG(MNTK_ASYNC);
+ MNT_KERN_FLAG(MNTK_SOFTDEP);
+ MNT_KERN_FLAG(MNTK_NOINSMNTQ);
+ MNT_KERN_FLAG(MNTK_DRAINING);
+ MNT_KERN_FLAG(MNTK_REFEXPIRE);
+ MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
+ MNT_KERN_FLAG(MNTK_SHARED_WRITES);
+ MNT_KERN_FLAG(MNTK_NO_IOPF);
+ MNT_KERN_FLAG(MNTK_VGONE_UPPER);
+ MNT_KERN_FLAG(MNTK_VGONE_WAITER);
+ MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
+ MNT_KERN_FLAG(MNTK_MARKER);
+ MNT_KERN_FLAG(MNTK_NOASYNC);
+ MNT_KERN_FLAG(MNTK_UNMOUNT);
+ MNT_KERN_FLAG(MNTK_MWAIT);
+ MNT_KERN_FLAG(MNTK_SUSPEND);
+ MNT_KERN_FLAG(MNTK_SUSPEND2);
+ MNT_KERN_FLAG(MNTK_SUSPENDED);
+ MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
+ MNT_KERN_FLAG(MNTK_NOKNOTE);
+#undef MNT_KERN_FLAG
+ if (flags != 0) {
+ if (buf[0] != '\0')
+ strlcat(buf, ", ", sizeof(buf));
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
+ "0x%08x", flags);
+ }
+ db_printf(" mnt_kern_flag = %s\n", buf);
+
+ db_printf(" mnt_opt = ");
+ opt = TAILQ_FIRST(mp->mnt_opt);
+ if (opt != NULL) {
+ db_printf("%s", opt->name);
+ opt = TAILQ_NEXT(opt, link);
+ while (opt != NULL) {
+ db_printf(", %s", opt->name);
+ opt = TAILQ_NEXT(opt, link);
+ }
+ }
+ db_printf("\n");
+
+ sp = &mp->mnt_stat;
+ db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx "
+ "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
+ "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
+ "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
+ (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
+ (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
+ (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
+ (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
+ (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
+ (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
+ (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
+ (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
+
+ db_printf(" mnt_cred = { uid=%u ruid=%u",
+ (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
+ if (jailed(mp->mnt_cred))
+ db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
+ db_printf(" }\n");
+ db_printf(" mnt_ref = %d\n", mp->mnt_ref);
+ db_printf(" mnt_gen = %d\n", mp->mnt_gen);
+ db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
+ db_printf(" mnt_activevnodelistsize = %d\n",
+ mp->mnt_activevnodelistsize);
+ db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount);
+ db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
+ db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max);
+ db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed);
+ db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
+ db_printf(" mnt_secondary_accwrites = %d\n",
+ mp->mnt_secondary_accwrites);
+ db_printf(" mnt_gjprovider = %s\n",
+ mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
+
+ db_printf("\n\nList of active vnodes\n");
+ TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
+ if (vp->v_type != VMARKER) {
+ vn_printf(vp, "vnode ");
+ if (db_pager_quit)
+ break;
+ }
+ }
+ db_printf("\n\nList of inactive vnodes\n");
+ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
+ if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
+ vn_printf(vp, "vnode ");
+ if (db_pager_quit)
+ break;
+ }
+ }
+}
+#endif /* DDB */
+
+/*
+ * Fill in a struct xvfsconf based on a struct vfsconf.
+ */
+static int
+vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
+{
+ struct xvfsconf xvfsp;
+
+ bzero(&xvfsp, sizeof(xvfsp));
+ strcpy(xvfsp.vfc_name, vfsp->vfc_name);
+ xvfsp.vfc_typenum = vfsp->vfc_typenum;
+ xvfsp.vfc_refcount = vfsp->vfc_refcount;
+ xvfsp.vfc_flags = vfsp->vfc_flags;
+ /*
+ * These are unused in userland; we keep them
+ * to preserve binary compatibility.
+ */
+ xvfsp.vfc_vfsops = NULL;
+ xvfsp.vfc_next = NULL;
+ return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
+}
+
+#ifdef COMPAT_FREEBSD32
+struct xvfsconf32 {
+ uint32_t vfc_vfsops;
+ char vfc_name[MFSNAMELEN];
+ int32_t vfc_typenum;
+ int32_t vfc_refcount;
+ int32_t vfc_flags;
+ uint32_t vfc_next;
+};
+
+static int
+vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
+{
+ struct xvfsconf32 xvfsp;
+
+ strcpy(xvfsp.vfc_name, vfsp->vfc_name);
+ xvfsp.vfc_typenum = vfsp->vfc_typenum;
+ xvfsp.vfc_refcount = vfsp->vfc_refcount;
+ xvfsp.vfc_flags = vfsp->vfc_flags;
+ xvfsp.vfc_vfsops = 0;
+ xvfsp.vfc_next = 0;
+ return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
+}
+#endif
+
+/*
+ * Top level filesystem related information gathering.
+ */
+static int
+sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
+{
+ struct vfsconf *vfsp;
+ int error;
+
+ error = 0;
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
+#ifdef COMPAT_FREEBSD32
+ if (req->flags & SCTL_MASK32)
+ error = vfsconf2x32(req, vfsp);
+ else
+#endif
+ error = vfsconf2x(req, vfsp);
+ if (error)
+ break;
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD,
+ NULL, 0, sysctl_vfs_conflist,
+ "S,xvfsconf", "List of all configured filesystems");
+
+#ifndef BURN_BRIDGES
+static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
+
+static int
+vfs_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ int *name = (int *)arg1 - 1; /* XXX */
+ u_int namelen = arg2 + 1; /* XXX */
+ struct vfsconf *vfsp;
+
+ log(LOG_WARNING, "userland calling deprecated sysctl, "
+ "please rebuild world\n");
+
+#if 1 || defined(COMPAT_PRELITE2)
+ /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
+ if (namelen == 1)
+ return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
+#endif
+
+ switch (name[1]) {
+ case VFS_MAXTYPENUM:
+ if (namelen != 2)
+ return (ENOTDIR);
+ return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
+ case VFS_CONF:
+ if (namelen != 3)
+ return (ENOTDIR); /* overloaded */
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
+ if (vfsp->vfc_typenum == name[2])
+ break;
+ if (vfsp == NULL)
+ return (EOPNOTSUPP);
+#ifdef COMPAT_FREEBSD32
+ if (req->flags & SCTL_MASK32)
+ return (vfsconf2x32(req, vfsp));
+ else
+#endif
+ return (vfsconf2x(req, vfsp));
+ }
+ return (EOPNOTSUPP);
+}
+
+static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
+ vfs_sysctl, "Generic filesystem");
+
+#if 1 || defined(COMPAT_PRELITE2)
+
+static int
+sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ struct vfsconf *vfsp;
+ struct ovfsconf ovfs;
+
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
+ bzero(&ovfs, sizeof(ovfs));
+ ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
+ strcpy(ovfs.vfc_name, vfsp->vfc_name);
+ ovfs.vfc_index = vfsp->vfc_typenum;
+ ovfs.vfc_refcount = vfsp->vfc_refcount;
+ ovfs.vfc_flags = vfsp->vfc_flags;
+ error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
+ if (error)
+ return (error);
+ }
+ return (0);
+}
+
+#endif /* 1 || COMPAT_PRELITE2 */
+#endif /* !BURN_BRIDGES */
+
+#define KINFO_VNODESLOP 10
+#ifdef notyet
+/*
+ * Dump vnode list (via sysctl).
+ */
+/* ARGSUSED */
+static int
+sysctl_vnode(SYSCTL_HANDLER_ARGS)
+{
+ struct xvnode *xvn;
+ struct mount *mp;
+ struct vnode *vp;
+ int error, len, n;
+
+ /*
+ * Stale numvnodes access is not fatal here.
+ */
+ req->lock = 0;
+ len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
+ if (!req->oldptr)
+ /* Make an estimate */
+ return (SYSCTL_OUT(req, 0, len));
+
+ error = sysctl_wire_old_buffer(req, 0);
+ if (error != 0)
+ return (error);
+ xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
+ n = 0;
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
+ continue;
+ MNT_ILOCK(mp);
+ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
+ if (n == len)
+ break;
+ vref(vp);
+ xvn[n].xv_size = sizeof *xvn;
+ xvn[n].xv_vnode = vp;
+ xvn[n].xv_id = 0; /* XXX compat */
+#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
+ XV_COPY(usecount);
+ XV_COPY(writecount);
+ XV_COPY(holdcnt);
+ XV_COPY(mount);
+ XV_COPY(numoutput);
+ XV_COPY(type);
+#undef XV_COPY
+ xvn[n].xv_flag = vp->v_vflag;
+
+ switch (vp->v_type) {
+ case VREG:
+ case VDIR:
+ case VLNK:
+ break;
+ case VBLK:
+ case VCHR:
+ if (vp->v_rdev == NULL) {
+ vrele(vp);
+ continue;
+ }
+ xvn[n].xv_dev = dev2udev(vp->v_rdev);
+ break;
+ case VSOCK:
+ xvn[n].xv_socket = vp->v_socket;
+ break;
+ case VFIFO:
+ xvn[n].xv_fifo = vp->v_fifoinfo;
+ break;
+ case VNON:
+ case VBAD:
+ default:
+ /* shouldn't happen? */
+ vrele(vp);
+ continue;
+ }
+ vrele(vp);
+ ++n;
+ }
+ MNT_IUNLOCK(mp);
+ mtx_lock(&mountlist_mtx);
+ vfs_unbusy(mp);
+ if (n == len)
+ break;
+ }
+ mtx_unlock(&mountlist_mtx);
+
+ error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
+ free(xvn, M_TEMP);
+ return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, 0, sysctl_vnode, "S,xvnode", "");
+#endif
+
+/*
+ * Unmount all filesystems. The list is traversed in reverse order
+ * of mounting to avoid dependencies.
+ */
+void
+vfs_unmountall(void)
+{
+ struct mount *mp;
+ struct thread *td;
+ int error;
+
+ CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
+ td = curthread;
+
+ /*
+ * Since this only runs when rebooting, it is not interlocked.
+ */
+ while (!TAILQ_EMPTY(&mountlist)) {
+ mp = TAILQ_LAST(&mountlist, mntlist);
+ error = dounmount(mp, MNT_FORCE, td);
+ if (error) {
+ TAILQ_REMOVE(&mountlist, mp, mnt_list);
+ /*
+ * XXX: Due to the way in which we mount the root
+ * file system off of devfs, devfs will generate a
+ * "busy" warning when we try to unmount it before
+ * the root. Don't print a warning as a result in
+ * order to avoid false positive errors that may
+ * cause needless upset.
+ */
+ if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
+ printf("unmount of %s failed (",
+ mp->mnt_stat.f_mntonname);
+ if (error == EBUSY)
+ printf("BUSY)\n");
+ else
+ printf("%d)\n", error);
+ }
+ } else {
+ /* The unmount has removed mp from the mountlist */
+ }
+ }
+}
+
+/*
+ * Perform msync on all vnodes under a mount point.
+ * The mount point must be locked.
+ */
+void
+vfs_msync(struct mount *mp, int flags)
+{
+ struct vnode *vp, *mvp;
+ struct vm_object *obj;
+
+ CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+ MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
+ obj = vp->v_object;
+ if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
+ (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
+ if (!vget(vp,
+ LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
+ curthread)) {
+ if (vp->v_vflag & VV_NOSYNC) { /* unlinked */
+ vput(vp);
+ continue;
+ }
+
+ obj = vp->v_object;
+ if (obj != NULL) {
+ VM_OBJECT_WLOCK(obj);
+ vm_object_page_clean(obj, 0, 0,
+ flags == MNT_WAIT ?
+ OBJPC_SYNC : OBJPC_NOSYNC);
+ VM_OBJECT_WUNLOCK(obj);
+ }
+ vput(vp);
+ }
+ } else
+ VI_UNLOCK(vp);
+ }
+}
+
+static void
+destroy_vpollinfo_free(struct vpollinfo *vi)
+{
+
+ knlist_destroy(&vi->vpi_selinfo.si_note);
+ mtx_destroy(&vi->vpi_lock);
+ uma_zfree(vnodepoll_zone, vi);
+}
+
+static void
+destroy_vpollinfo(struct vpollinfo *vi)
+{
+
+ knlist_clear(&vi->vpi_selinfo.si_note, 1);
+ seldrain(&vi->vpi_selinfo);
+ destroy_vpollinfo_free(vi);
+}
+
+/*
+ * Initialize a per-vnode helper structure to hold poll-related state.
+ */
+void
+v_addpollinfo(struct vnode *vp)
+{
+ struct vpollinfo *vi;
+
+ if (vp->v_pollinfo != NULL)
+ return;
+ vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
+ mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
+ knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
+ vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
+ VI_LOCK(vp);
+ if (vp->v_pollinfo != NULL) {
+ VI_UNLOCK(vp);
+ destroy_vpollinfo_free(vi);
+ return;
+ }
+ vp->v_pollinfo = vi;
+ VI_UNLOCK(vp);
+}
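+
+/*
+ * Note the unlocked pre-check followed by the re-check under the
+ * interlock above: two racing callers may both allocate a vpollinfo,
+ * but the loser frees its copy with destroy_vpollinfo_free(), so
+ * vp->v_pollinfo is assigned exactly once.
+ */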
+
+/*
+ * Record a process's interest in events which might happen to
+ * a vnode. Because poll uses the historic select-style interface
+ * internally, this routine serves as both the ``check for any
+ * pending events'' and the ``record my interest in future events''
+ * functions. (These are done together, while the lock is held,
+ * to avoid race conditions.)
+ */
+int
+vn_pollrecord(struct vnode *vp, struct thread *td, int events)
+{
+
+ v_addpollinfo(vp);
+ mtx_lock(&vp->v_pollinfo->vpi_lock);
+ if (vp->v_pollinfo->vpi_revents & events) {
+ /*
+ * This leaves events we are not interested
+ * in available for the other process which
+ * presumably had requested them
+ * (otherwise they would never have been
+ * recorded).
+ */
+ events &= vp->v_pollinfo->vpi_revents;
+ vp->v_pollinfo->vpi_revents &= ~events;
+
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ return (events);
+ }
+ vp->v_pollinfo->vpi_events |= events;
+ selrecord(td, &vp->v_pollinfo->vpi_selinfo);
+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ return (0);
+}
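+
+/*
+ * Illustrative sketch (hypothetical VOP_POLL method): events that
+ * cannot be answered immediately are recorded here so the caller is
+ * woken when they occur; handled_events below is a placeholder mask:
+ *
+ *	if ((ap->a_events & handled_events) == 0)
+ *		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
+ */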
+
+/*
+ * Routine to create and manage a filesystem syncer vnode.
+ */
+#define sync_close ((int (*)(struct vop_close_args *))nullop)
+static int sync_fsync(struct vop_fsync_args *);
+static int sync_inactive(struct vop_inactive_args *);
+static int sync_reclaim(struct vop_reclaim_args *);
+
+static struct vop_vector sync_vnodeops = {
+ .vop_bypass = VOP_EOPNOTSUPP,
+ .vop_close = sync_close, /* close */
+ .vop_fsync = sync_fsync, /* fsync */
+ .vop_inactive = sync_inactive, /* inactive */
+ .vop_reclaim = sync_reclaim, /* reclaim */
+ .vop_lock1 = vop_stdlock, /* lock */
+ .vop_unlock = vop_stdunlock, /* unlock */
+ .vop_islocked = vop_stdislocked, /* islocked */
+};
+
+/*
+ * Create a new filesystem syncer vnode for the specified mount point.
+ */
+void
+vfs_allocate_syncvnode(struct mount *mp)
+{
+ struct vnode *vp;
+ struct bufobj *bo;
+ static long start, incr, next;
+ int error;
+
+ /* Allocate a new vnode */
+ error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
+ if (error != 0)
+ panic("vfs_allocate_syncvnode: getnewvnode() failed");
+ vp->v_type = VNON;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ vp->v_vflag |= VV_FORCEINSMQ;
+ error = insmntque(vp, mp);
+ if (error != 0)
+ panic("vfs_allocate_syncvnode: insmntque() failed");
+ vp->v_vflag &= ~VV_FORCEINSMQ;
+ VOP_UNLOCK(vp, 0);
+ /*
+ * Place the vnode onto the syncer worklist. We attempt to
+ * scatter them about on the list so that they will go off
+ * at evenly distributed times even if all the filesystems
+ * are mounted at once.
+ */
+ next += incr;
+ if (next == 0 || next > syncer_maxdelay) {
+ start /= 2;
+ incr /= 2;
+ if (start == 0) {
+ start = syncer_maxdelay / 2;
+ incr = syncer_maxdelay;
+ }
+ next = start;
+ }
+ bo = &vp->v_bufobj;
+ BO_LOCK(bo);
+ vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
+ /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
+ mtx_lock(&sync_mtx);
+ sync_vnode_count++;
+ if (mp->mnt_syncer == NULL) {
+ mp->mnt_syncer = vp;
+ vp = NULL;
+ }
+ mtx_unlock(&sync_mtx);
+ BO_UNLOCK(bo);
+ if (vp != NULL) {
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ vgone(vp);
+ vput(vp);
+ }
+}
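+
+/*
+ * Worked example of the scattering above, assuming syncer_maxdelay is
+ * 32: successive calls yield next = 16, 8, 24, 4, 12, 20, 28, 2, ...
+ * Each pass halves the spacing, so syncer vnodes land in evenly
+ * distributed worklist slots even when many filesystems are mounted at
+ * the same time.
+ */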
+
+void
+vfs_deallocate_syncvnode(struct mount *mp)
+{
+ struct vnode *vp;
+
+ mtx_lock(&sync_mtx);
+ vp = mp->mnt_syncer;
+ if (vp != NULL)
+ mp->mnt_syncer = NULL;
+ mtx_unlock(&sync_mtx);
+ if (vp != NULL)
+ vrele(vp);
+}
+
+/*
+ * Do a lazy sync of the filesystem.
+ */
+static int
+sync_fsync(struct vop_fsync_args *ap)
+{
+ struct vnode *syncvp = ap->a_vp;
+ struct mount *mp = syncvp->v_mount;
+ int error, save;
+ struct bufobj *bo;
+
+ /*
+ * We only need to do something if this is a lazy evaluation.
+ */
+ if (ap->a_waitfor != MNT_LAZY)
+ return (0);
+
+ /*
+ * Move ourselves to the back of the sync list.
+ */
+ bo = &syncvp->v_bufobj;
+ BO_LOCK(bo);
+ vn_syncer_add_to_worklist(bo, syncdelay);
+ BO_UNLOCK(bo);
+
+ /*
+ * Walk the list of vnodes pushing all that are dirty and
+ * not already on the sync list.
+ */
+ mtx_lock(&mountlist_mtx);
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
+ mtx_unlock(&mountlist_mtx);
+ return (0);
+ }
+ if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
+ vfs_unbusy(mp);
+ return (0);
+ }
+ save = curthread_pflags_set(TDP_SYNCIO);
+ vfs_msync(mp, MNT_NOWAIT);
+ error = VFS_SYNC(mp, MNT_LAZY);
+ curthread_pflags_restore(save);
+ vn_finished_write(mp);
+ vfs_unbusy(mp);
+ return (error);
+}
+
+/*
+ * The syncer vnode is no longer referenced.
+ */
+static int
+sync_inactive(struct vop_inactive_args *ap)
+{
+
+ vgone(ap->a_vp);
+ return (0);
+}
+
+/*
+ * The syncer vnode is no longer needed and is being decommissioned.
+ *
+ * Modifications to the worklist must be protected by sync_mtx.
+ */
+static int
+sync_reclaim(struct vop_reclaim_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct bufobj *bo;
+
+ bo = &vp->v_bufobj;
+ BO_LOCK(bo);
+ mtx_lock(&sync_mtx);
+ if (vp->v_mount->mnt_syncer == vp)
+ vp->v_mount->mnt_syncer = NULL;
+ if (bo->bo_flag & BO_ONWORKLST) {
+ LIST_REMOVE(bo, bo_synclist);
+ syncer_worklist_len--;
+ sync_vnode_count--;
+ bo->bo_flag &= ~BO_ONWORKLST;
+ }
+ mtx_unlock(&sync_mtx);
+ BO_UNLOCK(bo);
+
+ return (0);
+}
+
+/*
+ * Check if vnode represents a disk device
+ */
+int
+vn_isdisk(struct vnode *vp, int *errp)
+{
+ int error;
+
+ error = 0;
+ dev_lock();
+ if (vp->v_type != VCHR)
+ error = ENOTBLK;
+ else if (vp->v_rdev == NULL)
+ error = ENXIO;
+ else if (vp->v_rdev->si_devsw == NULL)
+ error = ENXIO;
+ else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
+ error = ENOTBLK;
+ dev_unlock();
+ if (errp != NULL)
+ *errp = error;
+ return (error == 0);
+}
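+
+/*
+ * Illustrative sketch: mount code for a disk-backed filesystem commonly
+ * rejects anything that is not a disk device vnode up front:
+ *
+ *	if (!vn_isdisk(devvp, &error)) {
+ *		vput(devvp);
+ *		return (error);
+ *	}
+ */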
+
+/*
+ * Common filesystem object access control check routine. Accepts a
+ * vnode's type, "mode", uid and gid, requested access mode, credentials,
+ * and optional call-by-reference privused argument allowing vaccess()
+ * to indicate to the caller whether privilege was used to satisfy the
+ * request (obsoleted). Returns 0 on success, or an errno on failure.
+ */
+int
+vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
+ accmode_t accmode, struct ucred *cred, int *privused)
+{
+ accmode_t dac_granted;
+ accmode_t priv_granted;
+
+ KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
+ ("invalid bit in accmode"));
+ KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
+ ("VAPPEND without VWRITE"));
+
+ /*
+ * Look for a normal, non-privileged way to access the file/directory
+ * as requested. If it exists, go with that.
+ */
+
+ if (privused != NULL)
+ *privused = 0;
+
+ dac_granted = 0;
+
+ /* Check the owner. */
+ if (cred->cr_uid == file_uid) {
+ dac_granted |= VADMIN;
+ if (file_mode & S_IXUSR)
+ dac_granted |= VEXEC;
+ if (file_mode & S_IRUSR)
+ dac_granted |= VREAD;
+ if (file_mode & S_IWUSR)
+ dac_granted |= (VWRITE | VAPPEND);
+
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+
+ goto privcheck;
+ }
+
+ /* Otherwise, check the groups (first match) */
+ if (groupmember(file_gid, cred)) {
+ if (file_mode & S_IXGRP)
+ dac_granted |= VEXEC;
+ if (file_mode & S_IRGRP)
+ dac_granted |= VREAD;
+ if (file_mode & S_IWGRP)
+ dac_granted |= (VWRITE | VAPPEND);
+
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+
+ goto privcheck;
+ }
+
+ /* Otherwise, check everyone else. */
+ if (file_mode & S_IXOTH)
+ dac_granted |= VEXEC;
+ if (file_mode & S_IROTH)
+ dac_granted |= VREAD;
+ if (file_mode & S_IWOTH)
+ dac_granted |= (VWRITE | VAPPEND);
+ if ((accmode & dac_granted) == accmode)
+ return (0);
+
+privcheck:
+ /*
+ * Build a privilege mask to determine if the set of privileges
+ * satisfies the requirements when combined with the granted mask
+ * from above. For each privilege, if the privilege is required,
+ * bitwise or the request type onto the priv_granted mask.
+ */
+ priv_granted = 0;
+
+ if (type == VDIR) {
+ /*
+ * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
+ * requests, instead of PRIV_VFS_EXEC.
+ */
+ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
+ !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
+ priv_granted |= VEXEC;
+ } else {
+ /*
+ * Ensure that at least one execute bit is on. Otherwise,
+ * a privileged user will always succeed, and we don't want
+ * this to happen unless the file really is executable.
+ */
+ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
+ (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
+ !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
+ priv_granted |= VEXEC;
+ }
+
+ if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
+ !priv_check_cred(cred, PRIV_VFS_READ, 0))
+ priv_granted |= VREAD;
+
+ if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
+ !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
+ priv_granted |= (VWRITE | VAPPEND);
+
+ if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
+ !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
+ priv_granted |= VADMIN;
+
+ if ((accmode & (priv_granted | dac_granted)) == accmode) {
+ /* XXX audit: privilege used */
+ if (privused != NULL)
+ *privused = 1;
+ return (0);
+ }
+
+ return ((accmode & VADMIN) ? EPERM : EACCES);
+}
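+
+/*
+ * Illustrative sketch (hypothetical filesystem): a VOP_ACCESS method
+ * normally ends by handing its on-disk ownership and mode bits to
+ * vaccess(); the i_* field names below are placeholders:
+ *
+ *	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
+ *	    ap->a_accmode, ap->a_cred, NULL));
+ */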
+
+/*
+ * Credential check based on process requesting service, and per-attribute
+ * permissions.
+ */
+int
+extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
+ struct thread *td, accmode_t accmode)
+{
+
+ /*
+ * Kernel-invoked requests always succeed.
+ */
+ if (cred == NOCRED)
+ return (0);
+
+ /*
+ * Do not allow privileged processes in jail to directly manipulate
+ * system attributes.
+ */
+ switch (attrnamespace) {
+ case EXTATTR_NAMESPACE_SYSTEM:
+ /* Potentially should be: return (EPERM); */
+ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
+ case EXTATTR_NAMESPACE_USER:
+ return (VOP_ACCESS(vp, accmode, cred, td));
+ default:
+ return (EPERM);
+ }
+}
+
+#ifdef DEBUG_VFS_LOCKS
+/*
+ * This only exists to suppress warnings from unlocked specfs accesses. It is
+ * no longer ok to have an unlocked VFS.
+ */
+#define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \
+ (vp)->v_type == VCHR || (vp)->v_type == VBAD)
+
+int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */
+SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
+ "Drop into debugger on lock violation");
+
+int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */
+SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
+ 0, "Check for interlock across VOPs");
+
+int vfs_badlock_print = 1; /* Print lock violations. */
+SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
+ 0, "Print lock violations");
+
+#ifdef KDB
+int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */
+SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
+ &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
+#endif
+
+static void
+vfs_badlock(const char *msg, const char *str, struct vnode *vp)
+{
+
+#ifdef KDB
+ if (vfs_badlock_backtrace)
+ kdb_backtrace();
+#endif
+ if (vfs_badlock_print)
+ printf("%s: %p %s\n", str, (void *)vp, msg);
+ if (vfs_badlock_ddb)
+ kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
+}
+
+void
+assert_vi_locked(struct vnode *vp, const char *str)
+{
+
+ if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
+ vfs_badlock("interlock is not locked but should be", str, vp);
+}
+
+void
+assert_vi_unlocked(struct vnode *vp, const char *str)
+{
+
+ if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
+ vfs_badlock("interlock is locked but should not be", str, vp);
+}
+
+void
+assert_vop_locked(struct vnode *vp, const char *str)
+{
+ int locked;
+
+ if (!IGNORE_LOCK(vp)) {
+ locked = VOP_ISLOCKED(vp);
+ if (locked == 0 || locked == LK_EXCLOTHER)
+ vfs_badlock("is not locked but should be", str, vp);
+ }
+}
+
+void
+assert_vop_unlocked(struct vnode *vp, const char *str)
+{
+
+ if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
+ vfs_badlock("is locked but should not be", str, vp);
+}
+
+void
+assert_vop_elocked(struct vnode *vp, const char *str)
+{
+
+ if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
+ vfs_badlock("is not exclusive locked but should be", str, vp);
+}
+
+#if 0
+void
+assert_vop_elocked_other(struct vnode *vp, const char *str)
+{
+
+ if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
+ vfs_badlock("is not exclusive locked by another thread",
+ str, vp);
+}
+
+void
+assert_vop_slocked(struct vnode *vp, const char *str)
+{
+
+ if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
+ vfs_badlock("is not locked shared but should be", str, vp);
+}
+#endif /* 0 */
+#endif /* DEBUG_VFS_LOCKS */
+
+void
+vop_rename_fail(struct vop_rename_args *ap)
+{
+
+ if (ap->a_tvp != NULL)
+ vput(ap->a_tvp);
+ if (ap->a_tdvp == ap->a_tvp)
+ vrele(ap->a_tdvp);
+ else
+ vput(ap->a_tdvp);
+ vrele(ap->a_fdvp);
+ vrele(ap->a_fvp);
+}
+
+void
+vop_rename_pre(void *ap)
+{
+ struct vop_rename_args *a = ap;
+
+#ifdef DEBUG_VFS_LOCKS
+ if (a->a_tvp)
+ ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
+ ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
+ ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
+ ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
+
+ /* Check the source (from). */
+ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
+ (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
+ ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
+ if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
+ ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
+
+ /* Check the target. */
+ if (a->a_tvp)
+ ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
+ ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
+#endif
+ if (a->a_tdvp != a->a_fdvp)
+ vhold(a->a_fdvp);
+ if (a->a_tvp != a->a_fvp)
+ vhold(a->a_fvp);
+ vhold(a->a_tdvp);
+ if (a->a_tvp)
+ vhold(a->a_tvp);
+}
+
+void
+vop_strategy_pre(void *ap)
+{
+#ifdef DEBUG_VFS_LOCKS
+ struct vop_strategy_args *a;
+ struct buf *bp;
+
+ a = ap;
+ bp = a->a_bp;
+
+ /*
+ * Cluster ops lock their component buffers but not the IO container.
+ */
+ if ((bp->b_flags & B_CLUSTER) != 0)
+ return;
+
+ if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
+ if (vfs_badlock_print)
+ printf(
+ "VOP_STRATEGY: bp is not locked but should be\n");
+ if (vfs_badlock_ddb)
+ kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
+ }
+#endif
+}
+
+void
+vop_lock_pre(void *ap)
+{
+#ifdef DEBUG_VFS_LOCKS
+ struct vop_lock1_args *a = ap;
+
+ if ((a->a_flags & LK_INTERLOCK) == 0)
+ ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
+ else
+ ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
+#endif
+}
+
+void
+vop_lock_post(void *ap, int rc)
+{
+#ifdef DEBUG_VFS_LOCKS
+ struct vop_lock1_args *a = ap;
+
+ ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
+ if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
+ ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
+#endif
+}
+
+void
+vop_unlock_pre(void *ap)
+{
+#ifdef DEBUG_VFS_LOCKS
+ struct vop_unlock_args *a = ap;
+
+ if (a->a_flags & LK_INTERLOCK)
+ ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
+ ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
+#endif
+}
+
+void
+vop_unlock_post(void *ap, int rc)
+{
+#ifdef DEBUG_VFS_LOCKS
+ struct vop_unlock_args *a = ap;
+
+ if (a->a_flags & LK_INTERLOCK)
+ ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
+#endif
+}
+
+void
+vop_create_post(void *ap, int rc)
+{
+ struct vop_create_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
+}
+
+void
+vop_deleteextattr_post(void *ap, int rc)
+{
+ struct vop_deleteextattr_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+}
+
+void
+vop_link_post(void *ap, int rc)
+{
+ struct vop_link_args *a = ap;
+
+ if (!rc) {
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
+ VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
+ }
+}
+
+void
+vop_mkdir_post(void *ap, int rc)
+{
+ struct vop_mkdir_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
+}
+
+void
+vop_mknod_post(void *ap, int rc)
+{
+ struct vop_mknod_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
+}
+
+void
+vop_remove_post(void *ap, int rc)
+{
+ struct vop_remove_args *a = ap;
+
+ if (!rc) {
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
+ }
+}
+
+void
+vop_rename_post(void *ap, int rc)
+{
+ struct vop_rename_args *a = ap;
+
+ if (!rc) {
+ VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
+ VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
+ VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
+ if (a->a_tvp)
+ VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
+ }
+ if (a->a_tdvp != a->a_fdvp)
+ vdrop(a->a_fdvp);
+ if (a->a_tvp != a->a_fvp)
+ vdrop(a->a_fvp);
+ vdrop(a->a_tdvp);
+ if (a->a_tvp)
+ vdrop(a->a_tvp);
+}
+
+void
+vop_rmdir_post(void *ap, int rc)
+{
+ struct vop_rmdir_args *a = ap;
+
+ if (!rc) {
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
+ }
+}
+
+void
+vop_setattr_post(void *ap, int rc)
+{
+ struct vop_setattr_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+}
+
+void
+vop_setextattr_post(void *ap, int rc)
+{
+ struct vop_setextattr_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+}
+
+void
+vop_symlink_post(void *ap, int rc)
+{
+ struct vop_symlink_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
+}
+
+static struct knlist fs_knlist;
+
+static void
+vfs_event_init(void *arg)
+{
+ knlist_init_mtx(&fs_knlist, NULL);
+}
+/* XXX - correct order? */
+SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
+
+void
+vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
+{
+
+ KNOTE_UNLOCKED(&fs_knlist, event);
+}
+
+static int filt_fsattach(struct knote *kn);
+static void filt_fsdetach(struct knote *kn);
+static int filt_fsevent(struct knote *kn, long hint);
+
+struct filterops fs_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_fsattach,
+ .f_detach = filt_fsdetach,
+ .f_event = filt_fsevent
+};
+
+static int
+filt_fsattach(struct knote *kn)
+{
+
+ kn->kn_flags |= EV_CLEAR;
+ knlist_add(&fs_knlist, kn, 0);
+ return (0);
+}
+
+static void
+filt_fsdetach(struct knote *kn)
+{
+
+ knlist_remove(&fs_knlist, kn, 0);
+}
+
+static int
+filt_fsevent(struct knote *kn, long hint)
+{
+
+ kn->kn_fflags |= hint;
+ return (kn->kn_fflags != 0);
+}
+
+static int
+sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
+{
+ struct vfsidctl vc;
+ int error;
+ struct mount *mp;
+
+ error = SYSCTL_IN(req, &vc, sizeof(vc));
+ if (error)
+ return (error);
+ if (vc.vc_vers != VFS_CTL_VERS1)
+ return (EINVAL);
+ mp = vfs_getvfs(&vc.vc_fsid);
+ if (mp == NULL)
+ return (ENOENT);
+ /* ensure that a specific sysctl goes to the right filesystem. */
+ if (strcmp(vc.vc_fstypename, "*") != 0 &&
+ strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
+ vfs_rel(mp);
+ return (EINVAL);
+ }
+ VCTLTOREQ(&vc, req);
+ error = VFS_SYSCTL(mp, vc.vc_op, req);
+ vfs_rel(mp);
+ return (error);
+}
+
+SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
+ NULL, 0, sysctl_vfs_ctl, "",
+ "Sysctl by fsid");
+
+/*
+ * Function to initialize a va_filerev field sensibly.
+ * XXX: Wouldn't a random number make a lot more sense ??
+ */
+u_quad_t
+init_va_filerev(void)
+{
+ struct bintime bt;
+
+ getbinuptime(&bt);
+ return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
+}
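+
+/*
+ * The packing above keeps va_filerev monotonic: the uptime seconds fill
+ * the upper 32 bits and the most significant 32 bits of the binary
+ * fraction fill the lower half, so calls in the same second will
+ * normally still produce distinct values.
+ */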
+
+static int filt_vfsread(struct knote *kn, long hint);
+static int filt_vfswrite(struct knote *kn, long hint);
+static int filt_vfsvnode(struct knote *kn, long hint);
+static void filt_vfsdetach(struct knote *kn);
+static struct filterops vfsread_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_vfsdetach,
+ .f_event = filt_vfsread
+};
+static struct filterops vfswrite_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_vfsdetach,
+ .f_event = filt_vfswrite
+};
+static struct filterops vfsvnode_filtops = {
+ .f_isfd = 1,
+ .f_detach = filt_vfsdetach,
+ .f_event = filt_vfsvnode
+};
+
+static void
+vfs_knllock(void *arg)
+{
+ struct vnode *vp = arg;
+
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+}
+
+static void
+vfs_knlunlock(void *arg)
+{
+ struct vnode *vp = arg;
+
+ VOP_UNLOCK(vp, 0);
+}
+
+static void
+vfs_knl_assert_locked(void *arg)
+{
+#ifdef DEBUG_VFS_LOCKS
+ struct vnode *vp = arg;
+
+ ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
+#endif
+}
+
+static void
+vfs_knl_assert_unlocked(void *arg)
+{
+#ifdef DEBUG_VFS_LOCKS
+ struct vnode *vp = arg;
+
+ ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
+#endif
+}
+
+int
+vfs_kqfilter(struct vop_kqfilter_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct knote *kn = ap->a_kn;
+ struct knlist *knl;
+
+ switch (kn->kn_filter) {
+ case EVFILT_READ:
+ kn->kn_fop = &vfsread_filtops;
+ break;
+ case EVFILT_WRITE:
+ kn->kn_fop = &vfswrite_filtops;
+ break;
+ case EVFILT_VNODE:
+ kn->kn_fop = &vfsvnode_filtops;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ kn->kn_hook = (caddr_t)vp;
+
+ v_addpollinfo(vp);
+ if (vp->v_pollinfo == NULL)
+ return (ENOMEM);
+ knl = &vp->v_pollinfo->vpi_selinfo.si_note;
+ knlist_add(knl, kn, 0);
+
+ return (0);
+}
+
+/*
+ * Detach knote from vnode
+ */
+static void
+filt_vfsdetach(struct knote *kn)
+{
+ struct vnode *vp = (struct vnode *)kn->kn_hook;
+
+ KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
+ knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
+}
+
+/*ARGSUSED*/
+static int
+filt_vfsread(struct knote *kn, long hint)
+{
+ struct vnode *vp = (struct vnode *)kn->kn_hook;
+ struct vattr va;
+ int res;
+
+ /*
+ * filesystem is gone, so set the EOF flag and schedule
+ * the knote for deletion.
+ */
+ if (hint == NOTE_REVOKE) {
+ VI_LOCK(vp);
+ kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+ VI_UNLOCK(vp);
+ return (1);
+ }
+
+ if (VOP_GETATTR(vp, &va, curthread->td_ucred))
+ return (0);
+
+ VI_LOCK(vp);
+ kn->kn_data = va.va_size - kn->kn_fp->f_offset;
+ res = (kn->kn_data != 0);
+ VI_UNLOCK(vp);
+ return (res);
+}
+
+/*ARGSUSED*/
+static int
+filt_vfswrite(struct knote *kn, long hint)
+{
+ struct vnode *vp = (struct vnode *)kn->kn_hook;
+
+ VI_LOCK(vp);
+
+ /*
+ * filesystem is gone, so set the EOF flag and schedule
+ * the knote for deletion.
+ */
+ if (hint == NOTE_REVOKE)
+ kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+
+ kn->kn_data = 0;
+ VI_UNLOCK(vp);
+ return (1);
+}
+
+static int
+filt_vfsvnode(struct knote *kn, long hint)
+{
+ struct vnode *vp = (struct vnode *)kn->kn_hook;
+ int res;
+
+ VI_LOCK(vp);
+ if (kn->kn_sfflags & hint)
+ kn->kn_fflags |= hint;
+ if (hint == NOTE_REVOKE) {
+ kn->kn_flags |= EV_EOF;
+ VI_UNLOCK(vp);
+ return (1);
+ }
+ res = (kn->kn_fflags != 0);
+ VI_UNLOCK(vp);
+ return (res);
+}
+
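+/*
+ * Helper for VOP_READDIR() implementations: copy one struct dirent to
+ * the caller's uio and, if cookies were requested, append the entry's
+ * cookie (its offset) to the cookie array.
+ */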
+int
+vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
+{
+ int error;
+
+ if (dp->d_reclen > ap->a_uio->uio_resid)
+ return (ENAMETOOLONG);
+ error = uiomove(dp, dp->d_reclen, ap->a_uio);
+ if (error) {
+ if (ap->a_ncookies != NULL) {
+ if (ap->a_cookies != NULL)
+ free(ap->a_cookies, M_TEMP);
+ ap->a_cookies = NULL;
+ *ap->a_ncookies = 0;
+ }
+ return (error);
+ }
+ if (ap->a_ncookies == NULL)
+ return (0);
+
+ KASSERT(ap->a_cookies,
+ ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
+
+ *ap->a_cookies = realloc(*ap->a_cookies,
+ (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
+ (*ap->a_cookies)[*ap->a_ncookies] = off;
+ return (0);
+}
+
+/*
+ * Mark the access time of the file for update if the filesystem
+ * supports VOP_MARKATIME. This functionality is used by execve and
+ * mmap, so we want to avoid the I/O implied by directly setting
+ * va_atime for the sake of efficiency.
+ */
+void
+vfs_mark_atime(struct vnode *vp, struct ucred *cred)
+{
+ struct mount *mp;
+
+ mp = vp->v_mount;
+ ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
+ if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
+ (void)VOP_MARKATIME(vp);
+}
+
+/*
+ * The purpose of this routine is to remove granularity from accmode_t,
+ * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
+ * VADMIN and VAPPEND.
+ *
+ * If it returns 0, the caller is supposed to continue with the usual
+ * access checks using 'accmode' as modified by this routine. If it
+ * returns a nonzero value, the caller is supposed to return that value
+ * as errno.
+ *
+ * Note that after this routine runs, accmode may be zero.
+ */
+int
+vfs_unixify_accmode(accmode_t *accmode)
+{
+ /*
+ * There is no way to specify explicit "deny" rule using
+ * file mode or POSIX.1e ACLs.
+ */
+ if (*accmode & VEXPLICIT_DENY) {
+ *accmode = 0;
+ return (0);
+ }
+
+ /*
+ * None of these can be translated into usual access bits.
+ * Also, the common case for NFSv4 ACLs is to not contain
+ * either of these bits. Caller should check for VWRITE
+ * on the containing directory instead.
+ */
+ if (*accmode & (VDELETE_CHILD | VDELETE))
+ return (EPERM);
+
+ if (*accmode & VADMIN_PERMS) {
+ *accmode &= ~VADMIN_PERMS;
+ *accmode |= VADMIN;
+ }
+
+ /*
+ * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
+ * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
+ */
+ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
+
+ return (0);
+}
+
+/*
+ * These are helper functions for filesystems to traverse all
+ * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
+ *
+ * This interface replaces MNT_VNODE_FOREACH.
+ */
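+/*
+ * Typical use, roughly (see MNT_VNODE_FOREACH_ALL() in sys/mount.h for
+ * the canonical definition):
+ *
+ *     struct vnode *vp, *mvp;
+ *
+ *     MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
+ *             ... vp is returned with its interlock held ...
+ *             VI_UNLOCK(vp);
+ *     }
+ *
+ * MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp) can be used to leave the loop
+ * early and free the marker vnode.
+ */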
+
+MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
+
+struct vnode *
+__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
+{
+ struct vnode *vp;
+
+ if (should_yield())
+ kern_yield(PRI_USER);
+ MNT_ILOCK(mp);
+ KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
+ vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
+ while (vp != NULL && (vp->v_type == VMARKER ||
+ (vp->v_iflag & VI_DOOMED) != 0))
+ vp = TAILQ_NEXT(vp, v_nmntvnodes);
+
+ /* Check if we are done */
+ if (vp == NULL) {
+ __mnt_vnode_markerfree_all(mvp, mp);
+ /* MNT_IUNLOCK(mp); -- done in above function */
+ mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
+ return (NULL);
+ }
+ TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
+ TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
+ VI_LOCK(vp);
+ MNT_IUNLOCK(mp);
+ return (vp);
+}
+
+struct vnode *
+__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
+{
+ struct vnode *vp;
+
+ *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
+ MNT_ILOCK(mp);
+ MNT_REF(mp);
+ (*mvp)->v_type = VMARKER;
+
+ vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
+ while (vp != NULL && (vp->v_type == VMARKER ||
+ (vp->v_iflag & VI_DOOMED) != 0))
+ vp = TAILQ_NEXT(vp, v_nmntvnodes);
+
+ /* Check if we are done */
+ if (vp == NULL) {
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ free(*mvp, M_VNODE_MARKER);
+ *mvp = NULL;
+ return (NULL);
+ }
+ (*mvp)->v_mount = mp;
+ TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
+ VI_LOCK(vp);
+ MNT_IUNLOCK(mp);
+ return (vp);
+}
+
+
+void
+__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
+{
+
+ if (*mvp == NULL) {
+ MNT_IUNLOCK(mp);
+ return;
+ }
+
+ mtx_assert(MNT_MTX(mp), MA_OWNED);
+
+ KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
+ TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ free(*mvp, M_VNODE_MARKER);
+ *mvp = NULL;
+}
+
+/*
+ * These are helper functions for filesystems to traverse their
+ * active vnodes. See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
+ */
+static void
+mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
+{
+
+ KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
+
+ MNT_ILOCK(mp);
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ free(*mvp, M_VNODE_MARKER);
+ *mvp = NULL;
+}
+
+static struct vnode *
+mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
+{
+ struct vnode *vp, *nvp;
+
+ mtx_assert(&vnode_free_list_mtx, MA_OWNED);
+ KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
+restart:
+ vp = TAILQ_NEXT(*mvp, v_actfreelist);
+ TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
+ while (vp != NULL) {
+ if (vp->v_type == VMARKER) {
+ vp = TAILQ_NEXT(vp, v_actfreelist);
+ continue;
+ }
+ if (!VI_TRYLOCK(vp)) {
+ if (mp_ncpus == 1 || should_yield()) {
+ TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
+ mtx_unlock(&vnode_free_list_mtx);
+ pause("vnacti", 1);
+ mtx_lock(&vnode_free_list_mtx);
+ goto restart;
+ }
+ continue;
+ }
+ KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
+ KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
+ ("alien vnode on the active list %p %p", vp, mp));
+ if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
+ break;
+ nvp = TAILQ_NEXT(vp, v_actfreelist);
+ VI_UNLOCK(vp);
+ vp = nvp;
+ }
+
+ /* Check if we are done */
+ if (vp == NULL) {
+ mtx_unlock(&vnode_free_list_mtx);
+ mnt_vnode_markerfree_active(mvp, mp);
+ return (NULL);
+ }
+ TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
+ mtx_unlock(&vnode_free_list_mtx);
+ ASSERT_VI_LOCKED(vp, "active iter");
+ KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
+ return (vp);
+}
+
+struct vnode *
+__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
+{
+
+ if (should_yield())
+ kern_yield(PRI_USER);
+ mtx_lock(&vnode_free_list_mtx);
+ return (mnt_vnode_next_active(mvp, mp));
+}
+
+struct vnode *
+__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
+{
+ struct vnode *vp;
+
+ *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
+ MNT_ILOCK(mp);
+ MNT_REF(mp);
+ MNT_IUNLOCK(mp);
+ (*mvp)->v_type = VMARKER;
+ (*mvp)->v_mount = mp;
+
+ mtx_lock(&vnode_free_list_mtx);
+ vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
+ if (vp == NULL) {
+ mtx_unlock(&vnode_free_list_mtx);
+ mnt_vnode_markerfree_active(mvp, mp);
+ return (NULL);
+ }
+ TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
+ return (mnt_vnode_next_active(mvp, mp));
+}
+
+void
+__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
+{
+
+ if (*mvp == NULL)
+ return;
+
+ mtx_lock(&vnode_free_list_mtx);
+ TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
+ mtx_unlock(&vnode_free_list_mtx);
+ mnt_vnode_markerfree_active(mvp, mp);
+}
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
new file mode 100644
index 0000000..4b82df8
--- /dev/null
+++ b/sys/kern/vfs_syscalls.c
@@ -0,0 +1,4729 @@
+/*-
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_capsicum.h"
+#include "opt_compat.h"
+#include "opt_kdtrace.h"
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/capability.h>
+#include <sys/disk.h>
+#include <sys/sysent.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/namei.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/limits.h>
+#include <sys/linker.h>
+#include <sys/rwlock.h>
+#include <sys/sdt.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/dirent.h>
+#include <sys/jail.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <machine/stdarg.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/uma.h>
+
+#include <ufs/ufs/quota.h>
+
+MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
+
+SDT_PROVIDER_DEFINE(vfs);
+SDT_PROBE_DEFINE2(vfs, , stat, mode, mode, "char *", "int");
+SDT_PROBE_DEFINE2(vfs, , stat, reg, reg, "char *", "int");
+
+static int chroot_refuse_vdir_fds(struct filedesc *fdp);
+static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
+static int kern_chflags(struct thread *td, const char *path,
+ enum uio_seg pathseg, u_long flags);
+static int kern_chflagsat(struct thread *td, int fd, const char *path,
+ enum uio_seg pathseg, u_long flags, int atflag);
+static int setfflags(struct thread *td, struct vnode *, u_long);
+static int setutimes(struct thread *td, struct vnode *,
+ const struct timespec *, int, int);
+static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
+ struct thread *td);
+
+/*
+ * The module initialization routine for POSIX asynchronous I/O will
+ * set this to the version of AIO that it implements. (Zero means
+ * that it is not implemented.) This value is used here by pathconf()
+ * and in kern_descrip.c by fpathconf().
+ */
+int async_io_version;
+
+#ifdef DEBUG
+static int syncprt = 0;
+SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
+#endif
+
+/*
+ * Sync each mounted filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct sync_args {
+ int dummy;
+};
+#endif
+/* ARGSUSED */
+int
+sys_sync(td, uap)
+ struct thread *td;
+ struct sync_args *uap;
+{
+ struct mount *mp, *nmp;
+ int save;
+
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+ if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
+ vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
+ save = curthread_pflags_set(TDP_SYNCIO);
+ vfs_msync(mp, MNT_NOWAIT);
+ VFS_SYNC(mp, MNT_NOWAIT);
+ curthread_pflags_restore(save);
+ vn_finished_write(mp);
+ }
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp);
+ }
+ mtx_unlock(&mountlist_mtx);
+ return (0);
+}
+
+/*
+ * Change filesystem quotas.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct quotactl_args {
+ char *path;
+ int cmd;
+ int uid;
+ caddr_t arg;
+};
+#endif
+int
+sys_quotactl(td, uap)
+ struct thread *td;
+ register struct quotactl_args /* {
+ char *path;
+ int cmd;
+ int uid;
+ caddr_t arg;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct nameidata nd;
+ int error;
+
+ AUDIT_ARG_CMD(uap->cmd);
+ AUDIT_ARG_UID(uap->uid);
+ if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
+ return (EPERM);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ mp = nd.ni_vp->v_mount;
+ vfs_ref(mp);
+ vput(nd.ni_vp);
+ error = vfs_busy(mp, 0);
+ vfs_rel(mp);
+ if (error != 0)
+ return (error);
+ error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
+
+ /*
+ * Since the quota-on operation typically needs to open the
+ * quota file, the Q_QUOTAON handler needs to unbusy the mount
+ * point before calling into namei. Otherwise, an unmount might
+ * be started between the two vfs_busy() invocations (the first
+ * is ours, the second from the mount point cross-walk code in
+ * lookup()), causing a deadlock.
+ *
+ * Require that Q_QUOTAON handles the vfs_busy() reference on
+ * its own, always returning with the mount point unbusied.
+ */
+ if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
+ vfs_unbusy(mp);
+ return (error);
+}
+
+/*
+ * Used by statfs conversion routines to scale the block size up if
+ * necessary so that all of the block counts are <= 'max_size'. Note
+ * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
+ * value of 'n'.
+ */
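+/*
+ * For instance, cvtstatfs() (under COMPAT_FREEBSD4 below) calls
+ * statfs_scale_blocks(nsp, LONG_MAX) so that the 64-bit counters fit
+ * in the long-sized fields of the old ostatfs structure.
+ */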
+void
+statfs_scale_blocks(struct statfs *sf, long max_size)
+{
+ uint64_t count;
+ int shift;
+
+ KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
+
+ /*
+ * Attempt to scale the block counts to give a more accurate
+ * overview to userland of the ratio of free space to used
+ * space. To do this, find the largest block count and compute
+ * a divisor that lets it fit into a signed integer <= max_size.
+ */
+ if (sf->f_bavail < 0)
+ count = -sf->f_bavail;
+ else
+ count = sf->f_bavail;
+ count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
+ if (count <= max_size)
+ return;
+
+ count >>= flsl(max_size);
+ shift = 0;
+ while (count > 0) {
+ shift++;
+ count >>= 1;
+ }
+
+ sf->f_bsize <<= shift;
+ sf->f_blocks >>= shift;
+ sf->f_bfree >>= shift;
+ sf->f_bavail >>= shift;
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct statfs_args {
+ char *path;
+ struct statfs *buf;
+};
+#endif
+int
+sys_statfs(td, uap)
+ struct thread *td;
+ register struct statfs_args /* {
+ char *path;
+ struct statfs *buf;
+ } */ *uap;
+{
+ struct statfs sf;
+ int error;
+
+ error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
+ if (error == 0)
+ error = copyout(&sf, uap->buf, sizeof(sf));
+ return (error);
+}
+
+int
+kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
+ struct statfs *buf)
+{
+ struct mount *mp;
+ struct statfs *sp, sb;
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, td);
+ error = namei(&nd);
+ if (error != 0)
+ return (error);
+ mp = nd.ni_vp->v_mount;
+ vfs_ref(mp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_vp);
+ error = vfs_busy(mp, 0);
+ vfs_rel(mp);
+ if (error != 0)
+ return (error);
+#ifdef MAC
+ error = mac_mount_check_stat(td->td_ucred, mp);
+ if (error != 0)
+ goto out;
+#endif
+ /*
+ * Set these in case the underlying filesystem fails to do so.
+ */
+ sp = &mp->mnt_stat;
+ sp->f_version = STATFS_VERSION;
+ sp->f_namemax = NAME_MAX;
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ error = VFS_STATFS(mp, sp);
+ if (error != 0)
+ goto out;
+ if (priv_check(td, PRIV_VFS_GENERATION)) {
+ bcopy(sp, &sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ prison_enforce_statfs(td->td_ucred, mp, &sb);
+ sp = &sb;
+ }
+ *buf = *sp;
+out:
+ vfs_unbusy(mp);
+ return (error);
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fstatfs_args {
+ int fd;
+ struct statfs *buf;
+};
+#endif
+int
+sys_fstatfs(td, uap)
+ struct thread *td;
+ register struct fstatfs_args /* {
+ int fd;
+ struct statfs *buf;
+ } */ *uap;
+{
+ struct statfs sf;
+ int error;
+
+ error = kern_fstatfs(td, uap->fd, &sf);
+ if (error == 0)
+ error = copyout(&sf, uap->buf, sizeof(sf));
+ return (error);
+}
+
+int
+kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
+{
+ struct file *fp;
+ struct mount *mp;
+ struct statfs *sp, sb;
+ struct vnode *vp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ error = getvnode(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_FSTATFS), &fp);
+ if (error != 0)
+ return (error);
+ vp = fp->f_vnode;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+#ifdef AUDIT
+ AUDIT_ARG_VNODE1(vp);
+#endif
+ mp = vp->v_mount;
+ if (mp)
+ vfs_ref(mp);
+ VOP_UNLOCK(vp, 0);
+ fdrop(fp, td);
+ if (mp == NULL) {
+ error = EBADF;
+ goto out;
+ }
+ error = vfs_busy(mp, 0);
+ vfs_rel(mp);
+ if (error != 0)
+ return (error);
+#ifdef MAC
+ error = mac_mount_check_stat(td->td_ucred, mp);
+ if (error != 0)
+ goto out;
+#endif
+ /*
+ * Set these in case the underlying filesystem fails to do so.
+ */
+ sp = &mp->mnt_stat;
+ sp->f_version = STATFS_VERSION;
+ sp->f_namemax = NAME_MAX;
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ error = VFS_STATFS(mp, sp);
+ if (error != 0)
+ goto out;
+ if (priv_check(td, PRIV_VFS_GENERATION)) {
+ bcopy(sp, &sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ prison_enforce_statfs(td->td_ucred, mp, &sb);
+ sp = &sb;
+ }
+ *buf = *sp;
+out:
+ if (mp)
+ vfs_unbusy(mp);
+ return (error);
+}
+
+/*
+ * Get statistics on all filesystems.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getfsstat_args {
+ struct statfs *buf;
+ long bufsize;
+ int flags;
+};
+#endif
+int
+sys_getfsstat(td, uap)
+ struct thread *td;
+ register struct getfsstat_args /* {
+ struct statfs *buf;
+ long bufsize;
+ int flags;
+ } */ *uap;
+{
+
+ return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
+ uap->flags));
+}
+
+/*
+ * If bufsize > 0 and bufseg == UIO_SYSSPACE, the buffer is allocated
+ * here and the caller is responsible for freeing the memory that is
+ * returned in '*buf'.
+ */
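+/*
+ * freebsd4_getfsstat() below is an example of the UIO_SYSSPACE case:
+ * it converts the entries and then frees the buffer returned in '*buf'
+ * with free(buf, M_TEMP).
+ */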
+int
+kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
+ enum uio_seg bufseg, int flags)
+{
+ struct mount *mp, *nmp;
+ struct statfs *sfsp, *sp, sb;
+ size_t count, maxcount;
+ int error;
+
+ maxcount = bufsize / sizeof(struct statfs);
+ if (bufsize == 0)
+ sfsp = NULL;
+ else if (bufseg == UIO_USERSPACE)
+ sfsp = *buf;
+ else /* if (bufseg == UIO_SYSSPACE) */ {
+ count = 0;
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ count++;
+ }
+ mtx_unlock(&mountlist_mtx);
+ if (maxcount > count)
+ maxcount = count;
+ sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
+ M_WAITOK);
+ }
+ count = 0;
+ mtx_lock(&mountlist_mtx);
+ for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
+ if (prison_canseemount(td->td_ucred, mp) != 0) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+#ifdef MAC
+ if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+#endif
+ if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ continue;
+ }
+ if (sfsp && count < maxcount) {
+ sp = &mp->mnt_stat;
+ /*
+ * Set these in case the underlying filesystem
+ * fails to do so.
+ */
+ sp->f_version = STATFS_VERSION;
+ sp->f_namemax = NAME_MAX;
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ /*
+ * If MNT_NOWAIT or MNT_LAZY is specified, do not
+ * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
+ * overrides MNT_WAIT.
+ */
+ if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
+ (flags & MNT_WAIT)) &&
+ (error = VFS_STATFS(mp, sp))) {
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp);
+ continue;
+ }
+ if (priv_check(td, PRIV_VFS_GENERATION)) {
+ bcopy(sp, &sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ prison_enforce_statfs(td->td_ucred, mp, &sb);
+ sp = &sb;
+ }
+ if (bufseg == UIO_SYSSPACE)
+ bcopy(sp, sfsp, sizeof(*sp));
+ else /* if (bufseg == UIO_USERSPACE) */ {
+ error = copyout(sp, sfsp, sizeof(*sp));
+ if (error != 0) {
+ vfs_unbusy(mp);
+ return (error);
+ }
+ }
+ sfsp++;
+ }
+ count++;
+ mtx_lock(&mountlist_mtx);
+ nmp = TAILQ_NEXT(mp, mnt_list);
+ vfs_unbusy(mp);
+ }
+ mtx_unlock(&mountlist_mtx);
+ if (sfsp && count > maxcount)
+ td->td_retval[0] = maxcount;
+ else
+ td->td_retval[0] = count;
+ return (0);
+}
+
+#ifdef COMPAT_FREEBSD4
+/*
+ * Get old format filesystem statistics.
+ */
+static void cvtstatfs(struct statfs *, struct ostatfs *);
+
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd4_statfs_args {
+ char *path;
+ struct ostatfs *buf;
+};
+#endif
+int
+freebsd4_statfs(td, uap)
+ struct thread *td;
+ struct freebsd4_statfs_args /* {
+ char *path;
+ struct ostatfs *buf;
+ } */ *uap;
+{
+ struct ostatfs osb;
+ struct statfs sf;
+ int error;
+
+ error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
+ if (error != 0)
+ return (error);
+ cvtstatfs(&sf, &osb);
+ return (copyout(&osb, uap->buf, sizeof(osb)));
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd4_fstatfs_args {
+ int fd;
+ struct ostatfs *buf;
+};
+#endif
+int
+freebsd4_fstatfs(td, uap)
+ struct thread *td;
+ struct freebsd4_fstatfs_args /* {
+ int fd;
+ struct ostatfs *buf;
+ } */ *uap;
+{
+ struct ostatfs osb;
+ struct statfs sf;
+ int error;
+
+ error = kern_fstatfs(td, uap->fd, &sf);
+ if (error != 0)
+ return (error);
+ cvtstatfs(&sf, &osb);
+ return (copyout(&osb, uap->buf, sizeof(osb)));
+}
+
+/*
+ * Get statistics on all filesystems.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd4_getfsstat_args {
+ struct ostatfs *buf;
+ long bufsize;
+ int flags;
+};
+#endif
+int
+freebsd4_getfsstat(td, uap)
+ struct thread *td;
+ register struct freebsd4_getfsstat_args /* {
+ struct ostatfs *buf;
+ long bufsize;
+ int flags;
+ } */ *uap;
+{
+ struct statfs *buf, *sp;
+ struct ostatfs osb;
+ size_t count, size;
+ int error;
+
+ count = uap->bufsize / sizeof(struct ostatfs);
+ size = count * sizeof(struct statfs);
+ error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
+ if (size > 0) {
+ count = td->td_retval[0];
+ sp = buf;
+ while (count > 0 && error == 0) {
+ cvtstatfs(sp, &osb);
+ error = copyout(&osb, uap->buf, sizeof(osb));
+ sp++;
+ uap->buf++;
+ count--;
+ }
+ free(buf, M_TEMP);
+ }
+ return (error);
+}
+
+/*
+ * Implement fstatfs() for (NFS) file handles.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct freebsd4_fhstatfs_args {
+ struct fhandle *u_fhp;
+ struct ostatfs *buf;
+};
+#endif
+int
+freebsd4_fhstatfs(td, uap)
+ struct thread *td;
+ struct freebsd4_fhstatfs_args /* {
+ struct fhandle *u_fhp;
+ struct ostatfs *buf;
+ } */ *uap;
+{
+ struct ostatfs osb;
+ struct statfs sf;
+ fhandle_t fh;
+ int error;
+
+ error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
+ if (error != 0)
+ return (error);
+ error = kern_fhstatfs(td, fh, &sf);
+ if (error != 0)
+ return (error);
+ cvtstatfs(&sf, &osb);
+ return (copyout(&osb, uap->buf, sizeof(osb)));
+}
+
+/*
+ * Convert a new format statfs structure to an old format statfs structure.
+ */
+static void
+cvtstatfs(nsp, osp)
+ struct statfs *nsp;
+ struct ostatfs *osp;
+{
+
+ statfs_scale_blocks(nsp, LONG_MAX);
+ bzero(osp, sizeof(*osp));
+ osp->f_bsize = nsp->f_bsize;
+ osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
+ osp->f_blocks = nsp->f_blocks;
+ osp->f_bfree = nsp->f_bfree;
+ osp->f_bavail = nsp->f_bavail;
+ osp->f_files = MIN(nsp->f_files, LONG_MAX);
+ osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
+ osp->f_owner = nsp->f_owner;
+ osp->f_type = nsp->f_type;
+ osp->f_flags = nsp->f_flags;
+ osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
+ osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
+ osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
+ osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
+ strlcpy(osp->f_fstypename, nsp->f_fstypename,
+ MIN(MFSNAMELEN, OMFSNAMELEN));
+ strlcpy(osp->f_mntonname, nsp->f_mntonname,
+ MIN(MNAMELEN, OMNAMELEN));
+ strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
+ MIN(MNAMELEN, OMNAMELEN));
+ osp->f_fsid = nsp->f_fsid;
+}
+#endif /* COMPAT_FREEBSD4 */
+
+/*
+ * Change current working directory to a given file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchdir_args {
+ int fd;
+};
+#endif
+int
+sys_fchdir(td, uap)
+ struct thread *td;
+ struct fchdir_args /* {
+ int fd;
+ } */ *uap;
+{
+ register struct filedesc *fdp = td->td_proc->p_fd;
+ struct vnode *vp, *tdp, *vpold;
+ struct mount *mp;
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ error = getvnode(fdp, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
+ &fp);
+ if (error != 0)
+ return (error);
+ vp = fp->f_vnode;
+ VREF(vp);
+ fdrop(fp, td);
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ error = change_dir(vp, td);
+ while (!error && (mp = vp->v_mountedhere) != NULL) {
+ if (vfs_busy(mp, 0))
+ continue;
+ error = VFS_ROOT(mp, LK_SHARED, &tdp);
+ vfs_unbusy(mp);
+ if (error != 0)
+ break;
+ vput(vp);
+ vp = tdp;
+ }
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+ VOP_UNLOCK(vp, 0);
+ FILEDESC_XLOCK(fdp);
+ vpold = fdp->fd_cdir;
+ fdp->fd_cdir = vp;
+ FILEDESC_XUNLOCK(fdp);
+ vrele(vpold);
+ return (0);
+}
+
+/*
+ * Change current working directory (``.'').
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chdir_args {
+ char *path;
+};
+#endif
+int
+sys_chdir(td, uap)
+ struct thread *td;
+ struct chdir_args /* {
+ char *path;
+ } */ *uap;
+{
+
+ return (kern_chdir(td, uap->path, UIO_USERSPACE));
+}
+
+int
+kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
+{
+ register struct filedesc *fdp = td->td_proc->p_fd;
+ struct nameidata nd;
+ struct vnode *vp;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ if ((error = change_dir(nd.ni_vp, td)) != 0) {
+ vput(nd.ni_vp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ return (error);
+ }
+ VOP_UNLOCK(nd.ni_vp, 0);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ FILEDESC_XLOCK(fdp);
+ vp = fdp->fd_cdir;
+ fdp->fd_cdir = nd.ni_vp;
+ FILEDESC_XUNLOCK(fdp);
+ vrele(vp);
+ return (0);
+}
+
+/*
+ * Helper function for raised chroot(2) security function: Refuse if
+ * any filedescriptors are open directories.
+ */
+static int
+chroot_refuse_vdir_fds(fdp)
+ struct filedesc *fdp;
+{
+ struct vnode *vp;
+ struct file *fp;
+ int fd;
+
+ FILEDESC_LOCK_ASSERT(fdp);
+
+ for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
+ fp = fget_locked(fdp, fd);
+ if (fp == NULL)
+ continue;
+ if (fp->f_type == DTYPE_VNODE) {
+ vp = fp->f_vnode;
+ if (vp->v_type == VDIR)
+ return (EPERM);
+ }
+ }
+ return (0);
+}
+
+/*
+ * This sysctl determines if we will allow a process to chroot(2) if it
+ * has a directory open:
+ * 0: disallowed for all processes.
+ * 1: allowed for processes that were not already chroot(2)'ed.
+ * 2: allowed for all processes.
+ */
+
+static int chroot_allow_open_directories = 1;
+
+SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
+ &chroot_allow_open_directories, 0,
+ "Allow a process to chroot(2) if it has a directory open");
+
+/*
+ * Change notion of root (``/'') directory.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chroot_args {
+ char *path;
+};
+#endif
+int
+sys_chroot(td, uap)
+ struct thread *td;
+ struct chroot_args /* {
+ char *path;
+ } */ *uap;
+{
+ struct nameidata nd;
+ int error;
+
+ error = priv_check(td, PRIV_VFS_CHROOT);
+ if (error != 0)
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+ UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error != 0)
+ goto error;
+ error = change_dir(nd.ni_vp, td);
+ if (error != 0)
+ goto e_vunlock;
+#ifdef MAC
+ error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
+ if (error != 0)
+ goto e_vunlock;
+#endif
+ VOP_UNLOCK(nd.ni_vp, 0);
+ error = change_root(nd.ni_vp, td);
+ vrele(nd.ni_vp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ return (error);
+e_vunlock:
+ vput(nd.ni_vp);
+error:
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ return (error);
+}
+
+/*
+ * Common routine for chroot and chdir. Callers must provide a locked vnode
+ * instance.
+ */
+int
+change_dir(vp, td)
+ struct vnode *vp;
+ struct thread *td;
+{
+#ifdef MAC
+ int error;
+#endif
+
+ ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
+ if (vp->v_type != VDIR)
+ return (ENOTDIR);
+#ifdef MAC
+ error = mac_vnode_check_chdir(td->td_ucred, vp);
+ if (error != 0)
+ return (error);
+#endif
+ return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
+}
+
+/*
+ * Common routine for kern_chroot() and jail_attach(). The caller is
+ * responsible for invoking priv_check() and mac_vnode_check_chroot() to
+ * authorize this operation.
+ */
+int
+change_root(vp, td)
+ struct vnode *vp;
+ struct thread *td;
+{
+ struct filedesc *fdp;
+ struct vnode *oldvp;
+ int error;
+
+ fdp = td->td_proc->p_fd;
+ FILEDESC_XLOCK(fdp);
+ if (chroot_allow_open_directories == 0 ||
+ (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
+ error = chroot_refuse_vdir_fds(fdp);
+ if (error != 0) {
+ FILEDESC_XUNLOCK(fdp);
+ return (error);
+ }
+ }
+ oldvp = fdp->fd_rdir;
+ fdp->fd_rdir = vp;
+ VREF(fdp->fd_rdir);
+ if (!fdp->fd_jdir) {
+ fdp->fd_jdir = vp;
+ VREF(fdp->fd_jdir);
+ }
+ FILEDESC_XUNLOCK(fdp);
+ vrele(oldvp);
+ return (0);
+}
+
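+/*
+ * Translate open(2) flags into the capability rights that the lookup
+ * and the resulting file will require.  For example,
+ * O_WRONLY | O_CREAT | O_TRUNC asks for CAP_WRITE, CAP_CREATE and
+ * CAP_FTRUNCATE.
+ */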
+static __inline void
+flags_to_rights(int flags, cap_rights_t *rightsp)
+{
+
+ if (flags & O_EXEC) {
+ cap_rights_set(rightsp, CAP_FEXECVE);
+ } else {
+ switch ((flags & O_ACCMODE)) {
+ case O_RDONLY:
+ cap_rights_set(rightsp, CAP_READ);
+ break;
+ case O_RDWR:
+ cap_rights_set(rightsp, CAP_READ);
+ /* FALLTHROUGH */
+ case O_WRONLY:
+ cap_rights_set(rightsp, CAP_WRITE);
+ if (!(flags & (O_APPEND | O_TRUNC)))
+ cap_rights_set(rightsp, CAP_SEEK);
+ break;
+ }
+ }
+
+ if (flags & O_CREAT)
+ cap_rights_set(rightsp, CAP_CREATE);
+
+ if (flags & O_TRUNC)
+ cap_rights_set(rightsp, CAP_FTRUNCATE);
+
+ if (flags & (O_SYNC | O_FSYNC))
+ cap_rights_set(rightsp, CAP_FSYNC);
+
+ if (flags & (O_EXLOCK | O_SHLOCK))
+ cap_rights_set(rightsp, CAP_FLOCK);
+}
+
+/*
+ * Check permissions, allocate an open file structure, and call the device
+ * open routine if any.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct open_args {
+ char *path;
+ int flags;
+ int mode;
+};
+#endif
+int
+sys_open(td, uap)
+ struct thread *td;
+ register struct open_args /* {
+ char *path;
+ int flags;
+ int mode;
+ } */ *uap;
+{
+
+ return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct openat_args {
+ int fd;
+ char *path;
+ int flag;
+ int mode;
+};
+#endif
+int
+sys_openat(struct thread *td, struct openat_args *uap)
+{
+
+ return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
+ uap->mode));
+}
+
+int
+kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
+ int mode)
+{
+
+ return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
+}
+
+int
+kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ int flags, int mode)
+{
+ struct proc *p = td->td_proc;
+ struct filedesc *fdp = p->p_fd;
+ struct file *fp;
+ struct vnode *vp;
+ struct nameidata nd;
+ cap_rights_t rights;
+ int cmode, error, indx;
+
+ indx = -1;
+
+ AUDIT_ARG_FFLAGS(flags);
+ AUDIT_ARG_MODE(mode);
+ /* XXX: audit dirfd */
+ cap_rights_init(&rights, CAP_LOOKUP);
+ flags_to_rights(flags, &rights);
+ /*
+ * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
+ * may be specified.
+ */
+ if (flags & O_EXEC) {
+ if (flags & O_ACCMODE)
+ return (EINVAL);
+ } else if ((flags & O_ACCMODE) == O_ACCMODE) {
+ return (EINVAL);
+ } else {
+ flags = FFLAGS(flags);
+ }
+
+ /*
+ * Allocate the file descriptor, but don't install a descriptor yet.
+ */
+ error = falloc_noinstall(td, &fp);
+ if (error != 0)
+ return (error);
+ /*
+ * An extra reference on `fp' has been held for us by
+ * falloc_noinstall().
+ */
+ /* Set the flags early so the finit in devfs can pick them up. */
+ fp->f_flag = flags & FMASK;
+ cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
+ NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
+ &rights, td);
+ td->td_dupfd = -1; /* XXX check for fdopen */
+ error = vn_open(&nd, &flags, cmode, fp);
+ if (error != 0) {
+ /*
+ * If vn_open() replaced the method vector, something
+ * wondrous happened deep below and we just pass it up
+ * pretending we know what we are doing.
+ */
+ if (error == ENXIO && fp->f_ops != &badfileops)
+ goto success;
+
+ /*
+ * Handle special fdopen() case. bleh.
+ *
+ * Don't do this for relative (capability) lookups; we don't
+ * understand exactly what would happen, and we don't think
+ * that it ever should.
+ */
+ if (nd.ni_strictrelative == 0 &&
+ (error == ENODEV || error == ENXIO) &&
+ td->td_dupfd >= 0) {
+ error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
+ &indx);
+ if (error == 0)
+ goto success;
+ }
+
+ goto bad;
+ }
+ td->td_dupfd = 0;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+
+ /*
+ * Store the vnode, for any f_type. Typically, the vnode use
+ * count is decremented by direct call to vn_closefile() for
+ * files that switched type in the cdevsw fdopen() method.
+ */
+ fp->f_vnode = vp;
+ /*
+ * If the file wasn't claimed by devfs bind it to the normal
+ * vnode operations here.
+ */
+ if (fp->f_ops == &badfileops) {
+ KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
+ fp->f_seqcount = 1;
+ finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
+ DTYPE_VNODE, vp, &vnops);
+ }
+
+ VOP_UNLOCK(vp, 0);
+ if (flags & O_TRUNC) {
+ error = fo_truncate(fp, 0, td->td_ucred, td);
+ if (error != 0)
+ goto bad;
+ }
+success:
+ /*
+ * If we haven't already installed the FD (for dupfdopen), do so now.
+ */
+ if (indx == -1) {
+ struct filecaps *fcaps;
+
+#ifdef CAPABILITIES
+ if (nd.ni_strictrelative == 1)
+ fcaps = &nd.ni_filecaps;
+ else
+#endif
+ fcaps = NULL;
+ error = finstall(td, fp, &indx, flags, fcaps);
+ /* On success finstall() consumes fcaps. */
+ if (error != 0) {
+ filecaps_free(&nd.ni_filecaps);
+ goto bad;
+ }
+ } else {
+ filecaps_free(&nd.ni_filecaps);
+ }
+
+ /*
+ * Release our private reference, leaving the one associated with
+ * the descriptor table intact.
+ */
+ fdrop(fp, td);
+ td->td_retval[0] = indx;
+ return (0);
+bad:
+ KASSERT(indx == -1, ("indx=%d, should be -1", indx));
+ fdrop(fp, td);
+ return (error);
+}
+
+#ifdef COMPAT_43
+/*
+ * Create a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ocreat_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+ocreat(td, uap)
+ struct thread *td;
+ register struct ocreat_args /* {
+ char *path;
+ int mode;
+ } */ *uap;
+{
+
+ return (kern_open(td, uap->path, UIO_USERSPACE,
+ O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Create a special file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mknod_args {
+ char *path;
+ int mode;
+ int dev;
+};
+#endif
+int
+sys_mknod(td, uap)
+ struct thread *td;
+ register struct mknod_args /* {
+ char *path;
+ int mode;
+ int dev;
+ } */ *uap;
+{
+
+ return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct mknodat_args {
+ int fd;
+ char *path;
+ mode_t mode;
+ dev_t dev;
+};
+#endif
+int
+sys_mknodat(struct thread *td, struct mknodat_args *uap)
+{
+
+ return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
+ uap->dev));
+}
+
+int
+kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
+ int dev)
+{
+
+ return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
+}
+
+int
+kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ int mode, int dev)
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct vattr vattr;
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error, whiteout = 0;
+
+ AUDIT_ARG_MODE(mode);
+ AUDIT_ARG_DEV(dev);
+ switch (mode & S_IFMT) {
+ case S_IFCHR:
+ case S_IFBLK:
+ error = priv_check(td, PRIV_VFS_MKNOD_DEV);
+ break;
+ case S_IFMT:
+ error = priv_check(td, PRIV_VFS_MKNOD_BAD);
+ break;
+ case S_IFWHT:
+ error = priv_check(td, PRIV_VFS_MKNOD_WHT);
+ break;
+ case S_IFIFO:
+ if (dev == 0)
+ return (kern_mkfifoat(td, fd, path, pathseg, mode));
+ /* FALLTHROUGH */
+ default:
+ error = EINVAL;
+ break;
+ }
+ if (error != 0)
+ return (error);
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
+ pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(vp);
+ return (EEXIST);
+ } else {
+ VATTR_NULL(&vattr);
+ vattr.va_mode = (mode & ALLPERMS) &
+ ~td->td_proc->p_fd->fd_cmask;
+ vattr.va_rdev = dev;
+ whiteout = 0;
+
+ switch (mode & S_IFMT) {
+ case S_IFMT: /* used by badsect to flag bad sectors */
+ vattr.va_type = VBAD;
+ break;
+ case S_IFCHR:
+ vattr.va_type = VCHR;
+ break;
+ case S_IFBLK:
+ vattr.va_type = VBLK;
+ break;
+ case S_IFWHT:
+ whiteout = 1;
+ break;
+ default:
+ panic("kern_mknod: invalid mode");
+ }
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+#ifdef MAC
+ if (error == 0 && !whiteout)
+ error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
+ &nd.ni_cnd, &vattr);
+#endif
+ if (error == 0) {
+ if (whiteout)
+ error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
+ else {
+ error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
+ &nd.ni_cnd, &vattr);
+ if (error == 0)
+ vput(nd.ni_vp);
+ }
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Create a named pipe.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mkfifo_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+sys_mkfifo(td, uap)
+ struct thread *td;
+ register struct mkfifo_args /* {
+ char *path;
+ int mode;
+ } */ *uap;
+{
+
+ return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct mkfifoat_args {
+ int fd;
+ char *path;
+ mode_t mode;
+};
+#endif
+int
+sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
+{
+
+ return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
+ uap->mode));
+}
+
+int
+kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
+{
+
+ return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
+}
+
+int
+kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ int mode)
+{
+ struct mount *mp;
+ struct vattr vattr;
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_MODE(mode);
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
+ pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ if (nd.ni_vp != NULL) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(nd.ni_vp);
+ return (EEXIST);
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VFIFO;
+ vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
+#ifdef MAC
+ error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
+ &vattr);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ if (error == 0)
+ vput(nd.ni_vp);
+#ifdef MAC
+out:
+#endif
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ return (error);
+}
+
+/*
+ * Make a hard file link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct link_args {
+ char *path;
+ char *link;
+};
+#endif
+int
+sys_link(td, uap)
+ struct thread *td;
+ register struct link_args /* {
+ char *path;
+ char *link;
+ } */ *uap;
+{
+
+ return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct linkat_args {
+ int fd1;
+ char *path1;
+ int fd2;
+ char *path2;
+ int flag;
+};
+#endif
+int
+sys_linkat(struct thread *td, struct linkat_args *uap)
+{
+ int flag;
+
+ flag = uap->flag;
+ if (flag & ~AT_SYMLINK_FOLLOW)
+ return (EINVAL);
+
+ return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
+ UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
+}
+
+int hardlink_check_uid = 0;
+SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
+ &hardlink_check_uid, 0,
+ "Unprivileged processes cannot create hard links to files owned by other "
+ "users");
+static int hardlink_check_gid = 0;
+SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
+ &hardlink_check_gid, 0,
+ "Unprivileged processes cannot create hard links to files owned by other "
+ "groups");
+
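+/*
+ * Check whether the current credentials may create a hard link to the
+ * file, honoring the security.bsd.hardlink_check_uid/gid sysctls
+ * above; PRIV_VFS_LINK overrides either restriction.
+ */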
+static int
+can_hardlink(struct vnode *vp, struct ucred *cred)
+{
+ struct vattr va;
+ int error;
+
+ if (!hardlink_check_uid && !hardlink_check_gid)
+ return (0);
+
+ error = VOP_GETATTR(vp, &va, cred);
+ if (error != 0)
+ return (error);
+
+ if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
+ error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
+ if (error != 0)
+ return (error);
+ }
+
+ if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
+ error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
+ if (error != 0)
+ return (error);
+ }
+
+ return (0);
+}
+
+int
+kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
+{
+
+ return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path, link, segflg,
+     FOLLOW));
+}
+
+int
+kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
+ enum uio_seg segflg, int follow)
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error;
+
+ bwillwrite();
+ NDINIT_AT(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1, td);
+
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ if (vp->v_type == VDIR) {
+ vrele(vp);
+ return (EPERM); /* POSIX */
+ }
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
+ vrele(vp);
+ return (error);
+ }
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2,
+ segflg, path2, fd2, cap_rights_init(&rights, CAP_LINKAT), td);
+ if ((error = namei(&nd)) == 0) {
+ if (nd.ni_vp != NULL) {
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(nd.ni_vp);
+ error = EEXIST;
+ } else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY))
+ == 0) {
+ error = can_hardlink(vp, td->td_ucred);
+ if (error == 0)
+#ifdef MAC
+ error = mac_vnode_check_link(td->td_ucred,
+ nd.ni_dvp, vp, &nd.ni_cnd);
+ if (error == 0)
+#endif
+ error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
+ VOP_UNLOCK(vp, 0);
+ vput(nd.ni_dvp);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ }
+ vrele(vp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Make a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct symlink_args {
+ char *path;
+ char *link;
+};
+#endif
+int
+sys_symlink(td, uap)
+ struct thread *td;
+ register struct symlink_args /* {
+ char *path;
+ char *link;
+ } */ *uap;
+{
+
+ return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct symlinkat_args {
+ char *path1;
+ int fd;
+ char *path2;
+};
+#endif
+int
+sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
+{
+
+ return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
+ UIO_USERSPACE));
+}
+
+int
+kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
+{
+
+ return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
+}
+
+int
+kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
+ enum uio_seg segflg)
+{
+ struct mount *mp;
+ struct vattr vattr;
+ char *syspath;
+ struct nameidata nd;
+ int error;
+ cap_rights_t rights;
+
+ if (segflg == UIO_SYSSPACE) {
+ syspath = path1;
+ } else {
+ syspath = uma_zalloc(namei_zone, M_WAITOK);
+ if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
+ goto out;
+ }
+ AUDIT_ARG_TEXT(syspath);
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
+ segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT), td);
+ if ((error = namei(&nd)) != 0)
+ goto out;
+ if (nd.ni_vp) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(nd.ni_vp);
+ error = EEXIST;
+ goto out;
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ goto out;
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
+#ifdef MAC
+ vattr.va_type = VLNK;
+ error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
+ &vattr);
+ if (error != 0)
+ goto out2;
+#endif
+ error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
+ if (error == 0)
+ vput(nd.ni_vp);
+#ifdef MAC
+out2:
+#endif
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+out:
+ if (segflg != UIO_SYSSPACE)
+ uma_zfree(namei_zone, syspath);
+ return (error);
+}
+
+/*
+ * Delete a whiteout from the filesystem.
+ */
+int
+sys_undelete(td, uap)
+ struct thread *td;
+ register struct undelete_args /* {
+ char *path;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct nameidata nd;
+ int error;
+
+restart:
+ bwillwrite();
+ NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
+ UIO_USERSPACE, uap->path, td);
+ error = namei(&nd);
+ if (error != 0)
+ return (error);
+
+ if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (nd.ni_vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (nd.ni_vp)
+ vrele(nd.ni_vp);
+ return (EEXIST);
+ }
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Delete a name from the filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct unlink_args {
+ char *path;
+};
+#endif
+int
+sys_unlink(td, uap)
+ struct thread *td;
+ struct unlink_args /* {
+ char *path;
+ } */ *uap;
+{
+
+ return (kern_unlink(td, uap->path, UIO_USERSPACE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct unlinkat_args {
+ int fd;
+ char *path;
+ int flag;
+};
+#endif
+int
+sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
+{
+ int flag = uap->flag;
+ int fd = uap->fd;
+ char *path = uap->path;
+
+ if (flag & ~AT_REMOVEDIR)
+ return (EINVAL);
+
+ if (flag & AT_REMOVEDIR)
+ return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
+ else
+ return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
+}
+
+int
+kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
+{
+
+ return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
+}
+
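+/*
+ * If 'oldinum' is non-zero, the file is only removed while the path
+ * still refers to that inode number; otherwise EIDRM is returned.
+ */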
+int
+kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ ino_t oldinum)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct nameidata nd;
+ struct stat sb;
+ cap_rights_t rights;
+ int error;
+
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
+ if ((error = namei(&nd)) != 0)
+ return (error == EINVAL ? EPERM : error);
+ vp = nd.ni_vp;
+ if (vp->v_type == VDIR && oldinum == 0) {
+ error = EPERM; /* POSIX */
+ } else if (oldinum != 0 &&
+ ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
+ sb.st_ino != oldinum) {
+ error = EIDRM; /* Identifier removed */
+ } else {
+ /*
+ * The root of a mounted filesystem cannot be deleted.
+ *
+ * XXX: can this only be a VDIR case?
+ */
+ if (vp->v_vflag & VV_ROOT)
+ error = EBUSY;
+ }
+ if (error == 0) {
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if (vp == nd.ni_dvp)
+ vrele(vp);
+ else
+ vput(vp);
+ if ((error = vn_start_write(NULL, &mp,
+ V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+#ifdef MAC
+ error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
+ &nd.ni_cnd);
+ if (error != 0)
+ goto out;
+#endif
+ vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
+ error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
+#ifdef MAC
+out:
+#endif
+ vn_finished_write(mp);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if (vp == nd.ni_dvp)
+ vrele(vp);
+ else
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Reposition read/write file offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lseek_args {
+ int fd;
+ int pad;
+ off_t offset;
+ int whence;
+};
+#endif
+int
+sys_lseek(td, uap)
+ struct thread *td;
+ register struct lseek_args /* {
+ int fd;
+ int pad;
+ off_t offset;
+ int whence;
+ } */ *uap;
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ error = fget(td, uap->fd, cap_rights_init(&rights, CAP_SEEK), &fp);
+ if (error != 0)
+ return (error);
+ error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
+ fo_seek(fp, uap->offset, uap->whence, td) : ESPIPE;
+ fdrop(fp, td);
+ return (error);
+}
+
+#if defined(COMPAT_43)
+/*
+ * Reposition read/write file offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct olseek_args {
+ int fd;
+ long offset;
+ int whence;
+};
+#endif
+int
+olseek(td, uap)
+ struct thread *td;
+ register struct olseek_args /* {
+ int fd;
+ long offset;
+ int whence;
+ } */ *uap;
+{
+ struct lseek_args /* {
+ int fd;
+ int pad;
+ off_t offset;
+ int whence;
+ } */ nuap;
+
+ nuap.fd = uap->fd;
+ nuap.offset = uap->offset;
+ nuap.whence = uap->whence;
+ return (sys_lseek(td, &nuap));
+}
+#endif /* COMPAT_43 */
+
+/* Version with the 'pad' argument */
+int
+freebsd6_lseek(td, uap)
+ struct thread *td;
+ register struct freebsd6_lseek_args *uap;
+{
+ struct lseek_args ouap;
+
+ ouap.fd = uap->fd;
+ ouap.offset = uap->offset;
+ ouap.whence = uap->whence;
+ return (sys_lseek(td, &ouap));
+}
+
+/*
+ * Check access permissions using passed credentials.
+ */
+static int
+vn_access(vp, user_flags, cred, td)
+ struct vnode *vp;
+ int user_flags;
+ struct ucred *cred;
+ struct thread *td;
+{
+ accmode_t accmode;
+ int error;
+
+ /* Flags == 0 means only check for existence. */
+ error = 0;
+ if (user_flags) {
+ accmode = 0;
+ if (user_flags & R_OK)
+ accmode |= VREAD;
+ if (user_flags & W_OK)
+ accmode |= VWRITE;
+ if (user_flags & X_OK)
+ accmode |= VEXEC;
+#ifdef MAC
+ error = mac_vnode_check_access(cred, vp, accmode);
+ if (error != 0)
+ return (error);
+#endif
+ if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
+ error = VOP_ACCESS(vp, accmode, cred, td);
+ }
+ return (error);
+}
+
+/*
+ * Check access permissions using "real" credentials.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct access_args {
+ char *path;
+ int amode;
+};
+#endif
+int
+sys_access(td, uap)
+ struct thread *td;
+ register struct access_args /* {
+ char *path;
+ int amode;
+ } */ *uap;
+{
+
+ return (kern_access(td, uap->path, UIO_USERSPACE, uap->amode));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct faccessat_args {
+ int fd;
+ char *path;
+ int amode;
+ int flag;
+};
+#endif
+int
+sys_faccessat(struct thread *td, struct faccessat_args *uap)
+{
+
+ if (uap->flag & ~AT_EACCESS)
+ return (EINVAL);
+ return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
+ uap->amode));
+}
+
+int
+kern_access(struct thread *td, char *path, enum uio_seg pathseg, int amode)
+{
+
+ return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, amode));
+}
+
+int
+kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ int flag, int amode)
+{
+ struct ucred *cred, *tmpcred;
+ struct vnode *vp;
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error;
+
+ /*
+ * Create and modify a temporary credential instead of one that
+ * is potentially shared.
+ */
+ if (!(flag & AT_EACCESS)) {
+ cred = td->td_ucred;
+ tmpcred = crdup(cred);
+ tmpcred->cr_uid = cred->cr_ruid;
+ tmpcred->cr_groups[0] = cred->cr_rgid;
+ td->td_ucred = tmpcred;
+ } else
+ cred = tmpcred = td->td_ucred;
+ AUDIT_ARG_VALUE(amode);
+ NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
+ AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
+ td);
+ if ((error = namei(&nd)) != 0)
+ goto out1;
+ vp = nd.ni_vp;
+
+ error = vn_access(vp, amode, tmpcred, td);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+out1:
+ if (!(flag & AT_EACCESS)) {
+ td->td_ucred = cred;
+ crfree(tmpcred);
+ }
+ return (error);
+}
+
+/*
+ * Check access permissions using "effective" credentials.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct eaccess_args {
+ char *path;
+ int amode;
+};
+#endif
+int
+sys_eaccess(td, uap)
+ struct thread *td;
+ register struct eaccess_args /* {
+ char *path;
+ int amode;
+ } */ *uap;
+{
+
+ return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->amode));
+}
+
+int
+kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int amode)
+{
+
+ return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, amode));
+}
+
+#if defined(COMPAT_43)
+/*
+ * Get file status; this version follows links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ostat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
+int
+ostat(td, uap)
+ struct thread *td;
+ register struct ostat_args /* {
+ char *path;
+ struct ostat *ub;
+ } */ *uap;
+{
+ struct stat sb;
+ struct ostat osb;
+ int error;
+
+ error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
+ if (error != 0)
+ return (error);
+ cvtstat(&sb, &osb);
+ return (copyout(&osb, uap->ub, sizeof (osb)));
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct olstat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
+int
+olstat(td, uap)
+ struct thread *td;
+ register struct olstat_args /* {
+ char *path;
+ struct ostat *ub;
+ } */ *uap;
+{
+ struct stat sb;
+ struct ostat osb;
+ int error;
+
+ error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
+ if (error != 0)
+ return (error);
+ cvtstat(&sb, &osb);
+ return (copyout(&osb, uap->ub, sizeof (osb)));
+}
+
+/*
+ * Convert from an old to a new stat structure.
+ */
+void
+cvtstat(st, ost)
+ struct stat *st;
+ struct ostat *ost;
+{
+
+ ost->st_dev = st->st_dev;
+ ost->st_ino = st->st_ino;
+ ost->st_mode = st->st_mode;
+ ost->st_nlink = st->st_nlink;
+ ost->st_uid = st->st_uid;
+ ost->st_gid = st->st_gid;
+ ost->st_rdev = st->st_rdev;
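+	/* Sizes that do not fit in the old 32-bit field are reported as -2. */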
+ if (st->st_size < (quad_t)1 << 32)
+ ost->st_size = st->st_size;
+ else
+ ost->st_size = -2;
+ ost->st_atim = st->st_atim;
+ ost->st_mtim = st->st_mtim;
+ ost->st_ctim = st->st_ctim;
+ ost->st_blksize = st->st_blksize;
+ ost->st_blocks = st->st_blocks;
+ ost->st_flags = st->st_flags;
+ ost->st_gen = st->st_gen;
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Get file status; this version follows links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct stat_args {
+ char *path;
+ struct stat *ub;
+};
+#endif
+int
+sys_stat(td, uap)
+ struct thread *td;
+ register struct stat_args /* {
+ char *path;
+ struct stat *ub;
+ } */ *uap;
+{
+ struct stat sb;
+ int error;
+
+ error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
+ if (error == 0)
+ error = copyout(&sb, uap->ub, sizeof (sb));
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct fstatat_args {
+ int fd;
+ char *path;
+ struct stat *buf;
+ int flag;
+};
+#endif
+int
+sys_fstatat(struct thread *td, struct fstatat_args *uap)
+{
+ struct stat sb;
+ int error;
+
+ error = kern_statat(td, uap->flag, uap->fd, uap->path,
+ UIO_USERSPACE, &sb);
+ if (error == 0)
+ error = copyout(&sb, uap->buf, sizeof (sb));
+ return (error);
+}
+
+int
+kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
+{
+
+ return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
+}
+
+int
+kern_statat(struct thread *td, int flag, int fd, char *path,
+ enum uio_seg pathseg, struct stat *sbp)
+{
+
+ return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
+}
+
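+/*
+ * As kern_statat(), but an optional hook is invoked with the locked vnode
+ * and the collected attributes before they are copied out to the caller.
+ */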
+int
+kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
+ enum uio_seg pathseg, struct stat *sbp,
+ void (*hook)(struct vnode *vp, struct stat *sbp))
+{
+ struct nameidata nd;
+ struct stat sb;
+ cap_rights_t rights;
+ int error;
+
+ if (flag & ~AT_SYMLINK_NOFOLLOW)
+ return (EINVAL);
+
+ NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
+ FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
+ cap_rights_init(&rights, CAP_FSTAT), td);
+
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
+ if (error == 0) {
+ SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
+ if (S_ISREG(sb.st_mode))
+ SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
+ if (__predict_false(hook != NULL))
+ hook(nd.ni_vp, &sb);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_vp);
+ if (error != 0)
+ return (error);
+ *sbp = sb;
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT))
+ ktrstat(&sb);
+#endif
+ return (0);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lstat_args {
+ char *path;
+ struct stat *ub;
+};
+#endif
+int
+sys_lstat(td, uap)
+ struct thread *td;
+ register struct lstat_args /* {
+ char *path;
+ struct stat *ub;
+ } */ *uap;
+{
+ struct stat sb;
+ int error;
+
+ error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
+ if (error == 0)
+ error = copyout(&sb, uap->ub, sizeof (sb));
+ return (error);
+}
+
+int
+kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
+{
+
+ return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
+ sbp));
+}
+
+/*
+ * Implementation of the NetBSD [l]stat() functions.
+ */
+void
+cvtnstat(sb, nsb)
+ struct stat *sb;
+ struct nstat *nsb;
+{
+
+ bzero(nsb, sizeof *nsb);
+ nsb->st_dev = sb->st_dev;
+ nsb->st_ino = sb->st_ino;
+ nsb->st_mode = sb->st_mode;
+ nsb->st_nlink = sb->st_nlink;
+ nsb->st_uid = sb->st_uid;
+ nsb->st_gid = sb->st_gid;
+ nsb->st_rdev = sb->st_rdev;
+ nsb->st_atim = sb->st_atim;
+ nsb->st_mtim = sb->st_mtim;
+ nsb->st_ctim = sb->st_ctim;
+ nsb->st_size = sb->st_size;
+ nsb->st_blocks = sb->st_blocks;
+ nsb->st_blksize = sb->st_blksize;
+ nsb->st_flags = sb->st_flags;
+ nsb->st_gen = sb->st_gen;
+ nsb->st_birthtim = sb->st_birthtim;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct nstat_args {
+ char *path;
+ struct nstat *ub;
+};
+#endif
+int
+sys_nstat(td, uap)
+ struct thread *td;
+ register struct nstat_args /* {
+ char *path;
+ struct nstat *ub;
+ } */ *uap;
+{
+ struct stat sb;
+ struct nstat nsb;
+ int error;
+
+ error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
+ if (error != 0)
+ return (error);
+ cvtnstat(&sb, &nsb);
+ return (copyout(&nsb, uap->ub, sizeof (nsb)));
+}
+
+/*
+ * NetBSD lstat. Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nlstat_args {
+	char	*path;
+	struct nstat *ub;
+};
+#endif
+int
+sys_nlstat(td, uap)
+ struct thread *td;
+ register struct nlstat_args /* {
+ char *path;
+ struct nstat *ub;
+ } */ *uap;
+{
+ struct stat sb;
+ struct nstat nsb;
+ int error;
+
+ error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
+ if (error != 0)
+ return (error);
+ cvtnstat(&sb, &nsb);
+ return (copyout(&nsb, uap->ub, sizeof (nsb)));
+}
+
+/*
+ * Get configurable pathname variables.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pathconf_args {
+ char *path;
+ int name;
+};
+#endif
+int
+sys_pathconf(td, uap)
+ struct thread *td;
+ register struct pathconf_args /* {
+ char *path;
+ int name;
+ } */ *uap;
+{
+
+ return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct lpathconf_args {
+ char *path;
+ int name;
+};
+#endif
+int
+sys_lpathconf(td, uap)
+ struct thread *td;
+ register struct lpathconf_args /* {
+ char *path;
+ int name;
+ } */ *uap;
+{
+
+ return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
+ NOFOLLOW));
+}
+
+int
+kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
+ u_long flags)
+{
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
+ pathseg, path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ /* If asynchronous I/O is available, it works for all files. */
+ if (name == _PC_ASYNC_IO)
+ td->td_retval[0] = async_io_version;
+ else
+ error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
+ vput(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Return target name of a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct readlink_args {
+ char *path;
+ char *buf;
+ size_t count;
+};
+#endif
+int
+sys_readlink(td, uap)
+ struct thread *td;
+ register struct readlink_args /* {
+ char *path;
+ char *buf;
+ size_t count;
+ } */ *uap;
+{
+
+ return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
+ UIO_USERSPACE, uap->count));
+}
+#ifndef _SYS_SYSPROTO_H_
+struct readlinkat_args {
+ int fd;
+ char *path;
+ char *buf;
+ size_t bufsize;
+};
+#endif
+int
+sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
+{
+
+ return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
+ uap->buf, UIO_USERSPACE, uap->bufsize));
+}
+
+int
+kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
+ enum uio_seg bufseg, size_t count)
+{
+
+ return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
+ count));
+}
+
+int
+kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ char *buf, enum uio_seg bufseg, size_t count)
+{
+ struct vnode *vp;
+ struct iovec aiov;
+ struct uio auio;
+ struct nameidata nd;
+ int error;
+
+ if (count > IOSIZE_MAX)
+ return (EINVAL);
+
+ NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, fd, td);
+
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+#ifdef MAC
+ error = mac_vnode_check_readlink(td->td_ucred, vp);
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+#endif
+ if (vp->v_type != VLNK)
+ error = EINVAL;
+ else {
+ aiov.iov_base = buf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = bufseg;
+ auio.uio_td = td;
+ auio.uio_resid = count;
+ error = VOP_READLINK(vp, &auio, td->td_ucred);
+ }
+	vput(vp);
+	if (error == 0)
+		td->td_retval[0] = count - auio.uio_resid;
+	return (error);
+}
+
+/*
+ * Common implementation code for chflags() and fchflags().
+ */
+static int
+setfflags(td, vp, flags)
+ struct thread *td;
+ struct vnode *vp;
+ u_long flags;
+{
+ struct mount *mp;
+ struct vattr vattr;
+ int error;
+
+ /* We can't support the value matching VNOVAL. */
+ if (flags == VNOVAL)
+ return (EOPNOTSUPP);
+
+ /*
+ * Prevent non-root users from setting flags on devices. When
+ * a device is reused, users can retain ownership of the device
+ * if they are allowed to set flags and programs assume that
+ * chown can't fail when done as root.
+ */
+ if (vp->v_type == VCHR || vp->v_type == VBLK) {
+ error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
+ if (error != 0)
+ return (error);
+ }
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ VATTR_NULL(&vattr);
+ vattr.va_flags = flags;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef MAC
+ error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
+ if (error == 0)
+#endif
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Change flags of a file given a path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chflags_args {
+ const char *path;
+ u_long flags;
+};
+#endif
+int
+sys_chflags(td, uap)
+ struct thread *td;
+ register struct chflags_args /* {
+ const char *path;
+ u_long flags;
+ } */ *uap;
+{
+
+ return (kern_chflags(td, uap->path, UIO_USERSPACE, uap->flags));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct chflagsat_args {
+ int fd;
+ const char *path;
+ u_long flags;
+ int atflag;
+};
+#endif
+int
+sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
+{
+ int fd = uap->fd;
+ const char *path = uap->path;
+ u_long flags = uap->flags;
+ int atflag = uap->atflag;
+
+ if (atflag & ~AT_SYMLINK_NOFOLLOW)
+ return (EINVAL);
+
+ return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
+}
+
+static int
+kern_chflags(struct thread *td, const char *path, enum uio_seg pathseg,
+ u_long flags)
+{
+
+ return (kern_chflagsat(td, AT_FDCWD, path, pathseg, flags, 0));
+}
+
+/*
+ * Same as chflags() but doesn't follow symlinks.
+ */
+int
+sys_lchflags(td, uap)
+ struct thread *td;
+ register struct lchflags_args /* {
+ const char *path;
+ u_long flags;
+ } */ *uap;
+{
+
+ return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ uap->flags, AT_SYMLINK_NOFOLLOW));
+}
+
+static int
+kern_chflagsat(struct thread *td, int fd, const char *path,
+ enum uio_seg pathseg, u_long flags, int atflag)
+{
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error, follow;
+
+ AUDIT_ARG_FFLAGS(flags);
+ follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
+ NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
+ cap_rights_init(&rights, CAP_FCHFLAGS), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfflags(td, nd.ni_vp, flags);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Change flags of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchflags_args {
+ int fd;
+ u_long flags;
+};
+#endif
+int
+sys_fchflags(td, uap)
+ struct thread *td;
+ register struct fchflags_args /* {
+ int fd;
+ u_long flags;
+ } */ *uap;
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_FFLAGS(uap->flags);
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_FCHFLAGS), &fp);
+ if (error != 0)
+ return (error);
+#ifdef AUDIT
+ vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(fp->f_vnode);
+ VOP_UNLOCK(fp->f_vnode, 0);
+#endif
+ error = setfflags(td, fp->f_vnode, uap->flags);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common implementation code for chmod(), lchmod() and fchmod().
+ */
+int
+setfmode(td, cred, vp, mode)
+ struct thread *td;
+ struct ucred *cred;
+ struct vnode *vp;
+ int mode;
+{
+ struct mount *mp;
+ struct vattr vattr;
+ int error;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VATTR_NULL(&vattr);
+ vattr.va_mode = mode & ALLPERMS;
+#ifdef MAC
+ error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
+ if (error == 0)
+#endif
+ error = VOP_SETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Change mode of a file given path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chmod_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+sys_chmod(td, uap)
+ struct thread *td;
+ register struct chmod_args /* {
+ char *path;
+ int mode;
+ } */ *uap;
+{
+
+ return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct fchmodat_args {
+	int	fd;
+	char	*path;
+	mode_t	mode;
+	int	flag;
+};
+#endif
+int
+sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
+{
+ int flag = uap->flag;
+ int fd = uap->fd;
+ char *path = uap->path;
+ mode_t mode = uap->mode;
+
+ if (flag & ~AT_SYMLINK_NOFOLLOW)
+ return (EINVAL);
+
+ return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
+}
+
+int
+kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
+{
+
+ return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
+}
+
+/*
+ * Change mode of a file given path name (don't follow links.)
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lchmod_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+sys_lchmod(td, uap)
+ struct thread *td;
+ register struct lchmod_args /* {
+ char *path;
+ int mode;
+ } */ *uap;
+{
+
+ return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ uap->mode, AT_SYMLINK_NOFOLLOW));
+}
+
+int
+kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ mode_t mode, int flag)
+{
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error, follow;
+
+ AUDIT_ARG_MODE(mode);
+ follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
+ NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
+ cap_rights_init(&rights, CAP_FCHMOD), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Change mode of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchmod_args {
+ int fd;
+ int mode;
+};
+#endif
+int
+sys_fchmod(struct thread *td, struct fchmod_args *uap)
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_MODE(uap->mode);
+
+ error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
+ if (error != 0)
+ return (error);
+ error = fo_chmod(fp, uap->mode, td->td_ucred, td);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common implementation for chown(), lchown(), and fchown()
+ */
+int
+setfown(td, cred, vp, uid, gid)
+ struct thread *td;
+ struct ucred *cred;
+ struct vnode *vp;
+ uid_t uid;
+ gid_t gid;
+{
+ struct mount *mp;
+ struct vattr vattr;
+ int error;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VATTR_NULL(&vattr);
+ vattr.va_uid = uid;
+ vattr.va_gid = gid;
+#ifdef MAC
+ error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
+ vattr.va_gid);
+ if (error == 0)
+#endif
+ error = VOP_SETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Set ownership given a path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chown_args {
+ char *path;
+ int uid;
+ int gid;
+};
+#endif
+int
+sys_chown(td, uap)
+ struct thread *td;
+ register struct chown_args /* {
+ char *path;
+ int uid;
+ int gid;
+ } */ *uap;
+{
+
+ return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct fchownat_args {
+ int fd;
+	const char *path;
+ uid_t uid;
+ gid_t gid;
+ int flag;
+};
+#endif
+int
+sys_fchownat(struct thread *td, struct fchownat_args *uap)
+{
+ int flag;
+
+ flag = uap->flag;
+ if (flag & ~AT_SYMLINK_NOFOLLOW)
+ return (EINVAL);
+
+ return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
+ uap->gid, uap->flag));
+}
+
+int
+kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
+ int gid)
+{
+
+ return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
+}
+
+int
+kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ int uid, int gid, int flag)
+{
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error, follow;
+
+ AUDIT_ARG_OWNER(uid, gid);
+ follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
+ NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
+ cap_rights_init(&rights, CAP_FCHOWN), td);
+
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set ownership given a path name; do not cross symlinks.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lchown_args {
+ char *path;
+ int uid;
+ int gid;
+};
+#endif
+int
+sys_lchown(td, uap)
+ struct thread *td;
+ register struct lchown_args /* {
+ char *path;
+ int uid;
+ int gid;
+ } */ *uap;
+{
+
+ return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
+}
+
+int
+kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
+ int gid)
+{
+
+ return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
+ AT_SYMLINK_NOFOLLOW));
+}
+
+/*
+ * Set ownership given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchown_args {
+ int fd;
+ int uid;
+ int gid;
+};
+#endif
+int
+sys_fchown(td, uap)
+ struct thread *td;
+ register struct fchown_args /* {
+ int fd;
+ int uid;
+ int gid;
+ } */ *uap;
+{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(uap->fd);
+ AUDIT_ARG_OWNER(uap->uid, uap->gid);
+ error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
+ if (error != 0)
+ return (error);
+ error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Common implementation code for utimes(), lutimes(), and futimes().
+ */
+static int
+getutimes(usrtvp, tvpseg, tsp)
+ const struct timeval *usrtvp;
+ enum uio_seg tvpseg;
+ struct timespec *tsp;
+{
+ struct timeval tv[2];
+ const struct timeval *tvp;
+ int error;
+
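+	/* A null usrtvp means "use the current time for both timestamps". */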
+ if (usrtvp == NULL) {
+ vfs_timestamp(&tsp[0]);
+ tsp[1] = tsp[0];
+ } else {
+ if (tvpseg == UIO_SYSSPACE) {
+ tvp = usrtvp;
+ } else {
+ if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
+ return (error);
+ tvp = tv;
+ }
+
+ if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
+ tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
+ return (EINVAL);
+ TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
+ TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
+ }
+ return (0);
+}
+
+/*
+ * Common implementation code for utimes(), lutimes(), and futimes().
+ */
+static int
+setutimes(td, vp, ts, numtimes, nullflag)
+ struct thread *td;
+ struct vnode *vp;
+ const struct timespec *ts;
+ int numtimes;
+ int nullflag;
+{
+ struct mount *mp;
+ struct vattr vattr;
+ int error, setbirthtime;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ setbirthtime = 0;
+ if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
+ timespeccmp(&ts[1], &vattr.va_birthtime, < ))
+ setbirthtime = 1;
+ VATTR_NULL(&vattr);
+ vattr.va_atime = ts[0];
+ vattr.va_mtime = ts[1];
+ if (setbirthtime)
+ vattr.va_birthtime = ts[1];
+ if (numtimes > 2)
+ vattr.va_birthtime = ts[2];
+ if (nullflag)
+ vattr.va_vaflags |= VA_UTIMES_NULL;
+#ifdef MAC
+ error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
+ vattr.va_mtime);
+#endif
+ if (error == 0)
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct utimes_args {
+ char *path;
+ struct timeval *tptr;
+};
+#endif
+int
+sys_utimes(td, uap)
+ struct thread *td;
+ register struct utimes_args /* {
+ char *path;
+ struct timeval *tptr;
+ } */ *uap;
+{
+
+ return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
+ UIO_USERSPACE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct futimesat_args {
+ int fd;
+	const char *path;
+	const struct timeval *times;
+};
+#endif
+int
+sys_futimesat(struct thread *td, struct futimesat_args *uap)
+{
+
+ return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
+ uap->times, UIO_USERSPACE));
+}
+
+int
+kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
+ struct timeval *tptr, enum uio_seg tptrseg)
+{
+
+ return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
+}
+
+int
+kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ struct timeval *tptr, enum uio_seg tptrseg)
+{
+ struct nameidata nd;
+ struct timespec ts[2];
+ cap_rights_t rights;
+ int error;
+
+ if ((error = getutimes(tptr, tptrseg, ts)) != 0)
+ return (error);
+ NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
+ cap_rights_init(&rights, CAP_FUTIMES), td);
+
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lutimes_args {
+ char *path;
+ struct timeval *tptr;
+};
+#endif
+int
+sys_lutimes(td, uap)
+ struct thread *td;
+ register struct lutimes_args /* {
+ char *path;
+ struct timeval *tptr;
+ } */ *uap;
+{
+
+ return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
+ UIO_USERSPACE));
+}
+
+int
+kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
+ struct timeval *tptr, enum uio_seg tptrseg)
+{
+ struct timespec ts[2];
+ struct nameidata nd;
+ int error;
+
+ if ((error = getutimes(tptr, tptrseg, ts)) != 0)
+ return (error);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct futimes_args {
+ int fd;
+ struct timeval *tptr;
+};
+#endif
+int
+sys_futimes(td, uap)
+ struct thread *td;
+ register struct futimes_args /* {
+ int fd;
+ struct timeval *tptr;
+ } */ *uap;
+{
+
+ return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
+}
+
+int
+kern_futimes(struct thread *td, int fd, struct timeval *tptr,
+ enum uio_seg tptrseg)
+{
+ struct timespec ts[2];
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_FD(fd);
+ error = getutimes(tptr, tptrseg, ts);
+ if (error != 0)
+ return (error);
+ error = getvnode(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_FUTIMES), &fp);
+ if (error != 0)
+ return (error);
+#ifdef AUDIT
+ vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(fp->f_vnode);
+ VOP_UNLOCK(fp->f_vnode, 0);
+#endif
+ error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Truncate a file given its path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct truncate_args {
+ char *path;
+ int pad;
+ off_t length;
+};
+#endif
+int
+sys_truncate(td, uap)
+ struct thread *td;
+ register struct truncate_args /* {
+ char *path;
+ int pad;
+ off_t length;
+ } */ *uap;
+{
+
+ return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
+}
+
+int
+kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ void *rl_cookie;
+ struct vattr vattr;
+ struct nameidata nd;
+ int error;
+
+ if (length < 0)
+		return (EINVAL);
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
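+	/* Serialize the size change against concurrent ranged I/O. */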
+ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
+ vn_rangelock_unlock(vp, rl_cookie);
+ vrele(vp);
+ return (error);
+ }
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ if (vp->v_type == VDIR)
+ error = EISDIR;
+#ifdef MAC
+ else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
+ }
+#endif
+ else if ((error = vn_writechk(vp)) == 0 &&
+ (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
+ VATTR_NULL(&vattr);
+ vattr.va_size = length;
+ error = VOP_SETATTR(vp, &vattr, td->td_ucred);
+ }
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+ vn_rangelock_unlock(vp, rl_cookie);
+ vrele(vp);
+ return (error);
+}
+
+#if defined(COMPAT_43)
+/*
+ * Truncate a file given its path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct otruncate_args {
+ char *path;
+ long length;
+};
+#endif
+int
+otruncate(td, uap)
+ struct thread *td;
+ register struct otruncate_args /* {
+ char *path;
+ long length;
+ } */ *uap;
+{
+ struct truncate_args /* {
+ char *path;
+ int pad;
+ off_t length;
+ } */ nuap;
+
+ nuap.path = uap->path;
+ nuap.length = uap->length;
+ return (sys_truncate(td, &nuap));
+}
+#endif /* COMPAT_43 */
+
+/* Versions with the pad argument */
+int
+freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
+{
+ struct truncate_args ouap;
+
+ ouap.path = uap->path;
+ ouap.length = uap->length;
+ return (sys_truncate(td, &ouap));
+}
+
+int
+freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
+{
+ struct ftruncate_args ouap;
+
+ ouap.fd = uap->fd;
+ ouap.length = uap->length;
+ return (sys_ftruncate(td, &ouap));
+}
+
+/*
+ * Sync an open file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fsync_args {
+ int fd;
+};
+#endif
+int
+sys_fsync(td, uap)
+ struct thread *td;
+ struct fsync_args /* {
+ int fd;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct file *fp;
+ cap_rights_t rights;
+ int error, lock_flags;
+
+ AUDIT_ARG_FD(uap->fd);
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_FSYNC), &fp);
+ if (error != 0)
+ return (error);
+ vp = fp->f_vnode;
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
+ goto drop;
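+	/* A shared lock suffices on filesystems that permit shared writes. */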
+ if (MNT_SHARED_WRITES(mp) ||
+ ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
+ lock_flags = LK_SHARED;
+ } else {
+ lock_flags = LK_EXCLUSIVE;
+ }
+ vn_lock(vp, lock_flags | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ if (vp->v_object != NULL) {
+ VM_OBJECT_WLOCK(vp->v_object);
+ vm_object_page_clean(vp->v_object, 0, 0, 0);
+ VM_OBJECT_WUNLOCK(vp->v_object);
+ }
+ error = VOP_FSYNC(vp, MNT_WAIT, td);
+
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+drop:
+ fdrop(fp, td);
+ return (error);
+}
+
+/*
+ * Rename files. Source and destination must either both be directories, or
+ * both not be directories. If target is a directory, it must be empty.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rename_args {
+ char *from;
+ char *to;
+};
+#endif
+int
+sys_rename(td, uap)
+ struct thread *td;
+ register struct rename_args /* {
+ char *from;
+ char *to;
+ } */ *uap;
+{
+
+ return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct renameat_args {
+ int oldfd;
+ char *old;
+ int newfd;
+ char *new;
+};
+#endif
+int
+sys_renameat(struct thread *td, struct renameat_args *uap)
+{
+
+ return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
+ UIO_USERSPACE));
+}
+
+int
+kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
+{
+
+ return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
+}
+
+int
+kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
+ enum uio_seg pathseg)
+{
+ struct mount *mp = NULL;
+ struct vnode *tvp, *fvp, *tdvp;
+ struct nameidata fromnd, tond;
+ cap_rights_t rights;
+ int error;
+
+ bwillwrite();
+#ifdef MAC
+ NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
+ AUDITVNODE1, pathseg, old, oldfd,
+ cap_rights_init(&rights, CAP_RENAMEAT), td);
+#else
+ NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
+ pathseg, old, oldfd, cap_rights_init(&rights, CAP_RENAMEAT), td);
+#endif
+
+ if ((error = namei(&fromnd)) != 0)
+ return (error);
+#ifdef MAC
+ error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
+ fromnd.ni_vp, &fromnd.ni_cnd);
+ VOP_UNLOCK(fromnd.ni_dvp, 0);
+ if (fromnd.ni_dvp != fromnd.ni_vp)
+ VOP_UNLOCK(fromnd.ni_vp, 0);
+#endif
+ fvp = fromnd.ni_vp;
+ if (error == 0)
+ error = vn_start_write(fvp, &mp, V_WAIT | PCATCH);
+ if (error != 0) {
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ goto out1;
+ }
+ NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
+ SAVESTART | AUDITVNODE2, pathseg, new, newfd,
+ cap_rights_init(&rights, CAP_LINKAT), td);
+ if (fromnd.ni_vp->v_type == VDIR)
+ tond.ni_cnd.cn_flags |= WILLBEDIR;
+ if ((error = namei(&tond)) != 0) {
+ /* Translate error code for rename("dir1", "dir2/."). */
+ if (error == EISDIR && fvp->v_type == VDIR)
+ error = EINVAL;
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ vn_finished_write(mp);
+ goto out1;
+ }
+ tdvp = tond.ni_dvp;
+ tvp = tond.ni_vp;
+ if (tvp != NULL) {
+ if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
+ error = EISDIR;
+ goto out;
+ }
+#ifdef CAPABILITIES
+ if (newfd != AT_FDCWD) {
+ /*
+ * If the target already exists we require CAP_UNLINKAT
+ * from 'newfd'.
+ */
+ error = cap_check(&tond.ni_filecaps.fc_rights,
+ cap_rights_init(&rights, CAP_UNLINKAT));
+ if (error != 0)
+ goto out;
+ }
+#endif
+ }
+ if (fvp == tdvp) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * If the source is the same as the destination (that is, if they
+ * are links to the same vnode), then there is nothing to do.
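+	 * An error value of -1 is used as an internal marker here; it is
+	 * converted back to success before kern_renameat() returns.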
+ */
+ if (fvp == tvp)
+ error = -1;
+#ifdef MAC
+ else
+ error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
+ tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
+#endif
+out:
+ if (error == 0) {
+ error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
+ tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ NDFREE(&tond, NDF_ONLY_PNBUF);
+ } else {
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ NDFREE(&tond, NDF_ONLY_PNBUF);
+ if (tvp != NULL)
+ vput(tvp);
+ if (tdvp == tvp)
+ vrele(tdvp);
+ else
+ vput(tdvp);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ }
+ vrele(tond.ni_startdir);
+ vn_finished_write(mp);
+out1:
+ if (fromnd.ni_startdir)
+ vrele(fromnd.ni_startdir);
+ if (error == -1)
+ return (0);
+ return (error);
+}
+
+/*
+ * Make a directory file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mkdir_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+sys_mkdir(td, uap)
+ struct thread *td;
+ register struct mkdir_args /* {
+ char *path;
+ int mode;
+ } */ *uap;
+{
+
+ return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct mkdirat_args {
+ int fd;
+ char *path;
+ mode_t mode;
+};
+#endif
+int
+sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
+{
+
+ return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
+}
+
+int
+kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
+{
+
+ return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
+}
+
+int
+kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
+ int mode)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct vattr vattr;
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error;
+
+ AUDIT_ARG_MODE(mode);
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
+ segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT), td);
+ nd.ni_cnd.cn_flags |= WILLBEDIR;
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ /*
+ * XXX namei called with LOCKPARENT but not LOCKLEAF has
+ * the strange behaviour of leaving the vnode unlocked
+ * if the target is the same vnode as the parent.
+ */
+ if (vp == nd.ni_dvp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(vp);
+ return (EEXIST);
+ }
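+	/* A suspension is in progress; wait for it to finish and retry. */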
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VDIR;
+ vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
+#ifdef MAC
+ error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
+ &vattr);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+#ifdef MAC
+out:
+#endif
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ if (error == 0)
+ vput(nd.ni_vp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Remove a directory file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rmdir_args {
+ char *path;
+};
+#endif
+int
+sys_rmdir(td, uap)
+ struct thread *td;
+ struct rmdir_args /* {
+ char *path;
+ } */ *uap;
+{
+
+ return (kern_rmdir(td, uap->path, UIO_USERSPACE));
+}
+
+int
+kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
+{
+
+ return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
+}
+
+int
+kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct nameidata nd;
+ cap_rights_t rights;
+ int error;
+
+restart:
+ bwillwrite();
+ NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+ /*
+ * No rmdir "." please.
+ */
+ if (nd.ni_dvp == vp) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * The root of a mounted filesystem cannot be deleted.
+ */
+ if (vp->v_vflag & VV_ROOT) {
+ error = EBUSY;
+ goto out;
+ }
+#ifdef MAC
+ error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
+ &nd.ni_cnd);
+ if (error != 0)
+ goto out;
+#endif
+ if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+ vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
+ error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
+ vn_finished_write(mp);
+out:
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(vp);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ return (error);
+}
+
+#ifdef COMPAT_43
+/*
+ * Read a block of directory entries in a filesystem independent format.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ogetdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
+int
+ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
+{
+ long loff;
+ int error;
+
+ error = kern_ogetdirentries(td, uap, &loff);
+ if (error == 0)
+ error = copyout(&loff, uap->basep, sizeof(long));
+ return (error);
+}
+
+int
+kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
+ long *ploff)
+{
+ struct vnode *vp;
+ struct file *fp;
+ struct uio auio, kuio;
+ struct iovec aiov, kiov;
+ struct dirent *dp, *edp;
+ cap_rights_t rights;
+ caddr_t dirbuf;
+ int error, eofflag, readcnt;
+ long loff;
+ off_t foffset;
+
+ /* XXX arbitrary sanity limit on `count'. */
+ if (uap->count > 64 * 1024)
+ return (EINVAL);
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_READ), &fp);
+ if (error != 0)
+ return (error);
+ if ((fp->f_flag & FREAD) == 0) {
+ fdrop(fp, td);
+ return (EBADF);
+ }
+ vp = fp->f_vnode;
+ foffset = foffset_lock(fp, 0);
+unionread:
+ if (vp->v_type != VDIR) {
+ foffset_unlock(fp, foffset, 0);
+ fdrop(fp, td);
+ return (EINVAL);
+ }
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auio.uio_resid = uap->count;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ loff = auio.uio_offset = foffset;
+#ifdef MAC
+ error = mac_vnode_check_readdir(td->td_ucred, vp);
+ if (error != 0) {
+ VOP_UNLOCK(vp, 0);
+ foffset_unlock(fp, foffset, FOF_NOUPDATE);
+ fdrop(fp, td);
+ return (error);
+ }
+#endif
+# if (BYTE_ORDER != LITTLE_ENDIAN)
+ if (vp->v_mount->mnt_maxsymlinklen <= 0) {
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
+ NULL, NULL);
+ foffset = auio.uio_offset;
+ } else
+# endif
+ {
+ kuio = auio;
+ kuio.uio_iov = &kiov;
+ kuio.uio_segflg = UIO_SYSSPACE;
+ kiov.iov_len = uap->count;
+ dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
+ kiov.iov_base = dirbuf;
+ error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
+ NULL, NULL);
+ foffset = kuio.uio_offset;
+ if (error == 0) {
+ readcnt = uap->count - kuio.uio_resid;
+ edp = (struct dirent *)&dirbuf[readcnt];
+ for (dp = (struct dirent *)dirbuf; dp < edp; ) {
+# if (BYTE_ORDER == LITTLE_ENDIAN)
+ /*
+ * The expected low byte of
+ * dp->d_namlen is our dp->d_type.
+ * The high MBZ byte of dp->d_namlen
+ * is our dp->d_namlen.
+ */
+ dp->d_type = dp->d_namlen;
+ dp->d_namlen = 0;
+# else
+ /*
+ * The dp->d_type is the high byte
+ * of the expected dp->d_namlen,
+ * so must be zero'ed.
+ */
+ dp->d_type = 0;
+# endif
+ if (dp->d_reclen > 0) {
+ dp = (struct dirent *)
+ ((char *)dp + dp->d_reclen);
+ } else {
+ error = EIO;
+ break;
+ }
+ }
+ if (dp >= edp)
+ error = uiomove(dirbuf, readcnt, &auio);
+ }
+ free(dirbuf, M_TEMP);
+ }
+ if (error != 0) {
+ VOP_UNLOCK(vp, 0);
+ foffset_unlock(fp, foffset, 0);
+ fdrop(fp, td);
+ return (error);
+ }
+ if (uap->count == auio.uio_resid &&
+ (vp->v_vflag & VV_ROOT) &&
+ (vp->v_mount->mnt_flag & MNT_UNION)) {
+ struct vnode *tvp = vp;
+ vp = vp->v_mount->mnt_vnodecovered;
+ VREF(vp);
+ fp->f_vnode = vp;
+ fp->f_data = vp;
+ foffset = 0;
+ vput(tvp);
+ goto unionread;
+ }
+ VOP_UNLOCK(vp, 0);
+ foffset_unlock(fp, foffset, 0);
+ fdrop(fp, td);
+ td->td_retval[0] = uap->count - auio.uio_resid;
+ if (error == 0)
+ *ploff = loff;
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Read a block of directory entries in a filesystem independent format.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
+int
+sys_getdirentries(td, uap)
+ struct thread *td;
+ register struct getdirentries_args /* {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+ } */ *uap;
+{
+ long base;
+ int error;
+
+ error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
+ NULL, UIO_USERSPACE);
+ if (error != 0)
+ return (error);
+ if (uap->basep != NULL)
+ error = copyout(&base, uap->basep, sizeof(long));
+ return (error);
+}
+
+int
+kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
+ long *basep, ssize_t *residp, enum uio_seg bufseg)
+{
+ struct vnode *vp;
+ struct file *fp;
+ struct uio auio;
+ struct iovec aiov;
+ cap_rights_t rights;
+ long loff;
+ int error, eofflag;
+ off_t foffset;
+
+ AUDIT_ARG_FD(fd);
+ if (count > IOSIZE_MAX)
+ return (EINVAL);
+ auio.uio_resid = count;
+ error = getvnode(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_READ), &fp);
+ if (error != 0)
+ return (error);
+ if ((fp->f_flag & FREAD) == 0) {
+ fdrop(fp, td);
+ return (EBADF);
+ }
+ vp = fp->f_vnode;
+ foffset = foffset_lock(fp, 0);
+unionread:
+ if (vp->v_type != VDIR) {
+ error = EINVAL;
+ goto fail;
+ }
+ aiov.iov_base = buf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = bufseg;
+ auio.uio_td = td;
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ loff = auio.uio_offset = foffset;
+#ifdef MAC
+ error = mac_vnode_check_readdir(td->td_ucred, vp);
+ if (error == 0)
+#endif
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
+ NULL);
+ foffset = auio.uio_offset;
+ if (error != 0) {
+ VOP_UNLOCK(vp, 0);
+ goto fail;
+ }
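+	/*
+	 * If no entries were returned and this is the root of a union mount,
+	 * retry the read on the vnode that the union is mounted over.
+	 */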
+ if (count == auio.uio_resid &&
+ (vp->v_vflag & VV_ROOT) &&
+ (vp->v_mount->mnt_flag & MNT_UNION)) {
+ struct vnode *tvp = vp;
+
+ vp = vp->v_mount->mnt_vnodecovered;
+ VREF(vp);
+ fp->f_vnode = vp;
+ fp->f_data = vp;
+ foffset = 0;
+ vput(tvp);
+ goto unionread;
+ }
+ VOP_UNLOCK(vp, 0);
+ *basep = loff;
+ if (residp != NULL)
+ *residp = auio.uio_resid;
+ td->td_retval[0] = count - auio.uio_resid;
+fail:
+ foffset_unlock(fp, foffset, 0);
+ fdrop(fp, td);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getdents_args {
+ int fd;
+ char *buf;
+ size_t count;
+};
+#endif
+int
+sys_getdents(td, uap)
+ struct thread *td;
+ register struct getdents_args /* {
+ int fd;
+ char *buf;
+		size_t count;
+ } */ *uap;
+{
+ struct getdirentries_args ap;
+
+ ap.fd = uap->fd;
+ ap.buf = uap->buf;
+ ap.count = uap->count;
+ ap.basep = NULL;
+ return (sys_getdirentries(td, &ap));
+}
+
+/*
+ * Set the mode mask for creation of filesystem nodes.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct umask_args {
+ int newmask;
+};
+#endif
+int
+sys_umask(td, uap)
+ struct thread *td;
+ struct umask_args /* {
+ int newmask;
+ } */ *uap;
+{
+ register struct filedesc *fdp;
+
+ FILEDESC_XLOCK(td->td_proc->p_fd);
+ fdp = td->td_proc->p_fd;
+ td->td_retval[0] = fdp->fd_cmask;
+ fdp->fd_cmask = uap->newmask & ALLPERMS;
+ FILEDESC_XUNLOCK(td->td_proc->p_fd);
+ return (0);
+}
+
+/*
+ * Void all references to the file by ripping the underlying filesystem away
+ * from the vnode.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct revoke_args {
+ char *path;
+};
+#endif
+int
+sys_revoke(td, uap)
+ struct thread *td;
+ register struct revoke_args /* {
+ char *path;
+ } */ *uap;
+{
+ struct vnode *vp;
+ struct vattr vattr;
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (vp->v_type != VCHR || vp->v_rdev == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+#ifdef MAC
+ error = mac_vnode_check_revoke(td->td_ucred, vp);
+ if (error != 0)
+ goto out;
+#endif
+ error = VOP_GETATTR(vp, &vattr, td->td_ucred);
+ if (error != 0)
+ goto out;
+ if (td->td_ucred->cr_uid != vattr.va_uid) {
+ error = priv_check(td, PRIV_VFS_ADMIN);
+ if (error != 0)
+ goto out;
+ }
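+	/* Only bother revoking if the device has other active references. */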
+ if (vcount(vp) > 1)
+ VOP_REVOKE(vp, REVOKEALL);
+out:
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Convert a user file descriptor to a kernel file entry and check that, if it
+ * is a capability, the correct rights are present. A reference on the file
+ * entry is held upon returning.
+ */
+int
+getvnode(struct filedesc *fdp, int fd, cap_rights_t *rightsp, struct file **fpp)
+{
+ struct file *fp;
+ int error;
+
+ error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
+ if (error != 0)
+ return (error);
+
+ /*
+	 * The file might not be of vnode type, or it may not yet be fully
+	 * initialized, in which case the f_vnode pointer may be set but
+	 * f_ops is still badfileops.  For example, devfs_open() transiently
+	 * creates such a situation to facilitate csw d_fdopen().
+	 *
+	 * The dupfdopen() handling in kern_openat() installs the half-baked
+	 * file into the process descriptor table, allowing another thread
+	 * to dereference it.  Guard against the race by checking f_ops.
+ */
+ if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
+ fdrop(fp, curthread);
+ return (EINVAL);
+ }
+ *fpp = fp;
+ return (0);
+}
+
+/*
+ * Get an (NFS) file handle.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lgetfh_args {
+ char *fname;
+ fhandle_t *fhp;
+};
+#endif
+int
+sys_lgetfh(td, uap)
+ struct thread *td;
+ register struct lgetfh_args *uap;
+{
+ struct nameidata nd;
+ fhandle_t fh;
+ register struct vnode *vp;
+ int error;
+
+ error = priv_check(td, PRIV_VFS_GETFH);
+ if (error != 0)
+ return (error);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+ uap->fname, td);
+ error = namei(&nd);
+ if (error != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ bzero(&fh, sizeof(fh));
+ fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
+ error = VOP_VPTOFH(vp, &fh.fh_fid);
+ vput(vp);
+ if (error == 0)
+ error = copyout(&fh, uap->fhp, sizeof (fh));
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getfh_args {
+ char *fname;
+ fhandle_t *fhp;
+};
+#endif
+int
+sys_getfh(td, uap)
+ struct thread *td;
+ register struct getfh_args *uap;
+{
+ struct nameidata nd;
+ fhandle_t fh;
+ register struct vnode *vp;
+ int error;
+
+ error = priv_check(td, PRIV_VFS_GETFH);
+ if (error != 0)
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+ uap->fname, td);
+ error = namei(&nd);
+ if (error != 0)
+ return (error);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vp = nd.ni_vp;
+ bzero(&fh, sizeof(fh));
+ fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
+ error = VOP_VPTOFH(vp, &fh.fh_fid);
+ vput(vp);
+ if (error == 0)
+ error = copyout(&fh, uap->fhp, sizeof (fh));
+ return (error);
+}
+
+/*
+ * Syscall used by rpc.lockd to translate an NFS file handle into an open
+ * file descriptor.
+ *
+ * Warning: do not remove the priv_check() call or this becomes one giant
+ * security hole.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fhopen_args {
+ const struct fhandle *u_fhp;
+ int flags;
+};
+#endif
+int
+sys_fhopen(td, uap)
+ struct thread *td;
+ struct fhopen_args /* {
+ const struct fhandle *u_fhp;
+ int flags;
+ } */ *uap;
+{
+ struct mount *mp;
+ struct vnode *vp;
+ struct fhandle fhp;
+ struct file *fp;
+ int fmode, error;
+ int indx;
+
+ error = priv_check(td, PRIV_VFS_FHOPEN);
+ if (error != 0)
+ return (error);
+ indx = -1;
+ fmode = FFLAGS(uap->flags);
+ /* why not allow a non-read/write open for our lockd? */
+ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
+ return (EINVAL);
+ error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
+ if (error != 0)
+		return (error);
+ /* find the mount point */
+ mp = vfs_busyfs(&fhp.fh_fsid);
+ if (mp == NULL)
+ return (ESTALE);
+ /* now give me my vnode, it gets returned to me locked */
+ error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
+ vfs_unbusy(mp);
+ if (error != 0)
+ return (error);
+
+ error = falloc_noinstall(td, &fp);
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+ /*
+ * An extra reference on `fp' has been held for us by
+ * falloc_noinstall().
+ */
+
+#ifdef INVARIANTS
+ td->td_dupfd = -1;
+#endif
+ error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
+ if (error != 0) {
+ KASSERT(fp->f_ops == &badfileops,
+ ("VOP_OPEN in fhopen() set f_ops"));
+ KASSERT(td->td_dupfd < 0,
+ ("fhopen() encountered fdopen()"));
+
+ vput(vp);
+ goto bad;
+ }
+#ifdef INVARIANTS
+ td->td_dupfd = 0;
+#endif
+ fp->f_vnode = vp;
+ fp->f_seqcount = 1;
+ finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
+ &vnops);
+ VOP_UNLOCK(vp, 0);
+ if ((fmode & O_TRUNC) != 0) {
+ error = fo_truncate(fp, 0, td->td_ucred, td);
+ if (error != 0)
+ goto bad;
+ }
+
+ error = finstall(td, fp, &indx, fmode, NULL);
+bad:
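+	/*
+	 * Release the reference from falloc_noinstall(); on success the
+	 * descriptor table holds its own reference from finstall().
+	 */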
+ fdrop(fp, td);
+ td->td_retval[0] = indx;
+ return (error);
+}
+
+/*
+ * Stat an (NFS) file handle.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fhstat_args {
+ struct fhandle *u_fhp;
+ struct stat *sb;
+};
+#endif
+int
+sys_fhstat(td, uap)
+ struct thread *td;
+ register struct fhstat_args /* {
+ struct fhandle *u_fhp;
+ struct stat *sb;
+ } */ *uap;
+{
+ struct stat sb;
+ struct fhandle fh;
+ int error;
+
+ error = copyin(uap->u_fhp, &fh, sizeof(fh));
+ if (error != 0)
+ return (error);
+ error = kern_fhstat(td, fh, &sb);
+ if (error == 0)
+ error = copyout(&sb, uap->sb, sizeof(sb));
+ return (error);
+}
+
+int
+kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
+{
+ struct mount *mp;
+ struct vnode *vp;
+ int error;
+
+ error = priv_check(td, PRIV_VFS_FHSTAT);
+ if (error != 0)
+ return (error);
+ if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
+ return (ESTALE);
+ error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
+ vfs_unbusy(mp);
+ if (error != 0)
+ return (error);
+ error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Implement fstatfs() for (NFS) file handles.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fhstatfs_args {
+ struct fhandle *u_fhp;
+ struct statfs *buf;
+};
+#endif
+int
+sys_fhstatfs(td, uap)
+ struct thread *td;
+ struct fhstatfs_args /* {
+ struct fhandle *u_fhp;
+ struct statfs *buf;
+ } */ *uap;
+{
+ struct statfs sf;
+ fhandle_t fh;
+ int error;
+
+ error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
+ if (error != 0)
+ return (error);
+ error = kern_fhstatfs(td, fh, &sf);
+ if (error != 0)
+ return (error);
+ return (copyout(&sf, uap->buf, sizeof(sf)));
+}
+
+int
+kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
+{
+ struct statfs *sp;
+ struct mount *mp;
+ struct vnode *vp;
+ int error;
+
+ error = priv_check(td, PRIV_VFS_FHSTATFS);
+ if (error != 0)
+ return (error);
+ if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
+ return (ESTALE);
+ error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
+ if (error != 0) {
+ vfs_unbusy(mp);
+ return (error);
+ }
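+	/*
+	 * The vnode was needed only to validate the handle; keep the mount
+	 * busy for VFS_STATFS() below.
+	 */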
+ vput(vp);
+ error = prison_canseemount(td->td_ucred, mp);
+ if (error != 0)
+ goto out;
+#ifdef MAC
+ error = mac_mount_check_stat(td->td_ucred, mp);
+ if (error != 0)
+ goto out;
+#endif
+ /*
+ * Set these in case the underlying filesystem fails to do so.
+ */
+ sp = &mp->mnt_stat;
+ sp->f_version = STATFS_VERSION;
+ sp->f_namemax = NAME_MAX;
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ error = VFS_STATFS(mp, sp);
+ if (error == 0)
+ *buf = *sp;
+out:
+ vfs_unbusy(mp);
+ return (error);
+}
+
+int
+kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
+{
+ struct file *fp;
+ struct mount *mp;
+ struct vnode *vp;
+ cap_rights_t rights;
+ off_t olen, ooffset;
+ int error;
+
+ fp = NULL;
+ error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
+ if (error != 0)
+ goto out;
+
+ switch (fp->f_type) {
+ case DTYPE_VNODE:
+ break;
+ case DTYPE_PIPE:
+ case DTYPE_FIFO:
+ error = ESPIPE;
+ goto out;
+ default:
+ error = ENODEV;
+ goto out;
+ }
+ if ((fp->f_flag & FWRITE) == 0) {
+ error = EBADF;
+ goto out;
+ }
+ vp = fp->f_vnode;
+ if (vp->v_type != VREG) {
+ error = ENODEV;
+ goto out;
+ }
+ if (offset < 0 || len <= 0) {
+ error = EINVAL;
+ goto out;
+ }
+ /* Check for wrap. */
+ if (offset > OFF_MAX - len) {
+ error = EFBIG;
+ goto out;
+ }
+
+ /* Allocating blocks may take a long time, so iterate. */
+ for (;;) {
+ olen = len;
+ ooffset = offset;
+
+ bwillwrite();
+ mp = NULL;
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
+ break;
+ error = vn_lock(vp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vn_finished_write(mp);
+ break;
+ }
+#ifdef MAC
+ error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
+ if (error == 0)
+#endif
+ error = VOP_ALLOCATE(vp, &offset, &len);
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+
+ if (olen + ooffset != offset + len) {
+ panic("offset + len changed from %jx/%jx to %jx/%jx",
+ ooffset, olen, offset, len);
+ }
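+		/*
+		 * VOP_ALLOCATE() advances offset and shrinks len to reflect
+		 * the remaining work, so stop once len reaches zero or an
+		 * error is returned.
+		 */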
+ if (error != 0 || len == 0)
+ break;
+ KASSERT(olen > len, ("Iteration did not make progress?"));
+ maybe_yield();
+ }
+ out:
+ if (fp != NULL)
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
+{
+
+ return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
+}
+
+/*
+ * Unlike madvise(2), we do not make a best effort to remember every
+ * possible caching hint. Instead, we remember the last setting with
+ * the exception that we will allow POSIX_FADV_NORMAL to adjust the
+ * region of any current setting.
+ */
+int
+kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
+ int advice)
+{
+ struct fadvise_info *fa, *new;
+ struct file *fp;
+ struct vnode *vp;
+ cap_rights_t rights;
+ off_t end;
+ int error;
+
+ if (offset < 0 || len < 0 || offset > OFF_MAX - len)
+ return (EINVAL);
+ switch (advice) {
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_NOREUSE:
+ new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
+ break;
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_WILLNEED:
+ case POSIX_FADV_DONTNEED:
+ new = NULL;
+ break;
+ default:
+ return (EINVAL);
+ }
+ /* XXX: CAP_POSIX_FADVISE? */
+ error = fget(td, fd, cap_rights_init(&rights), &fp);
+ if (error != 0)
+ goto out;
+
+ switch (fp->f_type) {
+ case DTYPE_VNODE:
+ break;
+ case DTYPE_PIPE:
+ case DTYPE_FIFO:
+ error = ESPIPE;
+ goto out;
+ default:
+ error = ENODEV;
+ goto out;
+ }
+ vp = fp->f_vnode;
+ if (vp->v_type != VREG) {
+ error = ENODEV;
+ goto out;
+ }
+ if (len == 0)
+ end = OFF_MAX;
+ else
+ end = offset + len - 1;
+ switch (advice) {
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_RANDOM:
+ case POSIX_FADV_NOREUSE:
+ /*
+ * Try to merge any existing non-standard region with
+ * this new region if possible, otherwise create a new
+ * non-standard region for this request.
+ */
+ mtx_pool_lock(mtxpool_sleep, fp);
+ fa = fp->f_advice;
+ if (fa != NULL && fa->fa_advice == advice &&
+ ((fa->fa_start <= end && fa->fa_end >= offset) ||
+ (end != OFF_MAX && fa->fa_start == end + 1) ||
+ (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
+ if (offset < fa->fa_start)
+ fa->fa_start = offset;
+ if (end > fa->fa_end)
+ fa->fa_end = end;
+ } else {
+ new->fa_advice = advice;
+ new->fa_start = offset;
+ new->fa_end = end;
+ new->fa_prevstart = 0;
+ new->fa_prevend = 0;
+ fp->f_advice = new;
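+			/* The old structure (may be NULL) is freed below. */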
+ new = fa;
+ }
+ mtx_pool_unlock(mtxpool_sleep, fp);
+ break;
+ case POSIX_FADV_NORMAL:
+ /*
+		 * If the "normal" region overlaps with an existing
+ * non-standard region, trim or remove the
+ * non-standard region.
+ */
+ mtx_pool_lock(mtxpool_sleep, fp);
+ fa = fp->f_advice;
+ if (fa != NULL) {
+ if (offset <= fa->fa_start && end >= fa->fa_end) {
+ new = fa;
+ fp->f_advice = NULL;
+ } else if (offset <= fa->fa_start &&
+ end >= fa->fa_start)
+ fa->fa_start = end + 1;
+ else if (offset <= fa->fa_end && end >= fa->fa_end)
+ fa->fa_end = offset - 1;
+ else if (offset >= fa->fa_start && end <= fa->fa_end) {
+ /*
+ * If the "normal" region is a middle
+ * portion of the existing
+ * non-standard region, just remove
+ * the whole thing rather than picking
+ * one side or the other to
+ * preserve.
+ */
+ new = fa;
+ fp->f_advice = NULL;
+ }
+ }
+ mtx_pool_unlock(mtxpool_sleep, fp);
+ break;
+ case POSIX_FADV_WILLNEED:
+ case POSIX_FADV_DONTNEED:
+ error = VOP_ADVISE(vp, offset, end, advice);
+ break;
+ }
+out:
+ if (fp != NULL)
+ fdrop(fp, td);
+ free(new, M_FADVISE);
+ return (error);
+}
+
+int
+sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
+{
+
+ return (kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
+ uap->advice));
+}
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
new file mode 100644
index 0000000..c53030a
--- /dev/null
+++ b/sys/kern/vfs_vnops.c
@@ -0,0 +1,2083 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
+ * Copyright (c) 2013 The FreeBSD Foundation
+ *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/disk.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/kdb.h>
+#include <sys/stat.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/filio.h>
+#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/ttycom.h>
+#include <sys/conf.h>
+#include <sys/syslog.h>
+#include <sys/unistd.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+
+static fo_rdwr_t vn_read;
+static fo_rdwr_t vn_write;
+static fo_rdwr_t vn_io_fault;
+static fo_truncate_t vn_truncate;
+static fo_ioctl_t vn_ioctl;
+static fo_poll_t vn_poll;
+static fo_kqfilter_t vn_kqfilter;
+static fo_stat_t vn_statfile;
+static fo_close_t vn_closefile;
+
+struct fileops vnops = {
+ .fo_read = vn_io_fault,
+ .fo_write = vn_io_fault,
+ .fo_truncate = vn_truncate,
+ .fo_ioctl = vn_ioctl,
+ .fo_poll = vn_poll,
+ .fo_kqfilter = vn_kqfilter,
+ .fo_stat = vn_statfile,
+ .fo_close = vn_closefile,
+ .fo_chmod = vn_chmod,
+ .fo_chown = vn_chown,
+ .fo_sendfile = vn_sendfile,
+ .fo_seek = vn_seek,
+ .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
+};
+
+int
+vn_open(ndp, flagp, cmode, fp)
+ struct nameidata *ndp;
+ int *flagp, cmode;
+ struct file *fp;
+{
+ struct thread *td = ndp->ni_cnd.cn_thread;
+
+ return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
+}
+
+/*
+ * Common code for vnode open operations via a name lookup.
+ * Lookup the vnode and invoke VOP_CREATE if needed.
+ * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
+ *
+ * Note that this does NOT free nameidata for the successful case,
+ * due to the NDINIT being done elsewhere.
+ */
+int
+vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
+ struct ucred *cred, struct file *fp)
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct thread *td = ndp->ni_cnd.cn_thread;
+ struct vattr vat;
+ struct vattr *vap = &vat;
+ int fmode, error;
+
+restart:
+ fmode = *flagp;
+ if (fmode & O_CREAT) {
+ ndp->ni_cnd.cn_nameiop = CREATE;
+ ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF;
+ if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
+ ndp->ni_cnd.cn_flags |= FOLLOW;
+ if (!(vn_open_flags & VN_OPEN_NOAUDIT))
+ ndp->ni_cnd.cn_flags |= AUDITVNODE1;
+ if (vn_open_flags & VN_OPEN_NOCAPCHECK)
+ ndp->ni_cnd.cn_flags |= NOCAPCHECK;
+ bwillwrite();
+ if ((error = namei(ndp)) != 0)
+ return (error);
+ if (ndp->ni_vp == NULL) {
+ VATTR_NULL(vap);
+ vap->va_type = VREG;
+ vap->va_mode = cmode;
+ if (fmode & O_EXCL)
+ vap->va_vaflags |= VA_EXCLUSIVE;
+ if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
+ NDFREE(ndp, NDF_ONLY_PNBUF);
+ vput(ndp->ni_dvp);
+ if ((error = vn_start_write(NULL, &mp,
+ V_XSLEEP | PCATCH)) != 0)
+ return (error);
+ goto restart;
+ }
+#ifdef MAC
+ error = mac_vnode_check_create(cred, ndp->ni_dvp,
+ &ndp->ni_cnd, vap);
+ if (error == 0)
+#endif
+ error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
+ &ndp->ni_cnd, vap);
+ vput(ndp->ni_dvp);
+ vn_finished_write(mp);
+ if (error) {
+ NDFREE(ndp, NDF_ONLY_PNBUF);
+ return (error);
+ }
+ fmode &= ~O_TRUNC;
+ vp = ndp->ni_vp;
+ } else {
+ if (ndp->ni_dvp == ndp->ni_vp)
+ vrele(ndp->ni_dvp);
+ else
+ vput(ndp->ni_dvp);
+ ndp->ni_dvp = NULL;
+ vp = ndp->ni_vp;
+ if (fmode & O_EXCL) {
+ error = EEXIST;
+ goto bad;
+ }
+ fmode &= ~O_CREAT;
+ }
+ } else {
+ ndp->ni_cnd.cn_nameiop = LOOKUP;
+ ndp->ni_cnd.cn_flags = ISOPEN |
+ ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
+ if (!(fmode & FWRITE))
+ ndp->ni_cnd.cn_flags |= LOCKSHARED;
+ if (!(vn_open_flags & VN_OPEN_NOAUDIT))
+ ndp->ni_cnd.cn_flags |= AUDITVNODE1;
+ if (vn_open_flags & VN_OPEN_NOCAPCHECK)
+ ndp->ni_cnd.cn_flags |= NOCAPCHECK;
+ if ((error = namei(ndp)) != 0)
+ return (error);
+ vp = ndp->ni_vp;
+ }
+ error = vn_open_vnode(vp, fmode, cred, td, fp);
+ if (error)
+ goto bad;
+ *flagp = fmode;
+ return (0);
+bad:
+ NDFREE(ndp, NDF_ONLY_PNBUF);
+ vput(vp);
+ *flagp = fmode;
+ ndp->ni_vp = NULL;
+ return (error);
+}
+
+/*
+ * Common code for vnode open operations once a vnode is located.
+ * Check permissions, and call the VOP_OPEN routine.
+ */
+int
+vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
+ struct thread *td, struct file *fp)
+{
+ struct mount *mp;
+ accmode_t accmode;
+ struct flock lf;
+ int error, have_flock, lock_flags, type;
+
+ if (vp->v_type == VLNK)
+ return (EMLINK);
+ if (vp->v_type == VSOCK)
+ return (EOPNOTSUPP);
+ if (vp->v_type != VDIR && fmode & O_DIRECTORY)
+ return (ENOTDIR);
+ accmode = 0;
+ if (fmode & (FWRITE | O_TRUNC)) {
+ if (vp->v_type == VDIR)
+ return (EISDIR);
+ accmode |= VWRITE;
+ }
+ if (fmode & FREAD)
+ accmode |= VREAD;
+ if (fmode & FEXEC)
+ accmode |= VEXEC;
+ if ((fmode & O_APPEND) && (fmode & FWRITE))
+ accmode |= VAPPEND;
+#ifdef MAC
+ error = mac_vnode_check_open(cred, vp, accmode);
+ if (error)
+ return (error);
+#endif
+ if ((fmode & O_CREAT) == 0) {
+ if (accmode & VWRITE) {
+ error = vn_writechk(vp);
+ if (error)
+ return (error);
+ }
+ if (accmode) {
+ error = VOP_ACCESS(vp, accmode, cred, td);
+ if (error)
+ return (error);
+ }
+ }
+ if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
+ return (error);
+
+ if (fmode & (O_EXLOCK | O_SHLOCK)) {
+ KASSERT(fp != NULL, ("open with flock requires fp"));
+ lock_flags = VOP_ISLOCKED(vp);
+ VOP_UNLOCK(vp, 0);
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ if (fmode & O_EXLOCK)
+ lf.l_type = F_WRLCK;
+ else
+ lf.l_type = F_RDLCK;
+ type = F_FLOCK;
+ if ((fmode & FNONBLOCK) == 0)
+ type |= F_WAIT;
+ error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
+ have_flock = (error == 0);
+ vn_lock(vp, lock_flags | LK_RETRY);
+ if (error == 0 && vp->v_iflag & VI_DOOMED)
+ error = ENOENT;
+ /*
+ * Another thread might have used this vnode as an
+ * executable while the vnode lock was dropped.
+ * Ensure the vnode is still able to be opened for
+ * writing after the lock has been obtained.
+ */
+ if (error == 0 && accmode & VWRITE)
+ error = vn_writechk(vp);
+ if (error) {
+ VOP_UNLOCK(vp, 0);
+ if (have_flock) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf,
+ F_FLOCK);
+ }
+ vn_start_write(vp, &mp, V_WAIT);
+ vn_lock(vp, lock_flags | LK_RETRY);
+ (void)VOP_CLOSE(vp, fmode, cred, td);
+ vn_finished_write(mp);
+ return (error);
+ }
+ fp->f_flag |= FHASLOCK;
+ }
+ if (fmode & FWRITE) {
+ VOP_ADD_WRITECOUNT(vp, 1);
+ CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
+ __func__, vp, vp->v_writecount);
+ }
+ ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
+ return (0);
+}
+
+/*
+ * Check for write permissions on the specified vnode.
+ * Prototype text segments cannot be written.
+ */
+int
+vn_writechk(vp)
+ register struct vnode *vp;
+{
+
+ ASSERT_VOP_LOCKED(vp, "vn_writechk");
+ /*
+ * If there's shared text associated with
+ * the vnode, try to free it up once. If
+ * we fail, we can't allow writing.
+ */
+ if (VOP_IS_TEXT(vp))
+ return (ETXTBSY);
+
+ return (0);
+}
+
+/*
+ * Vnode close call
+ */
+int
+vn_close(vp, flags, file_cred, td)
+ register struct vnode *vp;
+ int flags;
+ struct ucred *file_cred;
+ struct thread *td;
+{
+ struct mount *mp;
+ int error, lock_flags;
+
+ if (!(flags & FWRITE) && vp->v_mount != NULL &&
+ vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED)
+ lock_flags = LK_SHARED;
+ else
+ lock_flags = LK_EXCLUSIVE;
+
+ vn_start_write(vp, &mp, V_WAIT);
+ vn_lock(vp, lock_flags | LK_RETRY);
+ if (flags & FWRITE) {
+ VNASSERT(vp->v_writecount > 0, vp,
+ ("vn_close: negative writecount"));
+ VOP_ADD_WRITECOUNT(vp, -1);
+ CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
+ __func__, vp, vp->v_writecount);
+ }
+ error = VOP_CLOSE(vp, flags, file_cred, td);
+ vput(vp);
+ vn_finished_write(mp);
+ return (error);
+}
+
+/*
+ * Heuristic to detect sequential operation.
+ */
+static int
+sequential_heuristic(struct uio *uio, struct file *fp)
+{
+
+ if (atomic_load_acq_int(&(fp->f_flag)) & FRDAHEAD)
+ return (fp->f_seqcount << IO_SEQSHIFT);
+
+ /*
+ * Offset 0 is handled specially. open() sets f_seqcount to 1 so
+ * that the first I/O is normally considered to be slightly
+ * sequential. Seeking to offset 0 doesn't change sequentiality
+ * unless previous seeks have reduced f_seqcount to 0, in which
+ * case offset 0 is not special.
+ */
+ if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
+ uio->uio_offset == fp->f_nextoff) {
+ /*
+ * f_seqcount is in units of fixed-size blocks so that it
+ * depends mainly on the amount of sequential I/O and not
+ * much on the number of sequential I/O's. The fixed size
+ * of 16384 is hard-coded here since it is (not quite) just
+ * a magic size that works well here. This size is more
+ * closely related to the best I/O size for real disks than
+ * to any block size used by software.
+ */
+ fp->f_seqcount += howmany(uio->uio_resid, 16384);
+ if (fp->f_seqcount > IO_SEQMAX)
+ fp->f_seqcount = IO_SEQMAX;
+ return (fp->f_seqcount << IO_SEQSHIFT);
+ }
+
+ /* Not sequential. Quickly draw-down sequentiality. */
+ if (fp->f_seqcount > 1)
+ fp->f_seqcount = 1;
+ else
+ fp->f_seqcount = 0;
+ return (0);
+}
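A worked example of this arithmetic, for illustration only and using nothing
beyond the constants visible above:

	/*
	 * Back-to-back 64 KB reads each add howmany(65536, 16384) == 4 to
	 * f_seqcount, so the counter saturates at IO_SEQMAX after roughly
	 * IO_SEQMAX / 4 such reads and the heuristic keeps returning the
	 * maximum read-ahead hint (IO_SEQMAX << IO_SEQSHIFT) in the ioflag
	 * bits.  A single read at an unrelated offset then drops f_seqcount
	 * back to 1 (or to 0 if it was already at 1), and the ramp-up
	 * starts over.
	 */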
+
+/*
+ * Package up an I/O request on a vnode into a uio and do it.
+ */
+int
+vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
+ enum uio_seg segflg, int ioflg, struct ucred *active_cred,
+ struct ucred *file_cred, ssize_t *aresid, struct thread *td)
+{
+ struct uio auio;
+ struct iovec aiov;
+ struct mount *mp;
+ struct ucred *cred;
+ void *rl_cookie;
+ int error, lock_flags;
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ aiov.iov_base = base;
+ aiov.iov_len = len;
+ auio.uio_resid = len;
+ auio.uio_offset = offset;
+ auio.uio_segflg = segflg;
+ auio.uio_rw = rw;
+ auio.uio_td = td;
+ error = 0;
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ if (rw == UIO_READ) {
+ rl_cookie = vn_rangelock_rlock(vp, offset,
+ offset + len);
+ } else {
+ rl_cookie = vn_rangelock_wlock(vp, offset,
+ offset + len);
+ }
+ mp = NULL;
+ if (rw == UIO_WRITE) {
+ if (vp->v_type != VCHR &&
+ (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
+ != 0)
+ goto out;
+ if (MNT_SHARED_WRITES(mp) ||
+ ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
+ lock_flags = LK_SHARED;
+ else
+ lock_flags = LK_EXCLUSIVE;
+ } else
+ lock_flags = LK_SHARED;
+ vn_lock(vp, lock_flags | LK_RETRY);
+ } else
+ rl_cookie = NULL;
+
+ ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
+#ifdef MAC
+ if ((ioflg & IO_NOMACCHECK) == 0) {
+ if (rw == UIO_READ)
+ error = mac_vnode_check_read(active_cred, file_cred,
+ vp);
+ else
+ error = mac_vnode_check_write(active_cred, file_cred,
+ vp);
+ }
+#endif
+ if (error == 0) {
+ if (file_cred != NULL)
+ cred = file_cred;
+ else
+ cred = active_cred;
+ if (rw == UIO_READ)
+ error = VOP_READ(vp, &auio, ioflg, cred);
+ else
+ error = VOP_WRITE(vp, &auio, ioflg, cred);
+ }
+ if (aresid)
+ *aresid = auio.uio_resid;
+ else
+ if (auio.uio_resid && error == 0)
+ error = EIO;
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ VOP_UNLOCK(vp, 0);
+ if (mp != NULL)
+ vn_finished_write(mp);
+ }
+ out:
+ if (rl_cookie != NULL)
+ vn_rangelock_unlock(vp, rl_cookie);
+ return (error);
+}
+
+/*
+ * Package up an I/O request on a vnode into a uio and do it. The I/O
+ * request is split up into smaller chunks and we try to avoid saturating
+ * the buffer cache while potentially holding a vnode locked, so we
+ * check bwillwrite() before calling vn_rdwr(). We also call kern_yield()
+ * to give other processes a chance to lock the vnode (either other processes
+ * core'ing the same binary, or unrelated processes scanning the directory).
+ */
+int
+vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
+ file_cred, aresid, td)
+ enum uio_rw rw;
+ struct vnode *vp;
+ void *base;
+ size_t len;
+ off_t offset;
+ enum uio_seg segflg;
+ int ioflg;
+ struct ucred *active_cred;
+ struct ucred *file_cred;
+ size_t *aresid;
+ struct thread *td;
+{
+ int error = 0;
+ ssize_t iaresid;
+
+ do {
+ int chunk;
+
+ /*
+ * Force `offset' to a multiple of MAXBSIZE except possibly
+ * for the first chunk, so that filesystems only need to
+ * write full blocks except possibly for the first and last
+ * chunks.
+ */
+ chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
+
+ if (chunk > len)
+ chunk = len;
+ if (rw != UIO_READ && vp->v_type == VREG)
+ bwillwrite();
+ iaresid = 0;
+ error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
+ ioflg, active_cred, file_cred, &iaresid, td);
+ len -= chunk; /* aresid calc already includes length */
+ if (error)
+ break;
+ offset += chunk;
+ base = (char *)base + chunk;
+ kern_yield(PRI_USER);
+ } while (len);
+ if (aresid)
+ *aresid = len + iaresid;
+ return (error);
+}
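To make the chunking arithmetic concrete, a worked example assuming the
customary MAXBSIZE of 65536 bytes:

	/*
	 * offset = 1000, len = 200000:
	 *   chunk 1: 65536 - 1000 % 65536 = 64536 bytes, ending at offset 65536
	 *   chunk 2: 65536 bytes (full, aligned block), ending at 131072
	 *   chunk 3: 65536 bytes, ending at 196608
	 *   chunk 4: the remaining 4392 bytes, ending at 201000
	 * Only the first and last chunks are partial blocks; everything in
	 * between is transferred as full, MAXBSIZE-aligned blocks.
	 */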
+
+off_t
+foffset_lock(struct file *fp, int flags)
+{
+ struct mtx *mtxp;
+ off_t res;
+
+ KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
+
+#if OFF_MAX <= LONG_MAX
+ /*
+ * Caller only wants the current f_offset value. Assume that
+ * reads of long and shorter integer types are atomic.
+ */
+ if ((flags & FOF_NOLOCK) != 0)
+ return (fp->f_offset);
+#endif
+
+ /*
+ * According to McKusick the vn lock was protecting f_offset here.
+ * It is now protected by the FOFFSET_LOCKED flag.
+ */
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ if ((flags & FOF_NOLOCK) == 0) {
+ while (fp->f_vnread_flags & FOFFSET_LOCKED) {
+ fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+ msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
+ "vofflock", 0);
+ }
+ fp->f_vnread_flags |= FOFFSET_LOCKED;
+ }
+ res = fp->f_offset;
+ mtx_unlock(mtxp);
+ return (res);
+}
+
+void
+foffset_unlock(struct file *fp, off_t val, int flags)
+{
+ struct mtx *mtxp;
+
+ KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
+
+#if OFF_MAX <= LONG_MAX
+ if ((flags & FOF_NOLOCK) != 0) {
+ if ((flags & FOF_NOUPDATE) == 0)
+ fp->f_offset = val;
+ if ((flags & FOF_NEXTOFF) != 0)
+ fp->f_nextoff = val;
+ return;
+ }
+#endif
+
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ if ((flags & FOF_NOUPDATE) == 0)
+ fp->f_offset = val;
+ if ((flags & FOF_NEXTOFF) != 0)
+ fp->f_nextoff = val;
+ if ((flags & FOF_NOLOCK) == 0) {
+ KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
+ ("Lost FOFFSET_LOCKED"));
+ if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
+ wakeup(&fp->f_vnread_flags);
+ fp->f_vnread_flags = 0;
+ }
+ mtx_unlock(mtxp);
+}
+
+void
+foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
+{
+
+ if ((flags & FOF_OFFSET) == 0)
+ uio->uio_offset = foffset_lock(fp, flags);
+}
+
+void
+foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
+{
+
+ if ((flags & FOF_OFFSET) == 0)
+ foffset_unlock(fp, uio->uio_offset, flags);
+}
+
+static int
+get_advice(struct file *fp, struct uio *uio)
+{
+ struct mtx *mtxp;
+ int ret;
+
+ ret = POSIX_FADV_NORMAL;
+ if (fp->f_advice == NULL)
+ return (ret);
+
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ if (uio->uio_offset >= fp->f_advice->fa_start &&
+ uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
+ ret = fp->f_advice->fa_advice;
+ mtx_unlock(mtxp);
+ return (ret);
+}
+
+/*
+ * File table vnode read routine.
+ */
+static int
+vn_read(fp, uio, active_cred, flags, td)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *active_cred;
+ int flags;
+ struct thread *td;
+{
+ struct vnode *vp;
+ struct mtx *mtxp;
+ int error, ioflag;
+ int advice;
+ off_t offset, start, end;
+
+ KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
+ uio->uio_td, td));
+ KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
+ vp = fp->f_vnode;
+ ioflag = 0;
+ if (fp->f_flag & FNONBLOCK)
+ ioflag |= IO_NDELAY;
+ if (fp->f_flag & O_DIRECT)
+ ioflag |= IO_DIRECT;
+ advice = get_advice(fp, uio);
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_NOREUSE:
+ ioflag |= sequential_heuristic(uio, fp);
+ break;
+ case POSIX_FADV_RANDOM:
+ /* Disable read-ahead for random I/O. */
+ break;
+ }
+ offset = uio->uio_offset;
+
+#ifdef MAC
+ error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
+ if (error == 0)
+#endif
+ error = VOP_READ(vp, uio, ioflag, fp->f_cred);
+ fp->f_nextoff = uio->uio_offset;
+ VOP_UNLOCK(vp, 0);
+ if (error == 0 && advice == POSIX_FADV_NOREUSE &&
+ offset != uio->uio_offset) {
+ /*
+ * Use POSIX_FADV_DONTNEED to flush clean pages and
+ * buffers for the backing file after a
+ * POSIX_FADV_NOREUSE read(2). To optimize the common
+ * case of using POSIX_FADV_NOREUSE with sequential
+ * access, track the previous implicit DONTNEED
+ * request and grow this request to include the
+ * current read(2) in addition to the previous
+ * DONTNEED. With purely sequential access this will
+ * cause the DONTNEED requests to continuously grow to
+ * cover all of the previously read regions of the
+ * file. This allows filesystem blocks that are
+ * accessed by multiple calls to read(2) to be flushed
+ * once the last read(2) finishes.
+ */
+ start = offset;
+ end = uio->uio_offset - 1;
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ if (fp->f_advice != NULL &&
+ fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
+ if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
+ start = fp->f_advice->fa_prevstart;
+ else if (fp->f_advice->fa_prevstart != 0 &&
+ fp->f_advice->fa_prevstart == end + 1)
+ end = fp->f_advice->fa_prevend;
+ fp->f_advice->fa_prevstart = start;
+ fp->f_advice->fa_prevend = end;
+ }
+ mtx_unlock(mtxp);
+ error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
+ }
+ return (error);
+}
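The growth of the implicit DONTNEED range described in the comment above can
be traced with a small example (offsets chosen purely for illustration):

	/*
	 * Three sequential 64 KB reads on a descriptor marked
	 * POSIX_FADV_NOREUSE:
	 *   read 1 covers [0, 65535]:       fa_prevstart/fa_prevend become
	 *                                   0/65535; DONTNEED is issued for
	 *                                   [0, 65535]
	 *   read 2 covers [65536, 131071]:  fa_prevend + 1 == 65536, so start
	 *                                   is pulled back to 0; DONTNEED now
	 *                                   covers [0, 131071]
	 *   read 3 covers [131072, 196607]: the range keeps growing, so the
	 *                                   whole previously read region is
	 *                                   flushed once its buffers are clean
	 */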
+
+/*
+ * File table vnode write routine.
+ */
+static int
+vn_write(fp, uio, active_cred, flags, td)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *active_cred;
+ int flags;
+ struct thread *td;
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct mtx *mtxp;
+ int error, ioflag, lock_flags;
+ int advice;
+ off_t offset, start, end;
+
+ KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
+ uio->uio_td, td));
+ KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
+ vp = fp->f_vnode;
+ if (vp->v_type == VREG)
+ bwillwrite();
+ ioflag = IO_UNIT;
+ if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
+ ioflag |= IO_APPEND;
+ if (fp->f_flag & FNONBLOCK)
+ ioflag |= IO_NDELAY;
+ if (fp->f_flag & O_DIRECT)
+ ioflag |= IO_DIRECT;
+ if ((fp->f_flag & O_FSYNC) ||
+ (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
+ ioflag |= IO_SYNC;
+ mp = NULL;
+ if (vp->v_type != VCHR &&
+ (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ goto unlock;
+
+ advice = get_advice(fp, uio);
+
+ if (MNT_SHARED_WRITES(mp) ||
+ (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
+ lock_flags = LK_SHARED;
+ } else {
+ lock_flags = LK_EXCLUSIVE;
+ }
+
+ vn_lock(vp, lock_flags | LK_RETRY);
+ switch (advice) {
+ case POSIX_FADV_NORMAL:
+ case POSIX_FADV_SEQUENTIAL:
+ case POSIX_FADV_NOREUSE:
+ ioflag |= sequential_heuristic(uio, fp);
+ break;
+ case POSIX_FADV_RANDOM:
+ /* XXX: Is this correct? */
+ break;
+ }
+ offset = uio->uio_offset;
+
+#ifdef MAC
+ error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
+ if (error == 0)
+#endif
+ error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
+ fp->f_nextoff = uio->uio_offset;
+ VOP_UNLOCK(vp, 0);
+ if (vp->v_type != VCHR)
+ vn_finished_write(mp);
+ if (error == 0 && advice == POSIX_FADV_NOREUSE &&
+ offset != uio->uio_offset) {
+ /*
+ * Use POSIX_FADV_DONTNEED to flush clean pages and
+ * buffers for the backing file after a
+ * POSIX_FADV_NOREUSE write(2). To optimize the
+ * common case of using POSIX_FADV_NOREUSE with
+ * sequential access, track the previous implicit
+ * DONTNEED request and grow this request to include
+ * the current write(2) in addition to the previous
+ * DONTNEED. With purely sequential access this will
+ * cause the DONTNEED requests to continuously grow to
+ * cover all of the previously written regions of the
+ * file.
+ *
+ * Note that the blocks just written are almost
+ * certainly still dirty, so this only works when
+ * VOP_ADVISE() calls from subsequent writes push out
+ * the data written by this write(2) once the backing
+ * buffers are clean. However, as compared to forcing
+ * IO_DIRECT, this gives much saner behavior. Write
+ * clustering is still allowed, and clean pages are
+ * merely moved to the cache page queue rather than
+ * outright thrown away. This means a subsequent
+ * read(2) can still avoid hitting the disk if the
+ * pages have not been reclaimed.
+ *
+ * This does make POSIX_FADV_NOREUSE largely useless
+ * with non-sequential access. However, sequential
+ * access is the more common use case and the flag is
+ * merely advisory.
+ */
+ start = offset;
+ end = uio->uio_offset - 1;
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ if (fp->f_advice != NULL &&
+ fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
+ if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
+ start = fp->f_advice->fa_prevstart;
+ else if (fp->f_advice->fa_prevstart != 0 &&
+ fp->f_advice->fa_prevstart == end + 1)
+ end = fp->f_advice->fa_prevend;
+ fp->f_advice->fa_prevstart = start;
+ fp->f_advice->fa_prevend = end;
+ }
+ mtx_unlock(mtxp);
+ error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
+ }
+
+unlock:
+ return (error);
+}
+
+static const int io_hold_cnt = 16;
+static int vn_io_fault_enable = 1;
+SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
+ &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
+static u_long vn_io_faults_cnt;
+SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
+ &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
+
+/*
+ * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
+ * prevent the following deadlock:
+ *
+ * Assume that thread A reads from vnode vp1 into a userspace
+ * buffer buf1 backed by the pages of vnode vp2. If a page in buf1 is
+ * not currently resident, then the system ends up with the call chain
+ * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
+ * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
+ * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
+ * If, at the same time, thread B reads from vnode vp2 into buffer buf2
+ * backed by the pages of vnode vp1, and some page in buf2 is not
+ * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
+ *
+ * To prevent the lock order reversal and deadlock, vn_io_fault() does
+ * not allow page faults to happen during VOP_READ() or VOP_WRITE().
+ * Instead, it first tries to do the whole range i/o with pagefaults
+ * disabled. If all pages in the i/o buffer are resident and mapped,
+ * VOP will succeed (ignoring the genuine filesystem errors).
+ * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
+ * i/o in chunks, with all pages in the chunk prefaulted and held
+ * using vm_fault_quick_hold_pages().
+ *
+ * Filesystems using this deadlock avoidance scheme should use the
+ * array of the held pages from uio, saved in the curthread->td_ma,
+ * instead of doing uiomove(). A helper function
+ * vn_io_fault_uiomove() converts uiomove request into
+ * uiomove_fromphys() over td_ma array.
+ *
+ * Since vnode locks do not cover the whole i/o anymore, rangelocks
+ * make the current i/o request atomic with respect to other i/os and
+ * truncations.
+ */
+static int
+vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td)
+{
+ vm_page_t ma[io_hold_cnt + 2];
+ struct uio *uio_clone, short_uio;
+ struct iovec short_iovec[1];
+ fo_rdwr_t *doio;
+ struct vnode *vp;
+ void *rl_cookie;
+ struct mount *mp;
+ vm_page_t *prev_td_ma;
+ int cnt, error, save, saveheld, prev_td_ma_cnt;
+ vm_offset_t addr, end;
+ vm_prot_t prot;
+ size_t len, resid;
+ ssize_t adv;
+
+ if (uio->uio_rw == UIO_READ)
+ doio = vn_read;
+ else
+ doio = vn_write;
+ vp = fp->f_vnode;
+ foffset_lock_uio(fp, uio, flags);
+
+ if (uio->uio_segflg != UIO_USERSPACE || vp->v_type != VREG ||
+ ((mp = vp->v_mount) != NULL &&
+ (mp->mnt_kern_flag & MNTK_NO_IOPF) == 0) ||
+ !vn_io_fault_enable) {
+ error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
+ goto out_last;
+ }
+
+ /*
+ * UFS follows the IO_UNIT directive and rolls back both
+ * uio_offset and uio_resid if an error is encountered during the
+ * operation. But, since the iovec may already be advanced,
+ * uio is still in an inconsistent state.
+ *
+ * Cache a copy of the original uio, which is advanced to the redo
+ * point using UIO_NOCOPY below.
+ */
+ uio_clone = cloneuio(uio);
+ resid = uio->uio_resid;
+
+ short_uio.uio_segflg = UIO_USERSPACE;
+ short_uio.uio_rw = uio->uio_rw;
+ short_uio.uio_td = uio->uio_td;
+
+ if (uio->uio_rw == UIO_READ) {
+ prot = VM_PROT_WRITE;
+ rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
+ uio->uio_offset + uio->uio_resid);
+ } else {
+ prot = VM_PROT_READ;
+ if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0)
+ /* For appenders, punt and lock the whole range. */
+ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+ else
+ rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
+ uio->uio_offset + uio->uio_resid);
+ }
+
+ save = vm_fault_disable_pagefaults();
+ error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
+ if (error != EFAULT)
+ goto out;
+
+ atomic_add_long(&vn_io_faults_cnt, 1);
+ uio_clone->uio_segflg = UIO_NOCOPY;
+ uiomove(NULL, resid - uio->uio_resid, uio_clone);
+ uio_clone->uio_segflg = uio->uio_segflg;
+
+ saveheld = curthread_pflags_set(TDP_UIOHELD);
+ prev_td_ma = td->td_ma;
+ prev_td_ma_cnt = td->td_ma_cnt;
+
+ while (uio_clone->uio_resid != 0) {
+ len = uio_clone->uio_iov->iov_len;
+ if (len == 0) {
+ KASSERT(uio_clone->uio_iovcnt >= 1,
+ ("iovcnt underflow"));
+ uio_clone->uio_iov++;
+ uio_clone->uio_iovcnt--;
+ continue;
+ }
+
+ addr = (vm_offset_t)uio_clone->uio_iov->iov_base;
+ end = round_page(addr + len);
+ cnt = howmany(end - trunc_page(addr), PAGE_SIZE);
+ /*
+ * A perfectly misaligned address and length could cause
+ * both the start and the end of the chunk to use a partial
+ * page. The +2 accounts for such a situation.
+ */
+ if (cnt > io_hold_cnt + 2) {
+ len = io_hold_cnt * PAGE_SIZE;
+ KASSERT(howmany(round_page(addr + len) -
+ trunc_page(addr), PAGE_SIZE) <= io_hold_cnt + 2,
+ ("cnt overflow"));
+ }
+ cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
+ addr, len, prot, ma, io_hold_cnt + 2);
+ if (cnt == -1) {
+ error = EFAULT;
+ break;
+ }
+ short_uio.uio_iov = &short_iovec[0];
+ short_iovec[0].iov_base = (void *)addr;
+ short_uio.uio_iovcnt = 1;
+ short_uio.uio_resid = short_iovec[0].iov_len = len;
+ short_uio.uio_offset = uio_clone->uio_offset;
+ td->td_ma = ma;
+ td->td_ma_cnt = cnt;
+
+ error = doio(fp, &short_uio, active_cred, flags | FOF_OFFSET,
+ td);
+ vm_page_unhold_pages(ma, cnt);
+ adv = len - short_uio.uio_resid;
+
+ uio_clone->uio_iov->iov_base =
+ (char *)uio_clone->uio_iov->iov_base + adv;
+ uio_clone->uio_iov->iov_len -= adv;
+ uio_clone->uio_resid -= adv;
+ uio_clone->uio_offset += adv;
+
+ uio->uio_resid -= adv;
+ uio->uio_offset += adv;
+
+ if (error != 0 || adv == 0)
+ break;
+ }
+ td->td_ma = prev_td_ma;
+ td->td_ma_cnt = prev_td_ma_cnt;
+ curthread_pflags_restore(saveheld);
+out:
+ vm_fault_enable_pagefaults(save);
+ vn_rangelock_unlock(vp, rl_cookie);
+ free(uio_clone, M_IOV);
+out_last:
+ foffset_unlock_uio(fp, uio, flags);
+ return (error);
+}
+
+/*
+ * Helper function to perform the requested uiomove operation using
+ * the held pages for io->uio_iov[0].iov_base buffer instead of
+ * copyin/copyout. Access to the pages with uiomove_fromphys()
+ * instead of iov_base prevents page faults that could occur due to
+ * pmap_collect() invalidating the mapping created by
+ * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
+ * object cleanup revoking the write access from page mappings.
+ *
+ * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
+ * instead of plain uiomove().
+ */
+int
+vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
+{
+ struct uio transp_uio;
+ struct iovec transp_iov[1];
+ struct thread *td;
+ size_t adv;
+ int error, pgadv;
+
+ td = curthread;
+ if ((td->td_pflags & TDP_UIOHELD) == 0 ||
+ uio->uio_segflg != UIO_USERSPACE)
+ return (uiomove(data, xfersize, uio));
+
+ KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
+ transp_iov[0].iov_base = data;
+ transp_uio.uio_iov = &transp_iov[0];
+ transp_uio.uio_iovcnt = 1;
+ if (xfersize > uio->uio_resid)
+ xfersize = uio->uio_resid;
+ transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
+ transp_uio.uio_offset = 0;
+ transp_uio.uio_segflg = UIO_SYSSPACE;
+ /*
+ * Since transp_iov points to data, and td_ma page array
+ * corresponds to original uio->uio_iov, we need to invert the
+ * direction of the i/o operation as passed to
+ * uiomove_fromphys().
+ */
+ switch (uio->uio_rw) {
+ case UIO_WRITE:
+ transp_uio.uio_rw = UIO_READ;
+ break;
+ case UIO_READ:
+ transp_uio.uio_rw = UIO_WRITE;
+ break;
+ }
+ transp_uio.uio_td = uio->uio_td;
+ error = uiomove_fromphys(td->td_ma,
+ ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
+ xfersize, &transp_uio);
+ adv = xfersize - transp_uio.uio_resid;
+ pgadv =
+ (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
+ (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
+ td->td_ma += pgadv;
+ KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
+ pgadv));
+ td->td_ma_cnt -= pgadv;
+ uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
+ uio->uio_iov->iov_len -= adv;
+ uio->uio_resid -= adv;
+ uio->uio_offset += adv;
+ return (error);
+}
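A filesystem that sets MNTK_NO_IOPF funnels its copy-out through
vn_io_fault_uiomove() instead of uiomove(). The fragment below is a sketch
only; "examplefs" and its buffer handling are hypothetical, and just the
vn_io_fault_uiomove() call reflects the interface above.

	static int
	examplefs_read_buf(struct buf *bp, long off_in_buf, struct uio *uio)
	{
		int xfersize;

		/* Copy at most what remains in this buffer and in the request. */
		xfersize = bp->b_bcount - off_in_buf;
		if (xfersize > uio->uio_resid)
			xfersize = uio->uio_resid;
		/*
		 * With vn_io_fault() driving the request this resolves to
		 * uiomove_fromphys() over the pre-held pages in td_ma;
		 * otherwise it falls back to a plain uiomove().
		 */
		return (vn_io_fault_uiomove((char *)bp->b_data + off_in_buf,
		    xfersize, uio));
	}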
+
+int
+vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
+ struct uio *uio)
+{
+ struct thread *td;
+ vm_offset_t iov_base;
+ int cnt, pgadv;
+
+ td = curthread;
+ if ((td->td_pflags & TDP_UIOHELD) == 0 ||
+ uio->uio_segflg != UIO_USERSPACE)
+ return (uiomove_fromphys(ma, offset, xfersize, uio));
+
+ KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
+ cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
+ iov_base = (vm_offset_t)uio->uio_iov->iov_base;
+ switch (uio->uio_rw) {
+ case UIO_WRITE:
+ pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
+ offset, cnt);
+ break;
+ case UIO_READ:
+ pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
+ cnt);
+ break;
+ }
+ pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
+ td->td_ma += pgadv;
+ KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
+ pgadv));
+ td->td_ma_cnt -= pgadv;
+ uio->uio_iov->iov_base = (char *)(iov_base + cnt);
+ uio->uio_iov->iov_len -= cnt;
+ uio->uio_resid -= cnt;
+ uio->uio_offset += cnt;
+ return (0);
+}
+
+
+/*
+ * File table truncate routine.
+ */
+static int
+vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct vattr vattr;
+ struct mount *mp;
+ struct vnode *vp;
+ void *rl_cookie;
+ int error;
+
+ vp = fp->f_vnode;
+
+ /*
+ * Lock the whole range for truncation. Otherwise split i/o
+ * might happen partly before and partly after the truncation.
+ */
+ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error)
+ goto out1;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ if (vp->v_type == VDIR) {
+ error = EISDIR;
+ goto out;
+ }
+#ifdef MAC
+ error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
+ if (error)
+ goto out;
+#endif
+ error = vn_writechk(vp);
+ if (error == 0) {
+ VATTR_NULL(&vattr);
+ vattr.va_size = length;
+ error = VOP_SETATTR(vp, &vattr, fp->f_cred);
+ }
+out:
+ VOP_UNLOCK(vp, 0);
+ vn_finished_write(mp);
+out1:
+ vn_rangelock_unlock(vp, rl_cookie);
+ return (error);
+}
+
+/*
+ * File table vnode stat routine.
+ */
+static int
+vn_statfile(fp, sb, active_cred, td)
+ struct file *fp;
+ struct stat *sb;
+ struct ucred *active_cred;
+ struct thread *td;
+{
+ struct vnode *vp = fp->f_vnode;
+ int error;
+
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
+ VOP_UNLOCK(vp, 0);
+
+ return (error);
+}
+
+/*
+ * Stat a vnode; implementation for the stat syscall
+ */
+int
+vn_stat(vp, sb, active_cred, file_cred, td)
+ struct vnode *vp;
+ register struct stat *sb;
+ struct ucred *active_cred;
+ struct ucred *file_cred;
+ struct thread *td;
+{
+ struct vattr vattr;
+ register struct vattr *vap;
+ int error;
+ u_short mode;
+
+#ifdef MAC
+ error = mac_vnode_check_stat(active_cred, file_cred, vp);
+ if (error)
+ return (error);
+#endif
+
+ vap = &vattr;
+
+ /*
+ * Initialize defaults for new and unusual fields, so that file
+ * systems which don't support these fields don't need to know
+ * about them.
+ */
+ vap->va_birthtime.tv_sec = -1;
+ vap->va_birthtime.tv_nsec = 0;
+ vap->va_fsid = VNOVAL;
+ vap->va_rdev = NODEV;
+
+ error = VOP_GETATTR(vp, vap, active_cred);
+ if (error)
+ return (error);
+
+ /*
+ * Zero the spare stat fields
+ */
+ bzero(sb, sizeof *sb);
+
+ /*
+ * Copy from vattr table
+ */
+ if (vap->va_fsid != VNOVAL)
+ sb->st_dev = vap->va_fsid;
+ else
+ sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
+ sb->st_ino = vap->va_fileid;
+ mode = vap->va_mode;
+ switch (vap->va_type) {
+ case VREG:
+ mode |= S_IFREG;
+ break;
+ case VDIR:
+ mode |= S_IFDIR;
+ break;
+ case VBLK:
+ mode |= S_IFBLK;
+ break;
+ case VCHR:
+ mode |= S_IFCHR;
+ break;
+ case VLNK:
+ mode |= S_IFLNK;
+ break;
+ case VSOCK:
+ mode |= S_IFSOCK;
+ break;
+ case VFIFO:
+ mode |= S_IFIFO;
+ break;
+ default:
+ return (EBADF);
+ };
+ sb->st_mode = mode;
+ sb->st_nlink = vap->va_nlink;
+ sb->st_uid = vap->va_uid;
+ sb->st_gid = vap->va_gid;
+ sb->st_rdev = vap->va_rdev;
+ if (vap->va_size > OFF_MAX)
+ return (EOVERFLOW);
+ sb->st_size = vap->va_size;
+ sb->st_atim = vap->va_atime;
+ sb->st_mtim = vap->va_mtime;
+ sb->st_ctim = vap->va_ctime;
+ sb->st_birthtim = vap->va_birthtime;
+
+ /*
+ * According to www.opengroup.org, the meaning of st_blksize is
+ * "a filesystem-specific preferred I/O block size for this
+ * object. In some filesystem types, this may vary from file
+ * to file"
+ * Use the minimum/default of PAGE_SIZE (e.g. for VCHR).
+ */
+
+ sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
+
+ sb->st_flags = vap->va_flags;
+ if (priv_check(td, PRIV_VFS_GENERATION))
+ sb->st_gen = 0;
+ else
+ sb->st_gen = vap->va_gen;
+
+ sb->st_blocks = vap->va_bytes / S_BLKSIZE;
+ return (0);
+}
+
+/*
+ * File table vnode ioctl routine.
+ */
+static int
+vn_ioctl(fp, com, data, active_cred, td)
+ struct file *fp;
+ u_long com;
+ void *data;
+ struct ucred *active_cred;
+ struct thread *td;
+{
+ struct vattr vattr;
+ struct vnode *vp;
+ int error;
+
+ vp = fp->f_vnode;
+ switch (vp->v_type) {
+ case VDIR:
+ case VREG:
+ switch (com) {
+ case FIONREAD:
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &vattr, active_cred);
+ VOP_UNLOCK(vp, 0);
+ if (error == 0)
+ *(int *)data = vattr.va_size - fp->f_offset;
+ return (error);
+ case FIONBIO:
+ case FIOASYNC:
+ return (0);
+ default:
+ return (VOP_IOCTL(vp, com, data, fp->f_flag,
+ active_cred, td));
+ }
+ default:
+ return (ENOTTY);
+ }
+}
+
+/*
+ * File table vnode poll routine.
+ */
+static int
+vn_poll(fp, events, active_cred, td)
+ struct file *fp;
+ int events;
+ struct ucred *active_cred;
+ struct thread *td;
+{
+ struct vnode *vp;
+ int error;
+
+ vp = fp->f_vnode;
+#ifdef MAC
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
+ VOP_UNLOCK(vp, 0);
+ if (!error)
+#endif
+
+ error = VOP_POLL(vp, events, fp->f_cred, td);
+ return (error);
+}
+
+/*
+ * Acquire the requested lock and then check for validity. LK_RETRY
+ * permits vn_lock to return doomed vnodes.
+ */
+int
+_vn_lock(struct vnode *vp, int flags, char *file, int line)
+{
+ int error;
+
+ VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
+ ("vn_lock called with no locktype."));
+ do {
+#ifdef DEBUG_VFS_LOCKS
+ KASSERT(vp->v_holdcnt != 0,
+ ("vn_lock %p: zero hold count", vp));
+#endif
+ error = VOP_LOCK1(vp, flags, file, line);
+ flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */
+ KASSERT((flags & LK_RETRY) == 0 || error == 0,
+ ("LK_RETRY set with incompatible flags (0x%x) or an error occurred (%d)",
+ flags, error));
+ /*
+ * Callers specify LK_RETRY if they wish to get dead vnodes.
+ * If RETRY is not set, we return ENOENT instead.
+ */
+ if (error == 0 && vp->v_iflag & VI_DOOMED &&
+ (flags & LK_RETRY) == 0) {
+ VOP_UNLOCK(vp, 0);
+ error = ENOENT;
+ break;
+ }
+ } while (flags & LK_RETRY && error != 0);
+ return (error);
+}
+
+/*
+ * File table vnode close routine.
+ */
+static int
+vn_closefile(fp, td)
+ struct file *fp;
+ struct thread *td;
+{
+ struct vnode *vp;
+ struct flock lf;
+ int error;
+
+ vp = fp->f_vnode;
+ fp->f_ops = &badfileops;
+
+ if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK)
+ vref(vp);
+
+ error = vn_close(vp, fp->f_flag, fp->f_cred, td);
+
+ if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ lf.l_type = F_UNLCK;
+ (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
+ vrele(vp);
+ }
+ return (error);
+}
+
+/*
+ * Preparing to start a filesystem write operation. If the operation is
+ * permitted, then we bump the count of operations in progress and
+ * proceed. If a suspend request is in progress, we wait until the
+ * suspension is over, and then proceed.
+ */
+static int
+vn_start_write_locked(struct mount *mp, int flags)
+{
+ int error;
+
+ mtx_assert(MNT_MTX(mp), MA_OWNED);
+ error = 0;
+
+ /*
+ * Check on status of suspension.
+ */
+ if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
+ mp->mnt_susp_owner != curthread) {
+ while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
+ if (flags & V_NOWAIT) {
+ error = EWOULDBLOCK;
+ goto unlock;
+ }
+ error = msleep(&mp->mnt_flag, MNT_MTX(mp),
+ (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
+ if (error)
+ goto unlock;
+ }
+ }
+ if (flags & V_XSLEEP)
+ goto unlock;
+ mp->mnt_writeopcount++;
+unlock:
+ if (error != 0 || (flags & V_XSLEEP) != 0)
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ return (error);
+}
+
+int
+vn_start_write(vp, mpp, flags)
+ struct vnode *vp;
+ struct mount **mpp;
+ int flags;
+{
+ struct mount *mp;
+ int error;
+
+ error = 0;
+ /*
+ * If a vnode is provided, get and return the mount point to
+ * which it will write.
+ */
+ if (vp != NULL) {
+ if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
+ *mpp = NULL;
+ if (error != EOPNOTSUPP)
+ return (error);
+ return (0);
+ }
+ }
+ if ((mp = *mpp) == NULL)
+ return (0);
+
+ /*
+ * VOP_GETWRITEMOUNT() returns with the mp refcount held through
+ * a vfs_ref().
+ * If no vnode was provided, the caller passed in the mount
+ * point directly, so take a reference on it ourselves in
+ * order to emulate a vfs_ref().
+ */
+ MNT_ILOCK(mp);
+ if (vp == NULL)
+ MNT_REF(mp);
+
+ return (vn_start_write_locked(mp, flags));
+}
+
+/*
+ * Secondary suspension. Used by operations such as vop_inactive
+ * routines that are needed by the higher level functions. These
+ * are allowed to proceed until all the higher level functions have
+ * completed (indicated by mnt_writeopcount dropping to zero). At that
+ * time, these operations are halted until the suspension is over.
+ */
+int
+vn_start_secondary_write(vp, mpp, flags)
+ struct vnode *vp;
+ struct mount **mpp;
+ int flags;
+{
+ struct mount *mp;
+ int error;
+
+ retry:
+ if (vp != NULL) {
+ if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
+ *mpp = NULL;
+ if (error != EOPNOTSUPP)
+ return (error);
+ return (0);
+ }
+ }
+ /*
+ * If we are not suspended or have not yet reached suspended
+ * mode, then let the operation proceed.
+ */
+ if ((mp = *mpp) == NULL)
+ return (0);
+
+ /*
+ * VOP_GETWRITEMOUNT() returns with the mp refcount held through
+ * a vfs_ref().
+ * If no vnode was provided, the caller passed in the mount
+ * point directly, so take a reference on it ourselves in
+ * order to emulate a vfs_ref().
+ */
+ MNT_ILOCK(mp);
+ if (vp == NULL)
+ MNT_REF(mp);
+ if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
+ mp->mnt_secondary_writes++;
+ mp->mnt_secondary_accwrites++;
+ MNT_IUNLOCK(mp);
+ return (0);
+ }
+ if (flags & V_NOWAIT) {
+ MNT_REL(mp);
+ MNT_IUNLOCK(mp);
+ return (EWOULDBLOCK);
+ }
+ /*
+ * Wait for the suspension to finish.
+ */
+ error = msleep(&mp->mnt_flag, MNT_MTX(mp),
+ (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
+ vfs_rel(mp);
+ if (error == 0)
+ goto retry;
+ return (error);
+}
+
+/*
+ * Filesystem write operation has completed. If we are suspending and this
+ * operation is the last one, notify the suspender that the suspension is
+ * now in effect.
+ */
+void
+vn_finished_write(mp)
+ struct mount *mp;
+{
+ if (mp == NULL)
+ return;
+ MNT_ILOCK(mp);
+ MNT_REL(mp);
+ mp->mnt_writeopcount--;
+ if (mp->mnt_writeopcount < 0)
+ panic("vn_finished_write: neg cnt");
+ if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
+ mp->mnt_writeopcount <= 0)
+ wakeup(&mp->mnt_writeopcount);
+ MNT_IUNLOCK(mp);
+}
+
+
+/*
+ * Filesystem secondary write operation has completed. If we are
+ * suspending and this operation is the last one, notify the suspender
+ * that the suspension is now in effect.
+ */
+void
+vn_finished_secondary_write(mp)
+ struct mount *mp;
+{
+ if (mp == NULL)
+ return;
+ MNT_ILOCK(mp);
+ MNT_REL(mp);
+ mp->mnt_secondary_writes--;
+ if (mp->mnt_secondary_writes < 0)
+ panic("vn_finished_secondary_write: neg cnt");
+ if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
+ mp->mnt_secondary_writes <= 0)
+ wakeup(&mp->mnt_secondary_writes);
+ MNT_IUNLOCK(mp);
+}
+
+
+
+/*
+ * Request a filesystem to suspend write operations.
+ */
+int
+vfs_write_suspend(struct mount *mp, int flags)
+{
+ int error;
+
+ MNT_ILOCK(mp);
+ if (mp->mnt_susp_owner == curthread) {
+ MNT_IUNLOCK(mp);
+ return (EALREADY);
+ }
+ while (mp->mnt_kern_flag & MNTK_SUSPEND)
+ msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
+
+ /*
+ * Unmount holds a write reference on the mount point. If we
+ * own a busy reference and drain for writers, we deadlock with
+ * the reference draining in the unmount path. Callers of
+ * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
+ * vfs_busy() reference is owned and caller is not in the
+ * unmount context.
+ */
+ if ((flags & VS_SKIP_UNMOUNT) != 0 &&
+ (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
+ MNT_IUNLOCK(mp);
+ return (EBUSY);
+ }
+
+ mp->mnt_kern_flag |= MNTK_SUSPEND;
+ mp->mnt_susp_owner = curthread;
+ if (mp->mnt_writeopcount > 0)
+ (void) msleep(&mp->mnt_writeopcount,
+ MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
+ else
+ MNT_IUNLOCK(mp);
+ if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
+ vfs_write_resume(mp, 0);
+ return (error);
+}
+
+/*
+ * Request a filesystem to resume write operations.
+ */
+void
+vfs_write_resume(struct mount *mp, int flags)
+{
+
+ MNT_ILOCK(mp);
+ if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
+ KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
+ mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
+ MNTK_SUSPENDED);
+ mp->mnt_susp_owner = NULL;
+ wakeup(&mp->mnt_writeopcount);
+ wakeup(&mp->mnt_flag);
+ curthread->td_pflags &= ~TDP_IGNSUSP;
+ if ((flags & VR_START_WRITE) != 0) {
+ MNT_REF(mp);
+ mp->mnt_writeopcount++;
+ }
+ MNT_IUNLOCK(mp);
+ if ((flags & VR_NO_SUSPCLR) == 0)
+ VFS_SUSP_CLEAN(mp);
+ } else if ((flags & VR_START_WRITE) != 0) {
+ MNT_REF(mp);
+ vn_start_write_locked(mp, 0);
+ } else {
+ MNT_IUNLOCK(mp);
+ }
+}
+
+/*
+ * Implement kqueues for files by translating it to vnode operation.
+ */
+static int
+vn_kqfilter(struct file *fp, struct knote *kn)
+{
+
+ return (VOP_KQFILTER(fp->f_vnode, kn));
+}
+
+/*
+ * Simplified in-kernel wrapper calls for extended attribute access.
+ * Both calls pass in a NULL credential, authorizing as "kernel" access.
+ * Set IO_NODELOCKED in ioflg if the vnode is already locked.
+ */
+int
+vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
+ const char *attrname, int *buflen, char *buf, struct thread *td)
+{
+ struct uio auio;
+ struct iovec iov;
+ int error;
+
+ iov.iov_len = *buflen;
+ iov.iov_base = buf;
+
+ auio.uio_iov = &iov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_td = td;
+ auio.uio_offset = 0;
+ auio.uio_resid = *buflen;
+
+ if ((ioflg & IO_NODELOCKED) == 0)
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+
+ ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
+
+ /* authorize attribute retrieval as kernel */
+ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
+ td);
+
+ if ((ioflg & IO_NODELOCKED) == 0)
+ VOP_UNLOCK(vp, 0);
+
+ if (error == 0) {
+ *buflen = *buflen - auio.uio_resid;
+ }
+
+ return (error);
+}
+
+/*
+ * XXX failure mode if partially written?
+ */
+int
+vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
+ const char *attrname, int buflen, char *buf, struct thread *td)
+{
+ struct uio auio;
+ struct iovec iov;
+ struct mount *mp;
+ int error;
+
+ iov.iov_len = buflen;
+ iov.iov_base = buf;
+
+ auio.uio_iov = &iov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_td = td;
+ auio.uio_offset = 0;
+ auio.uio_resid = buflen;
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ }
+
+ ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
+
+ /* authorize attribute setting as kernel */
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ vn_finished_write(mp);
+ VOP_UNLOCK(vp, 0);
+ }
+
+ return (error);
+}
+
+int
+vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
+ const char *attrname, struct thread *td)
+{
+ struct mount *mp;
+ int error;
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
+ return (error);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ }
+
+ ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
+
+ /* authorize attribute removal as kernel */
+ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
+ if (error == EOPNOTSUPP)
+ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
+ NULL, td);
+
+ if ((ioflg & IO_NODELOCKED) == 0) {
+ vn_finished_write(mp);
+ VOP_UNLOCK(vp, 0);
+ }
+
+ return (error);
+}
+
+int
+vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
+{
+ struct mount *mp;
+ int ltype, error;
+
+ mp = vp->v_mount;
+ ltype = VOP_ISLOCKED(vp);
+ KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
+ ("vn_vget_ino: vp not locked"));
+ error = vfs_busy(mp, MBF_NOWAIT);
+ if (error != 0) {
+ vfs_ref(mp);
+ VOP_UNLOCK(vp, 0);
+ error = vfs_busy(mp, 0);
+ vn_lock(vp, ltype | LK_RETRY);
+ vfs_rel(mp);
+ if (error != 0)
+ return (ENOENT);
+ if (vp->v_iflag & VI_DOOMED) {
+ vfs_unbusy(mp);
+ return (ENOENT);
+ }
+ }
+ VOP_UNLOCK(vp, 0);
+ error = VFS_VGET(mp, ino, lkflags, rvp);
+ vfs_unbusy(mp);
+ vn_lock(vp, ltype | LK_RETRY);
+ if (vp->v_iflag & VI_DOOMED) {
+ if (error == 0)
+ vput(*rvp);
+ error = ENOENT;
+ }
+ return (error);
+}
+
+int
+vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
+ const struct thread *td)
+{
+
+ if (vp->v_type != VREG || td == NULL)
+ return (0);
+ PROC_LOCK(td->td_proc);
+ if ((uoff_t)uio->uio_offset + uio->uio_resid >
+ lim_cur(td->td_proc, RLIMIT_FSIZE)) {
+ kern_psignal(td->td_proc, SIGXFSZ);
+ PROC_UNLOCK(td->td_proc);
+ return (EFBIG);
+ }
+ PROC_UNLOCK(td->td_proc);
+ return (0);
+}
+
+int
+vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct vnode *vp;
+
+ vp = fp->f_vnode;
+#ifdef AUDIT
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ VOP_UNLOCK(vp, 0);
+#endif
+ return (setfmode(td, active_cred, vp, mode));
+}
+
+int
+vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
+ struct thread *td)
+{
+ struct vnode *vp;
+
+ vp = fp->f_vnode;
+#ifdef AUDIT
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(vp);
+ VOP_UNLOCK(vp, 0);
+#endif
+ return (setfown(td, active_cred, vp, uid, gid));
+}
+
+void
+vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
+{
+ vm_object_t object;
+
+ if ((object = vp->v_object) == NULL)
+ return;
+ VM_OBJECT_WLOCK(object);
+ vm_object_page_remove(object, start, end, 0);
+ VM_OBJECT_WUNLOCK(object);
+}
+
+int
+vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
+{
+ struct vattr va;
+ daddr_t bn, bnp;
+ uint64_t bsize;
+ off_t noff;
+ int error;
+
+ KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
+ ("Wrong command %lu", cmd));
+
+ if (vn_lock(vp, LK_SHARED) != 0)
+ return (EBADF);
+ if (vp->v_type != VREG) {
+ error = ENOTTY;
+ goto unlock;
+ }
+ error = VOP_GETATTR(vp, &va, cred);
+ if (error != 0)
+ goto unlock;
+ noff = *off;
+ if (noff >= va.va_size) {
+ error = ENXIO;
+ goto unlock;
+ }
+ bsize = vp->v_mount->mnt_stat.f_iosize;
+ for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) {
+ error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
+ if (error == EOPNOTSUPP) {
+ error = ENOTTY;
+ goto unlock;
+ }
+ if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
+ (bnp != -1 && cmd == FIOSEEKDATA)) {
+ noff = bn * bsize;
+ if (noff < *off)
+ noff = *off;
+ goto unlock;
+ }
+ }
+ if (noff > va.va_size)
+ noff = va.va_size;
+ /* noff == va.va_size. There is an implicit hole at the end of file. */
+ if (cmd == FIOSEEKDATA)
+ error = ENXIO;
+unlock:
+ VOP_UNLOCK(vp, 0);
+ if (error == 0)
+ *off = noff;
+ return (error);
+}
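vn_bmap_seekhole() is what backs the FIOSEEKHOLE/FIOSEEKDATA ioctls for
filesystems that map them to VOP_BMAP(); from userspace the same
functionality is reached through lseek(2) with SEEK_DATA/SEEK_HOLE. A short
sketch, with a hypothetical sparse file:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		off_t data, hole;
		int fd;

		fd = open("/tmp/sparse.img", O_RDONLY);	/* hypothetical file */
		if (fd == -1) {
			perror("open");
			return (1);
		}
		data = lseek(fd, 0, SEEK_DATA);
		if (data == -1) {
			perror("lseek(SEEK_DATA)");
		} else {
			hole = lseek(fd, data, SEEK_HOLE);
			if (hole == -1)
				perror("lseek(SEEK_HOLE)");
			else
				printf("first data region: [%jd, %jd)\n",
				    (intmax_t)data, (intmax_t)hole);
		}
		close(fd);
		return (0);
	}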
+
+int
+vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
+{
+ struct ucred *cred;
+ struct vnode *vp;
+ struct vattr vattr;
+ off_t foffset, size;
+ int error, noneg;
+
+ cred = td->td_ucred;
+ vp = fp->f_vnode;
+ foffset = foffset_lock(fp, 0);
+ noneg = (vp->v_type != VCHR);
+ error = 0;
+ switch (whence) {
+ case L_INCR:
+ if (noneg &&
+ (foffset < 0 ||
+ (offset > 0 && foffset > OFF_MAX - offset))) {
+ error = EOVERFLOW;
+ break;
+ }
+ offset += foffset;
+ break;
+ case L_XTND:
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &vattr, cred);
+ VOP_UNLOCK(vp, 0);
+ if (error)
+ break;
+
+ /*
+ * If the file references a disk device, then fetch
+ * the media size and use that to determine the ending
+ * offset.
+ */
+ if (vattr.va_size == 0 && vp->v_type == VCHR &&
+ fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
+ vattr.va_size = size;
+ if (noneg &&
+ (vattr.va_size > OFF_MAX ||
+ (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
+ error = EOVERFLOW;
+ break;
+ }
+ offset += vattr.va_size;
+ break;
+ case L_SET:
+ break;
+ case SEEK_DATA:
+ error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
+ break;
+ case SEEK_HOLE:
+ error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
+ break;
+ default:
+ error = EINVAL;
+ }
+ if (error == 0 && noneg && offset < 0)
+ error = EINVAL;
+ if (error != 0)
+ goto drop;
+ VFS_KNOTE_UNLOCKED(vp, 0);
+ *(off_t *)(td->td_retval) = offset;
+drop:
+ foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
+ return (error);
+}
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
new file mode 100644
index 0000000..eabfb43
--- /dev/null
+++ b/sys/kern/vnode_if.src
@@ -0,0 +1,716 @@
+#-
+# Copyright (c) 1992, 1993
+# The Regents of the University of California. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 4. Neither the name of the University nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# @(#)vnode_if.src 8.12 (Berkeley) 5/14/95
+# $FreeBSD$
+#
+
+#
+# Above each of the vop descriptors in lines starting with %%
+# is a specification of the locking protocol used by each vop call.
+# The first column is the name of the variable, the remaining three
+# columns are in, out and error respectively. The "in" column defines
+# the lock state on input, the "out" column defines the state on successful
+# return, and the "error" column defines the locking state on error exit.
+#
+# The locking value can take the following values:
+# L: locked; not converted to type of lock.
+# A: any lock type.
+# S: locked with shared lock.
+# E: locked with exclusive lock for this process.
+# O: locked with exclusive lock for other process.
+# U: unlocked.
+# -: not applicable. vnode does not yet (or no longer) exist.
+# =: the same on input and output, may be either L or U.
+# X: locked if not nil.
+#
+# The parameter named "vpp" is assumed to always be used with double
+# indirection (**vpp) and that name is hard-coded in vnode_if.awk !
+#
+# Lines starting with %! specify a pre or post-condition function
+# to call before/after the vop call.
+#
+# If other such parameters are introduced, they have to be added to
+# the AWK script at the head of the definition of "add_debug_code()".
+#
+
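As a worked reading of the notation above, the pair of lines that appears
later in this file

	%% create dvp E E E
	%% create vpp - L -

says that for VOP_CREATE() the directory vnode dvp must be exclusively locked
on entry and remains exclusively locked on both successful and error return,
while *vpp does not exist on entry and is returned locked (of unspecified
lock type) on success, with no locking obligation on error.
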
+vop_islocked {
+ IN struct vnode *vp;
+};
+
+%% lookup dvp L L L
+%% lookup vpp - L -
+
+# XXX - the lookup locking protocol defies simple description and depends
+# on the flags and operation fields in the (cnp) structure. Note
+# especially that *vpp may equal dvp and both may be locked.
+
+vop_lookup {
+ IN struct vnode *dvp;
+ INOUT struct vnode **vpp;
+ IN struct componentname *cnp;
+};
+
+%% cachedlookup dvp L L L
+%% cachedlookup vpp - L -
+
+# This must be an exact copy of lookup. See kern/vfs_cache.c for details.
+
+vop_cachedlookup {
+ IN struct vnode *dvp;
+ INOUT struct vnode **vpp;
+ IN struct componentname *cnp;
+};
+
+%% create dvp E E E
+%% create vpp - L -
+%! create post vop_create_post
+
+vop_create {
+ IN struct vnode *dvp;
+ OUT struct vnode **vpp;
+ IN struct componentname *cnp;
+ IN struct vattr *vap;
+};
+
+
+%% whiteout dvp E E E
+
+vop_whiteout {
+ IN struct vnode *dvp;
+ IN struct componentname *cnp;
+ IN int flags;
+};
+
+
+%% mknod dvp E E E
+%% mknod vpp - L -
+%! mknod post vop_mknod_post
+
+vop_mknod {
+ IN struct vnode *dvp;
+ OUT struct vnode **vpp;
+ IN struct componentname *cnp;
+ IN struct vattr *vap;
+};
+
+
+%% open vp L L L
+
+vop_open {
+ IN struct vnode *vp;
+ IN int mode;
+ IN struct ucred *cred;
+ IN struct thread *td;
+ IN struct file *fp;
+};
+
+
+%% close vp L L L
+
+vop_close {
+ IN struct vnode *vp;
+ IN int fflag;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% access vp L L L
+
+vop_access {
+ IN struct vnode *vp;
+ IN accmode_t accmode;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% accessx vp L L L
+
+vop_accessx {
+ IN struct vnode *vp;
+ IN accmode_t accmode;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% getattr vp L L L
+
+vop_getattr {
+ IN struct vnode *vp;
+ OUT struct vattr *vap;
+ IN struct ucred *cred;
+};
+
+
+%% setattr vp E E E
+%! setattr post vop_setattr_post
+
+vop_setattr {
+ IN struct vnode *vp;
+ IN struct vattr *vap;
+ IN struct ucred *cred;
+};
+
+%% markatime vp L L L
+
+vop_markatime {
+ IN struct vnode *vp;
+};
+
+%% read vp L L L
+
+vop_read {
+ IN struct vnode *vp;
+ INOUT struct uio *uio;
+ IN int ioflag;
+ IN struct ucred *cred;
+};
+
+
+%% write vp L L L
+%! write pre VOP_WRITE_PRE
+%! write post VOP_WRITE_POST
+
+vop_write {
+ IN struct vnode *vp;
+ INOUT struct uio *uio;
+ IN int ioflag;
+ IN struct ucred *cred;
+};
+
+
+%% ioctl vp U U U
+
+vop_ioctl {
+ IN struct vnode *vp;
+ IN u_long command;
+ IN void *data;
+ IN int fflag;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% poll vp U U U
+
+vop_poll {
+ IN struct vnode *vp;
+ IN int events;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% kqfilter vp U U U
+
+vop_kqfilter {
+ IN struct vnode *vp;
+ IN struct knote *kn;
+};
+
+
+%% revoke vp L L L
+
+vop_revoke {
+ IN struct vnode *vp;
+ IN int flags;
+};
+
+
+%% fsync vp L L L
+
+vop_fsync {
+ IN struct vnode *vp;
+ IN int waitfor;
+ IN struct thread *td;
+};
+
+
+%% remove dvp E E E
+%% remove vp E E E
+%! remove post vop_remove_post
+
+vop_remove {
+ IN struct vnode *dvp;
+ IN struct vnode *vp;
+ IN struct componentname *cnp;
+};
+
+
+%% link tdvp E E E
+%% link vp E E E
+%! link post vop_link_post
+
+vop_link {
+ IN struct vnode *tdvp;
+ IN struct vnode *vp;
+ IN struct componentname *cnp;
+};
+
+
+%! rename pre vop_rename_pre
+%! rename post vop_rename_post
+
+vop_rename {
+ IN WILLRELE struct vnode *fdvp;
+ IN WILLRELE struct vnode *fvp;
+ IN struct componentname *fcnp;
+ IN WILLRELE struct vnode *tdvp;
+ IN WILLRELE struct vnode *tvp;
+ IN struct componentname *tcnp;
+};
+
+
+%% mkdir dvp E E E
+%% mkdir vpp - E -
+%! mkdir post vop_mkdir_post
+
+vop_mkdir {
+ IN struct vnode *dvp;
+ OUT struct vnode **vpp;
+ IN struct componentname *cnp;
+ IN struct vattr *vap;
+};
+
+
+%% rmdir dvp E E E
+%% rmdir vp E E E
+%! rmdir post vop_rmdir_post
+
+vop_rmdir {
+ IN struct vnode *dvp;
+ IN struct vnode *vp;
+ IN struct componentname *cnp;
+};
+
+
+%% symlink dvp E E E
+%% symlink vpp - E -
+%! symlink post vop_symlink_post
+
+vop_symlink {
+ IN struct vnode *dvp;
+ OUT struct vnode **vpp;
+ IN struct componentname *cnp;
+ IN struct vattr *vap;
+ IN char *target;
+};
+
+
+%% readdir vp L L L
+
+vop_readdir {
+ IN struct vnode *vp;
+ INOUT struct uio *uio;
+ IN struct ucred *cred;
+ INOUT int *eofflag;
+ OUT int *ncookies;
+ INOUT u_long **cookies;
+};
+
+
+%% readlink vp L L L
+
+vop_readlink {
+ IN struct vnode *vp;
+ INOUT struct uio *uio;
+ IN struct ucred *cred;
+};
+
+
+%% inactive vp E E E
+
+vop_inactive {
+ IN struct vnode *vp;
+ IN struct thread *td;
+};
+
+
+%% reclaim vp E E E
+
+vop_reclaim {
+ IN struct vnode *vp;
+ IN struct thread *td;
+};
+
+
+%! lock1 pre vop_lock_pre
+%! lock1 post vop_lock_post
+
+vop_lock1 {
+ IN struct vnode *vp;
+ IN int flags;
+ IN char *file;
+ IN int line;
+};
+
+
+%! unlock pre vop_unlock_pre
+%! unlock post vop_unlock_post
+
+vop_unlock {
+ IN struct vnode *vp;
+ IN int flags;
+};
+
+
+%% bmap vp L L L
+
+vop_bmap {
+ IN struct vnode *vp;
+ IN daddr_t bn;
+ OUT struct bufobj **bop;
+ IN daddr_t *bnp;
+ OUT int *runp;
+ OUT int *runb;
+};
+
+
+%% strategy vp L L L
+%! strategy pre vop_strategy_pre
+
+vop_strategy {
+ IN struct vnode *vp;
+ IN struct buf *bp;
+};
+
+
+%% getwritemount vp = = =
+
+vop_getwritemount {
+ IN struct vnode *vp;
+ OUT struct mount **mpp;
+};
+
+
+%% print vp - - -
+
+vop_print {
+ IN struct vnode *vp;
+};
+
+
+%% pathconf vp L L L
+
+vop_pathconf {
+ IN struct vnode *vp;
+ IN int name;
+ OUT register_t *retval;
+};
+
+
+%% advlock vp U U U
+
+vop_advlock {
+ IN struct vnode *vp;
+ IN void *id;
+ IN int op;
+ IN struct flock *fl;
+ IN int flags;
+};
+
+
+%% advlockasync vp U U U
+
+vop_advlockasync {
+ IN struct vnode *vp;
+ IN void *id;
+ IN int op;
+ IN struct flock *fl;
+ IN int flags;
+ IN struct task *task;
+ INOUT void **cookiep;
+};
+
+
+%% advlockpurge vp E E E
+
+vop_advlockpurge {
+ IN struct vnode *vp;
+};
+
+
+%% reallocblks vp E E E
+
+vop_reallocblks {
+ IN struct vnode *vp;
+ IN struct cluster_save *buflist;
+};
+
+
+%% getpages vp L L L
+
+vop_getpages {
+ IN struct vnode *vp;
+ IN vm_page_t *m;
+ IN int count;
+ IN int reqpage;
+ IN vm_ooffset_t offset;
+};
+
+
+%% putpages vp E E E
+
+vop_putpages {
+ IN struct vnode *vp;
+ IN vm_page_t *m;
+ IN int count;
+ IN int sync;
+ IN int *rtvals;
+ IN vm_ooffset_t offset;
+};
+
+
+%% getacl vp L L L
+
+vop_getacl {
+ IN struct vnode *vp;
+ IN acl_type_t type;
+ OUT struct acl *aclp;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% setacl vp E E E
+
+vop_setacl {
+ IN struct vnode *vp;
+ IN acl_type_t type;
+ IN struct acl *aclp;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% aclcheck vp = = =
+
+vop_aclcheck {
+ IN struct vnode *vp;
+ IN acl_type_t type;
+ IN struct acl *aclp;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% closeextattr vp L L L
+
+vop_closeextattr {
+ IN struct vnode *vp;
+ IN int commit;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% getextattr vp L L L
+
+vop_getextattr {
+ IN struct vnode *vp;
+ IN int attrnamespace;
+ IN const char *name;
+ INOUT struct uio *uio;
+ OUT size_t *size;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% listextattr vp L L L
+
+vop_listextattr {
+ IN struct vnode *vp;
+ IN int attrnamespace;
+ INOUT struct uio *uio;
+ OUT size_t *size;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% openextattr vp L L L
+
+vop_openextattr {
+ IN struct vnode *vp;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% deleteextattr vp E E E
+%! deleteextattr post vop_deleteextattr_post
+
+vop_deleteextattr {
+ IN struct vnode *vp;
+ IN int attrnamespace;
+ IN const char *name;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% setextattr vp E E E
+%! setextattr post vop_setextattr_post
+
+vop_setextattr {
+ IN struct vnode *vp;
+ IN int attrnamespace;
+ IN const char *name;
+ INOUT struct uio *uio;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% setlabel vp E E E
+
+vop_setlabel {
+ IN struct vnode *vp;
+ IN struct label *label;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
+%% vptofh vp = = =
+
+vop_vptofh {
+ IN struct vnode *vp;
+ IN struct fid *fhp;
+};
+
+
+%% vptocnp vp L L L
+%% vptocnp vpp - U -
+
+vop_vptocnp {
+ IN struct vnode *vp;
+ OUT struct vnode **vpp;
+ IN struct ucred *cred;
+ INOUT char *buf;
+ INOUT int *buflen;
+};
+
+
+%% allocate vp E E E
+
+vop_allocate {
+ IN struct vnode *vp;
+ INOUT off_t *offset;
+ INOUT off_t *len;
+};
+
+%% advise vp U U U
+
+vop_advise {
+ IN struct vnode *vp;
+ IN off_t start;
+ IN off_t end;
+ IN int advice;
+};
+
+%% unp_bind vp E E E
+
+vop_unp_bind {
+ IN struct vnode *vp;
+ IN struct socket *socket;
+};
+
+%% unp_connect vp L L L
+
+vop_unp_connect {
+ IN struct vnode *vp;
+ OUT struct socket **socket;
+};
+
+%% unp_detach vp = = =
+
+vop_unp_detach {
+ IN struct vnode *vp;
+};
+
+%% is_text vp L L L
+
+vop_is_text {
+ IN struct vnode *vp;
+};
+
+%% set_text vp E E E
+
+vop_set_text {
+ IN struct vnode *vp;
+};
+
+%% unset_text vp E E E
+
+vop_unset_text {
+ IN struct vnode *vp;
+};
+
+%% get_writecount vp L L L
+
+vop_get_writecount {
+ IN struct vnode *vp;
+ OUT int *writecount;
+};
+
+%% add_writecount vp E E E
+
+vop_add_writecount {
+ IN struct vnode *vp;
+ IN int inc;
+};
+
+# The VOPs below are spares at the end of the table to allow new VOPs to be
+# added in stable branches without breaking the KBI. New VOPs in HEAD should
+# be added above these spares. When merging a new VOP to a stable branch,
+# the new VOP should replace one of the spares.
+
+vop_spare1 {
+ IN struct vnode *vp;
+};
+
+vop_spare2 {
+ IN struct vnode *vp;
+};
+
+vop_spare3 {
+ IN struct vnode *vp;
+};
+
+vop_spare4 {
+ IN struct vnode *vp;
+};
+
+vop_spare5 {
+ IN struct vnode *vp;
+};