summaryrefslogtreecommitdiffstats
path: root/sys/kern/sys_generic.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/kern/sys_generic.c')
-rw-r--r--sys/kern/sys_generic.c1210
1 files changed, 1210 insertions, 0 deletions
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
new file mode 100644
index 0000000..1bdd913
--- /dev/null
+++ b/sys/kern/sys_generic.c
@@ -0,0 +1,1210 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
+ * $FreeBSD$
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/filedesc.h>
+#include <sys/filio.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/signalvar.h>
+#include <sys/socketvar.h>
+#include <sys/uio.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/poll.h>
+#include <sys/resourcevar.h>
+#include <sys/selinfo.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/condvar.h>
+#ifdef __alpha__
+#include <sys/disklabel.h>
+#endif
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#include <machine/limits.h>
+
+static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
+static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
+MALLOC_DEFINE(M_IOV, "iov", "large iov's");
+
+static int pollscan(struct thread *, struct pollfd *, u_int);
+static int selscan(struct thread *, fd_mask **, fd_mask **, int);
+static int dofileread(struct thread *, struct file *, int, void *,
+ size_t, off_t, int);
+static int dofilewrite(struct thread *, struct file *, int,
+ const void *, size_t, off_t, int);
+
+/*
+ * Read system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct read_args {
+ int fd;
+ void *buf;
+ size_t nbyte;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+read(td, uap)
+ struct thread *td;
+ struct read_args *uap;
+{
+ struct file *fp;
+ int error;
+
+ if ((error = fget_read(td, uap->fd, &fp)) == 0) {
+ error = dofileread(td, fp, uap->fd, uap->buf,
+ uap->nbyte, (off_t)-1, 0);
+ fdrop(fp, td);
+ }
+ return(error);
+}
+
+/*
+ * Pread system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pread_args {
+ int fd;
+ void *buf;
+ size_t nbyte;
+ int pad;
+ off_t offset;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+pread(td, uap)
+ struct thread *td;
+ struct pread_args *uap;
+{
+ struct file *fp;
+ int error;
+
+ if ((error = fget_read(td, uap->fd, &fp)) != 0)
+ return (error);
+ if (fp->f_type != DTYPE_VNODE) {
+ error = ESPIPE;
+ } else {
+ error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
+ uap->offset, FOF_OFFSET);
+ }
+ fdrop(fp, td);
+ return(error);
+}
+
+/*
+ * Code common for read and pread
+ */
+int
+dofileread(td, fp, fd, buf, nbyte, offset, flags)
+ struct thread *td;
+ struct file *fp;
+ int fd, flags;
+ void *buf;
+ size_t nbyte;
+ off_t offset;
+{
+ struct uio auio;
+ struct iovec aiov;
+ long cnt, error = 0;
+#ifdef KTRACE
+ struct iovec ktriov;
+ struct uio ktruio;
+ int didktr = 0;
+#endif
+
+ aiov.iov_base = (caddr_t)buf;
+ aiov.iov_len = nbyte;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = offset;
+ if (nbyte > INT_MAX)
+ return (EINVAL);
+ auio.uio_resid = nbyte;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+#ifdef KTRACE
+ /*
+ * if tracing, save a copy of iovec
+ */
+ if (KTRPOINT(td, KTR_GENIO)) {
+ ktriov = aiov;
+ ktruio = auio;
+ didktr = 1;
+ }
+#endif
+ cnt = nbyte;
+
+ if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
+ if (auio.uio_resid != cnt && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ }
+ cnt -= auio.uio_resid;
+#ifdef KTRACE
+ if (didktr && error == 0) {
+ ktruio.uio_iov = &ktriov;
+ ktruio.uio_resid = cnt;
+ ktrgenio(fd, UIO_READ, &ktruio, error);
+ }
+#endif
+ td->td_retval[0] = cnt;
+ return (error);
+}
+
+/*
+ * Scatter read system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct readv_args {
+ int fd;
+ struct iovec *iovp;
+ u_int iovcnt;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+readv(td, uap)
+ struct thread *td;
+ struct readv_args *uap;
+{
+ struct file *fp;
+ struct uio auio;
+ struct iovec *iov;
+ struct iovec *needfree;
+ struct iovec aiov[UIO_SMALLIOV];
+ long i, cnt;
+ int error;
+ u_int iovlen;
+#ifdef KTRACE
+ struct iovec *ktriov = NULL;
+ struct uio ktruio;
+#endif
+
+ if ((error = fget_read(td, uap->fd, &fp)) != 0)
+ return (error);
+ needfree = NULL;
+ /* note: can't use iovlen until iovcnt is validated */
+ iovlen = uap->iovcnt * sizeof (struct iovec);
+ if (uap->iovcnt > UIO_SMALLIOV) {
+ if (uap->iovcnt > UIO_MAXIOV) {
+ error = EINVAL;
+ goto done;
+ }
+ MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
+ needfree = iov;
+ } else
+ iov = aiov;
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = uap->iovcnt;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auio.uio_offset = -1;
+ if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
+ goto done;
+ auio.uio_resid = 0;
+ for (i = 0; i < uap->iovcnt; i++) {
+ if (iov->iov_len > INT_MAX - auio.uio_resid) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid += iov->iov_len;
+ iov++;
+ }
+#ifdef KTRACE
+ /*
+ * if tracing, save a copy of iovec
+ */
+ if (KTRPOINT(td, KTR_GENIO)) {
+ MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
+ bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
+ ktruio = auio;
+ }
+#endif
+ cnt = auio.uio_resid;
+ if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
+ if (auio.uio_resid != cnt && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ }
+ cnt -= auio.uio_resid;
+#ifdef KTRACE
+ if (ktriov != NULL) {
+ if (error == 0) {
+ ktruio.uio_iov = ktriov;
+ ktruio.uio_resid = cnt;
+ ktrgenio(uap->fd, UIO_READ, &ktruio, error);
+ }
+ FREE(ktriov, M_TEMP);
+ }
+#endif
+ td->td_retval[0] = cnt;
+done:
+ fdrop(fp, td);
+ if (needfree)
+ FREE(needfree, M_IOV);
+ return (error);
+}
+
+/*
+ * Write system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct write_args {
+ int fd;
+ const void *buf;
+ size_t nbyte;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+write(td, uap)
+ struct thread *td;
+ struct write_args *uap;
+{
+ struct file *fp;
+ int error;
+
+ if ((error = fget_write(td, uap->fd, &fp)) == 0) {
+ error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
+ (off_t)-1, 0);
+ fdrop(fp, td);
+ } else {
+ error = EBADF; /* XXX this can't be right */
+ }
+ return(error);
+}
+
+/*
+ * Pwrite system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pwrite_args {
+ int fd;
+ const void *buf;
+ size_t nbyte;
+ int pad;
+ off_t offset;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+pwrite(td, uap)
+ struct thread *td;
+ struct pwrite_args *uap;
+{
+ struct file *fp;
+ int error;
+
+ if ((error = fget_write(td, uap->fd, &fp)) == 0) {
+ if (fp->f_type == DTYPE_VNODE) {
+ error = dofilewrite(td, fp, uap->fd, uap->buf,
+ uap->nbyte, uap->offset, FOF_OFFSET);
+ } else {
+ error = ESPIPE;
+ }
+ fdrop(fp, td);
+ } else {
+ error = EBADF; /* this can't be right */
+ }
+ return(error);
+}
+
+static int
+dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
+ struct thread *td;
+ struct file *fp;
+ int fd, flags;
+ const void *buf;
+ size_t nbyte;
+ off_t offset;
+{
+ struct uio auio;
+ struct iovec aiov;
+ long cnt, error = 0;
+#ifdef KTRACE
+ struct iovec ktriov;
+ struct uio ktruio;
+ int didktr = 0;
+#endif
+
+ aiov.iov_base = (void *)(uintptr_t)buf;
+ aiov.iov_len = nbyte;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = offset;
+ if (nbyte > INT_MAX)
+ return (EINVAL);
+ auio.uio_resid = nbyte;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+#ifdef KTRACE
+ /*
+ * if tracing, save a copy of iovec and uio
+ */
+ if (KTRPOINT(td, KTR_GENIO)) {
+ ktriov = aiov;
+ ktruio = auio;
+ didktr = 1;
+ }
+#endif
+ cnt = nbyte;
+ if (fp->f_type == DTYPE_VNODE)
+ bwillwrite();
+ if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
+ if (auio.uio_resid != cnt && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ /* Socket layer is responsible for issuing SIGPIPE. */
+ if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
+ PROC_LOCK(td->td_proc);
+ psignal(td->td_proc, SIGPIPE);
+ PROC_UNLOCK(td->td_proc);
+ }
+ }
+ cnt -= auio.uio_resid;
+#ifdef KTRACE
+ if (didktr && error == 0) {
+ ktruio.uio_iov = &ktriov;
+ ktruio.uio_resid = cnt;
+ ktrgenio(fd, UIO_WRITE, &ktruio, error);
+ }
+#endif
+ td->td_retval[0] = cnt;
+ return (error);
+}
+
+/*
+ * Gather write system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct writev_args {
+ int fd;
+ struct iovec *iovp;
+ u_int iovcnt;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+writev(td, uap)
+ struct thread *td;
+ register struct writev_args *uap;
+{
+ struct file *fp;
+ struct uio auio;
+ register struct iovec *iov;
+ struct iovec *needfree;
+ struct iovec aiov[UIO_SMALLIOV];
+ long i, cnt, error = 0;
+ u_int iovlen;
+#ifdef KTRACE
+ struct iovec *ktriov = NULL;
+ struct uio ktruio;
+#endif
+
+ mtx_lock(&Giant);
+ if ((error = fget_write(td, uap->fd, &fp)) != 0) {
+ error = EBADF;
+ goto done2;
+ }
+ /* note: can't use iovlen until iovcnt is validated */
+ iovlen = uap->iovcnt * sizeof (struct iovec);
+ if (uap->iovcnt > UIO_SMALLIOV) {
+ if (uap->iovcnt > UIO_MAXIOV) {
+ needfree = NULL;
+ error = EINVAL;
+ goto done;
+ }
+ MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
+ needfree = iov;
+ } else {
+ iov = aiov;
+ needfree = NULL;
+ }
+ auio.uio_iov = iov;
+ auio.uio_iovcnt = uap->iovcnt;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_td = td;
+ auio.uio_offset = -1;
+ if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
+ goto done;
+ auio.uio_resid = 0;
+ for (i = 0; i < uap->iovcnt; i++) {
+ if (iov->iov_len > INT_MAX - auio.uio_resid) {
+ error = EINVAL;
+ goto done;
+ }
+ auio.uio_resid += iov->iov_len;
+ iov++;
+ }
+#ifdef KTRACE
+ /*
+ * if tracing, save a copy of iovec and uio
+ */
+ if (KTRPOINT(td, KTR_GENIO)) {
+ MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
+ bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
+ ktruio = auio;
+ }
+#endif
+ cnt = auio.uio_resid;
+ if (fp->f_type == DTYPE_VNODE)
+ bwillwrite();
+ if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
+ if (auio.uio_resid != cnt && (error == ERESTART ||
+ error == EINTR || error == EWOULDBLOCK))
+ error = 0;
+ if (error == EPIPE) {
+ PROC_LOCK(td->td_proc);
+ psignal(td->td_proc, SIGPIPE);
+ PROC_UNLOCK(td->td_proc);
+ }
+ }
+ cnt -= auio.uio_resid;
+#ifdef KTRACE
+ if (ktriov != NULL) {
+ if (error == 0) {
+ ktruio.uio_iov = ktriov;
+ ktruio.uio_resid = cnt;
+ ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
+ }
+ FREE(ktriov, M_TEMP);
+ }
+#endif
+ td->td_retval[0] = cnt;
+done:
+ fdrop(fp, td);
+ if (needfree)
+ FREE(needfree, M_IOV);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * Ioctl system call
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ioctl_args {
+ int fd;
+ u_long com;
+ caddr_t data;
+};
+#endif
+/*
+ * MPSAFE
+ */
+/* ARGSUSED */
+int
+ioctl(td, uap)
+ struct thread *td;
+ register struct ioctl_args *uap;
+{
+ struct file *fp;
+ register struct filedesc *fdp;
+ register u_long com;
+ int error = 0;
+ register u_int size;
+ caddr_t data, memp;
+ int tmp;
+#define STK_PARAMS 128
+ union {
+ char stkbuf[STK_PARAMS];
+ long align;
+ } ubuf;
+
+ if ((error = fget(td, uap->fd, &fp)) != 0)
+ return (error);
+ mtx_lock(&Giant);
+ if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
+ fdrop(fp, td);
+ mtx_unlock(&Giant);
+ return (EBADF);
+ }
+ fdp = td->td_proc->p_fd;
+ switch (com = uap->com) {
+ case FIONCLEX:
+ FILEDESC_LOCK(fdp);
+ fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp, td);
+ mtx_unlock(&Giant);
+ return (0);
+ case FIOCLEX:
+ FILEDESC_LOCK(fdp);
+ fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp, td);
+ mtx_unlock(&Giant);
+ return (0);
+ }
+
+ /*
+ * Interpret high order word to find amount of data to be
+ * copied to/from the user's address space.
+ */
+ size = IOCPARM_LEN(com);
+ if (size > IOCPARM_MAX) {
+ fdrop(fp, td);
+ mtx_unlock(&Giant);
+ return (ENOTTY);
+ }
+
+ memp = NULL;
+ if (size > sizeof (ubuf.stkbuf)) {
+ memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
+ data = memp;
+ } else {
+ data = ubuf.stkbuf;
+ }
+ if (com&IOC_IN) {
+ if (size) {
+ error = copyin(uap->data, data, (u_int)size);
+ if (error) {
+ if (memp)
+ free(memp, M_IOCTLOPS);
+ fdrop(fp, td);
+ goto done;
+ }
+ } else {
+ *(caddr_t *)data = uap->data;
+ }
+ } else if ((com&IOC_OUT) && size) {
+ /*
+ * Zero the buffer so the user always
+ * gets back something deterministic.
+ */
+ bzero(data, size);
+ } else if (com&IOC_VOID) {
+ *(caddr_t *)data = uap->data;
+ }
+
+ switch (com) {
+
+ case FIONBIO:
+ FILE_LOCK(fp);
+ if ((tmp = *(int *)data))
+ fp->f_flag |= FNONBLOCK;
+ else
+ fp->f_flag &= ~FNONBLOCK;
+ FILE_UNLOCK(fp);
+ error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
+ break;
+
+ case FIOASYNC:
+ FILE_LOCK(fp);
+ if ((tmp = *(int *)data))
+ fp->f_flag |= FASYNC;
+ else
+ fp->f_flag &= ~FASYNC;
+ FILE_UNLOCK(fp);
+ error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
+ break;
+
+ default:
+ error = fo_ioctl(fp, com, data, td);
+ /*
+ * Copy any data to user, size was
+ * already set and checked above.
+ */
+ if (error == 0 && (com&IOC_OUT) && size)
+ error = copyout(data, uap->data, (u_int)size);
+ break;
+ }
+ if (memp)
+ free(memp, M_IOCTLOPS);
+ fdrop(fp, td);
+done:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+/*
+ * sellock and selwait are initialized in selectinit() via SYSINIT.
+ */
+struct mtx sellock;
+struct cv selwait;
+u_int nselcoll; /* Select collisions since boot */
+SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
+
+/*
+ * Select system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct select_args {
+ int nd;
+ fd_set *in, *ou, *ex;
+ struct timeval *tv;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+select(td, uap)
+ register struct thread *td;
+ register struct select_args *uap;
+{
+ struct filedesc *fdp;
+ /*
+ * The magic 2048 here is chosen to be just enough for FD_SETSIZE
+ * infds with the new FD_SETSIZE of 1024, and more than enough for
+ * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
+ * of 256.
+ */
+ fd_mask s_selbits[howmany(2048, NFDBITS)];
+ fd_mask *ibits[3], *obits[3], *selbits, *sbp;
+ struct timeval atv, rtv, ttv;
+ int error, timo;
+ u_int ncoll, nbufbytes, ncpbytes, nfdbits;
+
+ if (uap->nd < 0)
+ return (EINVAL);
+ fdp = td->td_proc->p_fd;
+ mtx_lock(&Giant);
+ FILEDESC_LOCK(fdp);
+
+ if (uap->nd > td->td_proc->p_fd->fd_nfiles)
+ uap->nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */
+ FILEDESC_UNLOCK(fdp);
+
+ /*
+ * Allocate just enough bits for the non-null fd_sets. Use the
+ * preallocated auto buffer if possible.
+ */
+ nfdbits = roundup(uap->nd, NFDBITS);
+ ncpbytes = nfdbits / NBBY;
+ nbufbytes = 0;
+ if (uap->in != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (uap->ou != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (uap->ex != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (nbufbytes <= sizeof s_selbits)
+ selbits = &s_selbits[0];
+ else
+ selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
+
+ /*
+ * Assign pointers into the bit buffers and fetch the input bits.
+ * Put the output buffers together so that they can be bzeroed
+ * together.
+ */
+ sbp = selbits;
+#define getbits(name, x) \
+ do { \
+ if (uap->name == NULL) \
+ ibits[x] = NULL; \
+ else { \
+ ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
+ obits[x] = sbp; \
+ sbp += ncpbytes / sizeof *sbp; \
+ error = copyin(uap->name, ibits[x], ncpbytes); \
+ if (error != 0) \
+ goto done_nosellock; \
+ } \
+ } while (0)
+ getbits(in, 0);
+ getbits(ou, 1);
+ getbits(ex, 2);
+#undef getbits
+ if (nbufbytes != 0)
+ bzero(selbits, nbufbytes / 2);
+
+ if (uap->tv) {
+ error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
+ sizeof (atv));
+ if (error)
+ goto done_nosellock;
+ if (itimerfix(&atv)) {
+ error = EINVAL;
+ goto done_nosellock;
+ }
+ getmicrouptime(&rtv);
+ timevaladd(&atv, &rtv);
+ } else {
+ atv.tv_sec = 0;
+ atv.tv_usec = 0;
+ }
+ timo = 0;
+ mtx_lock(&sellock);
+retry:
+ ncoll = nselcoll;
+ mtx_lock_spin(&sched_lock);
+ td->td_flags |= TDF_SELECT;
+ mtx_unlock_spin(&sched_lock);
+ mtx_unlock(&sellock);
+
+ /* XXX Is there a better place for this? */
+ TAILQ_INIT(&td->td_selq);
+ error = selscan(td, ibits, obits, uap->nd);
+ mtx_lock(&sellock);
+ if (error || td->td_retval[0])
+ goto done;
+ if (atv.tv_sec || atv.tv_usec) {
+ getmicrouptime(&rtv);
+ if (timevalcmp(&rtv, &atv, >=))
+ goto done;
+ ttv = atv;
+ timevalsub(&ttv, &rtv);
+ timo = ttv.tv_sec > 24 * 60 * 60 ?
+ 24 * 60 * 60 * hz : tvtohz(&ttv);
+ }
+
+ /*
+ * An event of interest may occur while we do not hold
+ * sellock, so check TDF_SELECT and the number of
+ * collisions and rescan the file descriptors if
+ * necessary.
+ */
+ mtx_lock_spin(&sched_lock);
+ if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
+ mtx_unlock_spin(&sched_lock);
+ goto retry;
+ }
+ mtx_unlock_spin(&sched_lock);
+
+ if (timo > 0)
+ error = cv_timedwait_sig(&selwait, &sellock, timo);
+ else
+ error = cv_wait_sig(&selwait, &sellock);
+
+ if (error == 0)
+ goto retry;
+
+done:
+ clear_selinfo_list(td);
+ mtx_lock_spin(&sched_lock);
+ td->td_flags &= ~TDF_SELECT;
+ mtx_unlock_spin(&sched_lock);
+ mtx_unlock(&sellock);
+
+done_nosellock:
+ /* select is not restarted after signals... */
+ if (error == ERESTART)
+ error = EINTR;
+ if (error == EWOULDBLOCK)
+ error = 0;
+#define putbits(name, x) \
+ if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
+ error = error2;
+ if (error == 0) {
+ int error2;
+
+ putbits(in, 0);
+ putbits(ou, 1);
+ putbits(ex, 2);
+#undef putbits
+ }
+ if (selbits != &s_selbits[0])
+ free(selbits, M_SELECT);
+
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+static int
+selscan(td, ibits, obits, nfd)
+ struct thread *td;
+ fd_mask **ibits, **obits;
+ int nfd;
+{
+ int msk, i, fd;
+ fd_mask bits;
+ struct file *fp;
+ int n = 0;
+ /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
+ static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
+ struct filedesc *fdp = td->td_proc->p_fd;
+
+ FILEDESC_LOCK(fdp);
+ for (msk = 0; msk < 3; msk++) {
+ if (ibits[msk] == NULL)
+ continue;
+ for (i = 0; i < nfd; i += NFDBITS) {
+ bits = ibits[msk][i/NFDBITS];
+ /* ffs(int mask) not portable, fd_mask is long */
+ for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
+ if (!(bits & 1))
+ continue;
+ if ((fp = fget_locked(fdp, fd)) == NULL) {
+ FILEDESC_UNLOCK(fdp);
+ return (EBADF);
+ }
+ if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
+ obits[msk][(fd)/NFDBITS] |=
+ ((fd_mask)1 << ((fd) % NFDBITS));
+ n++;
+ }
+ }
+ }
+ }
+ FILEDESC_UNLOCK(fdp);
+ td->td_retval[0] = n;
+ return (0);
+}
+
+/*
+ * Poll system call.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct poll_args {
+ struct pollfd *fds;
+ u_int nfds;
+ int timeout;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+poll(td, uap)
+ struct thread *td;
+ struct poll_args *uap;
+{
+ caddr_t bits;
+ char smallbits[32 * sizeof(struct pollfd)];
+ struct timeval atv, rtv, ttv;
+ int error = 0, timo;
+ u_int ncoll, nfds;
+ size_t ni;
+
+ nfds = SCARG(uap, nfds);
+
+ mtx_lock(&Giant);
+ /*
+ * This is kinda bogus. We have fd limits, but that is not
+ * really related to the size of the pollfd array. Make sure
+ * we let the process use at least FD_SETSIZE entries and at
+ * least enough for the current limits. We want to be reasonably
+ * safe, but not overly restrictive.
+ */
+ if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
+ (nfds > FD_SETSIZE)) {
+ error = EINVAL;
+ goto done2;
+ }
+ ni = nfds * sizeof(struct pollfd);
+ if (ni > sizeof(smallbits))
+ bits = malloc(ni, M_TEMP, M_WAITOK);
+ else
+ bits = smallbits;
+ error = copyin(SCARG(uap, fds), bits, ni);
+ if (error)
+ goto done_nosellock;
+ if (SCARG(uap, timeout) != INFTIM) {
+ atv.tv_sec = SCARG(uap, timeout) / 1000;
+ atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
+ if (itimerfix(&atv)) {
+ error = EINVAL;
+ goto done_nosellock;
+ }
+ getmicrouptime(&rtv);
+ timevaladd(&atv, &rtv);
+ } else {
+ atv.tv_sec = 0;
+ atv.tv_usec = 0;
+ }
+ timo = 0;
+ mtx_lock(&sellock);
+retry:
+ ncoll = nselcoll;
+ mtx_lock_spin(&sched_lock);
+ td->td_flags |= TDF_SELECT;
+ mtx_unlock_spin(&sched_lock);
+ mtx_unlock(&sellock);
+
+ /* XXX Is there a better place for this? */
+ TAILQ_INIT(&td->td_selq);
+ error = pollscan(td, (struct pollfd *)bits, nfds);
+ mtx_lock(&sellock);
+ if (error || td->td_retval[0])
+ goto done;
+ if (atv.tv_sec || atv.tv_usec) {
+ getmicrouptime(&rtv);
+ if (timevalcmp(&rtv, &atv, >=))
+ goto done;
+ ttv = atv;
+ timevalsub(&ttv, &rtv);
+ timo = ttv.tv_sec > 24 * 60 * 60 ?
+ 24 * 60 * 60 * hz : tvtohz(&ttv);
+ }
+ /*
+ * An event of interest may occur while we do not hold
+ * sellock, so check TDF_SELECT and the number of collisions
+ * and rescan the file descriptors if necessary.
+ */
+ mtx_lock_spin(&sched_lock);
+ if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
+ mtx_unlock_spin(&sched_lock);
+ goto retry;
+ }
+ mtx_unlock_spin(&sched_lock);
+
+ if (timo > 0)
+ error = cv_timedwait_sig(&selwait, &sellock, timo);
+ else
+ error = cv_wait_sig(&selwait, &sellock);
+
+ if (error == 0)
+ goto retry;
+
+done:
+ clear_selinfo_list(td);
+ mtx_lock_spin(&sched_lock);
+ td->td_flags &= ~TDF_SELECT;
+ mtx_unlock_spin(&sched_lock);
+ mtx_unlock(&sellock);
+
+done_nosellock:
+ /* poll is not restarted after signals... */
+ if (error == ERESTART)
+ error = EINTR;
+ if (error == EWOULDBLOCK)
+ error = 0;
+ if (error == 0) {
+ error = copyout(bits, SCARG(uap, fds), ni);
+ if (error)
+ goto out;
+ }
+out:
+ if (ni > sizeof(smallbits))
+ free(bits, M_TEMP);
+done2:
+ mtx_unlock(&Giant);
+ return (error);
+}
+
+static int
+pollscan(td, fds, nfd)
+ struct thread *td;
+ struct pollfd *fds;
+ u_int nfd;
+{
+ register struct filedesc *fdp = td->td_proc->p_fd;
+ int i;
+ struct file *fp;
+ int n = 0;
+
+ FILEDESC_LOCK(fdp);
+ for (i = 0; i < nfd; i++, fds++) {
+ if (fds->fd >= fdp->fd_nfiles) {
+ fds->revents = POLLNVAL;
+ n++;
+ } else if (fds->fd < 0) {
+ fds->revents = 0;
+ } else {
+ fp = fdp->fd_ofiles[fds->fd];
+ if (fp == NULL) {
+ fds->revents = POLLNVAL;
+ n++;
+ } else {
+ /*
+ * Note: backend also returns POLLHUP and
+ * POLLERR if appropriate.
+ */
+ fds->revents = fo_poll(fp, fds->events,
+ fp->f_cred, td);
+ if (fds->revents != 0)
+ n++;
+ }
+ }
+ }
+ FILEDESC_UNLOCK(fdp);
+ td->td_retval[0] = n;
+ return (0);
+}
+
+/*
+ * OpenBSD poll system call.
+ * XXX this isn't quite a true representation.. OpenBSD uses select ops.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct openbsd_poll_args {
+ struct pollfd *fds;
+ u_int nfds;
+ int timeout;
+};
+#endif
+/*
+ * MPSAFE
+ */
+int
+openbsd_poll(td, uap)
+ register struct thread *td;
+ register struct openbsd_poll_args *uap;
+{
+ return (poll(td, (struct poll_args *)uap));
+}
+
+/*
+ * Remove the references to the thread from all of the objects
+ * we were polling.
+ *
+ * This code assumes that the underlying owner of the selinfo
+ * structure will hold sellock before it changes it, and that
+ * it will unlink itself from our list if it goes away.
+ */
+void
+clear_selinfo_list(td)
+ struct thread *td;
+{
+ struct selinfo *si;
+
+ mtx_assert(&sellock, MA_OWNED);
+ TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
+ si->si_thread = NULL;
+ TAILQ_INIT(&td->td_selq);
+}
+
+/*ARGSUSED*/
+int
+seltrue(dev, events, td)
+ dev_t dev;
+ int events;
+ struct thread *td;
+{
+
+ return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
+}
+
+/*
+ * Record a select request.
+ */
+void
+selrecord(selector, sip)
+ struct thread *selector;
+ struct selinfo *sip;
+{
+
+ mtx_lock(&sellock);
+ /*
+ * If the thread is NULL then take ownership of selinfo
+ * however if the thread is not NULL and the thread points to
+ * someone else, then we have a collision, otherwise leave it alone
+ * as we've owned it in a previous selrecord on this selinfo.
+ */
+ if (sip->si_thread == NULL) {
+ sip->si_thread = selector;
+ TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
+ } else if (sip->si_thread != selector) {
+ sip->si_flags |= SI_COLL;
+ }
+
+ mtx_unlock(&sellock);
+}
+
+/*
+ * Do a wakeup when a selectable event occurs.
+ */
+void
+selwakeup(sip)
+ struct selinfo *sip;
+{
+ struct thread *td;
+
+ mtx_lock(&sellock);
+ td = sip->si_thread;
+ if ((sip->si_flags & SI_COLL) != 0) {
+ nselcoll++;
+ sip->si_flags &= ~SI_COLL;
+ cv_broadcast(&selwait);
+ }
+ if (td == NULL) {
+ mtx_unlock(&sellock);
+ return;
+ }
+ TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
+ sip->si_thread = NULL;
+ mtx_lock_spin(&sched_lock);
+ if (td->td_wchan == (caddr_t)&selwait) {
+ if (td->td_proc->p_stat == SSLEEP)
+ setrunnable(td);
+ else
+ cv_waitq_remove(td);
+ } else
+ td->td_flags &= ~TDF_SELECT;
+ mtx_unlock_spin(&sched_lock);
+ mtx_unlock(&sellock);
+}
+
+static void selectinit(void *);
+SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
+
+/* ARGSUSED*/
+static void
+selectinit(dummy)
+ void *dummy;
+{
+ cv_init(&selwait, "select");
+ mtx_init(&sellock, "sellck", NULL, MTX_DEF);
+}
OpenPOWER on IntegriCloud