summaryrefslogtreecommitdiffstats
path: root/sys/compat/linux/linux_epoll.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/compat/linux/linux_epoll.c')
-rw-r--r--sys/compat/linux/linux_epoll.c554
1 files changed, 554 insertions, 0 deletions
diff --git a/sys/compat/linux/linux_epoll.c b/sys/compat/linux/linux_epoll.c
new file mode 100644
index 0000000..b9e1f2b
--- /dev/null
+++ b/sys/compat/linux/linux_epoll.c
@@ -0,0 +1,554 @@
+/*-
+ * Copyright (c) 2007 Roman Divacky
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_ktrace.h"
+
+#include <sys/limits.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/capability.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/errno.h>
+#include <sys/event.h>
+#include <sys/proc.h>
+#include <sys/sysproto.h>
+#include <sys/syscallsubr.h>
+#include <sys/timespec.h>
+#include <compat/linux/linux_epoll.h>
+#include <compat/linux/linux_util.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#ifdef COMPAT_LINUX32
+#include <machine/../linux32/linux.h>
+#include <machine/../linux32/linux32_proto.h>
+#else
+#include <machine/../linux/linux.h>
+#include <machine/../linux/linux_proto.h>
+#endif
+
+#define ktrepoll_events(evt, count) \
+ ktrstruct("linux_epoll_event", (evt), count * sizeof(*evt))
+
+/*
+ * epoll defines 'struct epoll_event' with the field 'data' as 64 bits
+ * on all architectures. But on 32 bit architectures BSD 'struct kevent' only
+ * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied
+ * data verbatuim. Therefore on 32 bit architectures we allocate 64-bit memory
+ * block to pass user supplied data for every file descriptor.
+ */
+typedef uint64_t epoll_udata_t;
+#if defined(__i386__)
+#define EPOLL_WIDE_USER_DATA 1
+#else
+#define EPOLL_WIDE_USER_DATA 0
+#endif
+
+#if EPOLL_WIDE_USER_DATA
+
+/*
+ * Approach similar to epoll_user_data could also be used to
+ * keep track of event bits per file descriptor for all architectures.
+ * However, it isn't obvious that such tracking would be beneficial
+ * in practice.
+ */
+
+struct epoll_user_data {
+ unsigned sz;
+ epoll_udata_t data[1];
+};
+static MALLOC_DEFINE(M_LINUX_EPOLL, "epoll", "memory for epoll system");
+#define EPOLL_USER_DATA_SIZE(ndata) \
+ (sizeof(struct epoll_user_data)+((ndata)-1)*sizeof(epoll_udata_t))
+#define EPOLL_USER_DATA_MARGIN 16
+
+static void epoll_init_user_data(struct thread *td, struct file *epfp);
+static void epoll_set_user_data(struct thread *td, struct file *epfp, int fd, epoll_udata_t user_data);
+static epoll_udata_t epoll_get_user_data(struct thread *td, struct file *epfp, int fd);
+static fo_close_t epoll_close;
+
+/* overload kqueue fileops */
+static struct fileops epollops = {
+ .fo_read = kqueue_read,
+ .fo_write = kqueue_write,
+ .fo_truncate = kqueue_truncate,
+ .fo_ioctl = kqueue_ioctl,
+ .fo_poll = kqueue_poll,
+ .fo_kqfilter = kqueue_kqfilter,
+ .fo_stat = kqueue_stat,
+ .fo_close = epoll_close,
+ .fo_chmod = invfo_chmod,
+ .fo_chown = invfo_chown,
+ .fo_sendfile = invfo_sendfile,
+};
+#endif
+
+static struct file* epoll_fget(struct thread *td, int epfd);
+
+struct epoll_copyin_args {
+ struct kevent *changelist;
+};
+
+struct epoll_copyout_args {
+ struct linux_epoll_event *leventlist;
+ int count;
+ int error;
+#if KTRACE || EPOLL_WIDE_USER_DATA
+ struct thread *td;
+#endif
+#if EPOLL_WIDE_USER_DATA
+ struct file *epfp;
+#endif
+};
+
+
+/* Create a new epoll file descriptor. */
+
+static int
+linux_epoll_create_common(struct thread *td)
+{
+ struct file *fp;
+ int error;
+
+ error = kern_kqueue_locked(td, &fp);
+#if EPOLL_WIDE_USER_DATA
+ if (error == 0) {
+ epoll_init_user_data(td, fp);
+ fdrop(fp, td);
+ }
+#endif
+ return (error);
+}
+
+int
+linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
+{
+ if (args->size <= 0)
+ return (EINVAL);
+ /* args->size is unused. Linux just tests it
+ * and then forgets it as well. */
+
+ return (linux_epoll_create_common(td));
+}
+
+int
+linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
+{
+ int error;
+
+ error = linux_epoll_create_common(td);
+
+ if (!error) {
+ if (args->flags & LINUX_EPOLL_CLOEXEC)
+ td->td_proc->p_fd->fd_ofiles[td->td_retval[0]].fde_flags |= UF_EXCLOSE;
+ if (args->flags & LINUX_EPOLL_NONBLOCK)
+ linux_msg(td, "epoll_create1 doesn't yet support EPOLL_NONBLOCK flag\n");
+ }
+
+ return (error);
+}
+
+/* Structure converting function from epoll to kevent. */
+static int
+linux_epoll_to_kevent(struct thread *td,
+#if EPOLL_WIDE_USER_DATA
+ struct file *epfp,
+#endif
+ int fd, struct linux_epoll_event *l_event, int kev_flags, struct kevent *kevent, int *nkevents)
+{
+ /* flags related to how event is registered */
+ if (l_event->events & LINUX_EPOLLONESHOT)
+ kev_flags |= EV_ONESHOT;
+ if (l_event->events & LINUX_EPOLLET) {
+ kev_flags |= EV_CLEAR;
+ }
+
+ /* flags related to what event is registered */
+ if (l_event->events & LINUX_EPOLLIN ||
+ l_event->events & LINUX_EPOLLRDNORM ||
+ l_event->events & LINUX_EPOLLPRI ||
+ l_event->events & LINUX_EPOLLRDHUP) {
+ EV_SET(kevent++, fd, EVFILT_READ, kev_flags, 0, 0,
+ (void*)(EPOLL_WIDE_USER_DATA ? 0 : l_event->data));
+ ++*nkevents;
+ }
+ if (l_event->events & LINUX_EPOLLOUT ||
+ l_event->events & LINUX_EPOLLWRNORM) {
+ EV_SET(kevent++, fd, EVFILT_WRITE, kev_flags, 0, 0,
+ (void*)(EPOLL_WIDE_USER_DATA ? 0 : l_event->data));
+ ++*nkevents;
+ }
+ if (l_event->events & LINUX_EPOLLRDBAND ||
+ l_event->events & LINUX_EPOLLWRBAND ||
+ l_event->events & LINUX_EPOLLHUP ||
+ l_event->events & LINUX_EPOLLMSG ||
+ l_event->events & LINUX_EPOLLWAKEUP ||
+ l_event->events & LINUX_EPOLLERR) {
+ linux_msg(td, "epoll_ctl doesn't yet support some event flags supplied: 0x%x\n",
+ l_event->events);
+ return (EINVAL);
+ }
+
+#if EPOLL_WIDE_USER_DATA
+ epoll_set_user_data(td, epfp, fd, l_event->data);
+#endif
+ return (0);
+}
+
+/*
+ * Structure converting function from kevent to epoll. In a case
+ * this is called on error in registration we store the error in
+ * event->data and pick it up later in linux_epoll_ctl().
+ */
+static void
+linux_kevent_to_epoll(
+#if EPOLL_WIDE_USER_DATA
+ struct thread *td, struct file *epfp,
+#endif
+ struct kevent *kevent, struct linux_epoll_event *l_event)
+{
+ if ((kevent->flags & EV_ERROR) == 0)
+ switch (kevent->filter) {
+ case EVFILT_READ:
+ l_event->events = LINUX_EPOLLIN|LINUX_EPOLLRDNORM|LINUX_EPOLLPRI;
+ break;
+ case EVFILT_WRITE:
+ l_event->events = LINUX_EPOLLOUT|LINUX_EPOLLWRNORM;
+ break;
+ }
+#if EPOLL_WIDE_USER_DATA
+ l_event->data = epoll_get_user_data(td, epfp, kevent->ident);
+#else
+ l_event->data = (epoll_udata_t)kevent->udata;
+#endif
+}
+
+/*
+ * Copyout callback used by kevent. This converts kevent
+ * events to epoll events and copies them back to the
+ * userspace. This is also called on error on registering
+ * of the filter.
+ */
+static int
+epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
+{
+ struct epoll_copyout_args *args;
+ struct linux_epoll_event *eep;
+ int error, i;
+
+ args = (struct epoll_copyout_args*) arg;
+ eep = malloc(sizeof(*eep) * count, M_TEMP, M_WAITOK | M_ZERO);
+
+ for (i = 0; i < count; i++)
+ linux_kevent_to_epoll(
+#if EPOLL_WIDE_USER_DATA
+ args->td, args->epfp,
+#endif
+ &kevp[i], &eep[i]);
+
+ error = copyout(eep, args->leventlist, count * sizeof(*eep));
+ if (!error) {
+ args->leventlist += count;
+ args->count += count;
+ } else if (!args->error)
+ args->error = error;
+
+#ifdef KTRACE
+ if (KTRPOINT(args->td, KTR_STRUCT))
+ ktrepoll_events(eep, count);
+#endif
+
+ free(eep, M_TEMP);
+ return (error);
+}
+
+/*
+ * Copyin callback used by kevent. This copies already
+ * converted filters from kernel memory to the kevent
+ * internal kernel memory. Hence the memcpy instead of
+ * copyin.
+ */
+static int
+epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
+{
+ struct epoll_copyin_args *args;
+
+ args = (struct epoll_copyin_args*) arg;
+
+ memcpy(kevp, args->changelist, count * sizeof(*kevp));
+ args->changelist += count;
+
+ return (0);
+}
+
+static int
+ignore_enoent(int error) {
+ if (error == ENOENT)
+ error = 0;
+ return (error);
+}
+
+static int
+delete_event(struct thread *td, struct file *epfp, int fd, int filter)
+{
+ struct epoll_copyin_args ciargs;
+ struct kevent kev;
+ struct kevent_copyops k_ops = { &ciargs,
+ NULL,
+ epoll_kev_copyin};
+ ciargs.changelist = &kev;
+
+ EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0);
+ return (kern_kevent_locked(td, epfp, 1, 0, &k_ops, NULL));
+}
+
+static int
+delete_all_events(struct thread *td, struct file *epfp, int fd)
+{
+ /* here we ignore ENONT, because we don't keep track of events here */
+ int error1, error2;
+
+ error1 = ignore_enoent(delete_event(td, epfp, fd, EVFILT_READ));
+ error2 = ignore_enoent(delete_event(td, epfp, fd, EVFILT_WRITE));
+
+ /* report any errors we got */
+ if (error1)
+ return (error1);
+ if (error2)
+ return (error2);
+ return (0);
+}
+
+/*
+ * Load epoll filter, convert it to kevent filter
+ * and load it into kevent subsystem.
+ */
+int
+linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
+{
+ struct file *epfp;
+ struct epoll_copyin_args ciargs;
+ struct kevent kev[2];
+ struct kevent_copyops k_ops = { &ciargs,
+ NULL,
+ epoll_kev_copyin};
+ struct linux_epoll_event le;
+ int kev_flags;
+ int nchanges = 0;
+ int error;
+
+ if (args->epfd == args->fd)
+ return (EINVAL);
+
+ if (args->op != LINUX_EPOLL_CTL_DEL) {
+ error = copyin(args->event, &le, sizeof(le));
+ if (error)
+ return (error);
+ }
+#ifdef DEBUG
+ if (ldebug(epoll_ctl))
+ printf(ARGS(epoll_ctl,"%i, %i, %i, %u"), args->epfd, args->op,
+ args->fd, le.events);
+#endif
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_STRUCT) && args->op != LINUX_EPOLL_CTL_DEL)
+ ktrepoll_events(&le, 1);
+#endif
+ epfp = epoll_fget(td, args->epfd);
+
+ ciargs.changelist = kev;
+
+ switch (args->op) {
+ case LINUX_EPOLL_CTL_MOD:
+ /* we don't memorize which events were set for this FD
+ on this level, so just delete all we could have set:
+ EVFILT_READ and EVFILT_WRITE, ignoring any errors
+ */
+ error = delete_all_events(td, epfp, args->fd);
+ if (error)
+ goto leave;
+ /* FALLTHROUGH */
+ case LINUX_EPOLL_CTL_ADD:
+ kev_flags = EV_ADD | EV_ENABLE;
+ break;
+ case LINUX_EPOLL_CTL_DEL:
+ /* CTL_DEL means unregister this fd with this epoll */
+ error = delete_all_events(td, epfp, args->fd);
+ goto leave;
+ default:
+ error = EINVAL;
+ goto leave;
+ }
+
+ error = linux_epoll_to_kevent(td,
+#if EPOLL_WIDE_USER_DATA
+ epfp,
+#endif
+ args->fd, &le, kev_flags, kev, &nchanges);
+ if (error)
+ goto leave;
+
+ error = kern_kevent_locked(td, epfp, nchanges, 0, &k_ops, NULL);
+leave:
+ fdrop(epfp, td);
+ return (error);
+}
+
+/*
+ * Wait for a filter to be triggered on the epoll file descriptor. */
+int
+linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
+{
+ struct file *epfp;
+ struct timespec ts, *tsp;
+ struct epoll_copyout_args coargs;
+ struct kevent_copyops k_ops = { &coargs,
+ epoll_kev_copyout,
+ NULL};
+ int error;
+
+ if (args->maxevents <= 0 || args->maxevents > LINUX_MAX_EVENTS)
+ return (EINVAL);
+
+ epfp = epoll_fget(td, args->epfd);
+
+ coargs.leventlist = args->events;
+ coargs.count = 0;
+ coargs.error = 0;
+#if defined(KTRACE) || EPOLL_WIDE_USER_DATA
+ coargs.td = td;
+#endif
+#if EPOLL_WIDE_USER_DATA
+ coargs.epfp = epfp;
+#endif
+
+ if (args->timeout != -1) {
+ if (args->timeout < 0) {
+ error = EINVAL;
+ goto leave;
+ }
+ /* Convert from milliseconds to timespec. */
+ ts.tv_sec = args->timeout / 1000;
+ ts.tv_nsec = (args->timeout % 1000) * 1000000;
+ tsp = &ts;
+ } else {
+ tsp = NULL;
+ }
+
+ error = kern_kevent_locked(td, epfp, 0, args->maxevents, &k_ops, tsp);
+ if (!error && coargs.error)
+ error = coargs.error;
+
+ /*
+ * kern_keven might return ENOMEM which is not expected from epoll_wait.
+ * Maybe we should translate that but I don't think it matters at all.
+ */
+
+ if (!error)
+ td->td_retval[0] = coargs.count;
+leave:
+ fdrop(epfp, td);
+ return (error);
+}
+
+#if EPOLL_WIDE_USER_DATA
+/*
+ * we store user_data vector in an unused for kqueue descriptor
+ * field fvn_epollpriv in struct file.
+ */
+#define EPOLL_USER_DATA_GET(epfp) \
+ ((struct epoll_user_data*)(epfp)->f_vnun.fvn_epollpriv)
+#define EPOLL_USER_DATA_SET(epfp, udv) \
+ (epfp)->f_vnun.fvn_epollpriv = (udv)
+
+static void
+epoll_init_user_data(struct thread *td, struct file *epfp)
+{
+ struct epoll_user_data *udv;
+
+ /* override file ops to have our close operation */
+ atomic_store_rel_ptr((volatile uintptr_t *)&epfp->f_ops, (uintptr_t)&epollops);
+
+ /* allocate epoll_user_data initially for up to 16 file descriptor values */
+ udv = malloc(EPOLL_USER_DATA_SIZE(EPOLL_USER_DATA_MARGIN), M_LINUX_EPOLL, M_WAITOK);
+ udv->sz = EPOLL_USER_DATA_MARGIN;
+ EPOLL_USER_DATA_SET(epfp, udv);
+}
+
+static void
+epoll_set_user_data(struct thread *td, struct file *epfp, int fd, epoll_udata_t user_data)
+{
+ struct epoll_user_data *udv = EPOLL_USER_DATA_GET(epfp);
+
+ if (fd >= udv->sz) {
+ udv = realloc(udv, EPOLL_USER_DATA_SIZE(fd + EPOLL_USER_DATA_MARGIN), M_LINUX_EPOLL, M_WAITOK);
+ udv->sz = fd + EPOLL_USER_DATA_MARGIN;
+ EPOLL_USER_DATA_SET(epfp, udv);
+ }
+ udv->data[fd] = user_data;
+}
+
+static epoll_udata_t
+epoll_get_user_data(struct thread *td, struct file *epfp, int fd)
+{
+ struct epoll_user_data *udv = EPOLL_USER_DATA_GET(epfp);
+ if (fd >= udv->sz)
+ panic("epoll: user data vector is too small");
+
+ return (udv->data[fd]);
+}
+
+/*ARGSUSED*/
+static int
+epoll_close(struct file *epfp, struct thread *td)
+{
+ /* free user data vector */
+ free(EPOLL_USER_DATA_GET(epfp), M_LINUX_EPOLL);
+ /* over to kqueue parent */
+ return (kqueue_close(epfp, td));
+}
+#endif
+
+static struct file*
+epoll_fget(struct thread *td, int epfd)
+{
+ struct file *fp;
+ cap_rights_t rights;
+
+ if (fget(td, epfd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp) != 0)
+ panic("epoll: no file object found for kqueue descriptor");
+
+ return (fp);
+}
+
OpenPOWER on IntegriCloud