summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorrdivacky <rdivacky@FreeBSD.org>2013-09-18 17:56:04 +0000
committerrdivacky <rdivacky@FreeBSD.org>2013-09-18 17:56:04 +0000
commitd57db3eeadba8647b04947a065402b4fa3a37b23 (patch)
tree8a109929a3a40eade814da689824fd2945348482
parent2320759748604480072b729069f5e3012914398b (diff)
downloadFreeBSD-src-d57db3eeadba8647b04947a065402b4fa3a37b23.zip
FreeBSD-src-d57db3eeadba8647b04947a065402b4fa3a37b23.tar.gz
Implement epoll support in the Linuxulator. This is a tiny wrapper around kqueue
that implements a subset of epoll functionality. The kqueue user data is only 32 bits wide on i386, which is not enough for epoll user data, so this patch overrides the kqueue fileops to maintain enough space in struct file. Initial patch developed by me in 2007 and then extended and finished by Yuri Victorovich. Approved by: re (delphij) Sponsored by: Google Summer of Code Submitted by: Yuri Victorovich <yuri at rawbw dot com> Tested by: Yuri Victorovich <yuri at rawbw dot com>
-rw-r--r--sys/amd64/linux32/linux32_dummy.c4
-rw-r--r--sys/amd64/linux32/syscalls.master10
-rw-r--r--sys/compat/linux/linux_epoll.c554
-rw-r--r--sys/compat/linux/linux_epoll.h68
-rw-r--r--sys/conf/files.amd641
-rw-r--r--sys/conf/files.i3861
-rw-r--r--sys/conf/files.pc981
-rw-r--r--sys/i386/linux/linux_dummy.c4
-rw-r--r--sys/i386/linux/syscalls.master10
-rw-r--r--sys/kern/kern_event.c123
-rw-r--r--sys/modules/linux/Makefile2
-rw-r--r--sys/sys/event.h18
-rw-r--r--sys/sys/file.h2
-rw-r--r--sys/sys/syscallsubr.h7
14 files changed, 736 insertions, 69 deletions
diff --git a/sys/amd64/linux32/linux32_dummy.c b/sys/amd64/linux32/linux32_dummy.c
index 95bf3ec..1ae64bb 100644
--- a/sys/amd64/linux32/linux32_dummy.c
+++ b/sys/amd64/linux32/linux32_dummy.c
@@ -70,9 +70,6 @@ DUMMY(pivot_root);
DUMMY(mincore);
DUMMY(ptrace);
DUMMY(lookup_dcookie);
-DUMMY(epoll_create);
-DUMMY(epoll_ctl);
-DUMMY(epoll_wait);
DUMMY(remap_file_pages);
DUMMY(timer_create);
DUMMY(timer_settime);
@@ -129,7 +126,6 @@ DUMMY(timerfd_gettime);
/* linux 2.6.27: */
DUMMY(signalfd4);
DUMMY(eventfd2);
-DUMMY(epoll_create1);
DUMMY(dup3);
DUMMY(inotify_init1);
/* linux 2.6.30: */
diff --git a/sys/amd64/linux32/syscalls.master b/sys/amd64/linux32/syscalls.master
index c3a10af..b9a0829 100644
--- a/sys/amd64/linux32/syscalls.master
+++ b/sys/amd64/linux32/syscalls.master
@@ -430,9 +430,11 @@
251 AUE_NULL UNIMPL
252 AUE_EXIT STD { int linux_exit_group(int error_code); }
253 AUE_NULL STD { int linux_lookup_dcookie(void); }
-254 AUE_NULL STD { int linux_epoll_create(void); }
-255 AUE_NULL STD { int linux_epoll_ctl(void); }
-256 AUE_NULL STD { int linux_epoll_wait(void); }
+254 AUE_NULL STD { int linux_epoll_create(l_int size); }
+255 AUE_NULL STD { int linux_epoll_ctl(l_int epfd, l_int op, l_int fd, \
+ struct linux_epoll_event *event); }
+256 AUE_NULL STD { int linux_epoll_wait(l_int epfd, struct linux_epoll_event *events, \
+ l_int maxevents, l_int timeout); }
257 AUE_NULL STD { int linux_remap_file_pages(void); }
258 AUE_NULL STD { int linux_set_tid_address(int *tidptr); }
259 AUE_NULL STD { int linux_timer_create(void); }
@@ -534,7 +536,7 @@
; linux 2.6.27:
327 AUE_NULL STD { int linux_signalfd4(void); }
328 AUE_NULL STD { int linux_eventfd2(void); }
-329 AUE_NULL STD { int linux_epoll_create1(void); }
+329 AUE_NULL STD { int linux_epoll_create1(l_int flags); }
330 AUE_NULL STD { int linux_dup3(void); }
331 AUE_NULL STD { int linux_pipe2(l_int *pipefds, l_int flags); }
332 AUE_NULL STD { int linux_inotify_init1(void); }
diff --git a/sys/compat/linux/linux_epoll.c b/sys/compat/linux/linux_epoll.c
new file mode 100644
index 0000000..b9e1f2b
--- /dev/null
+++ b/sys/compat/linux/linux_epoll.c
@@ -0,0 +1,554 @@
+/*-
+ * Copyright (c) 2007 Roman Divacky
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_ktrace.h"
+
+#include <sys/limits.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/capability.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/errno.h>
+#include <sys/event.h>
+#include <sys/proc.h>
+#include <sys/sysproto.h>
+#include <sys/syscallsubr.h>
+#include <sys/timespec.h>
+#include <compat/linux/linux_epoll.h>
+#include <compat/linux/linux_util.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#ifdef COMPAT_LINUX32
+#include <machine/../linux32/linux.h>
+#include <machine/../linux32/linux32_proto.h>
+#else
+#include <machine/../linux/linux.h>
+#include <machine/../linux/linux_proto.h>
+#endif
+
+/*
+ * Hand `count' linux_epoll_event entries starting at `evt' to ktrstruct()
+ * under the name "linux_epoll_event".  Both arguments are parenthesized in
+ * the expansion so that expression arguments (e.g. `n + 1') keep their
+ * intended precedence.
+ */
+#define	ktrepoll_events(evt, count) \
+	ktrstruct("linux_epoll_event", (evt), (count) * sizeof(*(evt)))
+
+/*
+ * epoll defines 'struct epoll_event' with the field 'data' as 64 bits
+ * on all architectures. But on 32 bit architectures the BSD 'struct kevent'
+ * only has a 32 bit opaque pointer as its 'udata' field, so we can't pass
+ * epoll supplied data verbatim. Therefore on 32 bit architectures we keep
+ * a separate 64-bit slot per file descriptor to hold the user supplied data.
+ *
+ * epoll_udata_t is the 64-bit type used for that data.
+ * EPOLL_WIDE_USER_DATA is 1 when the side storage is compiled in (i386)
+ * and 0 otherwise; it is always defined, so it can be tested with #if
+ * as well as inside C expressions.
+ */
+typedef uint64_t epoll_udata_t;
+#if defined(__i386__)
+#define EPOLL_WIDE_USER_DATA 1
+#else
+#define EPOLL_WIDE_USER_DATA 0
+#endif
+
+#if EPOLL_WIDE_USER_DATA
+
+/*
+ * Approach similar to epoll_user_data could also be used to
+ * keep track of event bits per file descriptor for all architectures.
+ * However, it isn't obvious that such tracking would be beneficial
+ * in practice.
+ *
+ * struct epoll_user_data is a growable vector of 64-bit user data values
+ * indexed by file descriptor number.  It is declared with a one-element
+ * array and allocated with EPOLL_USER_DATA_SIZE(n) bytes so that
+ * data[0] .. data[n-1] are usable.
+ */
+
+struct epoll_user_data {
+ unsigned sz; /* number of usable entries in data[] */
+ epoll_udata_t data[1]; /* user data, indexed by descriptor number */
+};
+static MALLOC_DEFINE(M_LINUX_EPOLL, "epoll", "memory for epoll system");
+#define EPOLL_USER_DATA_SIZE(ndata) \
+ (sizeof(struct epoll_user_data)+((ndata)-1)*sizeof(epoll_udata_t))
+#define EPOLL_USER_DATA_MARGIN 16 /* initial size and growth slack, in entries */
+
+static void epoll_init_user_data(struct thread *td, struct file *epfp);
+static void epoll_set_user_data(struct thread *td, struct file *epfp, int fd, epoll_udata_t user_data);
+static epoll_udata_t epoll_get_user_data(struct thread *td, struct file *epfp, int fd);
+static fo_close_t epoll_close;
+
+/*
+ * overload kqueue fileops: every operation is forwarded to the stock
+ * kqueue implementation except close, which goes through epoll_close
+ * so the user data vector attached to the file is freed first.
+ */
+static struct fileops epollops = {
+ .fo_read = kqueue_read,
+ .fo_write = kqueue_write,
+ .fo_truncate = kqueue_truncate,
+ .fo_ioctl = kqueue_ioctl,
+ .fo_poll = kqueue_poll,
+ .fo_kqfilter = kqueue_kqfilter,
+ .fo_stat = kqueue_stat,
+ .fo_close = epoll_close, /* the only epoll-specific operation */
+ .fo_chmod = invfo_chmod,
+ .fo_chown = invfo_chown,
+ .fo_sendfile = invfo_sendfile,
+};
+#endif
+
+static struct file* epoll_fget(struct thread *td, int epfd);
+
+struct epoll_copyin_args {
+ struct kevent *changelist; /* next kernel-space kevent for epoll_kev_copyin() to copy */
+};
+
+/*
+ * State shared between linux_epoll_wait() and its copyout callback
+ * epoll_kev_copyout().
+ *
+ * The presence of `td' is keyed on defined(KTRACE), the same test that
+ * guards its initialization in linux_epoll_wait() and its use in the
+ * callback, so declaration and use can never disagree.
+ */
+struct epoll_copyout_args {
+	struct linux_epoll_event *leventlist;	/* next free user-space slot */
+	int count;				/* events copied out so far */
+	int error;				/* first copyout() error, or 0 */
+#if defined(KTRACE) || EPOLL_WIDE_USER_DATA
+	struct thread *td;			/* calling thread */
+#endif
+#if EPOLL_WIDE_USER_DATA
+	struct file *epfp;			/* epoll file holding the user data */
+#endif
+};
+
+
+/*
+ * Create a new epoll file descriptor.
+ *
+ * The descriptor is an ordinary kqueue created by kern_kqueue_locked(),
+ * which stores the new descriptor number in td->td_retval[0] and hands the
+ * file back with a reference held.  When EPOLL_WIDE_USER_DATA is enabled
+ * the file is additionally turned into an epoll file by
+ * epoll_init_user_data().
+ *
+ * Returns 0 on success or the error from kern_kqueue_locked().
+ */
+static int
+linux_epoll_create_common(struct thread *td)
+{
+	struct file *fp;
+	int error;
+
+	error = kern_kqueue_locked(td, &fp);
+	if (error != 0)
+		return (error);
+#if EPOLL_WIDE_USER_DATA
+	epoll_init_user_data(td, fp);
+#endif
+	/*
+	 * Release the reference returned by kern_kqueue_locked() on every
+	 * architecture, not only when the user data vector is in use;
+	 * otherwise the file would stay referenced forever.
+	 */
+	fdrop(fp, td);
+	return (0);
+}
+
+/*
+ * epoll_create(2) entry point.  The `size' argument only has to be
+ * positive; apart from that check it is unused here, just as Linux tests
+ * it and then forgets it.
+ */
+int
+linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
+{
+	/* A non-positive size is the only thing rejected at this level. */
+	return (args->size > 0 ? linux_epoll_create_common(td) : EINVAL);
+}
+
+/*
+ * epoll_create1(2) entry point.
+ *
+ * Flag bits other than LINUX_EPOLL_CLOEXEC and LINUX_EPOLL_NONBLOCK are
+ * rejected with EINVAL before any descriptor is allocated, so a failed
+ * call leaves nothing behind.  LINUX_EPOLL_CLOEXEC marks the new
+ * descriptor close-on-exec; LINUX_EPOLL_NONBLOCK is accepted but only
+ * logged.
+ *
+ * Returns 0 with the descriptor in td->td_retval[0], or an errno value.
+ */
+int
+linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
+{
+	struct filedesc *fdp;
+	int error;
+
+	if ((args->flags & ~(LINUX_EPOLL_CLOEXEC | LINUX_EPOLL_NONBLOCK)) != 0)
+		return (EINVAL);
+
+	error = linux_epoll_create_common(td);
+	if (error != 0)
+		return (error);
+
+	if (args->flags & LINUX_EPOLL_CLOEXEC) {
+		/*
+		 * The descriptor is already visible to other threads of the
+		 * process, so its table entry is only touched with the
+		 * descriptor table lock held.
+		 */
+		fdp = td->td_proc->p_fd;
+		FILEDESC_XLOCK(fdp);
+		fdp->fd_ofiles[td->td_retval[0]].fde_flags |= UF_EXCLOSE;
+		FILEDESC_XUNLOCK(fdp);
+	}
+	if (args->flags & LINUX_EPOLL_NONBLOCK)
+		linux_msg(td, "epoll_create1 doesn't yet support EPOLL_NONBLOCK flag\n");
+
+	return (0);
+}
+
+/*
+ * Structure converting function from epoll to kevent.
+ *
+ * Translates the event mask in l_event into at most two kevents written
+ * to the array at `kevent': one EVFILT_READ entry when any of EPOLLIN,
+ * EPOLLRDNORM, EPOLLPRI or EPOLLRDHUP is requested, and one EVFILT_WRITE
+ * entry when EPOLLOUT or EPOLLWRNORM is requested.  *nkevents is
+ * incremented once per entry written.  kev_flags supplies the base kevent
+ * flags, to which EV_ONESHOT (EPOLLONESHOT) and EV_CLEAR (EPOLLET) are
+ * added.
+ *
+ * When EPOLL_WIDE_USER_DATA is enabled the kevent udata is left 0 and the
+ * 64-bit user data is recorded per descriptor through
+ * epoll_set_user_data(); otherwise the user data travels in udata.
+ *
+ * Returns 0, or EINVAL when an unsupported event bit is present.  Note
+ * that in the EINVAL case the kevent array and *nkevents may already have
+ * been written.
+ */
+static int
+linux_epoll_to_kevent(struct thread *td,
+#if EPOLL_WIDE_USER_DATA
+ struct file *epfp,
+#endif
+ int fd, struct linux_epoll_event *l_event, int kev_flags, struct kevent *kevent, int *nkevents)
+{
+ /* flags related to how event is registered */
+ if (l_event->events & LINUX_EPOLLONESHOT)
+ kev_flags |= EV_ONESHOT;
+ if (l_event->events & LINUX_EPOLLET) {
+ kev_flags |= EV_CLEAR;
+ }
+
+ /* flags related to what event is registered */
+ if (l_event->events & LINUX_EPOLLIN ||
+ l_event->events & LINUX_EPOLLRDNORM ||
+ l_event->events & LINUX_EPOLLPRI ||
+ l_event->events & LINUX_EPOLLRDHUP) {
+ EV_SET(kevent++, fd, EVFILT_READ, kev_flags, 0, 0,
+ (void*)(EPOLL_WIDE_USER_DATA ? 0 : l_event->data));
+ ++*nkevents;
+ }
+ if (l_event->events & LINUX_EPOLLOUT ||
+ l_event->events & LINUX_EPOLLWRNORM) {
+ EV_SET(kevent++, fd, EVFILT_WRITE, kev_flags, 0, 0,
+ (void*)(EPOLL_WIDE_USER_DATA ? 0 : l_event->data));
+ ++*nkevents;
+ }
+ /* everything below has no kevent counterpart here and is refused */
+ if (l_event->events & LINUX_EPOLLRDBAND ||
+ l_event->events & LINUX_EPOLLWRBAND ||
+ l_event->events & LINUX_EPOLLHUP ||
+ l_event->events & LINUX_EPOLLMSG ||
+ l_event->events & LINUX_EPOLLWAKEUP ||
+ l_event->events & LINUX_EPOLLERR) {
+ linux_msg(td, "epoll_ctl doesn't yet support some event flags supplied: 0x%x\n",
+ l_event->events);
+ return (EINVAL);
+ }
+
+#if EPOLL_WIDE_USER_DATA
+ epoll_set_user_data(td, epfp, fd, l_event->data); /* remembered per fd, looked up again on delivery */
+#endif
+ return (0);
+}
+
+/*
+ * Structure converting function from kevent to epoll.
+ *
+ * For a kevent without EV_ERROR the filter is mapped to an epoll event
+ * mask: EVFILT_READ becomes EPOLLIN|EPOLLRDNORM|EPOLLPRI and EVFILT_WRITE
+ * becomes EPOLLOUT|EPOLLWRNORM; any other filter leaves l_event->events
+ * untouched.  When EV_ERROR is set, l_event->events is not written at all.
+ * In every case l_event->data is filled with the user data registered for
+ * the descriptor (from the per-descriptor vector when
+ * EPOLL_WIDE_USER_DATA is enabled, from kevent->udata otherwise).
+ *
+ * NOTE(review): an earlier version of this comment said the registration
+ * error is stored in event->data and picked up in linux_epoll_ctl(); the
+ * code below does not do that.
+ */
+static void
+linux_kevent_to_epoll(
+#if EPOLL_WIDE_USER_DATA
+ struct thread *td, struct file *epfp,
+#endif
+ struct kevent *kevent, struct linux_epoll_event *l_event)
+{
+ if ((kevent->flags & EV_ERROR) == 0)
+ switch (kevent->filter) {
+ case EVFILT_READ:
+ l_event->events = LINUX_EPOLLIN|LINUX_EPOLLRDNORM|LINUX_EPOLLPRI;
+ break;
+ case EVFILT_WRITE:
+ l_event->events = LINUX_EPOLLOUT|LINUX_EPOLLWRNORM;
+ break;
+ }
+#if EPOLL_WIDE_USER_DATA
+ l_event->data = epoll_get_user_data(td, epfp, kevent->ident);
+#else
+ l_event->data = (epoll_udata_t)kevent->udata;
+#endif
+}
+
+/*
+ * Copyout callback used by kevent. This converts kevent
+ * events to epoll events and copies them back to the
+ * userspace. This is also called on error on registering
+ * of the filter.
+ *
+ * `arg' is the struct epoll_copyout_args set up by linux_epoll_wait().
+ * The `count' kevents at `kevp' are converted into a zeroed temporary
+ * array, which is then copied to args->leventlist.  On success the
+ * destination pointer and args->count advance by `count'; on failure the
+ * first copyout() error is remembered in args->error.  The converted
+ * events are passed to ktrace when KTRACE is enabled, whether or not the
+ * copyout succeeded.
+ *
+ * Returns the copyout() result.
+ */
+static int
+epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
+{
+ struct epoll_copyout_args *args;
+ struct linux_epoll_event *eep;
+ int error, i;
+
+ args = (struct epoll_copyout_args*) arg;
+ eep = malloc(sizeof(*eep) * count, M_TEMP, M_WAITOK | M_ZERO); /* zeroed: fields the converter skips stay 0 */
+
+ for (i = 0; i < count; i++)
+ linux_kevent_to_epoll(
+#if EPOLL_WIDE_USER_DATA
+ args->td, args->epfp,
+#endif
+ &kevp[i], &eep[i]);
+
+ error = copyout(eep, args->leventlist, count * sizeof(*eep));
+ if (!error) {
+ args->leventlist += count;
+ args->count += count;
+ } else if (!args->error)
+ args->error = error; /* keep only the first failure */
+
+#ifdef KTRACE
+ if (KTRPOINT(args->td, KTR_STRUCT))
+ ktrepoll_events(eep, count);
+#endif
+
+ free(eep, M_TEMP);
+ return (error);
+}
+
+/*
+ * Copyin callback handed to the kevent layer.  The change list was
+ * already built in kernel memory by the epoll code, so "copying in" is a
+ * plain memcpy() from args->changelist rather than a copyin() from
+ * userspace.  The source pointer is advanced past the entries consumed so
+ * that a later call continues where this one stopped.  Always returns 0.
+ */
+static int
+epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
+{
+	struct epoll_copyin_args *cia = arg;
+	struct kevent *src = cia->changelist;
+
+	memcpy(kevp, src, count * sizeof(*kevp));
+	cia->changelist = src + count;
+
+	return (0);
+}
+
+/*
+ * Map ENOENT to success and pass every other value (including 0) through
+ * unchanged.
+ */
+static int
+ignore_enoent(int error)
+{
+	return (error == ENOENT ? 0 : error);
+}
+
+/*
+ * Submit a single EV_DELETE | EV_DISABLE change for (fd, filter) to the
+ * kqueue behind `epfp'.  The change is fed to the kevent layer through
+ * epoll_kev_copyin(); no events are requested back.  Returns the result
+ * of kern_kevent_locked().
+ */
+static int
+delete_event(struct thread *td, struct file *epfp, int fd, int filter)
+{
+	struct kevent change;
+	struct epoll_copyin_args cia = { &change };
+	struct kevent_copyops ops = { &cia, NULL, epoll_kev_copyin };
+
+	EV_SET(&change, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0);
+
+	return (kern_kevent_locked(td, epfp, 1, 0, &ops, NULL));
+}
+
+/*
+ * Remove both filters this file can have registered for `fd'
+ * (EVFILT_READ and EVFILT_WRITE).  ENOENT is ignored because we do not
+ * keep track of which of the two was actually registered.  Both deletions
+ * are always attempted; the read-side error takes precedence when both
+ * fail.
+ */
+static int
+delete_all_events(struct thread *td, struct file *epfp, int fd)
+{
+	int rd_error, wr_error;
+
+	rd_error = ignore_enoent(delete_event(td, epfp, fd, EVFILT_READ));
+	wr_error = ignore_enoent(delete_event(td, epfp, fd, EVFILT_WRITE));
+
+	return (rd_error != 0 ? rd_error : wr_error);
+}
+
+/*
+ * Load epoll filter, convert it to kevent filter
+ * and load it into kevent subsystem.
+ *
+ * Returns 0 on success or an errno value: EINVAL when epfd == fd, when
+ * the operation is unknown or when the event mask contains unsupported
+ * bits; EBADF when either descriptor cannot be obtained; otherwise
+ * whatever copyin() or the kevent layer reports.
+ */
+int
+linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
+{
+	struct file *epfp, *fp;
+	struct epoll_copyin_args ciargs;
+	struct kevent kev[2];
+	struct kevent_copyops k_ops = { &ciargs,
+					NULL,
+					epoll_kev_copyin};
+	struct linux_epoll_event le;
+	cap_rights_t rights;
+	int kev_flags;
+	int nchanges = 0;
+	int error;
+
+	if (args->epfd == args->fd)
+		return (EINVAL);
+
+	/* Keep `le' defined for CTL_DEL too, where nothing is copied in. */
+	memset(&le, 0, sizeof(le));
+	if (args->op != LINUX_EPOLL_CTL_DEL) {
+		error = copyin(args->event, &le, sizeof(le));
+		if (error)
+			return (error);
+	}
+#ifdef DEBUG
+	if (ldebug(epoll_ctl))
+		printf(ARGS(epoll_ctl,"%i, %i, %i, %u"), args->epfd, args->op,
+			args->fd, le.events);
+#endif
+#ifdef KTRACE
+	if (KTRPOINT(td, KTR_STRUCT) && args->op != LINUX_EPOLL_CTL_DEL)
+		ktrepoll_events(&le, 1);
+#endif
+	/* A bad epoll descriptor is a caller error, not a kernel bug. */
+	epfp = epoll_fget(td, args->epfd);
+	if (epfp == NULL)
+		return (EBADF);
+
+	/*
+	 * Make sure the target descriptor is open before it is used for
+	 * anything.  With EPOLL_WIDE_USER_DATA the descriptor number indexes
+	 * the user data vector, so an unchecked negative or huge value
+	 * coming from userland must never get that far.
+	 */
+	error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT),
+	    &fp);
+	if (error != 0)
+		goto leave;
+	fdrop(fp, td);
+
+	ciargs.changelist = kev;
+
+	switch (args->op) {
+	case LINUX_EPOLL_CTL_MOD:
+		/*
+		 * We don't memorize which events were set for this FD
+		 * on this level, so just delete all we could have set:
+		 * EVFILT_READ and EVFILT_WRITE (a filter that was never
+		 * registered is not an error), then register anew.
+		 */
+		error = delete_all_events(td, epfp, args->fd);
+		if (error)
+			goto leave;
+		/* FALLTHROUGH */
+	case LINUX_EPOLL_CTL_ADD:
+		kev_flags = EV_ADD | EV_ENABLE;
+		break;
+	case LINUX_EPOLL_CTL_DEL:
+		/* CTL_DEL means unregister this fd with this epoll */
+		error = delete_all_events(td, epfp, args->fd);
+		goto leave;
+	default:
+		error = EINVAL;
+		goto leave;
+	}
+
+	error = linux_epoll_to_kevent(td,
+#if EPOLL_WIDE_USER_DATA
+		epfp,
+#endif
+		args->fd, &le, kev_flags, kev, &nchanges);
+	if (error)
+		goto leave;
+
+	error = kern_kevent_locked(td, epfp, nchanges, 0, &k_ops, NULL);
+leave:
+	fdrop(epfp, td);
+	return (error);
+}
+
+/*
+ * Wait for a filter to be triggered on the epoll file descriptor.
+ *
+ * maxevents must be in the range 1..LINUX_MAX_EVENTS.  A timeout of -1
+ * is passed to the kevent layer as "no timeout"; any other negative
+ * value is rejected with EINVAL; a non-negative value is a number of
+ * milliseconds.  On success the number of events copied out is returned
+ * in td->td_retval[0].
+ */
+int
+linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
+{
+	struct file *epfp;
+	struct timespec ts, *tsp;
+	struct epoll_copyout_args coargs;
+	struct kevent_copyops k_ops = { &coargs,
+					epoll_kev_copyout,
+					NULL};
+	int error;
+
+	if (args->maxevents <= 0 || args->maxevents > LINUX_MAX_EVENTS)
+		return (EINVAL);
+
+	/* A bad epoll descriptor is a caller error, not a kernel bug. */
+	epfp = epoll_fget(td, args->epfd);
+	if (epfp == NULL)
+		return (EBADF);
+
+	coargs.leventlist = args->events;
+	coargs.count = 0;
+	coargs.error = 0;
+#if defined(KTRACE) || EPOLL_WIDE_USER_DATA
+	coargs.td = td;
+#endif
+#if EPOLL_WIDE_USER_DATA
+	coargs.epfp = epfp;
+#endif
+
+	if (args->timeout != -1) {
+		if (args->timeout < 0) {
+			error = EINVAL;
+			goto leave;
+		}
+		/* Convert from milliseconds to timespec. */
+		ts.tv_sec = args->timeout / 1000;
+		ts.tv_nsec = (args->timeout % 1000) * 1000000;
+		tsp = &ts;
+	} else {
+		tsp = NULL;
+	}
+
+	error = kern_kevent_locked(td, epfp, 0, args->maxevents, &k_ops, tsp);
+	if (!error && coargs.error)
+		error = coargs.error;
+
+	/*
+	 * kern_kevent might return ENOMEM which is not expected from
+	 * epoll_wait.  Maybe we should translate that but I don't think it
+	 * matters at all.
+	 */
+
+	if (!error)
+		td->td_retval[0] = coargs.count;
+leave:
+	fdrop(epfp, td);
+	return (error);
+}
+
+#if EPOLL_WIDE_USER_DATA
+/*
+ * We store the user_data vector in fvn_epollpriv, a field of struct file
+ * that is otherwise unused for kqueue descriptors.
+ *
+ * NOTE(review): nothing in this file serializes access to the vector;
+ * whether concurrent epoll_ctl()/epoll_wait() calls on one epoll
+ * descriptor are excluded elsewhere should be confirmed.
+ */
+#define EPOLL_USER_DATA_GET(epfp) \
+	((struct epoll_user_data*)(epfp)->f_vnun.fvn_epollpriv)
+#define EPOLL_USER_DATA_SET(epfp, udv) \
+	((epfp)->f_vnun.fvn_epollpriv = (udv))
+
+/*
+ * Turn a freshly created kqueue file into an epoll file: attach an
+ * initial user data vector with room for EPOLL_USER_DATA_MARGIN
+ * descriptors and switch the file over to epollops.
+ */
+static void
+epoll_init_user_data(struct thread *td, struct file *epfp)
+{
+	struct epoll_user_data *udv;
+
+	/* allocate epoll_user_data initially for up to 16 file descriptor values */
+	udv = malloc(EPOLL_USER_DATA_SIZE(EPOLL_USER_DATA_MARGIN), M_LINUX_EPOLL, M_WAITOK);
+	udv->sz = EPOLL_USER_DATA_MARGIN;
+	EPOLL_USER_DATA_SET(epfp, udv);
+
+	/*
+	 * Override file ops to have our close operation.  This is done
+	 * last, with release semantics, because epoll_fget() treats
+	 * f_ops == &epollops as proof that the vector above exists.
+	 */
+	atomic_store_rel_ptr((volatile uintptr_t *)&epfp->f_ops, (uintptr_t)&epollops);
+}
+
+/*
+ * Record `user_data' for descriptor `fd', growing the vector to
+ * fd + EPOLL_USER_DATA_MARGIN entries when fd lies beyond its current
+ * end.  Callers are expected to pass a validated descriptor number; a
+ * negative value is ignored rather than used as an index.
+ */
+static void
+epoll_set_user_data(struct thread *td, struct file *epfp, int fd, epoll_udata_t user_data)
+{
+	struct epoll_user_data *udv = EPOLL_USER_DATA_GET(epfp);
+
+	/* A negative index would address memory in front of the vector. */
+	if (fd < 0)
+		return;
+
+	if ((unsigned)fd >= udv->sz) {
+		udv = realloc(udv, EPOLL_USER_DATA_SIZE(fd + EPOLL_USER_DATA_MARGIN), M_LINUX_EPOLL, M_WAITOK);
+		udv->sz = fd + EPOLL_USER_DATA_MARGIN;
+		EPOLL_USER_DATA_SET(epfp, udv);
+	}
+	udv->data[fd] = user_data;
+}
+
+/*
+ * Return the user data recorded for descriptor `fd'.  Panics when fd is
+ * outside the vector, which would mean an event was delivered for a
+ * descriptor that was never registered through epoll_set_user_data().
+ */
+static epoll_udata_t
+epoll_get_user_data(struct thread *td, struct file *epfp, int fd)
+{
+	struct epoll_user_data *udv = EPOLL_USER_DATA_GET(epfp);
+
+	if (fd < 0 || (unsigned)fd >= udv->sz)
+		panic("epoll: user data vector is too small");
+
+	return (udv->data[fd]);
+}
+
+/*
+ * fo_close for epoll files: release the user data vector, then let the
+ * regular kqueue close routine do the rest and return its result.
+ */
+/*ARGSUSED*/
+static int
+epoll_close(struct file *epfp, struct thread *td)
+{
+	/* free user data vector */
+	free(EPOLL_USER_DATA_GET(epfp), M_LINUX_EPOLL);
+	/* over to kqueue parent */
+	return (kqueue_close(epfp, td));
+}
+#endif
+
+/*
+ * Look up the file behind epoll descriptor `epfd'.
+ *
+ * Returns the file with a reference held (to be released with fdrop()),
+ * or NULL when the descriptor cannot be obtained.  The descriptor number
+ * comes straight from userland, so failure is an ordinary error that the
+ * callers report as EBADF.  With EPOLL_WIDE_USER_DATA enabled a file that
+ * was not set up by epoll_init_user_data() is refused as well, because
+ * its f_vnun field does not hold a user data vector.
+ */
+static struct file*
+epoll_fget(struct thread *td, int epfd)
+{
+	struct file *fp;
+	cap_rights_t rights;
+
+	if (fget(td, epfd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp) != 0)
+		return (NULL);
+
+#if EPOLL_WIDE_USER_DATA
+	if (fp->f_ops != &epollops) {
+		fdrop(fp, td);
+		return (NULL);
+	}
+#endif
+
+	return (fp);
+}
+
diff --git a/sys/compat/linux/linux_epoll.h b/sys/compat/linux/linux_epoll.h
new file mode 100644
index 0000000..aea4185
--- /dev/null
+++ b/sys/compat/linux/linux_epoll.h
@@ -0,0 +1,68 @@
+/*-
+ * Copyright (c) 2007 Roman Divacky
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _LINUX_EPOLL_H_
+#define _LINUX_EPOLL_H_
+
+#ifdef __amd64__
+#define EPOLL_PACKED __packed /* NOTE(review): presumably keeps the layout used by 32-bit Linux binaries when compiled for an amd64 kernel -- confirm */
+#else
+#define EPOLL_PACKED
+#endif
+
+struct linux_epoll_event {
+ uint32_t events; /* bitmask of LINUX_EPOLL* event flags */
+ uint64_t data; /* opaque user data, handed back with each event */
+} EPOLL_PACKED;
+
+/* Event bits carried in linux_epoll_event.events. */
+#define LINUX_EPOLLIN		0x001
+#define LINUX_EPOLLPRI		0x002
+#define LINUX_EPOLLOUT		0x004
+#define LINUX_EPOLLRDNORM	0x040
+#define LINUX_EPOLLRDBAND	0x080
+#define LINUX_EPOLLWRNORM	0x100
+#define LINUX_EPOLLWRBAND	0x200
+#define LINUX_EPOLLMSG		0x400
+#define LINUX_EPOLLERR		0x008
+#define LINUX_EPOLLHUP		0x010
+#define LINUX_EPOLLRDHUP	0x2000
+/*
+ * The shifted values are parenthesized so that they stay intact inside
+ * larger expressions such as `~LINUX_EPOLLET' or `LINUX_EPOLLONESHOT / 2'.
+ */
+#define LINUX_EPOLLWAKEUP	(1u<<29)
+#define LINUX_EPOLLONESHOT	(1u<<30)
+#define LINUX_EPOLLET		(1u<<31)
+
+#define LINUX_EPOLL_CTL_ADD 1 /* `op' values understood by linux_epoll_ctl() */
+#define LINUX_EPOLL_CTL_DEL 2
+#define LINUX_EPOLL_CTL_MOD 3
+
+#define LINUX_EPOLL_CLOEXEC 02000000 /* octal; flag bits tested by linux_epoll_create1() */
+#define LINUX_EPOLL_NONBLOCK 00004000 /* octal */
+
+#define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct linux_epoll_event)) /* upper bound on maxevents in linux_epoll_wait() */
+
+#endif /* !_LINUX_EPOLL_H_ */
+
diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index c1647d3..babfcab 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -467,6 +467,7 @@ amd64/linux32/linux32_support.s optional compat_linux32 \
dependency "linux32_assym.h"
amd64/linux32/linux32_sysent.c optional compat_linux32
amd64/linux32/linux32_sysvec.c optional compat_linux32
+compat/linux/linux_epoll.c optional compat_linux32
compat/linux/linux_emul.c optional compat_linux32
compat/linux/linux_file.c optional compat_linux32
compat/linux/linux_fork.c optional compat_linux32
diff --git a/sys/conf/files.i386 b/sys/conf/files.i386
index 24dac5f..17791a6 100644
--- a/sys/conf/files.i386
+++ b/sys/conf/files.i386
@@ -80,6 +80,7 @@ hptrr_lib.o optional hptrr \
cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S optional zfs compile-with "${ZFS_S}"
compat/linprocfs/linprocfs.c optional linprocfs
compat/linsysfs/linsysfs.c optional linsysfs
+compat/linux/linux_epoll.c optional compat_linux
compat/linux/linux_emul.c optional compat_linux
compat/linux/linux_file.c optional compat_linux
compat/linux/linux_fork.c optional compat_linux
diff --git a/sys/conf/files.pc98 b/sys/conf/files.pc98
index a8e60b6..ee91501 100644
--- a/sys/conf/files.pc98
+++ b/sys/conf/files.pc98
@@ -41,6 +41,7 @@ ukbdmap.h optional ukbd_dflt_keymap \
cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S optional zfs compile-with "${ZFS_S}"
compat/linprocfs/linprocfs.c optional linprocfs
compat/linsysfs/linsysfs.c optional linsysfs
+compat/linux/linux_epoll.c optional compat_linux
compat/linux/linux_emul.c optional compat_linux
compat/linux/linux_file.c optional compat_linux
compat/linux/linux_fork.c optional compat_linux
diff --git a/sys/i386/linux/linux_dummy.c b/sys/i386/linux/linux_dummy.c
index ab77790..f8526e1 100644
--- a/sys/i386/linux/linux_dummy.c
+++ b/sys/i386/linux/linux_dummy.c
@@ -72,9 +72,6 @@ DUMMY(setfsgid);
DUMMY(pivot_root);
DUMMY(mincore);
DUMMY(lookup_dcookie);
-DUMMY(epoll_create);
-DUMMY(epoll_ctl);
-DUMMY(epoll_wait);
DUMMY(remap_file_pages);
DUMMY(fstatfs64);
DUMMY(mbind);
@@ -120,7 +117,6 @@ DUMMY(timerfd_gettime);
/* linux 2.6.27: */
DUMMY(signalfd4);
DUMMY(eventfd2);
-DUMMY(epoll_create1);
DUMMY(dup3);
DUMMY(inotify_init1);
/* linux 2.6.30: */
diff --git a/sys/i386/linux/syscalls.master b/sys/i386/linux/syscalls.master
index bb17166..1f260bd 100644
--- a/sys/i386/linux/syscalls.master
+++ b/sys/i386/linux/syscalls.master
@@ -432,9 +432,11 @@
251 AUE_NULL UNIMPL
252 AUE_EXIT STD { int linux_exit_group(int error_code); }
253 AUE_NULL STD { int linux_lookup_dcookie(void); }
-254 AUE_NULL STD { int linux_epoll_create(void); }
-255 AUE_NULL STD { int linux_epoll_ctl(void); }
-256 AUE_NULL STD { int linux_epoll_wait(void); }
+254 AUE_NULL STD { int linux_epoll_create(l_int size); }
+255 AUE_NULL STD { int linux_epoll_ctl(l_int epfd, l_int op, l_int fd, \
+ struct linux_epoll_event *event); }
+256 AUE_NULL STD { int linux_epoll_wait(l_int epfd, struct linux_epoll_event *events, \
+ l_int maxevents, l_int timeout); }
257 AUE_NULL STD { int linux_remap_file_pages(void); }
258 AUE_NULL STD { int linux_set_tid_address(int *tidptr); }
259 AUE_NULL STD { int linux_timer_create(clockid_t clock_id, \
@@ -544,7 +546,7 @@
; linux 2.6.27:
327 AUE_NULL STD { int linux_signalfd4(void); }
328 AUE_NULL STD { int linux_eventfd2(void); }
-329 AUE_NULL STD { int linux_epoll_create1(void); }
+329 AUE_NULL STD { int linux_epoll_create1(l_int flags); }
330 AUE_NULL STD { int linux_dup3(void); }
331 AUE_NULL STD { int linux_pipe2(l_int *pipefds, l_int flags); }
332 AUE_NULL STD { int linux_inotify_init1(void); }
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index 85ea78c..f4b6c19 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -107,16 +107,7 @@ static void kqueue_wakeup(struct kqueue *kq);
static struct filterops *kqueue_fo_find(int filt);
static void kqueue_fo_release(int filt);
-static fo_rdwr_t kqueue_read;
-static fo_rdwr_t kqueue_write;
-static fo_truncate_t kqueue_truncate;
-static fo_ioctl_t kqueue_ioctl;
-static fo_poll_t kqueue_poll;
-static fo_kqfilter_t kqueue_kqfilter;
-static fo_stat_t kqueue_stat;
-static fo_close_t kqueue_close;
-
-static struct fileops kqueueops = {
+struct fileops kqueueops = {
.fo_read = kqueue_read,
.fo_write = kqueue_write,
.fo_truncate = kqueue_truncate,
@@ -303,7 +294,7 @@ filt_fileattach(struct knote *kn)
}
/*ARGSUSED*/
-static int
+int
kqueue_kqfilter(struct file *fp, struct knote *kn)
{
struct kqueue *kq = kn->kn_fp->f_data;
@@ -688,34 +679,7 @@ filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
int
sys_kqueue(struct thread *td, struct kqueue_args *uap)
{
- struct filedesc *fdp;
- struct kqueue *kq;
- struct file *fp;
- int fd, error;
-
- fdp = td->td_proc->p_fd;
- error = falloc(td, &fp, &fd, 0);
- if (error)
- goto done2;
-
- /* An extra reference on `fp' has been held for us by falloc(). */
- kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
- mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
- TAILQ_INIT(&kq->kq_head);
- kq->kq_fdp = fdp;
- knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
- TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
-
- FILEDESC_XLOCK(fdp);
- TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
- FILEDESC_XUNLOCK(fdp);
-
- finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
- fdrop(fp, td);
-
- td->td_retval[0] = fd;
-done2:
- return (error);
+ return (kern_kqueue(td));
}
#ifndef _SYS_SYSPROTO_H_
@@ -817,19 +781,75 @@ kevent_copyin(void *arg, struct kevent *kevp, int count)
}
int
+kern_kqueue(struct thread *td)
+{
+	struct file *fp;
+	int error;
+
+	/*
+	 * Create the kqueue; on success the descriptor number is already in
+	 * td->td_retval[0] and `fp' carries a reference we do not need.
+	 * On failure `fp' was never written, so it must not be dropped.
+	 */
+	error = kern_kqueue_locked(td, &fp);
+	if (error == 0)
+		fdrop(fp, td);
+	return (error);
+}
+
+/*
+ * Allocate a descriptor and file for a new kqueue, initialize the kqueue
+ * and link it onto the owning descriptor table's kqueue list.
+ *
+ * On success 0 is returned, the new descriptor number is left in
+ * td->td_retval[0] and *fpp receives the file with the extra reference
+ * from falloc() still held; the caller owns that reference and must
+ * fdrop() it.  On failure the falloc() error is returned and *fpp is not
+ * written.
+ */
+int
+kern_kqueue_locked(struct thread *td, struct file **fpp)
+{
+	struct filedesc *fdescp = td->td_proc->p_fd;
+	struct kqueue *newkq;
+	struct file *kqfp;
+	int kqfd, error;
+
+	error = falloc(td, &kqfp, &kqfd, 0);
+	if (error != 0)
+		return (error);
+
+	/* falloc() left us holding an extra reference on `kqfp'. */
+	newkq = malloc(sizeof(*newkq), M_KQUEUE, M_WAITOK | M_ZERO);
+	mtx_init(&newkq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
+	TAILQ_INIT(&newkq->kq_head);
+	newkq->kq_fdp = fdescp;
+	knlist_init_mtx(&newkq->kq_sel.si_note, &newkq->kq_lock);
+	TASK_INIT(&newkq->kq_task, 0, kqueue_task, newkq);
+
+	/* The per-table kqueue list is modified under the table lock. */
+	FILEDESC_XLOCK(fdescp);
+	TAILQ_INSERT_HEAD(&fdescp->fd_kqlist, newkq, kq_list);
+	FILEDESC_XUNLOCK(fdescp);
+
+	finit(kqfp, FREAD | FWRITE, DTYPE_KQUEUE, newkq, &kqueueops);
+
+	td->td_retval[0] = kqfd;
+	*fpp = kqfp;
+	return (0);
+}
+
+int
kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
struct kevent_copyops *k_ops, const struct timespec *timeout)
{
+ struct file *fp;
+ cap_rights_t rights;
+ int error;
+
+ if ((error = fget(td, fd, cap_rights_init(&rights, CAP_POST_EVENT), &fp)) != 0) /* resolve fd to a referenced file */
+ return (error);
+
+ error = kern_kevent_locked(td, fp, nchanges, nevents, k_ops, timeout); /* the work happens on the file, not the fd */
+
+ fdrop(fp, td); /* release the reference taken by fget() above */
+ return (error);
+}
+
+int
+kern_kevent_locked(struct thread *td, struct file *fp, int nchanges, int nevents,
+ struct kevent_copyops *k_ops, const struct timespec *timeout)
+{
struct kevent keva[KQ_NEVENTS];
struct kevent *kevp, *changes;
struct kqueue *kq;
- struct file *fp;
- cap_rights_t rights;
int i, n, nerrors, error;
- error = fget(td, fd, cap_rights_init(&rights, CAP_POST_EVENT), &fp);
- if (error != 0)
- return (error);
if ((error = kqueue_acquire(fp, &kq)) != 0)
goto done_norel;
@@ -872,7 +892,6 @@ kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
done:
kqueue_release(kq, 0);
done_norel:
- fdrop(fp, td);
return (error);
}
@@ -1526,7 +1545,7 @@ done_nl:
* This could be expanded to call kqueue_scan, if desired.
*/
/*ARGSUSED*/
-static int
+int
kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
@@ -1534,7 +1553,7 @@ kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
}
/*ARGSUSED*/
-static int
+int
kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
@@ -1542,7 +1561,7 @@ kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
}
/*ARGSUSED*/
-static int
+int
kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
struct thread *td)
{
@@ -1551,7 +1570,7 @@ kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
}
/*ARGSUSED*/
-static int
+int
kqueue_ioctl(struct file *fp, u_long cmd, void *data,
struct ucred *active_cred, struct thread *td)
{
@@ -1599,7 +1618,7 @@ kqueue_ioctl(struct file *fp, u_long cmd, void *data,
}
/*ARGSUSED*/
-static int
+int
kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
struct thread *td)
{
@@ -1626,7 +1645,7 @@ kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
}
/*ARGSUSED*/
-static int
+int
kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
struct thread *td)
{
@@ -1644,7 +1663,7 @@ kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
}
/*ARGSUSED*/
-static int
+int
kqueue_close(struct file *fp, struct thread *td)
{
struct kqueue *kq = fp->f_data;
diff --git a/sys/modules/linux/Makefile b/sys/modules/linux/Makefile
index 7ed6e98..ce46aa8 100644
--- a/sys/modules/linux/Makefile
+++ b/sys/modules/linux/Makefile
@@ -9,7 +9,7 @@ CFLAGS+=-DCOMPAT_FREEBSD32 -DCOMPAT_LINUX32
KMOD= linux
SRCS= linux_fork.c linux${SFX}_dummy.c linux_emul.c linux_file.c \
- linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \
+ linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c linux_epoll.c \
linux${SFX}_machdep.c linux_mib.c linux_misc.c linux_signal.c \
linux_socket.c linux_stats.c linux_sysctl.c linux${SFX}_sysent.c \
linux${SFX}_sysvec.c linux_uid16.c linux_util.c linux_time.c \
diff --git a/sys/sys/event.h b/sys/sys/event.h
index 03bd7b9..60bced7 100644
--- a/sys/sys/event.h
+++ b/sys/sys/event.h
@@ -236,6 +236,9 @@ struct proc;
struct knlist;
struct mtx;
struct rwlock;
+struct uio;
+struct stat;
+struct ucred;
extern void knote(struct knlist *list, long hint, int lockflags);
extern void knote_fork(struct knlist *list, int pid);
@@ -261,6 +264,21 @@ extern int kqfd_register(int fd, struct kevent *kev, struct thread *p,
extern int kqueue_add_filteropts(int filt, struct filterops *filtops);
extern int kqueue_del_filteropts(int filt);
+int kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td);
+int kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+ int flags, struct thread *td);
+int kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td);
+int kqueue_ioctl(struct file *fp, u_long cmd, void *data,
+ struct ucred *active_cred, struct thread *td);
+int kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
+ struct thread *td);
+int kqueue_kqfilter(struct file *fp, struct knote *kn);
+int kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
+ struct thread *td);
+int kqueue_close(struct file *fp, struct thread *td);
+
#else /* !_KERNEL */
#include <sys/cdefs.h>
diff --git a/sys/sys/file.h b/sys/sys/file.h
index 7b373f0..b4c1ad4 100644
--- a/sys/sys/file.h
+++ b/sys/sys/file.h
@@ -169,6 +169,8 @@ struct file {
union {
struct cdev_privdata *fvn_cdevpriv;
/* (d) Private data for the cdev. */
+ void *fvn_epollpriv;
+ /* (d) Private data for the epoll. */
struct fadvise_info *fvn_advice;
} f_vnun;
/*
diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h
index 17f2b97..92dd8be 100644
--- a/sys/sys/syscallsubr.h
+++ b/sys/sys/syscallsubr.h
@@ -121,8 +121,13 @@ int kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data);
int kern_jail(struct thread *td, struct jail *j);
int kern_jail_get(struct thread *td, struct uio *options, int flags);
int kern_jail_set(struct thread *td, struct uio *options, int flags);
+int kern_kqueue(struct thread *td);
+int kern_kqueue_locked(struct thread *td, struct file **fpp);
int kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
struct kevent_copyops *k_ops, const struct timespec *timeout);
+int kern_kevent_locked(struct thread *td, struct file *fp, int nchanges,
+ int nevents,
+ struct kevent_copyops *k_ops, const struct timespec *timeout);
int kern_kldload(struct thread *td, const char *file, int *fileid);
int kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat);
int kern_kldunload(struct thread *td, int fileid, int flags);
@@ -248,6 +253,8 @@ int kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
struct timeval *tptr, enum uio_seg tptrseg);
int kern_utimesat(struct thread *td, int fd, char *path,
enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg);
+int kern_utimensat(struct thread *td, int fd, char *path,
+ enum uio_seg pathseg, struct timespec *tptr, enum uio_seg tptrseg);
int kern_wait(struct thread *td, pid_t pid, int *status, int options,
struct rusage *rup);
int kern_wait6(struct thread *td, enum idtype idtype, id_t id, int *status,
OpenPOWER on IntegriCloud