author     jmg <jmg@FreeBSD.org>  2004-08-15 06:24:42 +0000
committer  jmg <jmg@FreeBSD.org>  2004-08-15 06:24:42 +0000
commit     bc1805c6e871c178d0b6516c3baa774ffd77224a (patch)
tree       1172b68a7c9e7fa73090ae134eb98825bdab8ad6
parent     57da12d01618c3ef2590eab07e71f69d16ead13a (diff)

Add locking to the kqueue subsystem. This also makes the kqueue subsystem
a more complete subsystem and removes the knowledge of how things are
implemented from the drivers. Include locking around filter ops, so a
module like aio can refuse to be unloaded while there are outstanding
knotes using its filter ops.

Currently, it uses MTX_DUPOK even though it is not always safe to acquire
duplicate locks. Witness currently doesn't support discovering whether a
duplicate lock is ok (in some cases).

Reviewed by: green, rwatson (both earlier versions)
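
From a driver's point of view the change is mostly mechanical: open-coded SLIST
manipulation of si_note under a private lock becomes the knlist API, and KNOTE()
splits into KNOTE_LOCKED()/KNOTE_UNLOCKED() depending on whether the caller
already holds the lock backing the knlist. The sketch below (the foo_* names and
fields are illustrative, not part of this commit) mirrors the pattern the
scsi_target.c and pipe hunks in the diff follow; the trailing integer argument
to knlist_add()/knlist_remove() says whether that backing lock is already held.

    /*
     * Hypothetical character-device driver sketch (foo_* is not part of this
     * commit) showing the knlist conversion pattern used in the hunks below.
     */
    #include <sys/param.h>
    #include <sys/conf.h>
    #include <sys/event.h>
    #include <sys/lock.h>
    #include <sys/mutex.h>
    #include <sys/selinfo.h>

    struct foo_softc {
    	struct mtx	mtx;		/* protects driver state */
    	struct selinfo	sel;		/* sel.si_note is the knlist */
    	int		data_ready;
    };

    static int	foo_event(struct knote *kn, long hint);
    static void	foo_filtdetach(struct knote *kn);
    static struct filterops foo_read_filtops =
    	{ 1, NULL, foo_filtdetach, foo_event };

    static int
    foo_open(struct cdev *dev, int flags, int fmt, struct thread *td)
    {
    	struct foo_softc *sc = dev->si_drv1;

    	/* Tie the knote list to the lock that protects the driver state. */
    	knlist_init(&sc->sel.si_note, &sc->mtx);
    	return (0);
    }

    static int
    foo_kqfilter(struct cdev *dev, struct knote *kn)
    {
    	struct foo_softc *sc = dev->si_drv1;

    	kn->kn_hook = (caddr_t)sc;
    	kn->kn_fop = &foo_read_filtops;
    	/* knlist_add() takes the list lock itself; no open-coded SLIST_INSERT_HEAD. */
    	knlist_add(&sc->sel.si_note, kn, 0);
    	return (0);
    }

    static void
    foo_filtdetach(struct knote *kn)
    {
    	struct foo_softc *sc = (struct foo_softc *)kn->kn_hook;

    	knlist_remove(&sc->sel.si_note, kn, 0);
    }

    static int
    foo_event(struct knote *kn, long hint)
    {
    	struct foo_softc *sc = (struct foo_softc *)kn->kn_hook;

    	/* Runs with the knlist lock (or the kq lock) held; no driver locking. */
    	return (sc->data_ready);
    }

    static void
    foo_notify(struct foo_softc *sc)
    {
    	mtx_assert(&sc->mtx, MA_OWNED);
    	/* knlist lock is held here, so KNOTE_LOCKED; lockless callers use KNOTE_UNLOCKED. */
    	KNOTE_LOCKED(&sc->sel.si_note, 0);
    }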
-rw-r--r--sys/cam/scsi/scsi_target.c26
-rw-r--r--sys/dev/mii/mii.c2
-rw-r--r--sys/fs/fifofs/fifo_vnops.c10
-rw-r--r--sys/gnu/ext2fs/ext2_vnops.c13
-rw-r--r--sys/gnu/fs/ext2fs/ext2_vnops.c13
-rw-r--r--sys/kern/init_main.c1
-rw-r--r--sys/kern/kern_conf.c8
-rw-r--r--sys/kern/kern_descrip.c27
-rw-r--r--sys/kern/kern_event.c1284
-rw-r--r--sys/kern/kern_exec.c2
-rw-r--r--sys/kern/kern_exit.c13
-rw-r--r--sys/kern/kern_fork.c3
-rw-r--r--sys/kern/kern_sig.c10
-rw-r--r--sys/kern/sys_pipe.c17
-rw-r--r--sys/kern/tty.c20
-rw-r--r--sys/kern/tty_cons.c4
-rw-r--r--sys/kern/uipc_sockbuf.c4
-rw-r--r--sys/kern/uipc_socket.c16
-rw-r--r--sys/kern/uipc_socket2.c4
-rw-r--r--sys/kern/uipc_syscalls.c2
-rw-r--r--sys/kern/vfs_aio.c25
-rw-r--r--sys/kern/vfs_subr.c21
-rw-r--r--sys/kern/vfs_vnops.c7
-rw-r--r--sys/net/bpf.c12
-rw-r--r--sys/net/if.c42
-rw-r--r--sys/net/if_var.h2
-rw-r--r--sys/net/if_vlan.c2
-rw-r--r--sys/sys/event.h58
-rw-r--r--sys/sys/eventvar.h23
-rw-r--r--sys/sys/filedesc.h6
-rw-r--r--sys/sys/proc.h2
-rw-r--r--sys/sys/selinfo.h2
-rw-r--r--sys/sys/vnode.h6
-rw-r--r--sys/ufs/ffs/ffs_vnops.c2
-rw-r--r--sys/ufs/ufs/ufs_acl.c2
-rw-r--r--sys/ufs/ufs/ufs_vnops.c43
36 files changed, 1205 insertions, 529 deletions
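
The filter-ops locking mentioned in the commit message turns the flat
sysfilt_ops[] table into refcounted entries protected by filterops_lock, so
kqueue_del_filteropts() can return EBUSY while any knote still references a
filter. A module such as aio would hook this roughly as follows (EVFILT_FOO,
foo_filtops, and foo_modevent are illustrative placeholders, not from this
commit):

    /* Hypothetical module event handler; assumes a filter slot EVFILT_FOO exists. */
    #include <sys/param.h>
    #include <sys/kernel.h>
    #include <sys/module.h>
    #include <sys/event.h>

    static int	foo_attach(struct knote *kn);
    static void	foo_detach(struct knote *kn);
    static int	foo_filter(struct knote *kn, long hint);

    static struct filterops foo_filtops =
    	{ 0, foo_attach, foo_detach, foo_filter };

    static int
    foo_modevent(module_t mod, int type, void *data)
    {
    	int error = 0;

    	switch (type) {
    	case MOD_LOAD:
    		error = kqueue_add_filteropts(EVFILT_FOO, &foo_filtops);
    		break;
    	case MOD_UNLOAD:
    		/*
    		 * With the new refcounting, this fails with EBUSY while any
    		 * knote still holds a reference to foo_filtops, keeping the
    		 * module from being unloaded under an active filter.
    		 */
    		error = kqueue_del_filteropts(EVFILT_FOO);
    		break;
    	default:
    		error = EOPNOTSUPP;
    		break;
    	}
    	return (error);
    }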
diff --git a/sys/cam/scsi/scsi_target.c b/sys/cam/scsi/scsi_target.c
index 198a70d..a3e3381 100644
--- a/sys/cam/scsi/scsi_target.c
+++ b/sys/cam/scsi/scsi_target.c
@@ -196,6 +196,7 @@ targopen(struct cdev *dev, int flags, int fmt, struct thread *td)
TAILQ_INIT(&softc->work_queue);
TAILQ_INIT(&softc->abort_queue);
TAILQ_INIT(&softc->user_ccb_queue);
+ knlist_init(&softc->read_select.si_note, &softc->mtx);
return (0);
}
@@ -336,9 +337,7 @@ targkqfilter(struct cdev *dev, struct knote *kn)
softc = (struct targ_softc *)dev->si_drv1;
kn->kn_hook = (caddr_t)softc;
kn->kn_fop = &targread_filtops;
- TARG_LOCK(softc);
- SLIST_INSERT_HEAD(&softc->read_select.si_note, kn, kn_selnext);
- TARG_UNLOCK(softc);
+ knlist_add(&softc->read_select.si_note, kn, 0);
return (0);
}
@@ -348,9 +347,7 @@ targreadfiltdetach(struct knote *kn)
struct targ_softc *softc;
softc = (struct targ_softc *)kn->kn_hook;
- TARG_LOCK(softc);
- SLIST_REMOVE(&softc->read_select.si_note, kn, knote, kn_selnext);
- TARG_UNLOCK(softc);
+ knlist_remove(&softc->read_select.si_note, kn, 0);
}
/* Notify the user's kqueue when the user queue or abort queue gets a CCB */
@@ -361,10 +358,8 @@ targreadfilt(struct knote *kn, long hint)
int retval;
softc = (struct targ_softc *)kn->kn_hook;
- TARG_LOCK(softc);
retval = !TAILQ_EMPTY(&softc->user_ccb_queue) ||
!TAILQ_EMPTY(&softc->abort_queue);
- TARG_UNLOCK(softc);
return (retval);
}
@@ -1096,19 +1091,8 @@ abort_all_pending(struct targ_softc *softc)
/* If we aborted anything from the work queue, wakeup user. */
if (!TAILQ_EMPTY(&softc->user_ccb_queue)
- || !TAILQ_EMPTY(&softc->abort_queue)) {
- /*
- * XXX KNOTE calls back into targreadfilt, causing a
- * lock recursion. So unlock around calls to it although
- * this may open up a race allowing a user to submit
- * another CCB after we have aborted all pending ones
- * A better approach is to mark the softc as dying
- * under lock and check for this in targstart().
- */
- TARG_UNLOCK(softc);
+ || !TAILQ_EMPTY(&softc->abort_queue))
notify_user(softc);
- TARG_LOCK(softc);
- }
}
/* Notify the user that data is ready */
@@ -1120,7 +1104,7 @@ notify_user(struct targ_softc *softc)
* blocking read().
*/
selwakeuppri(&softc->read_select, PRIBIO);
- KNOTE(&softc->read_select.si_note, 0);
+ KNOTE_LOCKED(&softc->read_select.si_note, 0);
wakeup(&softc->user_ccb_queue);
}
diff --git a/sys/dev/mii/mii.c b/sys/dev/mii/mii.c
index 111f3e7..e3d782b 100644
--- a/sys/dev/mii/mii.c
+++ b/sys/dev/mii/mii.c
@@ -265,7 +265,7 @@ miibus_linkchg(dev)
if (ifp->if_link_state != link_state) {
ifp->if_link_state = link_state;
rt_ifmsg(ifp);
- KNOTE(&ifp->if_klist, link);
+ KNOTE_UNLOCKED(&ifp->if_klist, link);
if (ifp->if_nvlans != 0)
(*vlan_link_state_p)(ifp, link);
}
diff --git a/sys/fs/fifofs/fifo_vnops.c b/sys/fs/fifofs/fifo_vnops.c
index ec7bda9..f21f2b6 100644
--- a/sys/fs/fifofs/fifo_vnops.c
+++ b/sys/fs/fifofs/fifo_vnops.c
@@ -432,7 +432,7 @@ fifo_kqfilter(ap)
ap->a_kn->kn_hook = (caddr_t)so;
SOCKBUF_LOCK(sb);
- SLIST_INSERT_HEAD(&sb->sb_sel.si_note, ap->a_kn, kn_selnext);
+ knlist_add(&sb->sb_sel.si_note, ap->a_kn, 1);
sb->sb_flags |= SB_KNOTE;
SOCKBUF_UNLOCK(sb);
@@ -445,8 +445,8 @@ filt_fifordetach(struct knote *kn)
struct socket *so = (struct socket *)kn->kn_hook;
SOCKBUF_LOCK(&so->so_rcv);
- SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
- if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
+ knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
+ if (knlist_empty(&so->so_rcv.sb_sel.si_note))
so->so_rcv.sb_flags &= ~SB_KNOTE;
SOCKBUF_UNLOCK(&so->so_rcv);
}
@@ -479,8 +479,8 @@ filt_fifowdetach(struct knote *kn)
struct socket *so = (struct socket *)kn->kn_hook;
SOCKBUF_LOCK(&so->so_snd);
- SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
- if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
+ knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
+ if (knlist_empty(&so->so_snd.sb_sel.si_note))
so->so_snd.sb_flags &= ~SB_KNOTE;
SOCKBUF_UNLOCK(&so->so_snd);
}
diff --git a/sys/gnu/ext2fs/ext2_vnops.c b/sys/gnu/ext2fs/ext2_vnops.c
index 3964632..cf04778 100644
--- a/sys/gnu/ext2fs/ext2_vnops.c
+++ b/sys/gnu/ext2fs/ext2_vnops.c
@@ -570,7 +570,7 @@ ext2_setattr(ap)
return (EROFS);
error = ext2_chmod(vp, (int)vap->va_mode, cred, td);
}
- VN_KNOTE(vp, NOTE_ATTRIB);
+ VN_KNOTE_UNLOCKED(vp, NOTE_ATTRIB);
return (error);
}
@@ -1894,9 +1894,9 @@ ext2_kqfilter(ap)
if (vp->v_pollinfo == NULL)
v_addpollinfo(vp);
- mtx_lock(&vp->v_pollinfo->vpi_lock);
- SLIST_INSERT_HEAD(&vp->v_pollinfo->vpi_selinfo.si_note, kn, kn_selnext);
- mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ if (vp->v_pollinfo == NULL)
+ return ENOMEM;
+ knlist_add(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
return (0);
}
@@ -1907,10 +1907,7 @@ filt_ext2detach(struct knote *kn)
struct vnode *vp = (struct vnode *)kn->kn_hook;
KASSERT(vp->v_pollinfo != NULL, ("Mising v_pollinfo"));
- mtx_lock(&vp->v_pollinfo->vpi_lock);
- SLIST_REMOVE(&vp->v_pollinfo->vpi_selinfo.si_note,
- kn, knote, kn_selnext);
- mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
}
/*ARGSUSED*/
diff --git a/sys/gnu/fs/ext2fs/ext2_vnops.c b/sys/gnu/fs/ext2fs/ext2_vnops.c
index 3964632..cf04778 100644
--- a/sys/gnu/fs/ext2fs/ext2_vnops.c
+++ b/sys/gnu/fs/ext2fs/ext2_vnops.c
@@ -570,7 +570,7 @@ ext2_setattr(ap)
return (EROFS);
error = ext2_chmod(vp, (int)vap->va_mode, cred, td);
}
- VN_KNOTE(vp, NOTE_ATTRIB);
+ VN_KNOTE_UNLOCKED(vp, NOTE_ATTRIB);
return (error);
}
@@ -1894,9 +1894,9 @@ ext2_kqfilter(ap)
if (vp->v_pollinfo == NULL)
v_addpollinfo(vp);
- mtx_lock(&vp->v_pollinfo->vpi_lock);
- SLIST_INSERT_HEAD(&vp->v_pollinfo->vpi_selinfo.si_note, kn, kn_selnext);
- mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ if (vp->v_pollinfo == NULL)
+ return ENOMEM;
+ knlist_add(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
return (0);
}
@@ -1907,10 +1907,7 @@ filt_ext2detach(struct knote *kn)
struct vnode *vp = (struct vnode *)kn->kn_hook;
KASSERT(vp->v_pollinfo != NULL, ("Mising v_pollinfo"));
- mtx_lock(&vp->v_pollinfo->vpi_lock);
- SLIST_REMOVE(&vp->v_pollinfo->vpi_selinfo.si_note,
- kn, knote, kn_selnext);
- mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
}
/*ARGSUSED*/
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index 34b38c2..e6363e1 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -381,6 +381,7 @@ proc0_init(void *dummy __unused)
p->p_flag = P_SYSTEM;
p->p_sflag = PS_INMEM;
p->p_state = PRS_NORMAL;
+ knlist_init(&p->p_klist, &p->p_mtx);
p->p_nice = NZERO;
td->td_state = TDS_RUNNING;
kg->kg_pri_class = PRI_TIMESHARE;
diff --git a/sys/kern/kern_conf.c b/sys/kern/kern_conf.c
index e16adee..60004fa 100644
--- a/sys/kern/kern_conf.c
+++ b/sys/kern/kern_conf.c
@@ -198,13 +198,7 @@ static struct cdevsw dead_cdevsw = {
#define no_write (d_write_t *)enodev
#define no_ioctl (d_ioctl_t *)enodev
#define no_mmap (d_mmap_t *)enodev
-
-static int
-no_kqfilter(struct cdev *dev __unused, struct knote *kn __unused)
-{
-
- return (1);
-}
+#define no_kqfilter (d_kqfilter_t *)enodev
static void
no_strategy(struct bio *bp)
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index ea0dee1..5104999 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -985,12 +985,12 @@ close(td, uap)
/*
* we now hold the fp reference that used to be owned by the descriptor
* array.
+ * We have to unlock the FILEDESC *AFTER* knote_fdclose to prevent a
+ * race of the fd getting opened, a knote added, and deleteing a knote
+ * for the new fd.
*/
- if (fd < fdp->fd_knlistsize) {
- FILEDESC_UNLOCK(fdp);
- knote_fdclose(td, fd);
- } else
- FILEDESC_UNLOCK(fdp);
+ knote_fdclose(td, fd);
+ FILEDESC_UNLOCK(fdp);
error = closef(fp, td);
mtx_unlock(&Giant);
@@ -1424,7 +1424,6 @@ fdinit(fdp)
newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
newfdp->fd_fd.fd_nfiles = NDFILE;
- newfdp->fd_fd.fd_knlistsize = -1;
newfdp->fd_fd.fd_map = newfdp->fd_dmap;
return (&newfdp->fd_fd);
}
@@ -1624,10 +1623,6 @@ fdfree(td)
vrele(fdp->fd_rdir);
if (fdp->fd_jdir)
vrele(fdp->fd_jdir);
- if (fdp->fd_knlist)
- FREE(fdp->fd_knlist, M_KQUEUE);
- if (fdp->fd_knhash)
- FREE(fdp->fd_knhash, M_KQUEUE);
mtx_destroy(&fdp->fd_mtx);
FREE(fdp, M_FILEDESC);
}
@@ -1681,11 +1676,7 @@ setugidsafety(td)
if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
struct file *fp;
- if (i < fdp->fd_knlistsize) {
- FILEDESC_UNLOCK(fdp);
- knote_fdclose(td, i);
- FILEDESC_LOCK(fdp);
- }
+ knote_fdclose(td, i);
/*
* NULL-out descriptor prior to close to avoid
* a race while close blocks.
@@ -1728,11 +1719,7 @@ fdcloseexec(td)
(fdp->fd_ofileflags[i] & UF_EXCLOSE)) {
struct file *fp;
- if (i < fdp->fd_knlistsize) {
- FILEDESC_UNLOCK(fdp);
- knote_fdclose(td, i);
- FILEDESC_LOCK(fdp);
- }
+ knote_fdclose(td, i);
/*
* NULL-out descriptor prior to close to avoid
* a race while close blocks.
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index 771b3cd..d7949a7 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -1,5 +1,6 @@
/*-
* Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
+ * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -39,6 +40,7 @@ __FBSDID("$FreeBSD$");
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
+#include <sys/kthread.h>
#include <sys/selinfo.h>
#include <sys/queue.h>
#include <sys/event.h>
@@ -52,16 +54,42 @@ __FBSDID("$FreeBSD$");
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
+#include <sys/taskqueue.h>
#include <sys/uio.h>
#include <vm/uma.h>
MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
-
-static int kqueue_scan(struct file *fp, int maxevents,
+/*
+ * This lock is used if multiple kq locks are required. This possibly
+ * should be made into a per proc lock.
+ */
+static struct mtx kq_global;
+MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
+#define KQ_GLOBAL_LOCK(lck, haslck) do { \
+ if (!haslck) \
+ mtx_lock(lck); \
+ haslck = 1; \
+} while (0)
+#define KQ_GLOBAL_UNLOCK(lck, haslck) do { \
+ if (haslck) \
+ mtx_unlock(lck); \
+ haslck = 0; \
+} while (0)
+
+TASKQUEUE_DEFINE_THREAD(kqueue);
+
+static int kqueue_aquire(struct file *fp, struct kqueue **kqp);
+static void kqueue_release(struct kqueue *kq, int locked);
+static int kqueue_expand(struct kqueue *kq, struct filterops *fops,
+ uintptr_t ident, int waitok);
+static void kqueue_task(void *arg, int pending);
+static int kqueue_scan(struct kqueue *kq, int maxevents,
struct kevent *ulistp, const struct timespec *timeout,
- struct thread *td);
+ struct kevent *keva, struct thread *td);
static void kqueue_wakeup(struct kqueue *kq);
+static struct filterops *kqueue_fo_find(int filt);
+static void kqueue_fo_release(int filt);
static fo_rdwr_t kqueue_read;
static fo_rdwr_t kqueue_write;
@@ -81,12 +109,12 @@ static struct fileops kqueueops = {
.fo_close = kqueue_close,
};
-static void knote_attach(struct knote *kn, struct filedesc *fdp);
+static int knote_attach(struct knote *kn, struct kqueue *kq);
static void knote_drop(struct knote *kn, struct thread *td);
static void knote_enqueue(struct knote *kn);
static void knote_dequeue(struct knote *kn);
static void knote_init(void);
-static struct knote *knote_alloc(void);
+static struct knote *knote_alloc(int waitok);
static void knote_free(struct knote *kn);
static void filt_kqdetach(struct knote *kn);
@@ -104,6 +132,7 @@ static struct filterops file_filtops =
{ 1, filt_fileattach, NULL, NULL };
static struct filterops kqread_filtops =
{ 1, NULL, filt_kqdetach, filt_kqueue };
+/* XXX - move to kern_proc.c? */
static struct filterops proc_filtops =
{ 0, filt_procattach, filt_procdetach, filt_proc };
static struct filterops timer_filtops =
@@ -115,11 +144,48 @@ static int kq_calloutmax = (4 * 1024);
SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
&kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
-#define KNOTE_ACTIVATE(kn) do { \
- kn->kn_status |= KN_ACTIVE; \
- if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \
- knote_enqueue(kn); \
+/* XXX - ensure not KN_INFLUX?? */
+#define KNOTE_ACTIVATE(kn, islock) do { \
+ if ((islock)) \
+ mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \
+ else \
+ KQ_LOCK((kn)->kn_kq); \
+ (kn)->kn_status |= KN_ACTIVE; \
+ if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \
+ knote_enqueue((kn)); \
+ if (!(islock)) \
+ KQ_UNLOCK((kn)->kn_kq); \
} while(0)
+#define KQ_LOCK(kq) do { \
+ mtx_lock(&(kq)->kq_lock); \
+} while (0)
+#define KQ_FLUX_WAKEUP(kq) do { \
+ if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \
+ (kq)->kq_state &= ~KQ_FLUXWAIT; \
+ wakeup((kq)); \
+ } \
+} while (0)
+#define KQ_UNLOCK_FLUX(kq) do { \
+ KQ_FLUX_WAKEUP(kq); \
+ mtx_unlock(&(kq)->kq_lock); \
+} while (0)
+#define KQ_UNLOCK(kq) do { \
+ mtx_unlock(&(kq)->kq_lock); \
+} while (0)
+#define KQ_OWNED(kq) do { \
+ mtx_assert(&(kq)->kq_lock, MA_OWNED); \
+} while (0)
+#define KQ_NOTOWNED(kq) do { \
+ mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \
+} while (0)
+#define KN_LIST_LOCK(kn) do { \
+ if (kn->kn_knlist != NULL) \
+ mtx_lock(kn->kn_knlist->kl_lock); \
+} while (0)
+#define KN_LIST_UNLOCK(kn) do { \
+ if (kn->kn_knlist != NULL) \
+ mtx_unlock(kn->kn_knlist->kl_lock); \
+} while (0)
#define KN_HASHSIZE 64 /* XXX should be tunable */
#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
@@ -134,24 +200,35 @@ filt_nullattach(struct knote *kn)
struct filterops null_filtops =
{ 0, filt_nullattach, NULL, NULL };
+/* XXX - make SYSINIT to add these, and move into respective modules. */
extern struct filterops sig_filtops;
extern struct filterops fs_filtops;
/*
* Table for for all system-defined filters.
*/
-static struct filterops *sysfilt_ops[] = {
- &file_filtops, /* EVFILT_READ */
- &file_filtops, /* EVFILT_WRITE */
- &null_filtops, /* EVFILT_AIO */
- &file_filtops, /* EVFILT_VNODE */
- &proc_filtops, /* EVFILT_PROC */
- &sig_filtops, /* EVFILT_SIGNAL */
- &timer_filtops, /* EVFILT_TIMER */
- &file_filtops, /* EVFILT_NETDEV */
- &fs_filtops, /* EVFILT_FS */
+static struct mtx filterops_lock;
+MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
+ MTX_DEF);
+static struct {
+ struct filterops *for_fop;
+ int for_refcnt;
+} sysfilt_ops[EVFILT_SYSCOUNT] = {
+ { &file_filtops }, /* EVFILT_READ */
+ { &file_filtops }, /* EVFILT_WRITE */
+ { &null_filtops }, /* EVFILT_AIO */
+ { &file_filtops }, /* EVFILT_VNODE */
+ { &proc_filtops }, /* EVFILT_PROC */
+ { &sig_filtops }, /* EVFILT_SIGNAL */
+ { &timer_filtops }, /* EVFILT_TIMER */
+ { &file_filtops }, /* EVFILT_NETDEV */
+ { &fs_filtops }, /* EVFILT_FS */
};
+/*
+ * Simple redirection for all cdevsw style objects to call their fo_kqfilter
+ * method.
+ */
static int
filt_fileattach(struct knote *kn)
{
@@ -166,10 +243,12 @@ kqueue_kqfilter(struct file *fp, struct knote *kn)
struct kqueue *kq = kn->kn_fp->f_data;
if (kn->kn_filter != EVFILT_READ)
- return (1);
+ return (EINVAL);
+ kn->kn_status |= KN_KQUEUE;
kn->kn_fop = &kqread_filtops;
- SLIST_INSERT_HEAD(&kq->kq_sel.si_note, kn, kn_selnext);
+ knlist_add(&kq->kq_sel.si_note, kn, 0);
+
return (0);
}
@@ -178,7 +257,7 @@ filt_kqdetach(struct knote *kn)
{
struct kqueue *kq = kn->kn_fp->f_data;
- SLIST_REMOVE(&kq->kq_sel.si_note, kn, knote, kn_selnext);
+ knlist_remove(&kq->kq_sel.si_note, kn, 0);
}
/*ARGSUSED*/
@@ -191,6 +270,7 @@ filt_kqueue(struct knote *kn, long hint)
return (kn->kn_data > 0);
}
+/* XXX - move to kern_proc.c? */
static int
filt_procattach(struct knote *kn)
{
@@ -203,13 +283,14 @@ filt_procattach(struct knote *kn)
if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
p = zpfind(kn->kn_id);
immediate = 1;
+ } else if (p != NULL && (p->p_flag & P_WEXIT)) {
+ immediate = 1;
}
+
if (p == NULL)
return (ESRCH);
- if ((error = p_cansee(curthread, p))) {
- PROC_UNLOCK(p);
+ if ((error = p_cansee(curthread, p)))
return (error);
- }
kn->kn_ptr.p_proc = p;
kn->kn_flags |= EV_CLEAR; /* automatically set */
@@ -224,7 +305,7 @@ filt_procattach(struct knote *kn)
}
if (immediate == 0)
- SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
+ knlist_add(&p->p_klist, kn, 1);
/*
* Immediately activate any exit notes if the target process is a
@@ -232,7 +313,7 @@ filt_procattach(struct knote *kn)
* process, e.g. a child, dies before the kevent is registered.
*/
if (immediate && filt_proc(kn, NOTE_EXIT))
- KNOTE_ACTIVATE(kn);
+ KNOTE_ACTIVATE(kn, 0);
PROC_UNLOCK(p);
@@ -247,22 +328,25 @@ filt_procattach(struct knote *kn)
* this routine is called, so a check is needed to avoid actually performing
* a detach, because the original process does not exist any more.
*/
+/* XXX - move to kern_proc.c? */
static void
filt_procdetach(struct knote *kn)
{
- struct proc *p = kn->kn_ptr.p_proc;
+ struct proc *p;
if (kn->kn_status & KN_DETACHED)
return;
- PROC_LOCK(p);
- SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
- PROC_UNLOCK(p);
+ p = kn->kn_ptr.p_proc;
+ knlist_remove(&p->p_klist, kn, 0);
+ kn->kn_ptr.p_proc = NULL;
}
+/* XXX - move to kern_proc.c? */
static int
filt_proc(struct knote *kn, long hint)
{
+ struct proc *p = kn->kn_ptr.p_proc;
u_int event;
/*
@@ -280,8 +364,10 @@ filt_proc(struct knote *kn, long hint)
* process is gone, so flag the event as finished.
*/
if (event == NOTE_EXIT) {
- kn->kn_status |= KN_DETACHED;
+ if (!(kn->kn_status & KN_DETACHED))
+ knlist_remove_inevent(&p->p_klist, kn);
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+ kn->kn_ptr.p_proc = NULL;
return (1);
}
@@ -303,7 +389,7 @@ filt_proc(struct knote *kn, long hint)
kev.fflags = kn->kn_sfflags;
kev.data = kn->kn_id; /* parent */
kev.udata = kn->kn_kevent.udata; /* preserve udata */
- error = kqueue_register(kn->kn_kq, &kev, NULL);
+ error = kqueue_register(kn->kn_kq, &kev, NULL, 0);
if (error)
kn->kn_fflags |= NOTE_TRACKERR;
}
@@ -311,54 +397,64 @@ filt_proc(struct knote *kn, long hint)
return (kn->kn_fflags != 0);
}
+static int
+timertoticks(intptr_t data)
+{
+ struct timeval tv;
+ int tticks;
+
+ tv.tv_sec = data / 1000;
+ tv.tv_usec = (data % 1000) * 1000;
+ tticks = tvtohz(&tv);
+
+ return tticks;
+}
+
+/* XXX - move to kern_timeout.c? */
static void
filt_timerexpire(void *knx)
{
struct knote *kn = knx;
struct callout *calloutp;
- struct timeval tv;
- int tticks;
kn->kn_data++;
- KNOTE_ACTIVATE(kn);
+ KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */
- if ((kn->kn_flags & EV_ONESHOT) == 0) {
- tv.tv_sec = kn->kn_sdata / 1000;
- tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
- tticks = tvtohz(&tv);
+ if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
calloutp = (struct callout *)kn->kn_hook;
- callout_reset(calloutp, tticks, filt_timerexpire, kn);
+ callout_reset(calloutp, timertoticks(kn->kn_sdata),
+ filt_timerexpire, kn);
}
}
/*
* data contains amount of time to sleep, in milliseconds
*/
+/* XXX - move to kern_timeout.c? */
static int
filt_timerattach(struct knote *kn)
{
struct callout *calloutp;
- struct timeval tv;
- int tticks;
- if (kq_ncallouts >= kq_calloutmax)
- return (ENOMEM);
- kq_ncallouts++;
+ atomic_add_int(&kq_ncallouts, 1);
- tv.tv_sec = kn->kn_sdata / 1000;
- tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
- tticks = tvtohz(&tv);
+ if (kq_ncallouts >= kq_calloutmax) {
+ atomic_add_int(&kq_ncallouts, -1);
+ return (ENOMEM);
+ }
kn->kn_flags |= EV_CLEAR; /* automatically set */
MALLOC(calloutp, struct callout *, sizeof(*calloutp),
M_KQUEUE, M_WAITOK);
- callout_init(calloutp, 0);
+ callout_init(calloutp, 1);
kn->kn_hook = calloutp;
- callout_reset(calloutp, tticks, filt_timerexpire, kn);
+ callout_reset(calloutp, timertoticks(kn->kn_sdata), filt_timerexpire,
+ kn);
return (0);
}
+/* XXX - move to kern_timeout.c? */
static void
filt_timerdetach(struct knote *kn)
{
@@ -367,9 +463,10 @@ filt_timerdetach(struct knote *kn)
calloutp = (struct callout *)kn->kn_hook;
callout_drain(calloutp);
FREE(calloutp, M_KQUEUE);
- kq_ncallouts--;
+ atomic_add_int(&kq_ncallouts, -1);
}
+/* XXX - move to kern_timeout.c? */
static int
filt_timer(struct knote *kn, long hint)
{
@@ -388,14 +485,23 @@ kqueue(struct thread *td, struct kqueue_args *uap)
struct file *fp;
int fd, error;
- mtx_lock(&Giant);
fdp = td->td_proc->p_fd;
error = falloc(td, &fp, &fd);
if (error)
goto done2;
+
/* An extra reference on `nfp' has been held for us by falloc(). */
- kq = malloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO);
+ kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
+ mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
TAILQ_INIT(&kq->kq_head);
+ kq->kq_fdp = fdp;
+ knlist_init(&kq->kq_sel.si_note, &kq->kq_lock);
+ TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
+
+ FILEDESC_LOCK(fdp);
+ SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
+ FILEDESC_UNLOCK(fdp);
+
FILE_LOCK(fp);
fp->f_flag = FREAD | FWRITE;
fp->f_type = DTYPE_KQUEUE;
@@ -403,14 +509,9 @@ kqueue(struct thread *td, struct kqueue_args *uap)
fp->f_data = kq;
FILE_UNLOCK(fp);
fdrop(fp, td);
- FILEDESC_LOCK(fdp);
+
td->td_retval[0] = fd;
- if (fdp->fd_knlistsize < 0)
- fdp->fd_knlistsize = 0; /* this process has a kq */
- FILEDESC_UNLOCK(fdp);
- kq->kq_fdp = fdp;
done2:
- mtx_unlock(&Giant);
return (error);
}
@@ -430,6 +531,7 @@ struct kevent_args {
int
kevent(struct thread *td, struct kevent_args *uap)
{
+ struct kevent keva[KQ_NEVENTS];
struct kevent *kevp;
struct kqueue *kq;
struct file *fp;
@@ -438,31 +540,28 @@ kevent(struct thread *td, struct kevent_args *uap)
if ((error = fget(td, uap->fd, &fp)) != 0)
return (error);
- if (fp->f_type != DTYPE_KQUEUE) {
- fdrop(fp, td);
- return (EBADF);
- }
+ if ((error = kqueue_aquire(fp, &kq)) != 0)
+ goto done_norel;
+
if (uap->timeout != NULL) {
error = copyin(uap->timeout, &ts, sizeof(ts));
if (error)
- goto done_nogiant;
+ goto done;
uap->timeout = &ts;
}
- mtx_lock(&Giant);
- kq = fp->f_data;
nerrors = 0;
while (uap->nchanges > 0) {
n = uap->nchanges > KQ_NEVENTS ? KQ_NEVENTS : uap->nchanges;
- error = copyin(uap->changelist, kq->kq_kev,
- n * sizeof(struct kevent));
+ error = copyin(uap->changelist, keva,
+ n * sizeof *keva);
if (error)
goto done;
for (i = 0; i < n; i++) {
- kevp = &kq->kq_kev[i];
+ kevp = &keva[i];
kevp->flags &= ~EV_SYSFLAGS;
- error = kqueue_register(kq, kevp, td);
+ error = kqueue_register(kq, kevp, td, 1);
if (error) {
if (uap->nevents != 0) {
kevp->flags = EV_ERROR;
@@ -482,15 +581,16 @@ kevent(struct thread *td, struct kevent_args *uap)
uap->changelist += n;
}
if (nerrors) {
- td->td_retval[0] = nerrors;
+ td->td_retval[0] = nerrors;
error = 0;
goto done;
}
- error = kqueue_scan(fp, uap->nevents, uap->eventlist, uap->timeout, td);
+ error = kqueue_scan(kq, uap->nevents, uap->eventlist, uap->timeout,
+ keva, td);
done:
- mtx_unlock(&Giant);
-done_nogiant:
+ kqueue_release(kq, 0);
+done_norel:
if (fp != NULL)
fdrop(fp, td);
return (error);
@@ -499,88 +599,192 @@ done_nogiant:
int
kqueue_add_filteropts(int filt, struct filterops *filtops)
{
+ int error;
+
+ if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
+ printf(
+"trying to add a filterop that is out of range: %d is beyond %d\n",
+ ~filt, EVFILT_SYSCOUNT);
+ return EINVAL;
+ }
+ mtx_lock(&filterops_lock);
+ if (sysfilt_ops[~filt].for_fop != &null_filtops &&
+ sysfilt_ops[~filt].for_fop != NULL)
+ error = EEXIST;
+ else {
+ sysfilt_ops[~filt].for_fop = filtops;
+ sysfilt_ops[~filt].for_refcnt = 0;
+ }
+ mtx_unlock(&filterops_lock);
- if (filt > 0)
- panic("filt(%d) > 0", filt);
- if (filt + EVFILT_SYSCOUNT < 0)
- panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0",
- filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT);
- if (sysfilt_ops[~filt] != &null_filtops)
- panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt);
- sysfilt_ops[~filt] = filtops;
return (0);
}
int
kqueue_del_filteropts(int filt)
{
+ int error;
- if (filt > 0)
- panic("filt(%d) > 0", filt);
- if (filt + EVFILT_SYSCOUNT < 0)
- panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0",
- filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT);
- if (sysfilt_ops[~filt] == &null_filtops)
- panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt);
- sysfilt_ops[~filt] = &null_filtops;
- return (0);
+ error = 0;
+ if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
+ return EINVAL;
+
+ mtx_lock(&filterops_lock);
+ if (sysfilt_ops[~filt].for_fop == &null_filtops ||
+ sysfilt_ops[~filt].for_fop == NULL)
+ error = EINVAL;
+ else if (sysfilt_ops[~filt].for_refcnt != 0)
+ error = EBUSY;
+ else {
+ sysfilt_ops[~filt].for_fop = &null_filtops;
+ sysfilt_ops[~filt].for_refcnt = 0;
+ }
+ mtx_unlock(&filterops_lock);
+
+ return error;
+}
+
+static struct filterops *
+kqueue_fo_find(int filt)
+{
+
+ if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
+ return NULL;
+
+ mtx_lock(&filterops_lock);
+ sysfilt_ops[~filt].for_refcnt++;
+ if (sysfilt_ops[~filt].for_fop == NULL)
+ sysfilt_ops[~filt].for_fop = &null_filtops;
+ mtx_unlock(&filterops_lock);
+
+ return sysfilt_ops[~filt].for_fop;
+}
+
+static void
+kqueue_fo_release(int filt)
+{
+
+ if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
+ return;
+
+ mtx_lock(&filterops_lock);
+ KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
+ ("filter object refcount not valid on release"));
+ sysfilt_ops[~filt].for_refcnt--;
+ mtx_unlock(&filterops_lock);
}
+/*
+ * A ref to kq (obtained via kqueue_aquire) should be held. waitok will
+ * influence if memory allocation should wait. Make sure it is 0 if you
+ * hold any mutexes.
+ */
int
-kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td)
+kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
{
- struct filedesc *fdp = kq->kq_fdp;
+ struct filedesc *fdp;
struct filterops *fops;
- struct file *fp = NULL;
- struct knote *kn = NULL;
- int s, error = 0;
-
- if (kev->filter < 0) {
- if (kev->filter + EVFILT_SYSCOUNT < 0)
- return (EINVAL);
- fops = sysfilt_ops[~kev->filter]; /* to 0-base index */
- } else {
- /*
- * XXX
- * filter attach routine is responsible for insuring that
- * the identifier can be attached to it.
- */
- printf("unknown filter: %d\n", kev->filter);
- return (EINVAL);
- }
+ struct file *fp;
+ struct knote *kn, *tkn;
+ int error, filt, event;
+ int haskqglobal;
+ int fd;
- FILEDESC_LOCK(fdp);
+ fdp = NULL;
+ fp = NULL;
+ kn = NULL;
+ error = 0;
+ haskqglobal = 0;
+
+ filt = kev->filter;
+ fops = kqueue_fo_find(filt);
+ if (fops == NULL)
+ return EINVAL;
+
+ tkn = knote_alloc(waitok); /* prevent waiting with locks */
+
+findkn:
if (fops->f_isfd) {
+ KASSERT(td != NULL, ("td is NULL"));
+ fdp = td->td_proc->p_fd;
+ FILEDESC_LOCK(fdp);
/* validate descriptor */
- if ((u_int)kev->ident >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[kev->ident]) == NULL) {
+ fd = kev->ident;
+ if (fd < 0 || fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[fd]) == NULL) {
FILEDESC_UNLOCK(fdp);
- return (EBADF);
+ error = EBADF;
+ goto done;
}
fhold(fp);
- if (kev->ident < fdp->fd_knlistsize) {
- SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link)
- if (kq == kn->kn_kq &&
- kev->filter == kn->kn_filter)
+ if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
+ kev->ident, 0) != 0) {
+ /* unlock and try again */
+ FILEDESC_UNLOCK(fdp);
+ fdrop(fp, td);
+ fp = NULL;
+ error = kqueue_expand(kq, fops, kev->ident, waitok);
+ if (error)
+ goto done;
+ goto findkn;
+ }
+
+ if (fp->f_type == DTYPE_KQUEUE) {
+ /*
+ * if we add some inteligence about what we are doing,
+ * we should be able to support events on ourselves.
+ * We need to know when we are doing this to prevent
+ * getting both the knlist lock and the kq lock since
+ * they are the same thing.
+ */
+ if (fp->f_data == kq) {
+ FILEDESC_UNLOCK(fdp);
+ error = EINVAL;
+ goto done_noglobal;
+ }
+
+ KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
+ }
+
+ KQ_LOCK(kq);
+ if (kev->ident < kq->kq_knlistsize) {
+ SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
+ if (kev->filter == kn->kn_filter)
break;
}
+ FILEDESC_UNLOCK(fdp);
} else {
- if (fdp->fd_knhashmask != 0) {
+ if ((kev->flags & EV_ADD) == EV_ADD)
+ kqueue_expand(kq, fops, kev->ident, waitok);
+
+ KQ_LOCK(kq);
+ if (kq->kq_knhashmask != 0) {
struct klist *list;
- list = &fdp->fd_knhash[
- KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
+ list = &kq->kq_knhash[
+ KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
SLIST_FOREACH(kn, list, kn_link)
if (kev->ident == kn->kn_id &&
- kq == kn->kn_kq &&
kev->filter == kn->kn_filter)
break;
}
}
- FILEDESC_UNLOCK(fdp);
+
+ /* knote is in the process of changing, wait for it to stablize. */
+ if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+ if (fp != NULL) {
+ fdrop(fp, td);
+ fp = NULL;
+ }
+ KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+ kq->kq_state |= KQ_FLUXWAIT;
+ msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
+ goto findkn;
+ }
if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
+ KQ_UNLOCK(kq);
error = ENOENT;
goto done;
}
@@ -589,9 +793,9 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td)
* kn now contains the matching knote, or NULL if no match
*/
if (kev->flags & EV_ADD) {
-
if (kn == NULL) {
- kn = knote_alloc();
+ kn = tkn;
+ tkn = NULL;
if (kn == NULL) {
error = ENOMEM;
goto done;
@@ -599,11 +803,11 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td)
kn->kn_fp = fp;
kn->kn_kq = kq;
kn->kn_fop = fops;
-
/*
- * apply reference count to knote structure, and
+ * apply reference counts to knote structure, and
* do not release it at the end of this routine.
*/
+ fops = NULL;
fp = NULL;
kn->kn_sfflags = kev->fflags;
@@ -611,29 +815,50 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td)
kev->fflags = 0;
kev->data = 0;
kn->kn_kevent = *kev;
+ kn->kn_status = KN_INFLUX|KN_DETACHED;
- knote_attach(kn, fdp);
- if ((error = fops->f_attach(kn)) != 0) {
+ error = knote_attach(kn, kq);
+ KQ_UNLOCK(kq);
+ if (error != 0) {
+ tkn = kn;
+ goto done;
+ }
+
+ if ((error = kn->kn_fop->f_attach(kn)) != 0) {
knote_drop(kn, td);
goto done;
}
+ KN_LIST_LOCK(kn);
} else {
/*
* The user may change some filter values after the
* initial EV_ADD, but doing so will not reset any
* filter which has already been triggered.
*/
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ KN_LIST_LOCK(kn);
kn->kn_sfflags = kev->fflags;
kn->kn_sdata = kev->data;
kn->kn_kevent.udata = kev->udata;
}
- s = splhigh();
- if (kn->kn_fop->f_event(kn, 0))
- KNOTE_ACTIVATE(kn);
- splx(s);
-
+ /*
+ * We can get here with kn->kn_knlist == NULL.
+ * This can happen when the initial attach event decides that
+ * the event is "completed" already. i.e. filt_procattach
+ * is called on a zombie process. It will call filt_proc
+ * which will remove it from the list, and NULL kn_knlist.
+ */
+ event = kn->kn_fop->f_event(kn, 0);
+ KN_LIST_UNLOCK(kn);
+ KQ_LOCK(kq);
+ if (event)
+ KNOTE_ACTIVATE(kn, 1);
+ kn->kn_status &= ~KN_INFLUX;
} else if (kev->flags & EV_DELETE) {
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
kn->kn_fop->f_detach(kn);
knote_drop(kn, td);
goto done;
@@ -641,48 +866,208 @@ kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td)
if ((kev->flags & EV_DISABLE) &&
((kn->kn_status & KN_DISABLED) == 0)) {
- s = splhigh();
kn->kn_status |= KN_DISABLED;
- splx(s);
}
if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
- s = splhigh();
kn->kn_status &= ~KN_DISABLED;
if ((kn->kn_status & KN_ACTIVE) &&
((kn->kn_status & KN_QUEUED) == 0))
knote_enqueue(kn);
- splx(s);
}
+ KQ_UNLOCK_FLUX(kq);
done:
+ KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+done_noglobal:
if (fp != NULL)
fdrop(fp, td);
+ if (tkn != NULL)
+ knote_free(tkn);
+ if (fops != NULL)
+ kqueue_fo_release(filt);
return (error);
}
static int
-kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp,
- const struct timespec *tsp, struct thread *td)
+kqueue_aquire(struct file *fp, struct kqueue **kqp)
+{
+ int error;
+ struct kqueue *kq;
+
+ error = 0;
+
+ FILE_LOCK(fp);
+ do {
+ kq = fp->f_data;
+ if (fp->f_type != DTYPE_KQUEUE || kq == NULL) {
+ error = EBADF;
+ break;
+ }
+ *kqp = kq;
+ KQ_LOCK(kq);
+ if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
+ KQ_UNLOCK(kq);
+ error = EBADF;
+ break;
+ }
+ kq->kq_refcnt++;
+ KQ_UNLOCK(kq);
+ } while (0);
+ FILE_UNLOCK(fp);
+
+ return error;
+}
+
+static void
+kqueue_release(struct kqueue *kq, int locked)
+{
+ if (locked)
+ KQ_OWNED(kq);
+ else
+ KQ_LOCK(kq);
+ kq->kq_refcnt--;
+ if (kq->kq_refcnt == 1)
+ wakeup(&kq->kq_refcnt);
+ if (!locked)
+ KQ_UNLOCK(kq);
+}
+
+static void
+kqueue_schedtask(struct kqueue *kq)
+{
+
+ KQ_OWNED(kq);
+ KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
+ ("scheduling kqueue task while draining"));
+
+ if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
+ taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task);
+ kq->kq_state |= KQ_TASKSCHED;
+ }
+}
+
+/*
+ * Expand the kq to make sure we have storage for fops/ident pair.
+ *
+ * Return 0 on success (or no work necessary), return errno on failure.
+ *
+ * Not calling hashinit w/ waitok (proper malloc flag) should be safe.
+ * If kqueue_register is called from a non-fd context, there usually/should
+ * be no locks held.
+ */
+static int
+kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
+ int waitok)
+{
+ struct klist *list, *tmp_knhash;
+ u_long tmp_knhashmask;
+ int size;
+ int fd;
+ int mflag = waitok ? M_WAITOK : M_NOWAIT;
+
+ KQ_NOTOWNED(kq);
+
+ if (fops->f_isfd) {
+ fd = ident;
+ if (kq->kq_knlistsize <= fd) {
+ size = kq->kq_knlistsize;
+ while (size <= fd)
+ size += KQEXTENT;
+ MALLOC(list, struct klist *,
+ size * sizeof list, M_KQUEUE, mflag);
+ if (list == NULL)
+ return ENOMEM;
+ KQ_LOCK(kq);
+ if (kq->kq_knlistsize > fd) {
+ FREE(list, M_KQUEUE);
+ list = NULL;
+ } else {
+ if (kq->kq_knlist != NULL) {
+ bcopy(kq->kq_knlist, list,
+ kq->kq_knlistsize * sizeof list);
+ FREE(kq->kq_knlist, M_KQUEUE);
+ kq->kq_knlist = NULL;
+ }
+ bzero((caddr_t)list +
+ kq->kq_knlistsize * sizeof list,
+ (size - kq->kq_knlistsize) * sizeof list);
+ kq->kq_knlistsize = size;
+ kq->kq_knlist = list;
+ }
+ KQ_UNLOCK(kq);
+ }
+ } else {
+ if (kq->kq_knhashmask == 0) {
+ tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
+ &tmp_knhashmask);
+ if (tmp_knhash == NULL)
+ return ENOMEM;
+ KQ_LOCK(kq);
+ if (kq->kq_knhashmask == 0) {
+ kq->kq_knhash = tmp_knhash;
+ kq->kq_knhashmask = tmp_knhashmask;
+ } else {
+ free(tmp_knhash, M_KQUEUE);
+ }
+ KQ_UNLOCK(kq);
+ }
+ }
+
+ KQ_NOTOWNED(kq);
+ return 0;
+}
+
+static void
+kqueue_task(void *arg, int pending)
{
struct kqueue *kq;
+ int haskqglobal;
+
+ haskqglobal = 0;
+ kq = arg;
+
+ KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
+ KQ_LOCK(kq);
+
+ KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
+
+ kq->kq_state &= ~KQ_TASKSCHED;
+ if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
+ wakeup(&kq->kq_state);
+ }
+ KQ_UNLOCK(kq);
+ KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+}
+
+/*
+ * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
+ * We treat KN_MARKER knotes as if they are INFLUX.
+ */
+static int
+kqueue_scan(struct kqueue *kq, int maxevents, struct kevent *ulistp,
+ const struct timespec *tsp, struct kevent *keva, struct thread *td)
+{
struct kevent *kevp;
struct timeval atv, rtv, ttv;
struct knote *kn, marker;
- int s, count, timeout, nkev = 0, error = 0;
+ int count, timeout, nkev, error;
+ int haskqglobal;
- FILE_LOCK_ASSERT(fp, MA_NOTOWNED);
-
- kq = fp->f_data;
count = maxevents;
- if (count == 0)
- goto done;
+ nkev = 0;
+ error = 0;
+ haskqglobal = 0;
+ marker.kn_status = KN_MARKER;
+
+ if (maxevents == 0)
+ goto done_nl;
if (tsp != NULL) {
TIMESPEC_TO_TIMEVAL(&atv, tsp);
if (itimerfix(&atv)) {
error = EINVAL;
- goto done;
+ goto done_nl;
}
if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
timeout = -1;
@@ -696,6 +1081,7 @@ kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp,
atv.tv_usec = 0;
timeout = 0;
}
+ KQ_LOCK(kq);
goto start;
retry:
@@ -710,16 +1096,15 @@ retry:
}
start:
- kevp = kq->kq_kev;
- s = splhigh();
+ kevp = keva;
if (kq->kq_count == 0) {
if (timeout < 0) {
error = EWOULDBLOCK;
} else {
kq->kq_state |= KQ_SLEEP;
- error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout);
+ error = msleep(kq, &kq->kq_lock, PSOCK | PCATCH,
+ "kqread", timeout);
}
- splx(s);
if (error == 0)
goto retry;
/* don't restart after signals... */
@@ -732,63 +1117,99 @@ start:
TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe);
while (count) {
+ KQ_OWNED(kq);
kn = TAILQ_FIRST(&kq->kq_head);
- TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
- if (kn == &marker) {
- splx(s);
- if (count == maxevents)
- goto retry;
- goto done;
+
+ if ((kn->kn_status == KN_MARKER && kn != &marker) ||
+ (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+ kq->kq_state |= KQ_FLUXWAIT;
+ error = msleep(kq, &kq->kq_lock, PSOCK,
+ "kqflxwt", 0);
+ continue;
}
- if (kn->kn_status & KN_DISABLED) {
+
+ TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
+ if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
kn->kn_status &= ~KN_QUEUED;
kq->kq_count--;
continue;
}
- if ((kn->kn_flags & EV_ONESHOT) == 0 &&
- kn->kn_fop->f_event(kn, 0) == 0) {
- kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
- kq->kq_count--;
- continue;
+ if (kn == &marker) {
+ KQ_FLUX_WAKEUP(kq);
+ if (count == maxevents)
+ goto retry;
+ goto done;
}
- *kevp = kn->kn_kevent;
- kevp++;
- nkev++;
- if (kn->kn_flags & EV_ONESHOT) {
+ KASSERT((kn->kn_status & KN_INFLUX) == 0,
+ ("KN_INFLUX set when not suppose to be"));
+
+ if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
kn->kn_status &= ~KN_QUEUED;
+ kn->kn_status |= KN_INFLUX;
kq->kq_count--;
- splx(s);
+ KQ_UNLOCK(kq);
+ /*
+ * We don't need to lock the list since we've marked
+ * it _INFLUX.
+ */
+ *kevp = kn->kn_kevent;
kn->kn_fop->f_detach(kn);
knote_drop(kn, td);
- s = splhigh();
- } else if (kn->kn_flags & EV_CLEAR) {
- kn->kn_data = 0;
- kn->kn_fflags = 0;
- kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
- kq->kq_count--;
+ KQ_LOCK(kq);
+ kn = NULL;
} else {
- TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
+ KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
+ KN_LIST_LOCK(kn);
+ if (kn->kn_fop->f_event(kn, 0) == 0) {
+ KN_LIST_UNLOCK(kn);
+ KQ_LOCK(kq);
+ kn->kn_status &=
+ ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX);
+ kq->kq_count--;
+ continue;
+ }
+ *kevp = kn->kn_kevent;
+ KQ_LOCK(kq);
+ if (kn->kn_flags & EV_CLEAR) {
+ kn->kn_data = 0;
+ kn->kn_fflags = 0;
+ kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
+ kq->kq_count--;
+ } else
+ TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
+ KN_LIST_UNLOCK(kn);
+ kn->kn_status &= ~(KN_INFLUX);
}
+
+ /* we are returning a copy to the user */
+ kevp++;
+ nkev++;
count--;
+
if (nkev == KQ_NEVENTS) {
- splx(s);
- error = copyout(&kq->kq_kev, ulistp,
- sizeof(struct kevent) * nkev);
+ KQ_UNLOCK_FLUX(kq);
+ error = copyout(keva, ulistp, sizeof *keva * nkev);
ulistp += nkev;
nkev = 0;
- kevp = kq->kq_kev;
- s = splhigh();
+ kevp = keva;
+ KQ_LOCK(kq);
if (error)
break;
}
}
TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe);
- splx(s);
done:
+ KQ_OWNED(kq);
+ KQ_UNLOCK_FLUX(kq);
+ KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+done_nl:
+ KQ_NOTOWNED(kq);
if (nkev != 0)
- error = copyout(&kq->kq_kev, ulistp,
- sizeof(struct kevent) * nkev);
- td->td_retval[0] = maxevents - count;
+ error = copyout(keva, ulistp, sizeof *keva * nkev);
+ td->td_retval[0] = maxevents - count;
return (error);
}
@@ -867,18 +1288,22 @@ kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
{
struct kqueue *kq;
int revents = 0;
- int s = splnet();
+ int error;
- kq = fp->f_data;
- if (events & (POLLIN | POLLRDNORM)) {
- if (kq->kq_count) {
- revents |= events & (POLLIN | POLLRDNORM);
+ if ((error = kqueue_aquire(fp, &kq)))
+ return POLLERR;
+
+ KQ_LOCK(kq);
+ if (events & (POLLIN | POLLRDNORM)) {
+ if (kq->kq_count) {
+ revents |= events & (POLLIN | POLLRDNORM);
} else {
- selrecord(td, &kq->kq_sel);
+ selrecord(td, &kq->kq_sel);
kq->kq_state |= KQ_SEL;
}
}
- splx(s);
+ kqueue_release(kq, 1);
+ KQ_UNLOCK(kq);
return (revents);
}
@@ -887,15 +1312,8 @@ static int
kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
struct thread *td)
{
- struct kqueue *kq;
- /* Unlocked read. */
- kq = fp->f_data;
- bzero((void *)st, sizeof(*st));
- st->st_size = kq->kq_count;
- st->st_blksize = sizeof(struct kevent);
- st->st_mode = S_IFIFO;
- return (0);
+ return (ENXIO);
}
/*ARGSUSED*/
@@ -903,222 +1321,431 @@ static int
kqueue_close(struct file *fp, struct thread *td)
{
struct kqueue *kq = fp->f_data;
- struct filedesc *fdp = kq->kq_fdp;
- struct knote **knp, *kn, *kn0;
+ struct filedesc *fdp;
+ struct knote *kn;
int i;
+ int error;
- mtx_lock(&Giant);
+ GIANT_REQUIRED;
- FILEDESC_LOCK(fdp);
- for (i = 0; i < fdp->fd_knlistsize; i++) {
- knp = &SLIST_FIRST(&fdp->fd_knlist[i]);
- kn = *knp;
- while (kn != NULL) {
- kn0 = SLIST_NEXT(kn, kn_link);
- if (kq == kn->kn_kq) {
- kn->kn_fop->f_detach(kn);
- *knp = kn0;
- FILE_LOCK(kn->kn_fp);
- FILEDESC_UNLOCK(fdp);
- fdrop_locked(kn->kn_fp, td);
- knote_free(kn);
- FILEDESC_LOCK(fdp);
- } else {
- knp = &SLIST_NEXT(kn, kn_link);
- }
- kn = kn0;
+ if ((error = kqueue_aquire(fp, &kq)))
+ return error;
+
+ KQ_LOCK(kq);
+
+ KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
+ ("kqueue already closing"));
+ kq->kq_state |= KQ_CLOSING;
+ if (kq->kq_refcnt > 1)
+ msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
+
+ KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
+ fdp = kq->kq_fdp;
+
+ KASSERT(knlist_empty(&kq->kq_sel.si_note),
+ ("kqueue's knlist not empty"));
+
+ for (i = 0; i < kq->kq_knlistsize; i++) {
+ while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
+ KASSERT((kn->kn_status & KN_INFLUX) == 0,
+ ("KN_INFLUX set when not suppose to be"));
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ KQ_LOCK(kq);
}
}
- if (fdp->fd_knhashmask != 0) {
- for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
- knp = &SLIST_FIRST(&fdp->fd_knhash[i]);
- kn = *knp;
- while (kn != NULL) {
- kn0 = SLIST_NEXT(kn, kn_link);
- if (kq == kn->kn_kq) {
- kn->kn_fop->f_detach(kn);
- *knp = kn0;
- /* XXX non-fd release of kn->kn_ptr */
- FILEDESC_UNLOCK(fdp);
- knote_free(kn);
- FILEDESC_LOCK(fdp);
- } else {
- knp = &SLIST_NEXT(kn, kn_link);
- }
- kn = kn0;
+ if (kq->kq_knhashmask != 0) {
+ for (i = 0; i <= kq->kq_knhashmask; i++) {
+ while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
+ KASSERT((kn->kn_status & KN_INFLUX) == 0,
+ ("KN_INFLUX set when not suppose to be"));
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ KQ_LOCK(kq);
}
}
}
- FILEDESC_UNLOCK(fdp);
- if (kq->kq_state & KQ_SEL) {
+
+ if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
+ kq->kq_state |= KQ_TASKDRAIN;
+ msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
+ }
+
+ if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
kq->kq_state &= ~KQ_SEL;
selwakeuppri(&kq->kq_sel, PSOCK);
}
+
+ KQ_UNLOCK(kq);
+
+ FILEDESC_LOCK(fdp);
+ SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list);
+ FILEDESC_UNLOCK(fdp);
+
+ knlist_destroy(&kq->kq_sel.si_note);
+ mtx_destroy(&kq->kq_lock);
+ kq->kq_fdp = NULL;
+
+ if (kq->kq_knhash != NULL)
+ free(kq->kq_knhash, M_KQUEUE);
+ if (kq->kq_knlist != NULL)
+ free(kq->kq_knlist, M_KQUEUE);
+
funsetown(&kq->kq_sigio);
free(kq, M_KQUEUE);
fp->f_data = NULL;
- mtx_unlock(&Giant);
return (0);
}
static void
kqueue_wakeup(struct kqueue *kq)
{
+ KQ_OWNED(kq);
- if (kq->kq_state & KQ_SLEEP) {
+ if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
kq->kq_state &= ~KQ_SLEEP;
wakeup(kq);
}
- if (kq->kq_state & KQ_SEL) {
+ if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
kq->kq_state &= ~KQ_SEL;
selwakeuppri(&kq->kq_sel, PSOCK);
}
- if (kq->kq_state & KQ_ASYNC) {
+ if (!knlist_empty(&kq->kq_sel.si_note))
+ kqueue_schedtask(kq);
+ if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
pgsigio(&kq->kq_sigio, SIGIO, 0);
}
- KNOTE(&kq->kq_sel.si_note, 0);
}
/*
- * walk down a list of knotes, activating them if their event has triggered.
+ * Walk down a list of knotes, activating them if their event has triggered.
+ *
+ * There is a possibility to optimize in the case of one kq watching another.
+ * Instead of scheduling a task to wake it up, you could pass enough state
+ * down the chain to make up the parent kqueue. Make this code functional
+ * first.
*/
void
-knote(struct klist *list, long hint)
+knote(struct knlist *list, long hint, int islocked)
{
+ struct kqueue *kq;
struct knote *kn;
- SLIST_FOREACH(kn, list, kn_selnext)
- if (kn->kn_fop->f_event(kn, hint))
- KNOTE_ACTIVATE(kn);
+ if (list == NULL)
+ return;
+
+ mtx_assert(list->kl_lock, islocked ? MA_OWNED : MA_NOTOWNED);
+ if (!islocked)
+ mtx_lock(list->kl_lock);
+ /*
+ * If we unlock the list lock (and set KN_INFLUX), we can eliminate
+ * the kqueue scheduling, but this will introduce four
+ * lock/unlock's for each knote to test. If we do, continue to use
+ * SLIST_FOREACH, SLIST_FOREACH_SAFE is not safe in our case, it is
+ * only safe if you want to remove the current item, which we are
+ * not doing.
+ */
+ SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
+ kq = kn->kn_kq;
+ if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
+ KQ_LOCK(kq);
+ if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
+ kn->kn_status |= KN_HASKQLOCK;
+ if (kn->kn_fop->f_event(kn, hint))
+ KNOTE_ACTIVATE(kn, 1);
+ kn->kn_status &= ~KN_HASKQLOCK;
+ }
+ KQ_UNLOCK(kq);
+ }
+ kq = NULL;
+ }
+ if (!islocked)
+ mtx_unlock(list->kl_lock);
+}
+
+/*
+ * add a knote to a knlist
+ */
+void
+knlist_add(struct knlist *knl, struct knote *kn, int islocked)
+{
+ mtx_assert(knl->kl_lock, islocked ? MA_OWNED : MA_NOTOWNED);
+ KQ_NOTOWNED(kn->kn_kq);
+ KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
+ (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
+ if (!islocked)
+ mtx_lock(knl->kl_lock);
+ SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
+ if (!islocked)
+ mtx_unlock(knl->kl_lock);
+ KQ_LOCK(kn->kn_kq);
+ kn->kn_knlist = knl;
+ kn->kn_status &= ~KN_DETACHED;
+ KQ_UNLOCK(kn->kn_kq);
+}
+
+static void
+knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked)
+{
+ KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
+ mtx_assert(knl->kl_lock, knlislocked ? MA_OWNED : MA_NOTOWNED);
+ mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
+ if (!kqislocked)
+ KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
+ ("knlist_remove called w/o knote being KN_INFLUX or already removed"));
+ if (!knlislocked)
+ mtx_lock(knl->kl_lock);
+ SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
+ kn->kn_knlist = NULL;
+ if (!knlislocked)
+ mtx_unlock(knl->kl_lock);
+ if (!kqislocked)
+ KQ_LOCK(kn->kn_kq);
+ kn->kn_status |= KN_DETACHED;
+ if (!kqislocked)
+ KQ_UNLOCK(kn->kn_kq);
}
/*
* remove all knotes from a specified klist
*/
void
-knote_remove(struct thread *td, struct klist *list)
+knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
+{
+
+ knlist_remove_kq(knl, kn, islocked, 0);
+}
+
+/*
+ * remove knote from a specified klist while in f_event handler.
+ */
+void
+knlist_remove_inevent(struct knlist *knl, struct knote *kn)
+{
+
+ knlist_remove_kq(knl, kn, 1,
+ (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK);
+}
+
+int
+knlist_empty(struct knlist *knl)
+{
+
+ mtx_assert(knl->kl_lock, MA_OWNED);
+ return SLIST_EMPTY(&knl->kl_list);
+}
+
+static struct mtx knlist_lock;
+MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
+ MTX_DEF);
+
+void
+knlist_init(struct knlist *knl, struct mtx *mtx)
+{
+
+ if (mtx == NULL)
+ knl->kl_lock = &knlist_lock;
+ else
+ knl->kl_lock = mtx;
+
+ SLIST_INIT(&knl->kl_list);
+}
+
+void
+knlist_destroy(struct knlist *knl)
+{
+
+#ifdef INVARIANTS
+ /*
+ * if we run across this error, we need to find the offending
+ * driver and have it call knlist_clear.
+ */
+ if (!SLIST_EMPTY(&knl->kl_list))
+ printf("WARNING: destroying knlist w/ knotes on it!\n");
+#endif
+
+ knl->kl_lock = NULL;
+ SLIST_INIT(&knl->kl_list);
+}
+
+/*
+ * Even if we are locked, we may need to drop the lock to allow any influx
+ * knotes time to "settle".
+ */
+void
+knlist_clear(struct knlist *knl, int islocked)
{
struct knote *kn;
+ struct kqueue *kq;
- while ((kn = SLIST_FIRST(list)) != NULL) {
- kn->kn_fop->f_detach(kn);
- knote_drop(kn, td);
+ if (islocked)
+ mtx_assert(knl->kl_lock, MA_OWNED);
+ else {
+ mtx_assert(knl->kl_lock, MA_NOTOWNED);
+again: /* need to reaquire lock since we have dropped it */
+ mtx_lock(knl->kl_lock);
+ }
+
+ SLIST_FOREACH(kn, &knl->kl_list, kn_selnext) {
+ kq = kn->kn_kq;
+ KQ_LOCK(kq);
+ if ((kn->kn_status & KN_INFLUX) &&
+ (kn->kn_status & KN_DETACHED) != KN_DETACHED) {
+ KQ_UNLOCK(kq);
+ continue;
+ }
+ /* Make sure cleared knotes disappear soon */
+ kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+ knlist_remove_kq(knl, kn, 1, 1);
+ KQ_UNLOCK(kq);
+ kq = NULL;
+ }
+
+ if (!SLIST_EMPTY(&knl->kl_list)) {
+ /* there are still KN_INFLUX remaining */
+ kn = SLIST_FIRST(&knl->kl_list);
+ kq = kn->kn_kq;
+ KQ_LOCK(kq);
+ KASSERT(kn->kn_status & KN_INFLUX,
+ ("knote removed w/o list lock"));
+ mtx_unlock(knl->kl_lock);
+ kq->kq_state |= KQ_FLUXWAIT;
+ msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
+ kq = NULL;
+ goto again;
+ }
+
+ SLIST_INIT(&knl->kl_list);
+
+ if (islocked)
+ mtx_assert(knl->kl_lock, MA_OWNED);
+ else {
+ mtx_unlock(knl->kl_lock);
+ mtx_assert(knl->kl_lock, MA_NOTOWNED);
}
}
/*
* remove all knotes referencing a specified fd
+ * must be called with FILEDESC lock. This prevents a race where a new fd
+ * comes along and occupies the entry and we attach a knote to the fd.
*/
void
knote_fdclose(struct thread *td, int fd)
{
struct filedesc *fdp = td->td_proc->p_fd;
- struct klist *list;
+ struct kqueue *kq;
+ struct knote *kn;
+ int influx;
- FILEDESC_LOCK(fdp);
- list = &fdp->fd_knlist[fd];
- FILEDESC_UNLOCK(fdp);
- knote_remove(td, list);
+ FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+
+ /*
+ * We shouldn't have to worry about new kevents appearing on fd
+ * since filedesc is locked.
+ */
+ SLIST_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
+ KQ_LOCK(kq);
+
+again:
+ influx = 0;
+ while (kq->kq_knlistsize > fd &&
+ (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
+ if (kn->kn_status & KN_INFLUX) {
+ /* someone else might be waiting on our knote */
+ if (influx)
+ wakeup(kq);
+ kq->kq_state |= KQ_FLUXWAIT;
+ msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
+ goto again;
+ }
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ influx = 1;
+ KQ_LOCK(kq);
+ }
+ KQ_UNLOCK_FLUX(kq);
+ }
}
-static void
-knote_attach(struct knote *kn, struct filedesc *fdp)
+static int
+knote_attach(struct knote *kn, struct kqueue *kq)
{
- struct klist *list, *tmp_knhash;
- u_long tmp_knhashmask;
- int size;
+ struct klist *list;
- FILEDESC_LOCK(fdp);
+ KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
+ KQ_OWNED(kq);
- if (! kn->kn_fop->f_isfd) {
- if (fdp->fd_knhashmask == 0) {
- FILEDESC_UNLOCK(fdp);
- tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
- &tmp_knhashmask);
- FILEDESC_LOCK(fdp);
- if (fdp->fd_knhashmask == 0) {
- fdp->fd_knhash = tmp_knhash;
- fdp->fd_knhashmask = tmp_knhashmask;
- } else {
- free(tmp_knhash, M_KQUEUE);
- }
- }
- list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
- goto done;
+ if (kn->kn_fop->f_isfd) {
+ if (kn->kn_id >= kq->kq_knlistsize)
+ return ENOMEM;
+ list = &kq->kq_knlist[kn->kn_id];
+ } else {
+ if (kq->kq_knhash == NULL)
+ return ENOMEM;
+ list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
}
- if (fdp->fd_knlistsize <= kn->kn_id) {
- size = fdp->fd_knlistsize;
- while (size <= kn->kn_id)
- size += KQEXTENT;
- FILEDESC_UNLOCK(fdp);
- MALLOC(list, struct klist *,
- size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
- FILEDESC_LOCK(fdp);
- if (fdp->fd_knlistsize > kn->kn_id) {
- FREE(list, M_KQUEUE);
- goto bigenough;
- }
- if (fdp->fd_knlist != NULL) {
- bcopy(fdp->fd_knlist, list,
- fdp->fd_knlistsize * sizeof(struct klist *));
- FREE(fdp->fd_knlist, M_KQUEUE);
- }
- bzero((caddr_t)list +
- fdp->fd_knlistsize * sizeof(struct klist *),
- (size - fdp->fd_knlistsize) * sizeof(struct klist *));
- fdp->fd_knlistsize = size;
- fdp->fd_knlist = list;
- }
-bigenough:
- list = &fdp->fd_knlist[kn->kn_id];
-done:
- FILEDESC_UNLOCK(fdp);
SLIST_INSERT_HEAD(list, kn, kn_link);
- kn->kn_status = 0;
+
+ return 0;
}
/*
- * should be called at spl == 0, since we don't want to hold spl
- * while calling fdrop and free.
+ * knote must already have been detatched using the f_detach method.
+ * no lock need to be held, it is assumed that the KN_INFLUX flag is set
+ * to prevent other removal.
*/
static void
knote_drop(struct knote *kn, struct thread *td)
{
- struct filedesc *fdp = td->td_proc->p_fd;
+ struct kqueue *kq;
struct klist *list;
- FILEDESC_LOCK(fdp);
+ kq = kn->kn_kq;
+
+ KQ_NOTOWNED(kq);
+ KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
+ ("knote_drop called without KN_INFLUX set in kn_status"));
+
+ KQ_LOCK(kq);
if (kn->kn_fop->f_isfd)
- list = &fdp->fd_knlist[kn->kn_id];
+ list = &kq->kq_knlist[kn->kn_id];
else
- list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
- if (kn->kn_fop->f_isfd)
- FILE_LOCK(kn->kn_fp);
- FILEDESC_UNLOCK(fdp);
+ list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
SLIST_REMOVE(list, kn, knote, kn_link);
if (kn->kn_status & KN_QUEUED)
knote_dequeue(kn);
- if (kn->kn_fop->f_isfd)
- fdrop_locked(kn->kn_fp, td);
+ KQ_UNLOCK_FLUX(kq);
+
+ if (kn->kn_fop->f_isfd) {
+ fdrop(kn->kn_fp, td);
+ kn->kn_fp = NULL;
+ }
+ kqueue_fo_release(kn->kn_kevent.filter);
+ kn->kn_fop = NULL;
knote_free(kn);
}
-
static void
knote_enqueue(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
- int s = splhigh();
+ KQ_OWNED(kn->kn_kq);
KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
kn->kn_status |= KN_QUEUED;
kq->kq_count++;
- splx(s);
kqueue_wakeup(kq);
}
@@ -1126,33 +1753,34 @@ static void
knote_dequeue(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
- int s = splhigh();
+ KQ_OWNED(kn->kn_kq);
KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
kn->kn_status &= ~KN_QUEUED;
kq->kq_count--;
- splx(s);
}
static void
knote_init(void)
{
+
knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
-
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
static struct knote *
-knote_alloc(void)
+knote_alloc(int waitok)
{
- return ((struct knote *)uma_zalloc(knote_zone, M_WAITOK));
+ return ((struct knote *)uma_zalloc(knote_zone,
+ (waitok ? M_WAITOK : M_NOWAIT)|M_ZERO));
}
static void
knote_free(struct knote *kn)
{
- uma_zfree(knote_zone, kn);
+ if (kn != NULL)
+ uma_zfree(knote_zone, kn);
}
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index 7357468..34c7571 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -623,7 +623,7 @@ interpret:
* Notify others that we exec'd, and clear the P_INEXEC flag
* as we're now a bona fide freshly-execed process.
*/
- KNOTE(&p->p_klist, NOTE_EXEC);
+ KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
p->p_flag &= ~P_INEXEC;
/*
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index f53aa47..96e9cf2 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -438,18 +438,17 @@ retry:
mtx_unlock_spin(&sched_lock);
ruadd(p->p_ru, &p->p_stats->p_cru);
+ mtx_unlock(&Giant);
/*
* Notify interested parties of our demise.
*/
- KNOTE(&p->p_klist, NOTE_EXIT);
- mtx_unlock(&Giant);
+ KNOTE_LOCKED(&p->p_klist, NOTE_EXIT);
/*
* Just delete all entries in the p_klist. At this point we won't
* report any more events, and there are nasty race conditions that
* can beat us if we don't.
*/
- while (SLIST_FIRST(&p->p_klist))
- SLIST_REMOVE_HEAD(&p->p_klist, kn_selnext);
+ knlist_clear(&p->p_klist, 1);
/*
* Notify parent that we're gone. If parent has the PS_NOCLDWAIT
@@ -533,6 +532,12 @@ retry:
sched_exit(p->p_pptr, td);
/*
+ * hopefully no one will try to deliver a signal to the process this
+ * late in the game.
+ */
+ knlist_destroy(&p->p_klist);
+
+ /*
* Make sure the scheduler takes this thread out of its tables etc.
* This will also release this thread's reference to the ucred.
* Other thread parts to release include pcb bits and such.
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 78c6d8b..90f13c4 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -287,6 +287,7 @@ fork1(td, flags, pages, procp)
#ifdef MAC
mac_init_proc(newproc);
#endif
+ knlist_init(&newproc->p_klist, &newproc->p_mtx);
/* We have to lock the process tree while we look for a pid. */
sx_slock(&proctree_lock);
@@ -722,7 +723,7 @@ again:
/*
* Tell any interested parties about the new process.
*/
- KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid);
+ KNOTE_LOCKED(&p1->p_klist, NOTE_FORK | p2->p_pid);
PROC_UNLOCK(p1);
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index e6e8f67..4cd09cc 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -1674,7 +1674,7 @@ do_tdsignal(struct thread *td, int sig, sigtarget_t target)
ps = p->p_sigacts;
PROC_LOCK_ASSERT(p, MA_OWNED);
- KNOTE(&p->p_klist, NOTE_SIGNAL | sig);
+ KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig);
prop = sigprop(sig);
@@ -2720,9 +2720,7 @@ filt_sigattach(struct knote *kn)
kn->kn_ptr.p_proc = p;
kn->kn_flags |= EV_CLEAR; /* automatically set */
- PROC_LOCK(p);
- SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);
- PROC_UNLOCK(p);
+ knlist_add(&p->p_klist, kn, 0);
return (0);
}
@@ -2732,9 +2730,7 @@ filt_sigdetach(struct knote *kn)
{
struct proc *p = kn->kn_ptr.p_proc;
- PROC_LOCK(p);
- SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
- PROC_UNLOCK(p);
+ knlist_remove(&p->p_klist, kn, 0);
}
/*
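
The kern_exit.c, kern_fork.c and kern_sig.c hunks above feed NOTE_EXIT, NOTE_FORK, NOTE_EXEC and NOTE_SIGNAL through the per-process knlist. A minimal user-level consumer of that path, using only the documented kqueue(2)/kevent(2) API (the child command and error handling are arbitrary), might look like:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev;
	pid_t pid;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");

	if ((pid = fork()) == 0) {		/* child: exec something short-lived */
		execlp("sleep", "sleep", "1", (char *)NULL);
		_exit(127);
	}

	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD | EV_ENABLE,
	    NOTE_EXIT | NOTE_FORK | NOTE_EXEC, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");

	for (;;) {
		if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
			err(1, "kevent wait");
		if (kev.fflags & NOTE_EXEC)
			printf("pid %d exec'd\n", (int)kev.ident);
		if (kev.fflags & NOTE_EXIT) {
			printf("pid %d exited\n", (int)kev.ident);
			break;
		}
	}
	waitpid(pid, NULL, 0);
	return (0);
}
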
diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c
index 28376ac..c37a140 100644
--- a/sys/kern/sys_pipe.c
+++ b/sys/kern/sys_pipe.c
@@ -504,7 +504,7 @@ pipeselwakeup(cpipe)
}
if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
- KNOTE(&cpipe->pipe_sel.si_note, 0);
+ KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
}
/*
@@ -524,6 +524,7 @@ pipe_create(pipe)
error = pipespace_new(pipe, SMALL_PIPE_SIZE);
else
error = pipespace_new(pipe, PIPE_SIZE);
+ knlist_init(&pipe->pipe_sel.si_note, PIPE_MTX(pipe));
return (error);
}
@@ -1424,7 +1425,7 @@ pipeclose(cpipe)
ppipe->pipe_state |= PIPE_EOF;
wakeup(ppipe);
- KNOTE(&ppipe->pipe_sel.si_note, 0);
+ KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
}
/*
@@ -1438,6 +1439,8 @@ pipeclose(cpipe)
PIPE_LOCK(cpipe);
cpipe->pipe_present = 0;
pipeunlock(cpipe);
+ knlist_clear(&cpipe->pipe_sel.si_note, 1);
+ knlist_destroy(&cpipe->pipe_sel.si_note);
/*
* If both endpoints are now closed, release the memory for the
@@ -1476,10 +1479,10 @@ pipe_kqfilter(struct file *fp, struct knote *kn)
break;
default:
PIPE_UNLOCK(cpipe);
- return (1);
+ return (EINVAL);
}
- SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
+ knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
PIPE_UNLOCK(cpipe);
return (0);
}
@@ -1497,7 +1500,7 @@ filt_pipedetach(struct knote *kn)
}
cpipe = cpipe->pipe_peer;
}
- SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
+ knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
PIPE_UNLOCK(cpipe);
}
@@ -1507,6 +1510,7 @@ filt_piperead(struct knote *kn, long hint)
{
struct pipe *rpipe = kn->kn_fp->f_data;
struct pipe *wpipe = rpipe->pipe_peer;
+ int ret;
PIPE_LOCK(rpipe);
kn->kn_data = rpipe->pipe_buffer.cnt;
@@ -1519,8 +1523,9 @@ filt_piperead(struct knote *kn, long hint)
PIPE_UNLOCK(rpipe);
return (1);
}
+ ret = kn->kn_data > 0;
PIPE_UNLOCK(rpipe);
- return (kn->kn_data > 0);
+ return ret;
}
/*ARGSUSED*/
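
For reference, filt_piperead() above is what serves a plain EVFILT_READ registration on a pipe: the readable byte count ends up in kev.data. A minimal user-level sketch using only the standard kqueue(2) API, with error handling abbreviated:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev;
	int fds[2], kq;

	if (pipe(fds) == -1 || (kq = kqueue()) == -1)
		err(1, "setup");

	EV_SET(&kev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");

	(void)write(fds[1], "hello", 5);

	if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1)
		printf("%jd bytes readable on fd %d\n",
		    (intmax_t)kev.data, (int)kev.ident);
	return (0);
}
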
diff --git a/sys/kern/tty.c b/sys/kern/tty.c
index f0172df..fa3fe3f 100644
--- a/sys/kern/tty.c
+++ b/sys/kern/tty.c
@@ -281,6 +281,8 @@ tty_close(struct tty *tp)
tp->t_pgrp = NULL;
tp->t_session = NULL;
tp->t_state = 0;
+ knlist_clear(&tp->t_rsel.si_note, 0);
+ knlist_clear(&tp->t_wsel.si_note, 0);
ttyrel(tp);
splx(s);
return (0);
@@ -1259,7 +1261,7 @@ int
ttykqfilter(struct cdev *dev, struct knote *kn)
{
struct tty *tp;
- struct klist *klist;
+ struct knlist *klist;
int s;
KASSERT(devsw(dev)->d_flags & D_TTY,
@@ -1277,13 +1279,13 @@ ttykqfilter(struct cdev *dev, struct knote *kn)
kn->kn_fop = &ttywrite_filtops;
break;
default:
- return (1);
+ return (EINVAL);
}
kn->kn_hook = (caddr_t)dev;
s = spltty();
- SLIST_INSERT_HEAD(klist, kn, kn_selnext);
+ knlist_add(klist, kn, 0);
splx(s);
return (0);
@@ -1295,7 +1297,7 @@ filt_ttyrdetach(struct knote *kn)
struct tty *tp = ((struct cdev *)kn->kn_hook)->si_tty;
int s = spltty();
- SLIST_REMOVE(&tp->t_rsel.si_note, kn, knote, kn_selnext);
+ knlist_remove(&tp->t_rsel.si_note, kn, 0);
splx(s);
}
@@ -1318,7 +1320,7 @@ filt_ttywdetach(struct knote *kn)
struct tty *tp = ((struct cdev *)kn->kn_hook)->si_tty;
int s = spltty();
- SLIST_REMOVE(&tp->t_wsel.si_note, kn, knote, kn_selnext);
+ knlist_remove(&tp->t_wsel.si_note, kn, 0);
splx(s);
}
@@ -2365,7 +2367,7 @@ ttwakeup(struct tty *tp)
if (ISSET(tp->t_state, TS_ASYNC) && tp->t_sigio != NULL)
pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL));
wakeup(TSA_HUP_OR_INPUT(tp));
- KNOTE(&tp->t_rsel.si_note, 0);
+ KNOTE_UNLOCKED(&tp->t_rsel.si_note, 0);
}
/*
@@ -2389,7 +2391,7 @@ ttwwakeup(struct tty *tp)
CLR(tp->t_state, TS_SO_OLOWAT);
wakeup(TSA_OLOWAT(tp));
}
- KNOTE(&tp->t_wsel.si_note, 0);
+ KNOTE_UNLOCKED(&tp->t_wsel.si_note, 0);
}
/*
@@ -2754,6 +2756,8 @@ ttyrel(struct tty *tp)
TAILQ_REMOVE(&tty_list, tp, t_list);
mtx_unlock(&tp->t_mtx);
mtx_unlock(&tty_list_mutex);
+ knlist_destroy(&tp->t_rsel.si_note);
+ knlist_destroy(&tp->t_wsel.si_note);
mtx_destroy(&tp->t_mtx);
free(tp, M_TTYS);
return (i);
@@ -2789,6 +2793,8 @@ ttymalloc(struct tty *tp)
mtx_lock(&tty_list_mutex);
TAILQ_INSERT_TAIL(&tty_list, tp, t_list);
mtx_unlock(&tty_list_mutex);
+ knlist_init(&tp->t_rsel.si_note, &tp->t_mtx);
+ knlist_init(&tp->t_wsel.si_note, &tp->t_mtx);
return (tp);
}
diff --git a/sys/kern/tty_cons.c b/sys/kern/tty_cons.c
index 54a287c..dd8a115 100644
--- a/sys/kern/tty_cons.c
+++ b/sys/kern/tty_cons.c
@@ -528,11 +528,11 @@ cnkqfilter(struct cdev *dev, struct knote *kn)
cnd = STAILQ_FIRST(&cn_devlist);
if (cn_mute || CND_INVALID(cnd, curthread))
- return (1);
+ return (EINVAL);
dev = cnd->cnd_vp->v_rdev;
if (dev != NULL)
return ((*devsw(dev)->d_kqfilter)(dev, kn));
- return (1);
+ return (ENXIO);
}
/*
diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c
index efcea0c..02b68d8 100644
--- a/sys/kern/uipc_sockbuf.c
+++ b/sys/kern/uipc_sockbuf.c
@@ -242,6 +242,8 @@ sonewconn(head, connstatus)
mac_create_socket_from_socket(head, so);
SOCK_UNLOCK(head);
#endif
+ knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
+ knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
(*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
sodealloc(so);
@@ -403,7 +405,7 @@ sowakeup(so, sb)
sb->sb_flags &= ~SB_WAIT;
wakeup(&sb->sb_cc);
}
- KNOTE(&sb->sb_sel.si_note, 0);
+ KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
SOCKBUF_UNLOCK(sb);
if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
pgsigio(&so->so_sigio, SIGIO, 0);
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index 6fc4d61..9b6c423 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -221,6 +221,8 @@ socreate(dom, aso, type, proto, cred, td)
mac_create_socket(cred, so);
#endif
SOCK_LOCK(so);
+ knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
+ knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
soref(so);
SOCK_UNLOCK(so);
error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
@@ -378,6 +380,8 @@ sofree(so)
sbrelease_locked(&so->so_snd, so);
SOCKBUF_UNLOCK(&so->so_snd);
sorflush(so);
+ knlist_destroy(&so->so_rcv.sb_sel.si_note);
+ knlist_destroy(&so->so_snd.sb_sel.si_note);
sodealloc(so);
}
@@ -2141,11 +2145,11 @@ soo_kqfilter(struct file *fp, struct knote *kn)
sb = &so->so_snd;
break;
default:
- return (1);
+ return (EINVAL);
}
SOCKBUF_LOCK(sb);
- SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
+ knlist_add(&sb->sb_sel.si_note, kn, 1);
sb->sb_flags |= SB_KNOTE;
SOCKBUF_UNLOCK(sb);
return (0);
@@ -2157,8 +2161,8 @@ filt_sordetach(struct knote *kn)
struct socket *so = kn->kn_fp->f_data;
SOCKBUF_LOCK(&so->so_rcv);
- SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
- if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
+ knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
+ if (knlist_empty(&so->so_rcv.sb_sel.si_note))
so->so_rcv.sb_flags &= ~SB_KNOTE;
SOCKBUF_UNLOCK(&so->so_rcv);
}
@@ -2200,8 +2204,8 @@ filt_sowdetach(struct knote *kn)
struct socket *so = kn->kn_fp->f_data;
SOCKBUF_LOCK(&so->so_snd);
- SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
- if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
+ knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
+ if (knlist_empty(&so->so_snd.sb_sel.si_note))
so->so_snd.sb_flags &= ~SB_KNOTE;
SOCKBUF_UNLOCK(&so->so_snd);
}
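
The soo_kqfilter()/filt_sordetach() changes above are exercised by ordinary EVFILT_READ registrations on sockets. A small sketch using socketpair(2) and the documented NOTE_LOWAT low-water option; the names and the 4-byte threshold are arbitrary:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <sys/socket.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev;
	int sv[2], kq;

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1 || (kq = kqueue()) == -1)
		err(1, "setup");

	/* Don't report the socket readable until at least 4 bytes arrive. */
	EV_SET(&kev, sv[0], EVFILT_READ, EV_ADD, NOTE_LOWAT, 4, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");

	(void)write(sv[1], "ping", 4);

	if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1)
		printf("socket readable, %jd bytes buffered\n", (intmax_t)kev.data);
	return (0);
}
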
diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c
index efcea0c..02b68d8 100644
--- a/sys/kern/uipc_socket2.c
+++ b/sys/kern/uipc_socket2.c
@@ -242,6 +242,8 @@ sonewconn(head, connstatus)
mac_create_socket_from_socket(head, so);
SOCK_UNLOCK(head);
#endif
+ knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
+ knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
(*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
sodealloc(so);
@@ -403,7 +405,7 @@ sowakeup(so, sb)
sb->sb_flags &= ~SB_WAIT;
wakeup(&sb->sb_cc);
}
- KNOTE(&sb->sb_sel.si_note, 0);
+ KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
SOCKBUF_UNLOCK(sb);
if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
pgsigio(&so->so_sigio, SIGIO, 0);
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
index 238dee4..dc27e56 100644
--- a/sys/kern/uipc_syscalls.c
+++ b/sys/kern/uipc_syscalls.c
@@ -334,7 +334,7 @@ accept1(td, uap, compat)
td->td_retval[0] = fd;
/* connection has been removed from the listen queue */
- KNOTE(&head->so_rcv.sb_sel.si_note, 0);
+ KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
pgid = fgetown(&head->so_sigio);
if (pgid != 0)
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
index e33a5fd..3513653 100644
--- a/sys/kern/vfs_aio.c
+++ b/sys/kern/vfs_aio.c
@@ -182,7 +182,7 @@ struct aiocblist {
struct file *fd_file; /* Pointer to file structure */
struct aio_liojob *lio; /* Optional lio job */
struct aiocb *uuaiocb; /* Pointer in userspace of aiocb */
- struct klist klist; /* list of knotes */
+ struct knlist klist; /* list of knotes */
struct aiocb uaiocb; /* Kernel I/O control block */
};
@@ -368,6 +368,7 @@ aio_onceonly(void)
static int
aio_unload(void)
{
+ int error;
/*
* XXX: no unloads by default, it's too dangerous.
@@ -377,11 +378,14 @@ aio_unload(void)
if (!unloadable)
return (EOPNOTSUPP);
+ error = kqueue_del_filteropts(EVFILT_AIO);
+ if (error)
+ return error;
+
async_io_version = 0;
aio_swake = NULL;
EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
- kqueue_del_filteropts(EVFILT_AIO);
p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
@@ -482,7 +486,7 @@ aio_free_entry(struct aiocblist *aiocbe)
* OWNING thread? (or maybe the running thread?)
* There is a semantic problem here...
*/
- knote_remove(FIRST_THREAD_IN_PROC(p), &aiocbe->klist); /* XXXKSE */
+ knlist_clear(&aiocbe->klist, 0); /* XXXKSE */
if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
&& ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
@@ -933,7 +937,7 @@ aio_daemon(void *uproc)
TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist);
splx(s);
- KNOTE(&aiocbe->klist, 0);
+ KNOTE_UNLOCKED(&aiocbe->klist, 0);
if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
wakeup(aiocbe);
@@ -1171,7 +1175,7 @@ aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
}
splx(s);
if (notify)
- KNOTE(&aiocbe->klist, 0);
+ KNOTE_UNLOCKED(&aiocbe->klist, 0);
return (0);
doerror:
@@ -1296,7 +1300,8 @@ _aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int typ
aiocbe->inputcharge = 0;
aiocbe->outputcharge = 0;
callout_handle_init(&aiocbe->timeouthandle);
- SLIST_INIT(&aiocbe->klist);
+ /* XXX - need a lock */
+ knlist_init(&aiocbe->klist, NULL);
suword(&job->_aiocb_private.status, -1);
suword(&job->_aiocb_private.error, 0);
@@ -1415,7 +1420,7 @@ _aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int typ
kev.filter = EVFILT_AIO;
kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
kev.data = (intptr_t)aiocbe;
- error = kqueue_register(kq, &kev, td);
+ error = kqueue_register(kq, &kev, td, 1);
aqueue_fail:
if (error) {
fdrop(fp, td);
@@ -2187,7 +2192,7 @@ aio_physwakeup(struct buf *bp)
TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
- KNOTE(&aiocbe->klist, 0);
+ KNOTE_UNLOCKED(&aiocbe->klist, 0);
/* Do the wakeup. */
if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
ki->kaio_flags &= ~KAIO_WAKEUP;
@@ -2289,7 +2294,7 @@ filt_aioattach(struct knote *kn)
return (EPERM);
kn->kn_flags &= ~EV_FLAG1;
- SLIST_INSERT_HEAD(&aiocbe->klist, kn, kn_selnext);
+ knlist_add(&aiocbe->klist, kn, 0);
return (0);
}
@@ -2300,7 +2305,7 @@ filt_aiodetach(struct knote *kn)
{
struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
- SLIST_REMOVE(&aiocbe->klist, kn, knote, kn_selnext);
+ knlist_remove(&aiocbe->klist, kn, 0);
}
/* kqueue filter function */
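
The EVFILT_AIO hunks above are reached from userland by queueing an asynchronous request whose sigevent asks for kqueue notification. A sketch assuming the documented aio(4) SIGEV_KEVENT path; the file path is arbitrary and the kernel's aio support must be present:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <aio.h>
#include <err.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct aiocb acb;
	struct kevent kev;
	char buf[512];
	int fd, kq;

	if ((fd = open("/etc/passwd", O_RDONLY)) == -1 || (kq = kqueue()) == -1)
		err(1, "setup");

	memset(&acb, 0, sizeof(acb));
	acb.aio_fildes = fd;
	acb.aio_buf = buf;
	acb.aio_nbytes = sizeof(buf);
	acb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
	acb.aio_sigevent.sigev_notify_kqueue = kq;
	acb.aio_sigevent.sigev_value.sival_ptr = &acb;

	if (aio_read(&acb) == -1)
		err(1, "aio_read");

	/* The completed request is delivered as an EVFILT_AIO event. */
	if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1)
		printf("aio done: %zd bytes\n",
		    aio_return((struct aiocb *)kev.ident));
	return (0);
}
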
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index aa234be..2cd24ec 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -849,6 +849,7 @@ getnewvnode(tag, mp, vops, vpp)
vp->v_cachedid = -1;
VI_UNLOCK(vp);
if (pollinfo != NULL) {
+ knlist_destroy(&pollinfo->vpi_selinfo.si_note);
mtx_destroy(&pollinfo->vpi_lock);
uma_zfree(vnodepoll_zone, pollinfo);
}
@@ -3256,6 +3257,8 @@ v_addpollinfo(struct vnode *vp)
}
vp->v_pollinfo = vi;
mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
+ knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note,
+ &vp->v_pollinfo->vpi_lock);
}
/*
@@ -3341,7 +3344,7 @@ vn_pollgone(vp)
{
mtx_lock(&vp->v_pollinfo->vpi_lock);
- VN_KNOTE(vp, NOTE_REVOKE);
+ VN_KNOTE_LOCKED(vp, NOTE_REVOKE);
if (vp->v_pollinfo->vpi_events) {
vp->v_pollinfo->vpi_events = 0;
selwakeuppri(&vp->v_pollinfo->vpi_selinfo, PRIBIO);
@@ -3981,13 +3984,21 @@ vop_unlock_post(void *ap, int rc)
}
#endif /* DEBUG_VFS_LOCKS */
-static struct klist fs_klist = SLIST_HEAD_INITIALIZER(&fs_klist);
+static struct knlist fs_knlist;
+
+static void
+vfs_event_init(void *arg)
+{
+ knlist_init(&fs_knlist, NULL);
+}
+/* XXX - correct order? */
+SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
void
vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused)
{
- KNOTE(&fs_klist, event);
+ KNOTE_UNLOCKED(&fs_knlist, event);
}
static int filt_fsattach(struct knote *kn);
@@ -4002,7 +4013,7 @@ filt_fsattach(struct knote *kn)
{
kn->kn_flags |= EV_CLEAR;
- SLIST_INSERT_HEAD(&fs_klist, kn, kn_selnext);
+ knlist_add(&fs_knlist, kn, 0);
return (0);
}
@@ -4010,7 +4021,7 @@ static void
filt_fsdetach(struct knote *kn)
{
- SLIST_REMOVE(&fs_klist, kn, knote, kn_selnext);
+ knlist_remove(&fs_knlist, kn, 0);
}
static int
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index c61c413..d8204b6 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -1108,10 +1108,13 @@ vfs_write_resume(mp)
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
+ int error;
- GIANT_REQUIRED;
+ mtx_lock(&Giant);
+ error = VOP_KQFILTER(fp->f_vnode, kn);
+ mtx_unlock(&Giant);
- return (VOP_KQFILTER(fp->f_vnode, kn));
+ return error;
}
/*
diff --git a/sys/net/bpf.c b/sys/net/bpf.c
index 6f6c9fe..f48fcda 100644
--- a/sys/net/bpf.c
+++ b/sys/net/bpf.c
@@ -353,6 +353,7 @@ bpfopen(dev, flags, fmt, td)
#endif
mtx_init(&d->bd_mtx, devtoname(dev), "bpf cdev lock", MTX_DEF);
callout_init(&d->bd_callout, debug_mpsafenet ? CALLOUT_MPSAFE : 0);
+ knlist_init(&d->bd_sel.si_note, &d->bd_mtx);
return (0);
}
@@ -384,6 +385,7 @@ bpfclose(dev, flags, fmt, td)
#ifdef MAC
mac_destroy_bpfdesc(d);
#endif /* MAC */
+ knlist_destroy(&d->bd_sel.si_note);
bpf_freed(d);
dev->si_drv1 = 0;
free(d, M_BPF);
@@ -525,7 +527,7 @@ bpf_wakeup(d)
pgsigio(&d->bd_sigio, d->bd_sig, 0);
selwakeuppri(&d->bd_sel, PRINET);
- KNOTE(&d->bd_sel.si_note, 0);
+ KNOTE_LOCKED(&d->bd_sel.si_note, 0);
}
static void
@@ -1089,9 +1091,7 @@ bpfkqfilter(dev, kn)
kn->kn_fop = &bpfread_filtops;
kn->kn_hook = d;
- BPFD_LOCK(d);
- SLIST_INSERT_HEAD(&d->bd_sel.si_note, kn, kn_selnext);
- BPFD_UNLOCK(d);
+ knlist_add(&d->bd_sel.si_note, kn, 0);
return (0);
}
@@ -1102,9 +1102,7 @@ filt_bpfdetach(kn)
{
struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
- BPFD_LOCK(d);
- SLIST_REMOVE(&d->bd_sel.si_note, kn, knote, kn_selnext);
- BPFD_UNLOCK(d);
+ knlist_remove(&d->bd_sel.si_note, kn, 0);
}
static int
diff --git a/sys/net/if.c b/sys/net/if.c
index 9f560d3..b2f365c 100644
--- a/sys/net/if.c
+++ b/sys/net/if.c
@@ -109,7 +109,7 @@ struct ifnethead ifnet; /* depend on static init XXX */
struct mtx ifnet_lock;
static int if_indexlim = 8;
-static struct klist ifklist;
+static struct knlist ifklist;
static void filt_netdetach(struct knote *kn);
static int filt_netdev(struct knote *kn, long hint);
@@ -185,10 +185,18 @@ netioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td
static int
netkqfilter(struct cdev *dev, struct knote *kn)
{
- struct klist *klist;
+ struct knlist *klist;
struct ifnet *ifp;
int idx;
+ switch (kn->kn_filter) {
+ case EVFILT_NETDEV:
+ kn->kn_fop = &netdev_filtops;
+ break;
+ default:
+ return (EINVAL);
+ }
+
idx = minor(dev);
if (idx == 0) {
klist = &ifklist;
@@ -199,18 +207,9 @@ netkqfilter(struct cdev *dev, struct knote *kn)
klist = &ifp->if_klist;
}
- switch (kn->kn_filter) {
- case EVFILT_NETDEV:
- kn->kn_fop = &netdev_filtops;
- break;
- default:
- return (1);
- }
-
kn->kn_hook = (caddr_t)klist;
- /* XXX locking? */
- SLIST_INSERT_HEAD(klist, kn, kn_selnext);
+ knlist_add(klist, kn, 0);
return (0);
}
@@ -218,27 +217,30 @@ netkqfilter(struct cdev *dev, struct knote *kn)
static void
filt_netdetach(struct knote *kn)
{
- struct klist *klist = (struct klist *)kn->kn_hook;
+ struct knlist *klist = (struct knlist *)kn->kn_hook;
if (kn->kn_status & KN_DETACHED)
return;
- SLIST_REMOVE(klist, kn, knote, kn_selnext);
+
+ knlist_remove(klist, kn, 0);
}
static int
filt_netdev(struct knote *kn, long hint)
{
+ struct knlist *klist = (struct knlist *)kn->kn_hook;
/*
* Currently NOTE_EXIT is abused to indicate device detach.
*/
if (hint == NOTE_EXIT) {
kn->kn_data = NOTE_LINKINV;
- kn->kn_status |= KN_DETACHED;
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+ knlist_remove_inevent(klist, kn);
return (1);
}
- kn->kn_data = hint; /* current status */
+ if (hint != 0)
+ kn->kn_data = hint; /* current status */
if (kn->kn_sfflags & hint)
kn->kn_fflags |= hint;
return (kn->kn_fflags != 0);
@@ -257,7 +259,7 @@ if_init(void *dummy __unused)
IFNET_LOCK_INIT();
TAILQ_INIT(&ifnet);
- SLIST_INIT(&ifklist);
+ knlist_init(&ifklist, NULL);
if_grow(); /* create initial table */
ifdev_byindex(0) = make_dev(&net_cdevsw, 0,
UID_ROOT, GID_WHEEL, 0600, "network");
@@ -383,7 +385,7 @@ if_attach(struct ifnet *ifp)
TAILQ_INIT(&ifp->if_addrhead);
TAILQ_INIT(&ifp->if_prefixhead);
TAILQ_INIT(&ifp->if_multiaddrs);
- SLIST_INIT(&ifp->if_klist);
+ knlist_init(&ifp->if_klist, NULL);
getmicrotime(&ifp->if_lastchange);
#ifdef MAC
@@ -620,7 +622,9 @@ if_detach(struct ifnet *ifp)
#ifdef MAC
mac_destroy_ifnet(ifp);
#endif /* MAC */
- KNOTE(&ifp->if_klist, NOTE_EXIT);
+ KNOTE_UNLOCKED(&ifp->if_klist, NOTE_EXIT);
+ knlist_clear(&ifp->if_klist, 0);
+ knlist_destroy(&ifp->if_klist);
IFNET_WLOCK();
found = 0;
TAILQ_FOREACH(iter, &ifnet, if_link)
diff --git a/sys/net/if_var.h b/sys/net/if_var.h
index c5164b1..29837fb 100644
--- a/sys/net/if_var.h
+++ b/sys/net/if_var.h
@@ -146,7 +146,7 @@ struct ifnet {
* However, access to the AF_LINK address through this
* field is deprecated. Use ifaddr_byindex() instead.
*/
- struct klist if_klist; /* events attached to this if */
+ struct knlist if_klist; /* events attached to this if */
int if_pcount; /* number of promiscuous listeners */
void *if_carp; /* carp (tbd) interface pointer */
struct bpf_if *if_bpf; /* packet filter structure */
diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c
index d18f056..7a8429d 100644
--- a/sys/net/if_vlan.c
+++ b/sys/net/if_vlan.c
@@ -821,7 +821,7 @@ vlan_link_state(struct ifnet *ifp, int link)
if (ifv->ifv_p == ifp) {
ifv->ifv_if.if_link_state = ifv->ifv_p->if_link_state;
rt_ifmsg(&(ifv->ifv_if));
- KNOTE(&ifp->if_klist, link);
+ KNOTE_UNLOCKED(&ifp->if_klist, link);
}
}
VLAN_UNLOCK();
diff --git a/sys/sys/event.h b/sys/sys/event.h
index 4c7c0fe..1a7956c 100644
--- a/sys/sys/event.h
+++ b/sys/sys/event.h
@@ -118,9 +118,16 @@ struct kevent {
* This is currently visible to userland to work around broken
* programs which pull in <sys/proc.h>.
*/
-#include <sys/queue.h>
+#include <sys/queue.h>
+#include <sys/_lock.h>
+#include <sys/_mutex.h>
struct knote;
SLIST_HEAD(klist, knote);
+struct knlist {
+ struct mtx *kl_lock; /* lock to protect kl_list */
+ struct klist kl_list;
+};
+
#ifdef _KERNEL
@@ -128,8 +135,14 @@ SLIST_HEAD(klist, knote);
MALLOC_DECLARE(M_KQUEUE);
#endif
-#define KNOTE(list, hint) \
- do { if ((list) != NULL) knote(list, hint); } while (0)
+struct kqueue;
+SLIST_HEAD(kqlist, kqueue);
+
+#define KNOTE(list, hint, lock) knote(list, hint, lock)
+#define KNOTE_LOCKED(list, hint) knote(list, hint, 1)
+#define KNOTE_UNLOCKED(list, hint) knote(list, hint, 0)
+#define KNOTE_STATUS_BEGIN(kn) knote_status(kn, 1)
+#define KNOTE_STATUS_END(kn) knote_status(kn, 0)
/*
* Flag indicating hint is a signal. Used by EVFILT_SIGNAL, and also
@@ -144,13 +157,28 @@ struct filterops {
int (*f_event)(struct knote *kn, long hint);
};
+/*
+ * Setting the KN_INFLUX flag enables you to unlock the kq that this knote
+ * is on, and modify kn_status as if you had the KQ lock.
+ *
+ * kn_sfflags, kn_sdata, and kn_kevent are protected by the knlist lock.
+ */
struct knote {
- SLIST_ENTRY(knote) kn_link; /* for fd */
+ SLIST_ENTRY(knote) kn_link; /* for kq */
SLIST_ENTRY(knote) kn_selnext; /* for struct selinfo */
+ struct knlist *kn_knlist; /* populated by f_attach */
TAILQ_ENTRY(knote) kn_tqe;
struct kqueue *kn_kq; /* which queue we are on */
struct kevent kn_kevent;
- int kn_status;
+ int kn_status; /* protected by kq lock */
+#define KN_ACTIVE 0x01 /* event has been triggered */
+#define KN_QUEUED 0x02 /* event is on queue */
+#define KN_DISABLED 0x04 /* event is disabled */
+#define KN_DETACHED 0x08 /* knote is detached */
+#define KN_INFLUX 0x10 /* knote is in flux */
+#define KN_MARKER 0x20 /* ignore this knote */
+#define KN_KQUEUE 0x40 /* this knote belongs to a kq */
+#define KN_HASKQLOCK 0x80 /* for _inevent */
int kn_sfflags; /* saved filter flags */
intptr_t kn_sdata; /* saved data field */
union {
@@ -159,10 +187,6 @@ struct knote {
} kn_ptr;
struct filterops *kn_fop;
void *kn_hook;
-#define KN_ACTIVE 0x01 /* event has been triggered */
-#define KN_QUEUED 0x02 /* event is on queue */
-#define KN_DISABLED 0x04 /* event is disabled */
-#define KN_DETACHED 0x08 /* knote is detached */
#define kn_id kn_kevent.ident
#define kn_filter kn_kevent.filter
@@ -174,12 +198,20 @@ struct knote {
struct thread;
struct proc;
-
-extern void knote(struct klist *list, long hint);
-extern void knote_remove(struct thread *p, struct klist *list);
+struct knlist;
+
+extern void knote(struct knlist *list, long hint, int islocked);
+extern void knote_status(struct knote *kn, int begin);
+extern void knlist_add(struct knlist *knl, struct knote *kn, int islocked);
+extern void knlist_remove(struct knlist *knl, struct knote *kn, int islocked);
+extern void knlist_remove_inevent(struct knlist *knl, struct knote *kn);
+extern int knlist_empty(struct knlist *knl);
+extern void knlist_init(struct knlist *knl, struct mtx *mtx);
+extern void knlist_destroy(struct knlist *knl);
+extern void knlist_clear(struct knlist *knl, int islocked);
extern void knote_fdclose(struct thread *p, int fd);
extern int kqueue_register(struct kqueue *kq,
- struct kevent *kev, struct thread *p);
+ struct kevent *kev, struct thread *p, int waitok);
extern int kqueue_add_filteropts(int filt, struct filterops *filtops);
extern int kqueue_del_filteropts(int filt);
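
Taken together, the knlist declarations above define the pattern the drivers are converted to elsewhere in this patch (compare the bpf(4) and targ(4) hunks). The following is an illustrative sketch, not code from the commit; the softc, its lock, names and filterops are hypothetical stand-ins:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/event.h>
#include <sys/selinfo.h>

struct example_softc {
	struct mtx	sc_mtx;
	struct selinfo	sc_rsel;	/* si_note is now a struct knlist */
	int		sc_ready;
};

static void	example_rdetach(struct knote *kn);
static int	example_revent(struct knote *kn, long hint);

/* f_isfd, f_attach, f_detach, f_event -- matches struct filterops above */
static struct filterops example_read_filtops =
	{ 1, NULL, example_rdetach, example_revent };

static void
example_init(struct example_softc *sc)
{
	mtx_init(&sc->sc_mtx, "example", NULL, MTX_DEF);
	/* The knote list shares the driver's own lock. */
	knlist_init(&sc->sc_rsel.si_note, &sc->sc_mtx);
}

static int
example_kqfilter(struct example_softc *sc, struct knote *kn)
{
	if (kn->kn_filter != EVFILT_READ)
		return (EINVAL);
	kn->kn_fop = &example_read_filtops;
	kn->kn_hook = sc;
	knlist_add(&sc->sc_rsel.si_note, kn, 0);	/* 0: list not yet locked */
	return (0);
}

static void
example_rdetach(struct knote *kn)
{
	struct example_softc *sc = kn->kn_hook;

	knlist_remove(&sc->sc_rsel.si_note, kn, 0);
}

static int
example_revent(struct knote *kn, long hint)
{
	struct example_softc *sc = kn->kn_hook;

	return (sc->sc_ready);		/* non-zero activates the knote */
}

static void
example_notify(struct example_softc *sc)
{
	mtx_lock(&sc->sc_mtx);
	sc->sc_ready = 1;
	KNOTE_LOCKED(&sc->sc_rsel.si_note, 0);	/* sc_mtx is already held */
	mtx_unlock(&sc->sc_mtx);
}

static void
example_fini(struct example_softc *sc)
{
	knlist_clear(&sc->sc_rsel.si_note, 0);
	knlist_destroy(&sc->sc_rsel.si_note);
	mtx_destroy(&sc->sc_mtx);
}
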
diff --git a/sys/sys/eventvar.h b/sys/sys/eventvar.h
index ef9087b..bdb3adf 100644
--- a/sys/sys/eventvar.h
+++ b/sys/sys/eventvar.h
@@ -29,11 +29,20 @@
#ifndef _SYS_EVENTVAR_H_
#define _SYS_EVENTVAR_H_
+#ifndef _KERNEL
+#error "no user-serviceable parts inside"
+#endif
+
+#include <sys/_task.h>
+
#define KQ_NEVENTS 8 /* minimize copy{in,out} calls */
#define KQEXTENT 256 /* linear growth by this amount */
struct kqueue {
- TAILQ_HEAD(kqlist, knote) kq_head; /* list of pending event */
+ struct mtx kq_lock;
+ int kq_refcnt;
+ SLIST_ENTRY(kqueue) kq_list;
+ TAILQ_HEAD(, knote) kq_head; /* list of pending event */
int kq_count; /* number of pending events */
struct selinfo kq_sel;
struct sigio *kq_sigio;
@@ -41,8 +50,16 @@ struct kqueue {
int kq_state;
#define KQ_SEL 0x01
#define KQ_SLEEP 0x02
-#define KQ_ASYNC 0x04
- struct kevent kq_kev[KQ_NEVENTS];
+#define KQ_FLUXWAIT 0x04 /* waiting for an in-flux kn */
+#define KQ_ASYNC 0x08
+#define KQ_CLOSING 0x10
+#define KQ_TASKSCHED 0x20 /* task scheduled */
+#define KQ_TASKDRAIN 0x40 /* waiting for task to drain */
+ int kq_knlistsize; /* size of knlist */
+ struct klist *kq_knlist; /* list of knotes */
+ u_long kq_knhashmask; /* size of knhash */
+ struct klist *kq_knhash; /* hash table for knotes */
+ struct task kq_task;
};
#endif /* !_SYS_EVENTVAR_H_ */
diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h
index b813c32..7b7c6b9 100644
--- a/sys/sys/filedesc.h
+++ b/sys/sys/filedesc.h
@@ -34,6 +34,7 @@
#define _SYS_FILEDESC_H_
#include <sys/queue.h>
+#include <sys/event.h>
#include <sys/_lock.h>
#include <sys/_mutex.h>
@@ -71,11 +72,8 @@ struct filedesc {
u_short fd_cmask; /* mask for file creation */
u_short fd_refcnt; /* reference count */
- int fd_knlistsize; /* size of knlist */
- struct klist *fd_knlist; /* list of attached knotes */
- u_long fd_knhashmask; /* size of knhash */
- struct klist *fd_knhash; /* hash table for attached knotes */
struct mtx fd_mtx; /* protects members of this struct */
+ struct kqlist fd_kqlist; /* list of kqueues on this filedesc */
int fd_holdleaderscount; /* block fdfree() for shared close() */
int fd_holdleaderswakeup; /* fdfree() needs wakeup */
};
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 1b1f1f4..f5bd3c6 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -608,7 +608,6 @@ struct proc {
struct vnode *p_textvp; /* (b) Vnode of executable. */
sigset_t p_siglist; /* (c) Sigs not delivered to a td. */
char p_lock; /* (c) Proclock (prevent swap) count. */
- struct klist p_klist; /* (c) Knotes attached to this proc. */
struct sigiolst p_sigiolst; /* (c) List of sigio sources. */
int p_sigparent; /* (c) Signal to parent on exit. */
int p_sig; /* (n) For core dump/debugger XXX. */
@@ -638,6 +637,7 @@ struct proc {
#define p_endcopy p_xstat
u_short p_xstat; /* (c) Exit status; also stop sig. */
+ struct knlist p_klist; /* (c) Knotes attached to this proc. */
int p_numthreads; /* (j) Number of threads. */
int p_numksegrps; /* (c) number of ksegrps */
struct mdproc p_md; /* Any machine-dependent fields. */
diff --git a/sys/sys/selinfo.h b/sys/sys/selinfo.h
index eb9d1ef..946da8c 100644
--- a/sys/sys/selinfo.h
+++ b/sys/sys/selinfo.h
@@ -42,7 +42,7 @@
struct selinfo {
TAILQ_ENTRY(selinfo) si_thrlist; /* list hung off of thread */
struct thread *si_thread; /* thread waiting */
- struct klist si_note; /* kernel note list */
+ struct knlist si_note; /* kernel note list */
short si_flags; /* see below */
};
#define SI_COLL 0x0001 /* collision occurred */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index df5ae00..166cd53 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -201,11 +201,13 @@ struct xvnode {
vn_pollevent((vp), (events)); \
} while (0)
-#define VN_KNOTE(vp, b) \
+#define VN_KNOTE(vp, b, a) \
do { \
if ((vp)->v_pollinfo != NULL) \
- KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b)); \
+ KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), (a)); \
} while (0)
+#define VN_KNOTE_LOCKED(vp, b) VN_KNOTE(vp, b, 1)
+#define VN_KNOTE_UNLOCKED(vp, b) VN_KNOTE(vp, b, 0)
/*
* Vnode flags.
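
The VN_KNOTE_LOCKED/VN_KNOTE_UNLOCKED macros above ultimately post NOTE_* hints to EVFILT_VNODE watchers, as the ufs hunks below show. A minimal user-level watcher using only documented kevent(2) flags; the path is arbitrary:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	struct kevent kev;
	int fd, kq;

	if ((fd = open("/tmp/watched", O_RDONLY)) == -1 || (kq = kqueue()) == -1)
		err(1, "setup");

	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
	    NOTE_WRITE | NOTE_EXTEND | NOTE_ATTRIB | NOTE_DELETE | NOTE_RENAME,
	    0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");

	for (;;) {
		if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
			err(1, "kevent wait");
		printf("vnode event, fflags 0x%x\n", (unsigned)kev.fflags);
		if (kev.fflags & NOTE_DELETE)
			break;
	}
	return (0);
}
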
diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
index fa0b9d4..bddd2e4 100644
--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@@ -725,7 +725,7 @@ ffs_write(ap)
DIP_SET(ip, i_mode, ip->i_mode);
}
if (resid > uio->uio_resid)
- VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
+ VN_KNOTE_UNLOCKED(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
if (error) {
if (ioflag & IO_UNIT) {
(void)UFS_TRUNCATE(vp, osize,
diff --git a/sys/ufs/ufs/ufs_acl.c b/sys/ufs/ufs/ufs_acl.c
index ece8976..e3005f4 100644
--- a/sys/ufs/ufs/ufs_acl.c
+++ b/sys/ufs/ufs/ufs_acl.c
@@ -397,7 +397,7 @@ ufs_setacl(ap)
ip->i_flag |= IN_CHANGE;
}
- VN_KNOTE(ap->a_vp, NOTE_ATTRIB);
+ VN_KNOTE_UNLOCKED(ap->a_vp, NOTE_ATTRIB);
return (0);
}
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index c76cc88..84e3d3c 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -198,7 +198,7 @@ ufs_create(ap)
ap->a_dvp, ap->a_vpp, ap->a_cnp);
if (error)
return (error);
- VN_KNOTE(ap->a_dvp, NOTE_WRITE);
+ VN_KNOTE_UNLOCKED(ap->a_dvp, NOTE_WRITE);
return (0);
}
@@ -225,7 +225,7 @@ ufs_mknod(ap)
ap->a_dvp, vpp, ap->a_cnp);
if (error)
return (error);
- VN_KNOTE(ap->a_dvp, NOTE_WRITE);
+ VN_KNOTE_UNLOCKED(ap->a_dvp, NOTE_WRITE);
ip = VTOI(*vpp);
ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
if (vap->va_rdev != VNOVAL) {
@@ -615,7 +615,7 @@ ufs_setattr(ap)
return (EPERM);
error = ufs_chmod(vp, (int)vap->va_mode, cred, td);
}
- VN_KNOTE(vp, NOTE_ATTRIB);
+ VN_KNOTE_UNLOCKED(vp, NOTE_ATTRIB);
return (error);
}
@@ -799,8 +799,8 @@ ufs_remove(ap)
error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0);
if (ip->i_nlink <= 0)
vp->v_vflag |= VV_NOSYNC;
- VN_KNOTE(vp, NOTE_DELETE);
- VN_KNOTE(dvp, NOTE_WRITE);
+ VN_KNOTE_UNLOCKED(vp, NOTE_DELETE);
+ VN_KNOTE_UNLOCKED(dvp, NOTE_WRITE);
out:
return (error);
}
@@ -861,8 +861,8 @@ ufs_link(ap)
softdep_change_linkcnt(ip);
}
out:
- VN_KNOTE(vp, NOTE_LINK);
- VN_KNOTE(tdvp, NOTE_WRITE);
+ VN_KNOTE_UNLOCKED(vp, NOTE_LINK);
+ VN_KNOTE_UNLOCKED(tdvp, NOTE_WRITE);
return (error);
}
@@ -1037,7 +1037,7 @@ abortit:
oldparent = dp->i_number;
doingdirectory = 1;
}
- VN_KNOTE(fdvp, NOTE_WRITE); /* XXX right place? */
+ VN_KNOTE_UNLOCKED(fdvp, NOTE_WRITE); /* XXX right place? */
vrele(fdvp);
/*
@@ -1146,7 +1146,7 @@ abortit:
}
goto bad;
}
- VN_KNOTE(tdvp, NOTE_WRITE);
+ VN_KNOTE_UNLOCKED(tdvp, NOTE_WRITE);
vput(tdvp);
} else {
if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev)
@@ -1230,9 +1230,9 @@ abortit:
tcnp->cn_cred, tcnp->cn_thread)) != 0)
goto bad;
}
- VN_KNOTE(tdvp, NOTE_WRITE);
+ VN_KNOTE_UNLOCKED(tdvp, NOTE_WRITE);
vput(tdvp);
- VN_KNOTE(tvp, NOTE_DELETE);
+ VN_KNOTE_UNLOCKED(tvp, NOTE_DELETE);
vput(tvp);
xp = NULL;
}
@@ -1302,7 +1302,7 @@ abortit:
error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0);
xp->i_flag &= ~IN_RENAME;
}
- VN_KNOTE(fvp, NOTE_RENAME);
+ VN_KNOTE_UNLOCKED(fvp, NOTE_RENAME);
if (dp)
vput(fdvp);
if (xp)
@@ -1620,7 +1620,7 @@ ufs_mkdir(ap)
bad:
if (error == 0) {
- VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK);
+ VN_KNOTE_UNLOCKED(dvp, NOTE_WRITE | NOTE_LINK);
*ap->a_vpp = tvp;
} else {
dp->i_effnlink--;
@@ -1713,7 +1713,7 @@ ufs_rmdir(ap)
}
goto out;
}
- VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK);
+ VN_KNOTE_UNLOCKED(dvp, NOTE_WRITE | NOTE_LINK);
cache_purge(dvp);
/*
* Truncate inode. The only stuff left in the directory is "." and
@@ -1742,7 +1742,7 @@ ufs_rmdir(ap)
ufsdirhash_free(ip);
#endif
out:
- VN_KNOTE(vp, NOTE_DELETE);
+ VN_KNOTE_UNLOCKED(vp, NOTE_DELETE);
return (error);
}
@@ -1767,7 +1767,7 @@ ufs_symlink(ap)
vpp, ap->a_cnp);
if (error)
return (error);
- VN_KNOTE(ap->a_dvp, NOTE_WRITE);
+ VN_KNOTE_UNLOCKED(ap->a_dvp, NOTE_WRITE);
vp = *vpp;
len = strlen(ap->a_target);
if (len < vp->v_mount->mnt_maxsymlinklen) {
@@ -2620,9 +2620,9 @@ ufs_kqfilter(ap)
if (vp->v_pollinfo == NULL)
v_addpollinfo(vp);
- mtx_lock(&vp->v_pollinfo->vpi_lock);
- SLIST_INSERT_HEAD(&vp->v_pollinfo->vpi_selinfo.si_note, kn, kn_selnext);
- mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ if (vp->v_pollinfo == NULL)
+ return ENOMEM;
+ knlist_add(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
return (0);
}
@@ -2633,10 +2633,7 @@ filt_ufsdetach(struct knote *kn)
struct vnode *vp = (struct vnode *)kn->kn_hook;
KASSERT(vp->v_pollinfo != NULL, ("Mising v_pollinfo"));
- mtx_lock(&vp->v_pollinfo->vpi_lock);
- SLIST_REMOVE(&vp->v_pollinfo->vpi_selinfo.si_note,
- kn, knote, kn_selnext);
- mtx_unlock(&vp->v_pollinfo->vpi_lock);
+ knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
}
/*ARGSUSED*/