summaryrefslogtreecommitdiffstats
path: root/fs/eventpoll.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r--fs/eventpoll.c144
1 files changed, 130 insertions, 14 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4d9d3a4..74598f6 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -33,8 +33,8 @@
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
+#include <linux/device.h>
#include <asm/uaccess.h>
-#include <asm/system.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
@@ -88,7 +88,7 @@
*/
/* Epoll private bits inside the event mask */
-#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
+#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET)
/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4
@@ -155,6 +155,9 @@ struct epitem {
/* List header used to link this item to the "struct file" items list */
struct list_head fllink;
+ /* wakeup_source used when EPOLLWAKEUP is set */
+ struct wakeup_source *ws;
+
/* The structure that describe the interested events and the source fd */
struct epoll_event event;
};
@@ -195,6 +198,9 @@ struct eventpoll {
*/
struct epitem *ovflist;
+ /* wakeup_source used when ep_scan_ready_list is running */
+ struct wakeup_source *ws;
+
/* The user that created the eventpoll descriptor */
struct user_struct *user;
@@ -427,6 +433,31 @@ out_unlock:
return error;
}
+/*
+ * As described in commit 0ccf831cb lockdep: annotate epoll
+ * the use of wait queues used by epoll is done in a very controlled
+ * manner. Wake ups can nest inside each other, but are never done
+ * with the same locking. For example:
+ *
+ * dfd = socket(...);
+ * efd1 = epoll_create();
+ * efd2 = epoll_create();
+ * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
+ * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
+ *
+ * When a packet arrives to the device underneath "dfd", the net code will
+ * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
+ * callback wakeup entry on that queue, and the wake_up() performed by the
+ * "dfd" net code will end up in ep_poll_callback(). At this point epoll
+ * (efd1) notices that it may have some event ready, so it needs to wake up
+ * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
+ * that ends up in another wake_up(), after having checked about the
+ * recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to
+ * avoid stack blasting.
+ *
+ * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
+ * this special case of epoll.
+ */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
unsigned long events, int subclass)
@@ -564,8 +595,10 @@ static int ep_scan_ready_list(struct eventpoll *ep,
* queued into ->ovflist but the "txlist" might already
* contain them, and the list_splice() below takes care of them.
*/
- if (!ep_is_linked(&epi->rdllink))
+ if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
+ __pm_stay_awake(epi->ws);
+ }
}
/*
* We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
@@ -578,6 +611,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
* Quickly re-inject items left on "txlist".
*/
list_splice(&txlist, &ep->rdllist);
+ __pm_relax(ep->ws);
if (!list_empty(&ep->rdllist)) {
/*
@@ -632,6 +666,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
list_del_init(&epi->rdllink);
spin_unlock_irqrestore(&ep->lock, flags);
+ wakeup_source_unregister(epi->ws);
+
/* At this point it is safe to free the eventpoll item */
kmem_cache_free(epi_cache, epi);
@@ -682,6 +718,7 @@ static void ep_free(struct eventpoll *ep)
mutex_unlock(&epmutex);
mutex_destroy(&ep->mtx);
free_uid(ep->user);
+ wakeup_source_unregister(ep->ws);
kfree(ep);
}
@@ -699,9 +736,12 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
void *priv)
{
struct epitem *epi, *tmp;
+ poll_table pt;
+ init_poll_funcptr(&pt, NULL);
list_for_each_entry_safe(epi, tmp, head, rdllink) {
- if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+ pt._key = epi->event.events;
+ if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
epi->event.events)
return POLLIN | POLLRDNORM;
else {
@@ -710,6 +750,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
* callback, but it's not actually ready, as far as
* caller requested events goes. We can remove it here.
*/
+ __pm_relax(epi->ws);
list_del_init(&epi->rdllink);
}
}
@@ -900,13 +941,23 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
if (epi->next == EP_UNACTIVE_PTR) {
epi->next = ep->ovflist;
ep->ovflist = epi;
+ if (epi->ws) {
+ /*
+ * Activate ep->ws since epi->ws may get
+ * deactivated at any time.
+ */
+ __pm_stay_awake(ep->ws);
+ }
+
}
goto out_unlock;
}
/* If this file is already in the ready list we exit soon */
- if (!ep_is_linked(&epi->rdllink))
+ if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
+ __pm_stay_awake(epi->ws);
+ }
/*
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
@@ -1049,13 +1100,11 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
*/
static int reverse_path_check(void)
{
- int length = 0;
int error = 0;
struct file *current_file;
/* let's call this for all tfiles */
list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
- length++;
path_count_init();
error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
reverse_path_check_proc, current_file,
@@ -1066,6 +1115,30 @@ static int reverse_path_check(void)
return error;
}
+static int ep_create_wakeup_source(struct epitem *epi)
+{
+ const char *name;
+
+ if (!epi->ep->ws) {
+ epi->ep->ws = wakeup_source_register("eventpoll");
+ if (!epi->ep->ws)
+ return -ENOMEM;
+ }
+
+ name = epi->ffd.file->f_path.dentry->d_name.name;
+ epi->ws = wakeup_source_register(name);
+ if (!epi->ws)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void ep_destroy_wakeup_source(struct epitem *epi)
+{
+ wakeup_source_unregister(epi->ws);
+ epi->ws = NULL;
+}
+
/*
* Must be called with "mtx" held.
*/
@@ -1093,10 +1166,18 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
epi->event = *event;
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
+ if (epi->event.events & EPOLLWAKEUP) {
+ error = ep_create_wakeup_source(epi);
+ if (error)
+ goto error_create_wakeup_source;
+ } else {
+ epi->ws = NULL;
+ }
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
+ epq.pt._key = event->events;
/*
* Attach the item to the poll hooks and get current event bits.
@@ -1138,6 +1219,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
/* If the file is already "ready" we drop it inside the ready list */
if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
+ __pm_stay_awake(epi->ws);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
@@ -1178,6 +1260,9 @@ error_unregister:
list_del_init(&epi->rdllink);
spin_unlock_irqrestore(&ep->lock, flags);
+ wakeup_source_unregister(epi->ws);
+
+error_create_wakeup_source:
kmem_cache_free(epi_cache, epi);
return error;
@@ -1191,6 +1276,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
{
int pwake = 0;
unsigned int revents;
+ poll_table pt;
+
+ init_poll_funcptr(&pt, NULL);
/*
* Set the new event interest mask before calling f_op->poll();
@@ -1198,13 +1286,20 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
* f_op->poll() call and the new event set registering.
*/
epi->event.events = event->events;
+ pt._key = event->events;
epi->event.data = event->data; /* protected by mtx */
+ if (epi->event.events & EPOLLWAKEUP) {
+ if (!epi->ws)
+ ep_create_wakeup_source(epi);
+ } else if (epi->ws) {
+ ep_destroy_wakeup_source(epi);
+ }
/*
* Get current event bits. We can safely use the file* here because
* its usage count has been increased by the caller of this function.
*/
- revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
+ revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
/*
* If the item is "hot" and it is not registered inside the ready
@@ -1214,6 +1309,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
spin_lock_irq(&ep->lock);
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
+ __pm_stay_awake(epi->ws);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
@@ -1239,6 +1335,9 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
unsigned int revents;
struct epitem *epi;
struct epoll_event __user *uevent;
+ poll_table pt;
+
+ init_poll_funcptr(&pt, NULL);
/*
* We can loop without lock because we are passed a task private list.
@@ -1249,9 +1348,22 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
!list_empty(head) && eventcnt < esed->maxevents;) {
epi = list_first_entry(head, struct epitem, rdllink);
+ /*
+ * Activate ep->ws before deactivating epi->ws to prevent
+ * triggering auto-suspend here (in case we reactive epi->ws
+ * below).
+ *
+ * This could be rearranged to delay the deactivation of epi->ws
+ * instead, but then epi->ws would temporarily be out of sync
+ * with ep_is_linked().
+ */
+ if (epi->ws && epi->ws->active)
+ __pm_stay_awake(ep->ws);
+ __pm_relax(epi->ws);
list_del_init(&epi->rdllink);
- revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+ pt._key = epi->event.events;
+ revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
epi->event.events;
/*
@@ -1264,6 +1376,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
if (__put_user(revents, &uevent->events) ||
__put_user(epi->event.data, &uevent->data)) {
list_add(&epi->rdllink, head);
+ __pm_stay_awake(epi->ws);
return eventcnt ? eventcnt : -EFAULT;
}
eventcnt++;
@@ -1283,6 +1396,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
* poll callback will queue them in ep->ovflist.
*/
list_add_tail(&epi->rdllink, &ep->rdllist);
+ __pm_stay_awake(epi->ws);
}
}
}
@@ -1595,6 +1709,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
if (!tfile->f_op || !tfile->f_op->poll)
goto error_tgt_fput;
+ /* Check if EPOLLWAKEUP is allowed */
+ if ((epds.events & EPOLLWAKEUP) && !capable(CAP_EPOLLWAKEUP))
+ epds.events &= ~EPOLLWAKEUP;
+
/*
* We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file. And also we do not permit
@@ -1629,8 +1747,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
if (op == EPOLL_CTL_ADD) {
if (is_file_epoll(tfile)) {
error = -ELOOP;
- if (ep_loop_check(ep, tfile) != 0)
+ if (ep_loop_check(ep, tfile) != 0) {
+ clear_tfile_check_list();
goto error_tgt_fput;
+ }
} else
list_add(&tfile->f_tfile_llink, &tfile_check_list);
}
@@ -1733,8 +1853,6 @@ error_return:
return error;
}
-#ifdef HAVE_SET_RESTORE_SIGMASK
-
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_pwait(2).
@@ -1779,8 +1897,6 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
return error;
}
-#endif /* HAVE_SET_RESTORE_SIGMASK */
-
static int __init eventpoll_init(void)
{
struct sysinfo si;
OpenPOWER on IntegriCloud