summaryrefslogtreecommitdiffstats
path: root/sys
diff options
context:
space:
mode:
author rwatson <rwatson@FreeBSD.org> 2005-11-10 16:06:04 +0000
committer rwatson <rwatson@FreeBSD.org> 2005-11-10 16:06:04 +0000
commit 9487c057e255ed900f5033620724c359ec48e7eb (patch)
tree fe013f526c32e87068c47073cd432328202156b3 /sys
parent 720f36e2ac18c653c51b7a715ad11caae2ad4cb3 (diff)
download FreeBSD-src-9487c057e255ed900f5033620724c359ec48e7eb.zip
FreeBSD-src-9487c057e255ed900f5033620724c359ec48e7eb.tar.gz
Correct a number of serious and closely related bugs in the UNIX domain
socket file descriptor garbage collection code, which is intended to detect and clear cycles of orphaned file descriptors that are "in-flight" in a socket when that socket is closed before they are received. The algorithm present was both run at poor times (resulting in recursion and reentrance), and also buggy in the presence of parallelism. In order to fix these problems, make the following changes: - When there are in-flight sockets and a UNIX domain socket is destroyed, asynchronously schedule the garbage collector, rather than running it synchronously in the current context. This avoids lock order issues when the garbage collection code reenters the UNIX domain socket code, avoiding lock order reversals, deadlocks, etc. Run the code asynchronously in a task queue. - In the garbage collector, when skipping file descriptors that have entered a closing state (i.e., have f_count == 0), re-test the FDEFER flag, and decrement unp_defer. As file descriptors can now transition to a closed state while the garbage collector is running, it is no longer the case that unp_defer will remain an accurate count of deferred sockets in the mark portion of the GC algorithm. Otherwise, the garbage collector will loop waiting for unp_defer to reach zero, which it will never do as it is skipping file descriptors that were marked in an earlier pass, but are now closed. - Acquire the UNIX domain socket subsystem lock in unp_discard() when modifying the unp_rights counter, or a read/write race is risked with other threads also manipulating the counter. While here: - Remove #if 0'd code regarding acquiring the socket buffer sleep lock in the garbage collector; this is not required as we are able to use the socket buffer receive lock to protect scanning the receive buffer for in-flight file descriptors on the socket buffer. - Annotate that the description of the garbage collector implementation is increasingly inaccurate and needs to be updated.
- Add counters of the number of deferred garbage collections and recycled file descriptors. This will be removed and is here temporarily for debugging purposes. With these changes in place, the unp_passfd regression test now appears to be passed consistently on UP and SMP systems for extended runs, whereas before it hung quickly or panicked, depending on which bug was triggered. Reported by: Philip Kizer <pckizer at nostrum dot com> MFC after: 2 weeks
Diffstat (limited to 'sys')
-rw-r--r-- sys/kern/uipc_usrreq.c 95
1 files changed, 45 insertions, 50 deletions
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index 439cd7d..9b63f53 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$");
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
+#include <sys/taskqueue.h>
#include <sys/un.h>
#include <sys/unpcb.h>
#include <sys/vnode.h>
@@ -112,6 +113,14 @@ static struct mtx unp_mtx;
#define UNP_LOCK_ASSERT() mtx_assert(&unp_mtx, MA_OWNED)
#define UNP_UNLOCK_ASSERT() mtx_assert(&unp_mtx, MA_NOTOWNED)
+/*
+ * Garbage collection of cyclic file descriptor/socket references occurs
+ * asynchronously in a taskqueue context in order to avoid recursion and
+ * reentrance in the UNIX domain socket, file descriptor, and socket layer
+ * code. See unp_gc() for a full description.
+ */
+static struct task unp_gc_task;
+
static int unp_attach(struct socket *);
static void unp_detach(struct unpcb *);
static int unp_bind(struct unpcb *,struct sockaddr *, struct thread *);
@@ -120,7 +129,7 @@ static int unp_connect2(struct socket *so, struct socket *so2, int);
static void unp_disconnect(struct unpcb *);
static void unp_shutdown(struct unpcb *);
static void unp_drop(struct unpcb *, int);
-static void unp_gc(void);
+static void unp_gc(__unused void *, int);
static void unp_scan(struct mbuf *, void (*)(struct file *));
static void unp_mark(struct file *);
static void unp_discard(struct file *);
@@ -773,6 +782,7 @@ static void
unp_detach(struct unpcb *unp)
{
struct vnode *vp;
+ int local_unp_rights;
UNP_LOCK_ASSERT();
@@ -795,19 +805,8 @@ unp_detach(struct unpcb *unp)
}
soisdisconnected(unp->unp_socket);
unp->unp_socket->so_pcb = NULL;
- if (unp_rights) {
- /*
- * Normally the receive buffer is flushed later,
- * in sofree, but if our receive buffer holds references
- * to descriptors that are now garbage, we will dispose
- * of those descriptor references after the garbage collector
- * gets them (resulting in a "panic: closef: count < 0").
- */
- sorflush(unp->unp_socket);
- unp_gc(); /* Will unlock UNP. */
- } else
- UNP_UNLOCK();
- UNP_UNLOCK_ASSERT();
+ local_unp_rights = unp_rights;
+ UNP_UNLOCK();
if (unp->unp_addr != NULL)
FREE(unp->unp_addr, M_SONAME);
uma_zfree(unp_zone, unp);
@@ -816,6 +815,8 @@ unp_detach(struct unpcb *unp)
vrele(vp);
mtx_unlock(&Giant);
}
+ if (local_unp_rights)
+ taskqueue_enqueue(taskqueue_thread, &unp_gc_task);
}
static int
@@ -1395,7 +1396,7 @@ unp_init(void)
uma_zone_set_max(unp_zone, nmbclusters);
LIST_INIT(&unp_dhead);
LIST_INIT(&unp_shead);
-
+ TASK_INIT(&unp_gc_task, 0, unp_gc, NULL);
UNP_LOCK_INIT();
}
@@ -1581,14 +1582,20 @@ unp_addsockcred(struct thread *td, struct mbuf *control)
}
/*
- * unp_defer is thread-local during garbage collection, and does not require
- * explicit synchronization. unp_gcing prevents other threads from entering
- * garbage collection, and perhaps should be an sx lock instead.
+ * unp_defer indicates whether additional work has been deferred for a future
+ * pass through unp_gc(). It is thread local and does not require explicit
+ * synchronization.
*/
-static int unp_defer, unp_gcing;
+static int unp_defer;
+
+static int unp_taskcount;
+SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, "");
+
+static int unp_recycled;
+SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, "");
static void
-unp_gc(void)
+unp_gc(__unused void *arg, int pending)
{
struct file *fp, *nextfp;
struct socket *so;
@@ -1597,15 +1604,8 @@ unp_gc(void)
int nfiles_snap;
int nfiles_slack = 20;
- UNP_LOCK_ASSERT();
-
- if (unp_gcing) {
- UNP_UNLOCK();
- return;
- }
- unp_gcing = 1;
+ unp_taskcount++;
unp_defer = 0;
- UNP_UNLOCK();
/*
* before going through all this, set all FDs to
* be NOT defered and NOT externally accessible
@@ -1618,9 +1618,16 @@ unp_gc(void)
LIST_FOREACH(fp, &filehead, f_list) {
FILE_LOCK(fp);
/*
- * If the file is not open, skip it
+ * If the file is not open, skip it -- could be a
+ * file in the process of being opened, or in the
+ * process of being closed. If the file is
+ * "closing", it may have been marked for deferred
+ * consideration. Clear the flag now if so.
*/
if (fp->f_count == 0) {
+ if (fp->f_gcflag & FDEFER)
+ unp_defer--;
+ fp->f_gcflag &= ~(FMARK|FDEFER);
FILE_UNLOCK(fp);
continue;
}
@@ -1670,22 +1677,6 @@ unp_gc(void)
if (so->so_proto->pr_domain != &localdomain ||
(so->so_proto->pr_flags&PR_RIGHTS) == 0)
continue;
-#ifdef notdef
- if (so->so_rcv.sb_flags & SB_LOCK) {
- /*
- * This is problematical; it's not clear
- * we need to wait for the sockbuf to be
- * unlocked (on a uniprocessor, at least),
- * and it's also not clear what to do
- * if sbwait returns an error due to receipt
- * of a signal. If sbwait does return
- * an error, we'll go into an infinite
- * loop. Delete all of this for now.
- */
- (void) sbwait(&so->so_rcv);
- goto restart;
- }
-#endif
/*
* So, Ok, it's one of our sockets and it IS externally
* accessible (or was defered). Now we look
@@ -1700,6 +1691,9 @@ unp_gc(void)
} while (unp_defer);
sx_sunlock(&filelist_lock);
/*
+ * XXXRW: The following comments need updating for a post-SMPng and
+ * deferred unp_gc() world, but are still generally accurate.
+ *
* We grab an extra reference to each of the file table entries
* that are not otherwise accessible and then free the rights
* that are stored in messages on them.
@@ -1711,7 +1705,7 @@ unp_gc(void)
* times -- consider the case of sockets A and B that contain
* references to each other. On a last close of some other socket,
* we trigger a gc since the number of outstanding rights (unp_rights)
- * is non-zero. If during the sweep phase the gc code un_discards,
+ * is non-zero. If during the sweep phase the gc code unp_discards,
* we end up doing a (full) closef on the descriptor. A closef on A
* results in the following chain. Closef calls soo_close, which
* calls soclose. Soclose calls first (through the switch
@@ -1788,12 +1782,11 @@ again:
FILE_UNLOCK(tfp);
}
}
- for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
+ for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
closef(*fpp, (struct thread *) NULL);
+ unp_recycled++;
+ }
free(extra_ref, M_TEMP);
- unp_gcing = 0;
-
- UNP_UNLOCK_ASSERT();
}
void
@@ -1884,9 +1877,11 @@ unp_mark(struct file *fp)
static void
unp_discard(struct file *fp)
{
+ UNP_LOCK();
FILE_LOCK(fp);
fp->f_msgcount--;
unp_rights--;
FILE_UNLOCK(fp);
+ UNP_UNLOCK();
(void) closef(fp, (struct thread *)NULL);
}
OpenPOWER on IntegriCloud