summary | refs | log | tree | commit | diff | stats
path: root/sys/netinet/in_pcb.c
diff options
context:
space:
mode:
author rwatson <rwatson@FreeBSD.org> 2011-05-23 19:32:02 +0000
committer rwatson <rwatson@FreeBSD.org> 2011-05-23 19:32:02 +0000
commit 95a805600bb382ba9dc73eed13f32cd8a5de3ae7 (patch)
tree a8de4b018620235851a890bf9c394c1b5af8eb8a /sys/netinet/in_pcb.c
parent fbe30c6e5ce0c364505ae499b72e7e18115f4e27 (diff)
download FreeBSD-src-95a805600bb382ba9dc73eed13f32cd8a5de3ae7.zip
FreeBSD-src-95a805600bb382ba9dc73eed13f32cd8a5de3ae7.tar.gz
Continue to refine inpcb reference counting and locking, in preparation for
reworking of inpcbinfo locking:

(1) Convert inpcb reference counting from manually manipulated integers to
    the refcount(9) KPI.  This allows the refcount to be managed atomically
    with an inpcb read lock rather than write lock, or even with no inpcb
    lock at all.  As a result, in_pcbref() also no longer requires an inpcb
    lock, so can be performed solely using the lock used to look up an
    inpcb.

(2) Shift more inpcb freeing activity from the in_pcbrele() context (via
    in_pcbfree_internal) to the explicit in_pcbfree() context.  This means
    that the inpcb refcount is increasingly used only to maintain memory
    stability, not actually defer the clean up of inpcb protocol parts.
    This is desirable as many of those protocol parts required the pcbinfo
    lock, which we'd like not to acquire in in_pcbrele() contexts.  Document
    this in comments better.

(3) Introduce new read-locked and write-locked in_pcbrele() variations,
    in_pcbrele_rlocked() and in_pcbrele_wlocked(), which allow the inpcb to
    be properly unlocked as needed.  in_pcbrele() is a wrapper around the
    latter, and should probably go away at some point.  This makes it
    easier to use this weak reference model when holding only a read lock,
    as will happen in the future.

This may well be safe to MFC, but some more KBI analysis is required.

Reviewed by:	bz
MFC after:	3 weeks
Sponsored by:	Juniper Networks, Inc.
Diffstat (limited to 'sys/netinet/in_pcb.c')
-rw-r--r-- sys/netinet/in_pcb.c 154
1 files changed, 91 insertions, 63 deletions
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 515bd41..9bde3c8 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -2,8 +2,12 @@
* Copyright (c) 1982, 1986, 1991, 1993, 1995
* The Regents of the University of California.
* Copyright (c) 2007-2009 Robert N. M. Watson
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
* All rights reserved.
*
+ * Portions of this software were developed by Robert N. M. Watson under
+ * contract to Juniper Networks, Inc.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -50,6 +54,7 @@ __FBSDID("$FreeBSD$");
#include <sys/socketvar.h>
#include <sys/priv.h>
#include <sys/proc.h>
+#include <sys/refcount.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
@@ -287,7 +292,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
#endif
INP_WLOCK(inp);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
- inp->inp_refcount = 1; /* Reference from the inpcbinfo */
+ refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */
#if defined(IPSEC) || defined(MAC)
out:
if (error != 0) {
@@ -1028,56 +1033,18 @@ in_pcbdetach(struct inpcb *inp)
}
/*
- * in_pcbfree_internal() frees an inpcb that has been detached from its
- * socket, and whose reference count has reached 0. It will also remove the
- * inpcb from any global lists it might remain on.
- */
-static void
-in_pcbfree_internal(struct inpcb *inp)
-{
- struct inpcbinfo *ipi = inp->inp_pcbinfo;
-
- KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
- KASSERT(inp->inp_refcount == 0, ("%s: refcount !0", __func__));
-
- INP_INFO_WLOCK_ASSERT(ipi);
- INP_WLOCK_ASSERT(inp);
-
-#ifdef IPSEC
- if (inp->inp_sp != NULL)
- ipsec_delete_pcbpolicy(inp);
-#endif /* IPSEC */
- inp->inp_gencnt = ++ipi->ipi_gencnt;
- in_pcbremlists(inp);
-#ifdef INET6
- if (inp->inp_vflag & INP_IPV6PROTO) {
- ip6_freepcbopts(inp->in6p_outputopts);
- if (inp->in6p_moptions != NULL)
- ip6_freemoptions(inp->in6p_moptions);
- }
-#endif
- if (inp->inp_options)
- (void)m_free(inp->inp_options);
-#ifdef INET
- if (inp->inp_moptions != NULL)
- inp_freemoptions(inp->inp_moptions);
-#endif
- inp->inp_vflag = 0;
- crfree(inp->inp_cred);
-
-#ifdef MAC
- mac_inpcb_destroy(inp);
-#endif
- INP_WUNLOCK(inp);
- uma_zfree(ipi->ipi_zone, inp);
-}
-
-/*
* in_pcbref() bumps the reference count on an inpcb in order to maintain
* stability of an inpcb pointer despite the inpcb lock being released. This
* is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
* but where the inpcb lock is already held.
*
+ * in_pcbref() should be used only to provide brief memory stability, and
+ * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
+ * garbage collect the inpcb if it has been in_pcbfree()'d from another
+ * context. Until in_pcbrele() has returned that the inpcb is still valid,
+ * lock and rele are the *only* safe operations that may be performed on the
+ * inpcb.
+ *
* While the inpcb will not be freed, releasing the inpcb lock means that the
* connection's state may change, so the caller should be careful to
* revalidate any cached state on reacquiring the lock. Drop the reference
@@ -1091,7 +1058,7 @@ in_pcbref(struct inpcb *inp)
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
- inp->inp_refcount++;
+ refcount_acquire(&inp->inp_refcount);
}
/*
@@ -1099,47 +1066,108 @@ in_pcbref(struct inpcb *inp)
* in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
* return a flag indicating whether or not the inpcb remains valid. If it is
* valid, we return with the inpcb lock held.
+ *
+ * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
+ * reference on an inpcb. Historically more work was done here (actually, in
+ * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
+ * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely
+ * about memory stability (and continued use of the write lock).
*/
int
-in_pcbrele(struct inpcb *inp)
+in_pcbrele_rlocked(struct inpcb *inp)
{
-#ifdef INVARIANTS
- struct inpcbinfo *ipi = inp->inp_pcbinfo;
-#endif
+ struct inpcbinfo *pcbinfo;
+
+ KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
+
+ INP_RLOCK_ASSERT(inp);
+
+ if (refcount_release(&inp->inp_refcount) == 0)
+ return (0);
+
+ KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
+
+ INP_RUNLOCK(inp);
+ pcbinfo = inp->inp_pcbinfo;
+ uma_zfree(pcbinfo->ipi_zone, inp);
+ return (1);
+}
+
+int
+in_pcbrele_wlocked(struct inpcb *inp)
+{
+ struct inpcbinfo *pcbinfo;
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
- INP_INFO_WLOCK_ASSERT(ipi);
INP_WLOCK_ASSERT(inp);
- inp->inp_refcount--;
- if (inp->inp_refcount > 0)
+ if (refcount_release(&inp->inp_refcount) == 0)
return (0);
- in_pcbfree_internal(inp);
+
+ KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
+
+ INP_WUNLOCK(inp);
+ pcbinfo = inp->inp_pcbinfo;
+ uma_zfree(pcbinfo->ipi_zone, inp);
return (1);
}
/*
+ * Temporary wrapper.
+ */
+int
+in_pcbrele(struct inpcb *inp)
+{
+
+ return (in_pcbrele_wlocked(inp));
+}
+
+/*
* Unconditionally schedule an inpcb to be freed by decrementing its
* reference count, which should occur only after the inpcb has been detached
* from its socket. If another thread holds a temporary reference (acquired
* using in_pcbref()) then the free is deferred until that reference is
- * released using in_pcbrele(), but the inpcb is still unlocked.
+ * released using in_pcbrele(), but the inpcb is still unlocked. Almost all
+ * work, including removal from global lists, is done in this context, where
+ * the pcbinfo lock is held.
*/
void
in_pcbfree(struct inpcb *inp)
{
-#ifdef INVARIANTS
- struct inpcbinfo *ipi = inp->inp_pcbinfo;
-#endif
+ struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
- KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL",
- __func__));
+ KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
- INP_INFO_WLOCK_ASSERT(ipi);
+ INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
- if (!in_pcbrele(inp))
+ /* XXXRW: Do as much as possible here. */
+#ifdef IPSEC
+ if (inp->inp_sp != NULL)
+ ipsec_delete_pcbpolicy(inp);
+#endif /* IPSEC */
+ inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
+ in_pcbremlists(inp);
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6PROTO) {
+ ip6_freepcbopts(inp->in6p_outputopts);
+ if (inp->in6p_moptions != NULL)
+ ip6_freemoptions(inp->in6p_moptions);
+ }
+#endif
+ if (inp->inp_options)
+ (void)m_free(inp->inp_options);
+#ifdef INET
+ if (inp->inp_moptions != NULL)
+ inp_freemoptions(inp->inp_moptions);
+#endif
+ inp->inp_vflag = 0;
+ crfree(inp->inp_cred);
+#ifdef MAC
+ mac_inpcb_destroy(inp);
+#endif
+ if (!in_pcbrele_wlocked(inp))
INP_WUNLOCK(inp);
}
OpenPOWER on IntegriCloud