summaryrefslogtreecommitdiffstats
path: root/contrib/bind9/lib/isc/unix
diff options
context:
space:
mode:
authordougb <dougb@FreeBSD.org>2008-12-23 22:47:56 +0000
committerdougb <dougb@FreeBSD.org>2008-12-23 22:47:56 +0000
commitc673a416c3c80055f220808cf9464c00fa1c5042 (patch)
tree6caf68c956b10fe118ac0bb9f368df80b7d0818e /contrib/bind9/lib/isc/unix
parentcabae62b0ba1d31f524c393e294d3a5e08d543fc (diff)
parentfa25a858e20428b15ec892d020272b1f70eaa725 (diff)
downloadFreeBSD-src-c673a416c3c80055f220808cf9464c00fa1c5042.zip
FreeBSD-src-c673a416c3c80055f220808cf9464c00fa1c5042.tar.gz
Merge from vendor/bind9/dist as of the 9.4.3 import
Diffstat (limited to 'contrib/bind9/lib/isc/unix')
-rw-r--r--contrib/bind9/lib/isc/unix/app.c37
-rw-r--r--contrib/bind9/lib/isc/unix/include/isc/net.h27
-rw-r--r--contrib/bind9/lib/isc/unix/net.c161
-rw-r--r--contrib/bind9/lib/isc/unix/resource.c61
-rw-r--r--contrib/bind9/lib/isc/unix/socket.c1674
-rw-r--r--contrib/bind9/lib/isc/unix/socket_p.h15
-rw-r--r--contrib/bind9/lib/isc/unix/time.c10
7 files changed, 1562 insertions, 423 deletions
diff --git a/contrib/bind9/lib/isc/unix/app.c b/contrib/bind9/lib/isc/unix/app.c
index b71d766..c119362 100644
--- a/contrib/bind9/lib/isc/unix/app.c
+++ b/contrib/bind9/lib/isc/unix/app.c
@@ -1,8 +1,8 @@
/*
- * Copyright (C) 2004, 2005 Internet Systems Consortium, Inc. ("ISC")
+ * Copyright (C) 2004, 2005, 2008 Internet Systems Consortium, Inc. ("ISC")
* Copyright (C) 1999-2003 Internet Software Consortium.
*
- * Permission to use, copy, modify, and distribute this software for any
+ * Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
@@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: app.c,v 1.50.18.2.50.1 2008/07/29 04:47:31 each Exp $ */
+/* $Id: app.c,v 1.50.18.8 2008/10/15 03:41:17 marka Exp $ */
/*! \file */
@@ -30,6 +30,9 @@
#include <unistd.h>
#include <signal.h>
#include <sys/time.h>
+#ifdef HAVE_EPOLL
+#include <sys/epoll.h>
+#endif
#include <isc/app.h>
#include <isc/boolean.h>
@@ -59,11 +62,11 @@ static isc_boolean_t running = ISC_FALSE;
/*!
* We assume that 'want_shutdown' can be read and written atomically.
*/
-static isc_boolean_t want_shutdown = ISC_FALSE;
+static volatile isc_boolean_t want_shutdown = ISC_FALSE;
/*
* We assume that 'want_reload' can be read and written atomically.
*/
-static isc_boolean_t want_reload = ISC_FALSE;
+static volatile isc_boolean_t want_reload = ISC_FALSE;
static isc_boolean_t blocked = ISC_FALSE;
#ifdef ISC_PLATFORM_USETHREADS
@@ -87,13 +90,13 @@ static pthread_t main_thread;
#ifndef HAVE_SIGWAIT
static void
exit_action(int arg) {
- UNUSED(arg);
+ UNUSED(arg);
want_shutdown = ISC_TRUE;
}
static void
reload_action(int arg) {
- UNUSED(arg);
+ UNUSED(arg);
want_reload = ISC_TRUE;
}
#endif
@@ -297,14 +300,13 @@ isc_app_onrun(isc_mem_t *mctx, isc_task_t *task, isc_taskaction_t action,
* Event loop for nonthreaded programs.
*/
static isc_result_t
-evloop() {
+evloop(void) {
isc_result_t result;
while (!want_shutdown) {
int n;
isc_time_t when, now;
struct timeval tv, *tvp;
- fd_set *readfds, *writefds;
- int maxfd;
+ isc_socketwait_t *swait;
isc_boolean_t readytasks;
isc_boolean_t call_timer_dispatch = ISC_FALSE;
@@ -331,15 +333,15 @@ evloop() {
}
}
- isc__socketmgr_getfdsets(&readfds, &writefds, &maxfd);
- n = select(maxfd, readfds, writefds, NULL, tvp);
+ swait = NULL;
+ n = isc__socketmgr_waitevents(tvp, &swait);
if (n == 0 || call_timer_dispatch) {
/*
* We call isc__timermgr_dispatch() only when
* necessary, in order to reduce overhead. If the
* select() call indicates a timeout, we need the
- * dispatch. Even if not, if we set the 0-timeout
+ * dispatch. Even if not, if we set the 0-timeout
* for the select() call, we need to check the timer
* events. In the 'readytasks' case, there may be no
* timeout event actually, but there is no other way
@@ -352,8 +354,7 @@ evloop() {
isc__timermgr_dispatch();
}
if (n > 0)
- (void)isc__socketmgr_dispatch(readfds, writefds,
- maxfd);
+ (void)isc__socketmgr_dispatch(swait);
(void)isc__taskmgr_dispatch();
if (want_reload) {
@@ -423,7 +424,7 @@ isc__nothread_signal_hack(isc_condition_t *cp) {
signalled = ISC_TRUE;
return (ISC_R_SUCCESS);
}
-
+
#endif /* ISC_PLATFORM_USETHREADS */
isc_result_t
@@ -434,10 +435,10 @@ isc_app_run(void) {
#ifdef ISC_PLATFORM_USETHREADS
sigset_t sset;
char strbuf[ISC_STRERRORSIZE];
-#endif /* ISC_PLATFORM_USETHREADS */
#ifdef HAVE_SIGWAIT
int sig;
#endif
+#endif /* ISC_PLATFORM_USETHREADS */
#ifdef HAVE_LINUXTHREADS
REQUIRE(main_thread == pthread_self());
@@ -676,7 +677,7 @@ isc_app_unblock(void) {
REQUIRE(blockedthread == pthread_self());
RUNTIME_CHECK(sigemptyset(&sset) == 0 &&
- sigaddset(&sset, SIGINT) == 0 &&
+ sigaddset(&sset, SIGINT) == 0 &&
sigaddset(&sset, SIGTERM) == 0);
RUNTIME_CHECK(pthread_sigmask(SIG_BLOCK, &sset, NULL) == 0);
#endif /* ISC_PLATFORM_USETHREADS */
diff --git a/contrib/bind9/lib/isc/unix/include/isc/net.h b/contrib/bind9/lib/isc/unix/include/isc/net.h
index bdd8c14..948e7b1 100644
--- a/contrib/bind9/lib/isc/unix/include/isc/net.h
+++ b/contrib/bind9/lib/isc/unix/include/isc/net.h
@@ -1,8 +1,8 @@
/*
- * Copyright (C) 2004, 2005 Internet Systems Consortium, Inc. ("ISC")
+ * Copyright (C) 2004, 2005, 2008 Internet Systems Consortium, Inc. ("ISC")
* Copyright (C) 1999-2003 Internet Software Consortium.
*
- * Permission to use, copy, modify, and distribute this software for any
+ * Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
@@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: net.h,v 1.39.18.4 2005/04/27 05:02:37 sra Exp $ */
+/* $Id: net.h,v 1.39.18.6 2008/06/24 23:45:55 tbox Exp $ */
#ifndef ISC_NET_H
#define ISC_NET_H 1
@@ -104,7 +104,7 @@
/*%
* Required for some pre RFC2133 implementations.
* IN6ADDR_ANY_INIT and IN6ADDR_LOOPBACK_INIT were added in
- * draft-ietf-ipngwg-bsd-api-04.txt or draft-ietf-ipngwg-bsd-api-05.txt.
+ * draft-ietf-ipngwg-bsd-api-04.txt or draft-ietf-ipngwg-bsd-api-05.txt.
* If 's6_addr' is defined then assume that there is a union and three
* levels otherwise assume two levels required.
*/
@@ -202,7 +202,7 @@ extern const struct in6_addr isc_net_in6addrloop;
#ifdef ISC_PLATFORM_FIXIN6ISADDR
#undef IN6_IS_ADDR_GEOGRAPHIC
-/*!
+/*!
* \brief
* Fix UnixWare 7.1.1's broken IN6_IS_ADDR_* definitions.
*/
@@ -324,6 +324,23 @@ isc_net_probeunix(void);
* Returns whether UNIX domain sockets are supported.
*/
+isc_result_t
+isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high);
+/*%<
+ * Returns system's default range of ephemeral UDP ports, if defined.
+ * If the range is not available or unknown, ISC_NET_PORTRANGELOW and
+ * ISC_NET_PORTRANGEHIGH will be returned.
+ *
+ * Requires:
+ *
+ *\li 'low' and 'high' must be non NULL.
+ *
+ * Returns:
+ *
+ *\li *low and *high will be the ports specifying the low and high ends of
+ * the range.
+ */
+
#ifdef ISC_PLATFORM_NEEDNTOP
const char *
isc_net_ntop(int af, const void *src, char *dst, size_t size);
diff --git a/contrib/bind9/lib/isc/unix/net.c b/contrib/bind9/lib/isc/unix/net.c
index 1de6b32..600ac92 100644
--- a/contrib/bind9/lib/isc/unix/net.c
+++ b/contrib/bind9/lib/isc/unix/net.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2004, 2005, 2007 Internet Systems Consortium, Inc. ("ISC")
+ * Copyright (C) 2004, 2005, 2007, 2008 Internet Systems Consortium, Inc. ("ISC")
* Copyright (C) 1999-2003 Internet Software Consortium.
*
* Permission to use, copy, modify, and/or distribute this software for any
@@ -15,10 +15,19 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: net.c,v 1.29.18.6 2007/09/13 23:46:26 tbox Exp $ */
+/* $Id: net.c,v 1.29.18.9 2008/07/04 05:52:05 each Exp $ */
#include <config.h>
+#include <sys/types.h>
+
+#if defined(HAVE_SYS_SYSCTL_H)
+#if defined(HAVE_SYS_PARAM_H)
+#include <sys/param.h>
+#endif
+#include <sys/sysctl.h>
+#endif
+
#include <errno.h>
#include <unistd.h>
@@ -30,6 +39,59 @@
#include <isc/string.h>
#include <isc/util.h>
+/*%
+ * Definitions about UDP port range specification. This is a total mess of
+ * portability variants: some use sysctl (but the sysctl names vary), some use
+ * system-specific interfaces, some have the same interface for IPv4 and IPv6,
+ * some separate them, etc...
+ */
+
+/*%
+ * The last resort defaults: use all non well known port space
+ */
+#ifndef ISC_NET_PORTRANGELOW
+#define ISC_NET_PORTRANGELOW 1024
+#endif /* ISC_NET_PORTRANGELOW */
+#ifndef ISC_NET_PORTRANGEHIGH
+#define ISC_NET_PORTRANGEHIGH 65535
+#endif /* ISC_NET_PORTRANGEHIGH */
+
+#ifdef HAVE_SYSCTLBYNAME
+
+/*%
+ * sysctl variants
+ */
+#if defined(__FreeBSD__) || defined(__APPLE__) || defined(__DragonFly__)
+#define USE_SYSCTL_PORTRANGE
+#define SYSCTL_V4PORTRANGE_LOW "net.inet.ip.portrange.hifirst"
+#define SYSCTL_V4PORTRANGE_HIGH "net.inet.ip.portrange.hilast"
+#define SYSCTL_V6PORTRANGE_LOW "net.inet.ip.portrange.hifirst"
+#define SYSCTL_V6PORTRANGE_HIGH "net.inet.ip.portrange.hilast"
+#endif
+
+#ifdef __NetBSD__
+#define USE_SYSCTL_PORTRANGE
+#define SYSCTL_V4PORTRANGE_LOW "net.inet.ip.anonportmin"
+#define SYSCTL_V4PORTRANGE_HIGH "net.inet.ip.anonportmax"
+#define SYSCTL_V6PORTRANGE_LOW "net.inet6.ip6.anonportmin"
+#define SYSCTL_V6PORTRANGE_HIGH "net.inet6.ip6.anonportmax"
+#endif
+
+#else /* !HAVE_SYSCTLBYNAME */
+
+#ifdef __OpenBSD__
+#define USE_SYSCTL_PORTRANGE
+#define SYSCTL_V4PORTRANGE_LOW { CTL_NET, PF_INET, IPPROTO_IP, \
+ IPCTL_IPPORT_HIFIRSTAUTO }
+#define SYSCTL_V4PORTRANGE_HIGH { CTL_NET, PF_INET, IPPROTO_IP, \
+ IPCTL_IPPORT_HILASTAUTO }
+/* Same for IPv6 */
+#define SYSCTL_V6PORTRANGE_LOW SYSCTL_V4PORTRANGE_LOW
+#define SYSCTL_V6PORTRANGE_HIGH SYSCTL_V4PORTRANGE_HIGH
+#endif
+
+#endif /* HAVE_SYSCTLBYNAME */
+
#if defined(ISC_PLATFORM_HAVEIPV6)
# if defined(ISC_PLATFORM_NEEDIN6ADDRANY)
const struct in6_addr isc_net_in6addrany = IN6ADDR_ANY_INIT;
@@ -338,6 +400,101 @@ isc_net_probe_ipv6pktinfo(void) {
return (ipv6pktinfo_result);
}
+#if defined(USE_SYSCTL_PORTRANGE)
+#if defined(HAVE_SYSCTLBYNAME)
+static isc_result_t
+getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) {
+ int port_low, port_high;
+ size_t portlen;
+ const char *sysctlname_lowport, *sysctlname_hiport;
+
+ if (af == AF_INET) {
+ sysctlname_lowport = SYSCTL_V4PORTRANGE_LOW;
+ sysctlname_hiport = SYSCTL_V4PORTRANGE_HIGH;
+ } else {
+ sysctlname_lowport = SYSCTL_V6PORTRANGE_LOW;
+ sysctlname_hiport = SYSCTL_V6PORTRANGE_HIGH;
+ }
+ portlen = sizeof(portlen);
+ if (sysctlbyname(sysctlname_lowport, &port_low, &portlen,
+ NULL, 0) < 0) {
+ return (ISC_R_FAILURE);
+ }
+ portlen = sizeof(portlen);
+ if (sysctlbyname(sysctlname_hiport, &port_high, &portlen,
+ NULL, 0) < 0) {
+ return (ISC_R_FAILURE);
+ }
+ if ((port_low & ~0xffff) != 0 || (port_high & ~0xffff) != 0)
+ return (ISC_R_RANGE);
+
+ *low = (in_port_t)port_low;
+ *high = (in_port_t)port_high;
+
+ return (ISC_R_SUCCESS);
+}
+#else /* !HAVE_SYSCTLBYNAME */
+static isc_result_t
+getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) {
+ int mib_lo4[4] = SYSCTL_V4PORTRANGE_LOW;
+ int mib_hi4[4] = SYSCTL_V4PORTRANGE_HIGH;
+ int mib_lo6[4] = SYSCTL_V6PORTRANGE_LOW;
+ int mib_hi6[4] = SYSCTL_V6PORTRANGE_HIGH;
+ int *mib_lo, *mib_hi, miblen;
+ int port_low, port_high;
+ size_t portlen;
+
+ if (af == AF_INET) {
+ mib_lo = mib_lo4;
+ mib_hi = mib_hi4;
+ miblen = sizeof(mib_lo4) / sizeof(mib_lo4[0]);
+ } else {
+ mib_lo = mib_lo6;
+ mib_hi = mib_hi6;
+ miblen = sizeof(mib_lo6) / sizeof(mib_lo6[0]);
+ }
+
+ portlen = sizeof(portlen);
+ if (sysctl(mib_lo, miblen, &port_low, &portlen, NULL, 0) < 0) {
+ return (ISC_R_FAILURE);
+ }
+
+ portlen = sizeof(portlen);
+ if (sysctl(mib_hi, miblen, &port_high, &portlen, NULL, 0) < 0) {
+ return (ISC_R_FAILURE);
+ }
+
+ if ((port_low & ~0xffff) != 0 || (port_high & ~0xffff) != 0)
+ return (ISC_R_RANGE);
+
+ *low = (in_port_t) port_low;
+ *high = (in_port_t) port_high;
+
+ return (ISC_R_SUCCESS);
+}
+#endif /* HAVE_SYSCTLBYNAME */
+#endif /* USE_SYSCTL_PORTRANGE */
+
+isc_result_t
+isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) {
+ int result = ISC_R_FAILURE;
+
+ REQUIRE(low != NULL && high != NULL);
+
+#if defined(USE_SYSCTL_PORTRANGE)
+ result = getudpportrange_sysctl(af, low, high);
+#else
+ UNUSED(af);
+#endif
+
+ if (result != ISC_R_SUCCESS) {
+ *low = ISC_NET_PORTRANGELOW;
+ *high = ISC_NET_PORTRANGEHIGH;
+ }
+
+ return (ISC_R_SUCCESS); /* we currently never fail in this function */
+}
+
void
isc_net_disableipv4(void) {
initialize();
diff --git a/contrib/bind9/lib/isc/unix/resource.c b/contrib/bind9/lib/isc/unix/resource.c
index 0264976..e9bc5fd 100644
--- a/contrib/bind9/lib/isc/unix/resource.c
+++ b/contrib/bind9/lib/isc/unix/resource.c
@@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: resource.c,v 1.12.944.4 2008/07/28 22:44:46 marka Exp $ */
+/* $Id: resource.c,v 1.12.18.6 2008/08/05 07:17:05 marka Exp $ */
#include <config.h>
@@ -32,7 +32,7 @@
#include <linux/fs.h> /* To get the large NR_OPEN. */
#endif
-#ifdef __hpux
+#if defined(__hpux) && defined(HAVE_SYS_DYNTUNE_H)
#include <sys/dyntune.h>
#endif
@@ -48,13 +48,13 @@ resource2rlim(isc_resource_t resource, int *rlim_resource) {
break;
case isc_resource_cputime:
*rlim_resource = RLIMIT_CPU;
- break;
+ break;
case isc_resource_datasize:
*rlim_resource = RLIMIT_DATA;
- break;
+ break;
case isc_resource_filesize:
*rlim_resource = RLIMIT_FSIZE;
- break;
+ break;
case isc_resource_lockedmemory:
#ifdef RLIMIT_MEMLOCK
*rlim_resource = RLIMIT_MEMLOCK;
@@ -87,7 +87,7 @@ resource2rlim(isc_resource_t resource, int *rlim_resource) {
*rlim_resource = RLIMIT_STACK;
break;
default:
- /*
+ /*
* This test is not very robust if isc_resource_t
* changes, but generates a clear assertion message.
*/
@@ -140,51 +140,6 @@ isc_resource_setlimit(isc_resource_t resource, isc_resourcevalue_t value) {
rlim_value = value;
}
- /*
- * The BIND 8 documentation reports:
- *
- * Note: on some operating systems the server cannot set an
- * unlimited value and cannot determine the maximum number of
- * open files the kernel can support. On such systems, choosing
- * unlimited will cause the server to use the larger of the
- * rlim_max for RLIMIT_NOFILE and the value returned by
- * sysconf(_SC_OPEN_MAX). If the actual kernel limit is larger
- * than this value, use limit files to specify the limit
- * explicitly.
- *
- * The CHANGES for 8.1.2-T3A also mention:
- *
- * 352. [bug] Because of problems with setting an infinite
- * rlim_max for RLIMIT_NOFILE on some systems, previous versions
- * of the server implemented "limit files unlimited" by setting
- * the limit to the value returned by sysconf(_SC_OPEN_MAX). The
- * server will now use RLIM_INFINITY on systems which allow it.
- *
- * At some point the BIND 8 server stopped using SC_OPEN_MAX for this
- * purpose at all, but it isn't clear to me when or why, as my access
- * to the CVS archive is limited at the time of this writing. What
- * BIND 8 *does* do is to set RLIMIT_NOFILE to either RLIMIT_INFINITY
- * on a half dozen operating systems or to FD_SETSIZE on the rest,
- * the latter of which is probably fewer than the real limit. (Note
- * that libisc's socket module will have problems with any fd over
- * FD_SETSIZE. This should be fixed in the socket module, not a
- * limitation here. BIND 8's eventlib also has a problem, making
- * its RLIMIT_INFINITY setting useless, because it closes and ignores
- * any fd over FD_SETSIZE.)
- *
- * More troubling is the reference to some operating systems not being
- * able to set an unlimited value for the number of open files. I'd
- * hate to put in code that is really only there to support archaic
- * systems that the rest of libisc won't work on anyway. So what this
- * extremely verbose comment is here to say is the following:
- *
- * I'm aware there might be an issue with not limiting the value
- * for RLIMIT_NOFILE on some systems, but since I don't know yet
- * what those systems are and what the best workaround is (use
- * sysconf()? rlim_max from getrlimit()? FD_SETSIZE?) so nothing
- * is currently being done to clamp the value for open files.
- */
-
rl.rlim_cur = rl.rlim_max = rlim_value;
unixresult = setrlimit(unixresource, &rl);
@@ -215,7 +170,7 @@ isc_resource_setlimit(isc_resource_t resource, isc_resourcevalue_t value) {
if (unixresult == 0)
return (ISC_R_SUCCESS);
}
-#elif defined(__hpux)
+#elif defined(__hpux) && defined(HAVE_SYS_DYNTUNE_H)
if (resource == isc_resource_openfiles && rlim_value == RLIM_INFINITY) {
uint64_t maxfiles;
if (gettune("maxfiles_lim", &maxfiles) == 0) {
@@ -255,7 +210,7 @@ isc_resource_getlimit(isc_resource_t resource, isc_resourcevalue_t *value) {
}
isc_result_t
-isc_resource_curlimit(isc_resource_t resource, isc_resourcevalue_t *value) {
+isc_resource_getcurlimit(isc_resource_t resource, isc_resourcevalue_t *value) {
int unixresult;
int unixresource;
struct rlimit rl;
diff --git a/contrib/bind9/lib/isc/unix/socket.c b/contrib/bind9/lib/isc/unix/socket.c
index 1b4da78..3239614 100644
--- a/contrib/bind9/lib/isc/unix/socket.c
+++ b/contrib/bind9/lib/isc/unix/socket.c
@@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: socket.c,v 1.237.18.29.10.6 2008/07/29 04:47:31 each Exp $ */
+/* $Id: socket.c,v 1.237.18.56 2008/11/12 03:58:36 marka Exp $ */
/*! \file */
@@ -25,9 +25,6 @@
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/stat.h>
-#ifdef ISC_PLATFORM_HAVESYSUNH
-#include <sys/un.h>
-#endif
#include <sys/time.h>
#include <sys/uio.h>
@@ -58,6 +55,19 @@
#include <isc/thread.h>
#include <isc/util.h>
+#ifdef ISC_PLATFORM_HAVESYSUNH
+#include <sys/un.h>
+#endif
+#ifdef ISC_PLATFORM_HAVEKQUEUE
+#include <sys/event.h>
+#endif
+#ifdef ISC_PLATFORM_HAVEEPOLL
+#include <sys/epoll.h>
+#endif
+#ifdef ISC_PLATFORM_HAVEDEVPOLL
+#include <sys/devpoll.h>
+#endif
+
#include "errno2result.h"
#ifndef ISC_PLATFORM_USETHREADS
@@ -65,9 +75,46 @@
#endif /* ISC_PLATFORM_USETHREADS */
/*%
- * Max number of open sockets. In the vast majority of cases the default size
- * of FD_SETSIZE should be fine, and this constant should be increased only
- * when absolutely necessary and possible, i.e., the server is exhausting all
+ * Choose the most preferable multiplex method.
+ */
+#ifdef ISC_PLATFORM_HAVEKQUEUE
+#define USE_KQUEUE
+#elif defined (ISC_PLATFORM_HAVEEPOLL)
+#define USE_EPOLL
+#elif defined (ISC_PLATFORM_HAVEDEVPOLL)
+#define USE_DEVPOLL
+typedef struct {
+ unsigned int want_read : 1,
+ want_write : 1;
+} pollinfo_t;
+#else
+#define USE_SELECT
+#endif /* ISC_PLATFORM_HAVEKQUEUE */
+
+#ifndef ISC_PLATFORM_USETHREADS
+#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
+struct isc_socketwait {
+ int nevents;
+};
+#elif defined (USE_SELECT)
+struct isc_socketwait {
+ fd_set *readset;
+ fd_set *writeset;
+ int nfds;
+ int maxfd;
+};
+#endif /* USE_KQUEUE */
+#endif /* !ISC_PLATFORM_USETHREADS */
+
+/*%
+ * Maximum number of allowable open sockets. This is also the maximum
+ * allowable socket file descriptor.
+ *
+ * Care should be taken before modifying this value for select():
+ * The API standard doesn't ensure select() accept more than (the system default
+ * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
+ * the vast majority of cases. This constant should therefore be increased only
+ * when absolutely necessary and possible, i.e., the server is exhausting all
* available file descriptors (up to FD_SETSIZE) and the select() function
* and FD_xxx macros support larger values than FD_SETSIZE (which may not
* always by true, but we keep using some of them to ensure as much
@@ -78,18 +125,72 @@
* As a special note, this value shouldn't have to be touched if
* this is a build for an authoritative only DNS server.
*/
-
-#ifndef ISC_SOCKET_FDSETSIZE
-#define ISC_SOCKET_FDSETSIZE FD_SETSIZE
-#endif
-
+#ifndef ISC_SOCKET_MAXSOCKETS
+#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
+#define ISC_SOCKET_MAXSOCKETS 4096
+#elif defined(USE_SELECT)
+#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
+#endif /* USE_KQUEUE... */
+#endif /* ISC_SOCKET_MAXSOCKETS */
+
+#ifdef USE_SELECT
/*%
- * Mac OS X needs a special definition to support larger values in select()
+ * Mac OS X needs a special definition to support larger values in select().
+ * We always define this because a larger value can be specified run-time.
*/
-#if ISC_SOCKET_FDSETSIZE > FD_SETSIZE
#ifdef __APPLE__
#define _DARWIN_UNLIMITED_SELECT
#endif /* __APPLE__ */
+#endif /* USE_SELECT */
+
+#ifdef ISC_SOCKET_USE_POLLWATCH
+/*%
+ * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
+ * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
+ * some of the specified FD. The idea is based on the observation that it's
+ * likely for a busy server to keep receiving packets. It specifically works
+ * as follows: the socket watcher is first initialized with the state of
+ * "poll_idle". While it's in the idle state it keeps sleeping until a socket
+ * event occurs. When it wakes up for a socket I/O event, it moves to the
+ * poll_active state, and sets the poll timeout to a short period
+ * (ISC_SOCKET_POLLWATCH_TIMEOUT msec). If timeout occurs in this state, the
+ * watcher goes to the poll_checking state with the same timeout period.
+ * In this state, the watcher tries to detect whether this is a break
+ * during intermittent events or the kernel bug is triggered. If the next
+ * polling reports an event within the short period, the previous timeout is
+ * likely to be a kernel bug, and so the watcher goes back to the active state.
+ * Otherwise, it moves to the idle state again.
+ *
+ * It's not clear whether this is a thread-related bug, but since we've only
+ * seen this with threads, this workaround is used only when enabling threads.
+ */
+
+typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;
+
+#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
+#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
+#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */
+#endif /* ISC_SOCKET_USE_POLLWATCH */
+
+/*%
+ * Size of per-FD lock buckets.
+ */
+#ifdef ISC_PLATFORM_USETHREADS
+#define FDLOCK_COUNT 1024
+#define FDLOCK_ID(fd) ((fd) % FDLOCK_COUNT)
+#else
+#define FDLOCK_COUNT 1
+#define FDLOCK_ID(fd) 0
+#endif /* ISC_PLATFORM_USETHREADS */
+
+/*%
+ * Maximum number of events communicated with the kernel. There should normally
+ * be no need for having a large number.
+ */
+#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
+#ifndef ISC_SOCKET_MAXEVENTS
+#define ISC_SOCKET_MAXEVENTS 64
+#endif
#endif
/*%
@@ -230,22 +331,50 @@ struct isc_socketmgr {
unsigned int magic;
isc_mem_t *mctx;
isc_mutex_t lock;
+ isc_mutex_t *fdlock;
+#ifdef USE_KQUEUE
+ int kqueue_fd;
+ int nevents;
+ struct kevent *events;
+#endif /* USE_KQUEUE */
+#ifdef USE_EPOLL
+ int epoll_fd;
+ int nevents;
+ struct epoll_event *events;
+#endif /* USE_EPOLL */
+#ifdef USE_DEVPOLL
+ int devpoll_fd;
+ int nevents;
+ struct pollfd *events;
+#endif /* USE_DEVPOLL */
+#ifdef USE_SELECT
int fd_bufsize;
- int fdsize;
+#endif /* USE_SELECT */
+ unsigned int maxsocks;
+#ifdef ISC_PLATFORM_USETHREADS
+ int pipe_fds[2];
+#endif
+
+ /* Locked by fdlock. */
+ isc_socket_t **fds;
+ int *fdstate;
+#ifdef USE_DEVPOLL
+ pollinfo_t *fdpollinfo;
+#endif
+
/* Locked by manager lock. */
ISC_LIST(isc_socket_t) socklist;
+#ifdef USE_SELECT
fd_set *read_fds;
fd_set *read_fds_copy;
fd_set *write_fds;
fd_set *write_fds_copy;
- isc_socket_t **fds;
- int *fdstate;
int maxfd;
- int reserved; /* unlocked */
+#endif /* USE_SELECT */
+ int reserved; /* unlocked */
#ifdef ISC_PLATFORM_USETHREADS
isc_thread_t watcher;
isc_condition_t shutdown_ok;
- int pipe_fds[2];
#else /* ISC_PLATFORM_USETHREADS */
unsigned int refs;
#endif /* ISC_PLATFORM_USETHREADS */
@@ -284,8 +413,9 @@ static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
struct msghdr *, struct iovec *, size_t *);
static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
struct msghdr *, struct iovec *, size_t *);
-static void cleanup_fdsets(isc_socketmgr_t *, isc_mem_t *);
-static isc_result_t create_fdsets(isc_socketmgr_t *, isc_mem_t *);
+#ifdef ISC_PLATFORM_USETHREADS
+static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager);
+#endif
#define SELECT_POKE_SHUTDOWN (-1)
#define SELECT_POKE_NOTHING (-2)
@@ -354,9 +484,195 @@ socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
}
}
+#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
+ defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
+/*
+ * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
+ * setting IPV6_V6ONLY.
+ */
+static void
+FIX_IPV6_RECVPKTINFO(isc_socket_t *sock)
+{
+ char strbuf[ISC_STRERRORSIZE];
+ int on = 1;
+
+ if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
+ return;
+
+ if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
+ (void *)&on, sizeof(on)) < 0) {
+
+ UNEXPECTED_ERROR(__FILE__, __LINE__,
+ "setsockopt(%d, IPV6_RECVPKTINFO) "
+ "%s: %s", sock->fd,
+ isc_msgcat_get(isc_msgcat,
+ ISC_MSGSET_GENERAL,
+ ISC_MSG_FAILED,
+ "failed"),
+ strbuf);
+ }
+}
+#else
+#define FIX_IPV6_RECVPKTINFO(sock) (void)0
+#endif
+
+static inline isc_result_t
+watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
+ isc_result_t result = ISC_R_SUCCESS;
+
+#ifdef USE_KQUEUE
+ struct kevent evchange;
+
+ memset(&evchange, 0, sizeof(evchange));
+ if (msg == SELECT_POKE_READ)
+ evchange.filter = EVFILT_READ;
+ else
+ evchange.filter = EVFILT_WRITE;
+ evchange.flags = EV_ADD;
+ evchange.ident = fd;
+ if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
+ result = isc__errno2result(errno);
+
+ return (result);
+#elif defined(USE_EPOLL)
+ struct epoll_event event;
+
+ if (msg == SELECT_POKE_READ)
+ event.events = EPOLLIN;
+ else
+ event.events = EPOLLOUT;
+ event.data.fd = fd;
+ if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
+ errno != EEXIST) {
+ result = isc__errno2result(errno);
+ }
+
+ return (result);
+#elif defined(USE_DEVPOLL)
+ struct pollfd pfd;
+ int lockid = FDLOCK_ID(fd);
+
+ memset(&pfd, 0, sizeof(pfd));
+ if (msg == SELECT_POKE_READ)
+ pfd.events = POLLIN;
+ else
+ pfd.events = POLLOUT;
+ pfd.fd = fd;
+ pfd.revents = 0;
+ LOCK(&manager->fdlock[lockid]);
+ if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
+ result = isc__errno2result(errno);
+ else {
+ if (msg == SELECT_POKE_READ)
+ manager->fdpollinfo[fd].want_read = 1;
+ else
+ manager->fdpollinfo[fd].want_write = 1;
+ }
+ UNLOCK(&manager->fdlock[lockid]);
+
+ return (result);
+#elif defined(USE_SELECT)
+ LOCK(&manager->lock);
+ if (msg == SELECT_POKE_READ)
+ FD_SET(fd, manager->read_fds);
+ if (msg == SELECT_POKE_WRITE)
+ FD_SET(fd, manager->write_fds);
+ UNLOCK(&manager->lock);
+
+ return (result);
+#endif
+}
+
+static inline isc_result_t
+unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
+ isc_result_t result = ISC_R_SUCCESS;
+
+#ifdef USE_KQUEUE
+ struct kevent evchange;
+
+ memset(&evchange, 0, sizeof(evchange));
+ if (msg == SELECT_POKE_READ)
+ evchange.filter = EVFILT_READ;
+ else
+ evchange.filter = EVFILT_WRITE;
+ evchange.flags = EV_DELETE;
+ evchange.ident = fd;
+ if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
+ result = isc__errno2result(errno);
+
+ return (result);
+#elif defined(USE_EPOLL)
+ struct epoll_event event;
+
+ if (msg == SELECT_POKE_READ)
+ event.events = EPOLLIN;
+ else
+ event.events = EPOLLOUT;
+ event.data.fd = fd;
+ if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
+ errno != ENOENT) {
+ char strbuf[ISC_STRERRORSIZE];
+ isc__strerror(errno, strbuf, sizeof(strbuf));
+ UNEXPECTED_ERROR(__FILE__, __LINE__,
+ "epoll_ctl(DEL), %d: %s", fd, strbuf);
+ result = ISC_R_UNEXPECTED;
+ }
+ return (result);
+#elif defined(USE_DEVPOLL)
+ struct pollfd pfds[2];
+ size_t writelen = sizeof(pfds[0]);
+ int lockid = FDLOCK_ID(fd);
+
+ memset(pfds, 0, sizeof(pfds));
+ pfds[0].events = POLLREMOVE;
+ pfds[0].fd = fd;
+
+ /*
+ * Canceling read or write polling via /dev/poll is tricky. Since it
+ * only provides a way of canceling per FD, we may need to re-poll the
+ * socket for the other operation.
+ */
+ LOCK(&manager->fdlock[lockid]);
+ if (msg == SELECT_POKE_READ &&
+ manager->fdpollinfo[fd].want_write == 1) {
+ pfds[1].events = POLLOUT;
+ pfds[1].fd = fd;
+ writelen += sizeof(pfds[1]);
+ }
+ if (msg == SELECT_POKE_WRITE &&
+ manager->fdpollinfo[fd].want_read == 1) {
+ pfds[1].events = POLLIN;
+ pfds[1].fd = fd;
+ writelen += sizeof(pfds[1]);
+ }
+
+ if (write(manager->devpoll_fd, pfds, writelen) == -1)
+ result = isc__errno2result(errno);
+ else {
+ if (msg == SELECT_POKE_READ)
+ manager->fdpollinfo[fd].want_read = 0;
+ else
+ manager->fdpollinfo[fd].want_write = 0;
+ }
+ UNLOCK(&manager->fdlock[lockid]);
+
+ return (result);
+#elif defined(USE_SELECT)
+ LOCK(&manager->lock);
+ if (msg == SELECT_POKE_READ)
+ FD_CLR(fd, manager->read_fds);
+ else if (msg == SELECT_POKE_WRITE)
+ FD_CLR(fd, manager->write_fds);
+ UNLOCK(&manager->lock);
+
+ return (result);
+#endif
+}
+
static void
wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
- isc_socket_t *sock;
+ isc_result_t result;
+ int lockid = FDLOCK_ID(fd);
/*
* This is a wakeup on a socket. If the socket is not in the
@@ -364,27 +680,54 @@ wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
* or writes.
*/
- INSIST(fd >= 0 && fd < manager->fdsize);
+ INSIST(fd >= 0 && fd < (int)manager->maxsocks);
- if (manager->fdstate[fd] == CLOSE_PENDING) {
+ if (msg == SELECT_POKE_CLOSE) {
+ /* No one should be updating fdstate, so no need to lock it */
+ INSIST(manager->fdstate[fd] == CLOSE_PENDING);
manager->fdstate[fd] = CLOSED;
- FD_CLR(fd, manager->read_fds);
- FD_CLR(fd, manager->write_fds);
+ (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
+ (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
(void)close(fd);
return;
}
- if (manager->fdstate[fd] != MANAGED)
- return;
- sock = manager->fds[fd];
+ LOCK(&manager->fdlock[lockid]);
+ if (manager->fdstate[fd] == CLOSE_PENDING) {
+ UNLOCK(&manager->fdlock[lockid]);
+ /*
+ * We accept (and ignore) any error from unwatch_fd() as we are
+ * closing the socket, hoping it doesn't leave dangling state in
+ * the kernel.
+ * Note that unwatch_fd() must be called after releasing the
+ * fdlock; otherwise it could cause deadlock due to a lock order
+ * reversal.
+ */
+ (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
+ (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
+ return;
+ }
+ if (manager->fdstate[fd] != MANAGED) {
+ UNLOCK(&manager->fdlock[lockid]);
+ return;
+ }
+ UNLOCK(&manager->fdlock[lockid]);
/*
* Set requested bit.
*/
- if (msg == SELECT_POKE_READ)
- FD_SET(sock->fd, manager->read_fds);
- if (msg == SELECT_POKE_WRITE)
- FD_SET(sock->fd, manager->write_fds);
+ result = watch_fd(manager, fd, msg);
+ if (result != ISC_R_SUCCESS) {
+ /*
+ * XXXJT: what should we do? Ignoring the failure of watching
+ * a socket will make the application dysfunctional, but there
+ * seems to be no reasonable recovery process.
+ */
+ isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
+ ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
+ "failed to start watching FD (%d): %s",
+ fd, isc_result_totext(result));
+ }
}
#ifdef ISC_PLATFORM_USETHREADS
@@ -452,7 +795,7 @@ select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
"read() failed "
"during watcher poke: %s"),
strbuf);
-
+
return;
}
INSIST(cc == sizeof(buf));
@@ -557,7 +900,7 @@ cmsg_space(ISC_SOCKADDR_LEN_T len) {
return ((char *)cmsgp - (char *)msg.msg_control);
else
return (0);
-#endif
+#endif
}
#endif /* USE_CMSG */
@@ -631,7 +974,7 @@ process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
"interface received on ifindex %u",
dev->pktinfo.ipi6_ifindex);
if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
- dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
+ dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
goto next;
}
#endif
@@ -679,7 +1022,7 @@ build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
memset(msg, 0, sizeof(*msg));
- if (sock->type == isc_sockettype_udp) {
+ if (!sock->connected) {
msg->msg_name = (void *)&dev->address.type.sa;
msg->msg_namelen = dev->address.length;
} else {
@@ -964,15 +1307,17 @@ dump_msg(struct msghdr *msg) {
unsigned int i;
printf("MSGHDR %p\n", msg);
- printf("\tname %p, namelen %d\n", msg->msg_name, msg->msg_namelen);
- printf("\tiov %p, iovlen %d\n", msg->msg_iov, msg->msg_iovlen);
+ printf("\tname %p, namelen %ld\n", msg->msg_name,
+ (long) msg->msg_namelen);
+ printf("\tiov %p, iovlen %ld\n", msg->msg_iov,
+ (long) msg->msg_iovlen);
for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
- printf("\t\t%d\tbase %p, len %d\n", i,
+ printf("\t\t%d\tbase %p, len %ld\n", i,
msg->msg_iov[i].iov_base,
- msg->msg_iov[i].iov_len);
+ (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
- printf("\tcontrol %p, controllen %d\n", msg->msg_control,
- msg->msg_controllen);
+ printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
+ (long) msg->msg_controllen);
#endif
}
#endif
@@ -1014,7 +1359,7 @@ doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
isc__strerror(recv_errno, strbuf, sizeof(strbuf));
socket_log(sock, NULL, IOEVENT,
isc_msgcat, ISC_MSGSET_SOCKET,
- ISC_MSG_DOIORECV,
+ ISC_MSG_DOIORECV,
"doio_recv: recvmsg(%d) %d bytes, err %d/%s",
sock->fd, cc, recv_errno, strbuf);
}
@@ -1040,6 +1385,14 @@ doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
/* HPUX 11.11 can return EADDRNOTAVAIL. */
SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
+ /*
+ * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
+ * errors.
+ */
+#ifdef EPROTO
+ SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
+#endif
+ SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
#undef SOFT_OR_HARD
#undef ALWAYS_HARD
@@ -1062,7 +1415,7 @@ doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
socket_log(sock, &dev->address, IOEVENT,
isc_msgcat, ISC_MSGSET_SOCKET,
- ISC_MSG_ZEROPORT,
+ ISC_MSG_ZEROPORT,
"dropping source port zero packet");
}
return (DOIO_SOFT);
@@ -1245,7 +1598,54 @@ doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
* references exist.
*/
static void
+closesocket(isc_socketmgr_t *manager, isc_sockettype_t type, int fd) {
+ int lockid = FDLOCK_ID(fd);
+
+ UNUSED(type);
+
+ /*
+ * No one has this socket open, so the watcher doesn't have to be
+ * poked, and the socket doesn't have to be locked.
+ */
+ LOCK(&manager->fdlock[lockid]);
+ manager->fds[fd] = NULL;
+ manager->fdstate[fd] = CLOSE_PENDING;
+ UNLOCK(&manager->fdlock[lockid]);
+ select_poke(manager, fd, SELECT_POKE_CLOSE);
+
+ /*
+ * update manager->maxfd here (XXX: this should be implemented more
+ * efficiently)
+ */
+#ifdef USE_SELECT
+ LOCK(&manager->lock);
+ if (manager->maxfd == fd) {
+ int i;
+
+ manager->maxfd = 0;
+ for (i = fd - 1; i >= 0; i--) {
+ lockid = FDLOCK_ID(i);
+
+ LOCK(&manager->fdlock[lockid]);
+ if (manager->fdstate[i] == MANAGED) {
+ manager->maxfd = i;
+ UNLOCK(&manager->fdlock[lockid]);
+ break;
+ }
+ UNLOCK(&manager->fdlock[lockid]);
+ }
+#ifdef ISC_PLATFORM_USETHREADS
+ if (manager->maxfd < manager->pipe_fds[0])
+ manager->maxfd = manager->pipe_fds[0];
+#endif
+ }
+ UNLOCK(&manager->lock);
+#endif /* USE_SELECT */
+}
+
+static void
destroy(isc_socket_t **sockp) {
+ int fd;
isc_socket_t *sock = *sockp;
isc_socketmgr_t *manager = sock->manager;
@@ -1256,17 +1656,16 @@ destroy(isc_socket_t **sockp) {
INSIST(ISC_LIST_EMPTY(sock->recv_list));
INSIST(ISC_LIST_EMPTY(sock->send_list));
INSIST(sock->connect_ev == NULL);
- REQUIRE(sock->fd >= 0 && sock->fd < (int)manager->fdsize);
+ REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);
+
+ if (sock->fd >= 0) {
+ fd = sock->fd;
+ sock->fd = -1;
+ closesocket(manager, sock->type, fd);
+ }
LOCK(&manager->lock);
- /*
- * No one has this socket open, so the watcher doesn't have to be
- * poked, and the socket doesn't have to be locked.
- */
- manager->fds[sock->fd] = NULL;
- manager->fdstate[sock->fd] = CLOSE_PENDING;
- select_poke(manager, sock->fd, SELECT_POKE_CLOSE);
ISC_LIST_UNLINK(manager->socklist, sock, link);
#ifdef ISC_PLATFORM_USETHREADS
@@ -1274,10 +1673,6 @@ destroy(isc_socket_t **sockp) {
SIGNAL(&manager->shutdown_ok);
#endif /* ISC_PLATFORM_USETHREADS */
- /*
- * XXX should reset manager->maxfd here
- */
-
UNLOCK(&manager->lock);
free_socket(sockp);
@@ -1465,18 +1860,11 @@ clear_bsdcompat(void) {
}
#endif
-/*%
- * Create a new 'type' socket managed by 'manager'. Events
- * will be posted to 'task' and when dispatched 'action' will be
- * called with 'arg' as the arg value. The new socket is returned
- * in 'socketp'.
- */
-isc_result_t
-isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
- isc_socket_t **socketp)
-{
- isc_socket_t *sock = NULL;
- isc_result_t result;
+static isc_result_t
+opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) {
+ char strbuf[ISC_STRERRORSIZE];
+ const char *err = "socket";
+ int tries = 0;
#if defined(USE_CMSG) || defined(SO_BSDCOMPAT)
int on = 1;
#endif
@@ -1484,38 +1872,27 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
ISC_SOCKADDR_LEN_T optlen;
int size;
#endif
- char strbuf[ISC_STRERRORSIZE];
- const char *err = "socket";
- int try = 0;
- REQUIRE(VALID_MANAGER(manager));
- REQUIRE(socketp != NULL && *socketp == NULL);
-
- result = allocate_socket(manager, type, &sock);
- if (result != ISC_R_SUCCESS)
- return (result);
-
- sock->pf = pf;
again:
- switch (type) {
+ switch (sock->type) {
case isc_sockettype_udp:
- sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
+ sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
break;
case isc_sockettype_tcp:
- sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
+ sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
break;
case isc_sockettype_unix:
- sock->fd = socket(pf, SOCK_STREAM, 0);
+ sock->fd = socket(sock->pf, SOCK_STREAM, 0);
break;
}
- if (sock->fd == -1 && errno == EINTR && try++ < 42)
+ if (sock->fd == -1 && errno == EINTR && tries++ < 42)
goto again;
#ifdef F_DUPFD
/*
* Leave a space for stdio and TCP to work in.
*/
- if (manager->reserved != 0 && type == isc_sockettype_udp &&
+ if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
sock->fd >= 0 && sock->fd < manager->reserved) {
int new, tmp;
new = fcntl(sock->fd, F_DUPFD, manager->reserved);
@@ -1535,20 +1912,18 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
}
#endif
- if (sock->fd >= (int)manager->fdsize) {
+ if (sock->fd >= (int)manager->maxsocks) {
(void)close(sock->fd);
isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
isc_msgcat, ISC_MSGSET_SOCKET,
ISC_MSG_TOOMANYFDS,
- "%s: too many open file descriptors", "socket");
- free_socket(&sock);
+ "socket: file descriptor exceeds limit (%d/%u)",
+ sock->fd, manager->maxsocks);
return (ISC_R_NORESOURCES);
}
-
- if (sock->fd < 0) {
- free_socket(&sock);
+ if (sock->fd < 0) {
switch (errno) {
case EMFILE:
case ENFILE:
@@ -1580,14 +1955,13 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
(void)close(sock->fd);
- free_socket(&sock);
return (ISC_R_UNEXPECTED);
}
#ifdef SO_BSDCOMPAT
RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
clear_bsdcompat) == ISC_R_SUCCESS);
- if (type != isc_sockettype_unix && bsdcompat &&
+ if (sock->type != isc_sockettype_unix && bsdcompat &&
setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
(void *)&on, sizeof(on)) < 0) {
isc__strerror(errno, strbuf, sizeof(strbuf));
@@ -1601,8 +1975,22 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
}
#endif
+#ifdef SO_NOSIGPIPE
+ if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
+ (void *)&on, sizeof(on)) < 0) {
+ isc__strerror(errno, strbuf, sizeof(strbuf));
+ UNEXPECTED_ERROR(__FILE__, __LINE__,
+ "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
+ sock->fd,
+ isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
+ ISC_MSG_FAILED, "failed"),
+ strbuf);
+ /* Press on... */
+ }
+#endif
+
#if defined(USE_CMSG) || defined(SO_RCVBUF)
- if (type == isc_sockettype_udp) {
+ if (sock->type == isc_sockettype_udp) {
#if defined(USE_CMSG)
#if defined(SO_TIMESTAMP)
@@ -1612,7 +2000,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
isc__strerror(errno, strbuf, sizeof(strbuf));
UNEXPECTED_ERROR(__FILE__, __LINE__,
"setsockopt(%d, SO_TIMESTAMP) %s: %s",
- sock->fd,
+ sock->fd,
isc_msgcat_get(isc_msgcat,
ISC_MSGSET_GENERAL,
ISC_MSG_FAILED,
@@ -1623,7 +2011,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
#endif /* SO_TIMESTAMP */
#if defined(ISC_PLATFORM_HAVEIPV6)
- if (pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
+ if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
/*
* Warn explicitly because this anomaly can be hidden
* in usual operation (and unexpectedly appear later).
@@ -1635,7 +2023,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
#ifdef IPV6_RECVPKTINFO
/* RFC 3542 */
- if ((pf == AF_INET6)
+ if ((sock->pf == AF_INET6)
&& (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
(void *)&on, sizeof(on)) < 0)) {
isc__strerror(errno, strbuf, sizeof(strbuf));
@@ -1650,7 +2038,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
}
#else
/* RFC 2292 */
- if ((pf == AF_INET6)
+ if ((sock->pf == AF_INET6)
&& (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
(void *)&on, sizeof(on)) < 0)) {
isc__strerror(errno, strbuf, sizeof(strbuf));
@@ -1667,7 +2055,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
#endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
#ifdef IPV6_USE_MIN_MTU /* RFC 3542, not too common yet*/
/* use minimum MTU */
- if (pf == AF_INET6) {
+ if (sock->pf == AF_INET6) {
(void)setsockopt(sock->fd, IPPROTO_IPV6,
IPV6_USE_MIN_MTU,
(void *)&on, sizeof(on));
@@ -1676,6 +2064,27 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
#endif /* ISC_PLATFORM_HAVEIPV6 */
#endif /* defined(USE_CMSG) */
+#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
+ /*
+ * Turn off Path MTU discovery on IPv4/UDP sockets.
+ */
+ if (sock->pf == AF_INET) {
+ int action = IP_PMTUDISC_DONT;
+ (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
+ &action, sizeof(action));
+ }
+#endif
+#if defined(IP_DONTFRAG)
+ /*
+ * Turn off Path MTU discovery on IPv4/UDP sockets.
+ */
+ if (sock->pf == AF_INET) {
+ int off = 0;
+ (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
+ &off, sizeof(off));
+ }
+#endif
+
#if defined(SO_RCVBUF)
optlen = sizeof(size);
if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
@@ -1699,22 +2108,61 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
}
#endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
+ return (ISC_R_SUCCESS);
+}
+
+/*%
+ * Create a new 'type' socket managed by 'manager'. Events
+ * will be posted to 'task' and when dispatched 'action' will be
+ * called with 'arg' as the arg value. The new socket is returned
+ * in 'socketp'.
+ */
+isc_result_t
+isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
+ isc_socket_t **socketp)
+{
+ isc_socket_t *sock = NULL;
+ isc_result_t result;
+ int lockid;
+
+ REQUIRE(VALID_MANAGER(manager));
+ REQUIRE(socketp != NULL && *socketp == NULL);
+
+ result = allocate_socket(manager, type, &sock);
+ if (result != ISC_R_SUCCESS)
+ return (result);
+
+ sock->pf = pf;
+ result = opensocket(manager, sock);
+ if (result != ISC_R_SUCCESS) {
+ free_socket(&sock);
+ return (result);
+ }
+
sock->references = 1;
*socketp = sock;
- LOCK(&manager->lock);
-
/*
* Note we don't have to lock the socket like we normally would because
* there are no external references to it yet.
*/
+ lockid = FDLOCK_ID(sock->fd);
+ LOCK(&manager->fdlock[lockid]);
manager->fds[sock->fd] = sock;
manager->fdstate[sock->fd] = MANAGED;
+#ifdef USE_DEVPOLL
+ INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
+ sock->manager->fdpollinfo[sock->fd].want_write == 0);
+#endif
+ UNLOCK(&manager->fdlock[lockid]);
+
+ LOCK(&manager->lock);
ISC_LIST_APPEND(manager->socklist, sock, link);
+#ifdef USE_SELECT
if (manager->maxfd < sock->fd)
manager->maxfd = sock->fd;
-
+#endif
UNLOCK(&manager->lock);
socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
@@ -1723,6 +2171,48 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
return (ISC_R_SUCCESS);
}
+isc_result_t
+isc_socket_open(isc_socket_t *sock) {
+ isc_result_t result;
+
+ REQUIRE(VALID_SOCKET(sock));
+
+ LOCK(&sock->lock);
+ REQUIRE(sock->references == 1);
+ UNLOCK(&sock->lock);
+ /*
+ * We don't need to retain the lock hereafter, since no one else has
+ * this socket.
+ */
+ REQUIRE(sock->fd == -1);
+
+ result = opensocket(sock->manager, sock);
+ if (result != ISC_R_SUCCESS)
+ sock->fd = -1;
+
+ if (result == ISC_R_SUCCESS) {
+ int lockid = FDLOCK_ID(sock->fd);
+
+ LOCK(&sock->manager->fdlock[lockid]);
+ sock->manager->fds[sock->fd] = sock;
+ sock->manager->fdstate[sock->fd] = MANAGED;
+#ifdef USE_DEVPOLL
+ INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
+ sock->manager->fdpollinfo[sock->fd].want_write == 0);
+#endif
+ UNLOCK(&sock->manager->fdlock[lockid]);
+
+#ifdef USE_SELECT
+ LOCK(&sock->manager->lock);
+ if (sock->manager->maxfd < sock->fd)
+ sock->manager->maxfd = sock->fd;
+ UNLOCK(&sock->manager->lock);
+#endif
+ }
+
+ return (result);
+}
+
/*
* Attach to a socket. Caller must explicitly detach when it is done.
*/
@@ -1764,6 +2254,44 @@ isc_socket_detach(isc_socket_t **socketp) {
*socketp = NULL;
}
+isc_result_t
+isc_socket_close(isc_socket_t *sock) {
+ int fd;
+
+ REQUIRE(VALID_SOCKET(sock));
+
+ LOCK(&sock->lock);
+ REQUIRE(sock->references == 1);
+ UNLOCK(&sock->lock);
+ /*
+ * We don't need to retain the lock hereafter, since no one else has
+ * this socket.
+ */
+
+ REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
+
+ INSIST(!sock->connecting);
+ INSIST(!sock->pending_recv);
+ INSIST(!sock->pending_send);
+ INSIST(!sock->pending_accept);
+ INSIST(ISC_LIST_EMPTY(sock->recv_list));
+ INSIST(ISC_LIST_EMPTY(sock->send_list));
+ INSIST(ISC_LIST_EMPTY(sock->accept_list));
+ INSIST(sock->connect_ev == NULL);
+
+ fd = sock->fd;
+ sock->fd = -1;
+ sock->listener = 0;
+ sock->connected = 0;
+ sock->connecting = 0;
+ sock->bound = 0;
+ isc_sockaddr_any(&sock->address);
+
+ closesocket(sock->manager, sock->type, fd);
+
+ return (ISC_R_SUCCESS);
+}
+
/*
* I/O is possible on a given socket. Schedule an event to this task that
* will call an internal function to do the I/O. This will charge the
@@ -1993,7 +2521,7 @@ internal_accept(isc_task_t *me, isc_event_t *ev) {
*/
addrlen = sizeof(dev->newsocket->address.type);
- memset(&dev->newsocket->address.type.sa, 0, addrlen);
+ memset(&dev->newsocket->address.type, 0, addrlen);
fd = accept(sock->fd, &dev->newsocket->address.type.sa,
(void *)&addrlen);
@@ -2070,19 +2598,20 @@ internal_accept(isc_task_t *me, isc_event_t *ev) {
UNEXPECTED_ERROR(__FILE__, __LINE__,
"internal_accept(): "
"accept() returned peer address "
- "family %u (expected %u)",
+ "family %u (expected %u)",
dev->newsocket->address.
type.sa.sa_family,
sock->pf);
(void)close(fd);
goto soft_error;
- } else if (fd >= (int)manager->fdsize) {
+ } else if (fd >= (int)manager->maxsocks) {
isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
isc_msgcat, ISC_MSGSET_SOCKET,
ISC_MSG_TOOMANYFDS,
- "%s: too many open file descriptors",
- "accept");
+ "accept: "
+ "file descriptor exceeds limit (%d/%u)",
+ fd, manager->maxsocks);
(void)close(fd);
goto soft_error;
}
@@ -2116,6 +2645,13 @@ internal_accept(isc_task_t *me, isc_event_t *ev) {
* -1 means the new socket didn't happen.
*/
if (fd != -1) {
+ int lockid = FDLOCK_ID(fd);
+
+ LOCK(&manager->fdlock[lockid]);
+ manager->fds[fd] = dev->newsocket;
+ manager->fdstate[fd] = MANAGED;
+ UNLOCK(&manager->fdlock[lockid]);
+
LOCK(&manager->lock);
ISC_LIST_APPEND(manager->socklist, dev->newsocket, link);
@@ -2128,10 +2664,10 @@ internal_accept(isc_task_t *me, isc_event_t *ev) {
*/
dev->address = dev->newsocket->address;
- manager->fds[fd] = dev->newsocket;
- manager->fdstate[fd] = MANAGED;
+#ifdef USE_SELECT
if (manager->maxfd < fd)
manager->maxfd = fd;
+#endif
socket_log(sock, &dev->newsocket->address, CREATION,
isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
@@ -2143,7 +2679,7 @@ internal_accept(isc_task_t *me, isc_event_t *ev) {
dev->newsocket->references--;
free_socket(&dev->newsocket);
}
-
+
/*
* Fill in the done event details and send it off.
*/
@@ -2280,77 +2816,256 @@ internal_send(isc_task_t *me, isc_event_t *ev) {
UNLOCK(&sock->lock);
}
+/*
+ * Process read/writes on each fd here. Avoid locking
+ * and unlocking twice if both reads and writes are possible.
+ */
static void
-process_fds(isc_socketmgr_t *manager, int maxfd,
- fd_set *readfds, fd_set *writefds)
+process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable,
+ isc_boolean_t writeable)
{
- int i;
isc_socket_t *sock;
isc_boolean_t unlock_sock;
-
- REQUIRE(maxfd <= (int)manager->fdsize);
+ int lockid = FDLOCK_ID(fd);
/*
- * Process read/writes on other fds here. Avoid locking
- * and unlocking twice if both reads and writes are possible.
+ * If the socket is going to be closed, don't do more I/O.
*/
- for (i = 0; i < maxfd; i++) {
+ LOCK(&manager->fdlock[lockid]);
+ if (manager->fdstate[fd] == CLOSE_PENDING) {
+ UNLOCK(&manager->fdlock[lockid]);
+
+ (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
+ (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
+ return;
+ }
+
+ sock = manager->fds[fd];
+ UNLOCK(&manager->fdlock[lockid]);
+ unlock_sock = ISC_FALSE;
+ if (readable) {
+ if (sock == NULL) {
+ (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
+ goto check_write;
+ }
+ unlock_sock = ISC_TRUE;
+ LOCK(&sock->lock);
+ if (!SOCK_DEAD(sock)) {
+ if (sock->listener)
+ dispatch_accept(sock);
+ else
+ dispatch_recv(sock);
+ }
+ (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
+ }
+check_write:
+ if (writeable) {
+ if (sock == NULL) {
+ (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
+ return;
+ }
+ if (!unlock_sock) {
+ unlock_sock = ISC_TRUE;
+ LOCK(&sock->lock);
+ }
+ if (!SOCK_DEAD(sock)) {
+ if (sock->connecting)
+ dispatch_connect(sock);
+ else
+ dispatch_send(sock);
+ }
+ (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
+ }
+ if (unlock_sock)
+ UNLOCK(&sock->lock);
+}
+
+#ifdef USE_KQUEUE
+static isc_boolean_t
+process_fds(isc_socketmgr_t *manager, struct kevent *events, int nevents) {
+ int i;
+ isc_boolean_t readable, writable;
+ isc_boolean_t done = ISC_FALSE;
#ifdef ISC_PLATFORM_USETHREADS
- if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
+ isc_boolean_t have_ctlevent = ISC_FALSE;
+#endif
+
+ if (nevents == manager->nevents) {
+ /*
+ * This is not an error, but something unexpected. If this
+ * happens, it may indicate the need for increasing
+ * ISC_SOCKET_MAXEVENTS.
+ */
+ manager_log(manager, ISC_LOGCATEGORY_GENERAL,
+ ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
+ "maximum number of FD events (%d) received",
+ nevents);
+ }
+
+ for (i = 0; i < nevents; i++) {
+ REQUIRE(events[i].ident < manager->maxsocks);
+#ifdef ISC_PLATFORM_USETHREADS
+ if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
+ have_ctlevent = ISC_TRUE;
continue;
-#endif /* ISC_PLATFORM_USETHREADS */
+ }
+#endif
+ readable = ISC_TF(events[i].filter == EVFILT_READ);
+ writable = ISC_TF(events[i].filter == EVFILT_WRITE);
+ process_fd(manager, events[i].ident, readable, writable);
+ }
- if (manager->fdstate[i] == CLOSE_PENDING) {
- manager->fdstate[i] = CLOSED;
- FD_CLR(i, manager->read_fds);
- FD_CLR(i, manager->write_fds);
+#ifdef ISC_PLATFORM_USETHREADS
+ if (have_ctlevent)
+ done = process_ctlfd(manager);
+#endif
- (void)close(i);
+ return (done);
+}
+#elif defined(USE_EPOLL)
+static isc_boolean_t
+process_fds(isc_socketmgr_t *manager, struct epoll_event *events, int nevents) {
+ int i;
+ isc_boolean_t done = ISC_FALSE;
+#ifdef ISC_PLATFORM_USETHREADS
+ isc_boolean_t have_ctlevent = ISC_FALSE;
+#endif
+
+ if (nevents == manager->nevents) {
+ manager_log(manager, ISC_LOGCATEGORY_GENERAL,
+ ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
+ "maximum number of FD events (%d) received",
+ nevents);
+ }
+ for (i = 0; i < nevents; i++) {
+ REQUIRE(events[i].data.fd < (int)manager->maxsocks);
+#ifdef ISC_PLATFORM_USETHREADS
+ if (events[i].data.fd == manager->pipe_fds[0]) {
+ have_ctlevent = ISC_TRUE;
continue;
}
-
- sock = manager->fds[i];
- unlock_sock = ISC_FALSE;
- if (FD_ISSET(i, readfds)) {
- if (sock == NULL) {
- FD_CLR(i, manager->read_fds);
- goto check_write;
- }
- unlock_sock = ISC_TRUE;
- LOCK(&sock->lock);
- if (!SOCK_DEAD(sock)) {
- if (sock->listener)
- dispatch_accept(sock);
- else
- dispatch_recv(sock);
- }
- FD_CLR(i, manager->read_fds);
+#endif
+ if ((events[i].events & EPOLLERR) != 0 ||
+ (events[i].events & EPOLLHUP) != 0) {
+ /*
+ * epoll does not set IN/OUT bits on an erroneous
+ * condition, so we need to try both anyway. This is a
+ * bit inefficient, but should be okay for such rare
+ * events. Note also that the read or write attempt
+ * won't block because we use non-blocking sockets.
+ */
+ events[i].events |= (EPOLLIN | EPOLLOUT);
}
- check_write:
- if (FD_ISSET(i, writefds)) {
- if (sock == NULL) {
- FD_CLR(i, manager->write_fds);
- continue;
- }
- if (!unlock_sock) {
- unlock_sock = ISC_TRUE;
- LOCK(&sock->lock);
- }
- if (!SOCK_DEAD(sock)) {
- if (sock->connecting)
- dispatch_connect(sock);
- else
- dispatch_send(sock);
- }
- FD_CLR(i, manager->write_fds);
+ process_fd(manager, events[i].data.fd,
+ (events[i].events & EPOLLIN) != 0,
+ (events[i].events & EPOLLOUT) != 0);
+ }
+
+#ifdef ISC_PLATFORM_USETHREADS
+ if (have_ctlevent)
+ done = process_ctlfd(manager);
+#endif
+
+ return (done);
+}
+#elif defined(USE_DEVPOLL)
+static isc_boolean_t
+process_fds(isc_socketmgr_t *manager, struct pollfd *events, int nevents) {
+ int i;
+ isc_boolean_t done = ISC_FALSE;
+#ifdef ISC_PLATFORM_USETHREADS
+ isc_boolean_t have_ctlevent = ISC_FALSE;
+#endif
+
+ if (nevents == manager->nevents) {
+ manager_log(manager, ISC_LOGCATEGORY_GENERAL,
+ ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
+ "maximum number of FD events (%d) received",
+ nevents);
+ }
+
+ for (i = 0; i < nevents; i++) {
+ REQUIRE(events[i].fd < (int)manager->maxsocks);
+#ifdef ISC_PLATFORM_USETHREADS
+ if (events[i].fd == manager->pipe_fds[0]) {
+ have_ctlevent = ISC_TRUE;
+ continue;
}
- if (unlock_sock)
- UNLOCK(&sock->lock);
+#endif
+ process_fd(manager, events[i].fd,
+ (events[i].events & POLLIN) != 0,
+ (events[i].events & POLLOUT) != 0);
+ }
+
+#ifdef ISC_PLATFORM_USETHREADS
+ if (have_ctlevent)
+ done = process_ctlfd(manager);
+#endif
+
+ return (done);
+}
+#elif defined(USE_SELECT)
+static void
+process_fds(isc_socketmgr_t *manager, int maxfd,
+ fd_set *readfds, fd_set *writefds)
+{
+ int i;
+
+ REQUIRE(maxfd <= (int)manager->maxsocks);
+
+ for (i = 0; i < maxfd; i++) {
+#ifdef ISC_PLATFORM_USETHREADS
+ if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
+ continue;
+#endif /* ISC_PLATFORM_USETHREADS */
+ process_fd(manager, i, FD_ISSET(i, readfds),
+ FD_ISSET(i, writefds));
}
}
+#endif
#ifdef ISC_PLATFORM_USETHREADS
+static isc_boolean_t
+process_ctlfd(isc_socketmgr_t *manager) {
+ int msg, fd;
+
+ for (;;) {
+ select_readmsg(manager, &fd, &msg);
+
+ manager_log(manager, IOEVENT,
+ isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
+ ISC_MSG_WATCHERMSG,
+ "watcher got message %d "
+ "for socket %d"), msg, fd);
+
+ /*
+ * Nothing to read?
+ */
+ if (msg == SELECT_POKE_NOTHING)
+ break;
+
+ /*
+ * Handle shutdown message. We really should
+ * jump out of this loop right away, but
+ * it doesn't matter if we have to do a little
+ * more work first.
+ */
+ if (msg == SELECT_POKE_SHUTDOWN)
+ return (ISC_TRUE);
+
+ /*
+ * This is a wakeup on a socket. Look
+ * at the event queue for both read and write,
+ * and decide if we need to watch on it now
+ * or not.
+ */
+ wakeup_socket(manager, fd, msg);
+ }
+
+ return (ISC_FALSE);
+}
+
/*
* This is the thread that will loop forever, always in a select or poll
* call.
@@ -2364,98 +3079,116 @@ watcher(void *uap) {
isc_boolean_t done;
int ctlfd;
int cc;
- int msg, fd;
+#ifdef USE_KQUEUE
+ const char *fnname = "kevent()";
+#elif defined (USE_EPOLL)
+ const char *fnname = "epoll_wait()";
+#elif defined(USE_DEVPOLL)
+ const char *fnname = "ioctl(DP_POLL)";
+ struct dvpoll dvp;
+#elif defined (USE_SELECT)
+ const char *fnname = "select()";
int maxfd;
+#endif
char strbuf[ISC_STRERRORSIZE];
+#ifdef ISC_SOCKET_USE_POLLWATCH
+ pollstate_t pollstate = poll_idle;
+#endif
/*
* Get the control fd here. This will never change.
*/
- LOCK(&manager->lock);
ctlfd = manager->pipe_fds[0];
-
done = ISC_FALSE;
while (!done) {
do {
+#ifdef USE_KQUEUE
+ cc = kevent(manager->kqueue_fd, NULL, 0,
+ manager->events, manager->nevents, NULL);
+#elif defined(USE_EPOLL)
+ cc = epoll_wait(manager->epoll_fd, manager->events,
+ manager->nevents, -1);
+#elif defined(USE_DEVPOLL)
+ dvp.dp_fds = manager->events;
+ dvp.dp_nfds = manager->nevents;
+#ifndef ISC_SOCKET_USE_POLLWATCH
+ dvp.dp_timeout = -1;
+#else
+ if (pollstate == poll_idle)
+ dvp.dp_timeout = -1;
+ else
+ dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT;
+#endif /* ISC_SOCKET_USE_POLLWATCH */
+ cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
+#elif defined(USE_SELECT)
+ LOCK(&manager->lock);
memcpy(manager->read_fds_copy, manager->read_fds,
manager->fd_bufsize);
memcpy(manager->write_fds_copy, manager->write_fds,
manager->fd_bufsize);
maxfd = manager->maxfd + 1;
-
UNLOCK(&manager->lock);
cc = select(maxfd, manager->read_fds_copy,
manager->write_fds_copy, NULL, NULL);
- if (cc < 0) {
- if (!SOFT_ERROR(errno)) {
- isc__strerror(errno, strbuf,
- sizeof(strbuf));
- FATAL_ERROR(__FILE__, __LINE__,
- "select() %s: %s",
- isc_msgcat_get(isc_msgcat,
- ISC_MSGSET_GENERAL,
- ISC_MSG_FAILED,
- "failed"),
- strbuf);
- }
+#endif /* USE_KQUEUE */
+
+ if (cc < 0 && !SOFT_ERROR(errno)) {
+ isc__strerror(errno, strbuf, sizeof(strbuf));
+ FATAL_ERROR(__FILE__, __LINE__,
+ "%s %s: %s", fnname,
+ isc_msgcat_get(isc_msgcat,
+ ISC_MSGSET_GENERAL,
+ ISC_MSG_FAILED,
+ "failed"), strbuf);
}
- LOCK(&manager->lock);
+#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
+ if (cc == 0) {
+ if (pollstate == poll_active)
+ pollstate = poll_checking;
+ else if (pollstate == poll_checking)
+ pollstate = poll_idle;
+ } else if (cc > 0) {
+ if (pollstate == poll_checking) {
+ /*
+ * XXX: We'd like to use a more
+ * verbose log level as it's actually an
+ * unexpected event, but the kernel bug
+ * reportedly happens pretty frequently
+ * (and it can also be a false positive)
+ * so it would be just too noisy.
+ */
+ manager_log(manager,
+ ISC_LOGCATEGORY_GENERAL,
+ ISC_LOGMODULE_SOCKET,
+ ISC_LOG_DEBUG(1),
+ ISC_LOG_INFO,
+ "unexpected POLL timeout");
+ }
+ pollstate = poll_active;
+ }
+#endif
} while (cc < 0);
+#if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
+ done = process_fds(manager, manager->events, cc);
+#elif defined(USE_SELECT)
+ process_fds(manager, maxfd, manager->read_fds_copy,
+ manager->write_fds_copy);
/*
* Process reads on internal, control fd.
*/
- if (FD_ISSET(ctlfd, manager->read_fds_copy)) {
- for (;;) {
- select_readmsg(manager, &fd, &msg);
-
- manager_log(manager, IOEVENT,
- isc_msgcat_get(isc_msgcat,
- ISC_MSGSET_SOCKET,
- ISC_MSG_WATCHERMSG,
- "watcher got message %d"),
- msg);
-
- /*
- * Nothing to read?
- */
- if (msg == SELECT_POKE_NOTHING)
- break;
-
- /*
- * Handle shutdown message. We really should
- * jump out of this loop right away, but
- * it doesn't matter if we have to do a little
- * more work first.
- */
- if (msg == SELECT_POKE_SHUTDOWN) {
- done = ISC_TRUE;
-
- break;
- }
-
- /*
- * This is a wakeup on a socket. Look
- * at the event queue for both read and write,
- * and decide if we need to watch on it now
- * or not.
- */
- wakeup_socket(manager, fd, msg);
- }
- }
-
- process_fds(manager, maxfd, manager->read_fds_copy,
- manager->write_fds_copy);
+ if (FD_ISSET(ctlfd, manager->read_fds_copy))
+ done = process_ctlfd(manager);
+#endif
}
manager_log(manager, TRACE,
isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
ISC_MSG_EXITING, "watcher exiting"));
- UNLOCK(&manager->lock);
return ((isc_threadresult_t)0);
}
#endif /* ISC_PLATFORM_USETHREADS */
@@ -2469,69 +3202,187 @@ isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) {
}
/*
- * Initialize fdsets in socketmgr structure.
+ * Create a new socket manager.
*/
+
static isc_result_t
-create_fdsets(isc_socketmgr_t *manager, isc_mem_t *mctx) {
-#if ISC_SOCKET_FDSETSIZE > FD_SETSIZE
- manager->fdsize = ISC_SOCKET_FDSETSIZE;
- manager->fd_bufsize = howmany(ISC_SOCKET_FDSETSIZE, NFDBITS) *
+setup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
+ isc_result_t result;
+
+#ifdef USE_KQUEUE
+ manager->nevents = ISC_SOCKET_MAXEVENTS;
+ manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
+ manager->nevents);
+ if (manager->events == NULL)
+ return (ISC_R_NOMEMORY);
+ manager->kqueue_fd = kqueue();
+ if (manager->kqueue_fd == -1) {
+ result = isc__errno2result(errno);
+ isc_mem_put(mctx, manager->events,
+ sizeof(struct kevent) * manager->nevents);
+ return (result);
+ }
+
+#ifdef ISC_PLATFORM_USETHREADS
+ result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
+ if (result != ISC_R_SUCCESS) {
+ close(manager->kqueue_fd);
+ isc_mem_put(mctx, manager->events,
+ sizeof(struct kevent) * manager->nevents);
+ return (result);
+ }
+#endif /* ISC_PLATFORM_USETHREADS */
+#elif defined(USE_EPOLL)
+ manager->nevents = ISC_SOCKET_MAXEVENTS;
+ manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
+ manager->nevents);
+ if (manager->events == NULL)
+ return (ISC_R_NOMEMORY);
+ manager->epoll_fd = epoll_create(manager->nevents);
+ if (manager->epoll_fd == -1) {
+ result = isc__errno2result(errno);
+ isc_mem_put(mctx, manager->events,
+ sizeof(struct epoll_event) * manager->nevents);
+ return (result);
+ }
+#ifdef ISC_PLATFORM_USETHREADS
+ result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
+ if (result != ISC_R_SUCCESS) {
+ close(manager->epoll_fd);
+ isc_mem_put(mctx, manager->events,
+ sizeof(struct epoll_event) * manager->nevents);
+ return (result);
+ }
+#endif /* ISC_PLATFORM_USETHREADS */
+#elif defined(USE_DEVPOLL)
+ /*
+ * XXXJT: /dev/poll seems to reject large numbers of events,
+ * so we should be careful about redefining ISC_SOCKET_MAXEVENTS.
+ */
+ manager->nevents = ISC_SOCKET_MAXEVENTS;
+ manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
+ manager->nevents);
+ if (manager->events == NULL)
+ return (ISC_R_NOMEMORY);
+ /*
+ * Note: fdpollinfo should be able to support all possible FDs, so
+ * it must have maxsocks entries (not nevents).
+ */
+ manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
+ manager->maxsocks);
+ if (manager->fdpollinfo == NULL) {
+ isc_mem_put(mctx, manager->events,
+ sizeof(pollinfo_t) * manager->maxsocks);
+ return (ISC_R_NOMEMORY);
+ }
+ memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
+ manager->devpoll_fd = open("/dev/poll", O_RDWR);
+ if (manager->devpoll_fd == -1) {
+ result = isc__errno2result(errno);
+ isc_mem_put(mctx, manager->events,
+ sizeof(struct pollfd) * manager->nevents);
+ isc_mem_put(mctx, manager->fdpollinfo,
+ sizeof(pollinfo_t) * manager->maxsocks);
+ return (result);
+ }
+#ifdef ISC_PLATFORM_USETHREADS
+ result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
+ if (result != ISC_R_SUCCESS) {
+ close(manager->devpoll_fd);
+ isc_mem_put(mctx, manager->events,
+ sizeof(struct pollfd) * manager->nevents);
+ isc_mem_put(mctx, manager->fdpollinfo,
+ sizeof(pollinfo_t) * manager->maxsocks);
+ return (result);
+ }
+#endif /* ISC_PLATFORM_USETHREADS */
+#elif defined(USE_SELECT)
+ UNUSED(result);
+
+#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
+ /*
+ * Note: this code should also cover the case of MAXSOCKETS <=
+ * FD_SETSIZE, but we separate the cases to avoid possible portability
+ * issues regarding howmany() and the actual representation of fd_set.
+ */
+ manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
sizeof(fd_mask);
#else
- manager->fdsize = FD_SETSIZE;
manager->fd_bufsize = sizeof(fd_set);
#endif
- manager->fds = NULL;
- manager->fdstate = NULL;
manager->read_fds = NULL;
manager->read_fds_copy = NULL;
manager->write_fds = NULL;
manager->write_fds_copy = NULL;
- manager->fds = isc_mem_get(mctx,
- manager->fdsize * sizeof(manager->fds[0]));
- if (manager->fds == NULL)
- goto fail;
-
- manager->fdstate = isc_mem_get(mctx, manager->fdsize *
- sizeof(manager->fdstate[0]));
- if (manager->fdstate == NULL)
- goto fail;
-
manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
- if (manager->read_fds == NULL)
- goto fail;
- manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
- if (manager->read_fds_copy == NULL)
- goto fail;
- manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
- if (manager->write_fds == NULL)
- goto fail;
- manager->write_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
- if (manager->write_fds_copy == NULL)
- goto fail;
+ if (manager->read_fds != NULL)
+ manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
+ if (manager->read_fds_copy != NULL)
+ manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
+ if (manager->write_fds != NULL) {
+ manager->write_fds_copy = isc_mem_get(mctx,
+ manager->fd_bufsize);
+ }
+ if (manager->write_fds_copy == NULL) {
+ if (manager->write_fds != NULL) {
+ isc_mem_put(mctx, manager->write_fds,
+ manager->fd_bufsize);
+ }
+ if (manager->read_fds_copy != NULL) {
+ isc_mem_put(mctx, manager->read_fds_copy,
+ manager->fd_bufsize);
+ }
+ if (manager->read_fds != NULL) {
+ isc_mem_put(mctx, manager->read_fds,
+ manager->fd_bufsize);
+ }
+ return (ISC_R_NOMEMORY);
+ }
+ memset(manager->read_fds, 0, manager->fd_bufsize);
+ memset(manager->write_fds, 0, manager->fd_bufsize);
- return (ISC_R_SUCCESS);
+#ifdef ISC_PLATFORM_USETHREADS
+ (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
+ manager->maxfd = manager->pipe_fds[0];
+#else /* ISC_PLATFORM_USETHREADS */
+ manager->maxfd = 0;
+#endif /* ISC_PLATFORM_USETHREADS */
+#endif /* USE_KQUEUE */
- fail:
- cleanup_fdsets(manager, mctx);
- return (ISC_R_NOMEMORY);
+ return (ISC_R_SUCCESS);
}
-/*
- * Clean up fdsets in socketmgr structure.
- */
static void
-cleanup_fdsets(isc_socketmgr_t *manager, isc_mem_t *mctx) {
- if (manager->fds != NULL) {
- isc_mem_put(mctx, manager->fds,
- manager->fdsize * sizeof(manager->fds[0]));
- }
- if (manager->fdstate != NULL) {
- isc_mem_put(mctx, manager->fdstate,
- manager->fdsize * sizeof(manager->fdstate[0]));
+cleanup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) {
+#ifdef ISC_PLATFORM_USETHREADS
+ isc_result_t result;
+
+ result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
+ if (result != ISC_R_SUCCESS) {
+ UNEXPECTED_ERROR(__FILE__, __LINE__,
+ "epoll_ctl(DEL) %s",
+ isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
+ ISC_MSG_FAILED, "failed"));
}
+#endif /* ISC_PLATFORM_USETHREADS */
+
+#ifdef USE_KQUEUE
+ close(manager->kqueue_fd);
+ isc_mem_put(mctx, manager->events,
+ sizeof(struct kevent) * manager->nevents);
+#elif defined(USE_EPOLL)
+ close(manager->epoll_fd);
+ isc_mem_put(mctx, manager->events,
+ sizeof(struct epoll_event) * manager->nevents);
+#elif defined(USE_DEVPOLL)
+ close(manager->devpoll_fd);
+ isc_mem_put(mctx, manager->events,
+ sizeof(struct pollfd) * manager->nevents);
+ isc_mem_put(mctx, manager->fdpollinfo,
+ sizeof(pollinfo_t) * manager->maxsocks);
+#elif defined(USE_SELECT)
if (manager->read_fds != NULL)
isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
if (manager->read_fds_copy != NULL)
@@ -2540,13 +3391,19 @@ cleanup_fdsets(isc_socketmgr_t *manager, isc_mem_t *mctx) {
isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
if (manager->write_fds_copy != NULL)
isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
+#endif /* USE_KQUEUE */
}
-/*
- * Create a new socket manager.
- */
isc_result_t
isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
+ return (isc_socketmgr_create2(mctx, managerp, 0));
+}
+
+isc_result_t
+isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
+ unsigned int maxsocks)
+{
+ int i;
isc_socketmgr_t *manager;
#ifdef ISC_PLATFORM_USETHREADS
char strbuf[ISC_STRERRORSIZE];
@@ -2557,43 +3414,71 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
#ifndef ISC_PLATFORM_USETHREADS
if (socketmgr != NULL) {
+ /* Don't allow maxsocks to be updated */
+ if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
+ return (ISC_R_EXISTS);
+
socketmgr->refs++;
*managerp = socketmgr;
return (ISC_R_SUCCESS);
}
#endif /* ISC_PLATFORM_USETHREADS */
+ if (maxsocks == 0)
+ maxsocks = ISC_SOCKET_MAXSOCKETS;
+
manager = isc_mem_get(mctx, sizeof(*manager));
if (manager == NULL)
return (ISC_R_NOMEMORY);
- result = create_fdsets(manager, mctx);
- if (result != ISC_R_SUCCESS) {
- cleanup_fdsets(manager, mctx);
- isc_mem_put(mctx, manager, sizeof(*manager));
- return (result);
+ /* zero-clear so that necessary cleanup on failure will be easy */
+ memset(manager, 0, sizeof(*manager));
+ manager->maxsocks = maxsocks;
+ manager->reserved = 0;
+ manager->fds = isc_mem_get(mctx,
+ manager->maxsocks * sizeof(isc_socket_t *));
+ if (manager->fds == NULL) {
+ result = ISC_R_NOMEMORY;
+ goto free_manager;
+ }
+ manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
+ if (manager->fds == NULL) {
+ result = ISC_R_NOMEMORY;
+ goto free_manager;
}
manager->magic = SOCKET_MANAGER_MAGIC;
manager->mctx = NULL;
- memset(manager->fds, 0, sizeof(manager->fds[0]) * manager->fdsize);
+ memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
ISC_LIST_INIT(manager->socklist);
result = isc_mutex_init(&manager->lock);
- if (result != ISC_R_SUCCESS) {
- cleanup_fdsets(manager, mctx);
- isc_mem_put(mctx, manager, sizeof(*manager));
- return (result);
+ if (result != ISC_R_SUCCESS)
+ goto free_manager;
+ manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
+ if (manager->fdlock == NULL) {
+ result = ISC_R_NOMEMORY;
+ goto cleanup_lock;
}
+ for (i = 0; i < FDLOCK_COUNT; i++) {
+ result = isc_mutex_init(&manager->fdlock[i]);
+ if (result != ISC_R_SUCCESS) {
+ while (--i >= 0)
+ DESTROYLOCK(&manager->fdlock[i]);
+ isc_mem_put(mctx, manager->fdlock,
+ FDLOCK_COUNT * sizeof(isc_mutex_t));
+ manager->fdlock = NULL;
+ goto cleanup_lock;
+ }
+ }
+
#ifdef ISC_PLATFORM_USETHREADS
if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
- cleanup_fdsets(manager, mctx);
- DESTROYLOCK(&manager->lock);
- isc_mem_put(mctx, manager, sizeof(*manager));
UNEXPECTED_ERROR(__FILE__, __LINE__,
"isc_condition_init() %s",
isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
ISC_MSG_FAILED, "failed"));
- return (ISC_R_UNEXPECTED);
+ result = ISC_R_UNEXPECTED;
+ goto cleanup_lock;
}
/*
@@ -2601,17 +3486,14 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
* select/poll loop when something internal needs to be done.
*/
if (pipe(manager->pipe_fds) != 0) {
- cleanup_fdsets(manager, mctx);
- DESTROYLOCK(&manager->lock);
- isc_mem_put(mctx, manager, sizeof(*manager));
isc__strerror(errno, strbuf, sizeof(strbuf));
UNEXPECTED_ERROR(__FILE__, __LINE__,
"pipe() %s: %s",
isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
ISC_MSG_FAILED, "failed"),
strbuf);
-
- return (ISC_R_UNEXPECTED);
+ result = ISC_R_UNEXPECTED;
+ goto cleanup_condition;
}
RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
@@ -2625,33 +3507,23 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
/*
* Set up initial state for the select loop
*/
- memset(manager->read_fds, 0, manager->fd_bufsize);
- memset(manager->write_fds, 0, manager->fd_bufsize);
-#ifdef ISC_PLATFORM_USETHREADS
- FD_SET(manager->pipe_fds[0], manager->read_fds);
- manager->maxfd = manager->pipe_fds[0];
-#else /* ISC_PLATFORM_USETHREADS */
- manager->maxfd = 0;
-#endif /* ISC_PLATFORM_USETHREADS */
- manager->reserved = 0;
- memset(manager->fdstate, 0,
- manager->fdsize * sizeof(manager->fdstate[0]));
-
+ result = setup_watcher(mctx, manager);
+ if (result != ISC_R_SUCCESS)
+ goto cleanup;
+ memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
#ifdef ISC_PLATFORM_USETHREADS
/*
* Start up the select/poll thread.
*/
if (isc_thread_create(watcher, manager, &manager->watcher) !=
ISC_R_SUCCESS) {
- (void)close(manager->pipe_fds[0]);
- (void)close(manager->pipe_fds[1]);
- DESTROYLOCK(&manager->lock);
- isc_mem_put(mctx, manager, sizeof(*manager));
UNEXPECTED_ERROR(__FILE__, __LINE__,
"isc_thread_create() %s",
isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
ISC_MSG_FAILED, "failed"));
- return (ISC_R_UNEXPECTED);
+ cleanup_watcher(mctx, manager);
+ result = ISC_R_UNEXPECTED;
+ goto cleanup;
}
#endif /* ISC_PLATFORM_USETHREADS */
isc_mem_attach(mctx, &manager->mctx);
@@ -2662,6 +3534,52 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
*managerp = manager;
return (ISC_R_SUCCESS);
+
+cleanup:
+#ifdef ISC_PLATFORM_USETHREADS
+ (void)close(manager->pipe_fds[0]);
+ (void)close(manager->pipe_fds[1]);
+#endif /* ISC_PLATFORM_USETHREADS */
+
+#ifdef ISC_PLATFORM_USETHREADS
+cleanup_condition:
+ (void)isc_condition_destroy(&manager->shutdown_ok);
+#endif /* ISC_PLATFORM_USETHREADS */
+
+
+cleanup_lock:
+ if (manager->fdlock != NULL) {
+ for (i = 0; i < FDLOCK_COUNT; i++)
+ DESTROYLOCK(&manager->fdlock[i]);
+ }
+ DESTROYLOCK(&manager->lock);
+
+free_manager:
+ if (manager->fdlock != NULL) {
+ isc_mem_put(mctx, manager->fdlock,
+ FDLOCK_COUNT * sizeof(isc_mutex_t));
+ }
+ if (manager->fdstate != NULL) {
+ isc_mem_put(mctx, manager->fdstate,
+ manager->maxsocks * sizeof(int));
+ }
+ if (manager->fds != NULL) {
+ isc_mem_put(mctx, manager->fds,
+ manager->maxsocks * sizeof(isc_socket_t *));
+ }
+ isc_mem_put(mctx, manager, sizeof(*manager));
+
+ return (result);
+}
+
+isc_result_t
+isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) {
+ REQUIRE(VALID_MANAGER(manager));
+ REQUIRE(nsockp != NULL);
+
+ *nsockp = manager->maxsocks;
+
+ return (ISC_R_SUCCESS);
}
void
@@ -2735,18 +3653,30 @@ isc_socketmgr_destroy(isc_socketmgr_t **managerp) {
/*
* Clean up.
*/
+ cleanup_watcher(manager->mctx, manager);
+
#ifdef ISC_PLATFORM_USETHREADS
(void)close(manager->pipe_fds[0]);
(void)close(manager->pipe_fds[1]);
(void)isc_condition_destroy(&manager->shutdown_ok);
#endif /* ISC_PLATFORM_USETHREADS */
- for (i = 0; i < (int)manager->fdsize; i++)
- if (manager->fdstate[i] == CLOSE_PENDING)
+ for (i = 0; i < (int)manager->maxsocks; i++)
+ if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
(void)close(i);
+ isc_mem_put(manager->mctx, manager->fds,
+ manager->maxsocks * sizeof(isc_socket_t *));
+ isc_mem_put(manager->mctx, manager->fdstate,
+ manager->maxsocks * sizeof(int));
+
+ if (manager->fdlock != NULL) {
+ for (i = 0; i < FDLOCK_COUNT; i++)
+ DESTROYLOCK(&manager->fdlock[i]);
+ isc_mem_put(manager->mctx, manager->fdlock,
+ FDLOCK_COUNT * sizeof(isc_mutex_t));
+ }
DESTROYLOCK(&manager->lock);
- cleanup_fdsets(manager, manager->mctx);
manager->magic = 0;
mctx= manager->mctx;
isc_mem_put(mctx, manager, sizeof(*manager));
@@ -2799,7 +3729,7 @@ socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
* Enqueue the request. If the socket was previously not being
* watched, poke the watcher to start paying attention to it.
*/
- if (ISC_LIST_EMPTY(sock->recv_list))
+ if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
@@ -2996,7 +3926,8 @@ socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
* not being watched, poke the watcher to start
* paying attention to it.
*/
- if (ISC_LIST_EMPTY(sock->send_list))
+ if (ISC_LIST_EMPTY(sock->send_list) &&
+ !sock->pending_send)
select_poke(sock->manager, sock->fd,
SELECT_POKE_WRITE);
ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
@@ -3286,7 +4217,7 @@ isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
} else
strcpy(path, ".");
#endif
-
+
if (chmod(path, perm) < 0) {
isc__strerror(errno, strbuf, sizeof(strbuf));
isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
@@ -3315,7 +4246,7 @@ isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
}
isc_result_t
-isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
+isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
unsigned int options) {
char strbuf[ISC_STRERRORSIZE];
int on = 1;
@@ -3446,7 +4377,7 @@ isc_socket_listen(isc_socket_t *sock, unsigned int backlog) {
}
/*
- * This should try to do agressive accept() XXXMLG
+ * This should try to do aggressive accept() XXXMLG
*/
isc_result_t
isc_socket_accept(isc_socket_t *sock,
@@ -3557,6 +4488,16 @@ isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
sock->address = *addr;
cc = connect(sock->fd, &addr->type.sa, addr->length);
if (cc < 0) {
+ /*
+ * HP-UX "fails" to connect a UDP socket and sets errno to
+ * EINPROGRESS if it's non-blocking. We'd rather regard this as
+ * a success and let the user detect it if it's really an error
+ * at the time of sending a packet on the socket.
+ */
+ if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
+ cc = 0;
+ goto success;
+ }
if (SOFT_ERROR(errno) || errno == EINPROGRESS)
goto queue;
@@ -3598,6 +4539,7 @@ isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
/*
* If connect completed, fire off the done event.
*/
+ success:
if (cc == 0) {
sock->connected = 1;
sock->bound = 1;
@@ -3957,37 +4899,107 @@ isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) {
#ifdef IPV6_V6ONLY
if (sock->pf == AF_INET6) {
- (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
- (void *)&onoff, sizeof(onoff));
+ if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
+ (void *)&onoff, sizeof(int)) < 0) {
+ char strbuf[ISC_STRERRORSIZE];
+
+ UNEXPECTED_ERROR(__FILE__, __LINE__,
+ "setsockopt(%d, IPV6_V6ONLY) "
+ "%s: %s", sock->fd,
+ isc_msgcat_get(isc_msgcat,
+ ISC_MSGSET_GENERAL,
+ ISC_MSG_FAILED,
+ "failed"),
+ strbuf);
+ }
}
+ FIX_IPV6_RECVPKTINFO(sock); /* AIX */
#endif
}
#ifndef ISC_PLATFORM_USETHREADS
-void
-isc__socketmgr_getfdsets(fd_set **readset, fd_set **writeset, int *maxfd) {
+/* In our assumed scenario, we can simply use a single static object. */
+static isc_socketwait_t swait_private;
+
+int
+isc__socketmgr_waitevents(struct timeval *tvp, isc_socketwait_t **swaitp) {
+ int n;
+#ifdef USE_KQUEUE
+ struct timespec ts, *tsp;
+#endif
+#ifdef USE_EPOLL
+ int timeout;
+#endif
+#ifdef USE_DEVPOLL
+ struct dvpoll dvp;
+#endif
+
+ REQUIRE(swaitp != NULL && *swaitp == NULL);
+
if (socketmgr == NULL)
- *maxfd = 0;
- else {
- /* Prepare duplicates of fd_sets, as select() will modify */
- memcpy(socketmgr->read_fds_copy, socketmgr->read_fds,
- socketmgr->fd_bufsize);
- memcpy(socketmgr->write_fds_copy, socketmgr->write_fds,
- socketmgr->fd_bufsize);
- *readset = socketmgr->read_fds_copy;
- *writeset = socketmgr->write_fds_copy;
- *maxfd = socketmgr->maxfd + 1;
- }
+ return (0);
+
+#ifdef USE_KQUEUE
+ if (tvp != NULL) {
+ ts.tv_sec = tvp->tv_sec;
+ ts.tv_nsec = tvp->tv_usec * 1000;
+ tsp = &ts;
+ } else
+ tsp = NULL;
+ swait_private.nevents = kevent(socketmgr->kqueue_fd, NULL, 0,
+ socketmgr->events, socketmgr->nevents,
+ tsp);
+ n = swait_private.nevents;
+#elif defined(USE_EPOLL)
+ if (tvp != NULL)
+ timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
+ else
+ timeout = -1;
+ swait_private.nevents = epoll_wait(socketmgr->epoll_fd,
+ socketmgr->events,
+ socketmgr->nevents, timeout);
+ n = swait_private.nevents;
+#elif defined(USE_DEVPOLL)
+ dvp.dp_fds = socketmgr->events;
+ dvp.dp_nfds = socketmgr->nevents;
+ if (tvp != NULL) {
+ dvp.dp_timeout = tvp->tv_sec * 1000 +
+ (tvp->tv_usec + 999) / 1000;
+ } else
+ dvp.dp_timeout = -1;
+ swait_private.nevents = ioctl(socketmgr->devpoll_fd, DP_POLL, &dvp);
+ n = swait_private.nevents;
+#elif defined(USE_SELECT)
+ memcpy(socketmgr->read_fds_copy, socketmgr->read_fds,
+ socketmgr->fd_bufsize);
+ memcpy(socketmgr->write_fds_copy, socketmgr->write_fds,
+ socketmgr->fd_bufsize);
+
+ swait_private.readset = socketmgr->read_fds_copy;
+ swait_private.writeset = socketmgr->write_fds_copy;
+ swait_private.maxfd = socketmgr->maxfd + 1;
+
+ n = select(swait_private.maxfd, swait_private.readset,
+ swait_private.writeset, NULL, tvp);
+#endif
+
+ *swaitp = &swait_private;
+ return (n);
}
isc_result_t
-isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd) {
- isc_socketmgr_t *manager = socketmgr;
+isc__socketmgr_dispatch(isc_socketwait_t *swait) {
+ REQUIRE(swait == &swait_private);
- if (manager == NULL)
+ if (socketmgr == NULL)
return (ISC_R_NOTFOUND);
- process_fds(manager, maxfd, readset, writeset);
+#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
+ (void)process_fds(socketmgr, socketmgr->events, swait->nevents);
return (ISC_R_SUCCESS);
+#elif defined(USE_SELECT)
+ process_fds(socketmgr, swait->maxfd, swait->readset, swait->writeset);
+ return (ISC_R_SUCCESS);
+#endif
}
#endif /* ISC_PLATFORM_USETHREADS */
diff --git a/contrib/bind9/lib/isc/unix/socket_p.h b/contrib/bind9/lib/isc/unix/socket_p.h
index 4f9cf27..b7da860 100644
--- a/contrib/bind9/lib/isc/unix/socket_p.h
+++ b/contrib/bind9/lib/isc/unix/socket_p.h
@@ -1,8 +1,8 @@
/*
- * Copyright (C) 2004, 2005 Internet Systems Consortium, Inc. ("ISC")
+ * Copyright (C) 2004, 2005, 2008 Internet Systems Consortium, Inc. ("ISC")
* Copyright (C) 2000, 2001 Internet Software Consortium.
*
- * Permission to use, copy, modify, and distribute this software for any
+ * Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
@@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: socket_p.h,v 1.7.18.2.52.1 2008/07/29 04:47:31 each Exp $ */
+/* $Id: socket_p.h,v 1.7.18.4 2008/06/24 23:45:55 tbox Exp $ */
#ifndef ISC_SOCKET_P_H
#define ISC_SOCKET_P_H
@@ -26,10 +26,7 @@
#include <sys/select.h>
#endif
-void
-isc__socketmgr_getfdsets(fd_set **readset, fd_set **writeset, int *maxfd);
-
-isc_result_t
-isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd);
-
+typedef struct isc_socketwait isc_socketwait_t;
+int isc__socketmgr_waitevents(struct timeval *, isc_socketwait_t **);
+isc_result_t isc__socketmgr_dispatch(isc_socketwait_t *);
#endif /* ISC_SOCKET_P_H */
diff --git a/contrib/bind9/lib/isc/unix/time.c b/contrib/bind9/lib/isc/unix/time.c
index bac24d7..facc12b 100644
--- a/contrib/bind9/lib/isc/unix/time.c
+++ b/contrib/bind9/lib/isc/unix/time.c
@@ -1,8 +1,8 @@
/*
- * Copyright (C) 2004, 2005 Internet Systems Consortium, Inc. ("ISC")
+ * Copyright (C) 2004, 2005, 2008 Internet Systems Consortium, Inc. ("ISC")
* Copyright (C) 1998-2001, 2003 Internet Software Consortium.
*
- * Permission to use, copy, modify, and distribute this software for any
+ * Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
@@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: time.c,v 1.47.18.2 2005/04/29 00:17:09 marka Exp $ */
+/* $Id: time.c,v 1.47.18.4 2008/02/18 23:46:01 tbox Exp $ */
/*! \file */
@@ -227,7 +227,7 @@ isc_time_nowplusinterval(isc_time_t *t, const isc_interval_t *i) {
t->seconds = tv.tv_sec + i->seconds;
t->nanoseconds = tv.tv_usec * NS_PER_US + i->nanoseconds;
- if (t->nanoseconds > NS_PER_S) {
+ if (t->nanoseconds >= NS_PER_S) {
t->seconds++;
t->nanoseconds -= NS_PER_S;
}
@@ -410,5 +410,5 @@ isc_time_formattimestamp(const isc_time_t *t, char *buf, unsigned int len) {
snprintf(buf + flen, len - flen,
".%03u", t->nanoseconds / 1000000);
else
- snprintf(buf, len, "99-Bad-9999 99:99:99.999");
+ snprintf(buf, len, "99-Bad-9999 99:99:99.999");
}
OpenPOWER on IntegriCloud