diff options
author | dougb <dougb@FreeBSD.org> | 2008-12-23 22:47:56 +0000 |
---|---|---|
committer | dougb <dougb@FreeBSD.org> | 2008-12-23 22:47:56 +0000 |
commit | c673a416c3c80055f220808cf9464c00fa1c5042 (patch) | |
tree | 6caf68c956b10fe118ac0bb9f368df80b7d0818e /contrib/bind9/lib/isc/unix | |
parent | cabae62b0ba1d31f524c393e294d3a5e08d543fc (diff) | |
parent | fa25a858e20428b15ec892d020272b1f70eaa725 (diff) | |
download | FreeBSD-src-c673a416c3c80055f220808cf9464c00fa1c5042.zip FreeBSD-src-c673a416c3c80055f220808cf9464c00fa1c5042.tar.gz |
Merge from vendor/bind9/dist as of the 9.4.3 import
Diffstat (limited to 'contrib/bind9/lib/isc/unix')
-rw-r--r-- | contrib/bind9/lib/isc/unix/app.c | 37 | ||||
-rw-r--r-- | contrib/bind9/lib/isc/unix/include/isc/net.h | 27 | ||||
-rw-r--r-- | contrib/bind9/lib/isc/unix/net.c | 161 | ||||
-rw-r--r-- | contrib/bind9/lib/isc/unix/resource.c | 61 | ||||
-rw-r--r-- | contrib/bind9/lib/isc/unix/socket.c | 1674 | ||||
-rw-r--r-- | contrib/bind9/lib/isc/unix/socket_p.h | 15 | ||||
-rw-r--r-- | contrib/bind9/lib/isc/unix/time.c | 10 |
7 files changed, 1562 insertions, 423 deletions
diff --git a/contrib/bind9/lib/isc/unix/app.c b/contrib/bind9/lib/isc/unix/app.c index b71d766..c119362 100644 --- a/contrib/bind9/lib/isc/unix/app.c +++ b/contrib/bind9/lib/isc/unix/app.c @@ -1,8 +1,8 @@ /* - * Copyright (C) 2004, 2005 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004, 2005, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 1999-2003 Internet Software Consortium. * - * Permission to use, copy, modify, and distribute this software for any + * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: app.c,v 1.50.18.2.50.1 2008/07/29 04:47:31 each Exp $ */ +/* $Id: app.c,v 1.50.18.8 2008/10/15 03:41:17 marka Exp $ */ /*! \file */ @@ -30,6 +30,9 @@ #include <unistd.h> #include <signal.h> #include <sys/time.h> +#ifdef HAVE_EPOLL +#include <sys/epoll.h> +#endif #include <isc/app.h> #include <isc/boolean.h> @@ -59,11 +62,11 @@ static isc_boolean_t running = ISC_FALSE; /*! * We assume that 'want_shutdown' can be read and written atomically. */ -static isc_boolean_t want_shutdown = ISC_FALSE; +static volatile isc_boolean_t want_shutdown = ISC_FALSE; /* * We assume that 'want_reload' can be read and written atomically. */ -static isc_boolean_t want_reload = ISC_FALSE; +static volatile isc_boolean_t want_reload = ISC_FALSE; static isc_boolean_t blocked = ISC_FALSE; #ifdef ISC_PLATFORM_USETHREADS @@ -87,13 +90,13 @@ static pthread_t main_thread; #ifndef HAVE_SIGWAIT static void exit_action(int arg) { - UNUSED(arg); + UNUSED(arg); want_shutdown = ISC_TRUE; } static void reload_action(int arg) { - UNUSED(arg); + UNUSED(arg); want_reload = ISC_TRUE; } #endif @@ -297,14 +300,13 @@ isc_app_onrun(isc_mem_t *mctx, isc_task_t *task, isc_taskaction_t action, * Event loop for nonthreaded programs. */ static isc_result_t -evloop() { +evloop(void) { isc_result_t result; while (!want_shutdown) { int n; isc_time_t when, now; struct timeval tv, *tvp; - fd_set *readfds, *writefds; - int maxfd; + isc_socketwait_t *swait; isc_boolean_t readytasks; isc_boolean_t call_timer_dispatch = ISC_FALSE; @@ -331,15 +333,15 @@ evloop() { } } - isc__socketmgr_getfdsets(&readfds, &writefds, &maxfd); - n = select(maxfd, readfds, writefds, NULL, tvp); + swait = NULL; + n = isc__socketmgr_waitevents(tvp, &swait); if (n == 0 || call_timer_dispatch) { /* * We call isc__timermgr_dispatch() only when * necessary, in order to reduce overhead. If the * select() call indicates a timeout, we need the - * dispatch. Even if not, if we set the 0-timeout + * dispatch. Even if not, if we set the 0-timeout * for the select() call, we need to check the timer * events. In the 'readytasks' case, there may be no * timeout event actually, but there is no other way @@ -352,8 +354,7 @@ evloop() { isc__timermgr_dispatch(); } if (n > 0) - (void)isc__socketmgr_dispatch(readfds, writefds, - maxfd); + (void)isc__socketmgr_dispatch(swait); (void)isc__taskmgr_dispatch(); if (want_reload) { @@ -423,7 +424,7 @@ isc__nothread_signal_hack(isc_condition_t *cp) { signalled = ISC_TRUE; return (ISC_R_SUCCESS); } - + #endif /* ISC_PLATFORM_USETHREADS */ isc_result_t @@ -434,10 +435,10 @@ isc_app_run(void) { #ifdef ISC_PLATFORM_USETHREADS sigset_t sset; char strbuf[ISC_STRERRORSIZE]; -#endif /* ISC_PLATFORM_USETHREADS */ #ifdef HAVE_SIGWAIT int sig; #endif +#endif /* ISC_PLATFORM_USETHREADS */ #ifdef HAVE_LINUXTHREADS REQUIRE(main_thread == pthread_self()); @@ -676,7 +677,7 @@ isc_app_unblock(void) { REQUIRE(blockedthread == pthread_self()); RUNTIME_CHECK(sigemptyset(&sset) == 0 && - sigaddset(&sset, SIGINT) == 0 && + sigaddset(&sset, SIGINT) == 0 && sigaddset(&sset, SIGTERM) == 0); RUNTIME_CHECK(pthread_sigmask(SIG_BLOCK, &sset, NULL) == 0); #endif /* ISC_PLATFORM_USETHREADS */ diff --git a/contrib/bind9/lib/isc/unix/include/isc/net.h b/contrib/bind9/lib/isc/unix/include/isc/net.h index bdd8c14..948e7b1 100644 --- a/contrib/bind9/lib/isc/unix/include/isc/net.h +++ b/contrib/bind9/lib/isc/unix/include/isc/net.h @@ -1,8 +1,8 @@ /* - * Copyright (C) 2004, 2005 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004, 2005, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 1999-2003 Internet Software Consortium. * - * Permission to use, copy, modify, and distribute this software for any + * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: net.h,v 1.39.18.4 2005/04/27 05:02:37 sra Exp $ */ +/* $Id: net.h,v 1.39.18.6 2008/06/24 23:45:55 tbox Exp $ */ #ifndef ISC_NET_H #define ISC_NET_H 1 @@ -104,7 +104,7 @@ /*% * Required for some pre RFC2133 implementations. * IN6ADDR_ANY_INIT and IN6ADDR_LOOPBACK_INIT were added in - * draft-ietf-ipngwg-bsd-api-04.txt or draft-ietf-ipngwg-bsd-api-05.txt. + * draft-ietf-ipngwg-bsd-api-04.txt or draft-ietf-ipngwg-bsd-api-05.txt. * If 's6_addr' is defined then assume that there is a union and three * levels otherwise assume two levels required. */ @@ -202,7 +202,7 @@ extern const struct in6_addr isc_net_in6addrloop; #ifdef ISC_PLATFORM_FIXIN6ISADDR #undef IN6_IS_ADDR_GEOGRAPHIC -/*! +/*! * \brief * Fix UnixWare 7.1.1's broken IN6_IS_ADDR_* definitions. */ @@ -324,6 +324,23 @@ isc_net_probeunix(void); * Returns whether UNIX domain sockets are supported. */ +isc_result_t +isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high); +/*%< + * Returns system's default range of ephemeral UDP ports, if defined. + * If the range is not available or unknown, ISC_NET_PORTRANGELOW and + * ISC_NET_PORTRANGEHIGH will be returned. + * + * Requires: + * + *\li 'low' and 'high' must be non NULL. + * + * Returns: + * + *\li *low and *high will be the ports specifying the low and high ends of + * the range. + */ + #ifdef ISC_PLATFORM_NEEDNTOP const char * isc_net_ntop(int af, const void *src, char *dst, size_t size); diff --git a/contrib/bind9/lib/isc/unix/net.c b/contrib/bind9/lib/isc/unix/net.c index 1de6b32..600ac92 100644 --- a/contrib/bind9/lib/isc/unix/net.c +++ b/contrib/bind9/lib/isc/unix/net.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2004, 2005, 2007 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004, 2005, 2007, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 1999-2003 Internet Software Consortium. * * Permission to use, copy, modify, and/or distribute this software for any @@ -15,10 +15,19 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: net.c,v 1.29.18.6 2007/09/13 23:46:26 tbox Exp $ */ +/* $Id: net.c,v 1.29.18.9 2008/07/04 05:52:05 each Exp $ */ #include <config.h> +#include <sys/types.h> + +#if defined(HAVE_SYS_SYSCTL_H) +#if defined(HAVE_SYS_PARAM_H) +#include <sys/param.h> +#endif +#include <sys/sysctl.h> +#endif + #include <errno.h> #include <unistd.h> @@ -30,6 +39,59 @@ #include <isc/string.h> #include <isc/util.h> +/*% + * Definitions about UDP port range specification. This is a total mess of + * portability variants: some use sysctl (but the sysctl names vary), some use + * system-specific interfaces, some have the same interface for IPv4 and IPv6, + * some separate them, etc... + */ + +/*% + * The last resort defaults: use all non well known port space + */ +#ifndef ISC_NET_PORTRANGELOW +#define ISC_NET_PORTRANGELOW 1024 +#endif /* ISC_NET_PORTRANGELOW */ +#ifndef ISC_NET_PORTRANGEHIGH +#define ISC_NET_PORTRANGEHIGH 65535 +#endif /* ISC_NET_PORTRANGEHIGH */ + +#ifdef HAVE_SYSCTLBYNAME + +/*% + * sysctl variants + */ +#if defined(__FreeBSD__) || defined(__APPLE__) || defined(__DragonFly__) +#define USE_SYSCTL_PORTRANGE +#define SYSCTL_V4PORTRANGE_LOW "net.inet.ip.portrange.hifirst" +#define SYSCTL_V4PORTRANGE_HIGH "net.inet.ip.portrange.hilast" +#define SYSCTL_V6PORTRANGE_LOW "net.inet.ip.portrange.hifirst" +#define SYSCTL_V6PORTRANGE_HIGH "net.inet.ip.portrange.hilast" +#endif + +#ifdef __NetBSD__ +#define USE_SYSCTL_PORTRANGE +#define SYSCTL_V4PORTRANGE_LOW "net.inet.ip.anonportmin" +#define SYSCTL_V4PORTRANGE_HIGH "net.inet.ip.anonportmax" +#define SYSCTL_V6PORTRANGE_LOW "net.inet6.ip6.anonportmin" +#define SYSCTL_V6PORTRANGE_HIGH "net.inet6.ip6.anonportmax" +#endif + +#else /* !HAVE_SYSCTLBYNAME */ + +#ifdef __OpenBSD__ +#define USE_SYSCTL_PORTRANGE +#define SYSCTL_V4PORTRANGE_LOW { CTL_NET, PF_INET, IPPROTO_IP, \ + IPCTL_IPPORT_HIFIRSTAUTO } +#define SYSCTL_V4PORTRANGE_HIGH { CTL_NET, PF_INET, IPPROTO_IP, \ + IPCTL_IPPORT_HILASTAUTO } +/* Same for IPv6 */ +#define SYSCTL_V6PORTRANGE_LOW SYSCTL_V4PORTRANGE_LOW +#define SYSCTL_V6PORTRANGE_HIGH SYSCTL_V4PORTRANGE_HIGH +#endif + +#endif /* HAVE_SYSCTLBYNAME */ + #if defined(ISC_PLATFORM_HAVEIPV6) # if defined(ISC_PLATFORM_NEEDIN6ADDRANY) const struct in6_addr isc_net_in6addrany = IN6ADDR_ANY_INIT; @@ -338,6 +400,101 @@ isc_net_probe_ipv6pktinfo(void) { return (ipv6pktinfo_result); } +#if defined(USE_SYSCTL_PORTRANGE) +#if defined(HAVE_SYSCTLBYNAME) +static isc_result_t +getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) { + int port_low, port_high; + size_t portlen; + const char *sysctlname_lowport, *sysctlname_hiport; + + if (af == AF_INET) { + sysctlname_lowport = SYSCTL_V4PORTRANGE_LOW; + sysctlname_hiport = SYSCTL_V4PORTRANGE_HIGH; + } else { + sysctlname_lowport = SYSCTL_V6PORTRANGE_LOW; + sysctlname_hiport = SYSCTL_V6PORTRANGE_HIGH; + } + portlen = sizeof(portlen); + if (sysctlbyname(sysctlname_lowport, &port_low, &portlen, + NULL, 0) < 0) { + return (ISC_R_FAILURE); + } + portlen = sizeof(portlen); + if (sysctlbyname(sysctlname_hiport, &port_high, &portlen, + NULL, 0) < 0) { + return (ISC_R_FAILURE); + } + if ((port_low & ~0xffff) != 0 || (port_high & ~0xffff) != 0) + return (ISC_R_RANGE); + + *low = (in_port_t)port_low; + *high = (in_port_t)port_high; + + return (ISC_R_SUCCESS); +} +#else /* !HAVE_SYSCTLBYNAME */ +static isc_result_t +getudpportrange_sysctl(int af, in_port_t *low, in_port_t *high) { + int mib_lo4[4] = SYSCTL_V4PORTRANGE_LOW; + int mib_hi4[4] = SYSCTL_V4PORTRANGE_HIGH; + int mib_lo6[4] = SYSCTL_V6PORTRANGE_LOW; + int mib_hi6[4] = SYSCTL_V6PORTRANGE_HIGH; + int *mib_lo, *mib_hi, miblen; + int port_low, port_high; + size_t portlen; + + if (af == AF_INET) { + mib_lo = mib_lo4; + mib_hi = mib_hi4; + miblen = sizeof(mib_lo4) / sizeof(mib_lo4[0]); + } else { + mib_lo = mib_lo6; + mib_hi = mib_hi6; + miblen = sizeof(mib_lo6) / sizeof(mib_lo6[0]); + } + + portlen = sizeof(portlen); + if (sysctl(mib_lo, miblen, &port_low, &portlen, NULL, 0) < 0) { + return (ISC_R_FAILURE); + } + + portlen = sizeof(portlen); + if (sysctl(mib_hi, miblen, &port_high, &portlen, NULL, 0) < 0) { + return (ISC_R_FAILURE); + } + + if ((port_low & ~0xffff) != 0 || (port_high & ~0xffff) != 0) + return (ISC_R_RANGE); + + *low = (in_port_t) port_low; + *high = (in_port_t) port_high; + + return (ISC_R_SUCCESS); +} +#endif /* HAVE_SYSCTLBYNAME */ +#endif /* USE_SYSCTL_PORTRANGE */ + +isc_result_t +isc_net_getudpportrange(int af, in_port_t *low, in_port_t *high) { + int result = ISC_R_FAILURE; + + REQUIRE(low != NULL && high != NULL); + +#if defined(USE_SYSCTL_PORTRANGE) + result = getudpportrange_sysctl(af, low, high); +#else + UNUSED(af); +#endif + + if (result != ISC_R_SUCCESS) { + *low = ISC_NET_PORTRANGELOW; + *high = ISC_NET_PORTRANGEHIGH; + } + + return (ISC_R_SUCCESS); /* we currently never fail in this function */ +} + void isc_net_disableipv4(void) { initialize(); diff --git a/contrib/bind9/lib/isc/unix/resource.c b/contrib/bind9/lib/isc/unix/resource.c index 0264976..e9bc5fd 100644 --- a/contrib/bind9/lib/isc/unix/resource.c +++ b/contrib/bind9/lib/isc/unix/resource.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: resource.c,v 1.12.944.4 2008/07/28 22:44:46 marka Exp $ */ +/* $Id: resource.c,v 1.12.18.6 2008/08/05 07:17:05 marka Exp $ */ #include <config.h> @@ -32,7 +32,7 @@ #include <linux/fs.h> /* To get the large NR_OPEN. */ #endif -#ifdef __hpux +#if defined(__hpux) && defined(HAVE_SYS_DYNTUNE_H) #include <sys/dyntune.h> #endif @@ -48,13 +48,13 @@ resource2rlim(isc_resource_t resource, int *rlim_resource) { break; case isc_resource_cputime: *rlim_resource = RLIMIT_CPU; - break; + break; case isc_resource_datasize: *rlim_resource = RLIMIT_DATA; - break; + break; case isc_resource_filesize: *rlim_resource = RLIMIT_FSIZE; - break; + break; case isc_resource_lockedmemory: #ifdef RLIMIT_MEMLOCK *rlim_resource = RLIMIT_MEMLOCK; @@ -87,7 +87,7 @@ resource2rlim(isc_resource_t resource, int *rlim_resource) { *rlim_resource = RLIMIT_STACK; break; default: - /* + /* * This test is not very robust if isc_resource_t * changes, but generates a clear assertion message. */ @@ -140,51 +140,6 @@ isc_resource_setlimit(isc_resource_t resource, isc_resourcevalue_t value) { rlim_value = value; } - /* - * The BIND 8 documentation reports: - * - * Note: on some operating systems the server cannot set an - * unlimited value and cannot determine the maximum number of - * open files the kernel can support. On such systems, choosing - * unlimited will cause the server to use the larger of the - * rlim_max for RLIMIT_NOFILE and the value returned by - * sysconf(_SC_OPEN_MAX). If the actual kernel limit is larger - * than this value, use limit files to specify the limit - * explicitly. - * - * The CHANGES for 8.1.2-T3A also mention: - * - * 352. [bug] Because of problems with setting an infinite - * rlim_max for RLIMIT_NOFILE on some systems, previous versions - * of the server implemented "limit files unlimited" by setting - * the limit to the value returned by sysconf(_SC_OPEN_MAX). The - * server will now use RLIM_INFINITY on systems which allow it. - * - * At some point the BIND 8 server stopped using SC_OPEN_MAX for this - * purpose at all, but it isn't clear to me when or why, as my access - * to the CVS archive is limited at the time of this writing. What - * BIND 8 *does* do is to set RLIMIT_NOFILE to either RLIMIT_INFINITY - * on a half dozen operating systems or to FD_SETSIZE on the rest, - * the latter of which is probably fewer than the real limit. (Note - * that libisc's socket module will have problems with any fd over - * FD_SETSIZE. This should be fixed in the socket module, not a - * limitation here. BIND 8's eventlib also has a problem, making - * its RLIMIT_INFINITY setting useless, because it closes and ignores - * any fd over FD_SETSIZE.) - * - * More troubling is the reference to some operating systems not being - * able to set an unlimited value for the number of open files. I'd - * hate to put in code that is really only there to support archaic - * systems that the rest of libisc won't work on anyway. So what this - * extremely verbose comment is here to say is the following: - * - * I'm aware there might be an issue with not limiting the value - * for RLIMIT_NOFILE on some systems, but since I don't know yet - * what those systems are and what the best workaround is (use - * sysconf()? rlim_max from getrlimit()? FD_SETSIZE?) so nothing - * is currently being done to clamp the value for open files. - */ - rl.rlim_cur = rl.rlim_max = rlim_value; unixresult = setrlimit(unixresource, &rl); @@ -215,7 +170,7 @@ isc_resource_setlimit(isc_resource_t resource, isc_resourcevalue_t value) { if (unixresult == 0) return (ISC_R_SUCCESS); } -#elif defined(__hpux) +#elif defined(__hpux) && defined(HAVE_SYS_DYNTUNE_H) if (resource == isc_resource_openfiles && rlim_value == RLIM_INFINITY) { uint64_t maxfiles; if (gettune("maxfiles_lim", &maxfiles) == 0) { @@ -255,7 +210,7 @@ isc_resource_getlimit(isc_resource_t resource, isc_resourcevalue_t *value) { } isc_result_t -isc_resource_curlimit(isc_resource_t resource, isc_resourcevalue_t *value) { +isc_resource_getcurlimit(isc_resource_t resource, isc_resourcevalue_t *value) { int unixresult; int unixresource; struct rlimit rl; diff --git a/contrib/bind9/lib/isc/unix/socket.c b/contrib/bind9/lib/isc/unix/socket.c index 1b4da78..3239614 100644 --- a/contrib/bind9/lib/isc/unix/socket.c +++ b/contrib/bind9/lib/isc/unix/socket.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: socket.c,v 1.237.18.29.10.6 2008/07/29 04:47:31 each Exp $ */ +/* $Id: socket.c,v 1.237.18.56 2008/11/12 03:58:36 marka Exp $ */ /*! \file */ @@ -25,9 +25,6 @@ #include <sys/types.h> #include <sys/socket.h> #include <sys/stat.h> -#ifdef ISC_PLATFORM_HAVESYSUNH -#include <sys/un.h> -#endif #include <sys/time.h> #include <sys/uio.h> @@ -58,6 +55,19 @@ #include <isc/thread.h> #include <isc/util.h> +#ifdef ISC_PLATFORM_HAVESYSUNH +#include <sys/un.h> +#endif +#ifdef ISC_PLATFORM_HAVEKQUEUE +#include <sys/event.h> +#endif +#ifdef ISC_PLATFORM_HAVEEPOLL +#include <sys/epoll.h> +#endif +#ifdef ISC_PLATFORM_HAVEDEVPOLL +#include <sys/devpoll.h> +#endif + #include "errno2result.h" #ifndef ISC_PLATFORM_USETHREADS @@ -65,9 +75,46 @@ #endif /* ISC_PLATFORM_USETHREADS */ /*% - * Max number of open sockets. In the vast majority of cases the default size - * of FD_SETSIZE should be fine, and this constant should be increased only - * when absolutely necessary and possible, i.e., the server is exhausting all + * Choose the most preferable multiplex method. + */ +#ifdef ISC_PLATFORM_HAVEKQUEUE +#define USE_KQUEUE +#elif defined (ISC_PLATFORM_HAVEEPOLL) +#define USE_EPOLL +#elif defined (ISC_PLATFORM_HAVEDEVPOLL) +#define USE_DEVPOLL +typedef struct { + unsigned int want_read : 1, + want_write : 1; +} pollinfo_t; +#else +#define USE_SELECT +#endif /* ISC_PLATFORM_HAVEKQUEUE */ + +#ifndef ISC_PLATFORM_USETHREADS +#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) +struct isc_socketwait { + int nevents; +}; +#elif defined (USE_SELECT) +struct isc_socketwait { + fd_set *readset; + fd_set *writeset; + int nfds; + int maxfd; +}; +#endif /* USE_KQUEUE */ +#endif /* !ISC_PLATFORM_USETHREADS */ + +/*% + * Maximum number of allowable open sockets. This is also the maximum + * allowable socket file descriptor. + * + * Care should be taken before modifying this value for select(): + * The API standard doesn't ensure select() accept more than (the system default + * of) FD_SETSIZE descriptors, and the default size should in fact be fine in + * the vast majority of cases. This constant should therefore be increased only + * when absolutely necessary and possible, i.e., the server is exhausting all * available file descriptors (up to FD_SETSIZE) and the select() function * and FD_xxx macros support larger values than FD_SETSIZE (which may not * always by true, but we keep using some of them to ensure as much @@ -78,18 +125,72 @@ * As a special note, this value shouldn't have to be touched if * this is a build for an authoritative only DNS server. */ - -#ifndef ISC_SOCKET_FDSETSIZE -#define ISC_SOCKET_FDSETSIZE FD_SETSIZE -#endif - +#ifndef ISC_SOCKET_MAXSOCKETS +#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) +#define ISC_SOCKET_MAXSOCKETS 4096 +#elif defined(USE_SELECT) +#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE +#endif /* USE_KQUEUE... */ +#endif /* ISC_SOCKET_MAXSOCKETS */ + +#ifdef USE_SELECT /*% - * Mac OS X needs a special definition to support larger values in select() + * Mac OS X needs a special definition to support larger values in select(). + * We always define this because a larger value can be specified run-time. */ -#if ISC_SOCKET_FDSETSIZE > FD_SETSIZE #ifdef __APPLE__ #define _DARWIN_UNLIMITED_SELECT #endif /* __APPLE__ */ +#endif /* USE_SELECT */ + +#ifdef ISC_SOCKET_USE_POLLWATCH +/*% + * If this macro is defined, enable workaround for a Solaris /dev/poll kernel + * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for + * some of the specified FD. The idea is based on the observation that it's + * likely for a busy server to keep receiving packets. It specifically works + * as follows: the socket watcher is first initialized with the state of + * "poll_idle". While it's in the idle state it keeps sleeping until a socket + * event occurs. When it wakes up for a socket I/O event, it moves to the + * poll_active state, and sets the poll timeout to a short period + * (ISC_SOCKET_POLLWATCH_TIMEOUT msec). If timeout occurs in this state, the + * watcher goes to the poll_checking state with the same timeout period. + * In this state, the watcher tries to detect whether this is a break + * during intermittent events or the kernel bug is triggered. If the next + * polling reports an event within the short period, the previous timeout is + * likely to be a kernel bug, and so the watcher goes back to the active state. + * Otherwise, it moves to the idle state again. + * + * It's not clear whether this is a thread-related bug, but since we've only + * seen this with threads, this workaround is used only when enabling threads. + */ + +typedef enum { poll_idle, poll_active, poll_checking } pollstate_t; + +#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT +#define ISC_SOCKET_POLLWATCH_TIMEOUT 10 +#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */ +#endif /* ISC_SOCKET_USE_POLLWATCH */ + +/*% + * Size of per-FD lock buckets. + */ +#ifdef ISC_PLATFORM_USETHREADS +#define FDLOCK_COUNT 1024 +#define FDLOCK_ID(fd) ((fd) % FDLOCK_COUNT) +#else +#define FDLOCK_COUNT 1 +#define FDLOCK_ID(fd) 0 +#endif /* ISC_PLATFORM_USETHREADS */ + +/*% + * Maximum number of events communicated with the kernel. There should normally + * be no need for having a large number. + */ +#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) +#ifndef ISC_SOCKET_MAXEVENTS +#define ISC_SOCKET_MAXEVENTS 64 +#endif #endif /*% @@ -230,22 +331,50 @@ struct isc_socketmgr { unsigned int magic; isc_mem_t *mctx; isc_mutex_t lock; + isc_mutex_t *fdlock; +#ifdef USE_KQUEUE + int kqueue_fd; + int nevents; + struct kevent *events; +#endif /* USE_KQUEUE */ +#ifdef USE_EPOLL + int epoll_fd; + int nevents; + struct epoll_event *events; +#endif /* USE_EPOLL */ +#ifdef USE_DEVPOLL + int devpoll_fd; + int nevents; + struct pollfd *events; +#endif /* USE_DEVPOLL */ +#ifdef USE_SELECT int fd_bufsize; - int fdsize; +#endif /* USE_SELECT */ + unsigned int maxsocks; +#ifdef ISC_PLATFORM_USETHREADS + int pipe_fds[2]; +#endif + + /* Locked by fdlock. */ + isc_socket_t **fds; + int *fdstate; +#ifdef USE_DEVPOLL + pollinfo_t *fdpollinfo; +#endif + /* Locked by manager lock. */ ISC_LIST(isc_socket_t) socklist; +#ifdef USE_SELECT fd_set *read_fds; fd_set *read_fds_copy; fd_set *write_fds; fd_set *write_fds_copy; - isc_socket_t **fds; - int *fdstate; int maxfd; - int reserved; /* unlocked */ +#endif /* USE_SELECT */ + int reserved; /* unlocked */ #ifdef ISC_PLATFORM_USETHREADS isc_thread_t watcher; isc_condition_t shutdown_ok; - int pipe_fds[2]; #else /* ISC_PLATFORM_USETHREADS */ unsigned int refs; #endif /* ISC_PLATFORM_USETHREADS */ @@ -284,8 +413,9 @@ static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *, struct msghdr *, struct iovec *, size_t *); static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *, struct msghdr *, struct iovec *, size_t *); -static void cleanup_fdsets(isc_socketmgr_t *, isc_mem_t *); -static isc_result_t create_fdsets(isc_socketmgr_t *, isc_mem_t *); +#ifdef ISC_PLATFORM_USETHREADS +static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager); +#endif #define SELECT_POKE_SHUTDOWN (-1) #define SELECT_POKE_NOTHING (-2) @@ -354,9 +484,195 @@ socket_log(isc_socket_t *sock, isc_sockaddr_t *address, } } +#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \ + defined(USE_CMSG) && defined(IPV6_RECVPKTINFO) +/* + * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by + * setting IPV6_V6ONLY. + */ +static void +FIX_IPV6_RECVPKTINFO(isc_socket_t *sock) +{ + char strbuf[ISC_STRERRORSIZE]; + int on = 1; + + if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp) + return; + + if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, + (void *)&on, sizeof(on)) < 0) { + + UNEXPECTED_ERROR(__FILE__, __LINE__, + "setsockopt(%d, IPV6_RECVPKTINFO) " + "%s: %s", sock->fd, + isc_msgcat_get(isc_msgcat, + ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, + "failed"), + strbuf); + } +} +#else +#define FIX_IPV6_RECVPKTINFO(sock) (void)0 +#endif + +static inline isc_result_t +watch_fd(isc_socketmgr_t *manager, int fd, int msg) { + isc_result_t result = ISC_R_SUCCESS; + +#ifdef USE_KQUEUE + struct kevent evchange; + + memset(&evchange, 0, sizeof(evchange)); + if (msg == SELECT_POKE_READ) + evchange.filter = EVFILT_READ; + else + evchange.filter = EVFILT_WRITE; + evchange.flags = EV_ADD; + evchange.ident = fd; + if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) + result = isc__errno2result(errno); + + return (result); +#elif defined(USE_EPOLL) + struct epoll_event event; + + if (msg == SELECT_POKE_READ) + event.events = EPOLLIN; + else + event.events = EPOLLOUT; + event.data.fd = fd; + if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 && + errno != EEXIST) { + result = isc__errno2result(errno); + } + + return (result); +#elif defined(USE_DEVPOLL) + struct pollfd pfd; + int lockid = FDLOCK_ID(fd); + + memset(&pfd, 0, sizeof(pfd)); + if (msg == SELECT_POKE_READ) + pfd.events = POLLIN; + else + pfd.events = POLLOUT; + pfd.fd = fd; + pfd.revents = 0; + LOCK(&manager->fdlock[lockid]); + if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1) + result = isc__errno2result(errno); + else { + if (msg == SELECT_POKE_READ) + manager->fdpollinfo[fd].want_read = 1; + else + manager->fdpollinfo[fd].want_write = 1; + } + UNLOCK(&manager->fdlock[lockid]); + + return (result); +#elif defined(USE_SELECT) + LOCK(&manager->lock); + if (msg == SELECT_POKE_READ) + FD_SET(fd, manager->read_fds); + if (msg == SELECT_POKE_WRITE) + FD_SET(fd, manager->write_fds); + UNLOCK(&manager->lock); + + return (result); +#endif +} + +static inline isc_result_t +unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) { + isc_result_t result = ISC_R_SUCCESS; + +#ifdef USE_KQUEUE + struct kevent evchange; + + memset(&evchange, 0, sizeof(evchange)); + if (msg == SELECT_POKE_READ) + evchange.filter = EVFILT_READ; + else + evchange.filter = EVFILT_WRITE; + evchange.flags = EV_DELETE; + evchange.ident = fd; + if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) + result = isc__errno2result(errno); + + return (result); +#elif defined(USE_EPOLL) + struct epoll_event event; + + if (msg == SELECT_POKE_READ) + event.events = EPOLLIN; + else + event.events = EPOLLOUT; + event.data.fd = fd; + if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 && + errno != ENOENT) { + char strbuf[ISC_STRERRORSIZE]; + isc__strerror(errno, strbuf, sizeof(strbuf)); + UNEXPECTED_ERROR(__FILE__, __LINE__, + "epoll_ctl(DEL), %d: %s", fd, strbuf); + result = ISC_R_UNEXPECTED; + } + return (result); +#elif defined(USE_DEVPOLL) + struct pollfd pfds[2]; + size_t writelen = sizeof(pfds[0]); + int lockid = FDLOCK_ID(fd); + + memset(pfds, 0, sizeof(pfds)); + pfds[0].events = POLLREMOVE; + pfds[0].fd = fd; + + /* + * Canceling read or write polling via /dev/poll is tricky. Since it + * only provides a way of canceling per FD, we may need to re-poll the + * socket for the other operation. + */ + LOCK(&manager->fdlock[lockid]); + if (msg == SELECT_POKE_READ && + manager->fdpollinfo[fd].want_write == 1) { + pfds[1].events = POLLOUT; + pfds[1].fd = fd; + writelen += sizeof(pfds[1]); + } + if (msg == SELECT_POKE_WRITE && + manager->fdpollinfo[fd].want_read == 1) { + pfds[1].events = POLLIN; + pfds[1].fd = fd; + writelen += sizeof(pfds[1]); + } + + if (write(manager->devpoll_fd, pfds, writelen) == -1) + result = isc__errno2result(errno); + else { + if (msg == SELECT_POKE_READ) + manager->fdpollinfo[fd].want_read = 0; + else + manager->fdpollinfo[fd].want_write = 0; + } + UNLOCK(&manager->fdlock[lockid]); + + return (result); +#elif defined(USE_SELECT) + LOCK(&manager->lock); + if (msg == SELECT_POKE_READ) + FD_CLR(fd, manager->read_fds); + else if (msg == SELECT_POKE_WRITE) + FD_CLR(fd, manager->write_fds); + UNLOCK(&manager->lock); + + return (result); +#endif +} + static void wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) { - isc_socket_t *sock; + isc_result_t result; + int lockid = FDLOCK_ID(fd); /* * This is a wakeup on a socket. If the socket is not in the @@ -364,27 +680,54 @@ wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) { * or writes. */ - INSIST(fd >= 0 && fd < manager->fdsize); + INSIST(fd >= 0 && fd < (int)manager->maxsocks); - if (manager->fdstate[fd] == CLOSE_PENDING) { + if (msg == SELECT_POKE_CLOSE) { + /* No one should be updating fdstate, so no need to lock it */ + INSIST(manager->fdstate[fd] == CLOSE_PENDING); manager->fdstate[fd] = CLOSED; - FD_CLR(fd, manager->read_fds); - FD_CLR(fd, manager->write_fds); + (void)unwatch_fd(manager, fd, SELECT_POKE_READ); + (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); (void)close(fd); return; } - if (manager->fdstate[fd] != MANAGED) - return; - sock = manager->fds[fd]; + LOCK(&manager->fdlock[lockid]); + if (manager->fdstate[fd] == CLOSE_PENDING) { + UNLOCK(&manager->fdlock[lockid]); + /* + * We accept (and ignore) any error from unwatch_fd() as we are + * closing the socket, hoping it doesn't leave dangling state in + * the kernel. + * Note that unwatch_fd() must be called after releasing the + * fdlock; otherwise it could cause deadlock due to a lock order + * reversal. + */ + (void)unwatch_fd(manager, fd, SELECT_POKE_READ); + (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); + return; + } + if (manager->fdstate[fd] != MANAGED) { + UNLOCK(&manager->fdlock[lockid]); + return; + } + UNLOCK(&manager->fdlock[lockid]); /* * Set requested bit. */ - if (msg == SELECT_POKE_READ) - FD_SET(sock->fd, manager->read_fds); - if (msg == SELECT_POKE_WRITE) - FD_SET(sock->fd, manager->write_fds); + result = watch_fd(manager, fd, msg); + if (result != ISC_R_SUCCESS) { + /* + * XXXJT: what should we do? Ignoring the failure of watching + * a socket will make the application dysfunctional, but there + * seems to be no reasonable recovery process. + */ + isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, + "failed to start watching FD (%d): %s", + fd, isc_result_totext(result)); + } } #ifdef ISC_PLATFORM_USETHREADS @@ -452,7 +795,7 @@ select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) { "read() failed " "during watcher poke: %s"), strbuf); - + return; } INSIST(cc == sizeof(buf)); @@ -557,7 +900,7 @@ cmsg_space(ISC_SOCKADDR_LEN_T len) { return ((char *)cmsgp - (char *)msg.msg_control); else return (0); -#endif +#endif } #endif /* USE_CMSG */ @@ -631,7 +974,7 @@ process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) { "interface received on ifindex %u", dev->pktinfo.ipi6_ifindex); if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) - dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST; + dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST; goto next; } #endif @@ -679,7 +1022,7 @@ build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev, memset(msg, 0, sizeof(*msg)); - if (sock->type == isc_sockettype_udp) { + if (!sock->connected) { msg->msg_name = (void *)&dev->address.type.sa; msg->msg_namelen = dev->address.length; } else { @@ -964,15 +1307,17 @@ dump_msg(struct msghdr *msg) { unsigned int i; printf("MSGHDR %p\n", msg); - printf("\tname %p, namelen %d\n", msg->msg_name, msg->msg_namelen); - printf("\tiov %p, iovlen %d\n", msg->msg_iov, msg->msg_iovlen); + printf("\tname %p, namelen %ld\n", msg->msg_name, + (long) msg->msg_namelen); + printf("\tiov %p, iovlen %ld\n", msg->msg_iov, + (long) msg->msg_iovlen); for (i = 0; i < (unsigned int)msg->msg_iovlen; i++) - printf("\t\t%d\tbase %p, len %d\n", i, + printf("\t\t%d\tbase %p, len %ld\n", i, msg->msg_iov[i].iov_base, - msg->msg_iov[i].iov_len); + (long) msg->msg_iov[i].iov_len); #ifdef ISC_NET_BSD44MSGHDR - printf("\tcontrol %p, controllen %d\n", msg->msg_control, - msg->msg_controllen); + printf("\tcontrol %p, controllen %ld\n", msg->msg_control, + (long) msg->msg_controllen); #endif } #endif @@ -1014,7 +1359,7 @@ doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) { isc__strerror(recv_errno, strbuf, sizeof(strbuf)); socket_log(sock, NULL, IOEVENT, isc_msgcat, ISC_MSGSET_SOCKET, - ISC_MSG_DOIORECV, + ISC_MSG_DOIORECV, "doio_recv: recvmsg(%d) %d bytes, err %d/%s", sock->fd, cc, recv_errno, strbuf); } @@ -1040,6 +1385,14 @@ doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) { /* HPUX 11.11 can return EADDRNOTAVAIL. */ SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES); + /* + * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6 + * errors. + */ +#ifdef EPROTO + SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH); +#endif + SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH); #undef SOFT_OR_HARD #undef ALWAYS_HARD @@ -1062,7 +1415,7 @@ doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) { if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) { socket_log(sock, &dev->address, IOEVENT, isc_msgcat, ISC_MSGSET_SOCKET, - ISC_MSG_ZEROPORT, + ISC_MSG_ZEROPORT, "dropping source port zero packet"); } return (DOIO_SOFT); @@ -1245,7 +1598,54 @@ doio_send(isc_socket_t *sock, isc_socketevent_t *dev) { * references exist. */ static void +closesocket(isc_socketmgr_t *manager, isc_sockettype_t type, int fd) { + int lockid = FDLOCK_ID(fd); + + UNUSED(type); + + /* + * No one has this socket open, so the watcher doesn't have to be + * poked, and the socket doesn't have to be locked. + */ + LOCK(&manager->fdlock[lockid]); + manager->fds[fd] = NULL; + manager->fdstate[fd] = CLOSE_PENDING; + UNLOCK(&manager->fdlock[lockid]); + select_poke(manager, fd, SELECT_POKE_CLOSE); + + /* + * update manager->maxfd here (XXX: this should be implemented more + * efficiently) + */ +#ifdef USE_SELECT + LOCK(&manager->lock); + if (manager->maxfd == fd) { + int i; + + manager->maxfd = 0; + for (i = fd - 1; i >= 0; i--) { + lockid = FDLOCK_ID(i); + + LOCK(&manager->fdlock[lockid]); + if (manager->fdstate[i] == MANAGED) { + manager->maxfd = i; + UNLOCK(&manager->fdlock[lockid]); + break; + } + UNLOCK(&manager->fdlock[lockid]); + } +#ifdef ISC_PLATFORM_USETHREADS + if (manager->maxfd < manager->pipe_fds[0]) + manager->maxfd = manager->pipe_fds[0]; +#endif + } + UNLOCK(&manager->lock); +#endif /* USE_SELECT */ +} + +static void destroy(isc_socket_t **sockp) { + int fd; isc_socket_t *sock = *sockp; isc_socketmgr_t *manager = sock->manager; @@ -1256,17 +1656,16 @@ destroy(isc_socket_t **sockp) { INSIST(ISC_LIST_EMPTY(sock->recv_list)); INSIST(ISC_LIST_EMPTY(sock->send_list)); INSIST(sock->connect_ev == NULL); - REQUIRE(sock->fd >= 0 && sock->fd < (int)manager->fdsize); + REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks); + + if (sock->fd >= 0) { + fd = sock->fd; + sock->fd = -1; + closesocket(manager, sock->type, fd); + } LOCK(&manager->lock); - /* - * No one has this socket open, so the watcher doesn't have to be - * poked, and the socket doesn't have to be locked. - */ - manager->fds[sock->fd] = NULL; - manager->fdstate[sock->fd] = CLOSE_PENDING; - select_poke(manager, sock->fd, SELECT_POKE_CLOSE); ISC_LIST_UNLINK(manager->socklist, sock, link); #ifdef ISC_PLATFORM_USETHREADS @@ -1274,10 +1673,6 @@ destroy(isc_socket_t **sockp) { SIGNAL(&manager->shutdown_ok); #endif /* ISC_PLATFORM_USETHREADS */ - /* - * XXX should reset manager->maxfd here - */ - UNLOCK(&manager->lock); free_socket(sockp); @@ -1465,18 +1860,11 @@ clear_bsdcompat(void) { } #endif -/*% - * Create a new 'type' socket managed by 'manager'. Events - * will be posted to 'task' and when dispatched 'action' will be - * called with 'arg' as the arg value. The new socket is returned - * in 'socketp'. - */ -isc_result_t -isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, - isc_socket_t **socketp) -{ - isc_socket_t *sock = NULL; - isc_result_t result; +static isc_result_t +opensocket(isc_socketmgr_t *manager, isc_socket_t *sock) { + char strbuf[ISC_STRERRORSIZE]; + const char *err = "socket"; + int tries = 0; #if defined(USE_CMSG) || defined(SO_BSDCOMPAT) int on = 1; #endif @@ -1484,38 +1872,27 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, ISC_SOCKADDR_LEN_T optlen; int size; #endif - char strbuf[ISC_STRERRORSIZE]; - const char *err = "socket"; - int try = 0; - REQUIRE(VALID_MANAGER(manager)); - REQUIRE(socketp != NULL && *socketp == NULL); - - result = allocate_socket(manager, type, &sock); - if (result != ISC_R_SUCCESS) - return (result); - - sock->pf = pf; again: - switch (type) { + switch (sock->type) { case isc_sockettype_udp: - sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP); + sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP); break; case isc_sockettype_tcp: - sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP); + sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP); break; case isc_sockettype_unix: - sock->fd = socket(pf, SOCK_STREAM, 0); + sock->fd = socket(sock->pf, SOCK_STREAM, 0); break; } - if (sock->fd == -1 && errno == EINTR && try++ < 42) + if (sock->fd == -1 && errno == EINTR && tries++ < 42) goto again; #ifdef F_DUPFD /* * Leave a space for stdio and TCP to work in. */ - if (manager->reserved != 0 && type == isc_sockettype_udp && + if (manager->reserved != 0 && sock->type == isc_sockettype_udp && sock->fd >= 0 && sock->fd < manager->reserved) { int new, tmp; new = fcntl(sock->fd, F_DUPFD, manager->reserved); @@ -1535,20 +1912,18 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, } #endif - if (sock->fd >= (int)manager->fdsize) { + if (sock->fd >= (int)manager->maxsocks) { (void)close(sock->fd); isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_TOOMANYFDS, - "%s: too many open file descriptors", "socket"); - free_socket(&sock); + "socket: file descriptor exceeds limit (%d/%u)", + sock->fd, manager->maxsocks); return (ISC_R_NORESOURCES); } - - if (sock->fd < 0) { - free_socket(&sock); + if (sock->fd < 0) { switch (errno) { case EMFILE: case ENFILE: @@ -1580,14 +1955,13 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, if (make_nonblock(sock->fd) != ISC_R_SUCCESS) { (void)close(sock->fd); - free_socket(&sock); return (ISC_R_UNEXPECTED); } #ifdef SO_BSDCOMPAT RUNTIME_CHECK(isc_once_do(&bsdcompat_once, clear_bsdcompat) == ISC_R_SUCCESS); - if (type != isc_sockettype_unix && bsdcompat && + if (sock->type != isc_sockettype_unix && bsdcompat && setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT, (void *)&on, sizeof(on)) < 0) { isc__strerror(errno, strbuf, sizeof(strbuf)); @@ -1601,8 +1975,22 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, } #endif +#ifdef SO_NOSIGPIPE + if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE, + (void *)&on, sizeof(on)) < 0) { + isc__strerror(errno, strbuf, sizeof(strbuf)); + UNEXPECTED_ERROR(__FILE__, __LINE__, + "setsockopt(%d, SO_NOSIGPIPE) %s: %s", + sock->fd, + isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, "failed"), + strbuf); + /* Press on... */ + } +#endif + #if defined(USE_CMSG) || defined(SO_RCVBUF) - if (type == isc_sockettype_udp) { + if (sock->type == isc_sockettype_udp) { #if defined(USE_CMSG) #if defined(SO_TIMESTAMP) @@ -1612,7 +2000,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, isc__strerror(errno, strbuf, sizeof(strbuf)); UNEXPECTED_ERROR(__FILE__, __LINE__, "setsockopt(%d, SO_TIMESTAMP) %s: %s", - sock->fd, + sock->fd, isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_FAILED, @@ -1623,7 +2011,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, #endif /* SO_TIMESTAMP */ #if defined(ISC_PLATFORM_HAVEIPV6) - if (pf == AF_INET6 && sock->recvcmsgbuflen == 0U) { + if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) { /* * Warn explicitly because this anomaly can be hidden * in usual operation (and unexpectedly appear later). @@ -1635,7 +2023,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, #ifdef ISC_PLATFORM_HAVEIN6PKTINFO #ifdef IPV6_RECVPKTINFO /* RFC 3542 */ - if ((pf == AF_INET6) + if ((sock->pf == AF_INET6) && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, (void *)&on, sizeof(on)) < 0)) { isc__strerror(errno, strbuf, sizeof(strbuf)); @@ -1650,7 +2038,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, } #else /* RFC 2292 */ - if ((pf == AF_INET6) + if ((sock->pf == AF_INET6) && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO, (void *)&on, sizeof(on)) < 0)) { isc__strerror(errno, strbuf, sizeof(strbuf)); @@ -1667,7 +2055,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */ #ifdef IPV6_USE_MIN_MTU /* RFC 3542, not too common yet*/ /* use minimum MTU */ - if (pf == AF_INET6) { + if (sock->pf == AF_INET6) { (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU, (void *)&on, sizeof(on)); @@ -1676,6 +2064,27 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, #endif /* ISC_PLATFORM_HAVEIPV6 */ #endif /* defined(USE_CMSG) */ +#if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT) + /* + * Turn off Path MTU discovery on IPv4/UDP sockets. + */ + if (sock->pf == AF_INET) { + int action = IP_PMTUDISC_DONT; + (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER, + &action, sizeof(action)); + } +#endif +#if defined(IP_DONTFRAG) + /* + * Turn off Path MTU discovery on IPv4/UDP sockets. + */ + if (sock->pf == AF_INET) { + int off = 0; + (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG, + &off, sizeof(off)); + } +#endif + #if defined(SO_RCVBUF) optlen = sizeof(size); if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, @@ -1699,22 +2108,61 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, } #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */ + return (ISC_R_SUCCESS); +} + +/*% + * Create a new 'type' socket managed by 'manager'. Events + * will be posted to 'task' and when dispatched 'action' will be + * called with 'arg' as the arg value. The new socket is returned + * in 'socketp'. + */ +isc_result_t +isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, + isc_socket_t **socketp) +{ + isc_socket_t *sock = NULL; + isc_result_t result; + int lockid; + + REQUIRE(VALID_MANAGER(manager)); + REQUIRE(socketp != NULL && *socketp == NULL); + + result = allocate_socket(manager, type, &sock); + if (result != ISC_R_SUCCESS) + return (result); + + sock->pf = pf; + result = opensocket(manager, sock); + if (result != ISC_R_SUCCESS) { + free_socket(&sock); + return (result); + } + sock->references = 1; *socketp = sock; - LOCK(&manager->lock); - /* * Note we don't have to lock the socket like we normally would because * there are no external references to it yet. */ + lockid = FDLOCK_ID(sock->fd); + LOCK(&manager->fdlock[lockid]); manager->fds[sock->fd] = sock; manager->fdstate[sock->fd] = MANAGED; +#ifdef USE_DEVPOLL + INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 && + sock->manager->fdpollinfo[sock->fd].want_write == 0); +#endif + UNLOCK(&manager->fdlock[lockid]); + + LOCK(&manager->lock); ISC_LIST_APPEND(manager->socklist, sock, link); +#ifdef USE_SELECT if (manager->maxfd < sock->fd) manager->maxfd = sock->fd; - +#endif UNLOCK(&manager->lock); socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, @@ -1723,6 +2171,48 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, return (ISC_R_SUCCESS); } +isc_result_t +isc_socket_open(isc_socket_t *sock) { + isc_result_t result; + + REQUIRE(VALID_SOCKET(sock)); + + LOCK(&sock->lock); + REQUIRE(sock->references == 1); + UNLOCK(&sock->lock); + /* + * We don't need to retain the lock hereafter, since no one else has + * this socket. + */ + REQUIRE(sock->fd == -1); + + result = opensocket(sock->manager, sock); + if (result != ISC_R_SUCCESS) + sock->fd = -1; + + if (result == ISC_R_SUCCESS) { + int lockid = FDLOCK_ID(sock->fd); + + LOCK(&sock->manager->fdlock[lockid]); + sock->manager->fds[sock->fd] = sock; + sock->manager->fdstate[sock->fd] = MANAGED; +#ifdef USE_DEVPOLL + INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 && + sock->manager->fdpollinfo[sock->fd].want_write == 0); +#endif + UNLOCK(&sock->manager->fdlock[lockid]); + +#ifdef USE_SELECT + LOCK(&sock->manager->lock); + if (sock->manager->maxfd < sock->fd) + sock->manager->maxfd = sock->fd; + UNLOCK(&sock->manager->lock); +#endif + } + + return (result); +} + /* * Attach to a socket. Caller must explicitly detach when it is done. */ @@ -1764,6 +2254,44 @@ isc_socket_detach(isc_socket_t **socketp) { *socketp = NULL; } +isc_result_t +isc_socket_close(isc_socket_t *sock) { + int fd; + + REQUIRE(VALID_SOCKET(sock)); + + LOCK(&sock->lock); + REQUIRE(sock->references == 1); + UNLOCK(&sock->lock); + /* + * We don't need to retain the lock hereafter, since no one else has + * this socket. + */ + + REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks); + + INSIST(!sock->connecting); + INSIST(!sock->pending_recv); + INSIST(!sock->pending_send); + INSIST(!sock->pending_accept); + INSIST(ISC_LIST_EMPTY(sock->recv_list)); + INSIST(ISC_LIST_EMPTY(sock->send_list)); + INSIST(ISC_LIST_EMPTY(sock->accept_list)); + INSIST(sock->connect_ev == NULL); + + fd = sock->fd; + sock->fd = -1; + sock->listener = 0; + sock->connected = 0; + sock->connecting = 0; + sock->bound = 0; + isc_sockaddr_any(&sock->address); + + closesocket(sock->manager, sock->type, fd); + + return (ISC_R_SUCCESS); +} + /* * I/O is possible on a given socket. Schedule an event to this task that * will call an internal function to do the I/O. This will charge the @@ -1993,7 +2521,7 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { */ addrlen = sizeof(dev->newsocket->address.type); - memset(&dev->newsocket->address.type.sa, 0, addrlen); + memset(&dev->newsocket->address.type, 0, addrlen); fd = accept(sock->fd, &dev->newsocket->address.type.sa, (void *)&addrlen); @@ -2070,19 +2598,20 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_accept(): " "accept() returned peer address " - "family %u (expected %u)", + "family %u (expected %u)", dev->newsocket->address. type.sa.sa_family, sock->pf); (void)close(fd); goto soft_error; - } else if (fd >= (int)manager->fdsize) { + } else if (fd >= (int)manager->maxsocks) { isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_TOOMANYFDS, - "%s: too many open file descriptors", - "accept"); + "accept: " + "file descriptor exceeds limit (%d/%u)", + fd, manager->maxsocks); (void)close(fd); goto soft_error; } @@ -2116,6 +2645,13 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { * -1 means the new socket didn't happen. */ if (fd != -1) { + int lockid = FDLOCK_ID(fd); + + LOCK(&manager->fdlock[lockid]); + manager->fds[fd] = dev->newsocket; + manager->fdstate[fd] = MANAGED; + UNLOCK(&manager->fdlock[lockid]); + LOCK(&manager->lock); ISC_LIST_APPEND(manager->socklist, dev->newsocket, link); @@ -2128,10 +2664,10 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { */ dev->address = dev->newsocket->address; - manager->fds[fd] = dev->newsocket; - manager->fdstate[fd] = MANAGED; +#ifdef USE_SELECT if (manager->maxfd < fd) manager->maxfd = fd; +#endif socket_log(sock, &dev->newsocket->address, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN, @@ -2143,7 +2679,7 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { dev->newsocket->references--; free_socket(&dev->newsocket); } - + /* * Fill in the done event details and send it off. */ @@ -2280,77 +2816,256 @@ internal_send(isc_task_t *me, isc_event_t *ev) { UNLOCK(&sock->lock); } +/* + * Process read/writes on each fd here. Avoid locking + * and unlocking twice if both reads and writes are possible. + */ static void -process_fds(isc_socketmgr_t *manager, int maxfd, - fd_set *readfds, fd_set *writefds) +process_fd(isc_socketmgr_t *manager, int fd, isc_boolean_t readable, + isc_boolean_t writeable) { - int i; isc_socket_t *sock; isc_boolean_t unlock_sock; - - REQUIRE(maxfd <= (int)manager->fdsize); + int lockid = FDLOCK_ID(fd); /* - * Process read/writes on other fds here. Avoid locking - * and unlocking twice if both reads and writes are possible. + * If the socket is going to be closed, don't do more I/O. */ - for (i = 0; i < maxfd; i++) { + LOCK(&manager->fdlock[lockid]); + if (manager->fdstate[fd] == CLOSE_PENDING) { + UNLOCK(&manager->fdlock[lockid]); + + (void)unwatch_fd(manager, fd, SELECT_POKE_READ); + (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); + return; + } + + sock = manager->fds[fd]; + UNLOCK(&manager->fdlock[lockid]); + unlock_sock = ISC_FALSE; + if (readable) { + if (sock == NULL) { + (void)unwatch_fd(manager, fd, SELECT_POKE_READ); + goto check_write; + } + unlock_sock = ISC_TRUE; + LOCK(&sock->lock); + if (!SOCK_DEAD(sock)) { + if (sock->listener) + dispatch_accept(sock); + else + dispatch_recv(sock); + } + (void)unwatch_fd(manager, fd, SELECT_POKE_READ); + } +check_write: + if (writeable) { + if (sock == NULL) { + (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); + return; + } + if (!unlock_sock) { + unlock_sock = ISC_TRUE; + LOCK(&sock->lock); + } + if (!SOCK_DEAD(sock)) { + if (sock->connecting) + dispatch_connect(sock); + else + dispatch_send(sock); + } + (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE); + } + if (unlock_sock) + UNLOCK(&sock->lock); +} + +#ifdef USE_KQUEUE +static isc_boolean_t +process_fds(isc_socketmgr_t *manager, struct kevent *events, int nevents) { + int i; + isc_boolean_t readable, writable; + isc_boolean_t done = ISC_FALSE; #ifdef ISC_PLATFORM_USETHREADS - if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1]) + isc_boolean_t have_ctlevent = ISC_FALSE; +#endif + + if (nevents == manager->nevents) { + /* + * This is not an error, but something unexpected. If this + * happens, it may indicate the need for increasing + * ISC_SOCKET_MAXEVENTS. + */ + manager_log(manager, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, + "maximum number of FD events (%d) received", + nevents); + } + + for (i = 0; i < nevents; i++) { + REQUIRE(events[i].ident < manager->maxsocks); +#ifdef ISC_PLATFORM_USETHREADS + if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) { + have_ctlevent = ISC_TRUE; continue; -#endif /* ISC_PLATFORM_USETHREADS */ + } +#endif + readable = ISC_TF(events[i].filter == EVFILT_READ); + writable = ISC_TF(events[i].filter == EVFILT_WRITE); + process_fd(manager, events[i].ident, readable, writable); + } - if (manager->fdstate[i] == CLOSE_PENDING) { - manager->fdstate[i] = CLOSED; - FD_CLR(i, manager->read_fds); - FD_CLR(i, manager->write_fds); +#ifdef ISC_PLATFORM_USETHREADS + if (have_ctlevent) + done = process_ctlfd(manager); +#endif - (void)close(i); + return (done); +} +#elif defined(USE_EPOLL) +static isc_boolean_t +process_fds(isc_socketmgr_t *manager, struct epoll_event *events, int nevents) { + int i; + isc_boolean_t done = ISC_FALSE; +#ifdef ISC_PLATFORM_USETHREADS + isc_boolean_t have_ctlevent = ISC_FALSE; +#endif + + if (nevents == manager->nevents) { + manager_log(manager, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, + "maximum number of FD events (%d) received", + nevents); + } + for (i = 0; i < nevents; i++) { + REQUIRE(events[i].data.fd < (int)manager->maxsocks); +#ifdef ISC_PLATFORM_USETHREADS + if (events[i].data.fd == manager->pipe_fds[0]) { + have_ctlevent = ISC_TRUE; continue; } - - sock = manager->fds[i]; - unlock_sock = ISC_FALSE; - if (FD_ISSET(i, readfds)) { - if (sock == NULL) { - FD_CLR(i, manager->read_fds); - goto check_write; - } - unlock_sock = ISC_TRUE; - LOCK(&sock->lock); - if (!SOCK_DEAD(sock)) { - if (sock->listener) - dispatch_accept(sock); - else - dispatch_recv(sock); - } - FD_CLR(i, manager->read_fds); +#endif + if ((events[i].events & EPOLLERR) != 0 || + (events[i].events & EPOLLHUP) != 0) { + /* + * epoll does not set IN/OUT bits on an erroneous + * condition, so we need to try both anyway. This is a + * bit inefficient, but should be okay for such rare + * events. Note also that the read or write attempt + * won't block because we use non-blocking sockets. + */ + events[i].events |= (EPOLLIN | EPOLLOUT); } - check_write: - if (FD_ISSET(i, writefds)) { - if (sock == NULL) { - FD_CLR(i, manager->write_fds); - continue; - } - if (!unlock_sock) { - unlock_sock = ISC_TRUE; - LOCK(&sock->lock); - } - if (!SOCK_DEAD(sock)) { - if (sock->connecting) - dispatch_connect(sock); - else - dispatch_send(sock); - } - FD_CLR(i, manager->write_fds); + process_fd(manager, events[i].data.fd, + (events[i].events & EPOLLIN) != 0, + (events[i].events & EPOLLOUT) != 0); + } + +#ifdef ISC_PLATFORM_USETHREADS + if (have_ctlevent) + done = process_ctlfd(manager); +#endif + + return (done); +} +#elif defined(USE_DEVPOLL) +static isc_boolean_t +process_fds(isc_socketmgr_t *manager, struct pollfd *events, int nevents) { + int i; + isc_boolean_t done = ISC_FALSE; +#ifdef ISC_PLATFORM_USETHREADS + isc_boolean_t have_ctlevent = ISC_FALSE; +#endif + + if (nevents == manager->nevents) { + manager_log(manager, ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_SOCKET, ISC_LOG_INFO, + "maximum number of FD events (%d) received", + nevents); + } + + for (i = 0; i < nevents; i++) { + REQUIRE(events[i].fd < (int)manager->maxsocks); +#ifdef ISC_PLATFORM_USETHREADS + if (events[i].fd == manager->pipe_fds[0]) { + have_ctlevent = ISC_TRUE; + continue; } - if (unlock_sock) - UNLOCK(&sock->lock); +#endif + process_fd(manager, events[i].fd, + (events[i].events & POLLIN) != 0, + (events[i].events & POLLOUT) != 0); + } + +#ifdef ISC_PLATFORM_USETHREADS + if (have_ctlevent) + done = process_ctlfd(manager); +#endif + + return (done); +} +#elif defined(USE_SELECT) +static void +process_fds(isc_socketmgr_t *manager, int maxfd, + fd_set *readfds, fd_set *writefds) +{ + int i; + + REQUIRE(maxfd <= (int)manager->maxsocks); + + for (i = 0; i < maxfd; i++) { +#ifdef ISC_PLATFORM_USETHREADS + if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1]) + continue; +#endif /* ISC_PLATFORM_USETHREADS */ + process_fd(manager, i, FD_ISSET(i, readfds), + FD_ISSET(i, writefds)); } } +#endif #ifdef ISC_PLATFORM_USETHREADS +static isc_boolean_t +process_ctlfd(isc_socketmgr_t *manager) { + int msg, fd; + + for (;;) { + select_readmsg(manager, &fd, &msg); + + manager_log(manager, IOEVENT, + isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, + ISC_MSG_WATCHERMSG, + "watcher got message %d " + "for socket %d"), msg, fd); + + /* + * Nothing to read? + */ + if (msg == SELECT_POKE_NOTHING) + break; + + /* + * Handle shutdown message. We really should + * jump out of this loop right away, but + * it doesn't matter if we have to do a little + * more work first. + */ + if (msg == SELECT_POKE_SHUTDOWN) + return (ISC_TRUE); + + /* + * This is a wakeup on a socket. Look + * at the event queue for both read and write, + * and decide if we need to watch on it now + * or not. + */ + wakeup_socket(manager, fd, msg); + } + + return (ISC_FALSE); +} + /* * This is the thread that will loop forever, always in a select or poll * call. @@ -2364,98 +3079,116 @@ watcher(void *uap) { isc_boolean_t done; int ctlfd; int cc; - int msg, fd; +#ifdef USE_KQUEUE + const char *fnname = "kevent()"; +#elif defined (USE_EPOLL) + const char *fnname = "epoll_wait()"; +#elif defined(USE_DEVPOLL) + const char *fnname = "ioctl(DP_POLL)"; + struct dvpoll dvp; +#elif defined (USE_SELECT) + const char *fnname = "select()"; int maxfd; +#endif char strbuf[ISC_STRERRORSIZE]; +#ifdef ISC_SOCKET_USE_POLLWATCH + pollstate_t pollstate = poll_idle; +#endif /* * Get the control fd here. This will never change. */ - LOCK(&manager->lock); ctlfd = manager->pipe_fds[0]; - done = ISC_FALSE; while (!done) { do { +#ifdef USE_KQUEUE + cc = kevent(manager->kqueue_fd, NULL, 0, + manager->events, manager->nevents, NULL); +#elif defined(USE_EPOLL) + cc = epoll_wait(manager->epoll_fd, manager->events, + manager->nevents, -1); +#elif defined(USE_DEVPOLL) + dvp.dp_fds = manager->events; + dvp.dp_nfds = manager->nevents; +#ifndef ISC_SOCKET_USE_POLLWATCH + dvp.dp_timeout = -1; +#else + if (pollstate == poll_idle) + dvp.dp_timeout = -1; + else + dvp.dp_timeout = ISC_SOCKET_POLLWATCH_TIMEOUT; +#endif /* ISC_SOCKET_USE_POLLWATCH */ + cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp); +#elif defined(USE_SELECT) + LOCK(&manager->lock); memcpy(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize); memcpy(manager->write_fds_copy, manager->write_fds, manager->fd_bufsize); maxfd = manager->maxfd + 1; - UNLOCK(&manager->lock); cc = select(maxfd, manager->read_fds_copy, manager->write_fds_copy, NULL, NULL); - if (cc < 0) { - if (!SOFT_ERROR(errno)) { - isc__strerror(errno, strbuf, - sizeof(strbuf)); - FATAL_ERROR(__FILE__, __LINE__, - "select() %s: %s", - isc_msgcat_get(isc_msgcat, - ISC_MSGSET_GENERAL, - ISC_MSG_FAILED, - "failed"), - strbuf); - } +#endif /* USE_KQUEUE */ + + if (cc < 0 && !SOFT_ERROR(errno)) { + isc__strerror(errno, strbuf, sizeof(strbuf)); + FATAL_ERROR(__FILE__, __LINE__, + "%s %s: %s", fnname, + isc_msgcat_get(isc_msgcat, + ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, + "failed"), strbuf); } - LOCK(&manager->lock); +#if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH) + if (cc == 0) { + if (pollstate == poll_active) + pollstate = poll_checking; + else if (pollstate == poll_checking) + pollstate = poll_idle; + } else if (cc > 0) { + if (pollstate == poll_checking) { + /* + * XXX: We'd like to use a more + * verbose log level as it's actually an + * unexpected event, but the kernel bug + * reportedly happens pretty frequently + * (and it can also be a false positive) + * so it would be just too noisy. + */ + manager_log(manager, + ISC_LOGCATEGORY_GENERAL, + ISC_LOGMODULE_SOCKET, + ISC_LOG_DEBUG(1), + ISC_LOG_INFO, + "unexpected POLL timeout"); + } + pollstate = poll_active; + } +#endif } while (cc < 0); +#if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL) + done = process_fds(manager, manager->events, cc); +#elif defined(USE_SELECT) + process_fds(manager, maxfd, manager->read_fds_copy, + manager->write_fds_copy); /* * Process reads on internal, control fd. */ - if (FD_ISSET(ctlfd, manager->read_fds_copy)) { - for (;;) { - select_readmsg(manager, &fd, &msg); - - manager_log(manager, IOEVENT, - isc_msgcat_get(isc_msgcat, - ISC_MSGSET_SOCKET, - ISC_MSG_WATCHERMSG, - "watcher got message %d"), - msg); - - /* - * Nothing to read? - */ - if (msg == SELECT_POKE_NOTHING) - break; - - /* - * Handle shutdown message. We really should - * jump out of this loop right away, but - * it doesn't matter if we have to do a little - * more work first. - */ - if (msg == SELECT_POKE_SHUTDOWN) { - done = ISC_TRUE; - - break; - } - - /* - * This is a wakeup on a socket. Look - * at the event queue for both read and write, - * and decide if we need to watch on it now - * or not. - */ - wakeup_socket(manager, fd, msg); - } - } - - process_fds(manager, maxfd, manager->read_fds_copy, - manager->write_fds_copy); + if (FD_ISSET(ctlfd, manager->read_fds_copy)) + done = process_ctlfd(manager); +#endif } manager_log(manager, TRACE, isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_EXITING, "watcher exiting")); - UNLOCK(&manager->lock); return ((isc_threadresult_t)0); } #endif /* ISC_PLATFORM_USETHREADS */ @@ -2469,69 +3202,187 @@ isc__socketmgr_setreserved(isc_socketmgr_t *manager, isc_uint32_t reserved) { } /* - * Initialize fdsets in socketmgr structure. + * Create a new socket manager. */ + static isc_result_t -create_fdsets(isc_socketmgr_t *manager, isc_mem_t *mctx) { -#if ISC_SOCKET_FDSETSIZE > FD_SETSIZE - manager->fdsize = ISC_SOCKET_FDSETSIZE; - manager->fd_bufsize = howmany(ISC_SOCKET_FDSETSIZE, NFDBITS) * +setup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) { + isc_result_t result; + +#ifdef USE_KQUEUE + manager->nevents = ISC_SOCKET_MAXEVENTS; + manager->events = isc_mem_get(mctx, sizeof(struct kevent) * + manager->nevents); + if (manager->events == NULL) + return (ISC_R_NOMEMORY); + manager->kqueue_fd = kqueue(); + if (manager->kqueue_fd == -1) { + result = isc__errno2result(errno); + isc_mem_put(mctx, manager->events, + sizeof(struct kevent) * manager->nevents); + return (result); + } + +#ifdef ISC_PLATFORM_USETHREADS + result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); + if (result != ISC_R_SUCCESS) { + close(manager->kqueue_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct kevent) * manager->nevents); + return (result); + } +#endif /* ISC_PLATFORM_USETHREADS */ +#elif defined(USE_EPOLL) + manager->nevents = ISC_SOCKET_MAXEVENTS; + manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) * + manager->nevents); + if (manager->events == NULL) + return (ISC_R_NOMEMORY); + manager->epoll_fd = epoll_create(manager->nevents); + if (manager->epoll_fd == -1) { + result = isc__errno2result(errno); + isc_mem_put(mctx, manager->events, + sizeof(struct epoll_event) * manager->nevents); + return (result); + } +#ifdef ISC_PLATFORM_USETHREADS + result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); + if (result != ISC_R_SUCCESS) { + close(manager->epoll_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct epoll_event) * manager->nevents); + return (result); + } +#endif /* ISC_PLATFORM_USETHREADS */ +#elif defined(USE_DEVPOLL) + /* + * XXXJT: /dev/poll seems to reject large numbers of events, + * so we should be careful about redefining ISC_SOCKET_MAXEVENTS. + */ + manager->nevents = ISC_SOCKET_MAXEVENTS; + manager->events = isc_mem_get(mctx, sizeof(struct pollfd) * + manager->nevents); + if (manager->events == NULL) + return (ISC_R_NOMEMORY); + /* + * Note: fdpollinfo should be able to support all possible FDs, so + * it must have maxsocks entries (not nevents). + */ + manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) * + manager->maxsocks); + if (manager->fdpollinfo == NULL) { + isc_mem_put(mctx, manager->events, + sizeof(pollinfo_t) * manager->maxsocks); + return (ISC_R_NOMEMORY); + } + memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks); + manager->devpoll_fd = open("/dev/poll", O_RDWR); + if (manager->devpoll_fd == -1) { + result = isc__errno2result(errno); + isc_mem_put(mctx, manager->events, + sizeof(struct pollfd) * manager->nevents); + isc_mem_put(mctx, manager->fdpollinfo, + sizeof(pollinfo_t) * manager->maxsocks); + return (result); + } +#ifdef ISC_PLATFORM_USETHREADS + result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); + if (result != ISC_R_SUCCESS) { + close(manager->devpoll_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct pollfd) * manager->nevents); + isc_mem_put(mctx, manager->fdpollinfo, + sizeof(pollinfo_t) * manager->maxsocks); + return (result); + } +#endif /* ISC_PLATFORM_USETHREADS */ +#elif defined(USE_SELECT) + UNUSED(result); + +#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE + /* + * Note: this code should also cover the case of MAXSOCKETS <= + * FD_SETSIZE, but we separate the cases to avoid possible portability + * issues regarding howmany() and the actual representation of fd_set. + */ + manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) * sizeof(fd_mask); #else - manager->fdsize = FD_SETSIZE; manager->fd_bufsize = sizeof(fd_set); #endif - manager->fds = NULL; - manager->fdstate = NULL; manager->read_fds = NULL; manager->read_fds_copy = NULL; manager->write_fds = NULL; manager->write_fds_copy = NULL; - manager->fds = isc_mem_get(mctx, - manager->fdsize * sizeof(manager->fds[0])); - if (manager->fds == NULL) - goto fail; - - manager->fdstate = isc_mem_get(mctx, manager->fdsize * - sizeof(manager->fdstate[0])); - if (manager->fdstate == NULL) - goto fail; - manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize); - if (manager->read_fds == NULL) - goto fail; - manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize); - if (manager->read_fds_copy == NULL) - goto fail; - manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize); - if (manager->write_fds == NULL) - goto fail; - manager->write_fds_copy = isc_mem_get(mctx, manager->fd_bufsize); - if (manager->write_fds_copy == NULL) - goto fail; + if (manager->read_fds != NULL) + manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize); + if (manager->read_fds_copy != NULL) + manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize); + if (manager->write_fds != NULL) { + manager->write_fds_copy = isc_mem_get(mctx, + manager->fd_bufsize); + } + if (manager->write_fds_copy == NULL) { + if (manager->write_fds != NULL) { + isc_mem_put(mctx, manager->write_fds, + manager->fd_bufsize); + } + if (manager->read_fds_copy != NULL) { + isc_mem_put(mctx, manager->read_fds_copy, + manager->fd_bufsize); + } + if (manager->read_fds != NULL) { + isc_mem_put(mctx, manager->read_fds, + manager->fd_bufsize); + } + return (ISC_R_NOMEMORY); + } + memset(manager->read_fds, 0, manager->fd_bufsize); + memset(manager->write_fds, 0, manager->fd_bufsize); - return (ISC_R_SUCCESS); +#ifdef ISC_PLATFORM_USETHREADS + (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); + manager->maxfd = manager->pipe_fds[0]; +#else /* ISC_PLATFORM_USETHREADS */ + manager->maxfd = 0; +#endif /* ISC_PLATFORM_USETHREADS */ +#endif /* USE_KQUEUE */ - fail: - cleanup_fdsets(manager, mctx); - return (ISC_R_NOMEMORY); + return (ISC_R_SUCCESS); } -/* - * Clean up fdsets in socketmgr structure. - */ static void -cleanup_fdsets(isc_socketmgr_t *manager, isc_mem_t *mctx) { - if (manager->fds != NULL) { - isc_mem_put(mctx, manager->fds, - manager->fdsize * sizeof(manager->fds[0])); - } - if (manager->fdstate != NULL) { - isc_mem_put(mctx, manager->fdstate, - manager->fdsize * sizeof(manager->fdstate[0])); +cleanup_watcher(isc_mem_t *mctx, isc_socketmgr_t *manager) { +#ifdef ISC_PLATFORM_USETHREADS + isc_result_t result; + + result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ); + if (result != ISC_R_SUCCESS) { + UNEXPECTED_ERROR(__FILE__, __LINE__, + "epoll_ctl(DEL) %s", + isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, "failed")); } +#endif /* ISC_PLATFORM_USETHREADS */ + +#ifdef USE_KQUEUE + close(manager->kqueue_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct kevent) * manager->nevents); +#elif defined(USE_EPOLL) + close(manager->epoll_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct epoll_event) * manager->nevents); +#elif defined(USE_DEVPOLL) + close(manager->devpoll_fd); + isc_mem_put(mctx, manager->events, + sizeof(struct pollfd) * manager->nevents); + isc_mem_put(mctx, manager->fdpollinfo, + sizeof(pollinfo_t) * manager->maxsocks); +#elif defined(USE_SELECT) if (manager->read_fds != NULL) isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize); if (manager->read_fds_copy != NULL) @@ -2540,13 +3391,19 @@ cleanup_fdsets(isc_socketmgr_t *manager, isc_mem_t *mctx) { isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize); if (manager->write_fds_copy != NULL) isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize); +#endif /* USE_KQUEUE */ } -/* - * Create a new socket manager. - */ isc_result_t isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { + return (isc_socketmgr_create2(mctx, managerp, 0)); +} + +isc_result_t +isc_socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp, + unsigned int maxsocks) +{ + int i; isc_socketmgr_t *manager; #ifdef ISC_PLATFORM_USETHREADS char strbuf[ISC_STRERRORSIZE]; @@ -2557,43 +3414,71 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { #ifndef ISC_PLATFORM_USETHREADS if (socketmgr != NULL) { + /* Don't allow maxsocks to be updated */ + if (maxsocks > 0 && socketmgr->maxsocks != maxsocks) + return (ISC_R_EXISTS); + socketmgr->refs++; *managerp = socketmgr; return (ISC_R_SUCCESS); } #endif /* ISC_PLATFORM_USETHREADS */ + if (maxsocks == 0) + maxsocks = ISC_SOCKET_MAXSOCKETS; + manager = isc_mem_get(mctx, sizeof(*manager)); if (manager == NULL) return (ISC_R_NOMEMORY); - result = create_fdsets(manager, mctx); - if (result != ISC_R_SUCCESS) { - cleanup_fdsets(manager, mctx); - isc_mem_put(mctx, manager, sizeof(*manager)); - return (result); + /* zero-clear so that necessary cleanup on failure will be easy */ + memset(manager, 0, sizeof(*manager)); + manager->maxsocks = maxsocks; + manager->reserved = 0; + manager->fds = isc_mem_get(mctx, + manager->maxsocks * sizeof(isc_socket_t *)); + if (manager->fds == NULL) { + result = ISC_R_NOMEMORY; + goto free_manager; + } + manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int)); + if (manager->fds == NULL) { + result = ISC_R_NOMEMORY; + goto free_manager; } manager->magic = SOCKET_MANAGER_MAGIC; manager->mctx = NULL; - memset(manager->fds, 0, sizeof(manager->fds[0]) * manager->fdsize); + memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *)); ISC_LIST_INIT(manager->socklist); result = isc_mutex_init(&manager->lock); - if (result != ISC_R_SUCCESS) { - cleanup_fdsets(manager, mctx); - isc_mem_put(mctx, manager, sizeof(*manager)); - return (result); + if (result != ISC_R_SUCCESS) + goto free_manager; + manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t)); + if (manager->fdlock == NULL) { + result = ISC_R_NOMEMORY; + goto cleanup_lock; } + for (i = 0; i < FDLOCK_COUNT; i++) { + result = isc_mutex_init(&manager->fdlock[i]); + if (result != ISC_R_SUCCESS) { + while (--i >= 0) + DESTROYLOCK(&manager->fdlock[i]); + isc_mem_put(mctx, manager->fdlock, + FDLOCK_COUNT * sizeof(isc_mutex_t)); + manager->fdlock = NULL; + goto cleanup_lock; + } + } + #ifdef ISC_PLATFORM_USETHREADS if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) { - cleanup_fdsets(manager, mctx); - DESTROYLOCK(&manager->lock); - isc_mem_put(mctx, manager, sizeof(*manager)); UNEXPECTED_ERROR(__FILE__, __LINE__, "isc_condition_init() %s", isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_FAILED, "failed")); - return (ISC_R_UNEXPECTED); + result = ISC_R_UNEXPECTED; + goto cleanup_lock; } /* @@ -2601,17 +3486,14 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { * select/poll loop when something internal needs to be done. */ if (pipe(manager->pipe_fds) != 0) { - cleanup_fdsets(manager, mctx); - DESTROYLOCK(&manager->lock); - isc_mem_put(mctx, manager, sizeof(*manager)); isc__strerror(errno, strbuf, sizeof(strbuf)); UNEXPECTED_ERROR(__FILE__, __LINE__, "pipe() %s: %s", isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_FAILED, "failed"), strbuf); - - return (ISC_R_UNEXPECTED); + result = ISC_R_UNEXPECTED; + goto cleanup_condition; } RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS); @@ -2625,33 +3507,23 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { /* * Set up initial state for the select loop */ - memset(manager->read_fds, 0, manager->fd_bufsize); - memset(manager->write_fds, 0, manager->fd_bufsize); -#ifdef ISC_PLATFORM_USETHREADS - FD_SET(manager->pipe_fds[0], manager->read_fds); - manager->maxfd = manager->pipe_fds[0]; -#else /* ISC_PLATFORM_USETHREADS */ - manager->maxfd = 0; -#endif /* ISC_PLATFORM_USETHREADS */ - manager->reserved = 0; - memset(manager->fdstate, 0, - manager->fdsize * sizeof(manager->fdstate[0])); - + result = setup_watcher(mctx, manager); + if (result != ISC_R_SUCCESS) + goto cleanup; + memset(manager->fdstate, 0, manager->maxsocks * sizeof(int)); #ifdef ISC_PLATFORM_USETHREADS /* * Start up the select/poll thread. */ if (isc_thread_create(watcher, manager, &manager->watcher) != ISC_R_SUCCESS) { - (void)close(manager->pipe_fds[0]); - (void)close(manager->pipe_fds[1]); - DESTROYLOCK(&manager->lock); - isc_mem_put(mctx, manager, sizeof(*manager)); UNEXPECTED_ERROR(__FILE__, __LINE__, "isc_thread_create() %s", isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_FAILED, "failed")); - return (ISC_R_UNEXPECTED); + cleanup_watcher(mctx, manager); + result = ISC_R_UNEXPECTED; + goto cleanup; } #endif /* ISC_PLATFORM_USETHREADS */ isc_mem_attach(mctx, &manager->mctx); @@ -2662,6 +3534,52 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { *managerp = manager; return (ISC_R_SUCCESS); + +cleanup: +#ifdef ISC_PLATFORM_USETHREADS + (void)close(manager->pipe_fds[0]); + (void)close(manager->pipe_fds[1]); +#endif /* ISC_PLATFORM_USETHREADS */ + +#ifdef ISC_PLATFORM_USETHREADS +cleanup_condition: + (void)isc_condition_destroy(&manager->shutdown_ok); +#endif /* ISC_PLATFORM_USETHREADS */ + + +cleanup_lock: + if (manager->fdlock != NULL) { + for (i = 0; i < FDLOCK_COUNT; i++) + DESTROYLOCK(&manager->fdlock[i]); + } + DESTROYLOCK(&manager->lock); + +free_manager: + if (manager->fdlock != NULL) { + isc_mem_put(mctx, manager->fdlock, + FDLOCK_COUNT * sizeof(isc_mutex_t)); + } + if (manager->fdstate != NULL) { + isc_mem_put(mctx, manager->fdstate, + manager->maxsocks * sizeof(int)); + } + if (manager->fds != NULL) { + isc_mem_put(mctx, manager->fds, + manager->maxsocks * sizeof(isc_socket_t *)); + } + isc_mem_put(mctx, manager, sizeof(*manager)); + + return (result); +} + +isc_result_t +isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager, unsigned int *nsockp) { + REQUIRE(VALID_MANAGER(manager)); + REQUIRE(nsockp != NULL); + + *nsockp = manager->maxsocks; + + return (ISC_R_SUCCESS); } void @@ -2735,18 +3653,30 @@ isc_socketmgr_destroy(isc_socketmgr_t **managerp) { /* * Clean up. */ + cleanup_watcher(manager->mctx, manager); + #ifdef ISC_PLATFORM_USETHREADS (void)close(manager->pipe_fds[0]); (void)close(manager->pipe_fds[1]); (void)isc_condition_destroy(&manager->shutdown_ok); #endif /* ISC_PLATFORM_USETHREADS */ - for (i = 0; i < (int)manager->fdsize; i++) - if (manager->fdstate[i] == CLOSE_PENDING) + for (i = 0; i < (int)manager->maxsocks; i++) + if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */ (void)close(i); + isc_mem_put(manager->mctx, manager->fds, + manager->maxsocks * sizeof(isc_socket_t *)); + isc_mem_put(manager->mctx, manager->fdstate, + manager->maxsocks * sizeof(int)); + + if (manager->fdlock != NULL) { + for (i = 0; i < FDLOCK_COUNT; i++) + DESTROYLOCK(&manager->fdlock[i]); + isc_mem_put(manager->mctx, manager->fdlock, + FDLOCK_COUNT * sizeof(isc_mutex_t)); + } DESTROYLOCK(&manager->lock); - cleanup_fdsets(manager, manager->mctx); manager->magic = 0; mctx= manager->mctx; isc_mem_put(mctx, manager, sizeof(*manager)); @@ -2799,7 +3729,7 @@ socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, * Enqueue the request. If the socket was previously not being * watched, poke the watcher to start paying attention to it. */ - if (ISC_LIST_EMPTY(sock->recv_list)) + if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv) select_poke(sock->manager, sock->fd, SELECT_POKE_READ); ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link); @@ -2996,7 +3926,8 @@ socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, * not being watched, poke the watcher to start * paying attention to it. */ - if (ISC_LIST_EMPTY(sock->send_list)) + if (ISC_LIST_EMPTY(sock->send_list) && + !sock->pending_send) select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE); ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link); @@ -3286,7 +4217,7 @@ isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm, } else strcpy(path, "."); #endif - + if (chmod(path, perm) < 0) { isc__strerror(errno, strbuf, sizeof(strbuf)); isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, @@ -3315,7 +4246,7 @@ isc_socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm, } isc_result_t -isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr, +isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr, unsigned int options) { char strbuf[ISC_STRERRORSIZE]; int on = 1; @@ -3446,7 +4377,7 @@ isc_socket_listen(isc_socket_t *sock, unsigned int backlog) { } /* - * This should try to do agressive accept() XXXMLG + * This should try to do aggressive accept() XXXMLG */ isc_result_t isc_socket_accept(isc_socket_t *sock, @@ -3557,6 +4488,16 @@ isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr, sock->address = *addr; cc = connect(sock->fd, &addr->type.sa, addr->length); if (cc < 0) { + /* + * HP-UX "fails" to connect a UDP socket and sets errno to + * EINPROGRESS if it's non-blocking. We'd rather regard this as + * a success and let the user detect it if it's really an error + * at the time of sending a packet on the socket. + */ + if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) { + cc = 0; + goto success; + } if (SOFT_ERROR(errno) || errno == EINPROGRESS) goto queue; @@ -3598,6 +4539,7 @@ isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr, /* * If connect completed, fire off the done event. */ + success: if (cc == 0) { sock->connected = 1; sock->bound = 1; @@ -3957,37 +4899,107 @@ isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) { #ifdef IPV6_V6ONLY if (sock->pf == AF_INET6) { - (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY, - (void *)&onoff, sizeof(onoff)); + if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY, + (void *)&onoff, sizeof(int)) < 0) { + char strbuf[ISC_STRERRORSIZE]; + + UNEXPECTED_ERROR(__FILE__, __LINE__, + "setsockopt(%d, IPV6_V6ONLY) " + "%s: %s", sock->fd, + isc_msgcat_get(isc_msgcat, + ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, + "failed"), + strbuf); + } } + FIX_IPV6_RECVPKTINFO(sock); /* AIX */ #endif } #ifndef ISC_PLATFORM_USETHREADS -void -isc__socketmgr_getfdsets(fd_set **readset, fd_set **writeset, int *maxfd) { +/* In our assumed scenario, we can simply use a single static object. */ +static isc_socketwait_t swait_private; + +int +isc__socketmgr_waitevents(struct timeval *tvp, isc_socketwait_t **swaitp) { + int n; +#ifdef USE_KQUEUE + struct timespec ts, *tsp; +#endif +#ifdef USE_EPOLL + int timeout; +#endif +#ifdef USE_DEVPOLL + struct dvpoll dvp; +#endif + + REQUIRE(swaitp != NULL && *swaitp == NULL); + if (socketmgr == NULL) - *maxfd = 0; - else { - /* Prepare duplicates of fd_sets, as select() will modify */ - memcpy(socketmgr->read_fds_copy, socketmgr->read_fds, - socketmgr->fd_bufsize); - memcpy(socketmgr->write_fds_copy, socketmgr->write_fds, - socketmgr->fd_bufsize); - *readset = socketmgr->read_fds_copy; - *writeset = socketmgr->write_fds_copy; - *maxfd = socketmgr->maxfd + 1; - } + return (0); + +#ifdef USE_KQUEUE + if (tvp != NULL) { + ts.tv_sec = tvp->tv_sec; + ts.tv_nsec = tvp->tv_usec * 1000; + tsp = &ts; + } else + tsp = NULL; + swait_private.nevents = kevent(socketmgr->kqueue_fd, NULL, 0, + socketmgr->events, socketmgr->nevents, + tsp); + n = swait_private.nevents; +#elif defined(USE_EPOLL) + if (tvp != NULL) + timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000; + else + timeout = -1; + swait_private.nevents = epoll_wait(socketmgr->epoll_fd, + socketmgr->events, + socketmgr->nevents, timeout); + n = swait_private.nevents; +#elif defined(USE_DEVPOLL) + dvp.dp_fds = socketmgr->events; + dvp.dp_nfds = socketmgr->nevents; + if (tvp != NULL) { + dvp.dp_timeout = tvp->tv_sec * 1000 + + (tvp->tv_usec + 999) / 1000; + } else + dvp.dp_timeout = -1; + swait_private.nevents = ioctl(socketmgr->devpoll_fd, DP_POLL, &dvp); + n = swait_private.nevents; +#elif defined(USE_SELECT) + memcpy(socketmgr->read_fds_copy, socketmgr->read_fds, + socketmgr->fd_bufsize); + memcpy(socketmgr->write_fds_copy, socketmgr->write_fds, + socketmgr->fd_bufsize); + + swait_private.readset = socketmgr->read_fds_copy; + swait_private.writeset = socketmgr->write_fds_copy; + swait_private.maxfd = socketmgr->maxfd + 1; + + n = select(swait_private.maxfd, swait_private.readset, + swait_private.writeset, NULL, tvp); +#endif + + *swaitp = &swait_private; + return (n); } isc_result_t -isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd) { - isc_socketmgr_t *manager = socketmgr; +isc__socketmgr_dispatch(isc_socketwait_t *swait) { + REQUIRE(swait == &swait_private); - if (manager == NULL) + if (socketmgr == NULL) return (ISC_R_NOTFOUND); - process_fds(manager, maxfd, readset, writeset); +#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) + (void)process_fds(socketmgr, socketmgr->events, swait->nevents); return (ISC_R_SUCCESS); +#elif defined(USE_SELECT) + process_fds(socketmgr, swait->maxfd, swait->readset, swait->writeset); + return (ISC_R_SUCCESS); +#endif } #endif /* ISC_PLATFORM_USETHREADS */ diff --git a/contrib/bind9/lib/isc/unix/socket_p.h b/contrib/bind9/lib/isc/unix/socket_p.h index 4f9cf27..b7da860 100644 --- a/contrib/bind9/lib/isc/unix/socket_p.h +++ b/contrib/bind9/lib/isc/unix/socket_p.h @@ -1,8 +1,8 @@ /* - * Copyright (C) 2004, 2005 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004, 2005, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 2000, 2001 Internet Software Consortium. * - * Permission to use, copy, modify, and distribute this software for any + * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: socket_p.h,v 1.7.18.2.52.1 2008/07/29 04:47:31 each Exp $ */ +/* $Id: socket_p.h,v 1.7.18.4 2008/06/24 23:45:55 tbox Exp $ */ #ifndef ISC_SOCKET_P_H #define ISC_SOCKET_P_H @@ -26,10 +26,7 @@ #include <sys/select.h> #endif -void -isc__socketmgr_getfdsets(fd_set **readset, fd_set **writeset, int *maxfd); - -isc_result_t -isc__socketmgr_dispatch(fd_set *readset, fd_set *writeset, int maxfd); - +typedef struct isc_socketwait isc_socketwait_t; +int isc__socketmgr_waitevents(struct timeval *, isc_socketwait_t **); +isc_result_t isc__socketmgr_dispatch(isc_socketwait_t *); #endif /* ISC_SOCKET_P_H */ diff --git a/contrib/bind9/lib/isc/unix/time.c b/contrib/bind9/lib/isc/unix/time.c index bac24d7..facc12b 100644 --- a/contrib/bind9/lib/isc/unix/time.c +++ b/contrib/bind9/lib/isc/unix/time.c @@ -1,8 +1,8 @@ /* - * Copyright (C) 2004, 2005 Internet Systems Consortium, Inc. ("ISC") + * Copyright (C) 2004, 2005, 2008 Internet Systems Consortium, Inc. ("ISC") * Copyright (C) 1998-2001, 2003 Internet Software Consortium. * - * Permission to use, copy, modify, and distribute this software for any + * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: time.c,v 1.47.18.2 2005/04/29 00:17:09 marka Exp $ */ +/* $Id: time.c,v 1.47.18.4 2008/02/18 23:46:01 tbox Exp $ */ /*! \file */ @@ -227,7 +227,7 @@ isc_time_nowplusinterval(isc_time_t *t, const isc_interval_t *i) { t->seconds = tv.tv_sec + i->seconds; t->nanoseconds = tv.tv_usec * NS_PER_US + i->nanoseconds; - if (t->nanoseconds > NS_PER_S) { + if (t->nanoseconds >= NS_PER_S) { t->seconds++; t->nanoseconds -= NS_PER_S; } @@ -410,5 +410,5 @@ isc_time_formattimestamp(const isc_time_t *t, char *buf, unsigned int len) { snprintf(buf + flen, len - flen, ".%03u", t->nanoseconds / 1000000); else - snprintf(buf, len, "99-Bad-9999 99:99:99.999"); + snprintf(buf, len, "99-Bad-9999 99:99:99.999"); } |