summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjoerg <joerg@FreeBSD.org>2000-02-28 17:49:43 +0000
committerjoerg <joerg@FreeBSD.org>2000-02-28 17:49:43 +0000
commite52757e9dd40fad69d76303ca929f34458ad55ec (patch)
tree37b8e5edab33453a24f4992154f2c2e2087c411a
parent543d5b04852f50d25baedfca23bc50db5252bd1f (diff)
downloadFreeBSD-src-e52757e9dd40fad69d76303ca929f34458ad55ec.zip
FreeBSD-src-e52757e9dd40fad69d76303ca929f34458ad55ec.tar.gz
Fix a serious bug in syslogd regarding the handling of pipes. The bug
would cause syslogd to eventually kill innocent processes in the system over time (note: not `could' but `would'). Many thanks to my colleague Mirko for digging into the kernel structures and providing me with the debugging framework to find out about the nature of this bug (and to isolate that syslogd was the culprit) in a rather large set of distributed machines at client sites where this happened occasionally. Whenever a child process was no longer responsive, or when syslogd receives a SIGHUP so it closes all its logging file descriptors, for any descriptor that refers to a pipe syslogd enters the data about the old logging child process into a `dead queue', where it is being removed from (and the status of the dead kitten being fetched) upon receipt of a SIGCHLD. However, there's a high probability that the SIGCHLD already arrives before the child's data are actually entered into the dead queue inside the SIGHUP handler, so the SIGCHLD handler has nothing to fetch and remove and simply continues. Whenever this happens, the process'es data remain on the dead queue forever, and since domark() tried to get rid of totally unresponsive children by first sending a SIGTERM and later a SIGKILL, it was only a matter of time until the system had recycled enough PIDs so an innocent process got shot to death. Fix the race by masking SIGHUP and SIGCHLD from both handlers mutually. Add additional bandaids ``just in case'', i. e. don't enter a process into the dead queue if we can't signal it (this should only happen in case it is already dead by that time so we can fetch the status immediately instead of deferring this to the SIGCHLD handler); for the kill(2) inside domark(), check for an error status (/* Can't happen */ :) and remove it from the dead queue in this case (which if it would have been there in the first place would have reduced the problem to a statistically minimal likelihood so i certainly would never have noticed the bug at all :). Mirko also reviewed the fix in priciple (mutual blocking of both signals inside the handlers), but not the actual code. Reviewed by: Mirko Kaffka <mirko@interface-business.de> Approved by: jkh
-rw-r--r--usr.sbin/syslogd/syslogd.c131
1 files changed, 96 insertions, 35 deletions
diff --git a/usr.sbin/syslogd/syslogd.c b/usr.sbin/syslogd/syslogd.c
index 008228b..1ef6f29 100644
--- a/usr.sbin/syslogd/syslogd.c
+++ b/usr.sbin/syslogd/syslogd.c
@@ -271,7 +271,8 @@ int LogFacPri = 0; /* Put facility and priority in log message: */
int allowaddr __P((char *));
void cfline __P((char *, struct filed *, char *));
char *cvthname __P((struct sockaddr_in *));
-void deadq_enter __P((pid_t));
+void deadq_enter __P((pid_t, const char *));
+int deadq_remove __P((pid_t));
int decode __P((const char *, CODE *));
void die __P((int));
void domark __P((int));
@@ -279,6 +280,7 @@ void fprintlog __P((struct filed *, int, char *));
void init __P((int));
void logerror __P((const char *));
void logmsg __P((int, char *, char *, int));
+void log_deadchild __P((pid_t, int, const char *));
void printline __P((char *, char *));
void printsys __P((char *));
int p_open __P((char *, pid_t *));
@@ -302,6 +304,8 @@ main(argc, argv)
FILE *fp;
char *p, *hname, line[MAXLINE + 1];
struct timeval tv, *tvp;
+ struct sigaction sact;
+ sigset_t mask;
pid_t ppid = 1;
socklen_t len;
@@ -368,7 +372,18 @@ main(argc, argv)
(void)signal(SIGTERM, die);
(void)signal(SIGINT, Debug ? die : SIG_IGN);
(void)signal(SIGQUIT, Debug ? die : SIG_IGN);
- (void)signal(SIGCHLD, reapchild);
+ /*
+ * We don't want the SIGCHLD and SIGHUP handlers to interfere
+ * with each other; they are likely candidates for being called
+ * simultaneously (SIGHUP closes pipe descriptor, process dies,
+ * SIGCHLD happens).
+ */
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGHUP);
+ sact.sa_handler = reapchild;
+ sact.sa_mask = mask;
+ sact.sa_flags = 0;
+ (void)sigaction(SIGCHLD, &sact, NULL);
(void)signal(SIGALRM, domark);
(void)signal(SIGPIPE, SIG_IGN); /* We'll catch EPIPE instead. */
(void)alarm(TIMERINTVL);
@@ -440,7 +455,13 @@ main(argc, argv)
dprintf("off & running....\n");
init(0);
- (void)signal(SIGHUP, init);
+ /* prevent SIGHUP and SIGCHLD handlers from running in parallel */
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGCHLD);
+ sact.sa_handler = init;
+ sact.sa_mask = mask;
+ sact.sa_flags = 0;
+ (void)sigaction(SIGHUP, &sact, NULL);
tvp = &tv;
tv.tv_sec = tv.tv_usec = 0;
@@ -938,7 +959,8 @@ fprintlog(f, flags, msg)
int e = errno;
(void)close(f->f_file);
if (f->f_un.f_pipe.f_pid > 0)
- deadq_enter(f->f_un.f_pipe.f_pid);
+ deadq_enter(f->f_un.f_pipe.f_pid,
+ f->f_un.f_pipe.f_pname);
f->f_un.f_pipe.f_pid = 0;
errno = e;
logerror(f->f_un.f_pipe.f_pname);
@@ -1036,12 +1058,9 @@ void
reapchild(signo)
int signo;
{
- int status, code;
+ int status;
pid_t pid;
struct filed *f;
- char buf[256];
- const char *reason;
- dq_t q;
while ((pid = wait3(&status, WNOHANG, (struct rusage *)NULL)) > 0) {
if (!Initialized)
@@ -1049,35 +1068,17 @@ reapchild(signo)
continue;
/* First, look if it's a process from the dead queue. */
- for (q = TAILQ_FIRST(&deadq_head); q != NULL; q = TAILQ_NEXT(q, dq_entries))
- if (q->dq_pid == pid) {
- TAILQ_REMOVE(&deadq_head, q, dq_entries);
- free(q);
- goto oncemore;
- }
+ if (deadq_remove(pid))
+ goto oncemore;
/* Now, look in list of active processes. */
for (f = Files; f; f = f->f_next)
if (f->f_type == F_PIPE &&
f->f_un.f_pipe.f_pid == pid) {
(void)close(f->f_file);
-
- errno = 0; /* Keep strerror() stuff out of logerror messages. */
f->f_un.f_pipe.f_pid = 0;
- if (WIFSIGNALED(status)) {
- reason = "due to signal";
- code = WTERMSIG(status);
- } else {
- reason = "with status";
- code = WEXITSTATUS(status);
- if (code == 0)
- goto oncemore; /* Exited OK. */
- }
- (void)snprintf(buf, sizeof buf,
- "Logging subprocess %d (%s) exited %s %d.",
- pid, f->f_un.f_pipe.f_pname,
- reason, code);
- logerror(buf);
+ log_deadchild(pid, status,
+ f->f_un.f_pipe.f_pname);
break;
}
oncemore:
@@ -1148,7 +1149,8 @@ domark(signo)
switch (q->dq_timeout) {
case 0:
/* Already signalled once, try harder now. */
- kill(q->dq_pid, SIGKILL);
+ if (kill(q->dq_pid, SIGKILL) != 0)
+ (void)deadq_remove(q->dq_pid);
break;
case 1:
@@ -1156,9 +1158,12 @@ domark(signo)
* Timed out on dead queue, send terminate
* signal. Note that we leave the removal
* from the dead queue to reapchild(), which
- * will also log the event.
+ * will also log the event (unless the process
+ * didn't even really exist, in case we simply
+ * drop it from the dead queue).
*/
- kill(q->dq_pid, SIGTERM);
+ if (kill(q->dq_pid, SIGTERM) != 0)
+ (void)deadq_remove(q->dq_pid);
/* FALLTHROUGH */
default:
@@ -1253,7 +1258,8 @@ init(signo)
case F_PIPE:
(void)close(f->f_file);
if (f->f_un.f_pipe.f_pid > 0)
- deadq_enter(f->f_un.f_pipe.f_pid);
+ deadq_enter(f->f_un.f_pipe.f_pid,
+ f->f_un.f_pipe.f_pname);
f->f_un.f_pipe.f_pid = 0;
break;
}
@@ -1912,10 +1918,23 @@ p_open(prog, pid)
}
void
-deadq_enter(pid)
+deadq_enter(pid, name)
pid_t pid;
+ const char *name;
{
dq_t p;
+ int status;
+
+ /*
+ * Be paranoid, if we can't signal the process, don't enter it
+ * into the dead queue (perhaps it's already dead). If possible,
+ * we try to fetch and log the child's status.
+ */
+ if (kill(pid, 0) != 0) {
+ if (waitpid(pid, &status, WNOHANG) > 0)
+ log_deadchild(pid, status, name);
+ return;
+ }
p = malloc(sizeof(struct deadq_entry));
if (p == 0) {
@@ -1928,3 +1947,45 @@ deadq_enter(pid)
p->dq_timeout = DQ_TIMO_INIT;
TAILQ_INSERT_TAIL(&deadq_head, p, dq_entries);
}
+
+int
+deadq_remove(pid)
+ pid_t pid;
+{
+ dq_t q;
+
+ for (q = TAILQ_FIRST(&deadq_head); q != NULL; q = TAILQ_NEXT(q, dq_entries))
+ if (q->dq_pid == pid) {
+ TAILQ_REMOVE(&deadq_head, q, dq_entries);
+ free(q);
+ return 1;
+ }
+
+ return 0;
+}
+
+void
+log_deadchild(pid, status, name)
+ pid_t pid;
+ int status;
+ const char *name;
+{
+ int code;
+ char buf[256];
+ const char *reason;
+
+ errno = 0; /* Keep strerror() stuff out of logerror messages. */
+ if (WIFSIGNALED(status)) {
+ reason = "due to signal";
+ code = WTERMSIG(status);
+ } else {
+ reason = "with status";
+ code = WEXITSTATUS(status);
+ if (code == 0)
+ return;
+ }
+ (void)snprintf(buf, sizeof buf,
+ "Logging subprocess %d (%s) exited %s %d.",
+ pid, name, reason, code);
+ logerror(buf);
+}
OpenPOWER on IntegriCloud