From 7e7f8a036b8e2b2a300df016da5e7128c8a9192e Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Tue, 31 Jan 2006 16:56:28 -0500 Subject: [PATCH] make vm86 call audit_syscall_exit hi, The motivation behind the patch below was to address messages in /var/log/messages such as: Jan 31 10:54:15 mets kernel: audit(:0): major=252 name_count=0: freeing multiple contexts (1) Jan 31 10:54:15 mets kernel: audit(:0): major=113 name_count=0: freeing multiple contexts (2) I can reproduce by running 'get-edid' from: http://john.fremlin.de/programs/linux/read-edid/. These messages come about in the log because the vm86 calls do not exit via the normal system call exit paths and thus do not call 'audit_syscall_exit'. The next system call will then free the context for itself and for the vm86 context, thus generating the above messages. This patch addresses the issue by simply adding a call to 'audit_syscall_exit' from the vm86 code. Besides fixing the above error messages, the patch also now allows vm86 system calls to become auditable. This is useful since strace does not appear to properly record the return values from sys_vm86. I think this patch is also a step in the right direction in terms of cleaning up some core auditing code. If we can correct any other paths that do not properly call the audit exit and entry points, then we can also eliminate the notion of context chaining. I've tested this patch by verifying that the log messages no longer appear, and that the audit records for sys_vm86 appear to be correct. Also, 'read_edid' produces identical output. thanks, -Jason Signed-off-by: Jason Baron Signed-off-by: Al Viro --- kernel/auditsc.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d7e7e63..cfaa4a2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -966,11 +966,6 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, if (context->in_syscall) { struct audit_context *newctx; -#if defined(__NR_vm86) && defined(__NR_vm86old) - /* vm86 mode should only be entered once */ - if (major == __NR_vm86 || major == __NR_vm86old) - return; -#endif #if AUDIT_DEBUG printk(KERN_ERR "audit(:%d) pid=%d in syscall=%d;" -- cgit v1.1 From b0dd25a8263dde3c30b0d7d72a8bd92d7ba0e3f5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 13 Sep 2005 12:47:11 -0700 Subject: [PATCH] AUDIT: kerneldoc for kernel/audit*.c - add kerneldoc for non-static functions; - don't init static data to 0; - limit lines to < 80 columns; - fix long-format style; - delete whitespace at end of some lines; (chrisw: resend and update to current audit-2.6 tree) Signed-off-by: Randy Dunlap Signed-off-by: Chris Wright Signed-off-by: David Woodhouse --- kernel/audit.c | 134 ++++++++++++++++++++++++++++++++++++++----------- kernel/auditsc.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 238 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 0a813d2..973ca5a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -72,7 +72,7 @@ static int audit_failure = AUDIT_FAIL_PRINTK; * contains the (non-zero) pid. */ int audit_pid; -/* If audit_limit is non-zero, limit the rate of sending audit records +/* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in * audit records being dropped.
*/ static int audit_rate_limit; @@ -102,7 +102,7 @@ static struct sock *audit_sock; * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of * being placed on the freelist). */ static DEFINE_SPINLOCK(audit_freelist_lock); -static int audit_freelist_count = 0; +static int audit_freelist_count; static LIST_HEAD(audit_freelist); static struct sk_buff_head audit_skb_queue; @@ -186,8 +186,14 @@ static inline int audit_rate_check(void) return retval; } -/* Emit at least 1 message per second, even if audit_rate_check is - * throttling. */ +/** + * audit_log_lost - conditionally log lost audit message event + * @message: the message stating reason for lost audit message + * + * Emit at least 1 message per second, even if audit_rate_check is + * throttling. + * Always increment the lost messages counter. +*/ void audit_log_lost(const char *message) { static unsigned long last_msg = 0; @@ -218,7 +224,6 @@ void audit_log_lost(const char *message) audit_backlog_limit); audit_panic(message); } - } static int audit_set_rate_limit(int limit, uid_t loginuid) @@ -302,6 +307,19 @@ static int kauditd_thread(void *dummy) } } +/** + * audit_send_reply - send an audit reply message via netlink + * @pid: process id to send reply to + * @seq: sequence number + * @type: audit message type + * @done: done (last) flag + * @multi: multi-part message flag + * @payload: payload data + * @size: payload size + * + * Allocates an skb, builds the netlink message, and sends it to the pid. + * No failure notifications. + */ void audit_send_reply(int pid, int seq, int type, int done, int multi, void *payload, int size) { @@ -376,7 +394,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) return err; - /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */ + /* As soon as there's any sign of userspace auditd, + * start kauditd to talk to it */ if (!kauditd_task) kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); if (IS_ERR(kauditd_task)) { @@ -469,9 +488,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err < 0 ? err : 0; } -/* Get message from skb (based on rtnetlink_rcv_skb). Each message is +/* + * Get message from skb (based on rtnetlink_rcv_skb). Each message is * processed by audit_receive_msg. Malformed skbs with wrong length are - * discarded silently. */ + * discarded silently. + */ static void audit_receive_skb(struct sk_buff *skb) { int err; @@ -600,7 +621,10 @@ err: return NULL; } -/* Compute a serial number for the audit record. Audit records are +/** + * audit_serial - compute a serial number for the audit record + * + * Compute a serial number for the audit record. Audit records are * written to user-space as soon as they are generated, so a complete * audit record may be written in several pieces. The timestamp of the * record and this serial number are used by the user-space tools to @@ -612,8 +636,8 @@ err: * audit context (for those records that have a context), and emit them * all at syscall exit. However, this could delay the reporting of * significant errors until syscall exit (or never, if the system - * halts). */ - + * halts). + */ unsigned int audit_serial(void) { static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; @@ -649,6 +673,21 @@ static inline void audit_get_stamp(struct audit_context *ctx, * will be written at syscall exit. If there is no associated task, tsk * should be NULL. 
*/ +/** + * audit_log_start - obtain an audit buffer + * @ctx: audit_context (may be NULL) + * @gfp_mask: type of allocation + * @type: audit message type + * + * Returns audit_buffer pointer on success or NULL on error. + * + * Obtain an audit buffer. This routine does locking to obtain the + * audit buffer, but then no locking is required for calls to + * audit_log_*format. If the task (ctx) is a task that is currently in a + * syscall, then the syscall is marked as auditable and an audit record + * will be written at syscall exit. If there is no associated task, then + * task context (ctx) should be NULL. + */ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { @@ -713,6 +752,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, /** * audit_expand - expand skb in the audit buffer * @ab: audit_buffer + * @extra: space to add at tail of the skb * * Returns 0 (no space) on failed expansion, or available space if * successful. @@ -729,10 +769,12 @@ static inline int audit_expand(struct audit_buffer *ab, int extra) return skb_tailroom(skb); } -/* Format an audit message into the audit buffer. If there isn't enough +/* + * Format an audit message into the audit buffer. If there isn't enough * room in the audit buffer, more room will be allocated and vsnprint * will be called a second time. Currently, we assume that a printk - * can't format message larger than 1024 bytes, so we don't either. */ + * can't format message larger than 1024 bytes, so we don't either. + */ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args) { @@ -757,7 +799,8 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, /* The printk buffer is 1024 bytes long, so if we get * here and AUDIT_BUFSIZ is at least 1024, then we can * log everything that printk could have logged. */ - avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); + avail = audit_expand(ab, + max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); if (!avail) goto out; len = vsnprintf(skb->tail, avail, fmt, args2); @@ -768,8 +811,14 @@ out: return; } -/* Format a message into the audit buffer. All the work is done in - * audit_log_vformat. */ +/** + * audit_log_format - format a message into the audit buffer. + * @ab: audit_buffer + * @fmt: format string + * @...: optional parameters matching @fmt string + * + * All the work is done in audit_log_vformat. + */ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) { va_list args; @@ -781,9 +830,18 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) va_end(args); } -/* This function will take the passed buf and convert it into a string of - * ascii hex digits. The new string is placed onto the skb. */ -void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, +/** + * audit_log_hex - convert a buffer to hex and append it to the audit skb + * @ab: the audit_buffer + * @buf: buffer to convert to hex + * @len: length of @buf to be converted + * + * No return value; failure to expand is silently ignored. + * + * This function will take the passed buf and convert it into a string of + * ascii hex digits. The new string is placed onto the skb. 
+ */ +void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { int i, avail, new_len; @@ -812,10 +870,16 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, skb_put(skb, len << 1); /* new string is twice the old string */ } -/* This code will escape a string that is passed to it if the string - * contains a control character, unprintable character, double quote mark, +/** + * audit_log_unstrustedstring - log a string that may contain random characters + * @ab: audit_buffer + * @string: string to be logged + * + * This code will escape a string that is passed to it if the string + * contains a control character, unprintable character, double quote mark, * or a space. Unescaped strings will start and end with a double quote mark. - * Strings that are escaped are printed in hex (2 digits per char). */ + * Strings that are escaped are printed in hex (2 digits per char). + */ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { const unsigned char *p = string; @@ -854,10 +918,15 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, kfree(path); } -/* The netlink_* functions cannot be called inside an irq context, so - * the audit buffer is places on a queue and a tasklet is scheduled to +/** + * audit_log_end - end one audit record + * @ab: the audit_buffer + * + * The netlink_* functions cannot be called inside an irq context, so + * the audit buffer is placed on a queue and a tasklet is scheduled to * remove them from the queue outside the irq context. May be called in - * any context. */ + * any context. + */ void audit_log_end(struct audit_buffer *ab) { if (!ab) @@ -878,9 +947,18 @@ void audit_log_end(struct audit_buffer *ab) audit_buffer_free(ab); } -/* Log an audit record. This is a convenience function that calls - * audit_log_start, audit_log_vformat, and audit_log_end. It may be - * called in any context. */ +/** + * audit_log - Log an audit record + * @ctx: audit context + * @gfp_mask: type of allocation + * @type: audit message type + * @fmt: format string to use + * @...: variable parameters matching the format string + * + * This is a convenience function that calls audit_log_start, + * audit_log_vformat, and audit_log_end. It may be called + * in any context. + */ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cfaa4a2..51a4f58a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -330,6 +330,15 @@ static int audit_list_rules(void *_dest) return 0; } +/** + * audit_receive_filter - apply all rules to the specified message type + * @type: audit message type + * @pid: target pid for netlink audit messages + * @uid: target uid for netlink audit messages + * @seq: netlink audit message sequence (serial) number + * @data: payload data + * @loginuid: loginuid of sender + */ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, uid_t loginuid) { @@ -527,7 +536,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) /* At syscall entry and exit time, this filter is called if the * audit_state is not low enough that auditing cannot take place, but is * also not high enough that we already know we have to write an audit - * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). + * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). 
*/ static enum audit_state audit_filter_syscall(struct task_struct *tsk, struct audit_context *ctx, @@ -721,10 +730,15 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) return context; } -/* Filter on the task information and allocate a per-task audit context +/** + * audit_alloc - allocate an audit context block for a task + * @tsk: task + * + * Filter on the task information and allocate a per-task audit context * if necessary. Doing so turns on system call auditing for the * specified task. This is called from copy_process, so no lock is - * needed. */ + * needed. + */ int audit_alloc(struct task_struct *tsk) { struct audit_context *context; @@ -911,8 +925,12 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) } } -/* Free a per-task audit context. Called from copy_process and - * __put_task_struct. */ +/** + * audit_free - free a per-task audit context + * @tsk: task whose audit context block to free + * + * Called from copy_process and __put_task_struct. + */ void audit_free(struct task_struct *tsk) { struct audit_context *context; @@ -934,13 +952,24 @@ void audit_free(struct task_struct *tsk) audit_free_context(context); } -/* Fill in audit context at syscall entry. This only happens if the +/** + * audit_syscall_entry - fill in an audit record at syscall entry + * @tsk: task being audited + * @arch: architecture type + * @major: major syscall type (function) + * @a1: additional syscall register 1 + * @a2: additional syscall register 2 + * @a3: additional syscall register 3 + * @a4: additional syscall register 4 + * + * Fill in audit context at syscall entry. This only happens if the * audit context was created when the task was created and the state or * filters demand the audit context be built. If the state from the * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT, * then the record will be written at syscall exit time (otherwise, it * will only be written if another part of the kernel requests that it - * be written). */ + * be written). + */ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) @@ -950,7 +979,8 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, BUG_ON(!context); - /* This happens only on certain architectures that make system + /* + * This happens only on certain architectures that make system * calls in kernel_thread via the entry.S interface, instead of * with direct calls. (If you are porting to a new * architecture, hitting this condition can indicate that you @@ -1009,11 +1039,18 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, context->auditable = !!(state == AUDIT_RECORD_CONTEXT); } -/* Tear down after system call. If the audit context has been marked as +/** + * audit_syscall_exit - deallocate audit context after a system call + * @tsk: task being audited + * @valid: success/failure flag + * @return_code: syscall return value + * + * Tear down after system call. If the audit context has been marked as * auditable (either because of the AUDIT_RECORD_CONTEXT state from * filtering, or because some other part of the kernel write an audit * message), then write out the syscall information. In call cases, - * free the names stored from getname(). */ + * free the names stored from getname(). 
+ */ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) { struct audit_context *context; @@ -1048,7 +1085,13 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) put_task_struct(tsk); } -/* Add a name to the list. Called from fs/namei.c:getname(). */ +/** + * audit_getname - add a name to the list + * @name: name to add + * + * Add a name to the list of audit names for this context. + * Called from fs/namei.c:getname(). + */ void audit_getname(const char *name) { struct audit_context *context = current->audit_context; @@ -1077,10 +1120,13 @@ void audit_getname(const char *name) } -/* Intercept a putname request. Called from - * include/linux/fs.h:putname(). If we have stored the name from - * getname in the audit context, then we delay the putname until syscall - * exit. */ +/* audit_putname - intercept a putname request + * @name: name to intercept and delay for putname + * + * If we have stored the name from getname in the audit context, + * then we delay the putname until syscall exit. + * Called from include/linux/fs.h:putname(). + */ void audit_putname(const char *name) { struct audit_context *context = current->audit_context; @@ -1117,8 +1163,14 @@ void audit_putname(const char *name) #endif } -/* Store the inode and device from a lookup. Called from - * fs/namei.c:path_lookup(). */ +/** + * audit_inode - store the inode and device from a lookup + * @name: name being audited + * @inode: inode being audited + * @flags: lookup flags (as used in path_lookup()) + * + * Called from fs/namei.c:path_lookup(). + */ void audit_inode(const char *name, const struct inode *inode, unsigned flags) { int idx; @@ -1154,6 +1206,14 @@ void audit_inode(const char *name, const struct inode *inode, unsigned flags) context->names[idx].rdev = inode->i_rdev; } +/** + * auditsc_get_stamp - get local copies of audit_context values + * @ctx: audit_context for the task + * @t: timespec to store time recorded in the audit_context + * @serial: serial value that is recorded in the audit_context + * + * Also sets the context as auditable. + */ void auditsc_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial) { @@ -1165,6 +1225,15 @@ void auditsc_get_stamp(struct audit_context *ctx, ctx->auditable = 1; } +/** + * audit_set_loginuid - set a task's audit_context loginuid + * @task: task whose audit context is being modified + * @loginuid: loginuid value + * + * Returns 0. + * + * Called (set) from fs/proc/base.c::proc_loginuid_write(). + */ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) { if (task->audit_context) { @@ -1183,11 +1252,26 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) return 0; } +/** + * audit_get_loginuid - get the loginuid for an audit_context + * @ctx: the audit_context + * + * Returns the context's loginuid or -1 if @ctx is NULL. + */ uid_t audit_get_loginuid(struct audit_context *ctx) { return ctx ? ctx->loginuid : -1; } +/** + * audit_ipc_perms - record audit data for ipc + * @qbytes: msgq bytes + * @uid: msgq user id + * @gid: msgq group id + * @mode: msgq mode (permissions) + * + * Returns 0 for success or NULL context or < 0 on error. 
+ */ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) { struct audit_aux_data_ipcctl *ax; @@ -1211,6 +1295,13 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) return 0; } +/** + * audit_socketcall - record audit data for sys_socketcall + * @nargs: number of args + * @args: args array + * + * Returns 0 for success or NULL context or < 0 on error. + */ int audit_socketcall(int nargs, unsigned long *args) { struct audit_aux_data_socketcall *ax; @@ -1232,6 +1323,13 @@ int audit_socketcall(int nargs, unsigned long *args) return 0; } +/** + * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto + * @len: data length in user space + * @a: data address in kernel space + * + * Returns 0 for success or NULL context or < 0 on error. + */ int audit_sockaddr(int len, void *a) { struct audit_aux_data_sockaddr *ax; @@ -1253,6 +1351,15 @@ int audit_sockaddr(int len, void *a) return 0; } +/** + * audit_avc_path - record the granting or denial of permissions + * @dentry: dentry to record + * @mnt: mnt to record + * + * Returns 0 for success or NULL context or < 0 on error. + * + * Called from security/selinux/avc.c::avc_audit() + */ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) { struct audit_aux_data_path *ax; @@ -1274,6 +1381,14 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) return 0; } +/** + * audit_signal_info - record signal info for shutting down audit subsystem + * @sig: signal value + * @t: task being signaled + * + * If the audit subsystem is being terminated, record the task (pid) + * and uid that is doing that. + */ void audit_signal_info(int sig, struct task_struct *t) { extern pid_t audit_sig_pid; @@ -1290,4 +1405,3 @@ void audit_signal_info(int sig, struct task_struct *t) } } } - -- cgit v1.1 From b63862f46547487388e582e8ac9083830d34f058 Mon Sep 17 00:00:00 2001 From: Dustin Kirkland Date: Thu, 3 Nov 2005 15:41:46 +0000 Subject: [PATCH] Filter rule comparators Currently, audit only supports the "=" and "!=" operators in the -F filter rules. This patch reworks the support for "=" and "!=", and adds support for ">", ">=", "<", and "<=". This turned out to be a pretty clean and simple process. I ended up using the high order bits of the "field", as suggested by Steve and Amy. This allowed for no changes whatsoever to the netlink communications. See the documentation within the patch in the include/linux/audit.h area, where there is a table that explains the reasoning of the bitmask assignments clearly. The patch adds a new function, audit_comparator(left, op, right). This function will perform the specified comparison (op, which defaults to "==" for backward compatibility) between two values (left and right). If the negate bit is on, it will negate whatever that result was. This value is returned. Signed-off-by: Dustin Kirkland Signed-off-by: David Woodhouse --- kernel/auditsc.c | 117 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 75 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 51a4f58a..95076fa 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2,6 +2,7 @@ * Handles all system-call specific auditing features. * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * Copyright (C) 2005 IBM Corporation * All Rights Reserved.
* * This program is free software; you can redistribute it and/or modify @@ -27,6 +28,9 @@ * this file -- see entry.S) is based on a GPL'd patch written by * okir@suse.de and Copyright 2003 SuSE Linux AG. * + * The support of additional filter rules compares (>, <, >=, <=) was + * added by Dustin Kirkland , 2005. + * */ #include @@ -252,6 +256,7 @@ static inline int audit_add_rule(struct audit_rule *rule, struct list_head *list) { struct audit_entry *entry; + int i; /* Do not use the _rcu iterator here, since this is the only * addition routine. */ @@ -261,6 +266,16 @@ static inline int audit_add_rule(struct audit_rule *rule, } } + for (i = 0; i < rule->field_count; i++) { + if (rule->fields[i] & AUDIT_UNUSED_BITS) + return -EINVAL; + if ( rule->fields[i] & AUDIT_NEGATE ) + rule->fields[i] |= AUDIT_NOT_EQUAL; + else if ( (rule->fields[i] & AUDIT_OPERATORS) == 0 ) + rule->fields[i] |= AUDIT_EQUAL; + rule->fields[i] &= (~AUDIT_NEGATE); + } + if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) return -ENOMEM; if (audit_copy_rule(&entry->rule, rule)) { @@ -394,6 +409,26 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, return err; } +static int audit_comparator(const u32 left, const u32 op, const u32 right) +{ + switch (op) { + case AUDIT_EQUAL: + return (left == right); + case AUDIT_NOT_EQUAL: + return (left != right); + case AUDIT_LESS_THAN: + return (left < right); + case AUDIT_LESS_THAN_OR_EQUAL: + return (left <= right); + case AUDIT_GREATER_THAN: + return (left > right); + case AUDIT_GREATER_THAN_OR_EQUAL: + return (left >= right); + default: + return -EINVAL; + } +} + /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. */ static int audit_filter_rules(struct task_struct *tsk, @@ -404,62 +439,63 @@ static int audit_filter_rules(struct task_struct *tsk, int i, j; for (i = 0; i < rule->field_count; i++) { - u32 field = rule->fields[i] & ~AUDIT_NEGATE; + u32 field = rule->fields[i] & ~AUDIT_OPERATORS; + u32 op = rule->fields[i] & AUDIT_OPERATORS; u32 value = rule->values[i]; int result = 0; switch (field) { case AUDIT_PID: - result = (tsk->pid == value); + result = audit_comparator(tsk->pid, op, value); break; case AUDIT_UID: - result = (tsk->uid == value); + result = audit_comparator(tsk->uid, op, value); break; case AUDIT_EUID: - result = (tsk->euid == value); + result = audit_comparator(tsk->euid, op, value); break; case AUDIT_SUID: - result = (tsk->suid == value); + result = audit_comparator(tsk->suid, op, value); break; case AUDIT_FSUID: - result = (tsk->fsuid == value); + result = audit_comparator(tsk->fsuid, op, value); break; case AUDIT_GID: - result = (tsk->gid == value); + result = audit_comparator(tsk->gid, op, value); break; case AUDIT_EGID: - result = (tsk->egid == value); + result = audit_comparator(tsk->egid, op, value); break; case AUDIT_SGID: - result = (tsk->sgid == value); + result = audit_comparator(tsk->sgid, op, value); break; case AUDIT_FSGID: - result = (tsk->fsgid == value); + result = audit_comparator(tsk->fsgid, op, value); break; case AUDIT_PERS: - result = (tsk->personality == value); + result = audit_comparator(tsk->personality, op, value); break; case AUDIT_ARCH: - if (ctx) - result = (ctx->arch == value); + if (ctx) + result = audit_comparator(ctx->arch, op, value); break; case AUDIT_EXIT: if (ctx && ctx->return_valid) - result = (ctx->return_code == value); + result = audit_comparator(ctx->return_code, op, value); break; case AUDIT_SUCCESS: if (ctx && ctx->return_valid) { if (value) - result = 
(ctx->return_valid == AUDITSC_SUCCESS); + result = audit_comparator(ctx->return_valid, op, AUDITSC_SUCCESS); else - result = (ctx->return_valid == AUDITSC_FAILURE); + result = audit_comparator(ctx->return_valid, op, AUDITSC_FAILURE); } break; case AUDIT_DEVMAJOR: if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (MAJOR(ctx->names[j].dev)==value) { + if (audit_comparator(MAJOR(ctx->names[j].dev), op, value)) { ++result; break; } @@ -469,7 +505,7 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_DEVMINOR: if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (MINOR(ctx->names[j].dev)==value) { + if (audit_comparator(MINOR(ctx->names[j].dev), op, value)) { ++result; break; } @@ -479,7 +515,7 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_INODE: if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (ctx->names[j].ino == value) { + if (audit_comparator(ctx->names[j].ino, op, value)) { ++result; break; } @@ -489,19 +525,17 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_LOGINUID: result = 0; if (ctx) - result = (ctx->loginuid == value); + result = audit_comparator(ctx->loginuid, op, value); break; case AUDIT_ARG0: case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: if (ctx) - result = (ctx->argv[field-AUDIT_ARG0]==value); + result = audit_comparator(ctx->argv[field-AUDIT_ARG0], op, value); break; } - if (rule->fields[i] & AUDIT_NEGATE) - result = !result; if (!result) return 0; } @@ -550,49 +584,48 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, rcu_read_lock(); if (!list_empty(list)) { - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); - - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit - && audit_filter_rules(tsk, &e->rule, ctx, &state)) { - rcu_read_unlock(); - return state; - } - } + int word = AUDIT_WORD(ctx->major); + int bit = AUDIT_BIT(ctx->major); + + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit + && audit_filter_rules(tsk, &e->rule, ctx, &state)) { + rcu_read_unlock(); + return state; + } + } } rcu_read_unlock(); return AUDIT_BUILD_CONTEXT; } static int audit_filter_user_rules(struct netlink_skb_parms *cb, - struct audit_rule *rule, - enum audit_state *state) + struct audit_rule *rule, + enum audit_state *state) { int i; for (i = 0; i < rule->field_count; i++) { - u32 field = rule->fields[i] & ~AUDIT_NEGATE; + u32 field = rule->fields[i] & ~AUDIT_OPERATORS; + u32 op = rule->fields[i] & AUDIT_OPERATORS; u32 value = rule->values[i]; int result = 0; switch (field) { case AUDIT_PID: - result = (cb->creds.pid == value); + result = audit_comparator(cb->creds.pid, op, value); break; case AUDIT_UID: - result = (cb->creds.uid == value); + result = audit_comparator(cb->creds.uid, op, value); break; case AUDIT_GID: - result = (cb->creds.gid == value); + result = audit_comparator(cb->creds.gid, op, value); break; case AUDIT_LOGINUID: - result = (cb->loginuid == value); + result = audit_comparator(cb->loginuid, op, value); break; } - if (rule->fields[i] & AUDIT_NEGATE) - result = !result; if (!result) return 0; } -- cgit v1.1 From 90d526c074ae5db484388da56c399acf892b6c17 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Thu, 3 Nov 2005 15:48:08 +0000 Subject: [PATCH] Define new range of userspace messages. The attached patch updates various items for the new user space messages. Please apply. 
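For reference, each range is matched with a single gcc case-range label rather than one case per value. A condensed sketch of the capability check this enables (the numeric values are quoted from include/linux/audit.h of this era and should be treated as illustrative; is_user_audit_msg() is a made-up helper, not a kernel symbol):

#define AUDIT_USER             1005    /* message from userspace */
#define AUDIT_FIRST_USER_MSG   1100    /* original userspace message range */
#define AUDIT_LAST_USER_MSG    1199
#define AUDIT_FIRST_USER_MSG2  2100    /* new userspace message range */
#define AUDIT_LAST_USER_MSG2   2999

/* Sketch: any message type in either user range requires CAP_AUDIT_WRITE. */
static int is_user_audit_msg(u16 msg_type)
{
	switch (msg_type) {
	case AUDIT_USER:
	case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
	case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
		return 1;
	default:
		return 0;
	}
}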
Signed-off-by: Steve Grubb Signed-off-by: David Woodhouse --- kernel/audit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 973ca5a..6d61dd7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -369,6 +369,7 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: + case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) err = -EPERM; break; @@ -449,6 +450,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: + case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; -- cgit v1.1 From f38aa94224c5517a40ba56d453779f70d3229803 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Thu, 3 Nov 2005 15:57:06 +0000 Subject: [PATCH] Pass dentry, not just name, in fsnotify creation hooks. The audit hooks (to be added shortly) will want to see dentry->d_inode too, not just the name. Signed-off-by: Amy Griffis Signed-off-by: David Woodhouse --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 95076fa..55ba331 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -515,7 +515,7 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_INODE: if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(ctx->names[j].ino, op, value)) { + if ( audit_comparator(ctx->names[j].ino, op, value)) { ++result; break; } -- cgit v1.1 From 73241ccca0f7786933f1d31b3d86f2456549953a Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Thu, 3 Nov 2005 16:00:25 +0000 Subject: [PATCH] Collect more inode information during syscall processing. This patch augments the collection of inode info during syscall processing. It represents part of the functionality that was provided by the auditfs patch included in RHEL4. Specifically, it: - Collects information for target inodes created or removed during syscalls. Previous code only collects information for the target inode's parent. - Adds the audit_inode() hook to syscalls that operate on a file descriptor (e.g. fchown), enabling audit to do inode filtering for these calls. - Modifies filtering code to check audit context for either an inode # or a parent inode # matching a given rule. - Modifies logging to provide inode # for both parent and child. - Protect debug info from NULL audit_names.name. [AV: folded a later typo fix from the same author] Signed-off-by: Amy Griffis Signed-off-by: David Woodhouse Signed-off-by: Al Viro --- kernel/auditsc.c | 142 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 118 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 55ba331..73f932b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2,6 +2,7 @@ * Handles all system-call specific auditing features. * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright (C) 2005 IBM Corporation * All Rights Reserved. * @@ -31,11 +32,16 @@ * The support of additional filter rules compares (>, <, >=, <=) was * added by Dustin Kirkland , 2005. * + * Modified by Amy Griffis to collect additional + * filesystem information. 
*/ #include #include #include +#include +#include +#include #include #include #include @@ -97,12 +103,12 @@ enum audit_state { struct audit_names { const char *name; unsigned long ino; + unsigned long pino; dev_t dev; umode_t mode; uid_t uid; gid_t gid; dev_t rdev; - unsigned flags; }; struct audit_aux_data { @@ -515,7 +521,8 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_INODE: if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if ( audit_comparator(ctx->names[j].ino, op, value)) { + if (audit_comparator(ctx->names[j].ino, op, value) || + audit_comparator(ctx->names[j].pino, op, value)) { ++result; break; } @@ -696,17 +703,17 @@ static inline void audit_free_names(struct audit_context *context) #if AUDIT_DEBUG == 2 if (context->auditable ||context->put_count + context->ino_count != context->name_count) { - printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d" + printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" " name_count=%d put_count=%d" " ino_count=%d [NOT freeing]\n", - __LINE__, + __FILE__, __LINE__, context->serial, context->major, context->in_syscall, context->name_count, context->put_count, context->ino_count); for (i = 0; i < context->name_count; i++) printk(KERN_ERR "names[%d] = %p = %s\n", i, context->names[i].name, - context->names[i].name); + context->names[i].name ?: "(null)"); dump_stack(); return; } @@ -932,27 +939,34 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) } } for (i = 0; i < context->name_count; i++) { + unsigned long ino = context->names[i].ino; + unsigned long pino = context->names[i].pino; + ab = audit_log_start(context, gfp_mask, AUDIT_PATH); if (!ab) continue; /* audit_panic has been called */ audit_log_format(ab, "item=%d", i); - if (context->names[i].name) { - audit_log_format(ab, " name="); + + audit_log_format(ab, " name="); + if (context->names[i].name) audit_log_untrustedstring(ab, context->names[i].name); - } - audit_log_format(ab, " flags=%x\n", context->names[i].flags); - - if (context->names[i].ino != (unsigned long)-1) - audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" - " ouid=%u ogid=%u rdev=%02x:%02x", - context->names[i].ino, - MAJOR(context->names[i].dev), - MINOR(context->names[i].dev), - context->names[i].mode, - context->names[i].uid, - context->names[i].gid, - MAJOR(context->names[i].rdev), + else + audit_log_format(ab, "(null)"); + + if (pino != (unsigned long)-1) + audit_log_format(ab, " parent=%lu", pino); + if (ino != (unsigned long)-1) + audit_log_format(ab, " inode=%lu", ino); + if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1)) + audit_log_format(ab, " dev=%02x:%02x mode=%#o" + " ouid=%u ogid=%u rdev=%02x:%02x", + MAJOR(context->names[i].dev), + MINOR(context->names[i].dev), + context->names[i].mode, + context->names[i].uid, + context->names[i].gid, + MAJOR(context->names[i].rdev), MINOR(context->names[i].rdev)); audit_log_end(ab); } @@ -1174,7 +1188,7 @@ void audit_putname(const char *name) for (i = 0; i < context->name_count; i++) printk(KERN_ERR "name[%d] = %p = %s\n", i, context->names[i].name, - context->names[i].name); + context->names[i].name ?: "(null)"); } #endif __putname(name); @@ -1204,7 +1218,7 @@ void audit_putname(const char *name) * * Called from fs/namei.c:path_lookup(). 
*/ -void audit_inode(const char *name, const struct inode *inode, unsigned flags) +void __audit_inode(const char *name, const struct inode *inode, unsigned flags) { int idx; struct audit_context *context = current->audit_context; @@ -1230,13 +1244,93 @@ void audit_inode(const char *name, const struct inode *inode, unsigned flags) ++context->ino_count; #endif } - context->names[idx].flags = flags; - context->names[idx].ino = inode->i_ino; context->names[idx].dev = inode->i_sb->s_dev; context->names[idx].mode = inode->i_mode; context->names[idx].uid = inode->i_uid; context->names[idx].gid = inode->i_gid; context->names[idx].rdev = inode->i_rdev; + if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) && + (strcmp(name, ".") != 0)) { + context->names[idx].ino = (unsigned long)-1; + context->names[idx].pino = inode->i_ino; + } else { + context->names[idx].ino = inode->i_ino; + context->names[idx].pino = (unsigned long)-1; + } +} + +/** + * audit_inode_child - collect inode info for created/removed objects + * @dname: inode's dentry name + * @inode: inode being audited + * @pino: inode number of dentry parent + * + * For syscalls that create or remove filesystem objects, audit_inode + * can only collect information for the filesystem object's parent. + * This call updates the audit context with the child's information. + * Syscalls that create a new filesystem object must be hooked after + * the object is created. Syscalls that remove a filesystem object + * must be hooked prior, in order to capture the target inode during + * unsuccessful attempts. + */ +void __audit_inode_child(const char *dname, const struct inode *inode, + unsigned long pino) +{ + int idx; + struct audit_context *context = current->audit_context; + + if (!context->in_syscall) + return; + + /* determine matching parent */ + if (dname) + for (idx = 0; idx < context->name_count; idx++) + if (context->names[idx].pino == pino) { + const char *n; + const char *name = context->names[idx].name; + int dlen = strlen(dname); + int nlen = name ? strlen(name) : 0; + + if (nlen < dlen) + continue; + + /* disregard trailing slashes */ + n = name + nlen - 1; + while ((*n == '/') && (n > name)) + n--; + + /* find last path component */ + n = n - dlen + 1; + if (n < name) + continue; + else if (n > name) { + if (*--n != '/') + continue; + else + n++; + } + + if (strncmp(n, dname, dlen) == 0) + goto update_context; + } + + /* catch-all in case match not found */ + idx = context->name_count++; + context->names[idx].name = NULL; + context->names[idx].pino = pino; +#if AUDIT_DEBUG + context->ino_count++; +#endif + +update_context: + if (inode) { + context->names[idx].ino = inode->i_ino; + context->names[idx].dev = inode->i_sb->s_dev; + context->names[idx].mode = inode->i_mode; + context->names[idx].uid = inode->i_uid; + context->names[idx].gid = inode->i_gid; + context->names[idx].rdev = inode->i_rdev; + } } /** -- cgit v1.1 From c8edc80c8b8c397c53f4f659a05b9ea6208029bf Mon Sep 17 00:00:00 2001 From: Dustin Kirkland Date: Thu, 3 Nov 2005 16:12:36 +0000 Subject: [PATCH] Exclude messages by message type - Add a new, 5th filter called "exclude". - And add a new field AUDIT_MSGTYPE. - Define a new function audit_filter_exclude() that takes a message type as input and examines all rules in the filter. It returns '1' if the message is to be excluded, and '0' otherwise. - Call the audit_filter_exclude() function near the top of audit_log_start() just after asserting audit_initialized. 
If the message type is not to be audited, return NULL very early, before doing a lot of work. [combined with followup fix for bug in original patch, Nov 4, same author] [combined with later renaming AUDIT_FILTER_EXCLUDE->AUDIT_FILTER_TYPE and audit_filter_exclude() -> audit_filter_type()] Signed-off-by: Dustin Kirkland Signed-off-by: David Woodhouse Signed-off-by: Al Viro --- kernel/audit.c | 3 +++ kernel/auditsc.c | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 6d61dd7..1c3eb1b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -702,6 +702,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (!audit_initialized) return NULL; + if (unlikely(audit_filter_type(type))) + return NULL; + if (gfp_mask & __GFP_WAIT) reserve = 0; else diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 73f932b..31917ac 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -187,7 +187,8 @@ static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_filter_list[2]), LIST_HEAD_INIT(audit_filter_list[3]), LIST_HEAD_INIT(audit_filter_list[4]), -#if AUDIT_NR_FILTERS != 5 + LIST_HEAD_INIT(audit_filter_list[5]), +#if AUDIT_NR_FILTERS != 6 #error Fix audit_filter_list initialiser #endif }; @@ -663,6 +664,38 @@ int audit_filter_user(struct netlink_skb_parms *cb, int type) return ret; /* Audit by default */ } +int audit_filter_type(int type) +{ + struct audit_entry *e; + int result = 0; + + rcu_read_lock(); + if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE])) + goto unlock_and_return; + + list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE], + list) { + struct audit_rule *rule = &e->rule; + int i; + for (i = 0; i < rule->field_count; i++) { + u32 field = rule->fields[i] & ~AUDIT_OPERATORS; + u32 op = rule->fields[i] & AUDIT_OPERATORS; + u32 value = rule->values[i]; + if ( field == AUDIT_MSGTYPE ) { + result = audit_comparator(type, op, value); + if (!result) + break; + } + } + if (result) + goto unlock_and_return; + } +unlock_and_return: + rcu_read_unlock(); + return result; +} + + /* This should be called with task_lock() held. */ static inline struct audit_context *audit_get_context(struct task_struct *tsk, int return_valid, -- cgit v1.1 From 8c8570fb8feef2bc166bee75a85748b25cda22d9 Mon Sep 17 00:00:00 2001 From: Dustin Kirkland Date: Thu, 3 Nov 2005 17:15:16 +0000 Subject: [PATCH] Capture selinux subject/object context information. This patch extends existing audit records with subject/object context information. Audit records associated with filesystem inodes, ipc, and tasks now contain SELinux label information in the field "subj" if the item is performing the action, or in "obj" if the item is the receiver of an action. These labels are collected via hooks in SELinux and appended to the appropriate record in the audit code. This additional information is required for Common Criteria Labeled Security Protection Profile (LSPP). 
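The label collection below follows the usual two-pass LSM pattern: call the hook with a NULL buffer to learn the required length, allocate, then call again to fetch the value. A condensed sketch of that pattern as used by audit_log_task_context() in this patch (error handling trimmed; fetch_subj_label() is an illustrative name, not a kernel symbol):

/* Two-pass fetch of the current task's security label. */
static char *fetch_subj_label(gfp_t gfp_mask)
{
	char *ctx;
	ssize_t len;

	/* pass 1: a NULL buffer just reports the length required */
	len = security_getprocattr(current, "current", NULL, 0);
	if (len < 0)
		return NULL;
	ctx = kmalloc(len, gfp_mask);
	if (!ctx)
		return NULL;
	/* pass 2: fetch the label itself */
	len = security_getprocattr(current, "current", ctx, len);
	if (len < 0) {
		kfree(ctx);
		return NULL;
	}
	return ctx;	/* caller logs " subj=%s" and kfree()s it */
}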
[AV: fixed kmalloc flags use] [folded leak fixes] [folded cleanup from akpm (kfree(NULL)] [folded audit_inode_context() leak fix] [folded akpm's fix for audit_ipc_perm() definition in case of !CONFIG_AUDIT] Signed-off-by: Dustin Kirkland Signed-off-by: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- kernel/audit.c | 2 +- kernel/auditsc.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 135 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 1c3eb1b..45c123e 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -142,7 +142,7 @@ static void audit_set_pid(struct audit_buffer *ab, pid_t pid) nlh->nlmsg_pid = pid; } -static void audit_panic(const char *message) +void audit_panic(const char *message) { switch (audit_failure) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 31917ac..4e2256e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -34,6 +34,9 @@ * * Modified by Amy Griffis to collect additional * filesystem information. + * + * Subject and object context labeling support added by + * and for LSPP certification compliance. */ #include @@ -53,6 +56,7 @@ #include #include #include +#include /* 0 = no checking 1 = put_count checking @@ -109,6 +113,7 @@ struct audit_names { uid_t uid; gid_t gid; dev_t rdev; + char *ctx; }; struct audit_aux_data { @@ -125,6 +130,7 @@ struct audit_aux_data_ipcctl { uid_t uid; gid_t gid; mode_t mode; + char *ctx; }; struct audit_aux_data_socketcall { @@ -743,10 +749,11 @@ static inline void audit_free_names(struct audit_context *context) context->serial, context->major, context->in_syscall, context->name_count, context->put_count, context->ino_count); - for (i = 0; i < context->name_count; i++) + for (i = 0; i < context->name_count; i++) { printk(KERN_ERR "names[%d] = %p = %s\n", i, context->names[i].name, context->names[i].name ?: "(null)"); + } dump_stack(); return; } @@ -756,9 +763,13 @@ static inline void audit_free_names(struct audit_context *context) context->ino_count = 0; #endif - for (i = 0; i < context->name_count; i++) + for (i = 0; i < context->name_count; i++) { + char *p = context->names[i].ctx; + context->names[i].ctx = NULL; + kfree(p); if (context->names[i].name) __putname(context->names[i].name); + } context->name_count = 0; if (context->pwd) dput(context->pwd); @@ -778,6 +789,12 @@ static inline void audit_free_aux(struct audit_context *context) dput(axi->dentry); mntput(axi->mnt); } + if ( aux->type == AUDIT_IPC ) { + struct audit_aux_data_ipcctl *axi = (void *)aux; + if (axi->ctx) + kfree(axi->ctx); + } + context->aux = aux->next; kfree(aux); } @@ -862,7 +879,38 @@ static inline void audit_free_context(struct audit_context *context) printk(KERN_ERR "audit: freed %d contexts\n", count); } -static void audit_log_task_info(struct audit_buffer *ab) +static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask) +{ + char *ctx = NULL; + ssize_t len = 0; + + len = security_getprocattr(current, "current", NULL, 0); + if (len < 0) { + if (len != -EINVAL) + goto error_path; + return; + } + + ctx = kmalloc(len, gfp_mask); + if (!ctx) { + goto error_path; + return; + } + + len = security_getprocattr(current, "current", ctx, len); + if (len < 0 ) + goto error_path; + + audit_log_format(ab, " subj=%s", ctx); + +error_path: + if (ctx) + kfree(ctx); + audit_panic("security_getprocattr error in audit_log_task_context"); + return; +} + +static void audit_log_task_info(struct audit_buffer *ab, gfp_t gfp_mask) { char 
name[sizeof(current->comm)]; struct mm_struct *mm = current->mm; @@ -875,6 +923,10 @@ static void audit_log_task_info(struct audit_buffer *ab) if (!mm) return; + /* + * this is brittle; all callers that pass GFP_ATOMIC will have + * NULL current->mm and we won't get here. + */ down_read(&mm->mmap_sem); vma = mm->mmap; while (vma) { @@ -888,6 +940,7 @@ static void audit_log_task_info(struct audit_buffer *ab) vma = vma->vm_next; } up_read(&mm->mmap_sem); + audit_log_task_context(ab, gfp_mask); } static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) @@ -923,7 +976,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) context->gid, context->euid, context->suid, context->fsuid, context->egid, context->sgid, context->fsgid); - audit_log_task_info(ab); + audit_log_task_info(ab, gfp_mask); audit_log_end(ab); for (aux = context->aux; aux; aux = aux->next) { @@ -936,8 +989,8 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) case AUDIT_IPC: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - " qbytes=%lx iuid=%u igid=%u mode=%x", - axi->qbytes, axi->uid, axi->gid, axi->mode); + " qbytes=%lx iuid=%u igid=%u mode=%x obj=%s", + axi->qbytes, axi->uid, axi->gid, axi->mode, axi->ctx); break; } case AUDIT_SOCKETCALL: { @@ -1001,6 +1054,11 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) context->names[i].gid, MAJOR(context->names[i].rdev), MINOR(context->names[i].rdev)); + if (context->names[i].ctx) { + audit_log_format(ab, " obj=%s", + context->names[i].ctx); + } + audit_log_end(ab); } } @@ -1243,6 +1301,39 @@ void audit_putname(const char *name) #endif } +void audit_inode_context(int idx, const struct inode *inode) +{ + struct audit_context *context = current->audit_context; + char *ctx = NULL; + int len = 0; + + if (!security_inode_xattr_getsuffix()) + return; + + len = security_inode_getsecurity(inode, (char *)security_inode_xattr_getsuffix(), NULL, 0, 0); + if (len < 0) + goto error_path; + + ctx = kmalloc(len, GFP_KERNEL); + if (!ctx) + goto error_path; + + len = security_inode_getsecurity(inode, (char *)security_inode_xattr_getsuffix(), ctx, len, 0); + if (len < 0) + goto error_path; + + kfree(context->names[idx].ctx); + context->names[idx].ctx = ctx; + return; + +error_path: + if (ctx) + kfree(ctx); + audit_panic("error in audit_inode_context"); + return; +} + + /** * audit_inode - store the inode and device from a lookup * @name: name being audited @@ -1282,6 +1373,7 @@ void __audit_inode(const char *name, const struct inode *inode, unsigned flags) context->names[idx].uid = inode->i_uid; context->names[idx].gid = inode->i_gid; context->names[idx].rdev = inode->i_rdev; + audit_inode_context(idx, inode); if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) && (strcmp(name, ".") != 0)) { context->names[idx].ino = (unsigned long)-1; @@ -1363,6 +1455,7 @@ update_context: context->names[idx].uid = inode->i_uid; context->names[idx].gid = inode->i_gid; context->names[idx].rdev = inode->i_rdev; + audit_inode_context(idx, inode); } } @@ -1423,6 +1516,38 @@ uid_t audit_get_loginuid(struct audit_context *ctx) return ctx ? 
ctx->loginuid : -1; } +static char *audit_ipc_context(struct kern_ipc_perm *ipcp) +{ + struct audit_context *context = current->audit_context; + char *ctx = NULL; + int len = 0; + + if (likely(!context)) + return NULL; + + len = security_ipc_getsecurity(ipcp, NULL, 0); + if (len == -EOPNOTSUPP) + goto ret; + if (len < 0) + goto error_path; + + ctx = kmalloc(len, GFP_ATOMIC); + if (!ctx) + goto error_path; + + len = security_ipc_getsecurity(ipcp, ctx, len); + if (len < 0) + goto error_path; + + return ctx; + +error_path: + kfree(ctx); + audit_panic("error in audit_ipc_context"); +ret: + return NULL; +} + /** * audit_ipc_perms - record audit data for ipc * @qbytes: msgq bytes * @uid: msgq user id * @gid: msgq group id * @mode: msgq mode (permissions) * * Returns 0 for success or NULL context or < 0 on error. */ -int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) +int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) { struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; @@ -1440,7 +1565,7 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) if (likely(!context)) return 0; - ax = kmalloc(sizeof(*ax), GFP_KERNEL); + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); if (!ax) return -ENOMEM; @@ -1448,6 +1573,7 @@ ax->qbytes = qbytes; ax->uid = uid; ax->gid = gid; ax->mode = mode; + ax->ctx = audit_ipc_context(ipcp); ax->d.type = AUDIT_IPC; ax->d.next = context->aux; -- cgit v1.1 From 7306a0b9b3e2056a616c84841288ca2431a05627 Mon Sep 17 00:00:00 2001 From: Dustin Kirkland Date: Wed, 16 Nov 2005 15:53:13 +0000 Subject: [PATCH] Miscellaneous bug and warning fixes This patch fixes a couple of bugs revealed in new features recently added to -mm1: * fixes warnings due to inconsistent use of const struct inode *inode * fixes a bug that prevented a kernel from booting with audit on and SELinux off, due to a missing function in security/dummy.c * fixes a bug that throws spurious audit_panic() messages due to a missing return just before an error_path label * some reasonable house cleaning in audit_ipc_context(), audit_inode_context(), and audit_log_task_context() Signed-off-by: Dustin Kirkland Signed-off-by: David Woodhouse --- kernel/auditsc.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4e2256e..4ef1451 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -892,21 +892,20 @@ static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask) } ctx = kmalloc(len, gfp_mask); - if (!ctx) { + if (!ctx) goto error_path; - return; - } len = security_getprocattr(current, "current", ctx, len); if (len < 0 ) goto error_path; audit_log_format(ab, " subj=%s", ctx); + return; error_path: if (ctx) kfree(ctx); - audit_panic("security_getprocattr error in audit_log_task_context"); + audit_panic("error in audit_log_task_context"); return; } @@ -1304,13 +1303,16 @@ void audit_putname(const char *name) void audit_inode_context(int idx, const struct inode *inode) { struct audit_context *context = current->audit_context; + const char *suffix = security_inode_xattr_getsuffix(); char *ctx = NULL; int len = 0; - if (!security_inode_xattr_getsuffix()) - return; + if (!suffix) + goto ret; - len = security_inode_getsecurity(inode, (char *)security_inode_xattr_getsuffix(), NULL, 0, 0); + len = security_inode_getsecurity(inode, suffix,
NULL, 0, 0); + if (len == -EOPNOTSUPP) + goto ret; if (len < 0) goto error_path; @@ -1318,18 +1320,19 @@ void audit_inode_context(int idx, const struct inode *inode) if (!ctx) goto error_path; - len = security_inode_getsecurity(inode, (char *)security_inode_xattr_getsuffix(), ctx, len, 0); + len = security_inode_getsecurity(inode, suffix, ctx, len, 0); if (len < 0) goto error_path; kfree(context->names[idx].ctx); context->names[idx].ctx = ctx; - return; + goto ret; error_path: if (ctx) kfree(ctx); audit_panic("error in audit_inode_context"); +ret: return; } -- cgit v1.1 From fe7752bab26a9ac0651b695ad4f55659761f68f7 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 15 Dec 2005 18:33:52 +0000 Subject: [PATCH] Fix audit record filtering with !CONFIG_AUDITSYSCALL This fixes the per-user and per-message-type filtering when syscall auditing isn't enabled. [AV: folded followup fix from the same author] Signed-off-by: David Woodhouse Signed-off-by: Al Viro --- kernel/Makefile | 2 +- kernel/audit.c | 1 + kernel/audit.h | 70 ++++++++++ kernel/auditfilter.c | 378 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/auditsc.c | 380 +-------------------------------------------------- 5 files changed, 454 insertions(+), 377 deletions(-) create mode 100644 kernel/audit.h create mode 100644 kernel/auditfilter.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 4ae0fbd..58cf129 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -26,7 +26,7 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o -obj-$(CONFIG_AUDIT) += audit.o +obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o diff --git a/kernel/audit.c b/kernel/audit.c index 45c123e..07c5d2b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -305,6 +305,7 @@ static int kauditd_thread(void *dummy) remove_wait_queue(&kauditd_wait, &wait); } } + return 0; } /** diff --git a/kernel/audit.h b/kernel/audit.h new file mode 100644 index 0000000..7643e46 --- /dev/null +++ b/kernel/audit.h @@ -0,0 +1,70 @@ +/* audit -- definition of audit_context structure and supporting types + * + * Copyright 2003-2004 Red Hat, Inc. + * Copyright 2005 Hewlett-Packard Development Company, L.P. + * Copyright 2005 IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include + +/* 0 = no checking + 1 = put_count checking + 2 = verbose put_count checking +*/ +#define AUDIT_DEBUG 0 + +/* At task start time, the audit_state is set in the audit_context using + a per-task filter. At syscall entry, the audit_state is augmented by + the syscall filter. */ +enum audit_state { + AUDIT_DISABLED, /* Do not create per-task audit_context. 
+ * No syscall-specific audit records can + * be generated. */ + AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, + * but don't necessarily fill it in at + * syscall entry time (i.e., filter + * instead). */ + AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, + * and always fill it in at syscall + * entry time. This makes a full + * syscall record available if some + * other part of the kernel decides it + * should be recorded. */ + AUDIT_RECORD_CONTEXT /* Create the per-task audit_context, + * always fill it in at syscall entry + * time, and always write out the audit + * record at syscall exit time. */ +}; + +/* Rule lists */ +struct audit_entry { + struct list_head list; + struct rcu_head rcu; + struct audit_rule rule; +}; + + +extern int audit_pid; +extern int audit_comparator(const u32 left, const u32 op, const u32 right); + +extern void audit_send_reply(int pid, int seq, int type, + int done, int multi, + void *payload, int size); +extern void audit_log_lost(const char *message); +extern void audit_panic(const char *message); +extern struct semaphore audit_netlink_sem; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c new file mode 100644 index 0000000..7f347c3 --- /dev/null +++ b/kernel/auditfilter.c @@ -0,0 +1,378 @@ +/* auditfilter.c -- filtering of audit events + * + * Copyright 2003-2004 Red Hat, Inc. + * Copyright 2005 Hewlett-Packard Development Company, L.P. + * Copyright 2005 IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/audit.h> +#include <linux/kthread.h> +#include <linux/netlink.h> +#include "audit.h" + +/* There are three lists of rules -- one to search at task creation + * time, one to search at syscall entry time, and another to search at + * syscall exit time. */ +struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { + LIST_HEAD_INIT(audit_filter_list[0]), + LIST_HEAD_INIT(audit_filter_list[1]), + LIST_HEAD_INIT(audit_filter_list[2]), + LIST_HEAD_INIT(audit_filter_list[3]), + LIST_HEAD_INIT(audit_filter_list[4]), + LIST_HEAD_INIT(audit_filter_list[5]), +#if AUDIT_NR_FILTERS != 6 +#error Fix audit_filter_list initialiser +#endif +}; + +/* Copy rule from user-space to kernel-space. Called from + * audit_add_rule during AUDIT_ADD. 
*/ +static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) +{ + int i; + + if (s->action != AUDIT_NEVER + && s->action != AUDIT_POSSIBLE + && s->action != AUDIT_ALWAYS) + return -1; + if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS) + return -1; + if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS) + return -1; + + d->flags = s->flags; + d->action = s->action; + d->field_count = s->field_count; + for (i = 0; i < d->field_count; i++) { + d->fields[i] = s->fields[i]; + d->values[i] = s->values[i]; + } + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i]; + return 0; +} + +/* Check to see if two rules are identical. It is called from + * audit_add_rule during AUDIT_ADD and + * audit_del_rule during AUDIT_DEL. */ +static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) +{ + int i; + + if (a->flags != b->flags) + return 1; + + if (a->action != b->action) + return 1; + + if (a->field_count != b->field_count) + return 1; + + for (i = 0; i < a->field_count; i++) { + if (a->fields[i] != b->fields[i] + || a->values[i] != b->values[i]) + return 1; + } + + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) + if (a->mask[i] != b->mask[i]) + return 1; + + return 0; +} + +/* Note that audit_add_rule and audit_del_rule are called via + * audit_receive() in audit.c, and are protected by + * audit_netlink_sem. */ +static inline int audit_add_rule(struct audit_rule *rule, + struct list_head *list) +{ + struct audit_entry *entry; + int i; + + /* Do not use the _rcu iterator here, since this is the only + * addition routine. */ + list_for_each_entry(entry, list, list) { + if (!audit_compare_rule(rule, &entry->rule)) { + return -EEXIST; + } + } + + for (i = 0; i < rule->field_count; i++) { + if (rule->fields[i] & AUDIT_UNUSED_BITS) + return -EINVAL; + if ( rule->fields[i] & AUDIT_NEGATE ) + rule->fields[i] |= AUDIT_NOT_EQUAL; + else if ( (rule->fields[i] & AUDIT_OPERATORS) == 0 ) + rule->fields[i] |= AUDIT_EQUAL; + rule->fields[i] &= (~AUDIT_NEGATE); + } + + if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) + return -ENOMEM; + if (audit_copy_rule(&entry->rule, rule)) { + kfree(entry); + return -EINVAL; + } + + if (entry->rule.flags & AUDIT_FILTER_PREPEND) { + entry->rule.flags &= ~AUDIT_FILTER_PREPEND; + list_add_rcu(&entry->list, list); + } else { + list_add_tail_rcu(&entry->list, list); + } + + return 0; +} + +static inline void audit_free_rule(struct rcu_head *head) +{ + struct audit_entry *e = container_of(head, struct audit_entry, rcu); + kfree(e); +} + +/* Note that audit_add_rule and audit_del_rule are called via + * audit_receive() in audit.c, and are protected by + * audit_netlink_sem. */ +static inline int audit_del_rule(struct audit_rule *rule, + struct list_head *list) +{ + struct audit_entry *e; + + /* Do not use the _rcu iterator here, since this is the only + * deletion routine. */ + list_for_each_entry(e, list, list) { + if (!audit_compare_rule(rule, &e->rule)) { + list_del_rcu(&e->list); + call_rcu(&e->rcu, audit_free_rule); + return 0; + } + } + return -ENOENT; /* No matching rule */ +} + +static int audit_list_rules(void *_dest) +{ + int pid, seq; + int *dest = _dest; + struct audit_entry *entry; + int i; + + pid = dest[0]; + seq = dest[1]; + kfree(dest); + + down(&audit_netlink_sem); + + /* The *_rcu iterators not needed here because we are + always called with audit_netlink_sem held. 
*/ + for (i=0; i<AUDIT_NR_FILTERS; i++) { + list_for_each_entry(entry, &audit_filter_list[i], list) + audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + &entry->rule, sizeof(entry->rule)); + } + audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); + + up(&audit_netlink_sem); + return 0; +} + +/** + * audit_receive_filter - apply all rules to the specified message type + * @type: audit message type + * @pid: target pid for netlink audit messages + * @uid: target uid for netlink audit messages + * @seq: netlink audit message sequence (serial) number + * @data: payload data + * @loginuid: loginuid of sender + */ +int audit_receive_filter(int type, int pid, int uid, int seq, void *data, + uid_t loginuid) +{ + struct task_struct *tsk; + int *dest; + int err = 0; + unsigned listnr; + + switch (type) { + case AUDIT_LIST: + /* We can't just spew out the rules here because we might fill + * the available socket buffer space and deadlock waiting for + * auditctl to read from it... which isn't ever going to + * happen if we're actually running in the context of auditctl + * trying to _send_ the stuff */ + + dest = kmalloc(2 * sizeof(int), GFP_KERNEL); + if (!dest) + return -ENOMEM; + dest[0] = pid; + dest[1] = seq; + + tsk = kthread_run(audit_list_rules, dest, "audit_list_rules"); + if (IS_ERR(tsk)) { + kfree(dest); + err = PTR_ERR(tsk); + } + break; + case AUDIT_ADD: + listnr = ((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND; + switch(listnr) { + default: + return -EINVAL; + + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: +#ifdef CONFIG_AUDITSYSCALL + case AUDIT_FILTER_ENTRY: + case AUDIT_FILTER_EXIT: + case AUDIT_FILTER_TASK: +#endif + ; + } + err = audit_add_rule(data, &audit_filter_list[listnr]); + if (!err) + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u added an audit rule\n", loginuid); + break; + case AUDIT_DEL: + listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND; + if (listnr >= AUDIT_NR_FILTERS) + return -EINVAL; + + err = audit_del_rule(data, &audit_filter_list[listnr]); + if (!err) + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u removed an audit rule\n", loginuid); + break; + default: + return -EINVAL; + } + + return err; +} + +int audit_comparator(const u32 left, const u32 op, const u32 right) +{ + switch (op) { + case AUDIT_EQUAL: + return (left == right); + case AUDIT_NOT_EQUAL: + return (left != right); + case AUDIT_LESS_THAN: + return (left < right); + case AUDIT_LESS_THAN_OR_EQUAL: + return (left <= right); + case AUDIT_GREATER_THAN: + return (left > right); + case AUDIT_GREATER_THAN_OR_EQUAL: + return (left >= right); + default: + return -EINVAL; + } +} + + + +static int audit_filter_user_rules(struct netlink_skb_parms *cb, + struct audit_rule *rule, + enum audit_state *state) +{ + int i; + + for (i = 0; i < rule->field_count; i++) { + u32 field = rule->fields[i] & ~AUDIT_OPERATORS; + u32 op = rule->fields[i] & AUDIT_OPERATORS; + u32 value = rule->values[i]; + int result = 0; + + switch (field) { + case AUDIT_PID: + result = audit_comparator(cb->creds.pid, op, value); + break; + case AUDIT_UID: + result = audit_comparator(cb->creds.uid, op, value); + break; + case AUDIT_GID: + result = audit_comparator(cb->creds.gid, op, value); + break; + case AUDIT_LOGINUID: + result = audit_comparator(cb->loginuid, op, value); + break; + } + + if (!result) + return 0; + } + switch (rule->action) { + case AUDIT_NEVER: *state = AUDIT_DISABLED; break; + case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; + case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; + } + return 1; +} + +int audit_filter_user(struct netlink_skb_parms *cb, int type) +{ + struct audit_entry *e; + 
enum audit_state state; + int ret = 1; + + rcu_read_lock(); + list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { + if (audit_filter_user_rules(cb, &e->rule, &state)) { + if (state == AUDIT_DISABLED) + ret = 0; + break; + } + } + rcu_read_unlock(); + + return ret; /* Audit by default */ +} + +int audit_filter_type(int type) +{ + struct audit_entry *e; + int result = 0; + + rcu_read_lock(); + if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE])) + goto unlock_and_return; + + list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE], + list) { + struct audit_rule *rule = &e->rule; + int i; + for (i = 0; i < rule->field_count; i++) { + u32 field = rule->fields[i] & ~AUDIT_OPERATORS; + u32 op = rule->fields[i] & AUDIT_OPERATORS; + u32 value = rule->values[i]; + if ( field == AUDIT_MSGTYPE ) { + result = audit_comparator(type, op, value); + if (!result) + break; + } + } + if (result) + goto unlock_and_return; + } +unlock_and_return: + rcu_read_unlock(); + return result; +} + + diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4ef1451..17719b3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -52,17 +52,15 @@ #include #include #include -#include #include #include #include #include +#include -/* 0 = no checking - 1 = put_count checking - 2 = verbose put_count checking -*/ -#define AUDIT_DEBUG 0 +#include "audit.h" + +extern struct list_head audit_filter_list[]; /* No syscall auditing will take place unless audit_enabled != 0. */ extern int audit_enabled; @@ -76,29 +74,6 @@ extern int audit_enabled; * path_lookup. */ #define AUDIT_NAMES_RESERVED 7 -/* At task start time, the audit_state is set in the audit_context using - a per-task filter. At syscall entry, the audit_state is augmented by - the syscall filter. */ -enum audit_state { - AUDIT_DISABLED, /* Do not create per-task audit_context. - * No syscall-specific audit records can - * be generated. */ - AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, - * but don't necessarily fill it in at - * syscall entry time (i.e., filter - * instead). */ - AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, - * and always fill it in at syscall - * entry time. This makes a full - * syscall record available if some - * other part of the kernel decides it - * should be recorded. */ - AUDIT_RECORD_CONTEXT /* Create the per-task audit_context, - * always fill it in at syscall entry - * time, and always write out the audit - * record at syscall exit time. */ -}; - /* When fs/namei.c:getname() is called, we store the pointer in name and * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). @@ -183,264 +158,6 @@ struct audit_context { #endif }; - /* Public API */ -/* There are three lists of rules -- one to search at task creation - * time, one to search at syscall entry time, and another to search at - * syscall exit time. */ -static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { - LIST_HEAD_INIT(audit_filter_list[0]), - LIST_HEAD_INIT(audit_filter_list[1]), - LIST_HEAD_INIT(audit_filter_list[2]), - LIST_HEAD_INIT(audit_filter_list[3]), - LIST_HEAD_INIT(audit_filter_list[4]), - LIST_HEAD_INIT(audit_filter_list[5]), -#if AUDIT_NR_FILTERS != 6 -#error Fix audit_filter_list initialiser -#endif -}; - -struct audit_entry { - struct list_head list; - struct rcu_head rcu; - struct audit_rule rule; -}; - -extern int audit_pid; - -/* Copy rule from user-space to kernel-space. Called from - * audit_add_rule during AUDIT_ADD. 
*/ -static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) -{ - int i; - - if (s->action != AUDIT_NEVER - && s->action != AUDIT_POSSIBLE - && s->action != AUDIT_ALWAYS) - return -1; - if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS) - return -1; - if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS) - return -1; - - d->flags = s->flags; - d->action = s->action; - d->field_count = s->field_count; - for (i = 0; i < d->field_count; i++) { - d->fields[i] = s->fields[i]; - d->values[i] = s->values[i]; - } - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i]; - return 0; -} - -/* Check to see if two rules are identical. It is called from - * audit_add_rule during AUDIT_ADD and - * audit_del_rule during AUDIT_DEL. */ -static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) -{ - int i; - - if (a->flags != b->flags) - return 1; - - if (a->action != b->action) - return 1; - - if (a->field_count != b->field_count) - return 1; - - for (i = 0; i < a->field_count; i++) { - if (a->fields[i] != b->fields[i] - || a->values[i] != b->values[i]) - return 1; - } - - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) - if (a->mask[i] != b->mask[i]) - return 1; - - return 0; -} - -/* Note that audit_add_rule and audit_del_rule are called via - * audit_receive() in audit.c, and are protected by - * audit_netlink_sem. */ -static inline int audit_add_rule(struct audit_rule *rule, - struct list_head *list) -{ - struct audit_entry *entry; - int i; - - /* Do not use the _rcu iterator here, since this is the only - * addition routine. */ - list_for_each_entry(entry, list, list) { - if (!audit_compare_rule(rule, &entry->rule)) { - return -EEXIST; - } - } - - for (i = 0; i < rule->field_count; i++) { - if (rule->fields[i] & AUDIT_UNUSED_BITS) - return -EINVAL; - if ( rule->fields[i] & AUDIT_NEGATE ) - rule->fields[i] |= AUDIT_NOT_EQUAL; - else if ( (rule->fields[i] & AUDIT_OPERATORS) == 0 ) - rule->fields[i] |= AUDIT_EQUAL; - rule->fields[i] &= (~AUDIT_NEGATE); - } - - if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) - return -ENOMEM; - if (audit_copy_rule(&entry->rule, rule)) { - kfree(entry); - return -EINVAL; - } - - if (entry->rule.flags & AUDIT_FILTER_PREPEND) { - entry->rule.flags &= ~AUDIT_FILTER_PREPEND; - list_add_rcu(&entry->list, list); - } else { - list_add_tail_rcu(&entry->list, list); - } - - return 0; -} - -static inline void audit_free_rule(struct rcu_head *head) -{ - struct audit_entry *e = container_of(head, struct audit_entry, rcu); - kfree(e); -} - -/* Note that audit_add_rule and audit_del_rule are called via - * audit_receive() in audit.c, and are protected by - * audit_netlink_sem. */ -static inline int audit_del_rule(struct audit_rule *rule, - struct list_head *list) -{ - struct audit_entry *e; - - /* Do not use the _rcu iterator here, since this is the only - * deletion routine. */ - list_for_each_entry(e, list, list) { - if (!audit_compare_rule(rule, &e->rule)) { - list_del_rcu(&e->list); - call_rcu(&e->rcu, audit_free_rule); - return 0; - } - } - return -ENOENT; /* No matching rule */ -} - -static int audit_list_rules(void *_dest) -{ - int pid, seq; - int *dest = _dest; - struct audit_entry *entry; - int i; - - pid = dest[0]; - seq = dest[1]; - kfree(dest); - - down(&audit_netlink_sem); - - /* The *_rcu iterators not needed here because we are - always called with audit_netlink_sem held. 
*/ - for (i=0; i<AUDIT_NR_FILTERS; i++) { - list_for_each_entry(entry, &audit_filter_list[i], list) - audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, - &entry->rule, sizeof(entry->rule)); - } - audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); - - up(&audit_netlink_sem); - return 0; -} - -/** - * audit_receive_filter - apply all rules to the specified message type - * @type: audit message type - * @pid: target pid for netlink audit messages - * @uid: target uid for netlink audit messages - * @seq: netlink audit message sequence (serial) number - * @data: payload data - * @loginuid: loginuid of sender - */ -int audit_receive_filter(int type, int pid, int uid, int seq, void *data, - uid_t loginuid) -{ - struct task_struct *tsk; - int *dest; - int err = 0; - unsigned listnr; - - switch (type) { - case AUDIT_LIST: - /* We can't just spew out the rules here because we might fill - * the available socket buffer space and deadlock waiting for - * auditctl to read from it... which isn't ever going to - * happen if we're actually running in the context of auditctl - * trying to _send_ the stuff */ - - dest = kmalloc(2 * sizeof(int), GFP_KERNEL); - if (!dest) - return -ENOMEM; - dest[0] = pid; - dest[1] = seq; - - tsk = kthread_run(audit_list_rules, dest, "audit_list_rules"); - if (IS_ERR(tsk)) { - kfree(dest); - err = PTR_ERR(tsk); - } - break; - case AUDIT_ADD: - listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND; - if (listnr >= AUDIT_NR_FILTERS) - return -EINVAL; - - err = audit_add_rule(data, &audit_filter_list[listnr]); - if (!err) - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "auid=%u added an audit rule\n", loginuid); - break; - case AUDIT_DEL: - listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND; - if (listnr >= AUDIT_NR_FILTERS) - return -EINVAL; - - err = audit_del_rule(data, &audit_filter_list[listnr]); - if (!err) - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "auid=%u removed an audit rule\n", loginuid); - break; - default: - return -EINVAL; - } - - return err; -} - -static int audit_comparator(const u32 left, const u32 op, const u32 right) -{ - switch (op) { - case AUDIT_EQUAL: - return (left == right); - case AUDIT_NOT_EQUAL: - return (left != right); - case AUDIT_LESS_THAN: - return (left < right); - case AUDIT_LESS_THAN_OR_EQUAL: - return (left <= right); - case AUDIT_GREATER_THAN: - return (left > right); - case AUDIT_GREATER_THAN_OR_EQUAL: - return (left >= right); - default: - return -EINVAL; - } -} /* Compare a task_struct with an audit_rule. Return 1 on match, 0 - * otherwise. 
*/ @@ -613,95 +330,6 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_BUILD_CONTEXT; } -static int audit_filter_user_rules(struct netlink_skb_parms *cb, - struct audit_rule *rule, - enum audit_state *state) -{ - int i; - - for (i = 0; i < rule->field_count; i++) { - u32 field = rule->fields[i] & ~AUDIT_OPERATORS; - u32 op = rule->fields[i] & AUDIT_OPERATORS; - u32 value = rule->values[i]; - int result = 0; - - switch (field) { - case AUDIT_PID: - result = audit_comparator(cb->creds.pid, op, value); - break; - case AUDIT_UID: - result = audit_comparator(cb->creds.uid, op, value); - break; - case AUDIT_GID: - result = audit_comparator(cb->creds.gid, op, value); - break; - case AUDIT_LOGINUID: - result = audit_comparator(cb->loginuid, op, value); - break; - } - - if (!result) - return 0; - } - switch (rule->action) { - case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; - case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; - } - return 1; -} - -int audit_filter_user(struct netlink_skb_parms *cb, int type) -{ - struct audit_entry *e; - enum audit_state state; - int ret = 1; - - rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { - if (audit_filter_user_rules(cb, &e->rule, &state)) { - if (state == AUDIT_DISABLED) - ret = 0; - break; - } - } - rcu_read_unlock(); - - return ret; /* Audit by default */ -} - -int audit_filter_type(int type) -{ - struct audit_entry *e; - int result = 0; - - rcu_read_lock(); - if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE])) - goto unlock_and_return; - - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE], - list) { - struct audit_rule *rule = &e->rule; - int i; - for (i = 0; i < rule->field_count; i++) { - u32 field = rule->fields[i] & ~AUDIT_OPERATORS; - u32 op = rule->fields[i] & AUDIT_OPERATORS; - u32 value = rule->values[i]; - if ( field == AUDIT_MSGTYPE ) { - result = audit_comparator(type, op, value); - if (!result) - break; - } - } - if (result) - goto unlock_and_return; - } -unlock_and_return: - rcu_read_unlock(); - return result; -} - - /* This should be called with task_lock() held. */ static inline struct audit_context *audit_get_context(struct task_struct *tsk, int return_valid, -- cgit v1.1 From d884596f44ef5a0bcd8a66405dc04902aeaa6fc7 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 16 Dec 2005 10:48:28 +0000 Subject: [PATCH] Minor cosmetic cleanups to the code moved into auditfilter.c Signed-off-by: David Woodhouse --- kernel/auditfilter.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 7f347c3..a3a3275 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -69,7 +69,7 @@ static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) /* Check to see if two rules are identical. It is called from * audit_add_rule during AUDIT_ADD and * audit_del_rule during AUDIT_DEL. */ -static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) +static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) { int i; @@ -107,19 +107,18 @@ static inline int audit_add_rule(struct audit_rule *rule, /* Do not use the _rcu iterator here, since this is the only * addition routine. 
*/ list_for_each_entry(entry, list, list) { - if (!audit_compare_rule(rule, &entry->rule)) { + if (!audit_compare_rule(rule, &entry->rule)) return -EEXIST; - } } for (i = 0; i < rule->field_count; i++) { if (rule->fields[i] & AUDIT_UNUSED_BITS) return -EINVAL; - if ( rule->fields[i] & AUDIT_NEGATE ) + if ( rule->fields[i] & AUDIT_NEGATE) rule->fields[i] |= AUDIT_NOT_EQUAL; else if ( (rule->fields[i] & AUDIT_OPERATORS) == 0 ) rule->fields[i] |= AUDIT_EQUAL; - rule->fields[i] &= (~AUDIT_NEGATE); + rule->fields[i] &= ~AUDIT_NEGATE; } if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) @@ -374,5 +373,3 @@ unlock_and_return: rcu_read_unlock(); return result; } - - -- cgit v1.1 From 93315ed6dd12dacfc941f9eb8ca0293aadf99793 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Tue, 7 Feb 2006 12:05:27 -0500 Subject: [PATCH] audit string fields interface + consumer Updated patch to dynamically allocate audit rule fields in kernel's internal representation. Added unlikely() calls for testing memory allocation result. Amy Griffis wrote: [Wed Jan 11 2006, 02:02:31PM EST] > Modify audit's kernel-userspace interface to allow the specification > of string fields in audit rules. > > Signed-off-by: Amy Griffis Signed-off-by: Al Viro (cherry picked from 5ffc4a863f92351b720fe3e9c5cd647accff9e03 commit) --- kernel/audit.c | 19 ++- kernel/audit.h | 23 ++- kernel/auditfilter.c | 467 +++++++++++++++++++++++++++++++++++++++------------ kernel/auditsc.c | 50 +++--- 4 files changed, 418 insertions(+), 141 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 07c5d2b..4eb97b6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -52,6 +52,7 @@ #include #include +#include #include #include @@ -361,9 +362,12 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) switch (msg_type) { case AUDIT_GET: case AUDIT_LIST: + case AUDIT_LIST_RULES: case AUDIT_SET: case AUDIT_ADD: + case AUDIT_ADD_RULE: case AUDIT_DEL: + case AUDIT_DEL_RULE: case AUDIT_SIGNAL_INFO: if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) err = -EPERM; @@ -470,12 +474,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_ADD: case AUDIT_DEL: - if (nlh->nlmsg_len < sizeof(struct audit_rule)) + if (nlmsg_len(nlh) < sizeof(struct audit_rule)) return -EINVAL; /* fallthrough */ case AUDIT_LIST: err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, - uid, seq, data, loginuid); + uid, seq, data, nlmsg_len(nlh), + loginuid); + break; + case AUDIT_ADD_RULE: + case AUDIT_DEL_RULE: + if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) + return -EINVAL; + /* fallthrough */ + case AUDIT_LIST_RULES: + err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, + uid, seq, data, nlmsg_len(nlh), + loginuid); break; case AUDIT_SIGNAL_INFO: sig_data.uid = audit_sig_uid; diff --git a/kernel/audit.h b/kernel/audit.h index 7643e46..4b602cd 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -52,10 +52,27 @@ enum audit_state { }; /* Rule lists */ +struct audit_field { + u32 type; + u32 val; + u32 op; +}; + +struct audit_krule { + int vers_ops; + u32 flags; + u32 listnr; + u32 action; + u32 mask[AUDIT_BITMASK_SIZE]; + u32 buflen; /* for data alloc on list rules */ + u32 field_count; + struct audit_field *fields; +}; + struct audit_entry { - struct list_head list; - struct rcu_head rcu; - struct audit_rule rule; + struct list_head list; + struct rcu_head rcu; + struct audit_krule rule; }; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a3a3275..686d514 100644 --- 
a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -40,52 +40,279 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { #endif }; -/* Copy rule from user-space to kernel-space. Called from - * audit_add_rule during AUDIT_ADD. */ -static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) +static inline void audit_free_rule(struct audit_entry *e) { + kfree(e->rule.fields); + kfree(e); +} + +static inline void audit_free_rule_rcu(struct rcu_head *head) +{ + struct audit_entry *e = container_of(head, struct audit_entry, rcu); + audit_free_rule(e); +} + +/* Unpack a filter field's string representation from user-space + * buffer. */ +static __attribute__((unused)) char *audit_unpack_string(void **bufp, size_t *remain, size_t len) +{ + char *str; + + if (!*bufp || (len == 0) || (len > *remain)) + return ERR_PTR(-EINVAL); + + /* Of the currently implemented string fields, PATH_MAX + * defines the longest valid length. + */ + if (len > PATH_MAX) + return ERR_PTR(-ENAMETOOLONG); + + str = kmalloc(len + 1, GFP_KERNEL); + if (unlikely(!str)) + return ERR_PTR(-ENOMEM); + + memcpy(str, *bufp, len); + str[len] = 0; + *bufp += len; + *remain -= len; + + return str; +} + +/* Common user-space to kernel rule translation. */ +static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) +{ + unsigned listnr; + struct audit_entry *entry; + struct audit_field *fields; + int i, err; + + err = -EINVAL; + listnr = rule->flags & ~AUDIT_FILTER_PREPEND; + switch(listnr) { + default: + goto exit_err; + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: +#ifdef CONFIG_AUDITSYSCALL + case AUDIT_FILTER_ENTRY: + case AUDIT_FILTER_EXIT: + case AUDIT_FILTER_TASK: +#endif + ; + } + if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE && + rule->action != AUDIT_ALWAYS) + goto exit_err; + if (rule->field_count > AUDIT_MAX_FIELDS) + goto exit_err; + + err = -ENOMEM; + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (unlikely(!entry)) + goto exit_err; + fields = kmalloc(sizeof(*fields) * rule->field_count, GFP_KERNEL); + if (unlikely(!fields)) { + kfree(entry); + goto exit_err; + } + + memset(&entry->rule, 0, sizeof(struct audit_krule)); + memset(fields, 0, sizeof(struct audit_field)); + + entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND; + entry->rule.listnr = listnr; + entry->rule.action = rule->action; + entry->rule.field_count = rule->field_count; + entry->rule.fields = fields; + + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) + entry->rule.mask[i] = rule->mask[i]; + + return entry; + +exit_err: + return ERR_PTR(err); +} + +/* Translate struct audit_rule to kernel's rule respresentation. + * Exists for backward compatibility with userspace. 
*/ +static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) +{ + struct audit_entry *entry; + int err = 0; int i; - if (s->action != AUDIT_NEVER - && s->action != AUDIT_POSSIBLE - && s->action != AUDIT_ALWAYS) - return -1; - if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS) - return -1; - if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS) - return -1; - - d->flags = s->flags; - d->action = s->action; - d->field_count = s->field_count; - for (i = 0; i < d->field_count; i++) { - d->fields[i] = s->fields[i]; - d->values[i] = s->values[i]; + entry = audit_to_entry_common(rule); + if (IS_ERR(entry)) + goto exit_nofree; + + for (i = 0; i < rule->field_count; i++) { + struct audit_field *f = &entry->rule.fields[i]; + + if (rule->fields[i] & AUDIT_UNUSED_BITS) { + err = -EINVAL; + goto exit_free; + } + + f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); + f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); + f->val = rule->values[i]; + + entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; + if (f->op & AUDIT_NEGATE) + f->op |= AUDIT_NOT_EQUAL; + else if (!(f->op & AUDIT_OPERATORS)) + f->op |= AUDIT_EQUAL; + f->op &= ~AUDIT_NEGATE; } - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i]; - return 0; + +exit_nofree: + return entry; + +exit_free: + audit_free_rule(entry); + return ERR_PTR(err); } -/* Check to see if two rules are identical. It is called from - * audit_add_rule during AUDIT_ADD and - * audit_del_rule during AUDIT_DEL. */ -static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) +/* Translate struct audit_rule_data to kernel's rule respresentation. */ +static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, + size_t datasz) { + int err = 0; + struct audit_entry *entry; + void *bufp; + /* size_t remain = datasz - sizeof(struct audit_rule_data); */ int i; - if (a->flags != b->flags) - return 1; + entry = audit_to_entry_common((struct audit_rule *)data); + if (IS_ERR(entry)) + goto exit_nofree; - if (a->action != b->action) - return 1; + bufp = data->buf; + entry->rule.vers_ops = 2; + for (i = 0; i < data->field_count; i++) { + struct audit_field *f = &entry->rule.fields[i]; + + err = -EINVAL; + if (!(data->fieldflags[i] & AUDIT_OPERATORS) || + data->fieldflags[i] & ~AUDIT_OPERATORS) + goto exit_free; + + f->op = data->fieldflags[i] & AUDIT_OPERATORS; + f->type = data->fields[i]; + switch(f->type) { + /* call type-specific conversion routines here */ + default: + f->val = data->values[i]; + } + } + +exit_nofree: + return entry; + +exit_free: + audit_free_rule(entry); + return ERR_PTR(err); +} + +/* Pack a filter field's string representation into data block. */ +static inline size_t audit_pack_string(void **bufp, char *str) +{ + size_t len = strlen(str); + + memcpy(*bufp, str, len); + *bufp += len; + + return len; +} + +/* Translate kernel rule respresentation to struct audit_rule. + * Exists for backward compatibility with userspace. 
*/ +static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) +{ + struct audit_rule *rule; + int i; + + rule = kmalloc(sizeof(*rule), GFP_KERNEL); + if (unlikely(!rule)) + return ERR_PTR(-ENOMEM); + memset(rule, 0, sizeof(*rule)); + + rule->flags = krule->flags | krule->listnr; + rule->action = krule->action; + rule->field_count = krule->field_count; + for (i = 0; i < rule->field_count; i++) { + rule->values[i] = krule->fields[i].val; + rule->fields[i] = krule->fields[i].type; + + if (krule->vers_ops == 1) { + if (krule->fields[i].op & AUDIT_NOT_EQUAL) + rule->fields[i] |= AUDIT_NEGATE; + } else { + rule->fields[i] |= krule->fields[i].op; + } + } + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; + + return rule; +} - if (a->field_count != b->field_count) +/* Translate kernel rule respresentation to struct audit_rule_data. */ +static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) +{ + struct audit_rule_data *data; + void *bufp; + int i; + + data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); + if (unlikely(!data)) + return ERR_PTR(-ENOMEM); + memset(data, 0, sizeof(*data)); + + data->flags = krule->flags | krule->listnr; + data->action = krule->action; + data->field_count = krule->field_count; + bufp = data->buf; + for (i = 0; i < data->field_count; i++) { + struct audit_field *f = &krule->fields[i]; + + data->fields[i] = f->type; + data->fieldflags[i] = f->op; + switch(f->type) { + /* call type-specific conversion routines here */ + default: + data->values[i] = f->val; + } + } + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i]; + + return data; +} + +/* Compare two rules in kernel format. Considered success if rules + * don't match. */ +static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) +{ + int i; + + if (a->flags != b->flags || + a->listnr != b->listnr || + a->action != b->action || + a->field_count != b->field_count) return 1; for (i = 0; i < a->field_count; i++) { - if (a->fields[i] != b->fields[i] - || a->values[i] != b->values[i]) + if (a->fields[i].type != b->fields[i].type || + a->fields[i].op != b->fields[i].op) return 1; + + switch(a->fields[i].type) { + /* call type-specific comparison routines here */ + default: + if (a->fields[i].val != b->fields[i].val) + return 1; + } } for (i = 0; i < AUDIT_BITMASK_SIZE; i++) @@ -95,41 +322,21 @@ static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) return 0; } -/* Note that audit_add_rule and audit_del_rule are called via - * audit_receive() in audit.c, and are protected by +/* Add rule to given filterlist if not a duplicate. Protected by * audit_netlink_sem. */ -static inline int audit_add_rule(struct audit_rule *rule, +static inline int audit_add_rule(struct audit_entry *entry, struct list_head *list) { - struct audit_entry *entry; - int i; + struct audit_entry *e; /* Do not use the _rcu iterator here, since this is the only * addition routine. 
*/ - list_for_each_entry(entry, list, list) { - if (!audit_compare_rule(rule, &entry->rule)) + list_for_each_entry(e, list, list) { + if (!audit_compare_rule(&entry->rule, &e->rule)) return -EEXIST; } - for (i = 0; i < rule->field_count; i++) { - if (rule->fields[i] & AUDIT_UNUSED_BITS) - return -EINVAL; - if ( rule->fields[i] & AUDIT_NEGATE) - rule->fields[i] |= AUDIT_NOT_EQUAL; - else if ( (rule->fields[i] & AUDIT_OPERATORS) == 0 ) - rule->fields[i] |= AUDIT_EQUAL; - rule->fields[i] &= ~AUDIT_NEGATE; - } - - if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) - return -ENOMEM; - if (audit_copy_rule(&entry->rule, rule)) { - kfree(entry); - return -EINVAL; - } - if (entry->rule.flags & AUDIT_FILTER_PREPEND) { - entry->rule.flags &= ~AUDIT_FILTER_PREPEND; list_add_rcu(&entry->list, list); } else { list_add_tail_rcu(&entry->list, list); @@ -138,16 +345,9 @@ static inline int audit_add_rule(struct audit_rule *rule, return 0; } -static inline void audit_free_rule(struct rcu_head *head) -{ - struct audit_entry *e = container_of(head, struct audit_entry, rcu); - kfree(e); -} - -/* Note that audit_add_rule and audit_del_rule are called via - * audit_receive() in audit.c, and are protected by +/* Remove an existing rule from filterlist. Protected by * audit_netlink_sem. */ -static inline int audit_del_rule(struct audit_rule *rule, +static inline int audit_del_rule(struct audit_entry *entry, struct list_head *list) { struct audit_entry *e; @@ -155,16 +355,18 @@ static inline int audit_del_rule(struct audit_rule *rule, /* Do not use the _rcu iterator here, since this is the only * deletion routine. */ list_for_each_entry(e, list, list) { - if (!audit_compare_rule(rule, &e->rule)) { + if (!audit_compare_rule(&entry->rule, &e->rule)) { list_del_rcu(&e->list); - call_rcu(&e->rcu, audit_free_rule); + call_rcu(&e->rcu, audit_free_rule_rcu); return 0; } } return -ENOENT; /* No matching rule */ } -static int audit_list_rules(void *_dest) +/* List rules using struct audit_rule. Exists for backward + * compatibility with userspace. */ +static int audit_list(void *_dest) { int pid, seq; int *dest = _dest; @@ -180,9 +382,16 @@ static int audit_list_rules(void *_dest) /* The *_rcu iterators not needed here because we are always called with audit_netlink_sem held. */ for (i=0; i<AUDIT_NR_FILTERS; i++) { - list_for_each_entry(entry, &audit_filter_list[i], list) + list_for_each_entry(entry, &audit_filter_list[i], list) { + struct audit_rule *rule; + rule = audit_krule_to_rule(&entry->rule); + if (unlikely(!rule)) + break; audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, - &entry->rule, sizeof(entry->rule)); + rule, sizeof(*rule)); + kfree(rule); + } } audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); @@ -190,6 +399,40 @@ return 0; } +/* List rules using struct audit_rule_data. */ +static int audit_list_rules(void *_dest) +{ + int pid, seq; + int *dest = _dest; + struct audit_entry *e; + int i; + + pid = dest[0]; + seq = dest[1]; + kfree(dest); + + down(&audit_netlink_sem); + + /* The *_rcu iterators not needed here because we are + always called with audit_netlink_sem held. 
*/ + for (i=0; i<AUDIT_NR_FILTERS; i++) { + list_for_each_entry(e, &audit_filter_list[i], list) { + struct audit_rule_data *data; + data = audit_krule_to_data(&e->rule); + if (unlikely(!data)) + break; + audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, + data, sizeof(*data)); + kfree(data); + } + } + audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); + + up(&audit_netlink_sem); + return 0; +} + /** * audit_receive_filter - apply all rules to the specified message type * @type: audit message type @@ -197,18 +440,20 @@ static int audit_list_rules(void *_dest) * @uid: target uid for netlink audit messages * @seq: netlink audit message sequence (serial) number * @data: payload data + * @datasz: size of payload data * @loginuid: loginuid of sender */ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, - uid_t loginuid) + size_t datasz, uid_t loginuid) { struct task_struct *tsk; int *dest; - int err = 0; - unsigned listnr; + int err = 0; + struct audit_entry *entry; switch (type) { case AUDIT_LIST: + case AUDIT_LIST_RULES: /* We can't just spew out the rules here because we might fill * the available socket buffer space and deadlock waiting for * auditctl to read from it... which isn't ever going to @@ -221,41 +466,48 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, dest[0] = pid; dest[1] = seq; - tsk = kthread_run(audit_list_rules, dest, "audit_list_rules"); + if (type == AUDIT_LIST) + tsk = kthread_run(audit_list, dest, "audit_list"); + else + tsk = kthread_run(audit_list_rules, dest, + "audit_list_rules"); if (IS_ERR(tsk)) { kfree(dest); err = PTR_ERR(tsk); } break; case AUDIT_ADD: - listnr = ((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND; - switch(listnr) { - default: - return -EINVAL; - - case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: -#ifdef CONFIG_AUDITSYSCALL - case AUDIT_FILTER_ENTRY: - case AUDIT_FILTER_EXIT: - case AUDIT_FILTER_TASK: -#endif - ; - } - err = audit_add_rule(data, &audit_filter_list[listnr]); + case AUDIT_ADD_RULE: + if (type == AUDIT_ADD) + entry = audit_rule_to_entry(data); + else + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); + + err = audit_add_rule(entry, + &audit_filter_list[entry->rule.listnr]); if (!err) audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "auid=%u added an audit rule\n", loginuid); + else + audit_free_rule(entry); break; case AUDIT_DEL: - listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND; - if (listnr >= AUDIT_NR_FILTERS) - return -EINVAL; - - err = audit_del_rule(data, &audit_filter_list[listnr]); + case AUDIT_DEL_RULE: + if (type == AUDIT_DEL) + entry = audit_rule_to_entry(data); + else + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); + + err = audit_del_rule(entry, + &audit_filter_list[entry->rule.listnr]); if (!err) audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "auid=%u removed an audit rule\n", loginuid); + audit_free_rule(entry); break; default: return -EINVAL; @@ -287,29 +539,27 @@ int audit_comparator(const u32 left, const u32 op, const u32 right) static int audit_filter_user_rules(struct netlink_skb_parms *cb, - struct audit_rule *rule, + struct audit_krule *rule, enum audit_state *state) { int i; for (i = 0; i < rule->field_count; i++) { - u32 field = rule->fields[i] & ~AUDIT_OPERATORS; - u32 op = rule->fields[i] & AUDIT_OPERATORS; - u32 value = rule->values[i]; + struct audit_field *f = &rule->fields[i]; int result = 0; - switch (field) { + switch (f->type) { case AUDIT_PID: - result = audit_comparator(cb->creds.pid, op, value); + result = audit_comparator(cb->creds.pid, f->op, f->val); break; case 
AUDIT_UID: - result = audit_comparator(cb->creds.uid, op, value); + result = audit_comparator(cb->creds.uid, f->op, f->val); break; case AUDIT_GID: - result = audit_comparator(cb->creds.gid, op, value); + result = audit_comparator(cb->creds.gid, f->op, f->val); break; case AUDIT_LOGINUID: - result = audit_comparator(cb->loginuid, op, value); + result = audit_comparator(cb->loginuid, f->op, f->val); break; } @@ -354,14 +604,11 @@ int audit_filter_type(int type) list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE], list) { - struct audit_rule *rule = &e->rule; int i; - for (i = 0; i < rule->field_count; i++) { - u32 field = rule->fields[i] & ~AUDIT_OPERATORS; - u32 op = rule->fields[i] & AUDIT_OPERATORS; - u32 value = rule->values[i]; - if ( field == AUDIT_MSGTYPE ) { - result = audit_comparator(type, op, value); + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + if (f->type == AUDIT_MSGTYPE) { + result = audit_comparator(type, f->op, f->val); if (!result) break; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 17719b3..ba08788 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -162,70 +162,68 @@ struct audit_context { /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. */ static int audit_filter_rules(struct task_struct *tsk, - struct audit_rule *rule, + struct audit_krule *rule, struct audit_context *ctx, enum audit_state *state) { int i, j; for (i = 0; i < rule->field_count; i++) { - u32 field = rule->fields[i] & ~AUDIT_OPERATORS; - u32 op = rule->fields[i] & AUDIT_OPERATORS; - u32 value = rule->values[i]; + struct audit_field *f = &rule->fields[i]; int result = 0; - switch (field) { + switch (f->type) { case AUDIT_PID: - result = audit_comparator(tsk->pid, op, value); + result = audit_comparator(tsk->pid, f->op, f->val); break; case AUDIT_UID: - result = audit_comparator(tsk->uid, op, value); + result = audit_comparator(tsk->uid, f->op, f->val); break; case AUDIT_EUID: - result = audit_comparator(tsk->euid, op, value); + result = audit_comparator(tsk->euid, f->op, f->val); break; case AUDIT_SUID: - result = audit_comparator(tsk->suid, op, value); + result = audit_comparator(tsk->suid, f->op, f->val); break; case AUDIT_FSUID: - result = audit_comparator(tsk->fsuid, op, value); + result = audit_comparator(tsk->fsuid, f->op, f->val); break; case AUDIT_GID: - result = audit_comparator(tsk->gid, op, value); + result = audit_comparator(tsk->gid, f->op, f->val); break; case AUDIT_EGID: - result = audit_comparator(tsk->egid, op, value); + result = audit_comparator(tsk->egid, f->op, f->val); break; case AUDIT_SGID: - result = audit_comparator(tsk->sgid, op, value); + result = audit_comparator(tsk->sgid, f->op, f->val); break; case AUDIT_FSGID: - result = audit_comparator(tsk->fsgid, op, value); + result = audit_comparator(tsk->fsgid, f->op, f->val); break; case AUDIT_PERS: - result = audit_comparator(tsk->personality, op, value); + result = audit_comparator(tsk->personality, f->op, f->val); break; case AUDIT_ARCH: if (ctx) - result = audit_comparator(ctx->arch, op, value); + result = audit_comparator(ctx->arch, f->op, f->val); break; case AUDIT_EXIT: if (ctx && ctx->return_valid) - result = audit_comparator(ctx->return_code, op, value); + result = audit_comparator(ctx->return_code, f->op, f->val); break; case AUDIT_SUCCESS: if (ctx && ctx->return_valid) { - if (value) - result = audit_comparator(ctx->return_valid, op, AUDITSC_SUCCESS); + if (f->val) + result = audit_comparator(ctx->return_valid, f->op, 
AUDITSC_SUCCESS); else - result = audit_comparator(ctx->return_valid, op, AUDITSC_FAILURE); + result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE); } break; case AUDIT_DEVMAJOR: if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(MAJOR(ctx->names[j].dev), op, value)) { + if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { ++result; break; } @@ -235,7 +233,7 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_DEVMINOR: if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(MINOR(ctx->names[j].dev), op, value)) { + if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { ++result; break; } @@ -245,8 +243,8 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_INODE: if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(ctx->names[j].ino, op, value) || - audit_comparator(ctx->names[j].pino, op, value)) { + if (audit_comparator(ctx->names[j].ino, f->op, f->val) || + audit_comparator(ctx->names[j].pino, f->op, f->val)) { ++result; break; } @@ -256,14 +254,14 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_LOGINUID: result = 0; if (ctx) - result = audit_comparator(ctx->loginuid, op, value); + result = audit_comparator(ctx->loginuid, f->op, f->val); break; case AUDIT_ARG0: case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: if (ctx) - result = audit_comparator(ctx->argv[field-AUDIT_ARG0], op, value); + result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); break; } -- cgit v1.1 From 5d3301088f7e412992d9e61cc3604cbdff3090ff Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Mon, 9 Jan 2006 09:48:17 -0500 Subject: [PATCH] add/remove rule update Hi, The following patch adds a little more information to the add/remove rule message emitted by the kernel. 
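For example (the numbers here are illustrative, not taken from a real log): a rule added to the syscall exit filter, which the kernel previously logged only as "auid=500 added an audit rule", is now logged as "auid=500 add rule to list=4 res=1", where list identifies the filter list the rule went to (4 being the exit list) and res records whether the operation succeeded.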
Signed-off-by: Steve Grubb Signed-off-by: Al Viro --- kernel/auditfilter.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 686d514..35f8fa8 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -487,10 +487,11 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_add_rule(entry, &audit_filter_list[entry->rule.listnr]); - if (!err) - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "auid=%u added an audit rule\n", loginuid); - else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u add rule to list=%d res=%d\n", + loginuid, entry->rule.listnr, !err); + + if (err) audit_free_rule(entry); break; case AUDIT_DEL: @@ -504,9 +505,10 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_del_rule(entry, &audit_filter_list[entry->rule.listnr]); - if (!err) - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "auid=%u removed an audit rule\n", loginuid); + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u remove rule from list=%d res=%d\n", + loginuid, entry->rule.listnr, !err); + audit_free_rule(entry); break; default: -- cgit v1.1 From a6c043a887a9db32a545539426ddfc8cc2c28f8f Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Sun, 1 Jan 2006 14:07:00 -0500 Subject: [PATCH] Add tty to syscall audit records Hi, >From the RBAC specs: FAU_SAR.1.1 The TSF shall provide the set of authorized RBAC administrators with the capability to read the following audit information from the audit records: (e) The User Session Identifier or Terminal Type A patch adding the tty for all syscalls is included in this email. Please apply. Signed-off-by: Steve Grubb Signed-off-by: Al Viro --- kernel/auditsc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ba08788..d3d4992 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -57,6 +57,7 @@ #include #include #include +#include #include "audit.h" @@ -573,6 +574,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) int i; struct audit_buffer *ab; struct audit_aux_data *aux; + const char *tty; ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL); if (!ab) @@ -585,11 +587,15 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) audit_log_format(ab, " success=%s exit=%ld", (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); + if (current->signal->tty && current->signal->tty->name) + tty = current->signal->tty->name; + else + tty = "(none)"; audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" " pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" - " egid=%u sgid=%u fsgid=%u", + " egid=%u sgid=%u fsgid=%u tty=%s", context->argv[0], context->argv[1], context->argv[2], @@ -600,7 +606,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) context->uid, context->gid, context->euid, context->suid, context->fsuid, - context->egid, context->sgid, context->fsgid); + context->egid, context->sgid, context->fsgid, tty); audit_log_task_info(ab, gfp_mask); audit_log_end(ab); -- cgit v1.1 From d9d9ec6e2c45b22282cd36cf92fcb23d504350a8 Mon Sep 17 00:00:00 2001 From: Dustin Kirkland Date: Thu, 16 Feb 2006 13:40:01 -0600 Subject: [PATCH] Fix audit operators Darrel Goeddel initiated a discussion on IRC regarding the possibility of audit_comparator() returning -EINVAL signaling an invalid 
operator. It is possible when creating the rule to assure that the operator is one of the 6 sane values. Here's a snip from include/linux/audit.h Note that 0 (nonsense) and 7 (all operators) are not valid values for an operator. ... /* These are the supported operators. * 4 2 1 * = > < * ------- * 0 0 0 0 nonsense * 0 0 1 1 < * 0 1 0 2 > * 0 1 1 3 != * 1 0 0 4 = * 1 0 1 5 <= * 1 1 0 6 >= * 1 1 1 7 all operators */ ... Furthermore, prior to adding these extended operators, flagging the AUDIT_NEGATE bit implied !=, and otherwise == was assumed. The following code forces the operator to be != if the AUDIT_NEGATE bit was flipped on. And if no operator was specified, == is assumed. The only invalid condition is if the AUDIT_NEGATE bit is off and all of the AUDIT_EQUAL, AUDIT_LESS_THAN, and AUDIT_GREATER_THAN bits are on--clearly a nonsensical operator. Now that this is handled at rule insertion time, the default -EINVAL return of audit_comparator() is eliminated such that the function can only return 1 or 0. If this is acceptable, let's get this applied to the current tree. :-Dustin -- Signed-off-by: Al Viro (cherry picked from 9bf0a8e137040f87d1b563336d4194e38fb2ba1a commit) --- kernel/auditfilter.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 35f8fa8..b85fd8cc 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -160,11 +160,17 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) f->val = rule->values[i]; entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; + + /* Support for legacy operators where + * AUDIT_NEGATE bit signifies != and otherwise assumes == */ if (f->op & AUDIT_NEGATE) - f->op |= AUDIT_NOT_EQUAL; - else if (!(f->op & AUDIT_OPERATORS)) - f->op |= AUDIT_EQUAL; - f->op &= ~AUDIT_NEGATE; + f->op = AUDIT_NOT_EQUAL; + else if (!f->op) + f->op = AUDIT_EQUAL; + else if (f->op == AUDIT_OPERATORS) { + err = -EINVAL; + goto exit_free; + } } exit_nofree: @@ -533,9 +539,9 @@ int audit_comparator(const u32 left, const u32 op, const u32 right) return (left > right); case AUDIT_GREATER_THAN_OR_EQUAL: return (left >= right); - default: - return -EINVAL; } + BUG(); + return 0; } -- cgit v1.1 From 4023e020807ea249ae83f0d1d851b4c7cf0afd8a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 7 Mar 2006 23:51:39 -0800 Subject: [PATCH] simplify audit_free() locking Simplify audit_free()'s locking: no need to lock a task that we are tearing down. [the extra locking also caused false positives in the lock validator] Signed-off-by: Ingo Molnar Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- kernel/auditsc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d3d4992..b613ec8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -704,10 +704,14 @@ void audit_free(struct task_struct *tsk) { struct audit_context *context; - task_lock(tsk); + /* + * No need to lock the task - when we execute audit_free() + * then the task has no external references anymore, and + * we are tearing it down. (The locking also confuses + * DEBUG_LOCKDEP - this freeing may occur in softirq + * contexts as well, via RCU.) 
+ */ context = audit_get_context(tsk, 0, 0); - task_unlock(tsk); - if (likely(!context)) return; -- cgit v1.1 From 5a0bbce58bb25bd756f7ec437319d6ed2201a18b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 7 Mar 2006 23:51:38 -0800 Subject: [PATCH] sem2mutex: audit_netlink_sem Semaphore to mutex conversion. The conversion was generated via scripts, and the result was validated automatically via a script as well. Signed-off-by: Ingo Molnar Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- kernel/audit.c | 6 +++--- kernel/audit.h | 3 ++- kernel/auditfilter.c | 16 ++++++++-------- 3 files changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 4eb97b6..6a44e0a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -114,7 +114,7 @@ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); /* The netlink socket is only to be read by 1 CPU, which lets us assume * that list additions and deletions never happen simultaneously in * auditsc.c */ -DECLARE_MUTEX(audit_netlink_sem); +DEFINE_MUTEX(audit_netlink_mutex); /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer @@ -538,14 +538,14 @@ static void audit_receive(struct sock *sk, int length) struct sk_buff *skb; unsigned int qlen; - down(&audit_netlink_sem); + mutex_lock(&audit_netlink_mutex); for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { skb = skb_dequeue(&sk->sk_receive_queue); audit_receive_skb(skb); kfree_skb(skb); } - up(&audit_netlink_sem); + mutex_unlock(&audit_netlink_mutex); } diff --git a/kernel/audit.h b/kernel/audit.h index 4b602cd..bc53920 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -19,6 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include @@ -84,4 +85,4 @@ extern void audit_send_reply(int pid, int seq, int type, void *payload, int size); extern void audit_log_lost(const char *message); extern void audit_panic(const char *message); -extern struct semaphore audit_netlink_sem; +extern struct mutex audit_netlink_mutex; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b85fd8cc..d3a8539 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -329,7 +329,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) } /* Add rule to given filterlist if not a duplicate. Protected by - * audit_netlink_sem. */ + * audit_netlink_mutex. */ static inline int audit_add_rule(struct audit_entry *entry, struct list_head *list) { @@ -352,7 +352,7 @@ static inline int audit_add_rule(struct audit_entry *entry, } /* Remove an existing rule from filterlist. Protected by - * audit_netlink_sem. */ + * audit_netlink_mutex. */ static inline int audit_del_rule(struct audit_entry *entry, struct list_head *list) { @@ -383,10 +383,10 @@ static int audit_list(void *_dest) seq = dest[1]; kfree(dest); - down(&audit_netlink_sem); + mutex_lock(&audit_netlink_mutex); /* The *_rcu iterators not needed here because we are - always called with audit_netlink_sem held. */ + always called with audit_netlink_mutex held. */ for (i=0; i Date: Thu, 9 Mar 2006 00:33:47 +0100 Subject: [PATCH] EXPORT_SYMBOL patch for audit_log, audit_log_start, audit_log_end and audit_format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hi, This is a trivial patch that enables the possibility of using some auditing functions within loadable kernel modules (ie. 
inside a Linux Security Module). - Make the audit_log_start, audit_log_end, audit_format and audit_log interfaces available to Loadable Kernel Modules, thus making it possible to use the audit framework inside LSMs, etc. Signed-off-by: > Signed-off-by: Al Viro --- kernel/audit.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 6a44e0a..c9345d3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -994,3 +994,8 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, audit_log_end(ab); } } + +EXPORT_SYMBOL(audit_log_start); +EXPORT_SYMBOL(audit_log_end); +EXPORT_SYMBOL(audit_log_format); +EXPORT_SYMBOL(audit_log); -- cgit v1.1 From 71e1c784b24a026a490b3de01541fc5ee14ebc09 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Mon, 6 Mar 2006 22:40:05 -0500 Subject: [PATCH] fix audit_init failure path Make the audit_init() failure path handle situations where the audit_panic() action is not AUDIT_FAIL_PANIC (the default is AUDIT_FAIL_PRINTK). Other uses of audit_sock are not reached unless audit's netlink message handler is properly registered. Bug noticed by Peter Staubach. Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/audit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index c9345d3..04fe2e3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -558,8 +558,9 @@ static int __init audit_init(void) THIS_MODULE); if (!audit_sock) audit_panic("cannot initialize netlink socket"); + else + audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; - audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; skb_queue_head_init(&audit_skb_queue); audit_initialized = 1; audit_enabled = audit_default; -- cgit v1.1 From 6978c7052f2e22c6c40781cdd4eba5c4bce9a789 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Fri, 24 Mar 2006 18:45:21 +0100 Subject: BUG_ON() Conversion in kernel/cpu.c this changes if() BUG(); constructs to BUG_ON() which is cleaner, contains unlikely() and can be better optimized away. Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- kernel/cpu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index e882c6b..8be22bd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -223,8 +223,7 @@ int __devinit cpu_up(unsigned int cpu) ret = __cpu_up(cpu); if (ret != 0) goto out_notify; - if (!cpu_online(cpu)) - BUG(); + BUG_ON(!cpu_online(cpu)); /* Now call notifier in preparation. */ notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); -- cgit v1.1 From 185ae6d7a32721e9062030a9f2d24ed714fa45df Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 25 Mar 2006 03:06:32 -0800 Subject: [PATCH] timer irq driven soft watchdog fix I seem to have lost this hunk in yesterday's patch. It brings the coming-online CPU's softlockup timer up to date so we don't get false-positive trip-ups during CPU hot-add.
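For reference, the reason a stale timestamp matters is that the soft-lockup detector compares the per-cpu touch timestamp against jiffies on every timer tick. A simplified sketch of that check (paraphrased from kernel/softlockup.c of this era, not the literal code):

    /* simplified sketch of the per-tick check in kernel/softlockup.c */
    void softlockup_tick(struct pt_regs *regs)
    {
            int this_cpu = smp_processor_id();
            unsigned long touched = per_cpu(touch_timestamp, this_cpu);

            /* no watchdog activity for ~10s: report a soft lockup */
            if (time_after(jiffies, touched + 10 * HZ)) {
                    printk("BUG: soft lockup detected on CPU#%d!\n", this_cpu);
                    dump_stack();
            }
    }

A CPU that was offline keeps whatever touch_timestamp it had before, so without the hunk below its first tick after onlining can look like a ten second stall.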
Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softlockup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index dd9524f..d9b3d58 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -118,6 +118,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) printk("watchdog for %i failed\n", hotcpu); return NOTIFY_BAD; } + per_cpu(touch_timestamp, hotcpu) = jiffies; per_cpu(watchdog_task, hotcpu) = p; kthread_bind(p, hotcpu); break; -- cgit v1.1 From c08b8a49100715b20e6f7c997e992428b5e06078 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Mar 2006 03:06:33 -0800 Subject: [PATCH] sys_alarm() unsigned signed conversion fixup alarm() calls the kernel with an unsigned int timeout in seconds. The value is stored in the tv_sec field of a struct timeval to set up the itimer. The tv_sec field of struct timeval is of type long, which causes the tv_sec value to be negative on 32 bit machines if seconds > INT_MAX. Before the hrtimer merge (pre 2.6.16) such a negative value was converted to the maximum jiffies timeout by the timeval_to_jiffies conversion. It's not clear whether this was intended or just happened to be done by the timeval_to_jiffies code. hrtimers expect a timeval in canonical form and treat a negative timeout as already expired. This breaks the legitimate usage of alarm() with a timeout value > INT_MAX seconds. For 32 bit machines it is therefore necessary to limit the internal seconds value to avoid API breakage. Instead of doing this in all implementations of sys_alarm, the duplicated sys_alarm code is moved into a common function in itimer.c Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/itimer.c | 37 +++++++++++++++++++++++++++++++++++++ kernel/timer.c | 14 +------------- 2 files changed, 38 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index 379be2f..a2dc3759 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -226,6 +226,43 @@ again: return 0; } +/** + * alarm_setitimer - set alarm in seconds + * + * @seconds: number of seconds until alarm + * 0 disables the alarm + * + * Returns the remaining time in seconds of a pending timer or 0 when + * the timer is not active. + * + * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid + * negative timeval settings which would cause immediate expiry. + */ +unsigned int alarm_setitimer(unsigned int seconds) +{ + struct itimerval it_new, it_old; + +#if BITS_PER_LONG < 64 + if (seconds > INT_MAX) + seconds = INT_MAX; +#endif + it_new.it_value.tv_sec = seconds; + it_new.it_value.tv_usec = 0; + it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + + do_setitimer(ITIMER_REAL, &it_new, &it_old); + + /* + * We can't return 0 if we have an alarm pending ...
And we'd + * better return too much than too little anyway + */ + if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) || + it_old.it_value.tv_usec >= 500000) + it_old.it_value.tv_sec++; + + return it_old.it_value.tv_sec; +} + asmlinkage long sys_setitimer(int which, struct itimerval __user *value, struct itimerval __user *ovalue) diff --git a/kernel/timer.c b/kernel/timer.c index 17d956c..13fa72c 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -956,19 +956,7 @@ void do_timer(struct pt_regs *regs) */ asmlinkage unsigned long sys_alarm(unsigned int seconds) { - struct itimerval it_new, it_old; - unsigned int oldalarm; - - it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; - it_new.it_value.tv_sec = seconds; - it_new.it_value.tv_usec = 0; - do_setitimer(ITIMER_REAL, &it_new, &it_old); - oldalarm = it_old.it_value.tv_sec; - /* ehhh.. We can't return 0 if we have an alarm pending.. */ - /* And we'd better return too much than too little anyway */ - if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000) - oldalarm++; - return oldalarm; + return alarm_setitimer(seconds); } #endif -- cgit v1.1 From 7d99b7d634d81bb372e03e4561c80430aa4cfac2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Mar 2006 03:06:35 -0800 Subject: [PATCH] Validate and sanitize itimer timeval from userspace According to the specification the timevals must be validated and an error code -EINVAL returned in case the timevals are not in canonical form. This check was never done in Linux. The pre 2.6.16 code converted invalid timevals silently. Negative timeouts were converted by the timeval_to_jiffies conversion to the maximum timeout. hrtimers and the ktime_t operations expect timevals in canonical form. Otherwise random results might happen on 32 bit machines due to the optimized ktime_add/sub operations. Negative timeouts are treated as already expired. This might break applications which work on pre 2.6.16. To prevent random behaviour and API breakage the timevals are checked and invalid timevals sanitized in a similar way to the pre 2.6.16 code. Invalid timevals are reported with a per-boot limited number of kernel messages so applications which use this misfeature can be corrected. After a grace period of one year the sanitizing should be replaced by a correct validation check. This is also documented in Documentation/feature-removal-schedule.txt The validation and sanitizing is done inside do_setitimer so all callers (sys_setitimer, compat_sys_setitimer, osf_setitimer) are caught. Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/itimer.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index a2dc3759..680e6b7 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -143,6 +143,60 @@ int it_real_fn(void *data) return HRTIMER_NORESTART; } +/* + * We do not care about correctness. We just sanitize the values so + * the ktime_t operations which expect normalized values do not + * break. This converts negative values to long timeouts similar to + * the code in kernel versions < 2.6.16 + * + * Print a limited number of warning messages when an invalid timeval + * is detected.
+ */ +static void fixup_timeval(struct timeval *tv, int interval) +{ + static int warnlimit = 10; + unsigned long tmp; + + if (warnlimit > 0) { + warnlimit--; + printk(KERN_WARNING + "setitimer: %s (pid = %d) provided " + "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n", + current->comm, current->pid, + interval ? "it_interval" : "it_value", + tv->tv_sec, (long) tv->tv_usec); + } + + tmp = tv->tv_usec; + if (tmp >= USEC_PER_SEC) { + tv->tv_usec = tmp % USEC_PER_SEC; + tv->tv_sec += tmp / USEC_PER_SEC; + } + + tmp = tv->tv_sec; + if (tmp > LONG_MAX) + tv->tv_sec = LONG_MAX; +} + +/* + * Returns true if the timeval is in canonical form + */ +#define timeval_valid(t) \ + (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) + +/* + * Check for invalid timevals, sanitize them and print a limited + * number of warnings. + */ +static void check_itimerval(struct itimerval *value) { + + if (unlikely(!timeval_valid(&value->it_value))) + fixup_timeval(&value->it_value, 0); + + if (unlikely(!timeval_valid(&value->it_interval))) + fixup_timeval(&value->it_interval, 1); +} + int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; @@ -150,6 +204,18 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) ktime_t expires; cputime_t cval, cinterval, nval, ninterval; + /* + * Validate the timevals in value. + * + * Note: Although the spec requires that invalid values shall + * return -EINVAL, we just fixup the value and print a limited + * number of warnings in order not to break users of this + * historical misfeature. + * + * Scheduled for replacement in March 2007 + */ + check_itimerval(value); + switch (which) { case ITIMER_REAL: again: -- cgit v1.1 From 8d3b33f67fdc0fb364a1ef6d8fbbea7c2e4e6c98 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 25 Mar 2006 03:07:05 -0800 Subject: [PATCH] Remove MODULE_PARM MODULE_PARM was actually breaking: recent gcc versions optimize them out as unused. It's time to replace the last users, which are generally in the most unloved drivers anyway. Signed-off-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 183 ++++------------------------------------------------ kernel/rcutorture.c | 10 +-- 2 files changed, 16 insertions(+), 177 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 54623c7..ddfe45a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -233,24 +233,6 @@ static unsigned long __find_symbol(const char *name, return 0; } -/* Find a symbol in this elf symbol table */ -static unsigned long find_local_symbol(Elf_Shdr *sechdrs, - unsigned int symindex, - const char *strtab, - const char *name) -{ - unsigned int i; - Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; - - /* Search (defined) internal symbols first. */ - for (i = 1; i < sechdrs[symindex].sh_size/sizeof(*sym); i++) { - if (sym[i].st_shndx != SHN_UNDEF - && strcmp(name, strtab + sym[i].st_name) == 0) - return sym[i].st_value; - } - return 0; -} - /* Search for module by name: must hold module_mutex.
*/ static struct module *find_module(const char *name) { @@ -785,139 +767,6 @@ static struct module_attribute *modinfo_attrs[] = { NULL, }; -#ifdef CONFIG_OBSOLETE_MODPARM -/* Bounds checking done below */ -static int obsparm_copy_string(const char *val, struct kernel_param *kp) -{ - strcpy(kp->arg, val); - return 0; -} - -static int set_obsolete(const char *val, struct kernel_param *kp) -{ - unsigned int min, max; - unsigned int size, maxsize; - int dummy; - char *endp; - const char *p; - struct obsolete_modparm *obsparm = kp->arg; - - if (!val) { - printk(KERN_ERR "Parameter %s needs an argument\n", kp->name); - return -EINVAL; - } - - /* type is: [min[-max]]{b,h,i,l,s} */ - p = obsparm->type; - min = simple_strtol(p, &endp, 10); - if (endp == obsparm->type) - min = max = 1; - else if (*endp == '-') { - p = endp+1; - max = simple_strtol(p, &endp, 10); - } else - max = min; - switch (*endp) { - case 'b': - return param_array(kp->name, val, min, max, obsparm->addr, - 1, param_set_byte, &dummy); - case 'h': - return param_array(kp->name, val, min, max, obsparm->addr, - sizeof(short), param_set_short, &dummy); - case 'i': - return param_array(kp->name, val, min, max, obsparm->addr, - sizeof(int), param_set_int, &dummy); - case 'l': - return param_array(kp->name, val, min, max, obsparm->addr, - sizeof(long), param_set_long, &dummy); - case 's': - return param_array(kp->name, val, min, max, obsparm->addr, - sizeof(char *), param_set_charp, &dummy); - - case 'c': - /* Undocumented: 1-5c50 means 1-5 strings of up to 49 chars, - and the decl is "char xxx[5][50];" */ - p = endp+1; - maxsize = simple_strtol(p, &endp, 10); - /* We check lengths here (yes, this is a hack). */ - p = val; - while (p[size = strcspn(p, ",")]) { - if (size >= maxsize) - goto oversize; - p += size+1; - } - if (size >= maxsize) - goto oversize; - return param_array(kp->name, val, min, max, obsparm->addr, - maxsize, obsparm_copy_string, &dummy); - } - printk(KERN_ERR "Unknown obsolete parameter type %s\n", obsparm->type); - return -EINVAL; - oversize: - printk(KERN_ERR - "Parameter %s doesn't fit in %u chars.\n", kp->name, maxsize); - return -EINVAL; -} - -static int obsolete_params(const char *name, - char *args, - struct obsolete_modparm obsparm[], - unsigned int num, - Elf_Shdr *sechdrs, - unsigned int symindex, - const char *strtab) -{ - struct kernel_param *kp; - unsigned int i; - int ret; - - kp = kmalloc(sizeof(kp[0]) * num, GFP_KERNEL); - if (!kp) - return -ENOMEM; - - for (i = 0; i < num; i++) { - char sym_name[128 + sizeof(MODULE_SYMBOL_PREFIX)]; - - snprintf(sym_name, sizeof(sym_name), "%s%s", - MODULE_SYMBOL_PREFIX, obsparm[i].name); - - kp[i].name = obsparm[i].name; - kp[i].perm = 000; - kp[i].set = set_obsolete; - kp[i].get = NULL; - obsparm[i].addr - = (void *)find_local_symbol(sechdrs, symindex, strtab, - sym_name); - if (!obsparm[i].addr) { - printk("%s: falsely claims to have parameter %s\n", - name, obsparm[i].name); - ret = -EINVAL; - goto out; - } - kp[i].arg = &obsparm[i]; - } - - ret = parse_args(name, args, kp, num, NULL); - out: - kfree(kp); - return ret; -} -#else -static int obsolete_params(const char *name, - char *args, - struct obsolete_modparm obsparm[], - unsigned int num, - Elf_Shdr *sechdrs, - unsigned int symindex, - const char *strtab) -{ - if (num != 0) - printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", - name); - return 0; -} -#endif /* CONFIG_OBSOLETE_MODPARM */ - static const char vermagic[] = VERMAGIC_STRING; #ifdef CONFIG_MODVERSIONS @@ -1874,27 +1723,17 @@ static struct 
module *load_module(void __user *umod, set_fs(old_fs); mod->args = args; - if (obsparmindex) { - err = obsolete_params(mod->name, mod->args, - (struct obsolete_modparm *) - sechdrs[obsparmindex].sh_addr, - sechdrs[obsparmindex].sh_size - / sizeof(struct obsolete_modparm), - sechdrs, symindex, - (char *)sechdrs[strindex].sh_addr); - if (setupindex) - printk(KERN_WARNING "%s: Ignoring new-style " - "parameters in presence of obsolete ones\n", - mod->name); - } else { - /* Size of section 0 is 0, so this works well if no params */ - err = parse_args(mod->name, mod->args, - (struct kernel_param *) - sechdrs[setupindex].sh_addr, - sechdrs[setupindex].sh_size - / sizeof(struct kernel_param), - NULL); - } + if (obsparmindex) + printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", + mod->name); + + /* Size of section 0 is 0, so this works well if no params */ + err = parse_args(mod->name, mod->args, + (struct kernel_param *) + sechdrs[setupindex].sh_addr, + sechdrs[setupindex].sh_size + / sizeof(struct kernel_param), + NULL); if (err < 0) goto arch_cleanup; diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9a1fa88..b4b362b 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -54,15 +54,15 @@ static int verbose; /* Print more debug info. */ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ -MODULE_PARM(nreaders, "i"); +module_param(nreaders, int, 0); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -MODULE_PARM(stat_interval, "i"); +module_param(stat_interval, int, 0); MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -MODULE_PARM(verbose, "i"); +module_param(verbose, bool, 0); MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); -MODULE_PARM(test_no_idle_hz, "i"); +module_param(test_no_idle_hz, bool, 0); MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); -MODULE_PARM(shuffle_interval, "i"); +module_param(shuffle_interval, int, 0); MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); #define TORTURE_FLAG "rcutorture: " #define PRINTK_STRING(s) \ -- cgit v1.1 From 9871728b756646e0d758a966ba00f2c0ff812817 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 25 Mar 2006 03:07:06 -0800 Subject: [PATCH] kernel/params.c: make param_array() static param_array() in kernel/params.c can now become static. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index a291505..9de637a 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -265,12 +265,12 @@ int param_get_invbool(char *buffer, struct kernel_param *kp) } /* We cheat here and temporarily mangle the string. */ -int param_array(const char *name, - const char *val, - unsigned int min, unsigned int max, - void *elem, int elemsize, - int (*set)(const char *, struct kernel_param *kp), - int *num) +static int param_array(const char *name, + const char *val, + unsigned int min, unsigned int max, + void *elem, int elemsize, + int (*set)(const char *, struct kernel_param *kp), + int *num) { int ret; struct kernel_param kp; -- cgit v1.1 From c777ac5594f772ac760e02c3ac71d067616b579d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 25 Mar 2006 03:07:36 -0800 Subject: [PATCH] irq: uninline migration functions Uninline some massive IRQ migration functions. 
Put them in the new kernel/irq/migration.c. Cc: Andi Kleen Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/Makefile | 3 +-- kernel/irq/migration.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 kernel/irq/migration.c (limited to 'kernel') diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 4937873..2b33f85 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,5 +1,4 @@ -obj-y := handle.o manage.o spurious.o +obj-y := handle.o manage.o spurious.o migration.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o - diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c new file mode 100644 index 0000000..6bdd03c --- /dev/null +++ b/kernel/irq/migration.c @@ -0,0 +1,52 @@ +#include + +#if defined(CONFIG_GENERIC_PENDING_IRQ) + +void set_pending_irq(unsigned int irq, cpumask_t mask) +{ + irq_desc_t *desc = irq_desc + irq; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + desc->move_irq = 1; + pending_irq_cpumask[irq] = mask; + spin_unlock_irqrestore(&desc->lock, flags); +} + +void move_native_irq(int irq) +{ + cpumask_t tmp; + irq_desc_t *desc = irq_descp(irq); + + if (likely (!desc->move_irq)) + return; + + desc->move_irq = 0; + + if (likely(cpus_empty(pending_irq_cpumask[irq]))) + return; + + if (!desc->handler->set_affinity) + return; + + /* note - we hold the desc->lock */ + cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); + + /* + * If there was a valid mask to work with, please + * do the disable, re-program, enable sequence. + * This is *not* particularly important for level triggered + * but in a edge trigger case, we might be setting rte + * when an active trigger is comming in. This could + * cause some ioapics to mal-function. + * Being paranoid i guess! + */ + if (unlikely(!cpus_empty(tmp))) { + desc->handler->disable(irq); + desc->handler->set_affinity(irq,tmp); + desc->handler->enable(irq); + } + cpus_clear(pending_irq_cpumask[irq]); +} + +#endif -- cgit v1.1 From 501f2499b897ca4be68b1acc7a4bc8cf66f5fd24 Mon Sep 17 00:00:00 2001 From: Bryan Holty Date: Sat, 25 Mar 2006 03:07:37 -0800 Subject: [PATCH] IRQ: prevent enabling of previously disabled interrupt This fix prevents re-disabling and enabling of a previously disabled interrupt. On an SMP system with irq balancing enabled, if an interrupt is disabled from within its own interrupt context with disable_irq_nosync and is also earmarked for processor migration, the interrupt is blindly moved to the other processor and enabled without regard for its current "enabled" state. If there is an interrupt pending, it will unexpectedly invoke the irq handler on the new irq owning processor (even though the irq was previously disabled). The more intuitive fix would be to invoke disable_irq_nosync and enable_irq, but since we already have the desc->lock from __do_IRQ, we cannot call them directly. Instead we can use the same logic to disable and enable found in disable_irq_nosync and enable_irq, with regard to the desc->depth. This now prevents a disabled interrupt from being re-disabled, and more importantly prevents a disabled interrupt from being incorrectly enabled on a different processor.
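To make the failure mode concrete, here is a sketch of the kind of driver pattern that trips it (a hypothetical driver, not from the tree; the 2.6.16-era handler signature is assumed):

    /* hypothetical handler that defers work and masks its own irq */
    static irqreturn_t mydev_interrupt(int irq, void *dev_id, struct pt_regs *regs)
    {
            struct mydev *dev = dev_id;

            disable_irq_nosync(irq);    /* re-enabled later from process context */
            schedule_work(&dev->work);

            return IRQ_HANDLED;
    }

If this irq is also earmarked for migration, the unpatched move_native_irq() above would call desc->handler->enable() on the new CPU and silently undo the driver's disable; the fix below consults IRQ_DISABLED before touching the line.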
Signed-off-by: Bryan Holty Cc: Andi Kleen Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/migration.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 6bdd03c..52a8655 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -18,9 +18,17 @@ void move_native_irq(int irq) cpumask_t tmp; irq_desc_t *desc = irq_descp(irq); - if (likely (!desc->move_irq)) + if (likely(!desc->move_irq)) return; + /* + * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. + */ + if (CHECK_IRQ_PER_CPU(desc->status)) { + WARN_ON(1); + return; + } + desc->move_irq = 0; if (likely(cpus_empty(pending_irq_cpumask[irq]))) @@ -29,7 +37,8 @@ void move_native_irq(int irq) if (!desc->handler->set_affinity) return; - /* note - we hold the desc->lock */ + assert_spin_locked(&desc->lock); + cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); /* @@ -42,9 +51,13 @@ void move_native_irq(int irq) * Being paranoid i guess! */ if (unlikely(!cpus_empty(tmp))) { - desc->handler->disable(irq); + if (likely(!(desc->status & IRQ_DISABLED))) + desc->handler->disable(irq); + desc->handler->set_affinity(irq,tmp); - desc->handler->enable(irq); + + if (likely(!(desc->status & IRQ_DISABLED))) + desc->handler->enable(irq); } cpus_clear(pending_irq_cpumask[irq]); } -- cgit v1.1 From 12b5989be10011387a9da5dee82e5c0d6f9d02e7 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Sat, 25 Mar 2006 03:07:41 -0800 Subject: [PATCH] refactor capable() to one implementation, add __capable() helper Move capable() to kernel/capability.c and eliminate duplicate implementations. Add __capable() function which can be used to check for capability of any process. Signed-off-by: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/capability.c | 16 ++++++++++++++++ kernel/sys.c | 12 ------------ 2 files changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index bfa3c92..1a4d8a4 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -233,3 +233,19 @@ out: return ret; } + +int __capable(struct task_struct *t, int cap) +{ + if (security_capable(t, cap) == 0) { + t->flags |= PF_SUPERPRIV; + return 1; + } + return 0; +} +EXPORT_SYMBOL(__capable); + +int capable(int cap) +{ + return __capable(current, cap); +} +EXPORT_SYMBOL(capable); diff --git a/kernel/sys.c b/kernel/sys.c index 19d058b..421009c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -224,18 +224,6 @@ int unregister_reboot_notifier(struct notifier_block * nb) EXPORT_SYMBOL(unregister_reboot_notifier); -#ifndef CONFIG_SECURITY -int capable(int cap) -{ - if (cap_raised(current->cap_effective, cap)) { - current->flags |= PF_SUPERPRIV; - return 1; - } - return 0; -} -EXPORT_SYMBOL(capable); -#endif - static int set_one_prio(struct task_struct *p, int niceval, int error) { int no_nice; -- cgit v1.1 From 05eeae208d08a05a6980cf2ff61f02843c0955fd Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 25 Mar 2006 03:07:48 -0800 Subject: [PATCH] find_task_by_pid() needs tasklist_lock A couple of places are forgetting to take it. The kswapd case is probably unimportant. keventd_create_kthread() was racy. The whole thing is a bit flakey: you start a kernel thread, get its pid from kernel_thread() then look up its task_struct. a) It assumes that pid recycling takes a "long" time.
b) We get a task_struct but no reference was taken on it. The owner of the kswapd and kthread task_struct*'s must assume that the new thread won't exit unexpectedly. Because if it does, they're left holding dead memory and any attempt to control or stop that task will crash. Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 6a53738..c5f3c66 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -115,7 +115,9 @@ static void keventd_create_kthread(void *_create) create->result = ERR_PTR(pid); } else { wait_for_completion(&create->started); + read_lock(&tasklist_lock); create->result = find_task_by_pid(pid); + read_unlock(&tasklist_lock); } complete(&create->done); } -- cgit v1.1 From 231bed205879236357171e50bd8965e70797ecdc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 25 Mar 2006 03:08:00 -0800 Subject: [PATCH] No need to protect current->group_info in sys_getgroups(), in_group_p() and in_egroup_p() While doing some benchmarks of an Apache/PHP SMP server, I noticed high oprofile numbers in in_group_p() and _atomic_dec_and_lock(). rank percent 1 4.8911 % __link_path_walk 2 4.8503 % __d_lookup *3 4.2911 % _atomic_dec_and_lock 4 3.9307 % __copy_to_user_ll 5 4.9004 % sysenter_past_esp *6 3.3248 % in_group_p It appears that in_group_p() does an unnecessary get_group_info(current->group_info); /* atomic_inc() */ ... /* access current->group_info */ put_group_info(current->group_info); /* _atomic_dec_and_lock */ It is not necessary to do this, because the current task holds a reference on its own group_info, and this reference cannot change during the lookup. This patch deletes the get_group_info()/put_group_info() pair from sys_getgroups(), in_group_p() and in_egroup_p() functions. Signed-off-by: Eric Dumazet Cc: Tim Hockin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 421009c..119fb0d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1421,7 +1421,6 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) return -EINVAL; /* no need to grab task_lock here; it cannot change */ - get_group_info(current->group_info); i = current->group_info->ngroups; if (gidsetsize) { if (i > gidsetsize) { @@ -1434,7 +1433,6 @@ asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) } } out: - put_group_info(current->group_info); return i; } @@ -1475,9 +1473,7 @@ int in_group_p(gid_t grp) { int retval = 1; if (grp != current->fsgid) { - get_group_info(current->group_info); retval = groups_search(current->group_info, grp); - put_group_info(current->group_info); } return retval; } @@ -1488,9 +1484,7 @@ int in_egroup_p(gid_t grp) { int retval = 1; if (grp != current->egid) { - get_group_info(current->group_info); retval = groups_search(current->group_info, grp); - put_group_info(current->group_info); } return retval; } -- cgit v1.1 From 34f361ade2fb4a869f6a7714d01c04ce4cfa75d9 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Sat, 25 Mar 2006 03:08:18 -0800 Subject: [PATCH] Check if cpu can be onlined before calling smp_prepare_cpu() - Moved check for online cpu out of smp_prepare_cpu() - Moved default declaration of smp_prepare_cpu() to kernel/cpu.c - Removed lock_cpu_hotplug() from smp_prepare_cpu() to around it, since it's called from cpu_up() as well now.
- Removed clearing of cpu_present_map during cpu_offline as it breaks using cpu_up() directly during a subsequent online operation. Signed-off-by: Ashok Raj Cc: Srivatsa Vaddagiri Cc: "Li, Shaohua" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/smp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/smp.c b/kernel/power/smp.c index 911fc62..5957312 100644 --- a/kernel/power/smp.c +++ b/kernel/power/smp.c @@ -49,9 +49,7 @@ void enable_nonboot_cpus(void) printk("Thawing cpus ...\n"); for_each_cpu_mask(cpu, frozen_cpus) { - error = smp_prepare_cpu(cpu); - if (!error) - error = cpu_up(cpu); + error = cpu_up(cpu); if (!error) { printk("CPU%d is up\n", cpu); continue; -- cgit v1.1 From d74beb9f33a5f16d2965f11b275e401f225c949d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 25 Mar 2006 03:08:19 -0800 Subject: [PATCH] Use unsigned int types for a faster bsearch This patch avoids arithmetic on 'signed' types that are slower than 'unsigned'. This saves space and cpu cycles. size of kernel/sys.o before the patch (gcc-3.4.5) text data bss dec hex filename 10924 252 4 11180 2bac kernel/sys.o size of kernel/sys.o after the patch text data bss dec hex filename 10903 252 4 11159 2b97 kernel/sys.o I noticed that gcc-4.1.0 (from Fedora Core 5) even uses the idiv instruction for (a+b)/2 if a and b are signed. Signed-off-by: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 119fb0d..38bc73e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1363,7 +1363,7 @@ static void groups_sort(struct group_info *group_info) /* a simple bsearch */ int groups_search(struct group_info *group_info, gid_t grp) { - int left, right; + unsigned int left, right; if (!group_info) return 0; @@ -1371,7 +1371,7 @@ int groups_search(struct group_info *group_info, gid_t grp) left = 0; right = group_info->ngroups; while (left < right) { - int mid = (left+right)/2; + unsigned int mid = (left+right)/2; int cmp = grp - GROUP_AT(group_info, mid); if (cmp > 0) left = mid + 1; -- cgit v1.1 From f5163427453bc6ca2dd497eeb3e7a30d1c74b487 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Sat, 25 Mar 2006 03:08:23 -0800 Subject: [PATCH] Add SA_PERCPU_IRQ flag support Add support for SA_PERCPU_IRQ (only mmtimer.c uses this at this stage).
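As a usage illustration (hypothetical call site; mmtimer.c is the only in-tree user at this point), a per-cpu interrupt is requested like any other, with the new flag in the SA_* mask:

    /* hypothetical: register a per-cpu timer interrupt */
    if (request_irq(MMTIMER_IRQ, mmtimer_interrupt,
                    SA_PERCPU_IRQ, "mmtimer", NULL))
            printk(KERN_ERR "mmtimer: could not register interrupt\n");

With the change below, setup_irq() marks the descriptor IRQ_PER_CPU, and a later handler that does not agree on per-cpuness is rejected with -EBUSY instead of being chained.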
Signed-off-by: Dimitri Sivanich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 97d5559..6edfcef 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -204,10 +204,14 @@ int setup_irq(unsigned int irq, struct irqaction * new) p = &desc->action; if ((old = *p) != NULL) { /* Can't share interrupts unless both agree to */ - if (!(old->flags & new->flags & SA_SHIRQ)) { - spin_unlock_irqrestore(&desc->lock,flags); - return -EBUSY; - } + if (!(old->flags & new->flags & SA_SHIRQ)) + goto mismatch; + +#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) + /* All handlers must agree on per-cpuness */ + if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU)) + goto mismatch; +#endif /* add new interrupt at end of irq queue */ do { @@ -218,7 +222,10 @@ int setup_irq(unsigned int irq, struct irqaction * new) } *p = new; - +#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) + if (new->flags & SA_PERCPU_IRQ) + desc->status |= IRQ_PER_CPU; +#endif if (!shared) { desc->depth = 0; desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | @@ -236,6 +243,12 @@ int setup_irq(unsigned int irq, struct irqaction * new) register_handler_proc(irq, new); return 0; + +mismatch: + spin_unlock_irqrestore(&desc->lock, flags); + printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__); + dump_stack(); + return -EBUSY; } /** -- cgit v1.1 From 5ddcfa878d5b10b0ab94251a4229a8a9daaf93ed Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Sat, 25 Mar 2006 03:08:28 -0800 Subject: [PATCH] remove pps support This removes the support for pps. It's completely unused within the kernel and is basically in the way for further cleanups. It should be easier to readd proper support for it after the rest has been converted to NTP4 (where the pps mechanisms are quite different from NTP3 anyway). Signed-off-by: Roman Zippel Cc: Adrian Bunk Cc: john stultz Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time.c | 59 ++++++++++++++++------------------------------------------ kernel/timer.c | 13 ++----------- 2 files changed, 18 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index 8045391..e00a97b 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -202,24 +202,6 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv, return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } -long pps_offset; /* pps time offset (us) */ -long pps_jitter = MAXTIME; /* time dispersion (jitter) (us) */ - -long pps_freq; /* frequency offset (scaled ppm) */ -long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */ - -long pps_valid = PPS_VALID; /* pps signal watchdog counter */ - -int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */ - -long pps_jitcnt; /* jitter limit exceeded */ -long pps_calcnt; /* calibration intervals */ -long pps_errcnt; /* calibration errors */ -long pps_stbcnt; /* stability limit exceeded */ - -/* hook for a loadable hardpps kernel module */ -void (*hardpps_ptr)(struct timeval *); - /* we call this to notify the arch when the clock is being * controlled. If no such arch routine, do nothing. 
*/ @@ -279,7 +261,7 @@ int do_adjtimex(struct timex *txc) result = -EINVAL; goto leave; } - time_freq = txc->freq - pps_freq; + time_freq = txc->freq; } if (txc->modes & ADJ_MAXERROR) { @@ -312,10 +294,8 @@ int do_adjtimex(struct timex *txc) if ((time_next_adjust = txc->offset) == 0) time_adjust = 0; } - else if ( time_status & (STA_PLL | STA_PPSTIME) ) { - ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) == - (STA_PPSTIME | STA_PPSSIGNAL) ? - pps_offset : txc->offset; + else if (time_status & STA_PLL) { + ltemp = txc->offset; /* * Scale the phase adjustment and @@ -356,23 +336,14 @@ int do_adjtimex(struct timex *txc) } time_freq = min(time_freq, time_tolerance); time_freq = max(time_freq, -time_tolerance); - } /* STA_PLL || STA_PPSTIME */ + } /* STA_PLL */ } /* txc->modes & ADJ_OFFSET */ if (txc->modes & ADJ_TICK) { tick_usec = txc->tick; tick_nsec = TICK_USEC_TO_NSEC(tick_usec); } } /* txc->modes */ -leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 - || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0 - && (time_status & STA_PPSSIGNAL) == 0) - /* p. 24, (b) */ - || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) - == (STA_PPSTIME|STA_PPSJITTER)) - /* p. 24, (c) */ - || ((time_status & STA_PPSFREQ) != 0 - && (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0)) - /* p. 24, (d) */ +leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) result = TIME_ERROR; if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) @@ -380,7 +351,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 else { txc->offset = shift_right(time_offset, SHIFT_UPDATE); } - txc->freq = time_freq + pps_freq; + txc->freq = time_freq; txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; @@ -388,14 +359,16 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 txc->precision = time_precision; txc->tolerance = time_tolerance; txc->tick = tick_usec; - txc->ppsfreq = pps_freq; - txc->jitter = pps_jitter >> PPS_AVG; - txc->shift = pps_shift; - txc->stabil = pps_stabil; - txc->jitcnt = pps_jitcnt; - txc->calcnt = pps_calcnt; - txc->errcnt = pps_errcnt; - txc->stbcnt = pps_stbcnt; + + /* PPS is not implemented, so these are zero */ + txc->ppsfreq = 0; + txc->jitter = 0; + txc->shift = 0; + txc->stabil = 0; + txc->jitcnt = 0; + txc->calcnt = 0; + txc->errcnt = 0; + txc->stbcnt = 0; write_sequnlock_irq(&xtime_lock); do_gettimeofday(&txc->time); notify_arch_cmos_timer(); diff --git a/kernel/timer.c b/kernel/timer.c index 13fa72c..ab189dd 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -697,18 +697,9 @@ static void second_overflow(void) /* * Compute the frequency estimate and additional phase adjustment due - * to frequency error for the next second. When the PPS signal is - * engaged, gnaw on the watchdog counter and update the frequency - * computed by the pll and the PPS signal. + * to frequency error for the next second. 
*/ - pps_valid++; - if (pps_valid == PPS_VALID) { /* PPS signal lost */ - pps_jitter = MAXTIME; - pps_stabil = MAXFREQ; - time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | - STA_PPSWANDER | STA_PPSERROR); - } - ltemp = time_freq + pps_freq; + ltemp = time_freq; time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); #if HZ == 100 -- cgit v1.1 From 910dea7fdda22f0ee83d26d459e460c79ed94557 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sun, 26 Mar 2006 18:29:26 +0200 Subject: BUG_ON() Conversion in kernel/fork.c this changes if() BUG(); constructs to BUG_ON() which is cleaner, contains unlikely() and can be better optimized away. Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- kernel/fork.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a020639..d93ab2b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -769,8 +769,7 @@ int unshare_files(void) struct files_struct *files = current->files; int rc; - if(!files) - BUG(); + BUG_ON(!files); /* This can race but the race causes us to copy when we don't need to and drop the copy */ -- cgit v1.1 From cd7b24bb1891a10ee25168a912ff2304a9571d23 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 26 Mar 2006 01:36:54 -0800 Subject: [PATCH] warn if free_irq() is called from IRQ context Warn if free_irq() is called in IRQ context - free_irq() can execute /proc VFS work, which must not be done in IRQ context. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6edfcef..ac766ad 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -271,6 +271,7 @@ void free_irq(unsigned int irq, void *dev_id) struct irqaction **p; unsigned long flags; + WARN_ON(in_interrupt()); if (irq >= NR_IRQS) return; -- cgit v1.1 From e655a250d5fc12b6dfe0d436180ba4a3bfffdc9f Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 26 Mar 2006 01:37:11 -0800 Subject: [PATCH] swsusp: return correct load_image error If there's an error in load_image() we should return that without checking snapshot_image_loaded. Signed-off-by: Con Kolivas Acked-by: "Rafael J. Wysocki" Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 9177f3f..044b8e0 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -454,10 +454,11 @@ static int load_image(struct swap_map_handle *handle, nr_pages++; } } while (ret > 0); - if (!error) { printk("\b\b\b\bdone\n"); - if (!snapshot_image_loaded(snapshot)) - error = -ENODATA; + if (!snapshot_image_loaded(snapshot)) + error = -ENODATA; + } return error; } -- cgit v1.1 From 3158e9411a66fb98d495ac441c242264f31aaf3e Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Sun, 26 Mar 2006 01:37:29 -0800 Subject: [PATCH] consolidate sys32/compat_adjtimex Create compat_sys_adjtimex and use it in all appropriate places.
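For context, the compat path matters because a 32-bit process passes a struct compat_timex whose long fields are 32 bits wide, so the kernel must copy it field by field into a native struct timex, as the diff below does. A minimal sketch of the user-space call being serviced (illustrative only):

    #include <stdio.h>
    #include <sys/timex.h>

    int main(void)
    {
            struct timex tx = { .modes = 0 };   /* modes == 0: read-only query */
            int state = adjtimex(&tx);          /* clock state, e.g. TIME_OK, or -1 */

            printf("freq=%ld offset=%ld state=%d\n", tx.freq, tx.offset, state);
            return 0;
    }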
Signed-off-by: Stephen Rothwell Cc: Arnd Bergmann Acked-by: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 8c9cd88b..b9bdd12 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -898,3 +899,61 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat return -ERESTARTNOHAND; } #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ + +asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) +{ + struct timex txc; + int ret; + + memset(&txc, 0, sizeof(struct timex)); + + if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || + __get_user(txc.modes, &utp->modes) || + __get_user(txc.offset, &utp->offset) || + __get_user(txc.freq, &utp->freq) || + __get_user(txc.maxerror, &utp->maxerror) || + __get_user(txc.esterror, &utp->esterror) || + __get_user(txc.status, &utp->status) || + __get_user(txc.constant, &utp->constant) || + __get_user(txc.precision, &utp->precision) || + __get_user(txc.tolerance, &utp->tolerance) || + __get_user(txc.time.tv_sec, &utp->time.tv_sec) || + __get_user(txc.time.tv_usec, &utp->time.tv_usec) || + __get_user(txc.tick, &utp->tick) || + __get_user(txc.ppsfreq, &utp->ppsfreq) || + __get_user(txc.jitter, &utp->jitter) || + __get_user(txc.shift, &utp->shift) || + __get_user(txc.stabil, &utp->stabil) || + __get_user(txc.jitcnt, &utp->jitcnt) || + __get_user(txc.calcnt, &utp->calcnt) || + __get_user(txc.errcnt, &utp->errcnt) || + __get_user(txc.stbcnt, &utp->stbcnt)) + return -EFAULT; + + ret = do_adjtimex(&txc); + + if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || + __put_user(txc.modes, &utp->modes) || + __put_user(txc.offset, &utp->offset) || + __put_user(txc.freq, &utp->freq) || + __put_user(txc.maxerror, &utp->maxerror) || + __put_user(txc.esterror, &utp->esterror) || + __put_user(txc.status, &utp->status) || + __put_user(txc.constant, &utp->constant) || + __put_user(txc.precision, &utp->precision) || + __put_user(txc.tolerance, &utp->tolerance) || + __put_user(txc.time.tv_sec, &utp->time.tv_sec) || + __put_user(txc.time.tv_usec, &utp->time.tv_usec) || + __put_user(txc.tick, &utp->tick) || + __put_user(txc.ppsfreq, &utp->ppsfreq) || + __put_user(txc.jitter, &utp->jitter) || + __put_user(txc.shift, &utp->shift) || + __put_user(txc.stabil, &utp->stabil) || + __put_user(txc.jitcnt, &utp->jitcnt) || + __put_user(txc.calcnt, &utp->calcnt) || + __put_user(txc.errcnt, &utp->errcnt) || + __put_user(txc.stbcnt, &utp->stbcnt)) + ret = -EFAULT; + + return ret; +} -- cgit v1.1 From 92127c7a45d4d167d9b015a5f9de6b41ed66f1d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Mar 2006 01:38:05 -0800 Subject: [PATCH] hrtimers: optimize softirq runqueues The hrtimer softirq is called from the timer softirq every tick. Retrieve the current time from xtime and wall_to_monotonic instead of calling base->get_time() for each timer base. Store the time in the base structure and provide a hook once clock source abstractions are in place and to keep the code open for new base clocks. 
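The retrieval itself leans on the standard xtime seqlock idiom: read a sequence number, snapshot the data, and retry if a writer got in between. In outline (a generic form of the reader side, assuming xtime_lock as in the diff below):

    unsigned long seq;
    struct timespec ts, wtm;

    do {
            seq = read_seqbegin(&xtime_lock);
            ts  = xtime;                 /* snapshot the protected data */
            wtm = wall_to_monotonic;
    } while (read_seqretry(&xtime_lock, seq));   /* a writer raced us: retry */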
Based on a patch from: Roman Zippel Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 14bc9cf..b728cc5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -123,6 +123,26 @@ void ktime_get_ts(struct timespec *ts) EXPORT_SYMBOL_GPL(ktime_get_ts); /* + * Get the coarse grained time at the softirq based on xtime and + * wall_to_monotonic. + */ +static void hrtimer_get_softirq_time(struct hrtimer_base *base) +{ + ktime_t xtim, tomono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + xtim = timespec_to_ktime(xtime); + tomono = timespec_to_ktime(wall_to_monotonic); + + } while (read_seqretry(&xtime_lock, seq)); + + base[CLOCK_REALTIME].softirq_time = xtim; + base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono); +} + +/* * Functions and macros which are different for UP/SMP systems are kept in a * single place */ @@ -586,9 +606,11 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) */ static inline void run_hrtimer_queue(struct hrtimer_base *base) { - ktime_t now = base->get_time(); struct rb_node *node; + if (base->get_softirq_time) + base->softirq_time = base->get_softirq_time(); + spin_lock_irq(&base->lock); while ((node = base->first)) { @@ -598,7 +620,7 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base) void *data; timer = rb_entry(node, struct hrtimer, node); - if (now.tv64 <= timer->expires.tv64) + if (base->softirq_time.tv64 <= timer->expires.tv64) break; fn = timer->function; @@ -641,6 +663,8 @@ void hrtimer_run_queues(void) struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); int i; + hrtimer_get_softirq_time(base); + for (i = 0; i < MAX_HRTIMER_BASES; i++) run_hrtimer_queue(&base[i]); } -- cgit v1.1 From 44f21475511bbc0135b52c66ad74dcc6a9026da3 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Sun, 26 Mar 2006 01:38:06 -0800 Subject: [PATCH] hrtimers: pass current time to hrtimer_forward() Pass current time to hrtimer_forward(). This allows to use the softirq time in the timer base when the forward function is called from the timer callback. Other places pass current time with a call to timer->base->get_time(). Signed-off-by: Roman Zippel Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 7 +++---- kernel/itimer.c | 3 ++- kernel/posix-timers.c | 14 ++++++++++---- 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b728cc5..e989c99 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -301,18 +301,17 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * hrtimer_forward - forward the timer expiry * * @timer: hrtimer to forward + * @now: forward past this time * @interval: the interval to forward * * Forward the timer expiry so it will expire in the future. * Returns the number of overruns. 
*/ unsigned long -hrtimer_forward(struct hrtimer *timer, ktime_t interval) +hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { unsigned long orun = 1; - ktime_t delta, now; - - now = timer->base->get_time(); + ktime_t delta; delta = ktime_sub(now, timer->expires); diff --git a/kernel/itimer.c b/kernel/itimer.c index 680e6b7..af2ec6b 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -136,7 +136,8 @@ int it_real_fn(void *data) if (tsk->signal->it_real_incr.tv64 != 0) { hrtimer_forward(&tsk->signal->real_timer, - tsk->signal->it_real_incr); + tsk->signal->real_timer.base->softirq_time, + tsk->signal->it_real_incr); return HRTIMER_RESTART; } diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9944379..255657a 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -251,15 +251,18 @@ __initcall(init_posix_timers); static void schedule_next_timer(struct k_itimer *timr) { + struct hrtimer *timer = &timr->it.real.timer; + if (timr->it.real.interval.tv64 == 0) return; - timr->it_overrun += hrtimer_forward(&timr->it.real.timer, + timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), timr->it.real.interval); + timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1; ++timr->it_requeue_pending; - hrtimer_restart(&timr->it.real.timer); + hrtimer_restart(timer); } /* @@ -334,6 +337,7 @@ EXPORT_SYMBOL_GPL(posix_timer_event); static int posix_timer_fn(void *data) { struct k_itimer *timr = data; + struct hrtimer *timer = &timr->it.real.timer; unsigned long flags; int si_private = 0; int ret = HRTIMER_NORESTART; @@ -351,7 +355,8 @@ static int posix_timer_fn(void *data) */ if (timr->it.real.interval.tv64 != 0) { timr->it_overrun += - hrtimer_forward(&timr->it.real.timer, + hrtimer_forward(timer, + timer->base->softirq_time, timr->it.real.interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; @@ -623,7 +628,8 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) if (timr->it_requeue_pending & REQUEUE_PENDING || (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { timr->it_overrun += - hrtimer_forward(timer, timr->it.real.interval); + hrtimer_forward(timer, timer->base->get_time(), + timr->it.real.interval); remaining = hrtimer_get_remaining(timer); } calci: -- cgit v1.1 From 3b98a5328171cebc867f70484b20bd34948cd7f6 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Sun, 26 Mar 2006 01:38:07 -0800 Subject: [PATCH] hrtimers: posix-timer: cleanup common_timer_get() Cleanup common_timer_get() a little. Signed-off-by: Roman Zippel Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 255657a..7c5f447 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -608,39 +608,41 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) static void common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) { - ktime_t remaining; + ktime_t now, remaining, iv; struct hrtimer *timer = &timr->it.real.timer; memset(cur_setting, 0, sizeof(struct itimerspec)); - remaining = hrtimer_get_remaining(timer); - /* Time left ? or timer pending */ - if (remaining.tv64 > 0 || hrtimer_active(timer)) - goto calci; + iv = timr->it.real.interval; + /* interval timer ? 
*/ - if (timr->it.real.interval.tv64 == 0) + if (iv.tv64) + cur_setting->it_interval = ktime_to_timespec(iv); + else if (!hrtimer_active(timer) && + (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) return; + + now = timer->base->get_time(); + /* - * When a requeue is pending or this is a SIGEV_NONE timer - * move the expiry time forward by intervals, so expiry is > - * now. + * When a requeue is pending or this is a SIGEV_NONE + * timer move the expiry time forward by intervals, so + * expiry is > now. */ - if (timr->it_requeue_pending & REQUEUE_PENDING || - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { - timr->it_overrun += - hrtimer_forward(timer, timer->base->get_time(), - timr->it.real.interval); - remaining = hrtimer_get_remaining(timer); - } - calci: - /* interval timer ? */ - if (timr->it.real.interval.tv64 != 0) - cur_setting->it_interval = - ktime_to_timespec(timr->it.real.interval); + if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || + (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) + timr->it_overrun += hrtimer_forward(timer, now, iv); + + remaining = ktime_sub(timer->expires, now); /* Return 0 only, when the timer is expired and not pending */ - if (remaining.tv64 <= 0) - cur_setting->it_value.tv_nsec = 1; - else + if (remaining.tv64 <= 0) { + /* + * A single shot SIGEV_NONE timer must return 0, when + * it is expired ! + */ + if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) + cur_setting->it_value.tv_nsec = 1; + } else cur_setting->it_value = ktime_to_timespec(remaining); } -- cgit v1.1 From 432569bb9d9d424d7ffe5b21f8205c55bdd1aaa8 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Sun, 26 Mar 2006 01:38:08 -0800 Subject: [PATCH] hrtimers: simplify nanosleep nanosleep is the only user of the expired state, so let it manage this itself, which makes the hrtimer code a bit simpler. The remaining time is also only calculated if requested. Signed-off-by: Roman Zippel Acked-by: Ingo Molnar Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 142 ++++++++++++++++++++++++------------------------------- 1 file changed, 62 insertions(+), 80 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e989c99..59ec50c 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -625,30 +625,20 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base) fn = timer->function; data = timer->data; set_curr_timer(base, timer); - timer->state = HRTIMER_RUNNING; + timer->state = HRTIMER_INACTIVE; __remove_hrtimer(timer, base); spin_unlock_irq(&base->lock); - /* - * fn == NULL is special case for the simplest timer - * variant - wake up process and do not restart: - */ - if (!fn) { - wake_up_process(data); - restart = HRTIMER_NORESTART; - } else - restart = fn(data); + restart = fn(data); spin_lock_irq(&base->lock); /* Another CPU has added back the timer */ - if (timer->state != HRTIMER_RUNNING) + if (timer->state != HRTIMER_INACTIVE) continue; - if (restart == HRTIMER_RESTART) + if (restart != HRTIMER_NORESTART) enqueue_hrtimer(timer, base); - else - timer->state = HRTIMER_EXPIRED; } set_curr_timer(base, NULL); spin_unlock_irq(&base->lock); @@ -672,79 +662,70 @@ void hrtimer_run_queues(void) * Sleep related functions: */ -/** - * schedule_hrtimer - sleep until timeout - * - * @timer: hrtimer variable initialized with the correct clock base - * @mode: timeout value is abs/rel - * - * Make the current task sleep until @timeout is - * elapsed. 
- * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to - * pass before the routine returns. The routine will return 0 - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. In this case the remaining time - * will be returned - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - */ -static ktime_t __sched -schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode) -{ - /* fn stays NULL, meaning single-shot wakeup: */ - timer->data = current; +struct sleep_hrtimer { + struct hrtimer timer; + struct task_struct *task; + int expired; +}; - hrtimer_start(timer, timer->expires, mode); +static int nanosleep_wakeup(void *data) +{ + struct sleep_hrtimer *t = data; - schedule(); - hrtimer_cancel(timer); + t->expired = 1; + wake_up_process(t->task); - /* Return the remaining time: */ - if (timer->state != HRTIMER_EXPIRED) - return ktime_sub(timer->expires, timer->base->get_time()); - else - return (ktime_t) {.tv64 = 0 }; + return HRTIMER_NORESTART; } -static inline ktime_t __sched -schedule_hrtimer_interruptible(struct hrtimer *timer, - const enum hrtimer_mode mode) +static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode) { - set_current_state(TASK_INTERRUPTIBLE); + t->timer.function = nanosleep_wakeup; + t->timer.data = t; + t->task = current; + t->expired = 0; + + do { + set_current_state(TASK_INTERRUPTIBLE); + hrtimer_start(&t->timer, t->timer.expires, mode); + + schedule(); - return schedule_hrtimer(timer, mode); + if (unlikely(!t->expired)) { + hrtimer_cancel(&t->timer); + mode = HRTIMER_ABS; + } + } while (!t->expired && !signal_pending(current)); + + return t->expired; } static long __sched nanosleep_restart(struct restart_block *restart) { + struct sleep_hrtimer t; struct timespec __user *rmtp; struct timespec tu; - void *rfn_save = restart->fn; - struct hrtimer timer; - ktime_t rem; + ktime_t time; restart->fn = do_no_restart_syscall; - hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS); - - timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; + hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS); + t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; - rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS); - - if (rem.tv64 <= 0) + if (do_nanosleep(&t, HRTIMER_ABS)) return 0; rmtp = (struct timespec __user *) restart->arg2; - tu = ktime_to_timespec(rem); - if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) - return -EFAULT; + if (rmtp) { + time = ktime_sub(t.timer.expires, t.timer.base->get_time()); + if (time.tv64 <= 0) + return 0; + tu = ktime_to_timespec(time); + if (copy_to_user(rmtp, &tu, sizeof(tu))) + return -EFAULT; + } - restart->fn = rfn_save; + restart->fn = nanosleep_restart; /* The other values in restart are already filled in */ return -ERESTART_RESTARTBLOCK; @@ -754,33 +735,34 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; - struct hrtimer timer; + struct sleep_hrtimer t; struct timespec tu; ktime_t rem; - hrtimer_init(&timer, clockid, mode); - - timer.expires = timespec_to_ktime(*rqtp); - - rem = schedule_hrtimer_interruptible(&timer, mode); - if (rem.tv64 <= 0) + hrtimer_init(&t.timer, clockid, mode); + t.timer.expires = timespec_to_ktime(*rqtp); + if (do_nanosleep(&t, mode)) return 0; /* Absolute timers do not update 
the rmtp value and restart: */ if (mode == HRTIMER_ABS) return -ERESTARTNOHAND; - tu = ktime_to_timespec(rem); - - if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) - return -EFAULT; + if (rmtp) { + rem = ktime_sub(t.timer.expires, t.timer.base->get_time()); + if (rem.tv64 <= 0) + return 0; + tu = ktime_to_timespec(rem); + if (copy_to_user(rmtp, &tu, sizeof(tu))) + return -EFAULT; + } restart = &current_thread_info()->restart_block; restart->fn = nanosleep_restart; - restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF; - restart->arg1 = timer.expires.tv64 >> 32; + restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF; + restart->arg1 = t.timer.expires.tv64 >> 32; restart->arg2 = (unsigned long) rmtp; - restart->arg3 = (unsigned long) timer.base->index; + restart->arg3 = (unsigned long) t.timer.base->index; return -ERESTART_RESTARTBLOCK; } -- cgit v1.1 From b75f7a51ca75c977d7d77f735d7a7859194eb39e Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Sun, 26 Mar 2006 01:38:09 -0800 Subject: [PATCH] hrtimers: remove state field Remove the state field and encode this information in the rb_node, similar to a normal timer. Signed-off-by: Roman Zippel Acked-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 59ec50c..658d49f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -374,8 +374,6 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) rb_link_node(&timer->node, parent, link); rb_insert_color(&timer->node, &base->active); - timer->state = HRTIMER_PENDING; - if (!base->first || timer->expires.tv64 < rb_entry(base->first, struct hrtimer, node)->expires.tv64) base->first = &timer->node; @@ -395,6 +393,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) if (base->first == &timer->node) base->first = rb_next(&timer->node); rb_erase(&timer->node, &base->active); + timer->node.rb_parent = HRTIMER_INACTIVE; } /* @@ -405,7 +404,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) { if (hrtimer_active(timer)) { __remove_hrtimer(timer, base); - timer->state = HRTIMER_INACTIVE; return 1; } return 0; @@ -579,6 +577,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, clock_id = CLOCK_MONOTONIC; timer->base = &bases[clock_id]; + timer->node.rb_parent = HRTIMER_INACTIVE; } /** @@ -625,7 +624,6 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base) fn = timer->function; data = timer->data; set_curr_timer(base, timer); - timer->state = HRTIMER_INACTIVE; __remove_hrtimer(timer, base); spin_unlock_irq(&base->lock); @@ -633,12 +631,10 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base) spin_lock_irq(&base->lock); - /* Another CPU has added back the timer */ - if (timer->state != HRTIMER_INACTIVE) - continue; - - if (restart != HRTIMER_NORESTART) + if (restart != HRTIMER_NORESTART) { + BUG_ON(hrtimer_active(timer)); enqueue_hrtimer(timer, base); + } } set_curr_timer(base, NULL); spin_unlock_irq(&base->lock); -- cgit v1.1 From df869b630d9d9131c10cf073fb61646048874b2f Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Sun, 26 Mar 2006 01:38:11 -0800 Subject: [PATCH] hrtimers: remove nsec_t typedef nsec_t predates ktime_t and has mostly been superseded by it. In the few places that are left, it's better to make it explicit that we're dealing with 64-bit values here.
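(As a quick aside for readers following the type change: the conversion that ns_to_timespec() performs on a plain s64 nanosecond count is just a sign-aware divide and normalize. A minimal user-space sketch, with made-up names and C99 fixed-width types standing in for the kernel's:

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC 1000000000LL

    struct ts_sketch {
        int64_t tv_sec;
        long    tv_nsec;
    };

    /* Rough equivalent of the kernel's ns_to_timespec(): split a signed
     * 64-bit nanosecond count into seconds plus nanoseconds, keeping
     * 0 <= tv_nsec < NSEC_PER_SEC even for negative inputs. */
    static struct ts_sketch ns_to_ts_sketch(int64_t nsec)
    {
        struct ts_sketch ts;

        ts.tv_sec = nsec / NSEC_PER_SEC;
        ts.tv_nsec = (long)(nsec % NSEC_PER_SEC);
        if (ts.tv_nsec < 0) {   /* C division truncates toward zero */
            ts.tv_sec--;
            ts.tv_nsec += NSEC_PER_SEC;
        }
        return ts;
    }

    int main(void)
    {
        struct ts_sketch ts = ns_to_ts_sketch(-1500000000LL);
        printf("%lld sec, %ld nsec\n", (long long)ts.tv_sec, ts.tv_nsec);
        return 0;   /* prints: -2 sec, 500000000 nsec */
    }

The explicit normalization step is the point of the change: with a bare s64 the 64-bit, signed nature of the value is visible at the call site, so the opaque nsec_t typedef bought nothing.)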
Signed-off-by: Roman Zippel Acked-by: Thomas Gleixner Acked-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 4 ++-- kernel/time.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 658d49f..44108de 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -266,7 +266,7 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) /* * Divide a ktime value by a nanosecond value */ -static unsigned long ktime_divns(const ktime_t kt, nsec_t div) +static unsigned long ktime_divns(const ktime_t kt, s64 div) { u64 dclc, inc, dns; int sft = 0; @@ -322,7 +322,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) interval.tv64 = timer->base->resolution.tv64; if (unlikely(delta.tv64 >= interval.tv64)) { - nsec_t incr = ktime_to_ns(interval); + s64 incr = ktime_to_ns(interval); orun = ktime_divns(delta, incr); timer->expires = ktime_add_ns(timer->expires, incr * orun); diff --git a/kernel/time.c b/kernel/time.c index e00a97b..ff8e701 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -610,7 +610,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) * * Returns the timespec representation of the nsec parameter. */ -struct timespec ns_to_timespec(const nsec_t nsec) +struct timespec ns_to_timespec(const s64 nsec) { struct timespec ts; @@ -630,7 +630,7 @@ struct timespec ns_to_timespec(const nsec_t nsec) * * Returns the timeval representation of the nsec parameter. */ -struct timeval ns_to_timeval(const nsec_t nsec) +struct timeval ns_to_timeval(const s64 nsec) { struct timespec ts = ns_to_timespec(nsec); struct timeval tv; -- cgit v1.1 From 05cfb614ddbf3181540ce09d44d96486f8ba8d6a Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Sun, 26 Mar 2006 01:38:12 -0800 Subject: [PATCH] hrtimers: remove data field The nanosleep cleanup makes it possible to remove the data field of hrtimer. The callback function can use container_of() to get its own data. Since the hrtimer structure is embedded in other structures anyway, this adds no overhead.
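(For readers new to the idiom: container_of() turns a pointer to an embedded member back into a pointer to the structure that contains it, which is how a callback can find its context without a separate ->data field. A self-contained user-space sketch; the struct names here are invented stand-ins, not the kernel's:

    #include <stdio.h>
    #include <stddef.h>

    /* Simplified form of the kernel macro: member pointer -> container pointer */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct fake_hrtimer { int dummy; };

    struct sleep_ctx {                /* plays the role of struct sleep_hrtimer */
        struct fake_hrtimer timer;    /* embedded member handed to the callback */
        int expired;
    };

    static void callback(struct fake_hrtimer *timer)
    {
        /* Recover the enclosing context from the member pointer alone: */
        struct sleep_ctx *ctx = container_of(timer, struct sleep_ctx, timer);
        ctx->expired = 1;
    }

    int main(void)
    {
        struct sleep_ctx ctx = { .expired = 0 };
        callback(&ctx.timer);
        printf("expired = %d\n", ctx.expired);   /* prints: expired = 1 */
        return 0;
    }

Since the conversion is a compile-time offset subtraction, dropping ->data really is free, as the changelog claims.)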
Signed-off-by: Roman Zippel Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- kernel/hrtimer.c | 12 +++++------- kernel/itimer.c | 15 +++++++-------- kernel/posix-timers.c | 9 ++++----- 4 files changed, 17 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a020639..4bd6486 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -848,7 +848,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->real_timer.data = tsk; + sig->tsk = tsk; sig->it_virt_expires = cputime_zero; sig->it_virt_incr = cputime_zero; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 44108de..0237a55 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -613,21 +613,19 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base) while ((node = base->first)) { struct hrtimer *timer; - int (*fn)(void *); + int (*fn)(struct hrtimer *); int restart; - void *data; timer = rb_entry(node, struct hrtimer, node); if (base->softirq_time.tv64 <= timer->expires.tv64) break; fn = timer->function; - data = timer->data; set_curr_timer(base, timer); __remove_hrtimer(timer, base); spin_unlock_irq(&base->lock); - restart = fn(data); + restart = fn(timer); spin_lock_irq(&base->lock); @@ -664,9 +662,10 @@ struct sleep_hrtimer { int expired; }; -static int nanosleep_wakeup(void *data) +static int nanosleep_wakeup(struct hrtimer *timer) { - struct sleep_hrtimer *t = data; + struct sleep_hrtimer *t = + container_of(timer, struct sleep_hrtimer, timer); t->expired = 1; wake_up_process(t->task); @@ -677,7 +676,6 @@ static int nanosleep_wakeup(void *data) static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode) { t->timer.function = nanosleep_wakeup; - t->timer.data = t; t->task = current; t->expired = 0; diff --git a/kernel/itimer.c b/kernel/itimer.c index af2ec6b..204ed79 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -128,17 +128,16 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) /* * The timer is automagically restarted, when interval != 0 */ -int it_real_fn(void *data) +int it_real_fn(struct hrtimer *timer) { - struct task_struct *tsk = (struct task_struct *) data; + struct signal_struct *sig = + container_of(timer, struct signal_struct, real_timer); - send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk); - - if (tsk->signal->it_real_incr.tv64 != 0) { - hrtimer_forward(&tsk->signal->real_timer, - tsk->signal->real_timer.base->softirq_time, - tsk->signal->it_real_incr); + send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); + if (sig->it_real_incr.tv64 != 0) { + hrtimer_forward(timer, timer->base->softirq_time, + sig->it_real_incr); return HRTIMER_RESTART; } return HRTIMER_NORESTART; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 7c5f447..ac6dc87 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int, struct itimerspec *, struct itimerspec *); static int common_timer_del(struct k_itimer *timer); -static int posix_timer_fn(void *data); +static int posix_timer_fn(struct hrtimer *data); static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); @@ -334,14 +334,14 @@ EXPORT_SYMBOL_GPL(posix_timer_event); * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. 
*/ -static int posix_timer_fn(void *data) +static int posix_timer_fn(struct hrtimer *timer) { - struct k_itimer *timr = data; - struct hrtimer *timer = &timr->it.real.timer; + struct k_itimer *timr; unsigned long flags; int si_private = 0; int ret = HRTIMER_NORESTART; + timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); if (timr->it.real.interval.tv64 != 0) @@ -725,7 +725,6 @@ common_timer_set(struct k_itimer *timr, int flags, mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.data = timr; timr->it.real.timer.function = posix_timer_fn; timer->expires = timespec_to_ktime(new_setting->it_value); -- cgit v1.1 From c6fd91f0bdcd294a0ae0ba2b2a7f7456ef4b7144 Mon Sep 17 00:00:00 2001 From: bibo mao Date: Sun, 26 Mar 2006 01:38:20 -0800 Subject: [PATCH] kretprobe instance recycled by parent process When kretprobe probes the schedule() function, if the probed process exits then schedule() will never return, so some kretprobe instances will never be recycled. In this patch the parent process will recycle kretprobe instances of the probed function and there will be no memory leak of kretprobe instances. Signed-off-by: bibo mao Cc: Masami Hiramatsu Cc: Prasanna S Panchamukhi Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 10 +++++----- kernel/sched.c | 9 ++++++++- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1fb9f75..1156eb0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -323,10 +323,10 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) } /* - * This function is called from exit_thread or flush_thread when task tk's - * stack is being recycled so that we can recycle any function-return probe - * instances associated with this task. These left over instances represent - * probed functions that have been called but will never return. + * This function is called from finish_task_switch when task tk becomes dead, + * so that we can recycle any function-return probe instances associated + * with this task. These left over instances represent probed functions + * that have been called but will never return. */ void __kprobes kprobe_flush_task(struct task_struct *tk) { @@ -336,7 +336,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) unsigned long flags = 0; spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); + head = kretprobe_inst_table_head(tk); hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { if (ri->task == tk) recycle_rp_inst(ri); diff --git a/kernel/sched.c b/kernel/sched.c index 7ffaabd..78acdef 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -49,6 +49,7 @@ #include #include #include +#include <linux/kprobes.h> #include #include @@ -1546,8 +1547,14 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev) finish_lock_switch(rq, prev); if (mm) mmdrop(mm); - if (unlikely(prev_task_flags & PF_DEAD)) + if (unlikely(prev_task_flags & PF_DEAD)) { + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list.
+ */ + kprobe_flush_task(prev); put_task_struct(prev); + } } /** -- cgit v1.1 From 013d3868143387be99bb0373d49452eaa3c55d41 Mon Sep 17 00:00:00 2001 From: Martin Andersson Date: Mon, 27 Mar 2006 01:15:18 -0800 Subject: [PATCH] sched: fix task interactivity calculation There is a truncation error in kernel/sched.c, triggered when the nice value is negative. The affected code is used in the TASK_INTERACTIVE macro. The code is:

#define SCALE(v1,v1_max,v2_max) \
	(v1) * (v2_max) / (v1_max)

which is used in this way:

SCALE(TASK_NICE(p), 40, MAX_BONUS)

The comments in the code say:

 * This part scales the interactivity limit depending on niceness.
 *
 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
 * Here are a few examples of different nice levels:
 *
 *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
 *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
 *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
 *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
 *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
 *
 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
 * priority range a task can explore, a value of '1' means the
 * task is rated interactive.)

However, the current code does not scale it linearly, and the result differs from the given examples. If the mathematical function "floor" is used when the nice value is negative, instead of the truncation one gets when using integer division, the result conforms to the documentation.

Output of TASK_INTERACTIVE when using the kernel code:

nice  dynamic priorities
-20   1 1 1 1 1 1 1 1 1 0 0
-19   1 1 1 1 1 1 1 1 0 0 0
-18   1 1 1 1 1 1 1 1 0 0 0
-17   1 1 1 1 1 1 1 1 0 0 0
-16   1 1 1 1 1 1 1 1 0 0 0
-15   1 1 1 1 1 1 1 0 0 0 0
-14   1 1 1 1 1 1 1 0 0 0 0
-13   1 1 1 1 1 1 1 0 0 0 0
-12   1 1 1 1 1 1 1 0 0 0 0
-11   1 1 1 1 1 1 0 0 0 0 0
-10   1 1 1 1 1 1 0 0 0 0 0
 -9   1 1 1 1 1 1 0 0 0 0 0
 -8   1 1 1 1 1 1 0 0 0 0 0
 -7   1 1 1 1 1 0 0 0 0 0 0
 -6   1 1 1 1 1 0 0 0 0 0 0
 -5   1 1 1 1 1 0 0 0 0 0 0
 -4   1 1 1 1 1 0 0 0 0 0 0
 -3   1 1 1 1 0 0 0 0 0 0 0
 -2   1 1 1 1 0 0 0 0 0 0 0
 -1   1 1 1 1 0 0 0 0 0 0 0
  0   1 1 1 1 0 0 0 0 0 0 0
  1   1 1 1 1 0 0 0 0 0 0 0
  2   1 1 1 1 0 0 0 0 0 0 0
  3   1 1 1 1 0 0 0 0 0 0 0
  4   1 1 1 0 0 0 0 0 0 0 0
  5   1 1 1 0 0 0 0 0 0 0 0
  6   1 1 1 0 0 0 0 0 0 0 0
  7   1 1 1 0 0 0 0 0 0 0 0
  8   1 1 0 0 0 0 0 0 0 0 0
  9   1 1 0 0 0 0 0 0 0 0 0
 10   1 1 0 0 0 0 0 0 0 0 0
 11   1 1 0 0 0 0 0 0 0 0 0
 12   1 0 0 0 0 0 0 0 0 0 0
 13   1 0 0 0 0 0 0 0 0 0 0
 14   1 0 0 0 0 0 0 0 0 0 0
 15   1 0 0 0 0 0 0 0 0 0 0
 16   0 0 0 0 0 0 0 0 0 0 0
 17   0 0 0 0 0 0 0 0 0 0 0
 18   0 0 0 0 0 0 0 0 0 0 0
 19   0 0 0 0 0 0 0 0 0 0 0

Output of TASK_INTERACTIVE when using "floor":

nice  dynamic priorities
-20   1 1 1 1 1 1 1 1 1 0 0
-19   1 1 1 1 1 1 1 1 1 0 0
-18   1 1 1 1 1 1 1 1 1 0 0
-17   1 1 1 1 1 1 1 1 1 0 0
-16   1 1 1 1 1 1 1 1 0 0 0
-15   1 1 1 1 1 1 1 1 0 0 0
-14   1 1 1 1 1 1 1 1 0 0 0
-13   1 1 1 1 1 1 1 1 0 0 0
-12   1 1 1 1 1 1 1 0 0 0 0
-11   1 1 1 1 1 1 1 0 0 0 0
-10   1 1 1 1 1 1 1 0 0 0 0
 -9   1 1 1 1 1 1 1 0 0 0 0
 -8   1 1 1 1 1 1 0 0 0 0 0
 -7   1 1 1 1 1 1 0 0 0 0 0
 -6   1 1 1 1 1 1 0 0 0 0 0
 -5   1 1 1 1 1 1 0 0 0 0 0
 -4   1 1 1 1 1 0 0 0 0 0 0
 -3   1 1 1 1 1 0 0 0 0 0 0
 -2   1 1 1 1 1 0 0 0 0 0 0
 -1   1 1 1 1 1 0 0 0 0 0 0
  0   1 1 1 1 0 0 0 0 0 0 0
  1   1 1 1 1 0 0 0 0 0 0 0
  2   1 1 1 1 0 0 0 0 0 0 0
  3   1 1 1 1 0 0 0 0 0 0 0
  4   1 1 1 0 0 0 0 0 0 0 0
  5   1 1 1 0 0 0 0 0 0 0 0
  6   1 1 1 0 0 0 0 0 0 0 0
  7   1 1 1 0 0 0 0 0 0 0 0
  8   1 1 0 0 0 0 0 0 0 0 0
  9   1 1 0 0 0 0 0 0 0 0 0
 10   1 1 0 0 0 0 0 0 0 0 0
 11   1 1 0 0 0 0 0 0 0 0 0
 12   1 0 0 0 0 0 0 0 0 0 0
 13   1 0 0 0 0 0 0 0 0 0 0
 14   1 0 0 0 0 0 0 0 0 0 0
 15   1 0 0 0 0 0 0 0 0 0 0
 16   0 0 0 0 0 0 0 0 0 0 0
 17   0 0 0 0 0 0 0 0 0 0 0
 18   0 0 0 0 0 0 0 0 0 0 0
 19   0 0 0 0 0 0 0 0 0 0 0

Signed-off-by: Martin Andersson Acked-by: Ingo Molnar Cc: Nick Piggin Cc: Mike Galbraith Cc: Peter Williams Cc: Con Kolivas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 78acdef..dc599c8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -145,7 +145,8 @@ (v1) * (v2_max) / (v1_max) #define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) + (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ + INTERACTIVE_DELTA) #define TASK_INTERACTIVE(p) \ ((p)->prio <= (p)->static_prio - DELTA(p)) -- cgit v1.1 From 77e4bfbcf071f795b54862455dce8902b3fc29c2 Mon Sep 17 00:00:00 2001 From: Andreas Mohr Date: Mon, 27 Mar 2006 01:15:20 -0800 Subject: [PATCH] Small schedule() optimization A small schedule() micro-optimization. Signed-off-by: Andreas Mohr Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index dc599c8..a96a05d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2879,13 +2879,11 @@ asmlinkage void __sched schedule(void) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (likely(!current->exit_state)) { - if (unlikely(in_atomic())) { - printk(KERN_ERR "BUG: scheduling while atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), current->pid); - dump_stack(); - } + if (unlikely(in_atomic() && !current->exit_state)) { + printk(KERN_ERR "BUG: scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + dump_stack(); } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -- cgit v1.1 From 1e9f28fa1eb9773bf65bae08288c6a0a38eef4a7 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Mon, 27 Mar 2006 01:15:22 -0800 Subject: [PATCH] sched: new sched domain for representing multi-core Add a new sched domain for representing multi-core with shared caches between cores. Consider a dual package system, each package containing two cores and with last level cache shared between cores within a package. If there are two runnable processes, with this patch applied those two processes will be scheduled on different packages. On such systems, with this patch we have observed 8% perf improvement with specJBB(2 warehouse) benchmark and 35% improvement with CFP2000 rate(with 2 users). This new domain will come into play only on multi-core systems with shared caches. On other systems, this sched domain will be removed by domain degeneration code. This new domain can also be used for implementing power savings policy (see the OLS 2005 CMP kernel scheduler paper for more details; I will post another patch for power savings policy soon). Most of the arch/* file changes are for the cpu_coregroup_map() implementation.
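(A note on the arithmetic in the hunks below: a group's cpu_power is credited with full capacity for its first CPU plus roughly 10% for each further CPU sharing the domain. A toy user-space sketch of that formula; the value 128 for SCHED_LOAD_SCALE is an assumption based on the scheduler's fixed-point scale of the time:

    #include <stdio.h>

    #define SCHED_LOAD_SCALE 128UL    /* assumed: 1.0 in scheduler fixed point */

    /* cpu_power formula used for the SMT/MC group hunks below:
     * full power for one CPU, plus ~10% per extra sibling. */
    static unsigned long group_cpu_power(unsigned int weight)
    {
        return SCHED_LOAD_SCALE + (weight - 1) * SCHED_LOAD_SCALE / 10;
    }

    int main(void)
    {
        for (unsigned int w = 1; w <= 4; w++)
            printf("%u CPUs -> cpu_power %lu\n", w, group_cpu_power(w));
        return 0;   /* prints 128, 140, 153, 166 */
    }

The steep discount reflects that siblings share execution resources or cache, so a second core in a package is worth far less to the load balancer than a second package.)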
Signed-off-by: Suresh Siddha Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 68 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a96a05d..8a8b71b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5574,11 +5574,31 @@ static int cpu_to_cpu_group(int cpu) } #endif +#ifdef CONFIG_SCHED_MC +static DEFINE_PER_CPU(struct sched_domain, core_domains); +static struct sched_group sched_group_core[NR_CPUS]; +#endif + +#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) +static int cpu_to_core_group(int cpu) +{ + return first_cpu(cpu_sibling_map[cpu]); +} +#elif defined(CONFIG_SCHED_MC) +static int cpu_to_core_group(int cpu) +{ + return cpu; +} +#endif + static DEFINE_PER_CPU(struct sched_domain, phys_domains); static struct sched_group sched_group_phys[NR_CPUS]; static int cpu_to_phys_group(int cpu) { -#ifdef CONFIG_SCHED_SMT +#if defined(CONFIG_SCHED_MC) + cpumask_t mask = cpu_coregroup_map(cpu); + return first_cpu(mask); +#elif defined(CONFIG_SCHED_SMT) return first_cpu(cpu_sibling_map[cpu]); #else return cpu; @@ -5676,6 +5696,17 @@ void build_sched_domains(const cpumask_t *cpu_map) sd->parent = p; sd->groups = &sched_group_phys[group]; +#ifdef CONFIG_SCHED_MC + p = sd; + sd = &per_cpu(core_domains, i); + group = cpu_to_core_group(i); + *sd = SD_MC_INIT; + sd->span = cpu_coregroup_map(i); + cpus_and(sd->span, sd->span, *cpu_map); + sd->parent = p; + sd->groups = &sched_group_core[group]; +#endif + #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); @@ -5701,6 +5732,19 @@ void build_sched_domains(const cpumask_t *cpu_map) } #endif +#ifdef CONFIG_SCHED_MC + /* Set up multi-core groups */ + for_each_cpu_mask(i, *cpu_map) { + cpumask_t this_core_map = cpu_coregroup_map(i); + cpus_and(this_core_map, this_core_map, *cpu_map); + if (i != first_cpu(this_core_map)) + continue; + init_sched_build_groups(sched_group_core, this_core_map, + &cpu_to_core_group); + } +#endif + + /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); @@ -5797,11 +5841,31 @@ void build_sched_domains(const cpumask_t *cpu_map) power = SCHED_LOAD_SCALE; sd->groups->cpu_power = power; #endif +#ifdef CONFIG_SCHED_MC + sd = &per_cpu(core_domains, i); + power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) + * SCHED_LOAD_SCALE / 10; + sd->groups->cpu_power = power; + + sd = &per_cpu(phys_domains, i); + /* + * This has to be < 2 * SCHED_LOAD_SCALE + * Lets keep it SCHED_LOAD_SCALE, so that + * while calculating NUMA group's cpu_power + * we can simply do + * numa_group->cpu_power += phys_group->cpu_power; + * + * See "only add power once for each physical pkg" + * comment below + */ + sd->groups->cpu_power = SCHED_LOAD_SCALE; +#else sd = &per_cpu(phys_domains, i); power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (cpus_weight(sd->groups->cpumask)-1) / 10; sd->groups->cpu_power = power; +#endif #ifdef CONFIG_NUMA sd = &per_cpu(allnodes_domains, i); @@ -5823,7 +5887,6 @@ void build_sched_domains(const cpumask_t *cpu_map) next_sg: for_each_cpu_mask(j, sg->cpumask) { struct sched_domain *sd; - int power; sd = &per_cpu(phys_domains, j); if (j != first_cpu(sd->groups->cpumask)) { /* * Only add "power" once for each * physical package. */ continue; } - power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * - (cpus_weight(sd->groups->cpumask)-1) / 10; - sg->cpu_power += power; + sg->cpu_power += sd->groups->cpu_power; } sg = sg->next; if (sg != sched_group_nodes[i]) @@ -5849,6 +5910,8 @@ next_sg: struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); +#elif defined(CONFIG_SCHED_MC) + sd = &per_cpu(core_domains, i); #else sd = &per_cpu(phys_domains, i); #endif -- cgit v1.1 From 0806903316d516a3b3851c51cea5c71724d9051d Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Mon, 27 Mar 2006 01:15:23 -0800 Subject: [PATCH] sched: fix group power for allnodes_domains Current sched groups power calculation for allnodes_domains is wrong. We should really be using cumulative power of the physical packages in that group (similar to the calculation in node_domains). Signed-off-by: Suresh Siddha Cc: Ingo Molnar Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 62 +++++++++++++++++++++++++++------------------------------- 1 file changed, 29 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8a8b71b..7854ee5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5621,6 +5621,32 @@ static int cpu_to_allnodes_group(int cpu) { return cpu_to_node(cpu); } +static void init_numa_sched_groups_power(struct sched_group *group_head) +{ + struct sched_group *sg = group_head; + int j; + + if (!sg) + return; +next_sg: + for_each_cpu_mask(j, sg->cpumask) { + struct sched_domain *sd; + + sd = &per_cpu(phys_domains, j); + if (j != first_cpu(sd->groups->cpumask)) { + /* + * Only add "power" once for each + * physical package. + */ + continue; + } + + sg->cpu_power += sd->groups->cpu_power; + } + sg = sg->next; + if (sg != group_head) + goto next_sg; +} #endif /* @@ -5866,43 +5892,13 @@ void build_sched_domains(const cpumask_t *cpu_map) (cpus_weight(sd->groups->cpumask)-1) / 10; sd->groups->cpu_power = power; #endif - -#ifdef CONFIG_NUMA - sd = &per_cpu(allnodes_domains, i); - if (sd->groups) { - power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * - (cpus_weight(sd->groups->cpumask)-1) / 10; - sd->groups->cpu_power = power; - } -#endif } #ifdef CONFIG_NUMA - for (i = 0; i < MAX_NUMNODES; i++) { - struct sched_group *sg = sched_group_nodes[i]; - int j; - - if (sg == NULL) - continue; -next_sg: - for_each_cpu_mask(j, sg->cpumask) { - struct sched_domain *sd; + for (i = 0; i < MAX_NUMNODES; i++) + init_numa_sched_groups_power(sched_group_nodes[i]); - sd = &per_cpu(phys_domains, j); - if (j != first_cpu(sd->groups->cpumask)) { - /* - * Only add "power" once for each - * physical package. - */ - continue; - } - - sg->cpu_power += sd->groups->cpu_power; - } - sg = sg->next; - if (sg != sched_group_nodes[i]) - goto next_sg; - } + init_numa_sched_groups_power(sched_group_allnodes); #endif /* Attach the domains */ -- cgit v1.1 From 0771dfefc9e538f077d0b43b6dec19a5a67d0e70 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Mar 2006 01:16:22 -0800 Subject: [PATCH] lightweight robust futexes: core Add the core infrastructure for robust futexes: structure definitions, the new syscalls and the do_exit() based cleanup mechanism.
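(To make the cleanup step concrete before the diff: each robust-futex word packs the owner's TID into the low 30 bits and reserves two flag bits, and handle_futex_death() marks a dead owner's lock with a compare-and-swap so the next acquirer can detect the situation. A simplified user-space sketch; the GCC __sync builtin stands in for the kernel's futex cmpxchg helper, and the retry loop and actual FUTEX_WAKE call are omitted:

    #include <stdio.h>
    #include <stdint.h>

    /* Robust-futex word layout from the futex ABI: */
    #define FUTEX_WAITERS    0x80000000u
    #define FUTEX_OWNER_DIED 0x40000000u
    #define FUTEX_TID_MASK   0x3fffffffu

    /* If the lock word still names the dead thread's TID, atomically set
     * OWNER_DIED; report whether a waiter would need waking. */
    static int mark_owner_dead(uint32_t *uaddr, uint32_t dead_tid)
    {
        uint32_t val = *uaddr;

        if ((val & FUTEX_TID_MASK) != dead_tid)
            return 0;                          /* not held by this thread */
        if (__sync_val_compare_and_swap(uaddr, val,
                                        val | FUTEX_OWNER_DIED) != val)
            return -1;                         /* raced; real code retries */
        return (val & FUTEX_WAITERS) ? 1 : 0;  /* 1: wake a waiter */
    }

    int main(void)
    {
        uint32_t lock = 1234 | FUTEX_WAITERS;  /* held by TID 1234, contended */
        int wake = mark_owner_dead(&lock, 1234);
        printf("wake needed: %d, word now 0x%08x\n", wake, lock);
        return 0;
    }

The OWNER_DIED bit is all the kernel contributes; as the comment block in the diff notes, the rest of the recovery is a userspace problem.)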
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Arjan van de Ven Acked-by: Ulrich Drepper Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 + kernel/futex.c | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sys_ni.c | 4 ++ 3 files changed, 179 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 8037405..aecb48c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -31,6 +31,7 @@ #include #include #include +#include <linux/futex.h> #include #include @@ -852,6 +853,8 @@ fastcall NORET_TYPE void do_exit(long code) exit_itimers(tsk->signal); acct_process(code); } + if (unlikely(tsk->robust_list)) + exit_robust_list(tsk); exit_mm(tsk); exit_sem(tsk); diff --git a/kernel/futex.c b/kernel/futex.c index 5efa2f9..feb724b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -8,6 +8,10 @@ * Removed page pinning, fix privately mapped COW pages and other cleanups * (C) Copyright 2003, 2004 Jamie Lokier * + * Robust futex support started by Ingo Molnar + * (C) Copyright 2006 Red Hat Inc, All Rights Reserved + * Thanks to Thomas Gleixner for suggestions, analysis and fixes. + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -829,6 +833,174 @@ error: goto out; } +/* + * Support for robust futexes: the kernel cleans up held futexes at + * thread exit time. + * + * Implementation: user-space maintains a per-thread list of locks it + * is holding. Upon do_exit(), the kernel carefully walks this list, + * and marks all locks that are owned by this thread with the + * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is + * always manipulated with the lock held, so the list is private and + * per-thread. Userspace also maintains a per-thread 'list_op_pending' + * field, to allow the kernel to clean up if the thread dies after + * acquiring the lock, but just before it could have added itself to + * the list. There can only be one such pending lock.
+ */ + +/** + * sys_set_robust_list - set the robust-futex list head of a task + * @head: pointer to the list-head + * @len: length of the list-head, as userspace expects + */ +asmlinkage long +sys_set_robust_list(struct robust_list_head __user *head, + size_t len) +{ + /* + * The kernel knows only one size for now: + */ + if (unlikely(len != sizeof(*head))) + return -EINVAL; + + current->robust_list = head; + + return 0; +} + +/** + * sys_get_robust_list - get the robust-futex list head of a task + * @pid: pid of the process [zero for current task] + * @head_ptr: pointer to a list-head pointer, the kernel fills it in + * @len_ptr: pointer to a length field, the kernel fills in the header size + */ +asmlinkage long +sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, + size_t __user *len_ptr) +{ + struct robust_list_head *head; + unsigned long ret; + + if (!pid) + head = current->robust_list; + else { + struct task_struct *p; + + ret = -ESRCH; + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p) + goto err_unlock; + ret = -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_PTRACE)) + goto err_unlock; + head = p->robust_list; + read_unlock(&tasklist_lock); + } + + if (put_user(sizeof(*head), len_ptr)) + return -EFAULT; + return put_user(head, head_ptr); + +err_unlock: + read_unlock(&tasklist_lock); + + return ret; +} + +/* + * Process a futex-list entry, check whether it's owned by the + * dying task, and do notification if so: + */ +int handle_futex_death(unsigned int *uaddr, struct task_struct *curr) +{ + unsigned int futex_val; + +repeat: + if (get_user(futex_val, uaddr)) + return -1; + + if ((futex_val & FUTEX_TID_MASK) == curr->pid) { + /* + * Ok, this dying thread is truly holding a futex + * of interest. Set the OWNER_DIED bit atomically + * via cmpxchg, and if the value had FUTEX_WAITERS + * set, wake up a waiter (if any). (We have to do a + * futex_wake() even if OWNER_DIED is already set - + * to handle the rare but possible case of recursive + * thread-death.) The rest of the cleanup is done in + * userspace. + */ + if (futex_atomic_cmpxchg_inuser(uaddr, futex_val, + futex_val | FUTEX_OWNER_DIED) != + futex_val) + goto repeat; + + if (futex_val & FUTEX_WAITERS) + futex_wake((unsigned long)uaddr, 1); + } + return 0; +} + +/* + * Walk curr->robust_list (very carefully, it's a userspace list!) + * and mark any locks found there dead, and notify any waiters. + * + * We silently return on any sign of list-walking problem. 
+ */ +void exit_robust_list(struct task_struct *curr) +{ + struct robust_list_head __user *head = curr->robust_list; + struct robust_list __user *entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT; + unsigned long futex_offset; + + /* + * Fetch the list head (which was registered earlier, via + * sys_set_robust_list()): + */ + if (get_user(entry, &head->list.next)) + return; + /* + * Fetch the relative futex offset: + */ + if (get_user(futex_offset, &head->futex_offset)) + return; + /* + * Fetch any possibly pending lock-add first, and handle it + * if it exists: + */ + if (get_user(pending, &head->list_op_pending)) + return; + if (pending) + handle_futex_death((void *)pending + futex_offset, curr); + + while (entry != &head->list) { + /* + * A pending lock might already be on the list, so + * dont process it twice: + */ + if (entry != pending) + if (handle_futex_death((void *)entry + futex_offset, + curr)) + return; + + /* + * Fetch the next entry in the list: + */ + if (get_user(entry, &entry->next)) + return; + /* + * Avoid excessively long or circular lists: + */ + if (!--limit) + break; + + cond_resched(); + } +} + long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, unsigned long uaddr2, int val2, int val3) { diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 1067090..d82864c 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -42,6 +42,10 @@ cond_syscall(sys_recvmsg); cond_syscall(sys_socketcall); cond_syscall(sys_futex); cond_syscall(compat_sys_futex); +cond_syscall(sys_set_robust_list); +cond_syscall(compat_sys_set_robust_list); +cond_syscall(sys_get_robust_list); +cond_syscall(compat_sys_get_robust_list); cond_syscall(sys_epoll_create); cond_syscall(sys_epoll_ctl); cond_syscall(sys_epoll_wait); -- cgit v1.1 From 34f192c6527f20c47ccec239e7d51a27691b93fc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Mar 2006 01:16:24 -0800 Subject: [PATCH] lightweight robust futexes: compat 32-bit syscall compatibility support. (This patch also moves all futex related compat functionality into kernel/futex_compat.c.) 
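(The main wrinkle the compat code handles is pointer width: a 32-bit task stores its list pointers as 32-bit values, which a 64-bit kernel must widen explicitly before use, hence the u32-typed compat_uptr_t and the separate list walker. A minimal user-space sketch of the widening; the function names mirror the kernel's compat_ptr()/ptr_to_compat() but are local stand-ins:

    #include <stdio.h>
    #include <stdint.h>

    typedef uint32_t compat_uptr_t;   /* a 32-bit user pointer, as stored */

    static void *compat_ptr_sketch(compat_uptr_t uptr)
    {
        /* zero-extend through uintptr_t; never dereference a u32 directly */
        return (void *)(uintptr_t)uptr;
    }

    static compat_uptr_t ptr_to_compat_sketch(void *ptr)
    {
        return (compat_uptr_t)(uintptr_t)ptr;  /* valid only if ptr fits 32 bits */
    }

    int main(void)
    {
        compat_uptr_t u = 0x0804a000u;         /* example 32-bit user address */
        void *p = compat_ptr_sketch(u);
        printf("widened 0x%08x -> %p\n", u, p);
        printf("round trip: 0x%08x\n", ptr_to_compat_sketch(p));
        return 0;
    }

This is why compat_exit_robust_list() below fetches a compat_uptr_t with get_user() and converts it, rather than reading a native-width pointer out of the 32-bit task's list.)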
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Arjan van de Ven Acked-by: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 3 ++ kernel/compat.c | 23 -------- kernel/exit.c | 5 ++ kernel/futex_compat.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 150 insertions(+), 23 deletions(-) create mode 100644 kernel/futex_compat.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index ff1c11d..58908f9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,6 +12,9 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_FUTEX) += futex.o +ifeq ($(CONFIG_COMPAT),y) +obj-$(CONFIG_FUTEX) += futex_compat.o +endif obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o diff --git a/kernel/compat.c b/kernel/compat.c index b9bdd12..c1601a8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -17,7 +17,6 @@ #include #include #include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */ -#include <linux/futex.h> /* for FUTEX_WAIT */ #include #include #include @@ -239,28 +238,6 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, return ret; } -#ifdef CONFIG_FUTEX -asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val, - struct compat_timespec __user *utime, u32 __user *uaddr2, - int val3) -{ - struct timespec t; - unsigned long timeout = MAX_SCHEDULE_TIMEOUT; - int val2 = 0; - - if ((op == FUTEX_WAIT) && utime) { - if (get_compat_timespec(&t, utime)) - return -EFAULT; - timeout = timespec_to_jiffies(&t) + 1; - } - if (op >= FUTEX_REQUEUE) - val2 = (int) (unsigned long) utime; - - return do_futex((unsigned long)uaddr, op, val, timeout, - (unsigned long)uaddr2, val2, val3); -} -#endif - asmlinkage long compat_sys_setrlimit(unsigned int resource, struct compat_rlimit __user *rlim) { diff --git a/kernel/exit.c b/kernel/exit.c index aecb48c..a8c7efc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -32,6 +32,7 @@ #include #include #include +#include <linux/compat.h> #include #include @@ -855,6 +856,10 @@ fastcall NORET_TYPE void do_exit(long code) } if (unlikely(tsk->robust_list)) exit_robust_list(tsk); +#ifdef CONFIG_COMPAT + if (unlikely(tsk->compat_robust_list)) + compat_exit_robust_list(tsk); +#endif exit_mm(tsk); exit_sem(tsk); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c new file mode 100644 index 0000000..c153559 --- /dev/null +++ b/kernel/futex_compat.c @@ -0,0 +1,142 @@ +/* + * linux/kernel/futex_compat.c + * + * Futex compatibility routines. + * + * Copyright 2006, Red Hat, Inc., Ingo Molnar + */ + +#include <linux/linkage.h> +#include <linux/compat.h> +#include <linux/futex.h> + +#include <asm/uaccess.h> + +/* + * Walk curr->robust_list (very carefully, it's a userspace list!) + * and mark any locks found there dead, and notify any waiters. + * + * We silently return on any sign of list-walking problem.
+ */ +void compat_exit_robust_list(struct task_struct *curr) +{ + struct compat_robust_list_head __user *head = curr->compat_robust_list; + struct robust_list __user *entry, *pending; + compat_uptr_t uentry, upending; + unsigned int limit = ROBUST_LIST_LIMIT; + compat_long_t futex_offset; + + /* + * Fetch the list head (which was registered earlier, via + * sys_set_robust_list()): + */ + if (get_user(uentry, &head->list.next)) + return; + entry = compat_ptr(uentry); + /* + * Fetch the relative futex offset: + */ + if (get_user(futex_offset, &head->futex_offset)) + return; + /* + * Fetch any possibly pending lock-add first, and handle it + * if it exists: + */ + if (get_user(upending, &head->list_op_pending)) + return; + pending = compat_ptr(upending); + if (upending) + handle_futex_death((void *)pending + futex_offset, curr); + + while (compat_ptr(uentry) != &head->list) { + /* + * A pending lock might already be on the list, so + * dont process it twice: + */ + if (entry != pending) + if (handle_futex_death((void *)entry + futex_offset, + curr)) + return; + + /* + * Fetch the next entry in the list: + */ + if (get_user(uentry, (compat_uptr_t *)&entry->next)) + return; + entry = compat_ptr(uentry); + /* + * Avoid excessively long or circular lists: + */ + if (!--limit) + break; + + cond_resched(); + } +} + +asmlinkage long +compat_sys_set_robust_list(struct compat_robust_list_head __user *head, + compat_size_t len) +{ + if (unlikely(len != sizeof(*head))) + return -EINVAL; + + current->compat_robust_list = head; + + return 0; +} + +asmlinkage long +compat_sys_get_robust_list(int pid, compat_uptr_t *head_ptr, + compat_size_t __user *len_ptr) +{ + struct compat_robust_list_head *head; + unsigned long ret; + + if (!pid) + head = current->compat_robust_list; + else { + struct task_struct *p; + + ret = -ESRCH; + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p) + goto err_unlock; + ret = -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_PTRACE)) + goto err_unlock; + head = p->compat_robust_list; + read_unlock(&tasklist_lock); + } + + if (put_user(sizeof(*head), len_ptr)) + return -EFAULT; + return put_user(ptr_to_compat(head), head_ptr); + +err_unlock: + read_unlock(&tasklist_lock); + + return ret; +} + +asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val, + struct compat_timespec __user *utime, u32 __user *uaddr2, + int val3) +{ + struct timespec t; + unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + int val2 = 0; + + if ((op == FUTEX_WAIT) && utime) { + if (get_compat_timespec(&t, utime)) + return -EFAULT; + timeout = timespec_to_jiffies(&t) + 1; + } + if (op >= FUTEX_REQUEUE) + val2 = (int) (unsigned long) utime; + + return do_futex((unsigned long)uaddr, op, val, timeout, + (unsigned long)uaddr2, val2, val3); +} -- cgit v1.1 From 8f17d3a5049d32392b79925c73a0cf99ce6d5af0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Mar 2006 01:16:27 -0800 Subject: [PATCH] lightweight robust futexes updates - fix: initialize the robust list(s) to NULL in copy_process. 
- doc update - cleanup: rename _inuser to _inatomic - __user cleanups and other small cleanups Signed-off-by: Ingo Molnar Cc: Thomas Gleixner Cc: Arjan van de Ven Cc: Ulrich Drepper Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 ++++- kernel/futex.c | 20 +++++++++----------- kernel/futex_compat.c | 7 +++---- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e0a2b44..c49bd19 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1061,7 +1061,10 @@ static task_t *copy_process(unsigned long clone_flags, * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; - + p->robust_list = NULL; +#ifdef CONFIG_COMPAT + p->compat_robust_list = NULL; +#endif /* * sigaltstack should be cleared when sharing the same VM */ diff --git a/kernel/futex.c b/kernel/futex.c index feb724b..9c9b2b6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -913,15 +913,15 @@ err_unlock: * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ -int handle_futex_death(unsigned int *uaddr, struct task_struct *curr) +int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) { - unsigned int futex_val; + u32 uval; -repeat: - if (get_user(futex_val, uaddr)) +retry: + if (get_user(uval, uaddr)) return -1; - if ((futex_val & FUTEX_TID_MASK) == curr->pid) { + if ((uval & FUTEX_TID_MASK) == curr->pid) { /* * Ok, this dying thread is truly holding a futex * of interest. Set the OWNER_DIED bit atomically @@ -932,12 +932,11 @@ repeat: * thread-death.) The rest of the cleanup is done in * userspace. */ - if (futex_atomic_cmpxchg_inuser(uaddr, futex_val, - futex_val | FUTEX_OWNER_DIED) != - futex_val) - goto repeat; + if (futex_atomic_cmpxchg_inatomic(uaddr, uval, + uval | FUTEX_OWNER_DIED) != uval) + goto retry; - if (futex_val & FUTEX_WAITERS) + if (uval & FUTEX_WAITERS) futex_wake((unsigned long)uaddr, 1); } return 0; @@ -985,7 +984,6 @@ void exit_robust_list(struct task_struct *curr) if (handle_futex_death((void *)entry + futex_offset, curr)) return; - /* * Fetch the next entry in the list: */ diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index c153559..9c077cf 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -121,9 +121,9 @@ err_unlock: return ret; } -asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val, +asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, struct compat_timespec __user *utime, u32 __user *uaddr2, - int val3) + u32 val3) { struct timespec t; unsigned long timeout = MAX_SCHEDULE_TIMEOUT; @@ -137,6 +137,5 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, int val, if (op >= FUTEX_REQUEUE) val2 = (int) (unsigned long) utime; - return do_futex((unsigned long)uaddr, op, val, timeout, - (unsigned long)uaddr2, val2, val3); + return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); } -- cgit v1.1 From e041c683412d5bf44dc2b109053e3b837b71742d Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 27 Mar 2006 01:16:30 -0800 Subject: [PATCH] Notifier chain update: API changes The kernel's implementation of notifier chains is unsafe. There is no protection against entries being added to or removed from a chain while the chain is in use. 
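(Concretely, the structure at stake is minimal: a singly linked list of notifier_block entries kept sorted by descending priority. A self-contained user-space sketch of that core ordering and walk; the names are stand-ins, and the real patch layers a spinlock, an rwsem, or RCU around this depending on the chain type:

    #include <stdio.h>

    struct nb_sketch {
        int (*notifier_call)(struct nb_sketch *, unsigned long, void *);
        struct nb_sketch *next;
        int priority;
    };

    /* Insert in descending priority order, as notifier_chain_register() does. */
    static void chain_register(struct nb_sketch **nl, struct nb_sketch *n)
    {
        while (*nl && (*nl)->priority >= n->priority)
            nl = &(*nl)->next;
        n->next = *nl;
        *nl = n;
    }

    static int hello(struct nb_sketch *nb, unsigned long val, void *v)
    {
        (void)v;
        printf("called: val=%lu prio=%d\n", val, nb->priority);
        return 0;   /* NOTIFY_DONE */
    }

    int main(void)
    {
        struct nb_sketch *chain = NULL;
        struct nb_sketch a = { hello, NULL, 0 };
        struct nb_sketch b = { hello, NULL, 10 };

        chain_register(&chain, &a);
        chain_register(&chain, &b);   /* higher priority, so it runs first */
        for (struct nb_sketch *nb = chain; nb; nb = nb->next)
            nb->notifier_call(nb, 1, NULL);
        return 0;
    }

An unlocked walk like this is exactly what was racy in the old API whenever another CPU could register or unregister entries concurrently.)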
The issues were discussed in this thread: http://marc.theaimsgroup.com/?l=linux-kernel&m=113018709002036&w=2

We noticed that notifier chains in the kernel fall into two basic usage classes: "Blocking" chains are always called from a process context and the callout routines are allowed to sleep; "Atomic" chains can be called from an atomic context and the callout routines are not allowed to sleep.

We decided to codify this distinction and make it part of the API. Therefore this set of patches introduces three new, parallel APIs: one for blocking notifiers, one for atomic notifiers, and one for "raw" notifiers (which is really just the old API under a new name). New kinds of data structures are used for the heads of the chains, and new routines are defined for registration, unregistration, and calling a chain. The three APIs are explained in include/linux/notifier.h and their implementation is in kernel/sys.c.

With atomic and blocking chains, the implementation guarantees that the chain links will not be corrupted and that chain callers will not get messed up by entries being added or removed. For raw chains the implementation provides no guarantees at all; users of this API must provide their own protections. (The idea was that situations may come up where the assumptions of the atomic and blocking APIs are not appropriate, so it should be possible for users to handle these things in their own way.)

There are some limitations, which should not be too hard to live with. For atomic/blocking chains, registration and unregistration must always be done in a process context since the chain is protected by a mutex/rwsem. Also, a callout routine for a non-raw chain must not try to register or unregister entries on its own chain. (This did happen in a couple of places and the code had to be changed to avoid it.)

Since atomic chains may be called from within an NMI handler, they cannot use spinlocks for synchronization. Instead we use RCU. The overhead falls almost entirely in the unregister routine, which is okay since unregistration is much less frequent than calling a chain.

Here is the list of chains that we adjusted and their classifications. None of them use the raw API, so for the moment it is only a placeholder.
ATOMIC CHAINS
-------------
arch/i386/kernel/traps.c: i386die_chain
arch/ia64/kernel/traps.c: ia64die_chain
arch/powerpc/kernel/traps.c: powerpc_die_chain
arch/sparc64/kernel/traps.c: sparc64die_chain
arch/x86_64/kernel/traps.c: die_chain
drivers/char/ipmi/ipmi_si_intf.c: xaction_notifier_list
kernel/panic.c: panic_notifier_list
kernel/profile.c: task_free_notifier
net/bluetooth/hci_core.c: hci_notifier
net/ipv4/netfilter/ip_conntrack_core.c: ip_conntrack_chain
net/ipv4/netfilter/ip_conntrack_core.c: ip_conntrack_expect_chain
net/ipv6/addrconf.c: inet6addr_chain
net/netfilter/nf_conntrack_core.c: nf_conntrack_chain
net/netfilter/nf_conntrack_core.c: nf_conntrack_expect_chain
net/netlink/af_netlink.c: netlink_chain

BLOCKING CHAINS
---------------
arch/powerpc/platforms/pseries/reconfig.c: pSeries_reconfig_chain
arch/s390/kernel/process.c: idle_chain
arch/x86_64/kernel/process.c: idle_notifier
drivers/base/memory.c: memory_chain
drivers/cpufreq/cpufreq.c: cpufreq_policy_notifier_list
drivers/cpufreq/cpufreq.c: cpufreq_transition_notifier_list
drivers/macintosh/adb.c: adb_client_list
drivers/macintosh/via-pmu.c: sleep_notifier_list
drivers/macintosh/via-pmu68k.c: sleep_notifier_list
drivers/macintosh/windfarm_core.c: wf_client_list
drivers/usb/core/notify.c: usb_notifier_list
drivers/video/fbmem.c: fb_notifier_list
kernel/cpu.c: cpu_chain
kernel/module.c: module_notify_list
kernel/profile.c: munmap_notifier
kernel/profile.c: task_exit_notifier
kernel/sys.c: reboot_notifier_list
net/core/dev.c: netdev_chain
net/decnet/dn_dev.c: dnaddr_chain
net/ipv4/devinet.c: inetaddr_chain

It's possible that some of these classifications are wrong. If they are, please let us know or submit a patch to fix them. Note that any chain that gets called very frequently should be atomic, because the rwsem read-locking used for blocking chains is very likely to incur cache misses on SMP systems. (However, if the chain's callout routines may sleep then the chain cannot be atomic.) The patch set was written by Alan Stern and Chandra Seetharaman, incorporating material written by Keith Owens and suggestions from Paul McKenney and Andrew Morton. [jes@sgi.com: restructure the notifier chain initialization macros] Signed-off-by: Alan Stern Signed-off-by: Chandra Seetharaman Signed-off-by: Jes Sorensen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 29 ++--- kernel/module.c | 20 +--- kernel/panic.c | 4 +- kernel/profile.c | 53 +++------ kernel/softlockup.c | 2 +- kernel/sys.c | 327 ++++++++++++++++++++++++++++++++++++++++++---------- 6 files changed, 301 insertions(+), 134 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 8be22bd..fe2b8d0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -18,7 +18,7 @@ /* This protects CPUs going up and down... */ static DECLARE_MUTEX(cpucontrol); -static struct notifier_block *cpu_chain; +static BLOCKING_NOTIFIER_HEAD(cpu_chain); #ifdef CONFIG_HOTPLUG_CPU static struct task_struct *lock_cpu_hotplug_owner; @@ -71,21 +71,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); /* Need to know about CPUs going up/down?
*/ int register_cpu_notifier(struct notifier_block *nb) { - int ret; - - if ((ret = lock_cpu_hotplug_interruptible()) != 0) - return ret; - ret = notifier_chain_register(&cpu_chain, nb); - unlock_cpu_hotplug(); - return ret; + return blocking_notifier_chain_register(&cpu_chain, nb); } EXPORT_SYMBOL(register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) { - lock_cpu_hotplug(); - notifier_chain_unregister(&cpu_chain, nb); - unlock_cpu_hotplug(); + blocking_notifier_chain_unregister(&cpu_chain, nb); } EXPORT_SYMBOL(unregister_cpu_notifier); @@ -141,7 +133,7 @@ int cpu_down(unsigned int cpu) goto out; } - err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, + err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, (void *)(long)cpu); if (err == NOTIFY_BAD) { printk("%s: attempt to take down CPU %u failed\n", @@ -159,7 +151,7 @@ int cpu_down(unsigned int cpu) p = __stop_machine_run(take_cpu_down, NULL, cpu); if (IS_ERR(p)) { /* CPU didn't die: tell everyone. Can't complain. */ - if (notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, + if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, (void *)(long)cpu) == NOTIFY_BAD) BUG(); @@ -182,8 +174,8 @@ int cpu_down(unsigned int cpu) put_cpu(); /* CPU is completely dead: tell everyone. Too late to complain. */ - if (notifier_call_chain(&cpu_chain, CPU_DEAD, (void *)(long)cpu) - == NOTIFY_BAD) + if (blocking_notifier_call_chain(&cpu_chain, CPU_DEAD, + (void *)(long)cpu) == NOTIFY_BAD) BUG(); check_for_tasks(cpu); @@ -211,7 +203,7 @@ int __devinit cpu_up(unsigned int cpu) goto out; } - ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); + ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); if (ret == NOTIFY_BAD) { printk("%s: attempt to bring up CPU %u failed\n", __FUNCTION__, cpu); @@ -226,11 +218,12 @@ int __devinit cpu_up(unsigned int cpu) BUG_ON(!cpu_online(cpu)); /* Now call notifier in preparation. 
*/ - notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); + blocking_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); out_notify: if (ret != 0) - notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); + blocking_notifier_call_chain(&cpu_chain, + CPU_UP_CANCELED, hcpu); out: unlock_cpu_hotplug(); return ret; } diff --git a/kernel/module.c b/kernel/module.c index ddfe45a..4fafd58 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -64,26 +64,17 @@ static DEFINE_SPINLOCK(modlist_lock); static DEFINE_MUTEX(module_mutex); static LIST_HEAD(modules); -static DEFINE_MUTEX(notify_mutex); -static struct notifier_block * module_notify_list; +static BLOCKING_NOTIFIER_HEAD(module_notify_list); int register_module_notifier(struct notifier_block * nb) { - int err; - mutex_lock(&notify_mutex); - err = notifier_chain_register(&module_notify_list, nb); - mutex_unlock(&notify_mutex); - return err; + return blocking_notifier_chain_register(&module_notify_list, nb); } EXPORT_SYMBOL(register_module_notifier); int unregister_module_notifier(struct notifier_block * nb) { - int err; - mutex_lock(&notify_mutex); - err = notifier_chain_unregister(&module_notify_list, nb); - mutex_unlock(&notify_mutex); - return err; + return blocking_notifier_chain_unregister(&module_notify_list, nb); } EXPORT_SYMBOL(unregister_module_notifier); @@ -1816,9 +1807,8 @@ sys_init_module(void __user *umod, /* Drop lock so they can recurse */ mutex_unlock(&module_mutex); - mutex_lock(&notify_mutex); - notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); - mutex_unlock(&notify_mutex); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); /* Start the module */ if (mod->init != NULL) diff --git a/kernel/panic.c b/kernel/panic.c index acd95ad..f895c7c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -29,7 +29,7 @@ static DEFINE_SPINLOCK(pause_on_oops_lock); int panic_timeout; EXPORT_SYMBOL(panic_timeout); -struct notifier_block *panic_notifier_list; +ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); @@ -97,7 +97,7 @@ NORET_TYPE void panic(const char * fmt, ...) smp_send_stop(); #endif - notifier_call_chain(&panic_notifier_list, 0, buf); + atomic_notifier_call_chain(&panic_notifier_list, 0, buf); if (!panic_blink) panic_blink = no_blink; diff --git a/kernel/profile.c b/kernel/profile.c index ad81f79..5a730fd 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -87,72 +87,52 @@ void __init profile_init(void) #ifdef CONFIG_PROFILING -static DECLARE_RWSEM(profile_rwsem); -static DEFINE_RWLOCK(handoff_lock); -static struct notifier_block * task_exit_notifier; -static struct notifier_block * task_free_notifier; -static struct notifier_block * munmap_notifier; +static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); +static ATOMIC_NOTIFIER_HEAD(task_free_notifier); +static BLOCKING_NOTIFIER_HEAD(munmap_notifier); void profile_task_exit(struct task_struct * task) { - down_read(&profile_rwsem); - notifier_call_chain(&task_exit_notifier, 0, task); - up_read(&profile_rwsem); + blocking_notifier_call_chain(&task_exit_notifier, 0, task); } int profile_handoff_task(struct task_struct * task) { int ret; - read_lock(&handoff_lock); - ret = notifier_call_chain(&task_free_notifier, 0, task); - read_unlock(&handoff_lock); + ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); return (ret == NOTIFY_OK) ?
1 : 0; } void profile_munmap(unsigned long addr) { - down_read(&profile_rwsem); - notifier_call_chain(&munmap_notifier, 0, (void *)addr); - up_read(&profile_rwsem); + blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); } int task_handoff_register(struct notifier_block * n) { - int err = -EINVAL; - - write_lock(&handoff_lock); - err = notifier_chain_register(&task_free_notifier, n); - write_unlock(&handoff_lock); - return err; + return atomic_notifier_chain_register(&task_free_notifier, n); } int task_handoff_unregister(struct notifier_block * n) { - int err = -EINVAL; - - write_lock(&handoff_lock); - err = notifier_chain_unregister(&task_free_notifier, n); - write_unlock(&handoff_lock); - return err; + return atomic_notifier_chain_unregister(&task_free_notifier, n); } int profile_event_register(enum profile_type type, struct notifier_block * n) { int err = -EINVAL; - down_write(&profile_rwsem); - switch (type) { case PROFILE_TASK_EXIT: - err = notifier_chain_register(&task_exit_notifier, n); + err = blocking_notifier_chain_register( + &task_exit_notifier, n); break; case PROFILE_MUNMAP: - err = notifier_chain_register(&munmap_notifier, n); + err = blocking_notifier_chain_register( + &munmap_notifier, n); break; } - up_write(&profile_rwsem); - return err; } @@ -161,18 +141,17 @@ int profile_event_unregister(enum profile_type type, struct notifier_block * n) { int err = -EINVAL; - down_write(&profile_rwsem); - switch (type) { case PROFILE_TASK_EXIT: - err = notifier_chain_unregister(&task_exit_notifier, n); + err = blocking_notifier_chain_unregister( + &task_exit_notifier, n); break; case PROFILE_MUNMAP: - err = notifier_chain_unregister(&munmap_notifier, n); + err = blocking_notifier_chain_unregister( + &munmap_notifier, n); break; } - up_write(&profile_rwsem); return err; } diff --git a/kernel/softlockup.c b/kernel/softlockup.c index d9b3d58..ced91e1 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -152,5 +152,5 @@ __init void spawn_softlockup_task(void) cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); - notifier_chain_register(&panic_notifier_list, &panic_block); + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); } diff --git a/kernel/sys.c b/kernel/sys.c index 38bc73e..c93d37f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -95,99 +95,304 @@ int cad_pid = 1; * and the like. */ -static struct notifier_block *reboot_notifier_list; -static DEFINE_RWLOCK(notifier_lock); +static BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); + +/* + * Notifier chain core routines. The exported routines below + * are layered on top of these, with appropriate locking added. 
+ */ + +static int notifier_chain_register(struct notifier_block **nl, + struct notifier_block *n) +{ + while ((*nl) != NULL) { + if (n->priority > (*nl)->priority) + break; + nl = &((*nl)->next); + } + n->next = *nl; + rcu_assign_pointer(*nl, n); + return 0; +} + +static int notifier_chain_unregister(struct notifier_block **nl, + struct notifier_block *n) +{ + while ((*nl) != NULL) { + if ((*nl) == n) { + rcu_assign_pointer(*nl, n->next); + return 0; + } + nl = &((*nl)->next); + } + return -ENOENT; +} + +static int __kprobes notifier_call_chain(struct notifier_block **nl, + unsigned long val, void *v) +{ + int ret = NOTIFY_DONE; + struct notifier_block *nb; + + nb = rcu_dereference(*nl); + while (nb) { + ret = nb->notifier_call(nb, val, v); + if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) + break; + nb = rcu_dereference(nb->next); + } + return ret; +} + +/* + * Atomic notifier chain routines. Registration and unregistration + * use a mutex, and call_chain is synchronized by RCU (no locks). + */ /** - * notifier_chain_register - Add notifier to a notifier chain - * @list: Pointer to root list pointer + * atomic_notifier_chain_register - Add notifier to an atomic notifier chain + * @nh: Pointer to head of the atomic notifier chain * @n: New entry in notifier chain * - * Adds a notifier to a notifier chain. + * Adds a notifier to an atomic notifier chain. * * Currently always returns zero. */ + +int atomic_notifier_chain_register(struct atomic_notifier_head *nh, + struct notifier_block *n) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&nh->lock, flags); + ret = notifier_chain_register(&nh->head, n); + spin_unlock_irqrestore(&nh->lock, flags); + return ret; +} + +EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); + +/** + * atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain + * @nh: Pointer to head of the atomic notifier chain + * @n: Entry to remove from notifier chain + * + * Removes a notifier from an atomic notifier chain. + * + * Returns zero on success or %-ENOENT on failure. + */ +int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, + struct notifier_block *n) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&nh->lock, flags); + ret = notifier_chain_unregister(&nh->head, n); + spin_unlock_irqrestore(&nh->lock, flags); + synchronize_rcu(); + return ret; +} + +EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); + +/** + * atomic_notifier_call_chain - Call functions in an atomic notifier chain + * @nh: Pointer to head of the atomic notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * + * Calls each function in a notifier chain in turn. The functions + * run in an atomic context, so they must not block. + * This routine uses RCU to synchronize with changes to the chain. + * + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain + * will return immediately, with the return value of + * the notifier function which halted execution. + * Otherwise the return value is the return value + * of the last notifier function called. 
+ */ -int notifier_chain_register(struct notifier_block **list, struct notifier_block *n) +int atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v) { - write_lock(¬ifier_lock); - while(*list) - { - if(n->priority > (*list)->priority) - break; - list= &((*list)->next); - } - n->next = *list; - *list=n; - write_unlock(¬ifier_lock); - return 0; + int ret; + + rcu_read_lock(); + ret = notifier_call_chain(&nh->head, val, v); + rcu_read_unlock(); + return ret; } -EXPORT_SYMBOL(notifier_chain_register); +EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); + +/* + * Blocking notifier chain routines. All access to the chain is + * synchronized by an rwsem. + */ /** - * notifier_chain_unregister - Remove notifier from a notifier chain - * @nl: Pointer to root list pointer + * blocking_notifier_chain_register - Add notifier to a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain * @n: New entry in notifier chain * - * Removes a notifier from a notifier chain. + * Adds a notifier to a blocking notifier chain. + * Must be called in process context. * - * Returns zero on success, or %-ENOENT on failure. + * Currently always returns zero. */ -int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) +int blocking_notifier_chain_register(struct blocking_notifier_head *nh, + struct notifier_block *n) { - write_lock(¬ifier_lock); - while((*nl)!=NULL) - { - if((*nl)==n) - { - *nl=n->next; - write_unlock(¬ifier_lock); - return 0; - } - nl=&((*nl)->next); - } - write_unlock(¬ifier_lock); - return -ENOENT; + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call down_write(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_register(&nh->head, n); + + down_write(&nh->rwsem); + ret = notifier_chain_register(&nh->head, n); + up_write(&nh->rwsem); + return ret; } -EXPORT_SYMBOL(notifier_chain_unregister); +EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); /** - * notifier_call_chain - Call functions in a notifier chain - * @n: Pointer to root pointer of notifier chain + * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @n: Entry to remove from notifier chain + * + * Removes a notifier from a blocking notifier chain. + * Must be called from process context. + * + * Returns zero on success or %-ENOENT on failure. + */ +int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call down_write(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_unregister(&nh->head, n); + + down_write(&nh->rwsem); + ret = notifier_chain_unregister(&nh->head, n); + up_write(&nh->rwsem); + return ret; +} + +EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); + +/** + * blocking_notifier_call_chain - Call functions in a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain * @val: Value passed unmodified to notifier function * @v: Pointer passed unmodified to notifier function * - * Calls each function in a notifier chain in turn. + * Calls each function in a notifier chain in turn. 
The functions + * run in a process context, so they are allowed to block. * - * If the return value of the notifier can be and'd - * with %NOTIFY_STOP_MASK, then notifier_call_chain + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain * will return immediately, with the return value of * the notifier function which halted execution. - * Otherwise, the return value is the return value + * Otherwise the return value is the return value * of the last notifier function called. */ -int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) +int blocking_notifier_call_chain(struct blocking_notifier_head *nh, + unsigned long val, void *v) { - int ret=NOTIFY_DONE; - struct notifier_block *nb = *n; + int ret; - while(nb) - { - ret=nb->notifier_call(nb,val,v); - if(ret&NOTIFY_STOP_MASK) - { - return ret; - } - nb=nb->next; - } + down_read(&nh->rwsem); + ret = notifier_call_chain(&nh->head, val, v); + up_read(&nh->rwsem); return ret; } -EXPORT_SYMBOL(notifier_call_chain); +EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); + +/* + * Raw notifier chain routines. There is no protection; + * the caller must provide it. Use at your own risk! + */ + +/** + * raw_notifier_chain_register - Add notifier to a raw notifier chain + * @nh: Pointer to head of the raw notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to a raw notifier chain. + * All locking must be provided by the caller. + * + * Currently always returns zero. + */ + +int raw_notifier_chain_register(struct raw_notifier_head *nh, + struct notifier_block *n) +{ + return notifier_chain_register(&nh->head, n); +} + +EXPORT_SYMBOL_GPL(raw_notifier_chain_register); + +/** + * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain + * @nh: Pointer to head of the raw notifier chain + * @n: Entry to remove from notifier chain + * + * Removes a notifier from a raw notifier chain. + * All locking must be provided by the caller. + * + * Returns zero on success or %-ENOENT on failure. + */ +int raw_notifier_chain_unregister(struct raw_notifier_head *nh, + struct notifier_block *n) +{ + return notifier_chain_unregister(&nh->head, n); +} + +EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); + +/** + * raw_notifier_call_chain - Call functions in a raw notifier chain + * @nh: Pointer to head of the raw notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * + * Calls each function in a notifier chain in turn. The functions + * run in an undefined context. + * All locking must be provided by the caller. + * + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then raw_notifier_call_chain + * will return immediately, with the return value of + * the notifier function which halted execution. + * Otherwise the return value is the return value + * of the last notifier function called. + */ + +int raw_notifier_call_chain(struct raw_notifier_head *nh, + unsigned long val, void *v) +{ + return notifier_call_chain(&nh->head, val, v); +} + +EXPORT_SYMBOL_GPL(raw_notifier_call_chain); /** * register_reboot_notifier - Register function to be called at reboot time @@ -196,13 +401,13 @@ EXPORT_SYMBOL(notifier_call_chain); * Registers a function with the list of functions * to be called at reboot time. 
* - * Currently always returns zero, as notifier_chain_register + * Currently always returns zero, as blocking_notifier_chain_register * always returns zero. */ int register_reboot_notifier(struct notifier_block * nb) { - return notifier_chain_register(&reboot_notifier_list, nb); + return blocking_notifier_chain_register(&reboot_notifier_list, nb); } EXPORT_SYMBOL(register_reboot_notifier); @@ -219,7 +424,7 @@ EXPORT_SYMBOL(register_reboot_notifier); int unregister_reboot_notifier(struct notifier_block * nb) { - return notifier_chain_unregister(&reboot_notifier_list, nb); + return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); } EXPORT_SYMBOL(unregister_reboot_notifier); @@ -380,7 +585,7 @@ EXPORT_SYMBOL_GPL(emergency_restart); void kernel_restart_prepare(char *cmd) { - notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); + blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; device_shutdown(); } @@ -430,7 +635,7 @@ EXPORT_SYMBOL_GPL(kernel_kexec); void kernel_shutdown_prepare(enum system_states state) { - notifier_call_chain(&reboot_notifier_list, + blocking_notifier_call_chain(&reboot_notifier_list, (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); system_state = state; device_shutdown(); -- cgit v1.1 From f83ca9fe3ee390755f18b4a7780c25ce593b484a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 28 Mar 2006 01:56:20 -0800 Subject: [PATCH] symversion warning fix gcc-4.2: kernel/module.c: In function '__find_symbol': kernel/module.c:158: warning: the address of '__start___kcrctab', will always evaluate as 'true' kernel/module.c:165: warning: the address of '__start___kcrctab_gpl', will always evaluate as 'true' kernel/module.c:182: warning: the address of '__start___kcrctab_gpl_future', will always evaluate as 'true' Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 4fafd58..bd088a7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -127,7 +127,7 @@ extern const unsigned long __start___kcrctab_gpl_future[]; #ifndef CONFIG_MODVERSIONS #define symversion(base, idx) NULL #else -#define symversion(base, idx) ((base) ? ((base) + (idx)) : NULL) +#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) #endif /* lookup symbol in given range of kernel_symbols */ -- cgit v1.1 From b9e20a920092eb3840424f85c78852c0433df00d Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Tue, 28 Mar 2006 01:56:24 -0800 Subject: [PATCH] Change dash2underscore() return value to char Since dash2underscore() just operates on and returns chars, I guess it's safe to change the return value to a char. With my .config, this reduces its size by 5 bytes. text data bss dec hex filename 4155 152 0 4307 10d3 params.o.orig 4150 152 0 4302 10ce params.o Signed-off-by: Eric Sesterhenn Signed-off-by: Alexey Dobriyan Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 9de637a..af43ecd 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -31,7 +31,7 @@ #define DEBUGP(fmt, a...)
#endif -static inline int dash2underscore(char c) +static inline char dash2underscore(char c) { if (c == '-') return '_'; -- cgit v1.1 From 0a945022778f100115d0cb6234eb28fc1b15ccaf Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 28 Mar 2006 01:56:37 -0800 Subject: [PATCH] for_each_possible_cpu: fixes for generic part replaces for_each_cpu with for_each_possible_cpu(). Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcutorture.c | 4 ++-- kernel/sched.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index b4b362b..8154e75 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -301,7 +301,7 @@ rcu_torture_printk(char *page) long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; - for_each_cpu(cpu) { + for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; @@ -535,7 +535,7 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_error, 0); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) atomic_set(&rcu_torture_wcount[i], 0); - for_each_cpu(cpu) { + for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { per_cpu(rcu_torture_count, cpu)[i] = 0; per_cpu(rcu_torture_batch, cpu)[i] = 0; diff --git a/kernel/sched.c b/kernel/sched.c index 7854ee5..a9ecac3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1625,7 +1625,7 @@ unsigned long nr_uninterruptible(void) { unsigned long i, sum = 0; - for_each_cpu(i) + for_each_possible_cpu(i) sum += cpu_rq(i)->nr_uninterruptible; /* @@ -1642,7 +1642,7 @@ unsigned long long nr_context_switches(void) { unsigned long long i, sum = 0; - for_each_cpu(i) + for_each_possible_cpu(i) sum += cpu_rq(i)->nr_switches; return sum; @@ -1652,7 +1652,7 @@ unsigned long nr_iowait(void) { unsigned long i, sum = 0; - for_each_cpu(i) + for_each_possible_cpu(i) sum += atomic_read(&cpu_rq(i)->nr_iowait); return sum; @@ -6080,7 +6080,7 @@ void __init sched_init(void) runqueue_t *rq; int i, j, k; - for_each_cpu(i) { + for_each_possible_cpu(i) { prio_array_t *array; rq = cpu_rq(i); -- cgit v1.1 From ec7e15d6486e9d1da1c2f344b670b8388ba7019b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 28 Mar 2006 01:56:55 -0800 Subject: [PATCH] compat_sys_futex() warning fix kernel/futex_compat.c: In function `compat_sys_futex': kernel/futex_compat.c:140: warning: passing arg 1 of `do_futex' makes integer from pointer without a cast kernel/futex_compat.c:140: warning: passing arg 5 of `do_futex' makes integer from pointer without a cast Not sure what Ingo was thinking of here. Put the casts back in. Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex_compat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 9c077cf..54274fc 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -137,5 +137,6 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, if (op >= FUTEX_REQUEUE) val2 = (int) (unsigned long) utime; - return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); + return do_futex((unsigned long)uaddr, op, val, timeout, + (unsigned long)uaddr2, val2, val3); } -- cgit v1.1 From fef23e7fbb11a0a78cd61935f7056bc2b237995a Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Tue, 28 Mar 2006 16:10:58 -0800 Subject: [PATCH] exec: allow init to exec from any thread. After looking at the problem of init calling exec some more I figured out an easy way to make the code work. The actual symptom without out this patch is that all threads will die except pid == 1, and the thread calling exec. The thread calling exec will wait forever for pid == 1 to die. Since pid == 1 does not install a handler for SIGKILL it will never die. This modifies the tests for init from current->pid == 1 to the equivalent current == child_reaper. And then it causes exec in the ugly case to modify child_reaper. The only weird symptom is that you wind up with an init process that doesn't have the oldest start time on the box. Signed-off-by: Eric W. Biederman Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- kernel/signal.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a8c7efc..223a880 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -807,7 +807,7 @@ fastcall NORET_TYPE void do_exit(long code) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); - if (unlikely(tsk->pid == 1)) + if (unlikely(tsk == child_reaper)) panic("Attempted to kill init!"); if (unlikely(current->ptrace & PT_TRACE_EXIT)) { diff --git a/kernel/signal.c b/kernel/signal.c index 75f7341..dc8f91b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1990,7 +1990,7 @@ relock: continue; /* Init gets no signals it doesn't want. */ - if (current->pid == 1) + if (current == child_reaper) continue; if (sig_kernel_stop(signr)) { -- cgit v1.1 From d73d65293e3e2de7e916a89c8da30be0948afab7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 28 Mar 2006 16:11:03 -0800 Subject: [PATCH] pidhash: kill switch_exec_pids switch_exec_pids is only called from de_thread by way of exec, and it is only called when we are exec'ing from a non thread group leader. Currently switch_exec_pids gives the leader the pid of the thread and unhashes and rehashes all of the process groups. The leader is already in the EXIT_DEAD state so no one cares about it's pids. The only concern for the leader is that __unhash_process called from release_task will function correctly. If we don't touch the leader at all we know that __unhash_process will work fine so there is no need to touch the leader. For the task becomming the thread group leader, we just need to give it the pid of the old thread group leader, add it to the task list, and attach it to the session and the process group of the thread group. Currently de_thread is also adding the task to the task list which is just silly. Currently the only leader of __detach_pid besides detach_pid is switch_exec_pids because of the ugly extra work that was being performed. So this patch removes switch_exec_pids because it is doing too much, it is creating an unnecessary special case in pid.c, duing work duplicated in de_thread, and generally obscuring what it is going on. The necessary work is added to de_thread, and it seems to be a little clearer there what is going on. Signed-off-by: Eric W. 
Biederman Cc: Oleg Nesterov Cc: Kirill Korotaev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 1acc072..7781d99 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -218,36 +218,6 @@ task_t *find_task_by_pid_type(int type, int nr) EXPORT_SYMBOL(find_task_by_pid_type); /* - * This function switches the PIDs if a non-leader thread calls - * sys_execve() - this must be done without releasing the PID. - * (which a detach_pid() would eventually do.) - */ -void switch_exec_pids(task_t *leader, task_t *thread) -{ - __detach_pid(leader, PIDTYPE_PID); - __detach_pid(leader, PIDTYPE_TGID); - __detach_pid(leader, PIDTYPE_PGID); - __detach_pid(leader, PIDTYPE_SID); - - __detach_pid(thread, PIDTYPE_PID); - __detach_pid(thread, PIDTYPE_TGID); - - leader->pid = leader->tgid = thread->pid; - thread->pid = thread->tgid; - - attach_pid(thread, PIDTYPE_PID, thread->pid); - attach_pid(thread, PIDTYPE_TGID, thread->tgid); - attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp); - attach_pid(thread, PIDTYPE_SID, thread->signal->session); - list_add_tail(&thread->tasks, &init_task.tasks); - - attach_pid(leader, PIDTYPE_PID, leader->pid); - attach_pid(leader, PIDTYPE_TGID, leader->tgid); - attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp); - attach_pid(leader, PIDTYPE_SID, leader->signal->session); -} - -/* * The pid hash table is scaled according to the amount of memory in the * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or * more. -- cgit v1.1 From d799f03597cabc6112acb518fc8ab4487aa4f953 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:04 -0800 Subject: [PATCH] choose_new_parent: remove unused arg, sanitize exit_state check 'child_reaper' arg is not used in choose_new_parent(). "->exit_state >= EXIT_ZOMBIE" check is a leftover, was valid when EXIT_ZOMBIE lived in ->state var. Signed-off-by: Oleg Nesterov Acked-by: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 223a880..e04a594 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -538,13 +538,13 @@ static void exit_mm(struct task_struct * tsk) mmput(mm); } -static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) +static inline void choose_new_parent(task_t *p, task_t *reaper) { /* * Make sure we're not reparenting to ourselves and that * the parent is not a zombie. 
*/ - BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE); + BUG_ON(p == reaper || reaper->exit_state); p->real_parent = reaper; } @@ -645,7 +645,7 @@ static void forget_original_parent(struct task_struct * father, if (father == p->real_parent) { /* reparent with a reaper, real father it's us */ - choose_new_parent(p, reaper, child_reaper); + choose_new_parent(p, reaper); reparent_thread(p, father, 0); } else { /* reparent ptraced task to its real parent */ @@ -666,7 +666,7 @@ static void forget_original_parent(struct task_struct * father, } list_for_each_safe(_p, _n, &father->ptrace_children) { p = list_entry(_p,struct task_struct,ptrace_list); - choose_new_parent(p, reaper, child_reaper); + choose_new_parent(p, reaper); reparent_thread(p, father, 1); } } -- cgit v1.1 From 8fafabd86f1b75ed3cc6a6ffbe6c3e53e3d8457d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:05 -0800 Subject: [PATCH] remove add_parent()'s parent argument add_parent(p, parent) is always called with parent == p->parent, and it makes no sense to do it differently. This patch removes this argument. No changes in affected .o files. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index e04a594..df26c33 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1281,7 +1281,7 @@ bail_ref: /* move to end of parent's list to avoid starvation */ remove_parent(p); - add_parent(p, p->parent); + add_parent(p); write_unlock_irq(&tasklist_lock); -- cgit v1.1 From 9b678ece42893b53aae5ed7cb8d7cb261cacb72c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:05 -0800 Subject: [PATCH] don't use REMOVE_LINKS/SET_LINKS for reparenting There are places where the kernel uses REMOVE_LINKS/SET_LINKS while changing a process's ->parent. Use add_parent/remove_parent instead; they don't abuse the global process list. Signed-off-by: Oleg Nesterov Cc: "Eric W.
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 ++-- kernel/ptrace.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index df26c33..5b5e8b6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -238,10 +238,10 @@ static void reparent_to_init(void) ptrace_unlink(current); /* Reparent to init */ - REMOVE_LINKS(current); + remove_parent(current); current->parent = child_reaper; current->real_parent = child_reaper; - SET_LINKS(current); + add_parent(current); /* Set the exit signal to SIGCHLD so we signal init on exit */ current->exit_signal = SIGCHLD; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d95a72c..86a7f6c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -35,9 +35,9 @@ void __ptrace_link(task_t *child, task_t *new_parent) if (child->parent == new_parent) return; list_add(&child->ptrace_list, &child->parent->ptrace_children); - REMOVE_LINKS(child); + remove_parent(child); child->parent = new_parent; - SET_LINKS(child); + add_parent(child); } /* @@ -77,9 +77,9 @@ void __ptrace_unlink(task_t *child) child->ptrace = 0; if (!list_empty(&child->ptrace_list)) { list_del_init(&child->ptrace_list); - REMOVE_LINKS(child); + remove_parent(child); child->parent = child->real_parent; - SET_LINKS(child); + add_parent(child); } ptrace_untrace(child); -- cgit v1.1 From c97d98931ac52ef110b62d9b75c6a6f2bfbc1898 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:06 -0800 Subject: [PATCH] kill SET_LINKS/REMOVE_LINKS Both SET_LINKS() and SET_LINKS/REMOVE_LINKS() have exactly one caller, and these callers already check thread_group_leader(). This patch kills theese macros, they mix two different things: setting process's parent and registering it in init_task.tasks list. Callers are updated to do these actions by hand. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 +++- kernel/fork.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 5b5e8b6..f436a6b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -54,11 +54,13 @@ static void __unhash_process(struct task_struct *p) if (thread_group_leader(p)) { detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); + + list_del_init(&p->tasks); if (p->pid) __get_cpu_var(process_counts)--; } - REMOVE_LINKS(p); + remove_parent(p); } void release_task(struct task_struct * p) diff --git a/kernel/fork.c b/kernel/fork.c index c49bd19..74c6762 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1181,7 +1181,7 @@ static task_t *copy_process(unsigned long clone_flags, */ p->ioprio = current->ioprio; - SET_LINKS(p); + add_parent(p); if (unlikely(p->ptrace & PT_PTRACED)) __ptrace_link(p, current->parent); @@ -1191,6 +1191,8 @@ static task_t *copy_process(unsigned long clone_flags, p->signal->session = current->signal->session; attach_pid(p, PIDTYPE_PGID, process_group(p)); attach_pid(p, PIDTYPE_SID, p->signal->session); + + list_add_tail(&p->tasks, &init_task.tasks); if (p->pid) __get_cpu_var(process_counts)++; } -- cgit v1.1 From 73b9ebfe126a4a886ee46cbab637374d7024668a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:07 -0800 Subject: [PATCH] pidhash: don't count idle threads fork_idle() does unhash_process() just after copy_process(). Contrary, boot_cpu's idle thread explicitely registers itself for each pid_type with nr = 0. 
copy_process() already checks p->pid != 0 before process_counts++; I think we can just skip attach_pid() calls and job control inits for idle threads and kill unhash_process(). We don't need to clean up ->proc_dentry in fork_idle() because with this patch idle threads are never hashed in kernel/pid.c:pid_hash[]. We don't need to hash pid == 0 in pidmap_init(). free_pidmap() is never called with pid == 0 arg, so it will never be reused. So it is still possible to use pid == 0 in any PIDTYPE_xxx namespace from kernel/pid.c's POV. However with this patch we don't hash pid == 0 for the PIDTYPE_PID case. We still have PIDTYPE_PGID/PIDTYPE_SID entries with pid == 0: /sbin/init and kernel threads which don't call daemonize(). Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 18 +----------------- kernel/fork.c | 35 ++++++++++++++++++----------------- kernel/pid.c | 10 +--------- 3 files changed, 20 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f436a6b..a94e1c3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -56,8 +56,7 @@ static void __unhash_process(struct task_struct *p) detach_pid(p, PIDTYPE_SID); list_del_init(&p->tasks); - if (p->pid) - __get_cpu_var(process_counts)--; + __get_cpu_var(process_counts)--; } remove_parent(p); @@ -118,21 +117,6 @@ repeat: goto repeat; } -/* we are using it only for SMP init */ - -void unhash_process(struct task_struct *p) -{ - struct dentry *proc_dentry; - - spin_lock(&p->proc_lock); - proc_dentry = proc_pid_unhash(p); - write_lock_irq(&tasklist_lock); - __unhash_process(p); - write_unlock_irq(&tasklist_lock); - spin_unlock(&p->proc_lock); - proc_pid_flush(proc_dentry); -} - /* * This checks not only the pgrp, but falls back on the pid if no * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
I dunno - gdb doesn't work correctly diff --git a/kernel/fork.c b/kernel/fork.c index 74c6762..0c32e28 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1181,25 +1181,26 @@ static task_t *copy_process(unsigned long clone_flags, */ p->ioprio = current->ioprio; - add_parent(p); - if (unlikely(p->ptrace & PT_PTRACED)) - __ptrace_link(p, current->parent); - - if (thread_group_leader(p)) { - p->signal->tty = current->signal->tty; - p->signal->pgrp = process_group(current); - p->signal->session = current->signal->session; - attach_pid(p, PIDTYPE_PGID, process_group(p)); - attach_pid(p, PIDTYPE_SID, p->signal->session); - - list_add_tail(&p->tasks, &init_task.tasks); - if (p->pid) + if (likely(p->pid)) { + add_parent(p); + if (unlikely(p->ptrace & PT_PTRACED)) + __ptrace_link(p, current->parent); + + if (thread_group_leader(p)) { + p->signal->tty = current->signal->tty; + p->signal->pgrp = process_group(current); + p->signal->session = current->signal->session; + attach_pid(p, PIDTYPE_PGID, process_group(p)); + attach_pid(p, PIDTYPE_SID, p->signal->session); + + list_add_tail(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; + } + attach_pid(p, PIDTYPE_TGID, p->tgid); + attach_pid(p, PIDTYPE_PID, p->pid); + nr_threads++; } - attach_pid(p, PIDTYPE_TGID, p->tgid); - attach_pid(p, PIDTYPE_PID, p->pid); - nr_threads++; total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); @@ -1263,7 +1264,7 @@ task_t * __devinit fork_idle(int cpu) if (!task) return ERR_PTR(-ENOMEM); init_idle(task, cpu); - unhash_process(task); + return task; } diff --git a/kernel/pid.c b/kernel/pid.c index 7781d99..a9f2dfd 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -247,16 +247,8 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - int i; - pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); + /* Reserve PID 0. We never call free_pidmap(0) */ set_bit(0, pidmap_array->page); atomic_dec(&pidmap_array->nr_free); - - /* - * Allocate PID 0, and hash it via all PID types: - */ - - for (i = 0; i < PIDTYPE_MAX; i++) - attach_pid(current, i, 0); } -- cgit v1.1 From 6ac781b11ade6e3451f6b460991c8b2b87e58280 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:09 -0800 Subject: [PATCH] reparent_thread: use remove_parent/add_parent Use remove_parent/add_parent instead of open coding. No changes in kernel/exit.o Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a94e1c3..98eec59 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -555,9 +555,9 @@ static void reparent_thread(task_t *p, task_t *father, int traced) * anyway, so let go of it. */ p->ptrace = 0; - list_del_init(&p->sibling); + remove_parent(p); p->parent = p->real_parent; - list_add_tail(&p->sibling, &p->parent->children); + add_parent(p); /* If we'd notified the old parent about this child's death, * also notify the new parent. -- cgit v1.1 From 8292d633add73d40eda1d26089e2fc758944ac7c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:10 -0800 Subject: [PATCH] wait_for_helper: trivial style cleanup Use NULL instead of (... 
*)0 Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 51a8920..20a997c 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -170,7 +170,7 @@ static int wait_for_helper(void *data) sa.sa.sa_handler = SIG_IGN; sa.sa.sa_flags = 0; siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD)); - do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0); + do_sigaction(SIGCHLD, &sa, NULL); allow_signal(SIGCHLD); pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); -- cgit v1.1 From 1f09f9749cdde4e69f95d62d96d2e03f50b3353c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:11 -0800 Subject: [PATCH] release_task: replace open-coded ptrace_unlink() Use ptrace_unlink() instead of open-coding. No changes in kernel/exit.o Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 98eec59..77c35ef 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -68,13 +68,12 @@ void release_task(struct task_struct * p) task_t *leader; struct dentry *proc_dentry; -repeat: +repeat: atomic_dec(&p->user->processes); spin_lock(&p->proc_lock); proc_dentry = proc_pid_unhash(p); write_lock_irq(&tasklist_lock); - if (unlikely(p->ptrace)) - __ptrace_unlink(p); + ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); __exit_signal(p); /* -- cgit v1.1 From aa1757f90bea3f598b6e5d04d922a6a60200f1da Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:12 -0800 Subject: [PATCH] convert sighand_cache to use SLAB_DESTROY_BY_RCU This patch borrows a clever trick from Hugh's 'struct anon_vma' code. Without tasklist_lock held we can't trust task->sighand until we have locked it and re-checked that it is still the same. But this means we don't need to defer 'kmem_cache_free(sighand)'. We can return the memory to the slab immediately; all we need is to be sure that sighand->siglock can't disappear inside an RCU-protected section. To do so we need to initialize ->siglock inside the ctor function; SLAB_DESTROY_BY_RCU does the rest.
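To make the borrowed trick concrete, here is a schematic reader for such a slab. This is a sketch of the pattern, not code from the patch (the next patch below, f63ee72e, adds the real helper as lock_task_sighand()). It assumes the caller is inside rcu_read_lock(), so the memory behind any pointer it loads stays type-stable and ->siglock is always a live, ctor-initialized spinlock even if the object has been freed:

static struct sighand_struct *lock_sighand_sketch(struct task_struct *tsk,
						  unsigned long *flags)
{
	struct sighand_struct *sighand;

	for (;;) {
		sighand = rcu_dereference(tsk->sighand);
		if (sighand == NULL)
			break;		/* task already released it */

		spin_lock_irqsave(&sighand->siglock, *flags);
		if (likely(sighand == tsk->sighand))
			break;		/* still ours: return it locked */

		/* The struct was freed and recycled for another task while
		 * we were taking the lock. It is still valid memory, but
		 * no longer our sighand -- unlock and retry. */
		spin_unlock_irqrestore(&sighand->siglock, *flags);
	}
	return sighand;
}

Because release_task() clears ->sighand and drops its reference with ->siglock held, winning this re-check pins the structure for as long as the lock is held.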
Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 21 +++++++++++---------- kernel/signal.c | 2 +- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 0c32e28..33ffb5b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -786,14 +786,6 @@ int unshare_files(void) EXPORT_SYMBOL(unshare_files); -void sighand_free_cb(struct rcu_head *rhp) -{ - struct sighand_struct *sp; - - sp = container_of(rhp, struct sighand_struct, rcu); - kmem_cache_free(sighand_cachep, sp); -} - static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) { struct sighand_struct *sig; @@ -806,7 +798,6 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t rcu_assign_pointer(tsk->sighand, sig); if (!sig) return -ENOMEM; - spin_lock_init(&sig->siglock); atomic_set(&sig->count, 1); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); return 0; @@ -1356,11 +1347,21 @@ long do_fork(unsigned long clone_flags, #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif +static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) +{ + struct sighand_struct *sighand = data; + + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + spin_lock_init(&sighand->siglock); +} + void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, + sighand_ctor, NULL); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); diff --git a/kernel/signal.c b/kernel/signal.c index dc8f91b..b0b1ca9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -330,7 +330,7 @@ void __exit_sighand(struct task_struct *tsk) /* Ok, we're done with the signal handlers */ tsk->sighand = NULL; if (atomic_dec_and_test(&sighand->count)) - sighand_free(sighand); + kmem_cache_free(sighand_cachep, sighand); } void exit_sighand(struct task_struct *tsk) -- cgit v1.1 From f63ee72e0fb82e504a0489490babc7612c7cd6c2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:13 -0800 Subject: [PATCH] introduce lock_task_sighand() helper Add a lock_task_sighand() helper and convert group_send_sig_info() to use it. Hopefully we will have more users soon. This patch also removes the '!sighand->count' and '!p->usage' checks; I think they are both bogus, racy and unneeded (but probably it makes sense to restore them as BUG_ON()s). ->sighand is cleared and its ->count is decremented in release_task() with sighand->siglock held, so it is a bug to have '!p->usage || !->count' after we already locked and verified it is the same. On the other hand, an already dead task without ->sighand can have a non-zero ->usage due to ptrace, for example. If we read the stale value of ->sighand we must see the change after spin_lock(), because that change was done while holding that same old ->sighand.siglock. Signed-off-by: Oleg Nesterov Cc: "Eric W.
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index b0b1ca9..819fa49 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1120,27 +1120,37 @@ void zap_other_threads(struct task_struct *p) /* * Must be called under rcu_read_lock() or with tasklist_lock read-held. */ +struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) +{ + struct sighand_struct *sighand; + + for (;;) { + sighand = rcu_dereference(tsk->sighand); + if (unlikely(sighand == NULL)) + break; + + spin_lock_irqsave(&sighand->siglock, *flags); + if (likely(sighand == tsk->sighand)) + break; + spin_unlock_irqrestore(&sighand->siglock, *flags); + } + + return sighand; +} + int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { unsigned long flags; - struct sighand_struct *sp; int ret; -retry: ret = check_kill_permission(sig, info, p); - if (!ret && sig && (sp = rcu_dereference(p->sighand))) { - spin_lock_irqsave(&sp->siglock, flags); - if (p->sighand != sp) { - spin_unlock_irqrestore(&sp->siglock, flags); - goto retry; - } - if ((atomic_read(&sp->count) == 0) || - (atomic_read(&p->usage) == 0)) { - spin_unlock_irqrestore(&sp->siglock, flags); - return -ESRCH; + + if (!ret && sig) { + ret = -ESRCH; + if (lock_task_sighand(p, &flags)) { + ret = __group_send_sig_info(sig, info, p); + unlock_task_sighand(p, &flags); } - ret = __group_send_sig_info(sig, info, p); - spin_unlock_irqrestore(&sp->siglock, flags); } return ret; -- cgit v1.1 From a9e88e84b5245da0a1dadb6ccca70ae84e93ccf6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:14 -0800 Subject: [PATCH] introduce sig_needs_tasklist() helper In my opinion this patch cleans up the code. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 819fa49..c5b65aa 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -147,6 +147,9 @@ static kmem_cache_t *sigqueue_cachep; #define sig_kernel_stop(sig) \ (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) +#define sig_needs_tasklist(sig) \ + (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK | M(SIGCONT))) + #define sig_user_defined(t, signr) \ (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) @@ -1199,7 +1202,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) struct task_struct *p; rcu_read_lock(); - if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) { + if (unlikely(sig_needs_tasklist(sig))) { read_lock(&tasklist_lock); acquired_tasklist_lock = 1; } -- cgit v1.1 From 7001510d0cbf51ad202dd2d0744f54104285cbb9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:14 -0800 Subject: [PATCH] copy_process: cleanup bad_fork_cleanup_sighand The only caller of exit_sighand(tsk) is copy_process's error path. We can call __exit_sighand() directly and kill exit_sighand(). This 'tsk' was not yet registered in pid_hash[] or init_task.tasks, it has no external references, nobody can see it, and IF (clone_flags & CLONE_SIGHAND) At least 'current' has a reference to ->sighand, this means atomic_dec_and_test(sighand->count) can't be true. 
ELSE Nobody can see this ->sighand, this means we can free it without any locking. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Acked-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 ++- kernel/signal.c | 14 -------------- 2 files changed, 2 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 33ffb5b..8a46ad5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1208,7 +1208,8 @@ bad_fork_cleanup_mm: bad_fork_cleanup_signal: exit_signal(p); bad_fork_cleanup_sighand: - exit_sighand(p); + if (p->sighand) + __exit_sighand(p); bad_fork_cleanup_fs: exit_fs(p); /* blocking */ bad_fork_cleanup_files: diff --git a/kernel/signal.c b/kernel/signal.c index c5b65aa..1d7f446 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -336,20 +336,6 @@ void __exit_sighand(struct task_struct *tsk) kmem_cache_free(sighand_cachep, sighand); } -void exit_sighand(struct task_struct *tsk) -{ - write_lock_irq(&tasklist_lock); - rcu_read_lock(); - if (tsk->sighand != NULL) { - struct sighand_struct *sighand = rcu_dereference(tsk->sighand); - spin_lock(&sighand->siglock); - __exit_sighand(tsk); - spin_unlock(&sighand->siglock); - } - rcu_read_unlock(); - write_unlock_irq(&tasklist_lock); -} - /* * This function expects the tasklist_lock write-locked. */ -- cgit v1.1 From 6b3934ef52712ece50605dfc72e55d00c580831a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:16 -0800 Subject: [PATCH] copy_process: cleanup bad_fork_cleanup_signal __exit_signal() does important cleanups atomically under ->siglock. It is also called from copy_process's error path. This is not good, for example we can't move __unhash_process() under ->siglock for that reason. We should not mix these 2 paths, just look at ugly 'if (p->sighand)' under 'bad_fork_cleanup_sighand:' label. For copy_process() case it is sufficient to just backout copy_signal(), nothing more. Again, nobody can see this task yet. For CLONE_THREAD case we just decrement signal->count, otherwise nobody can see this ->signal and we can free it lockless. This patch assumes it is safe to do exit_thread_group_keys() without tasklist_lock. Signed-off-by: Oleg Nesterov Cc: "Eric W. 
Biederman" Acked-by: David Howells Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 23 +++++++++++++++++++---- kernel/signal.c | 15 +-------------- 2 files changed, 20 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8a46ad5..0aff28c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -84,7 +84,7 @@ static kmem_cache_t *task_struct_cachep; #endif /* SLAB cache for signal_struct structures (tsk->signal) */ -kmem_cache_t *signal_cachep; +static kmem_cache_t *signal_cachep; /* SLAB cache for sighand_struct structures (tsk->sighand) */ kmem_cache_t *sighand_cachep; @@ -872,6 +872,22 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts return 0; } +void __cleanup_signal(struct signal_struct *sig) +{ + exit_thread_group_keys(sig); + kmem_cache_free(signal_cachep, sig); +} + +static inline void cleanup_signal(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + + atomic_dec(&sig->live); + + if (atomic_dec_and_test(&sig->count)) + __cleanup_signal(sig); +} + static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; @@ -1206,10 +1222,9 @@ bad_fork_cleanup_mm: if (p->mm) mmput(p->mm); bad_fork_cleanup_signal: - exit_signal(p); + cleanup_signal(p); bad_fork_cleanup_sighand: - if (p->sighand) - __exit_sighand(p); + __exit_sighand(p); bad_fork_cleanup_fs: exit_fs(p); /* blocking */ bad_fork_cleanup_files: diff --git a/kernel/signal.c b/kernel/signal.c index 1d7f446..54e9ef6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -395,23 +395,10 @@ void __exit_signal(struct task_struct *tsk) clear_tsk_thread_flag(tsk,TIF_SIGPENDING); flush_sigqueue(&tsk->pending); if (sig) { - /* - * We are cleaning up the signal_struct here. - */ - exit_thread_group_keys(sig); - kmem_cache_free(signal_cachep, sig); + __cleanup_signal(sig); } } -void exit_signal(struct task_struct *tsk) -{ - atomic_dec(&tsk->signal->live); - - write_lock_irq(&tasklist_lock); - __exit_signal(tsk); - write_unlock_irq(&tasklist_lock); -} - /* * Flush all handlers for a task. */ -- cgit v1.1 From 29ff471234d53c7235db287bc52f91884c2977c6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:17 -0800 Subject: [PATCH] cleanup __exit_signal() This patch factors out duplicated code under 'if' branches. Also, BUG_ON() conversions and whitespace cleanups. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Acked-by: "Paul E. 
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 54e9ef6..ca1fa85 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -341,24 +341,20 @@ void __exit_sighand(struct task_struct *tsk) */ void __exit_signal(struct task_struct *tsk) { - struct signal_struct * sig = tsk->signal; - struct sighand_struct * sighand; + struct signal_struct *sig = tsk->signal; + struct sighand_struct *sighand; + + BUG_ON(!sig); + BUG_ON(!atomic_read(&sig->count)); - if (!sig) - BUG(); - if (!atomic_read(&sig->count)) - BUG(); rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); + posix_cpu_timers_exit(tsk); - if (atomic_dec_and_test(&sig->count)) { + if (atomic_dec_and_test(&sig->count)) posix_cpu_timers_exit_group(tsk); - tsk->signal = NULL; - __exit_sighand(tsk); - spin_unlock(&sighand->siglock); - flush_sigqueue(&sig->shared_pending); - } else { + else { /* * If there is any task waiting for the group exit * then notify it: @@ -369,7 +365,6 @@ void __exit_signal(struct task_struct *tsk) } if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); - tsk->signal = NULL; /* * Accumulate here the counters for all threads but the * group leader as they die, so they can be added into @@ -387,14 +382,18 @@ void __exit_signal(struct task_struct *tsk) sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->sched_time += tsk->sched_time; - __exit_sighand(tsk); - spin_unlock(&sighand->siglock); - sig = NULL; /* Marker for below. */ + sig = NULL; /* Marker for below. */ } + + tsk->signal = NULL; + __exit_sighand(tsk); + spin_unlock(&sighand->siglock); rcu_read_unlock(); + clear_tsk_thread_flag(tsk,TIF_SIGPENDING); flush_sigqueue(&tsk->pending); if (sig) { + flush_sigqueue(&sig->shared_pending); __cleanup_signal(sig); } } -- cgit v1.1 From c81addc9d3a0ebff2155e0cd86f90820ab97147e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:17 -0800 Subject: [PATCH] rename __exit_sighand to cleanup_sighand Cosmetic, rename __exit_sighand to cleanup_sighand and move it close to copy_sighand(). This matches copy_signal/cleanup_signal naming, and I think it is easier to follow. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Acked-by: "Paul E. 
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 12 +++++++++++- kernel/signal.c | 19 ++----------------- 2 files changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 0aff28c..12cdd9f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -803,6 +803,16 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t return 0; } +void cleanup_sighand(struct task_struct *tsk) +{ + struct sighand_struct * sighand = tsk->sighand; + + /* Ok, we're done with the signal handlers */ + tsk->sighand = NULL; + if (atomic_dec_and_test(&sighand->count)) + kmem_cache_free(sighand_cachep, sighand); +} + static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk) { struct signal_struct *sig; @@ -1224,7 +1234,7 @@ bad_fork_cleanup_mm: bad_fork_cleanup_signal: cleanup_signal(p); bad_fork_cleanup_sighand: - __exit_sighand(p); + cleanup_sighand(p); bad_fork_cleanup_fs: exit_fs(p); /* blocking */ bad_fork_cleanup_files: diff --git a/kernel/signal.c b/kernel/signal.c index ca1fa85..b29c868 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -310,9 +310,7 @@ static void flush_sigqueue(struct sigpending *queue) /* * Flush all pending signals for a task. */ - -void -flush_signals(struct task_struct *t) +void flush_signals(struct task_struct *t) { unsigned long flags; @@ -326,19 +324,6 @@ flush_signals(struct task_struct *t) /* * This function expects the tasklist_lock write-locked. */ -void __exit_sighand(struct task_struct *tsk) -{ - struct sighand_struct * sighand = tsk->sighand; - - /* Ok, we're done with the signal handlers */ - tsk->sighand = NULL; - if (atomic_dec_and_test(&sighand->count)) - kmem_cache_free(sighand_cachep, sighand); -} - -/* - * This function expects the tasklist_lock write-locked. - */ void __exit_signal(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; @@ -386,7 +371,7 @@ void __exit_signal(struct task_struct *tsk) } tsk->signal = NULL; - __exit_sighand(tsk); + cleanup_sighand(tsk); spin_unlock(&sighand->siglock); rcu_read_unlock(); -- cgit v1.1 From 6a14c5c9da0b4c34b5be783403c54f0396fcfe77 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:18 -0800 Subject: [PATCH] move __exit_signal() to kernel/exit.c __exit_signal() is private to release_task() now. I think it is better to make it static in kernel/exit.c and export flush_sigqueue() instead - this function is much more simple and straightforward. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/signal.c | 65 +-------------------------------------------------------- 2 files changed, 64 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 77c35ef..3823ec8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -62,6 +63,68 @@ static void __unhash_process(struct task_struct *p) remove_parent(p); } +/* + * This function expects the tasklist_lock write-locked. 
+ */ +static void __exit_signal(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + struct sighand_struct *sighand; + + BUG_ON(!sig); + BUG_ON(!atomic_read(&sig->count)); + + rcu_read_lock(); + sighand = rcu_dereference(tsk->sighand); + spin_lock(&sighand->siglock); + + posix_cpu_timers_exit(tsk); + if (atomic_dec_and_test(&sig->count)) + posix_cpu_timers_exit_group(tsk); + else { + /* + * If there is any task waiting for the group exit + * then notify it: + */ + if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) { + wake_up_process(sig->group_exit_task); + sig->group_exit_task = NULL; + } + if (tsk == sig->curr_target) + sig->curr_target = next_thread(tsk); + /* + * Accumulate here the counters for all threads but the + * group leader as they die, so they can be added into + * the process-wide totals when those are taken. + * The group leader stays around as a zombie as long + * as there are other threads. When it gets reaped, + * the exit.c code will add its counts into these totals. + * We won't ever get here for the group leader, since it + * will have been the last reference on the signal_struct. + */ + sig->utime = cputime_add(sig->utime, tsk->utime); + sig->stime = cputime_add(sig->stime, tsk->stime); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->sched_time += tsk->sched_time; + sig = NULL; /* Marker for below. */ + } + + tsk->signal = NULL; + cleanup_sighand(tsk); + spin_unlock(&sighand->siglock); + rcu_read_unlock(); + + clear_tsk_thread_flag(tsk,TIF_SIGPENDING); + flush_sigqueue(&tsk->pending); + if (sig) { + flush_sigqueue(&sig->shared_pending); + __cleanup_signal(sig); + } +} + void release_task(struct task_struct * p) { int zap_leader; diff --git a/kernel/signal.c b/kernel/signal.c index b29c868..6ea49f7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -295,7 +294,7 @@ static void __sigqueue_free(struct sigqueue *q) kmem_cache_free(sigqueue_cachep, q); } -static void flush_sigqueue(struct sigpending *queue) +void flush_sigqueue(struct sigpending *queue) { struct sigqueue *q; @@ -322,68 +321,6 @@ void flush_signals(struct task_struct *t) } /* - * This function expects the tasklist_lock write-locked. - */ -void __exit_signal(struct task_struct *tsk) -{ - struct signal_struct *sig = tsk->signal; - struct sighand_struct *sighand; - - BUG_ON(!sig); - BUG_ON(!atomic_read(&sig->count)); - - rcu_read_lock(); - sighand = rcu_dereference(tsk->sighand); - spin_lock(&sighand->siglock); - - posix_cpu_timers_exit(tsk); - if (atomic_dec_and_test(&sig->count)) - posix_cpu_timers_exit_group(tsk); - else { - /* - * If there is any task waiting for the group exit - * then notify it: - */ - if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) { - wake_up_process(sig->group_exit_task); - sig->group_exit_task = NULL; - } - if (tsk == sig->curr_target) - sig->curr_target = next_thread(tsk); - /* - * Accumulate here the counters for all threads but the - * group leader as they die, so they can be added into - * the process-wide totals when those are taken. - * The group leader stays around as a zombie as long - * as there are other threads. When it gets reaped, - * the exit.c code will add its counts into these totals. - * We won't ever get here for the group leader, since it - * will have been the last reference on the signal_struct. 
- */ - sig->utime = cputime_add(sig->utime, tsk->utime); - sig->stime = cputime_add(sig->stime, tsk->stime); - sig->min_flt += tsk->min_flt; - sig->maj_flt += tsk->maj_flt; - sig->nvcsw += tsk->nvcsw; - sig->nivcsw += tsk->nivcsw; - sig->sched_time += tsk->sched_time; - sig = NULL; /* Marker for below. */ - } - - tsk->signal = NULL; - cleanup_sighand(tsk); - spin_unlock(&sighand->siglock); - rcu_read_unlock(); - - clear_tsk_thread_flag(tsk,TIF_SIGPENDING); - flush_sigqueue(&tsk->pending); - if (sig) { - flush_sigqueue(&sig->shared_pending); - __cleanup_signal(sig); - } -} - -/* * Flush all handlers for a task. */ -- cgit v1.1 From 35f5cad8c4bab94ecc5acdc4055df5ea12dc76f8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:19 -0800 Subject: [PATCH] revert "Optimize sys_times for a single thread process" This patch reverts 'CONFIG_SMP && thread_group_empty()' optimization in sys_times(). The reason is that the next patch breaks memory ordering which is needed for that optimization. tasklist_lock in sys_times() will be eliminated completely by further patch. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 +---- kernel/sys.c | 86 ++++++++++++++++++----------------------------------------- 2 files changed, 27 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 3823ec8..6b2e4cf 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -139,11 +139,7 @@ repeat: ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); __exit_signal(p); - /* - * Note that the fastpath in sys_times depends on __exit_signal having - * updated the counters before a task is removed from the tasklist of - * the process by __unhash_process. - */ + __unhash_process(p); /* diff --git a/kernel/sys.c b/kernel/sys.c index c93d37f..84371fd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1202,69 +1202,35 @@ asmlinkage long sys_times(struct tms __user * tbuf) */ if (tbuf) { struct tms tmp; + struct task_struct *tsk = current; + struct task_struct *t; cputime_t utime, stime, cutime, cstime; -#ifdef CONFIG_SMP - if (thread_group_empty(current)) { - /* - * Single thread case without the use of any locks. - * - * We may race with release_task if two threads are - * executing. However, release task first adds up the - * counters (__exit_signal) before removing the task - * from the process tasklist (__unhash_process). - * __exit_signal also acquires and releases the - * siglock which results in the proper memory ordering - * so that the list modifications are always visible - * after the counters have been updated. - * - * If the counters have been updated by the second thread - * but the thread has not yet been removed from the list - * then the other branch will be executing which will - * block on tasklist_lock until the exit handling of the - * other task is finished. - * - * This also implies that the sighand->siglock cannot - * be held by another processor. So we can also - * skip acquiring that lock. 
- */ - utime = cputime_add(current->signal->utime, current->utime); - stime = cputime_add(current->signal->utime, current->stime); - cutime = current->signal->cutime; - cstime = current->signal->cstime; - } else -#endif - { - - /* Process with multiple threads */ - struct task_struct *tsk = current; - struct task_struct *t; - - read_lock(&tasklist_lock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); + read_lock(&tasklist_lock); + utime = tsk->signal->utime; + stime = tsk->signal->stime; + t = tsk; + do { + utime = cputime_add(utime, t->utime); + stime = cputime_add(stime, t->stime); + t = next_thread(t); + } while (t != tsk); + + /* + * While we have tasklist_lock read-locked, no dying thread + * can be updating current->signal->[us]time. Instead, + * we got their counts included in the live thread loop. + * However, another thread can come in right now and + * do a wait call that updates current->signal->c[us]time. + * To make sure we always see that pair updated atomically, + * we take the siglock around fetching them. + */ + spin_lock_irq(&tsk->sighand->siglock); + cutime = tsk->signal->cutime; + cstime = tsk->signal->cstime; + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); - /* - * While we have tasklist_lock read-locked, no dying thread - * can be updating current->signal->[us]time. Instead, - * we got their counts included in the live thread loop. - * However, another thread can come in right now and - * do a wait call that updates current->signal->c[us]time. - * To make sure we always see that pair updated atomically, - * we take the siglock around fetching them. - */ - spin_lock_irq(&tsk->sighand->siglock); - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); - } tmp.tms_utime = cputime_to_clock_t(utime); tmp.tms_stime = cputime_to_clock_t(stime); tmp.tms_cutime = cputime_to_clock_t(cutime); -- cgit v1.1 From 5876700cd399112ecfa70df36203c8c6660d84f8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:20 -0800 Subject: [PATCH] do __unhash_process() under ->siglock This patch moves __unhash_process() call from realease_task() to __exit_signal(), so __detach_pid() is called with ->siglock held. This means we don't need tasklist_lock to iterate over thread group anymore: copy_process() was already changed to do attach_pid() under ->siglock. Eric's "pidhash-kill-switch_exec_pids.patch" from -mm changed de_thread() so it doesn't touch PIDTYPE_TGID. NOTE: de_thread() still needs some attention. It still changes task->pid lockless. Taking ->sighand.siglock here allows to do more tasklist_lock removals. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 6b2e4cf..44d6c6e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -112,6 +112,8 @@ static void __exit_signal(struct task_struct *tsk) sig = NULL; /* Marker for below. 
*/ } + __unhash_process(tsk); + tsk->signal = NULL; cleanup_sighand(tsk); spin_unlock(&sighand->siglock); @@ -140,8 +142,6 @@ repeat: BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); __exit_signal(p); - __unhash_process(p); - /* * If we are the last non-leader member of the thread * group, and the leader is zombie, then notify the -- cgit v1.1 From 7d7185c818925ba5fe90efa75840d0b415032774 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:21 -0800 Subject: [PATCH] sys_times: don't take tasklist_lock sys_times: don't take tasklist_lock Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 84371fd..7ef7f60 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1206,7 +1206,7 @@ asmlinkage long sys_times(struct tms __user * tbuf) struct task_struct *t; cputime_t utime, stime, cutime, cstime; - read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); utime = tsk->signal->utime; stime = tsk->signal->stime; t = tsk; @@ -1216,20 +1216,9 @@ asmlinkage long sys_times(struct tms __user * tbuf) t = next_thread(t); } while (t != tsk); - /* - * While we have tasklist_lock read-locked, no dying thread - * can be updating current->signal->[us]time. Instead, - * we got their counts included in the live thread loop. - * However, another thread can come in right now and - * do a wait call that updates current->signal->c[us]time. - * To make sure we always see that pair updated atomically, - * we take the siglock around fetching them. - */ - spin_lock_irq(&tsk->sighand->siglock); cutime = tsk->signal->cutime; cstime = tsk->signal->cstime; spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); tmp.tms_utime = cputime_to_clock_t(utime); tmp.tms_stime = cputime_to_clock_t(stime); -- cgit v1.1 From 6108ccd3e2f3012d5eec582e0af4d75e693824da Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:22 -0800 Subject: [PATCH] relax sig_needs_tasklist() handle_stop_signal() does not need tasklist_lock for SIG_KERNEL_STOP_MASK signals anymore. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 6ea49f7..e99ec2f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -146,8 +146,7 @@ static kmem_cache_t *sigqueue_cachep; #define sig_kernel_stop(sig) \ (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK)) -#define sig_needs_tasklist(sig) \ - (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK | M(SIGCONT))) +#define sig_needs_tasklist(sig) ((sig) == SIGCONT) #define sig_user_defined(t, signr) \ (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ -- cgit v1.1 From a122b341b74c08020f6521b615acca6a692aac79 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:22 -0800 Subject: [PATCH] do_signal_stop: don't take tasklist_lock do_signal_stop() does not need tasklist_lock anymore. So it does not need to do misc re-checks, and we can simplify the code. 
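These patches converge on one rule: ->siglock alone is enough to walk the
thread group. For illustration, after the sys_times() patch above the whole
accounting section runs in a single siglock critical section (abridged from
the diffs; a reading aid, not the verbatim source file):

	spin_lock_irq(&tsk->sighand->siglock);
	utime = tsk->signal->utime;
	stime = tsk->signal->stime;
	t = tsk;
	do {
		/* live threads' times; dead threads already folded their
		 * counts into signal_struct in __exit_signal() */
		utime = cputime_add(utime, t->utime);
		stime = cputime_add(stime, t->stime);
		t = next_thread(t);
	} while (t != tsk);
	cutime = tsk->signal->cutime;
	cstime = tsk->signal->cstime;
	spin_unlock_irq(&tsk->sighand->siglock);

do_signal_stop() relies on the same rule, as the diff below shows.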
Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 69 ++++++++++++++------------------------------------------- 1 file changed, 17 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e99ec2f..e3bdec9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1682,8 +1682,7 @@ out: * Returns nonzero if we've actually stopped and released the siglock. * Returns zero if we didn't stop and still hold the siglock. */ -static int -do_signal_stop(int signr) +static int do_signal_stop(int signr) { struct signal_struct *sig = current->signal; struct sighand_struct *sighand = current->sighand; @@ -1703,7 +1702,6 @@ do_signal_stop(int signr) set_current_state(TASK_STOPPED); if (stop_count == 0) sig->flags = SIGNAL_STOP_STOPPED; - spin_unlock_irq(&sighand->siglock); } else if (thread_group_empty(current)) { /* @@ -1712,71 +1710,38 @@ do_signal_stop(int signr) current->exit_code = current->signal->group_exit_code = signr; set_current_state(TASK_STOPPED); sig->flags = SIGNAL_STOP_STOPPED; - spin_unlock_irq(&sighand->siglock); } else { /* + * (sig->group_stop_count == 0) * There is no group stop already in progress. - * We must initiate one now, but that requires - * dropping siglock to get both the tasklist lock - * and siglock again in the proper order. Note that - * this allows an intervening SIGCONT to be posted. - * We need to check for that and bail out if necessary. + * We must initiate one now. */ struct task_struct *t; - spin_unlock_irq(&sighand->siglock); - - /* signals can be posted during this window */ - - read_lock(&tasklist_lock); - spin_lock_irq(&sighand->siglock); + current->exit_code = signr; + sig->group_exit_code = signr; - if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) { + stop_count = 0; + for (t = next_thread(current); t != current; t = next_thread(t)) /* - * Another stop or continue happened while we - * didn't have the lock. We can just swallow this - * signal now. If we raced with a SIGCONT, that - * should have just cleared it now. If we raced - * with another processor delivering a stop signal, - * then the SIGCONT that wakes us up should clear it. + * Setting state to TASK_STOPPED for a group + * stop is always done with the siglock held, + * so this check has no races. */ - read_unlock(&tasklist_lock); - return 0; - } - - if (sig->group_stop_count == 0) { - sig->group_exit_code = signr; - stop_count = 0; - for (t = next_thread(current); t != current; - t = next_thread(t)) - /* - * Setting state to TASK_STOPPED for a group - * stop is always done with the siglock held, - * so this check has no races. - */ - if (!t->exit_state && - !(t->state & (TASK_STOPPED|TASK_TRACED))) { - stop_count++; - signal_wake_up(t, 0); - } - sig->group_stop_count = stop_count; - } - else { - /* A race with another thread while unlocked. 
*/ - signr = sig->group_exit_code; - stop_count = --sig->group_stop_count; - } + if (!t->exit_state && + !(t->state & (TASK_STOPPED|TASK_TRACED))) { + stop_count++; + signal_wake_up(t, 0); + } + sig->group_stop_count = stop_count; - current->exit_code = signr; set_current_state(TASK_STOPPED); if (stop_count == 0) sig->flags = SIGNAL_STOP_STOPPED; - - spin_unlock_irq(&sighand->siglock); - read_unlock(&tasklist_lock); } + spin_unlock_irq(&sighand->siglock); finish_stop(stop_count); return 1; } -- cgit v1.1 From aacc90944d4b1f2fcec73a8127eb60a3a701ce1c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:23 -0800 Subject: [PATCH] do_group_exit: don't take tasklist_lock do_group_exit() takes tasklist_lock for zap_other_threads(), this is unneeded now. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 44d6c6e..aea23e7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -985,7 +985,6 @@ do_group_exit(int exit_code) else if (!thread_group_empty(current)) { struct signal_struct *const sig = current->signal; struct sighand_struct *const sighand = current->sighand; - read_lock(&tasklist_lock); spin_lock_irq(&sighand->siglock); if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ @@ -995,7 +994,6 @@ do_group_exit(int exit_code) zap_other_threads(current); } spin_unlock_irq(&sighand->siglock); - read_unlock(&tasklist_lock); } do_exit(exit_code); -- cgit v1.1 From 88531f725bd52e37a7be726860e4ff3f09031d89 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:24 -0800 Subject: [PATCH] do_sigaction: don't take tasklist_lock do_sigaction() does not need tasklist_lock anymore, we can simplify the code. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e3bdec9..2dfaa50 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2301,8 +2301,7 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo) return kill_proc_info(sig, &info, pid); } -int -do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) +int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct k_sigaction *k; sigset_t mask; @@ -2328,6 +2327,7 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) if (act) { sigdelsetmask(&act->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + *k = *act; /* * POSIX 3.3.1.3: * "Setting a signal action to SIG_IGN for a signal that is @@ -2340,19 +2340,8 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) * be discarded, whether or not it is blocked" */ if (act->sa.sa_handler == SIG_IGN || - (act->sa.sa_handler == SIG_DFL && - sig_kernel_ignore(sig))) { - /* - * This is a fairly rare case, so we only take the - * tasklist_lock once we're sure we'll need it. - * Now we must do this little unlock and relock - * dance to maintain the lock hierarchy. 
- */
+ (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) {
 struct task_struct *t = current;
- spin_unlock_irq(&t->sighand->siglock);
- read_lock(&tasklist_lock);
- spin_lock_irq(&t->sighand->siglock);
- *k = *act;
 sigemptyset(&mask);
 sigaddset(&mask, sig);
 rm_from_queue_full(&mask, &t->signal->shared_pending);
@@ -2361,12 +2350,7 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 recalc_sigpending_tsk(t);
 t = next_thread(t);
 } while (t != current);
- spin_unlock_irq(&current->sighand->siglock);
- read_unlock(&tasklist_lock);
- return 0;
 }
-
- *k = *act;
 }

 spin_unlock_irq(&current->sighand->siglock);
-- cgit v1.1

From 47e65328a7b1cdfc4e3102e50d60faf94ebba7d3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 28 Mar 2006 16:11:25 -0800
Subject: [PATCH] pids: kill PIDTYPE_TGID

This patch kills the PIDTYPE_TGID pid_type, thus saving one hash table in
kernel/pid.c and speeding up subthread create/destroy a bit. It is also a
preparation for the further tref/pids rework.

This patch adds 'struct list_head thread_group' to 'struct task_struct'
instead.

We don't detach the group leader from the PIDTYPE_PID namespace until
another thread inherits its ->pid == ->tgid, so we are safe wrt a
premature free_pidmap(->tgid) call.

Currently there are no users of find_task_by_pid_type(PIDTYPE_TGID).
Should the need arise, we can use find_task_by_pid()->group_leader.

Signed-off-by: Oleg Nesterov
Acked-By: Eric Biederman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/exit.c | 10 +---------
 kernel/fork.c | 4 +++-
 2 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index aea23e7..22399ca 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,7 +51,6 @@ static void __unhash_process(struct task_struct *p)
 {
 nr_threads--;
 detach_pid(p, PIDTYPE_PID);
- detach_pid(p, PIDTYPE_TGID);
 if (thread_group_leader(p)) {
 detach_pid(p, PIDTYPE_PGID);
 detach_pid(p, PIDTYPE_SID);
@@ -59,7 +58,7 @@ static void __unhash_process(struct task_struct *p)
 list_del_init(&p->tasks);
 __get_cpu_var(process_counts)--;
 }
-
+ list_del_rcu(&p->thread_group);
 remove_parent(p);
 }

@@ -964,13 +963,6 @@ asmlinkage long sys_exit(int error_code)
 do_exit((error_code&0xff)<<8);
 }

-task_t fastcall *next_thread(const task_t *p)
-{
- return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
-}
-
-EXPORT_SYMBOL(next_thread);
-
 /*
 * Take down every thread in the group. This is called by fatal signals
 * as well as by sys_exit_group (below).
diff --git a/kernel/fork.c b/kernel/fork.c
index 12cdd9f..bc551efb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1112,6 +1112,7 @@ static task_t *copy_process(unsigned long clone_flags,
 * We dont wake it up yet.
*/ p->group_leader = p; + INIT_LIST_HEAD(&p->thread_group); INIT_LIST_HEAD(&p->ptrace_children); INIT_LIST_HEAD(&p->ptrace_list); @@ -1165,7 +1166,9 @@ static task_t *copy_process(unsigned long clone_flags, retval = -EAGAIN; goto bad_fork_cleanup_namespace; } + p->group_leader = current->group_leader; + list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); if (current->signal->group_stop_count > 0) { /* @@ -1213,7 +1216,6 @@ static task_t *copy_process(unsigned long clone_flags, list_add_tail(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; } - attach_pid(p, PIDTYPE_TGID, p->tgid); attach_pid(p, PIDTYPE_PID, p->pid); nr_threads++; } -- cgit v1.1 From 4a2c7a7837da1b91468e50426066d988050e4d56 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:26 -0800 Subject: [PATCH] make fork() atomic wrt pgrp/session signals Eric W. Biederman wrote: > > Ok. SUSV3/Posix is clear, fork is atomic with respect > to signals. Either a signal comes before or after a > fork but not during. (See the rationale section). > http://www.opengroup.org/onlinepubs/000095399/functions/fork.html > > The tasklist_lock does not stop forks from adding to a process > group. The forks stall while the tasklist_lock is held, but a fork > that began before we grabbed the tasklist_lock simply completes > afterwards, and the child does not receive the signal. This also means that SIGSTOP or sig_kernel_coredump() signal can't be delivered to pgrp/session reliably. With this patch copy_process() returns -ERESTARTNOINTR when it detects a pending signal, fork() will be restarted transparently after handling the signals. This patch also deletes now unneeded "group_stop_count > 0" check, copy_process() can no longer succeed while group stop in progress. Signed-off-by: Oleg Nesterov Acked-By: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bc551efb..aa50c84 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1136,16 +1136,6 @@ static task_t *copy_process(unsigned long clone_flags, !cpu_online(task_cpu(p)))) set_task_cpu(p, smp_processor_id()); - /* - * Check for pending SIGKILL! The new thread should not be allowed - * to slip out of an OOM kill. (or normal SIGKILL.) - */ - if (sigismember(¤t->pending.signal, SIGKILL)) { - write_unlock_irq(&tasklist_lock); - retval = -EINTR; - goto bad_fork_cleanup_namespace; - } - /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) p->real_parent = current->real_parent; @@ -1154,6 +1144,23 @@ static task_t *copy_process(unsigned long clone_flags, p->parent = p->real_parent; spin_lock(¤t->sighand->siglock); + + /* + * Process group and session signals need to be delivered to just the + * parent before the fork or both the parent and the child after the + * fork. Restart if a signal comes in before we add the new process to + * it's process group. + * A fatal signal pending means that current will exit, so the new + * thread can't slip out of an OOM kill (or normal SIGKILL). 
+ */
+ recalc_sigpending();
+ if (signal_pending(current)) {
+ spin_unlock(&current->sighand->siglock);
+ write_unlock_irq(&tasklist_lock);
+ retval = -ERESTARTNOINTR;
+ goto bad_fork_cleanup_namespace;
+ }
+
 if (clone_flags & CLONE_THREAD) {
 /*
 * Important: if an exit-all has been started then
@@ -1170,16 +1177,6 @@
 p->group_leader = current->group_leader;
 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);

- if (current->signal->group_stop_count > 0) {
- /*
- * There is an all-stop in progress for the group.
- * We ourselves will stop as soon as we check signals.
- * Make the new thread part of that group stop too.
- */
- current->signal->group_stop_count++;
- set_tsk_thread_flag(p, TIF_SIGPENDING);
- }
-
 if (!cputime_eq(current->signal->it_virt_expires,
 cputime_zero) ||
 !cputime_eq(current->signal->it_prof_expires,
-- cgit v1.1

From a7e5328a06a2beee3a2bbfaf87ce2a7bbe937de1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 28 Mar 2006 16:11:27 -0800
Subject: [PATCH] cleanup __exit_signal->cleanup_sighand path

Move 'tsk->sighand = NULL' from cleanup_sighand() to __exit_signal(). This
makes the exit path more understandable and allows us to do
cleanup_sighand() outside of the ->siglock-protected section.

Signed-off-by: Oleg Nesterov
Cc: "Eric W. Biederman"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/exit.c | 3 ++-
 kernel/fork.c | 8 ++------
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 22399ca..bc0ec67 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -114,10 +114,11 @@ static void __exit_signal(struct task_struct *tsk)
 __unhash_process(tsk);

 tsk->signal = NULL;
- cleanup_sighand(tsk);
+ tsk->sighand = NULL;
 spin_unlock(&sighand->siglock);
 rcu_read_unlock();

+ __cleanup_sighand(sighand);
 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
 flush_sigqueue(&tsk->pending);
 if (sig) {
diff --git a/kernel/fork.c b/kernel/fork.c
index aa50c84..b3f7a1b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -803,12 +803,8 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 return 0;
 }

-void cleanup_sighand(struct task_struct *tsk)
+void __cleanup_sighand(struct sighand_struct *sighand)
 {
- struct sighand_struct * sighand = tsk->sighand;
-
- /* Ok, we're done with the signal handlers */
- tsk->sighand = NULL;
 if (atomic_dec_and_test(&sighand->count))
 kmem_cache_free(sighand_cachep, sighand);
 }
@@ -1233,7 +1229,7 @@ bad_fork_cleanup_mm:
 bad_fork_cleanup_signal:
 cleanup_signal(p);
 bad_fork_cleanup_sighand:
- cleanup_sighand(p);
+ __cleanup_sighand(p->sighand);
 bad_fork_cleanup_fs:
 exit_fs(p); /* blocking */
 bad_fork_cleanup_files:
-- cgit v1.1

From dac27f4a09c274db048e80d2511102e540ac9e3a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 28 Mar 2006 16:11:28 -0800
Subject: [PATCH] simplify do_signal_stop()

do_signal_stop() treats 'thread_group_empty()' as a special case. This was
needed to avoid taking tasklist_lock. Since this lock is no longer needed,
we can remove this special case and simplify the code even more.

Also, before this patch, finish_stop() was called with stop_count == -1
for the 'thread_group_empty()' case. This is not strictly wrong, but
confusing and unneeded.
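To see why the special case is redundant: with a single-threaded process
the initiation loop visits no other threads, so the common tail does
exactly what the removed branch did (a sketch abridged from the diff
below, comments added):

	} else {
		/* no group stop in progress: initiate one */
		struct task_struct *t;

		sig->group_exit_code = signr;
		stop_count = 0;
		/* if thread_group_empty(current), this loop body never
		 * executes and stop_count stays 0 ... */
		for (t = next_thread(current); t != current; t = next_thread(t))
			if (!t->exit_state &&
			    !(t->state & (TASK_STOPPED|TASK_TRACED))) {
				stop_count++;
				signal_wake_up(t, 0);
			}
		sig->group_stop_count = stop_count;
	}
	/* ... so SIGNAL_STOP_STOPPED is set here, just as the removed
	 * thread_group_empty() branch set it unconditionally */
	if (stop_count == 0)
		sig->flags = SIGNAL_STOP_STOPPED;
	current->exit_code = sig->group_exit_code;
	__set_current_state(TASK_STOPPED);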
Signed-off-by: Oleg Nesterov Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2dfaa50..efba396 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1685,8 +1685,7 @@ out: static int do_signal_stop(int signr) { struct signal_struct *sig = current->signal; - struct sighand_struct *sighand = current->sighand; - int stop_count = -1; + int stop_count; if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED)) return 0; @@ -1696,30 +1695,14 @@ static int do_signal_stop(int signr) * There is a group stop in progress. We don't need to * start another one. */ - signr = sig->group_exit_code; stop_count = --sig->group_stop_count; - current->exit_code = signr; - set_current_state(TASK_STOPPED); - if (stop_count == 0) - sig->flags = SIGNAL_STOP_STOPPED; - } - else if (thread_group_empty(current)) { - /* - * Lock must be held through transition to stopped state. - */ - current->exit_code = current->signal->group_exit_code = signr; - set_current_state(TASK_STOPPED); - sig->flags = SIGNAL_STOP_STOPPED; - } - else { + } else { /* - * (sig->group_stop_count == 0) * There is no group stop already in progress. * We must initiate one now. */ struct task_struct *t; - current->exit_code = signr; sig->group_exit_code = signr; stop_count = 0; @@ -1735,13 +1718,14 @@ static int do_signal_stop(int signr) signal_wake_up(t, 0); } sig->group_stop_count = stop_count; - - set_current_state(TASK_STOPPED); - if (stop_count == 0) - sig->flags = SIGNAL_STOP_STOPPED; } - spin_unlock_irq(&sighand->siglock); + if (stop_count == 0) + sig->flags = SIGNAL_STOP_STOPPED; + current->exit_code = sig->group_exit_code; + __set_current_state(TASK_STOPPED); + + spin_unlock_irq(¤t->sighand->siglock); finish_stop(stop_count); return 1; } -- cgit v1.1 From 883606a7c9e84e206f96c38f88c4bd66df72f51e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:28 -0800 Subject: [PATCH] finish_stop: don't check stop_count < 0 Remove an obscure 'stop_count < 0' check in finish_stop(). The previous patch made this case impossible. Signed-off-by: Oleg Nesterov Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index efba396..24d5059 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1657,7 +1657,7 @@ finish_stop(int stop_count) * a group stop in progress and we are the last to stop, * report to the parent. When ptraced, every thread reports itself. */ - if (stop_count < 0 || (current->ptrace & PT_PTRACED)) + if (current->ptrace & PT_PTRACED) to_self = 1; else if (stop_count == 0) to_self = 0; -- cgit v1.1 From a1d5e21e3e388fb2c16463d007e788b1e41b74f1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:29 -0800 Subject: [PATCH] do_notify_parent_cldstop: remove 'to_self' param The previous patch has changed callsites of do_notify_parent_cldstop() so that to_self == (->ptrace & PT_PTRACED) always (as it should be). We can remove this parameter now. 
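The callee can derive the same value itself, so the parent selection in
do_notify_parent_cldstop() becomes (condensed from the diff below,
comments added):

	if (tsk->ptrace & PT_PTRACED)
		parent = tsk->parent;		/* ptraced: report to the tracer */
	else {
		tsk = tsk->group_leader;	/* otherwise the group leader ... */
		parent = tsk->real_parent;	/* ... reports to the real parent */
	}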
Signed-off-by: Oleg Nesterov Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 24d5059..0528a95 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -591,9 +591,7 @@ static int check_kill_permission(int sig, struct siginfo *info, } /* forward decl */ -static void do_notify_parent_cldstop(struct task_struct *tsk, - int to_self, - int why); +static void do_notify_parent_cldstop(struct task_struct *tsk, int why); /* * Handle magic process-wide effects of stop/continue signals. @@ -643,7 +641,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) p->signal->group_stop_count = 0; p->signal->flags = SIGNAL_STOP_CONTINUED; spin_unlock(&p->sighand->siglock); - do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED); + do_notify_parent_cldstop(p, CLD_STOPPED); spin_lock(&p->sighand->siglock); } rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); @@ -684,7 +682,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) p->signal->flags = SIGNAL_STOP_CONTINUED; p->signal->group_exit_code = 0; spin_unlock(&p->sighand->siglock); - do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED); + do_notify_parent_cldstop(p, CLD_CONTINUED); spin_lock(&p->sighand->siglock); } else { /* @@ -1519,14 +1517,14 @@ void do_notify_parent(struct task_struct *tsk, int sig) spin_unlock_irqrestore(&psig->siglock, flags); } -static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why) +static void do_notify_parent_cldstop(struct task_struct *tsk, int why) { struct siginfo info; unsigned long flags; struct task_struct *parent; struct sighand_struct *sighand; - if (to_self) + if (tsk->ptrace & PT_PTRACED) parent = tsk->parent; else { tsk = tsk->group_leader; @@ -1601,7 +1599,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) !(current->ptrace & PT_ATTACHED)) && (likely(current->parent->signal != current->signal) || !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { - do_notify_parent_cldstop(current, 1, CLD_TRAPPED); + do_notify_parent_cldstop(current, CLD_TRAPPED); read_unlock(&tasklist_lock); schedule(); } else { @@ -1650,25 +1648,17 @@ void ptrace_notify(int exit_code) static void finish_stop(int stop_count) { - int to_self; - /* * If there are no other threads in the group, or if there is * a group stop in progress and we are the last to stop, * report to the parent. When ptraced, every thread reports itself. */ - if (current->ptrace & PT_PTRACED) - to_self = 1; - else if (stop_count == 0) - to_self = 0; - else - goto out; - - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, to_self, CLD_STOPPED); - read_unlock(&tasklist_lock); + if (stop_count == 0 || (current->ptrace & PT_PTRACED)) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, CLD_STOPPED); + read_unlock(&tasklist_lock); + } -out: schedule(); /* * Now we don't run again until continued. -- cgit v1.1 From 547679087bc60277d11b11631d826895762c4505 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 28 Mar 2006 16:11:30 -0800 Subject: [PATCH] send_sigqueue: simplify and fix the race send_sigqueue() checks PF_EXITING, then locks p->sighand->siglock. This is unsafe: 'p' can exit in between and set ->sighand = NULL. 
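Schematically, the window is (an illustrative interleaving, not actual
kernel code or output):

	/*
	 *  CPU 0: send_sigqueue(p)           CPU 1: p exiting
	 *
	 *  p->flags & PF_EXITING is clear,
	 *  so we proceed
	 *                                     do_exit() sets PF_EXITING
	 *                                     __exit_signal(p) sets
	 *                                     p->sighand = NULL (and the
	 *                                     struct may be freed)
	 *  spin_lock_irqsave(&p->sighand->siglock, flags)
	 *      -> dereferences NULL
	 */

lock_task_sighand() closes the window by re-validating ->sighand after
taking the lock, under rcu_read_lock() so the sighand_struct's memory
stays valid while we peek at it (a sketch of the pattern, not the
verbatim helper):

	rcu_read_lock();
	for (;;) {
		sighand = rcu_dereference(p->sighand);
		if (sighand == NULL)
			break;			/* p is already past __exit_signal() */
		spin_lock_irqsave(&sighand->siglock, flags);
		if (likely(sighand == p->sighand))
			break;			/* still the right lock: keep it */
		spin_unlock_irqrestore(&sighand->siglock, flags);
	}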
The race is theoretical, the window is tiny and irqs are disabled by the caller, so I don't think we need the fix for -stable tree. Convert send_sigqueue() to use lock_task_sighand() helper. Also, delete 'p->flags & PF_EXITING' re-check, it is unneeded and the comment is wrong. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 41 ++++------------------------------------- 1 file changed, 4 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 0528a95..4922928 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1309,12 +1309,10 @@ void sigqueue_free(struct sigqueue *q) __sigqueue_free(q); } -int -send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) +int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) { unsigned long flags; int ret = 0; - struct sighand_struct *sh; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); @@ -1328,48 +1326,17 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) */ rcu_read_lock(); - if (unlikely(p->flags & PF_EXITING)) { + if (!likely(lock_task_sighand(p, &flags))) { ret = -1; goto out_err; } -retry: - sh = rcu_dereference(p->sighand); - - spin_lock_irqsave(&sh->siglock, flags); - if (p->sighand != sh) { - /* We raced with exec() in a multithreaded process... */ - spin_unlock_irqrestore(&sh->siglock, flags); - goto retry; - } - - /* - * We do the check here again to handle the following scenario: - * - * CPU 0 CPU 1 - * send_sigqueue - * check PF_EXITING - * interrupt exit code running - * __exit_signal - * lock sighand->siglock - * unlock sighand->siglock - * lock sh->siglock - * add(tsk->pending) flush_sigqueue(tsk->pending) - * - */ - - if (unlikely(p->flags & PF_EXITING)) { - ret = -1; - goto out; - } - if (unlikely(!list_empty(&q->list))) { /* * If an SI_TIMER entry is already queue just increment * the overrun count. */ - if (q->info.si_code != SI_TIMER) - BUG(); + BUG_ON(q->info.si_code != SI_TIMER); q->info.si_overrun++; goto out; } @@ -1385,7 +1352,7 @@ retry: signal_wake_up(p, sig == SIGKILL); out: - spin_unlock_irqrestore(&sh->siglock, flags); + unlock_task_sighand(p, &flags); out_err: rcu_read_unlock(); -- cgit v1.1
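The caller-side pattern that results (condensed from the diff above):

	unsigned long flags;
	int ret = 0;

	rcu_read_lock();
	if (!lock_task_sighand(p, &flags)) {
		ret = -1;	/* target already detached its sighand */
		goto out_err;
	}
	/* ... queue the sigqueue, mark the signal pending, signal_wake_up() ... */
	unlock_task_sighand(p, &flags);
out_err:
	rcu_read_unlock();

Any later patch that needs ->siglock on a possibly-exiting task can reuse
the same helper instead of open-coding the PF_EXITING checks.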