diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/audit.c | 587 | ||||
-rw-r--r-- | kernel/auditsc.c | 259 | ||||
-rw-r--r-- | kernel/cpuset.c | 24 | ||||
-rw-r--r-- | kernel/exit.c | 4 | ||||
-rw-r--r-- | kernel/fork.c | 7 | ||||
-rw-r--r-- | kernel/irq/handle.c | 3 | ||||
-rw-r--r-- | kernel/irq/manage.c | 8 | ||||
-rw-r--r-- | kernel/module.c | 8 | ||||
-rw-r--r-- | kernel/params.c | 4 | ||||
-rw-r--r-- | kernel/posix-timers.c | 1 | ||||
-rw-r--r-- | kernel/power/main.c | 6 | ||||
-rw-r--r-- | kernel/power/smp.c | 4 | ||||
-rw-r--r-- | kernel/printk.c | 72 | ||||
-rw-r--r-- | kernel/profile.c | 16 | ||||
-rw-r--r-- | kernel/sched.c | 13 | ||||
-rw-r--r-- | kernel/signal.c | 18 | ||||
-rw-r--r-- | kernel/spinlock.c | 8 | ||||
-rw-r--r-- | kernel/stop_machine.c | 4 | ||||
-rw-r--r-- | kernel/sys_ni.c | 1 |
19 files changed, 604 insertions, 443 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 9c4f1af..ef35166 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -46,6 +46,8 @@ #include <asm/types.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/err.h> +#include <linux/kthread.h> #include <linux/audit.h> @@ -68,7 +70,7 @@ static int audit_failure = AUDIT_FAIL_PRINTK; /* If audit records are to be written to the netlink socket, audit_pid * contains the (non-zero) pid. */ -static int audit_pid; +int audit_pid; /* If audit_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in @@ -77,7 +79,10 @@ static int audit_rate_limit; /* Number of outstanding audit_buffers allowed. */ static int audit_backlog_limit = 64; -static atomic_t audit_backlog = ATOMIC_INIT(0); + +/* The identity of the user shutting down the audit system. */ +uid_t audit_sig_uid = -1; +pid_t audit_sig_pid = -1; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] @@ -91,19 +96,17 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; -/* There are two lists of audit buffers. The txlist contains audit - * buffers that cannot be sent immediately to the netlink device because - * we are in an irq context (these are sent later in a tasklet). - * - * The second list is a list of pre-allocated audit buffers (if more +/* The audit_freelist is a list of pre-allocated audit buffers (if more * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of * being placed on the freelist). */ -static DEFINE_SPINLOCK(audit_txlist_lock); static DEFINE_SPINLOCK(audit_freelist_lock); static int audit_freelist_count = 0; -static LIST_HEAD(audit_txlist); static LIST_HEAD(audit_freelist); +static struct sk_buff_head audit_skb_queue; +static struct task_struct *kauditd_task; +static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); + /* There are three lists of rules -- one to search at task creation * time, one to search at syscall entry time, and another to search at * syscall exit time. */ @@ -112,7 +115,7 @@ static LIST_HEAD(audit_entlist); static LIST_HEAD(audit_extlist); /* The netlink socket is only to be read by 1 CPU, which lets us assume - * that list additions and deletions never happen simultaneiously in + * that list additions and deletions never happen simultaneously in * auditsc.c */ static DECLARE_MUTEX(audit_netlink_sem); @@ -132,21 +135,14 @@ static DECLARE_MUTEX(audit_netlink_sem); * use simultaneously. */ struct audit_buffer { struct list_head list; - struct sk_buff_head sklist; /* formatted skbs ready to send */ + struct sk_buff *skb; /* formatted skb ready to send */ struct audit_context *ctx; /* NULL or associated context */ - int len; /* used area of tmp */ - char tmp[AUDIT_BUFSIZ]; - - /* Pointer to header and contents */ - struct nlmsghdr *nlh; - int total; - int type; - int pid; }; -void audit_set_type(struct audit_buffer *ab, int type) +static void audit_set_pid(struct audit_buffer *ab, pid_t pid) { - ab->type = type; + struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data; + nlh->nlmsg_pid = pid; } struct audit_entry { @@ -154,9 +150,6 @@ struct audit_entry { struct audit_rule rule; }; -static void audit_log_end_irq(struct audit_buffer *ab); -static void audit_log_end_fast(struct audit_buffer *ab); - static void audit_panic(const char *message) { switch (audit_failure) @@ -227,10 +220,8 @@ void audit_log_lost(const char *message) if (print) { printk(KERN_WARNING - "audit: audit_lost=%d audit_backlog=%d" - " audit_rate_limit=%d audit_backlog_limit=%d\n", + "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n", atomic_read(&audit_lost), - atomic_read(&audit_backlog), audit_rate_limit, audit_backlog_limit); audit_panic(message); @@ -242,7 +233,8 @@ static int audit_set_rate_limit(int limit, uid_t loginuid) { int old = audit_rate_limit; audit_rate_limit = limit; - audit_log(NULL, "audit_rate_limit=%d old=%d by auid %u", + audit_log(NULL, AUDIT_CONFIG_CHANGE, + "audit_rate_limit=%d old=%d by auid=%u", audit_rate_limit, old, loginuid); return old; } @@ -251,7 +243,8 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid) { int old = audit_backlog_limit; audit_backlog_limit = limit; - audit_log(NULL, "audit_backlog_limit=%d old=%d by auid %u", + audit_log(NULL, AUDIT_CONFIG_CHANGE, + "audit_backlog_limit=%d old=%d by auid=%u", audit_backlog_limit, old, loginuid); return old; } @@ -262,8 +255,9 @@ static int audit_set_enabled(int state, uid_t loginuid) if (state != 0 && state != 1) return -EINVAL; audit_enabled = state; - audit_log(NULL, "audit_enabled=%d old=%d by auid %u", - audit_enabled, old, loginuid); + audit_log(NULL, AUDIT_CONFIG_CHANGE, + "audit_enabled=%d old=%d by auid=%u", + audit_enabled, old, loginuid); return old; } @@ -275,12 +269,44 @@ static int audit_set_failure(int state, uid_t loginuid) && state != AUDIT_FAIL_PANIC) return -EINVAL; audit_failure = state; - audit_log(NULL, "audit_failure=%d old=%d by auid %u", - audit_failure, old, loginuid); + audit_log(NULL, AUDIT_CONFIG_CHANGE, + "audit_failure=%d old=%d by auid=%u", + audit_failure, old, loginuid); return old; } -#ifdef CONFIG_NET +int kauditd_thread(void *dummy) +{ + struct sk_buff *skb; + + while (1) { + skb = skb_dequeue(&audit_skb_queue); + if (skb) { + if (audit_pid) { + int err = netlink_unicast(audit_sock, skb, audit_pid, 0); + if (err < 0) { + BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ + printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); + audit_pid = 0; + } + } else { + printk(KERN_ERR "%s\n", skb->data + NLMSG_SPACE(0)); + kfree_skb(skb); + } + } else { + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kauditd_wait, &wait); + + if (!skb_queue_len(&audit_skb_queue)) + schedule(); + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&kauditd_wait, &wait); + } + } +} + void audit_send_reply(int pid, int seq, int type, int done, int multi, void *payload, int size) { @@ -293,13 +319,16 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi, skb = alloc_skb(len, GFP_KERNEL); if (!skb) - goto nlmsg_failure; + return; - nlh = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh)); + nlh = NLMSG_PUT(skb, pid, seq, t, size); nlh->nlmsg_flags = flags; data = NLMSG_DATA(nlh); memcpy(data, payload, size); - netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT); + + /* Ignore failure. It'll only happen if the sender goes away, + because our timeout is set to infinite. */ + netlink_unicast(audit_sock, skb, pid, 0); return; nlmsg_failure: /* Used by NLMSG_PUT */ @@ -321,10 +350,12 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) case AUDIT_SET: case AUDIT_ADD: case AUDIT_DEL: + case AUDIT_SIGNAL_INFO: if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: + case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) err = -EPERM; break; @@ -344,11 +375,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; uid_t loginuid; /* loginuid of sender */ + struct audit_sig_info sig_data; err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); if (err) return err; + /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */ + if (!kauditd_task) + kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); + if (IS_ERR(kauditd_task)) { + err = PTR_ERR(kauditd_task); + kauditd_task = NULL; + return err; + } + pid = NETLINK_CREDS(skb)->pid; uid = NETLINK_CREDS(skb)->uid; loginuid = NETLINK_CB(skb).loginuid; @@ -363,7 +404,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) status_set.rate_limit = audit_rate_limit; status_set.backlog_limit = audit_backlog_limit; status_set.lost = atomic_read(&audit_lost); - status_set.backlog = atomic_read(&audit_backlog); + status_set.backlog = skb_queue_len(&audit_skb_queue); audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, &status_set, sizeof(status_set)); break; @@ -382,7 +423,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_PID) { int old = audit_pid; audit_pid = status_get->pid; - audit_log(NULL, "audit_pid=%d old=%d by auid %u", + audit_log(NULL, AUDIT_CONFIG_CHANGE, + "audit_pid=%d old=%d by auid=%u", audit_pid, old, loginuid); } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) @@ -392,18 +434,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) loginuid); break; case AUDIT_USER: - ab = audit_log_start(NULL); + case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: + ab = audit_log_start(NULL, msg_type); if (!ab) break; /* audit_panic has been called */ audit_log_format(ab, - "user pid=%d uid=%d length=%d loginuid=%u" + "user pid=%d uid=%u auid=%u" " msg='%.1024s'", - pid, uid, - (int)(nlh->nlmsg_len - - ((char *)data - (char *)nlh)), - loginuid, (char *)data); - ab->type = AUDIT_USER; - ab->pid = pid; + pid, uid, loginuid, (char *)data); + audit_set_pid(ab, pid); audit_log_end(ab); break; case AUDIT_ADD: @@ -412,12 +451,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; /* fallthrough */ case AUDIT_LIST: -#ifdef CONFIG_AUDITSYSCALL err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, uid, seq, data, loginuid); -#else - err = -EOPNOTSUPP; -#endif + break; + case AUDIT_SIGNAL_INFO: + sig_data.uid = audit_sig_uid; + sig_data.pid = audit_sig_pid; + audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, + 0, 0, &sig_data, sizeof(sig_data)); break; default: err = -EINVAL; @@ -467,87 +508,6 @@ static void audit_receive(struct sock *sk, int length) up(&audit_netlink_sem); } -/* Move data from tmp buffer into an skb. This is an extra copy, and - * that is unfortunate. However, the copy will only occur when a record - * is being written to user space, which is already a high-overhead - * operation. (Elimination of the copy is possible, for example, by - * writing directly into a pre-allocated skb, at the cost of wasting - * memory. */ -static void audit_log_move(struct audit_buffer *ab) -{ - struct sk_buff *skb; - char *start; - int extra = ab->nlh ? 0 : NLMSG_SPACE(0); - - /* possible resubmission */ - if (ab->len == 0) - return; - - skb = skb_peek_tail(&ab->sklist); - if (!skb || skb_tailroom(skb) <= ab->len + extra) { - skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC); - if (!skb) { - ab->len = 0; /* Lose information in ab->tmp */ - audit_log_lost("out of memory in audit_log_move"); - return; - } - __skb_queue_tail(&ab->sklist, skb); - if (!ab->nlh) - ab->nlh = (struct nlmsghdr *)skb_put(skb, - NLMSG_SPACE(0)); - } - start = skb_put(skb, ab->len); - memcpy(start, ab->tmp, ab->len); - ab->len = 0; -} - -/* Iterate over the skbuff in the audit_buffer, sending their contents - * to user space. */ -static inline int audit_log_drain(struct audit_buffer *ab) -{ - struct sk_buff *skb; - - while ((skb = skb_dequeue(&ab->sklist))) { - int retval = 0; - - if (audit_pid) { - if (ab->nlh) { - ab->nlh->nlmsg_len = ab->total; - ab->nlh->nlmsg_type = ab->type; - ab->nlh->nlmsg_flags = 0; - ab->nlh->nlmsg_seq = 0; - ab->nlh->nlmsg_pid = ab->pid; - } - skb_get(skb); /* because netlink_* frees */ - retval = netlink_unicast(audit_sock, skb, audit_pid, - MSG_DONTWAIT); - } - if (retval == -EAGAIN && - (atomic_read(&audit_backlog)) < audit_backlog_limit) { - skb_queue_head(&ab->sklist, skb); - audit_log_end_irq(ab); - return 1; - } - if (retval < 0) { - if (retval == -ECONNREFUSED) { - printk(KERN_ERR - "audit: *NO* daemon at audit_pid=%d\n", - audit_pid); - audit_pid = 0; - } else - audit_log_lost("netlink socket too busy"); - } - if (!audit_pid) { /* No daemon */ - int offset = ab->nlh ? NLMSG_SPACE(0) : 0; - int len = skb->len - offset; - skb->data[offset + len] = '\0'; - printk(KERN_ERR "%s\n", skb->data + offset); - } - kfree_skb(skb); - ab->nlh = NULL; - } - return 0; -} /* Initialize audit support at boot time. */ static int __init audit_init(void) @@ -558,40 +518,13 @@ static int __init audit_init(void) if (!audit_sock) audit_panic("cannot initialize netlink socket"); + audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + skb_queue_head_init(&audit_skb_queue); audit_initialized = 1; audit_enabled = audit_default; - audit_log(NULL, "initialized"); - return 0; -} - -#else -/* Without CONFIG_NET, we have no skbuffs. For now, print what we have - * in the buffer. */ -static void audit_log_move(struct audit_buffer *ab) -{ - printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp); - ab->len = 0; -} - -static inline int audit_log_drain(struct audit_buffer *ab) -{ - return 0; -} - -/* Initialize audit support at boot time. */ -int __init audit_init(void) -{ - printk(KERN_INFO "audit: initializing WITHOUT netlink support\n"); - audit_sock = NULL; - audit_pid = 0; - - audit_initialized = 1; - audit_enabled = audit_default; - audit_log(NULL, "initialized"); + audit_log(NULL, AUDIT_KERNEL, "initialized"); return 0; } -#endif - __initcall(audit_init); /* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ @@ -608,6 +541,102 @@ static int __init audit_enable(char *str) __setup("audit=", audit_enable); +static void audit_buffer_free(struct audit_buffer *ab) +{ + unsigned long flags; + + if (!ab) + return; + + if (ab->skb) + kfree_skb(ab->skb); + + spin_lock_irqsave(&audit_freelist_lock, flags); + if (++audit_freelist_count > AUDIT_MAXFREE) + kfree(ab); + else + list_add(&ab->list, &audit_freelist); + spin_unlock_irqrestore(&audit_freelist_lock, flags); +} + +static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, + int gfp_mask, int type) +{ + unsigned long flags; + struct audit_buffer *ab = NULL; + struct nlmsghdr *nlh; + + spin_lock_irqsave(&audit_freelist_lock, flags); + if (!list_empty(&audit_freelist)) { + ab = list_entry(audit_freelist.next, + struct audit_buffer, list); + list_del(&ab->list); + --audit_freelist_count; + } + spin_unlock_irqrestore(&audit_freelist_lock, flags); + + if (!ab) { + ab = kmalloc(sizeof(*ab), gfp_mask); + if (!ab) + goto err; + } + + ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask); + if (!ab->skb) + goto err; + + ab->ctx = ctx; + nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); + nlh->nlmsg_type = type; + nlh->nlmsg_flags = 0; + nlh->nlmsg_pid = 0; + nlh->nlmsg_seq = 0; + return ab; +err: + audit_buffer_free(ab); + return NULL; +} + +/* Compute a serial number for the audit record. Audit records are + * written to user-space as soon as they are generated, so a complete + * audit record may be written in several pieces. The timestamp of the + * record and this serial number are used by the user-space tools to + * determine which pieces belong to the same audit record. The + * (timestamp,serial) tuple is unique for each syscall and is live from + * syscall entry to syscall exit. + * + * Atomic values are only guaranteed to be 24-bit, so we count down. + * + * NOTE: Another possibility is to store the formatted records off the + * audit context (for those records that have a context), and emit them + * all at syscall exit. However, this could delay the reporting of + * significant errors until syscall exit (or never, if the system + * halts). */ +unsigned int audit_serial(void) +{ + static atomic_t serial = ATOMIC_INIT(0xffffff); + unsigned int a, b; + + do { + a = atomic_read(&serial); + if (atomic_dec_and_test(&serial)) + atomic_set(&serial, 0xffffff); + b = atomic_read(&serial); + } while (b != a - 1); + + return 0xffffff - b; +} + +static inline void audit_get_stamp(struct audit_context *ctx, + struct timespec *t, unsigned int *serial) +{ + if (ctx) + auditsc_get_stamp(ctx, t, serial); + else { + *t = CURRENT_TIME; + *serial = audit_serial(); + } +} /* Obtain an audit buffer. This routine does locking to obtain the * audit buffer, but then no locking is required for calls to @@ -615,10 +644,9 @@ __setup("audit=", audit_enable); * syscall, then the syscall is marked as auditable and an audit record * will be written at syscall exit. If there is no associated task, tsk * should be NULL. */ -struct audit_buffer *audit_log_start(struct audit_context *ctx) +struct audit_buffer *audit_log_start(struct audit_context *ctx, int type) { struct audit_buffer *ab = NULL; - unsigned long flags; struct timespec t; unsigned int serial; @@ -626,57 +654,48 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx) return NULL; if (audit_backlog_limit - && atomic_read(&audit_backlog) > audit_backlog_limit) { + && skb_queue_len(&audit_skb_queue) > audit_backlog_limit) { if (audit_rate_check()) printk(KERN_WARNING "audit: audit_backlog=%d > " "audit_backlog_limit=%d\n", - atomic_read(&audit_backlog), + skb_queue_len(&audit_skb_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); return NULL; } - spin_lock_irqsave(&audit_freelist_lock, flags); - if (!list_empty(&audit_freelist)) { - ab = list_entry(audit_freelist.next, - struct audit_buffer, list); - list_del(&ab->list); - --audit_freelist_count; - } - spin_unlock_irqrestore(&audit_freelist_lock, flags); - - if (!ab) - ab = kmalloc(sizeof(*ab), GFP_ATOMIC); + ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type); if (!ab) { audit_log_lost("out of memory in audit_log_start"); return NULL; } - atomic_inc(&audit_backlog); - skb_queue_head_init(&ab->sklist); - - ab->ctx = ctx; - ab->len = 0; - ab->nlh = NULL; - ab->total = 0; - ab->type = AUDIT_KERNEL; - ab->pid = 0; + audit_get_stamp(ab->ctx, &t, &serial); -#ifdef CONFIG_AUDITSYSCALL - if (ab->ctx) - audit_get_stamp(ab->ctx, &t, &serial); - else -#endif - { - t = CURRENT_TIME; - serial = 0; - } audit_log_format(ab, "audit(%lu.%03lu:%u): ", t.tv_sec, t.tv_nsec/1000000, serial); return ab; } +/** + * audit_expand - expand skb in the audit buffer + * @ab: audit_buffer + * + * Returns 0 (no space) on failed expansion, or available space if + * successful. + */ +static inline int audit_expand(struct audit_buffer *ab, int extra) +{ + struct sk_buff *skb = ab->skb; + int ret = pskb_expand_head(skb, skb_headroom(skb), extra, + GFP_ATOMIC); + if (ret < 0) { + audit_log_lost("out of memory in audit_expand"); + return 0; + } + return skb_tailroom(skb); +} /* Format an audit message into the audit buffer. If there isn't enough * room in the audit buffer, more room will be allocated and vsnprint @@ -686,26 +705,35 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args) { int len, avail; + struct sk_buff *skb; + va_list args2; if (!ab) return; - avail = sizeof(ab->tmp) - ab->len; - if (avail <= 0) { - audit_log_move(ab); - avail = sizeof(ab->tmp) - ab->len; + BUG_ON(!ab->skb); + skb = ab->skb; + avail = skb_tailroom(skb); + if (avail == 0) { + avail = audit_expand(ab, AUDIT_BUFSIZ); + if (!avail) + goto out; } - len = vsnprintf(ab->tmp + ab->len, avail, fmt, args); + va_copy(args2, args); + len = vsnprintf(skb->tail, avail, fmt, args); if (len >= avail) { /* The printk buffer is 1024 bytes long, so if we get * here and AUDIT_BUFSIZ is at least 1024, then we can * log everything that printk could have logged. */ - audit_log_move(ab); - avail = sizeof(ab->tmp) - ab->len; - len = vsnprintf(ab->tmp + ab->len, avail, fmt, args); + avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); + if (!avail) + goto out; + len = vsnprintf(skb->tail, avail, fmt, args2); } - ab->len += (len < avail) ? len : avail; - ab->total += (len < avail) ? len : avail; + if (len > 0) + skb_put(skb, len); +out: + return; } /* Format a message into the audit buffer. All the work is done in @@ -721,20 +749,47 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) va_end(args); } -void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) +/* This function will take the passed buf and convert it into a string of + * ascii hex digits. The new string is placed onto the skb. */ +void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, + size_t len) { - int i; + int i, avail, new_len; + unsigned char *ptr; + struct sk_buff *skb; + static const unsigned char *hex = "0123456789ABCDEF"; + + BUG_ON(!ab->skb); + skb = ab->skb; + avail = skb_tailroom(skb); + new_len = len<<1; + if (new_len >= avail) { + /* Round the buffer request up to the next multiple */ + new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1); + avail = audit_expand(ab, new_len); + if (!avail) + return; + } - for (i=0; i<len; i++) - audit_log_format(ab, "%02x", buf[i]); + ptr = skb->tail; + for (i=0; i<len; i++) { + *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */ + *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */ + } + *ptr = 0; + skb_put(skb, len << 1); /* new string is twice the old string */ } +/* This code will escape a string that is passed to it if the string + * contains a control character, unprintable character, double quote mark, + * or a space. Unescaped strings will start and end with a double quote mark. + * Strings that are escaped are printed in hex (2 digits per char). */ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { const unsigned char *p = string; while (*p) { - if (*p == '"' || *p == ' ' || *p < 0x20 || *p > 0x7f) { + if (*p == '"' || *p < 0x21 || *p > 0x7f) { audit_log_hex(ab, string, strlen(string)); return; } @@ -743,117 +798,63 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) audit_log_format(ab, "\"%s\"", string); } - -/* This is a helper-function to print the d_path without using a static - * buffer or allocating another buffer in addition to the one in - * audit_buffer. */ +/* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, struct dentry *dentry, struct vfsmount *vfsmnt) { - char *p; - int len, avail; + char *p, *path; - if (prefix) audit_log_format(ab, " %s", prefix); - - if (ab->len > 128) - audit_log_move(ab); - avail = sizeof(ab->tmp) - ab->len; - p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail); - if (IS_ERR(p)) { - /* FIXME: can we save some information here? */ - audit_log_format(ab, "<toolong>"); - } else { - /* path isn't at start of buffer */ - len = (ab->tmp + sizeof(ab->tmp) - 1) - p; - memmove(ab->tmp + ab->len, p, len); - ab->len += len; - ab->total += len; - } -} - -/* Remove queued messages from the audit_txlist and send them to userspace. */ -static void audit_tasklet_handler(unsigned long arg) -{ - LIST_HEAD(list); - struct audit_buffer *ab; - unsigned long flags; + if (prefix) + audit_log_format(ab, " %s", prefix); - spin_lock_irqsave(&audit_txlist_lock, flags); - list_splice_init(&audit_txlist, &list); - spin_unlock_irqrestore(&audit_txlist_lock, flags); - - while (!list_empty(&list)) { - ab = list_entry(list.next, struct audit_buffer, list); - list_del(&ab->list); - audit_log_end_fast(ab); + /* We will allow 11 spaces for ' (deleted)' to be appended */ + path = kmalloc(PATH_MAX+11, GFP_KERNEL); + if (!path) { + audit_log_format(ab, "<no memory>"); + return; } + p = d_path(dentry, vfsmnt, path, PATH_MAX+11); + if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ + /* FIXME: can we save some information here? */ + audit_log_format(ab, "<too long>"); + } else + audit_log_untrustedstring(ab, p); + kfree(path); } -static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0); - /* The netlink_* functions cannot be called inside an irq context, so * the audit buffer is places on a queue and a tasklet is scheduled to * remove them from the queue outside the irq context. May be called in * any context. */ -static void audit_log_end_irq(struct audit_buffer *ab) -{ - unsigned long flags; - - if (!ab) - return; - spin_lock_irqsave(&audit_txlist_lock, flags); - list_add_tail(&ab->list, &audit_txlist); - spin_unlock_irqrestore(&audit_txlist_lock, flags); - - tasklet_schedule(&audit_tasklet); -} - -/* Send the message in the audit buffer directly to user space. May not - * be called in an irq context. */ -static void audit_log_end_fast(struct audit_buffer *ab) +void audit_log_end(struct audit_buffer *ab) { - unsigned long flags; - - BUG_ON(in_irq()); if (!ab) return; if (!audit_rate_check()) { audit_log_lost("rate limit exceeded"); } else { - audit_log_move(ab); - if (audit_log_drain(ab)) - return; + if (audit_pid) { + struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data; + nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); + skb_queue_tail(&audit_skb_queue, ab->skb); + ab->skb = NULL; + wake_up_interruptible(&kauditd_wait); + } else { + printk("%s\n", ab->skb->data + NLMSG_SPACE(0)); + } } - - atomic_dec(&audit_backlog); - spin_lock_irqsave(&audit_freelist_lock, flags); - if (++audit_freelist_count > AUDIT_MAXFREE) - kfree(ab); - else - list_add(&ab->list, &audit_freelist); - spin_unlock_irqrestore(&audit_freelist_lock, flags); -} - -/* Send or queue the message in the audit buffer, depending on the - * current context. (A convenience function that may be called in any - * context.) */ -void audit_log_end(struct audit_buffer *ab) -{ - if (in_irq()) - audit_log_end_irq(ab); - else - audit_log_end_fast(ab); + audit_buffer_free(ab); } /* Log an audit record. This is a convenience function that calls * audit_log_start, audit_log_vformat, and audit_log_end. It may be * called in any context. */ -void audit_log(struct audit_context *ctx, const char *fmt, ...) +void audit_log(struct audit_context *ctx, int type, const char *fmt, ...) { struct audit_buffer *ab; va_list args; - ab = audit_log_start(ctx); + ab = audit_log_start(ctx, type); if (ab) { va_start(args, fmt); audit_log_vformat(ab, fmt, args); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 37b3ac9..e75f84e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -34,7 +34,8 @@ #include <asm/types.h> #include <linux/mm.h> #include <linux/module.h> - +#include <linux/mount.h> +#include <linux/socket.h> #include <linux/audit.h> #include <linux/personality.h> #include <linux/time.h> @@ -112,6 +113,23 @@ struct audit_aux_data_ipcctl { mode_t mode; }; +struct audit_aux_data_socketcall { + struct audit_aux_data d; + int nargs; + unsigned long args[0]; +}; + +struct audit_aux_data_sockaddr { + struct audit_aux_data d; + int len; + char a[0]; +}; + +struct audit_aux_data_path { + struct audit_aux_data d; + struct dentry *dentry; + struct vfsmount *mnt; +}; /* The per-task audit context. */ struct audit_context { @@ -127,6 +145,8 @@ struct audit_context { int auditable; /* 1 if record should be written */ int name_count; struct audit_names names[AUDIT_NAMES]; + struct dentry * pwd; + struct vfsmount * pwdmnt; struct audit_context *previous; /* For nested syscalls */ struct audit_aux_data *aux; @@ -157,6 +177,8 @@ struct audit_entry { struct audit_rule rule; }; +extern int audit_pid; + /* Check to see if two rules are identical. It is called from * audit_del_rule during AUDIT_DEL. */ static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) @@ -226,7 +248,6 @@ static inline int audit_del_rule(struct audit_rule *rule, return -EFAULT; /* No matching rule */ } -#ifdef CONFIG_NET /* Copy rule from user-space to kernel-space. Called during * AUDIT_ADD. */ static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) @@ -287,7 +308,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_add_rule(entry, &audit_entlist); if (!err && (flags & AUDIT_AT_EXIT)) err = audit_add_rule(entry, &audit_extlist); - audit_log(NULL, "auid %u added an audit rule\n", loginuid); + audit_log(NULL, AUDIT_CONFIG_CHANGE, + "auid=%u added an audit rule\n", loginuid); break; case AUDIT_DEL: flags =((struct audit_rule *)data)->flags; @@ -297,7 +319,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_del_rule(data, &audit_entlist); if (!err && (flags & AUDIT_AT_EXIT)) err = audit_del_rule(data, &audit_extlist); - audit_log(NULL, "auid %u removed an audit rule\n", loginuid); + audit_log(NULL, AUDIT_CONFIG_CHANGE, + "auid=%u removed an audit rule\n", loginuid); break; default: return -EINVAL; @@ -305,7 +328,6 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, return err; } -#endif /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. */ @@ -444,7 +466,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) /* At syscall entry and exit time, this filter is called if the * audit_state is not low enough that auditing cannot take place, but is - * also not high enough that we already know we have to write and audit + * also not high enough that we already know we have to write an audit * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). */ static enum audit_state audit_filter_syscall(struct task_struct *tsk, @@ -532,6 +554,12 @@ static inline void audit_free_names(struct audit_context *context) if (context->names[i].name) __putname(context->names[i].name); context->name_count = 0; + if (context->pwd) + dput(context->pwd); + if (context->pwdmnt) + mntput(context->pwdmnt); + context->pwd = NULL; + context->pwdmnt = NULL; } static inline void audit_free_aux(struct audit_context *context) @@ -539,6 +567,11 @@ static inline void audit_free_aux(struct audit_context *context) struct audit_aux_data *aux; while ((aux = context->aux)) { + if (aux->type == AUDIT_AVC_PATH) { + struct audit_aux_data_path *axi = (void *)aux; + dput(axi->dentry); + mntput(axi->mnt); + } context->aux = aux->next; kfree(aux); } @@ -625,7 +658,8 @@ static void audit_log_task_info(struct audit_buffer *ab) struct vm_area_struct *vma; get_task_comm(name, current); - audit_log_format(ab, " comm=%s", name); + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, name); if (!mm) return; @@ -649,23 +683,24 @@ static void audit_log_exit(struct audit_context *context) { int i; struct audit_buffer *ab; + struct audit_aux_data *aux; - ab = audit_log_start(context); + ab = audit_log_start(context, AUDIT_SYSCALL); if (!ab) return; /* audit_panic has been called */ - audit_log_format(ab, "syscall=%d", context->major); + audit_log_format(ab, "arch=%x syscall=%d", + context->arch, context->major); if (context->personality != PER_LINUX) audit_log_format(ab, " per=%lx", context->personality); - audit_log_format(ab, " arch=%x", context->arch); if (context->return_valid) audit_log_format(ab, " success=%s exit=%ld", (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" - " pid=%d loginuid=%d uid=%d gid=%d" - " euid=%d suid=%d fsuid=%d" - " egid=%d sgid=%d fsgid=%d", + " pid=%d auid=%u uid=%u gid=%u" + " euid=%u suid=%u fsuid=%u" + " egid=%u sgid=%u fsgid=%u", context->argv[0], context->argv[1], context->argv[2], @@ -679,33 +714,57 @@ static void audit_log_exit(struct audit_context *context) context->egid, context->sgid, context->fsgid); audit_log_task_info(ab); audit_log_end(ab); - while (context->aux) { - struct audit_aux_data *aux; - ab = audit_log_start(context); + for (aux = context->aux; aux; aux = aux->next) { + + ab = audit_log_start(context, aux->type); if (!ab) continue; /* audit_panic has been called */ - aux = context->aux; - context->aux = aux->next; - - audit_log_format(ab, "auxitem=%d", aux->type); switch (aux->type) { - case AUDIT_AUX_IPCPERM: { + case AUDIT_IPC: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - " qbytes=%lx uid=%d gid=%d mode=%x", + " qbytes=%lx iuid=%u igid=%u mode=%x", axi->qbytes, axi->uid, axi->gid, axi->mode); - } + break; } + + case AUDIT_SOCKETCALL: { + int i; + struct audit_aux_data_socketcall *axs = (void *)aux; + audit_log_format(ab, "nargs=%d", axs->nargs); + for (i=0; i<axs->nargs; i++) + audit_log_format(ab, " a%d=%lx", i, axs->args[i]); + break; } + + case AUDIT_SOCKADDR: { + struct audit_aux_data_sockaddr *axs = (void *)aux; + + audit_log_format(ab, "saddr="); + audit_log_hex(ab, axs->a, axs->len); + break; } + + case AUDIT_AVC_PATH: { + struct audit_aux_data_path *axi = (void *)aux; + audit_log_d_path(ab, "path=", axi->dentry, axi->mnt); + break; } + } audit_log_end(ab); - kfree(aux); } + if (context->pwd && context->pwdmnt) { + ab = audit_log_start(context, AUDIT_CWD); + if (ab) { + audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); + audit_log_end(ab); + } + } for (i = 0; i < context->name_count; i++) { - ab = audit_log_start(context); + ab = audit_log_start(context, AUDIT_PATH); if (!ab) continue; /* audit_panic has been called */ + audit_log_format(ab, "item=%d", i); if (context->names[i].name) { audit_log_format(ab, " name="); @@ -713,7 +772,7 @@ static void audit_log_exit(struct audit_context *context) } if (context->names[i].ino != (unsigned long)-1) audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" - " uid=%d gid=%d rdev=%02x:%02x", + " ouid=%u ogid=%u rdev=%02x:%02x", context->names[i].ino, MAJOR(context->names[i].dev), MINOR(context->names[i].dev), @@ -741,42 +800,12 @@ void audit_free(struct task_struct *tsk) /* Check for system calls that do not go through the exit * function (e.g., exit_group), then free context block. */ - if (context->in_syscall && context->auditable) + if (context->in_syscall && context->auditable && context->pid != audit_pid) audit_log_exit(context); audit_free_context(context); } -/* Compute a serial number for the audit record. Audit records are - * written to user-space as soon as they are generated, so a complete - * audit record may be written in several pieces. The timestamp of the - * record and this serial number are used by the user-space daemon to - * determine which pieces belong to the same audit record. The - * (timestamp,serial) tuple is unique for each syscall and is live from - * syscall entry to syscall exit. - * - * Atomic values are only guaranteed to be 24-bit, so we count down. - * - * NOTE: Another possibility is to store the formatted records off the - * audit context (for those records that have a context), and emit them - * all at syscall exit. However, this could delay the reporting of - * significant errors until syscall exit (or never, if the system - * halts). */ -static inline unsigned int audit_serial(void) -{ - static atomic_t serial = ATOMIC_INIT(0xffffff); - unsigned int a, b; - - do { - a = atomic_read(&serial); - if (atomic_dec_and_test(&serial)) - atomic_set(&serial, 0xffffff); - b = atomic_read(&serial); - } while (b != a - 1); - - return 0xffffff - b; -} - /* Fill in audit context at syscall entry. This only happens if the * audit context was created when the task was created and the state or * filters demand the audit context be built. If the state from the @@ -876,7 +905,7 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) if (likely(!context)) return; - if (context->in_syscall && context->auditable) + if (context->in_syscall && context->auditable && context->pid != audit_pid) audit_log_exit(context); context->in_syscall = 0; @@ -916,6 +945,13 @@ void audit_getname(const char *name) context->names[context->name_count].name = name; context->names[context->name_count].ino = (unsigned long)-1; ++context->name_count; + if (!context->pwd) { + read_lock(¤t->fs->lock); + context->pwd = dget(current->fs->pwd); + context->pwdmnt = mntget(current->fs->pwdmnt); + read_unlock(¤t->fs->lock); + } + } /* Intercept a putname request. Called from @@ -994,34 +1030,26 @@ void audit_inode(const char *name, const struct inode *inode) context->names[idx].rdev = inode->i_rdev; } -void audit_get_stamp(struct audit_context *ctx, - struct timespec *t, unsigned int *serial) +void auditsc_get_stamp(struct audit_context *ctx, + struct timespec *t, unsigned int *serial) { - if (ctx) { - t->tv_sec = ctx->ctime.tv_sec; - t->tv_nsec = ctx->ctime.tv_nsec; - *serial = ctx->serial; - ctx->auditable = 1; - } else { - *t = CURRENT_TIME; - *serial = 0; - } + t->tv_sec = ctx->ctime.tv_sec; + t->tv_nsec = ctx->ctime.tv_nsec; + *serial = ctx->serial; + ctx->auditable = 1; } -extern int audit_set_type(struct audit_buffer *ab, int type); - int audit_set_loginuid(struct task_struct *task, uid_t loginuid) { if (task->audit_context) { struct audit_buffer *ab; - ab = audit_log_start(NULL); + ab = audit_log_start(NULL, AUDIT_LOGIN); if (ab) { audit_log_format(ab, "login pid=%d uid=%u " - "old loginuid=%u new loginuid=%u", + "old auid=%u new auid=%u", task->pid, task->uid, task->audit_context->loginuid, loginuid); - audit_set_type(ab, AUDIT_LOGIN); audit_log_end(ab); } task->audit_context->loginuid = loginuid; @@ -1051,8 +1079,89 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) ax->gid = gid; ax->mode = mode; - ax->d.type = AUDIT_AUX_IPCPERM; + ax->d.type = AUDIT_IPC; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +int audit_socketcall(int nargs, unsigned long *args) +{ + struct audit_aux_data_socketcall *ax; + struct audit_context *context = current->audit_context; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL); + if (!ax) + return -ENOMEM; + + ax->nargs = nargs; + memcpy(ax->args, args, nargs * sizeof(unsigned long)); + + ax->d.type = AUDIT_SOCKETCALL; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +int audit_sockaddr(int len, void *a) +{ + struct audit_aux_data_sockaddr *ax; + struct audit_context *context = current->audit_context; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); + if (!ax) + return -ENOMEM; + + ax->len = len; + memcpy(ax->a, a, len); + + ax->d.type = AUDIT_SOCKADDR; ax->d.next = context->aux; context->aux = (void *)ax; return 0; } + +int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) +{ + struct audit_aux_data_path *ax; + struct audit_context *context = current->audit_context; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + ax->dentry = dget(dentry); + ax->mnt = mntget(mnt); + + ax->d.type = AUDIT_AVC_PATH; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +void audit_signal_info(int sig, struct task_struct *t) +{ + extern pid_t audit_sig_pid; + extern uid_t audit_sig_uid; + + if (unlikely(audit_pid && t->pid == audit_pid)) { + if (sig == SIGTERM || sig == SIGHUP) { + struct audit_context *ctx = current->audit_context; + audit_sig_pid = current->pid; + if (ctx) + audit_sig_uid = ctx->loginuid; + else + audit_sig_uid = current->uid; + } + } +} + diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 961d740..00e8f25 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -166,9 +166,8 @@ static struct super_block *cpuset_sb = NULL; * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't * (usually) grab cpuset_sem. These are the two most performance * critical pieces of code here. The exception occurs on exit(), - * if the last task using a cpuset exits, and the cpuset was marked - * notify_on_release. In that case, the cpuset_sem is taken, the - * path to the released cpuset calculated, and a usermode call made + * when a task in a notify_on_release cpuset exits. Then cpuset_sem + * is taken, and if the cpuset count is zero, a usermode call made * to /sbin/cpuset_release_agent with the name of the cpuset (path * relative to the root of cpuset file system) as the argument. * @@ -1404,6 +1403,18 @@ void cpuset_fork(struct task_struct *tsk) * * Description: Detach cpuset from @tsk and release it. * + * Note that cpusets marked notify_on_release force every task + * in them to take the global cpuset_sem semaphore when exiting. + * This could impact scaling on very large systems. Be reluctant + * to use notify_on_release cpusets where very high task exit + * scaling is required on large systems. + * + * Don't even think about derefencing 'cs' after the cpuset use + * count goes to zero, except inside a critical section guarded + * by the cpuset_sem semaphore. If you don't hold cpuset_sem, + * then a zero cpuset use count is a license to any other task to + * nuke the cpuset immediately. + * **/ void cpuset_exit(struct task_struct *tsk) @@ -1415,10 +1426,13 @@ void cpuset_exit(struct task_struct *tsk) tsk->cpuset = NULL; task_unlock(tsk); - if (atomic_dec_and_test(&cs->count)) { + if (notify_on_release(cs)) { down(&cpuset_sem); - check_for_release(cs); + if (atomic_dec_and_test(&cs->count)) + check_for_release(cs); up(&cpuset_sem); + } else { + atomic_dec(&cs->count); } } diff --git a/kernel/exit.c b/kernel/exit.c index edaa50b..2ef2ad5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -811,10 +811,8 @@ fastcall NORET_TYPE void do_exit(long code) acct_update_integrals(tsk); update_mem_hiwater(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); - if (group_dead) { - del_timer_sync(&tsk->signal->real_timer); + if (group_dead) acct_process(code); - } exit_mm(tsk); exit_sem(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index f42a17f..a28d11e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -194,6 +194,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) mm->mmap = NULL; mm->mmap_cache = NULL; mm->free_area_cache = oldmm->mmap_base; + mm->cached_hole_size = ~0UL; mm->map_count = 0; set_mm_counter(mm, rss, 0); set_mm_counter(mm, anon_rss, 0); @@ -249,8 +250,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) /* * Link in the new vma and copy the page table entries: - * link in first so that swapoff can see swap entries, - * and try_to_unmap_one's find_vma find the new vma. + * link in first so that swapoff can see swap entries. + * Note that, exceptionally, here the vma is inserted + * without holding mm->mmap_sem. */ spin_lock(&mm->page_table_lock); *pprev = tmp; @@ -322,6 +324,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm) mm->ioctx_list = NULL; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->cached_hole_size = ~0UL; if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 2fb0e46..436c7d9 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -30,6 +30,7 @@ */ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { + .status = IRQ_DISABLED, .handler = &no_irq_type, .lock = SPIN_LOCK_UNLOCKED } @@ -118,8 +119,6 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) */ desc->handler->ack(irq); action_ret = handle_IRQ_event(irq, regs, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); desc->handler->end(irq); return 1; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5202e4c..ac67009 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -6,6 +6,7 @@ * This file contains driver APIs to the irq subsystem. */ +#include <linux/config.h> #include <linux/irq.h> #include <linux/module.h> #include <linux/random.h> @@ -255,6 +256,13 @@ void free_irq(unsigned int irq, void *dev_id) /* Found it - now remove it from the list of entries */ *pp = action->next; + + /* Currently used only by UML, might disappear one day.*/ +#ifdef CONFIG_IRQ_RELEASE_METHOD + if (desc->handler->release) + desc->handler->release(irq, dev_id); +#endif + if (!desc->action) { desc->status |= IRQ_DISABLED; if (desc->handler->shutdown) diff --git a/kernel/module.c b/kernel/module.c index 5734ab0..a566745 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -379,7 +379,7 @@ static void module_unload_init(struct module *mod) for (i = 0; i < NR_CPUS; i++) local_set(&mod->ref[i].count, 0); /* Hold reference count during initialization. */ - local_set(&mod->ref[_smp_processor_id()].count, 1); + local_set(&mod->ref[raw_smp_processor_id()].count, 1); /* Backwards compatibility macros put refcount during init. */ mod->waiter = current; } @@ -1758,6 +1758,7 @@ sys_init_module(void __user *umod, const char __user *uargs) { struct module *mod; + mm_segment_t old_fs = get_fs(); int ret = 0; /* Must have permission */ @@ -1775,6 +1776,9 @@ sys_init_module(void __user *umod, return PTR_ERR(mod); } + /* flush the icache in correct context */ + set_fs(KERNEL_DS); + /* Flush the instruction cache, since we've played with text */ if (mod->module_init) flush_icache_range((unsigned long)mod->module_init, @@ -1783,6 +1787,8 @@ sys_init_module(void __user *umod, flush_icache_range((unsigned long)mod->module_core, (unsigned long)mod->module_core + mod->core_size); + set_fs(old_fs); + /* Now sew it into the lists. They won't access us, since strong_try_module_get() will fail. */ stop_machine_run(__link_module, mod, NR_CPUS); diff --git a/kernel/params.c b/kernel/params.c index 5513844..d586c35ef 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -629,7 +629,7 @@ static ssize_t module_attr_show(struct kobject *kobj, mk = to_module_kobject(kobj); if (!attribute->show) - return -EPERM; + return -EIO; if (!try_module_get(mk->mod)) return -ENODEV; @@ -653,7 +653,7 @@ static ssize_t module_attr_store(struct kobject *kobj, mk = to_module_kobject(kobj); if (!attribute->store) - return -EPERM; + return -EIO; if (!try_module_get(mk->mod)) return -ENODEV; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index fd316c2..cabb63f 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -1197,6 +1197,7 @@ void exit_itimers(struct signal_struct *sig) tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); itimer_delete(tmr); } + del_timer_sync(&sig->real_timer); } /* diff --git a/kernel/power/main.c b/kernel/power/main.c index 7960ddf..4cdebc9 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -156,14 +156,14 @@ static int enter_state(suspend_state_t state) goto Unlock; } - pr_debug("PM: Preparing system for suspend\n"); + pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); if ((error = suspend_prepare(state))) goto Unlock; - pr_debug("PM: Entering state.\n"); + pr_debug("PM: Entering %s sleep\n", pm_states[state]); error = suspend_enter(state); - pr_debug("PM: Finishing up.\n"); + pr_debug("PM: Finishing wakeup.\n"); suspend_finish(state); Unlock: up(&pm_sem); diff --git a/kernel/power/smp.c b/kernel/power/smp.c index cba3584b..457c230 100644 --- a/kernel/power/smp.c +++ b/kernel/power/smp.c @@ -48,11 +48,11 @@ void disable_nonboot_cpus(void) { oldmask = current->cpus_allowed; set_cpus_allowed(current, cpumask_of_cpu(0)); - printk("Freezing CPUs (at %d)", _smp_processor_id()); + printk("Freezing CPUs (at %d)", raw_smp_processor_id()); current->state = TASK_INTERRUPTIBLE; schedule_timeout(HZ); printk("..."); - BUG_ON(_smp_processor_id() != 0); + BUG_ON(raw_smp_processor_id() != 0); /* FIXME: for this to work, all the CPUs must be running * "idle" thread (or we deadlock). Is that guaranteed? */ diff --git a/kernel/printk.c b/kernel/printk.c index 290a07c..01b58d7 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -160,42 +160,6 @@ static int __init console_setup(char *str) __setup("console=", console_setup); -/** - * add_preferred_console - add a device to the list of preferred consoles. - * - * The last preferred console added will be used for kernel messages - * and stdin/out/err for init. Normally this is used by console_setup - * above to handle user-supplied console arguments; however it can also - * be used by arch-specific code either to override the user or more - * commonly to provide a default console (ie from PROM variables) when - * the user has not supplied one. - */ -int __init add_preferred_console(char *name, int idx, char *options) -{ - struct console_cmdline *c; - int i; - - /* - * See if this tty is not yet registered, and - * if we have a slot free. - */ - for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) - if (strcmp(console_cmdline[i].name, name) == 0 && - console_cmdline[i].index == idx) { - selected_console = i; - return 0; - } - if (i == MAX_CMDLINECONSOLES) - return -E2BIG; - selected_console = i; - c = &console_cmdline[i]; - memcpy(c->name, name, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; - c->options = options; - c->index = idx; - return 0; -} - static int __init log_buf_len_setup(char *str) { unsigned long size = memparse(str, &str); @@ -671,6 +635,42 @@ static void call_console_drivers(unsigned long start, unsigned long end) {} #endif /** + * add_preferred_console - add a device to the list of preferred consoles. + * + * The last preferred console added will be used for kernel messages + * and stdin/out/err for init. Normally this is used by console_setup + * above to handle user-supplied console arguments; however it can also + * be used by arch-specific code either to override the user or more + * commonly to provide a default console (ie from PROM variables) when + * the user has not supplied one. + */ +int __init add_preferred_console(char *name, int idx, char *options) +{ + struct console_cmdline *c; + int i; + + /* + * See if this tty is not yet registered, and + * if we have a slot free. + */ + for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) + if (strcmp(console_cmdline[i].name, name) == 0 && + console_cmdline[i].index == idx) { + selected_console = i; + return 0; + } + if (i == MAX_CMDLINECONSOLES) + return -E2BIG; + selected_console = i; + c = &console_cmdline[i]; + memcpy(c->name, name, sizeof(c->name)); + c->name[sizeof(c->name) - 1] = 0; + c->options = options; + c->index = idx; + return 0; +} + +/** * acquire_console_sem - lock the console system for exclusive use. * * Acquires a semaphore which guarantees that the caller has diff --git a/kernel/profile.c b/kernel/profile.c index 0221a50..ad8cbb7 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -49,15 +49,19 @@ static DECLARE_MUTEX(profile_flip_mutex); static int __init profile_setup(char * str) { + static char __initdata schedstr[] = "schedule"; int par; - if (!strncmp(str, "schedule", 8)) { + if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; - printk(KERN_INFO "kernel schedule profiling enabled\n"); - if (str[7] == ',') - str += 8; - } - if (get_option(&str,&par)) { + if (str[strlen(schedstr)] == ',') + str += strlen(schedstr) + 1; + if (get_option(&str, &par)) + prof_shift = par; + printk(KERN_INFO + "kernel schedule profiling enabled (shift: %ld)\n", + prof_shift); + } else if (get_option(&str, &par)) { prof_shift = par; prof_on = CPU_PROFILING; printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", diff --git a/kernel/sched.c b/kernel/sched.c index 0dc3158..deca041 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3755,19 +3755,22 @@ EXPORT_SYMBOL(cond_resched); */ int cond_resched_lock(spinlock_t * lock) { + int ret = 0; + if (need_lockbreak(lock)) { spin_unlock(lock); cpu_relax(); + ret = 1; spin_lock(lock); } if (need_resched()) { _raw_spin_unlock(lock); preempt_enable_no_resched(); __cond_resched(); + ret = 1; spin_lock(lock); - return 1; } - return 0; + return ret; } EXPORT_SYMBOL(cond_resched_lock); @@ -3811,7 +3814,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); + struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); atomic_inc(&rq->nr_iowait); schedule(); @@ -3822,7 +3825,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); + struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); long ret; atomic_inc(&rq->nr_iowait); @@ -4243,7 +4246,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) /* No more Mr. Nice Guy. */ if (dest_cpu == NR_CPUS) { - tsk->cpus_allowed = cpuset_cpus_allowed(tsk); + cpus_setall(tsk->cpus_allowed); dest_cpu = any_online_cpu(tsk->cpus_allowed); /* diff --git a/kernel/signal.c b/kernel/signal.c index 8f3debc..c89821b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -24,6 +24,7 @@ #include <linux/ptrace.h> #include <linux/posix-timers.h> #include <linux/signal.h> +#include <linux/audit.h> #include <asm/param.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -522,7 +523,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, { int sig = 0; - sig = next_signal(pending, mask); + /* SIGKILL must have priority, otherwise it is quite easy + * to create an unkillable process, sending sig < SIGKILL + * to self */ + if (unlikely(sigismember(&pending->signal, SIGKILL))) { + if (!sigismember(mask, SIGKILL)) + sig = SIGKILL; + } + + if (likely(!sig)) + sig = next_signal(pending, mask); if (sig) { if (current->notifier) { if (sigismember(current->notifier_mask, sig)) { @@ -658,7 +668,11 @@ static int check_kill_permission(int sig, struct siginfo *info, && (current->uid ^ t->suid) && (current->uid ^ t->uid) && !capable(CAP_KILL)) return error; - return security_task_kill(t, info, sig); + + error = security_task_kill(t, info, sig); + if (!error) + audit_signal_info(sig, t); /* Let audit system see the signal */ + return error; } /* forward decl */ diff --git a/kernel/spinlock.c b/kernel/spinlock.c index e15ed17..0c3f9d8 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -294,7 +294,7 @@ EXPORT_SYMBOL(_spin_unlock_irq); void __lockfunc _spin_unlock_bh(spinlock_t *lock) { _raw_spin_unlock(lock); - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); } EXPORT_SYMBOL(_spin_unlock_bh); @@ -318,7 +318,7 @@ EXPORT_SYMBOL(_read_unlock_irq); void __lockfunc _read_unlock_bh(rwlock_t *lock) { _raw_read_unlock(lock); - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); } EXPORT_SYMBOL(_read_unlock_bh); @@ -342,7 +342,7 @@ EXPORT_SYMBOL(_write_unlock_irq); void __lockfunc _write_unlock_bh(rwlock_t *lock) { _raw_write_unlock(lock); - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); } EXPORT_SYMBOL(_write_unlock_bh); @@ -354,7 +354,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock) if (_raw_spin_trylock(lock)) return 1; - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); return 0; } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 6116b25..84a9d18 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -100,7 +100,7 @@ static int stop_machine(void) stopmachine_state = STOPMACHINE_WAIT; for_each_online_cpu(i) { - if (i == _smp_processor_id()) + if (i == raw_smp_processor_id()) continue; ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); if (ret < 0) @@ -182,7 +182,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, /* If they don't care which CPU fn runs on, bind to any online one. */ if (cpu == NR_CPUS) - cpu = _smp_processor_id(); + cpu = raw_smp_processor_id(); p = kthread_create(do_stop, &smdata, "kstopmachine"); if (!IS_ERR(p)) { diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0dda70e..6f15bea 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -77,6 +77,7 @@ cond_syscall(sys_request_key); cond_syscall(sys_keyctl); cond_syscall(compat_sys_keyctl); cond_syscall(compat_sys_socketcall); +cond_syscall(sys_set_zone_reclaim); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); |