summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c587
-rw-r--r--kernel/auditsc.c259
-rw-r--r--kernel/cpuset.c24
-rw-r--r--kernel/exit.c4
-rw-r--r--kernel/fork.c7
-rw-r--r--kernel/irq/handle.c3
-rw-r--r--kernel/irq/manage.c8
-rw-r--r--kernel/module.c8
-rw-r--r--kernel/params.c4
-rw-r--r--kernel/posix-timers.c1
-rw-r--r--kernel/power/main.c6
-rw-r--r--kernel/power/smp.c4
-rw-r--r--kernel/printk.c72
-rw-r--r--kernel/profile.c16
-rw-r--r--kernel/sched.c13
-rw-r--r--kernel/signal.c18
-rw-r--r--kernel/spinlock.c8
-rw-r--r--kernel/stop_machine.c4
-rw-r--r--kernel/sys_ni.c1
19 files changed, 604 insertions, 443 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 9c4f1af..ef35166 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -46,6 +46,8 @@
#include <asm/types.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
#include <linux/audit.h>
@@ -68,7 +70,7 @@ static int audit_failure = AUDIT_FAIL_PRINTK;
/* If audit records are to be written to the netlink socket, audit_pid
* contains the (non-zero) pid. */
-static int audit_pid;
+int audit_pid;
/* If audit_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
@@ -77,7 +79,10 @@ static int audit_rate_limit;
/* Number of outstanding audit_buffers allowed. */
static int audit_backlog_limit = 64;
-static atomic_t audit_backlog = ATOMIC_INIT(0);
+
+/* The identity of the user shutting down the audit system. */
+uid_t audit_sig_uid = -1;
+pid_t audit_sig_pid = -1;
/* Records can be lost in several ways:
0) [suppressed in audit_alloc]
@@ -91,19 +96,17 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
/* The netlink socket. */
static struct sock *audit_sock;
-/* There are two lists of audit buffers. The txlist contains audit
- * buffers that cannot be sent immediately to the netlink device because
- * we are in an irq context (these are sent later in a tasklet).
- *
- * The second list is a list of pre-allocated audit buffers (if more
+/* The audit_freelist is a list of pre-allocated audit buffers (if more
* than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
* being placed on the freelist). */
-static DEFINE_SPINLOCK(audit_txlist_lock);
static DEFINE_SPINLOCK(audit_freelist_lock);
static int audit_freelist_count = 0;
-static LIST_HEAD(audit_txlist);
static LIST_HEAD(audit_freelist);
+static struct sk_buff_head audit_skb_queue;
+static struct task_struct *kauditd_task;
+static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
+
/* There are three lists of rules -- one to search at task creation
* time, one to search at syscall entry time, and another to search at
* syscall exit time. */
@@ -112,7 +115,7 @@ static LIST_HEAD(audit_entlist);
static LIST_HEAD(audit_extlist);
/* The netlink socket is only to be read by 1 CPU, which lets us assume
- * that list additions and deletions never happen simultaneiously in
+ * that list additions and deletions never happen simultaneously in
* auditsc.c */
static DECLARE_MUTEX(audit_netlink_sem);
@@ -132,21 +135,14 @@ static DECLARE_MUTEX(audit_netlink_sem);
* use simultaneously. */
struct audit_buffer {
struct list_head list;
- struct sk_buff_head sklist; /* formatted skbs ready to send */
+ struct sk_buff *skb; /* formatted skb ready to send */
struct audit_context *ctx; /* NULL or associated context */
- int len; /* used area of tmp */
- char tmp[AUDIT_BUFSIZ];
-
- /* Pointer to header and contents */
- struct nlmsghdr *nlh;
- int total;
- int type;
- int pid;
};
-void audit_set_type(struct audit_buffer *ab, int type)
+static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
{
- ab->type = type;
+ struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
+ nlh->nlmsg_pid = pid;
}
struct audit_entry {
@@ -154,9 +150,6 @@ struct audit_entry {
struct audit_rule rule;
};
-static void audit_log_end_irq(struct audit_buffer *ab);
-static void audit_log_end_fast(struct audit_buffer *ab);
-
static void audit_panic(const char *message)
{
switch (audit_failure)
@@ -227,10 +220,8 @@ void audit_log_lost(const char *message)
if (print) {
printk(KERN_WARNING
- "audit: audit_lost=%d audit_backlog=%d"
- " audit_rate_limit=%d audit_backlog_limit=%d\n",
+ "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n",
atomic_read(&audit_lost),
- atomic_read(&audit_backlog),
audit_rate_limit,
audit_backlog_limit);
audit_panic(message);
@@ -242,7 +233,8 @@ static int audit_set_rate_limit(int limit, uid_t loginuid)
{
int old = audit_rate_limit;
audit_rate_limit = limit;
- audit_log(NULL, "audit_rate_limit=%d old=%d by auid %u",
+ audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ "audit_rate_limit=%d old=%d by auid=%u",
audit_rate_limit, old, loginuid);
return old;
}
@@ -251,7 +243,8 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid)
{
int old = audit_backlog_limit;
audit_backlog_limit = limit;
- audit_log(NULL, "audit_backlog_limit=%d old=%d by auid %u",
+ audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ "audit_backlog_limit=%d old=%d by auid=%u",
audit_backlog_limit, old, loginuid);
return old;
}
@@ -262,8 +255,9 @@ static int audit_set_enabled(int state, uid_t loginuid)
if (state != 0 && state != 1)
return -EINVAL;
audit_enabled = state;
- audit_log(NULL, "audit_enabled=%d old=%d by auid %u",
- audit_enabled, old, loginuid);
+ audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ "audit_enabled=%d old=%d by auid=%u",
+ audit_enabled, old, loginuid);
return old;
}
@@ -275,12 +269,44 @@ static int audit_set_failure(int state, uid_t loginuid)
&& state != AUDIT_FAIL_PANIC)
return -EINVAL;
audit_failure = state;
- audit_log(NULL, "audit_failure=%d old=%d by auid %u",
- audit_failure, old, loginuid);
+ audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ "audit_failure=%d old=%d by auid=%u",
+ audit_failure, old, loginuid);
return old;
}
-#ifdef CONFIG_NET
+int kauditd_thread(void *dummy)
+{
+ struct sk_buff *skb;
+
+ while (1) {
+ skb = skb_dequeue(&audit_skb_queue);
+ if (skb) {
+ if (audit_pid) {
+ int err = netlink_unicast(audit_sock, skb, audit_pid, 0);
+ if (err < 0) {
+ BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
+ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
+ audit_pid = 0;
+ }
+ } else {
+ printk(KERN_ERR "%s\n", skb->data + NLMSG_SPACE(0));
+ kfree_skb(skb);
+ }
+ } else {
+ DECLARE_WAITQUEUE(wait, current);
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&kauditd_wait, &wait);
+
+ if (!skb_queue_len(&audit_skb_queue))
+ schedule();
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&kauditd_wait, &wait);
+ }
+ }
+}
+
void audit_send_reply(int pid, int seq, int type, int done, int multi,
void *payload, int size)
{
@@ -293,13 +319,16 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
skb = alloc_skb(len, GFP_KERNEL);
if (!skb)
- goto nlmsg_failure;
+ return;
- nlh = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh));
+ nlh = NLMSG_PUT(skb, pid, seq, t, size);
nlh->nlmsg_flags = flags;
data = NLMSG_DATA(nlh);
memcpy(data, payload, size);
- netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT);
+
+ /* Ignore failure. It'll only happen if the sender goes away,
+ because our timeout is set to infinite. */
+ netlink_unicast(audit_sock, skb, pid, 0);
return;
nlmsg_failure: /* Used by NLMSG_PUT */
@@ -321,10 +350,12 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
case AUDIT_SET:
case AUDIT_ADD:
case AUDIT_DEL:
+ case AUDIT_SIGNAL_INFO:
if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
err = -EPERM;
break;
case AUDIT_USER:
+ case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
err = -EPERM;
break;
@@ -344,11 +375,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
uid_t loginuid; /* loginuid of sender */
+ struct audit_sig_info sig_data;
err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
if (err)
return err;
+ /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */
+ if (!kauditd_task)
+ kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
+ if (IS_ERR(kauditd_task)) {
+ err = PTR_ERR(kauditd_task);
+ kauditd_task = NULL;
+ return err;
+ }
+
pid = NETLINK_CREDS(skb)->pid;
uid = NETLINK_CREDS(skb)->uid;
loginuid = NETLINK_CB(skb).loginuid;
@@ -363,7 +404,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
status_set.rate_limit = audit_rate_limit;
status_set.backlog_limit = audit_backlog_limit;
status_set.lost = atomic_read(&audit_lost);
- status_set.backlog = atomic_read(&audit_backlog);
+ status_set.backlog = skb_queue_len(&audit_skb_queue);
audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
&status_set, sizeof(status_set));
break;
@@ -382,7 +423,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (status_get->mask & AUDIT_STATUS_PID) {
int old = audit_pid;
audit_pid = status_get->pid;
- audit_log(NULL, "audit_pid=%d old=%d by auid %u",
+ audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ "audit_pid=%d old=%d by auid=%u",
audit_pid, old, loginuid);
}
if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
@@ -392,18 +434,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
loginuid);
break;
case AUDIT_USER:
- ab = audit_log_start(NULL);
+ case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
+ ab = audit_log_start(NULL, msg_type);
if (!ab)
break; /* audit_panic has been called */
audit_log_format(ab,
- "user pid=%d uid=%d length=%d loginuid=%u"
+ "user pid=%d uid=%u auid=%u"
" msg='%.1024s'",
- pid, uid,
- (int)(nlh->nlmsg_len
- - ((char *)data - (char *)nlh)),
- loginuid, (char *)data);
- ab->type = AUDIT_USER;
- ab->pid = pid;
+ pid, uid, loginuid, (char *)data);
+ audit_set_pid(ab, pid);
audit_log_end(ab);
break;
case AUDIT_ADD:
@@ -412,12 +451,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EINVAL;
/* fallthrough */
case AUDIT_LIST:
-#ifdef CONFIG_AUDITSYSCALL
err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
uid, seq, data, loginuid);
-#else
- err = -EOPNOTSUPP;
-#endif
+ break;
+ case AUDIT_SIGNAL_INFO:
+ sig_data.uid = audit_sig_uid;
+ sig_data.pid = audit_sig_pid;
+ audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
+ 0, 0, &sig_data, sizeof(sig_data));
break;
default:
err = -EINVAL;
@@ -467,87 +508,6 @@ static void audit_receive(struct sock *sk, int length)
up(&audit_netlink_sem);
}
-/* Move data from tmp buffer into an skb. This is an extra copy, and
- * that is unfortunate. However, the copy will only occur when a record
- * is being written to user space, which is already a high-overhead
- * operation. (Elimination of the copy is possible, for example, by
- * writing directly into a pre-allocated skb, at the cost of wasting
- * memory. */
-static void audit_log_move(struct audit_buffer *ab)
-{
- struct sk_buff *skb;
- char *start;
- int extra = ab->nlh ? 0 : NLMSG_SPACE(0);
-
- /* possible resubmission */
- if (ab->len == 0)
- return;
-
- skb = skb_peek_tail(&ab->sklist);
- if (!skb || skb_tailroom(skb) <= ab->len + extra) {
- skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC);
- if (!skb) {
- ab->len = 0; /* Lose information in ab->tmp */
- audit_log_lost("out of memory in audit_log_move");
- return;
- }
- __skb_queue_tail(&ab->sklist, skb);
- if (!ab->nlh)
- ab->nlh = (struct nlmsghdr *)skb_put(skb,
- NLMSG_SPACE(0));
- }
- start = skb_put(skb, ab->len);
- memcpy(start, ab->tmp, ab->len);
- ab->len = 0;
-}
-
-/* Iterate over the skbuff in the audit_buffer, sending their contents
- * to user space. */
-static inline int audit_log_drain(struct audit_buffer *ab)
-{
- struct sk_buff *skb;
-
- while ((skb = skb_dequeue(&ab->sklist))) {
- int retval = 0;
-
- if (audit_pid) {
- if (ab->nlh) {
- ab->nlh->nlmsg_len = ab->total;
- ab->nlh->nlmsg_type = ab->type;
- ab->nlh->nlmsg_flags = 0;
- ab->nlh->nlmsg_seq = 0;
- ab->nlh->nlmsg_pid = ab->pid;
- }
- skb_get(skb); /* because netlink_* frees */
- retval = netlink_unicast(audit_sock, skb, audit_pid,
- MSG_DONTWAIT);
- }
- if (retval == -EAGAIN &&
- (atomic_read(&audit_backlog)) < audit_backlog_limit) {
- skb_queue_head(&ab->sklist, skb);
- audit_log_end_irq(ab);
- return 1;
- }
- if (retval < 0) {
- if (retval == -ECONNREFUSED) {
- printk(KERN_ERR
- "audit: *NO* daemon at audit_pid=%d\n",
- audit_pid);
- audit_pid = 0;
- } else
- audit_log_lost("netlink socket too busy");
- }
- if (!audit_pid) { /* No daemon */
- int offset = ab->nlh ? NLMSG_SPACE(0) : 0;
- int len = skb->len - offset;
- skb->data[offset + len] = '\0';
- printk(KERN_ERR "%s\n", skb->data + offset);
- }
- kfree_skb(skb);
- ab->nlh = NULL;
- }
- return 0;
-}
/* Initialize audit support at boot time. */
static int __init audit_init(void)
@@ -558,40 +518,13 @@ static int __init audit_init(void)
if (!audit_sock)
audit_panic("cannot initialize netlink socket");
+ audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ skb_queue_head_init(&audit_skb_queue);
audit_initialized = 1;
audit_enabled = audit_default;
- audit_log(NULL, "initialized");
- return 0;
-}
-
-#else
-/* Without CONFIG_NET, we have no skbuffs. For now, print what we have
- * in the buffer. */
-static void audit_log_move(struct audit_buffer *ab)
-{
- printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp);
- ab->len = 0;
-}
-
-static inline int audit_log_drain(struct audit_buffer *ab)
-{
- return 0;
-}
-
-/* Initialize audit support at boot time. */
-int __init audit_init(void)
-{
- printk(KERN_INFO "audit: initializing WITHOUT netlink support\n");
- audit_sock = NULL;
- audit_pid = 0;
-
- audit_initialized = 1;
- audit_enabled = audit_default;
- audit_log(NULL, "initialized");
+ audit_log(NULL, AUDIT_KERNEL, "initialized");
return 0;
}
-#endif
-
__initcall(audit_init);
/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
@@ -608,6 +541,102 @@ static int __init audit_enable(char *str)
__setup("audit=", audit_enable);
+static void audit_buffer_free(struct audit_buffer *ab)
+{
+ unsigned long flags;
+
+ if (!ab)
+ return;
+
+ if (ab->skb)
+ kfree_skb(ab->skb);
+
+ spin_lock_irqsave(&audit_freelist_lock, flags);
+ if (++audit_freelist_count > AUDIT_MAXFREE)
+ kfree(ab);
+ else
+ list_add(&ab->list, &audit_freelist);
+ spin_unlock_irqrestore(&audit_freelist_lock, flags);
+}
+
+static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
+ int gfp_mask, int type)
+{
+ unsigned long flags;
+ struct audit_buffer *ab = NULL;
+ struct nlmsghdr *nlh;
+
+ spin_lock_irqsave(&audit_freelist_lock, flags);
+ if (!list_empty(&audit_freelist)) {
+ ab = list_entry(audit_freelist.next,
+ struct audit_buffer, list);
+ list_del(&ab->list);
+ --audit_freelist_count;
+ }
+ spin_unlock_irqrestore(&audit_freelist_lock, flags);
+
+ if (!ab) {
+ ab = kmalloc(sizeof(*ab), gfp_mask);
+ if (!ab)
+ goto err;
+ }
+
+ ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
+ if (!ab->skb)
+ goto err;
+
+ ab->ctx = ctx;
+ nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
+ nlh->nlmsg_type = type;
+ nlh->nlmsg_flags = 0;
+ nlh->nlmsg_pid = 0;
+ nlh->nlmsg_seq = 0;
+ return ab;
+err:
+ audit_buffer_free(ab);
+ return NULL;
+}
+
+/* Compute a serial number for the audit record. Audit records are
+ * written to user-space as soon as they are generated, so a complete
+ * audit record may be written in several pieces. The timestamp of the
+ * record and this serial number are used by the user-space tools to
+ * determine which pieces belong to the same audit record. The
+ * (timestamp,serial) tuple is unique for each syscall and is live from
+ * syscall entry to syscall exit.
+ *
+ * Atomic values are only guaranteed to be 24-bit, so we count down.
+ *
+ * NOTE: Another possibility is to store the formatted records off the
+ * audit context (for those records that have a context), and emit them
+ * all at syscall exit. However, this could delay the reporting of
+ * significant errors until syscall exit (or never, if the system
+ * halts). */
+unsigned int audit_serial(void)
+{
+ static atomic_t serial = ATOMIC_INIT(0xffffff);
+ unsigned int a, b;
+
+ do {
+ a = atomic_read(&serial);
+ if (atomic_dec_and_test(&serial))
+ atomic_set(&serial, 0xffffff);
+ b = atomic_read(&serial);
+ } while (b != a - 1);
+
+ return 0xffffff - b;
+}
+
+static inline void audit_get_stamp(struct audit_context *ctx,
+ struct timespec *t, unsigned int *serial)
+{
+ if (ctx)
+ auditsc_get_stamp(ctx, t, serial);
+ else {
+ *t = CURRENT_TIME;
+ *serial = audit_serial();
+ }
+}
/* Obtain an audit buffer. This routine does locking to obtain the
* audit buffer, but then no locking is required for calls to
@@ -615,10 +644,9 @@ __setup("audit=", audit_enable);
* syscall, then the syscall is marked as auditable and an audit record
* will be written at syscall exit. If there is no associated task, tsk
* should be NULL. */
-struct audit_buffer *audit_log_start(struct audit_context *ctx)
+struct audit_buffer *audit_log_start(struct audit_context *ctx, int type)
{
struct audit_buffer *ab = NULL;
- unsigned long flags;
struct timespec t;
unsigned int serial;
@@ -626,57 +654,48 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx)
return NULL;
if (audit_backlog_limit
- && atomic_read(&audit_backlog) > audit_backlog_limit) {
+ && skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
if (audit_rate_check())
printk(KERN_WARNING
"audit: audit_backlog=%d > "
"audit_backlog_limit=%d\n",
- atomic_read(&audit_backlog),
+ skb_queue_len(&audit_skb_queue),
audit_backlog_limit);
audit_log_lost("backlog limit exceeded");
return NULL;
}
- spin_lock_irqsave(&audit_freelist_lock, flags);
- if (!list_empty(&audit_freelist)) {
- ab = list_entry(audit_freelist.next,
- struct audit_buffer, list);
- list_del(&ab->list);
- --audit_freelist_count;
- }
- spin_unlock_irqrestore(&audit_freelist_lock, flags);
-
- if (!ab)
- ab = kmalloc(sizeof(*ab), GFP_ATOMIC);
+ ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type);
if (!ab) {
audit_log_lost("out of memory in audit_log_start");
return NULL;
}
- atomic_inc(&audit_backlog);
- skb_queue_head_init(&ab->sklist);
-
- ab->ctx = ctx;
- ab->len = 0;
- ab->nlh = NULL;
- ab->total = 0;
- ab->type = AUDIT_KERNEL;
- ab->pid = 0;
+ audit_get_stamp(ab->ctx, &t, &serial);
-#ifdef CONFIG_AUDITSYSCALL
- if (ab->ctx)
- audit_get_stamp(ab->ctx, &t, &serial);
- else
-#endif
- {
- t = CURRENT_TIME;
- serial = 0;
- }
audit_log_format(ab, "audit(%lu.%03lu:%u): ",
t.tv_sec, t.tv_nsec/1000000, serial);
return ab;
}
+/**
+ * audit_expand - expand skb in the audit buffer
+ * @ab: audit_buffer
+ *
+ * Returns 0 (no space) on failed expansion, or available space if
+ * successful.
+ */
+static inline int audit_expand(struct audit_buffer *ab, int extra)
+{
+ struct sk_buff *skb = ab->skb;
+ int ret = pskb_expand_head(skb, skb_headroom(skb), extra,
+ GFP_ATOMIC);
+ if (ret < 0) {
+ audit_log_lost("out of memory in audit_expand");
+ return 0;
+ }
+ return skb_tailroom(skb);
+}
/* Format an audit message into the audit buffer. If there isn't enough
* room in the audit buffer, more room will be allocated and vsnprint
@@ -686,26 +705,35 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
va_list args)
{
int len, avail;
+ struct sk_buff *skb;
+ va_list args2;
if (!ab)
return;
- avail = sizeof(ab->tmp) - ab->len;
- if (avail <= 0) {
- audit_log_move(ab);
- avail = sizeof(ab->tmp) - ab->len;
+ BUG_ON(!ab->skb);
+ skb = ab->skb;
+ avail = skb_tailroom(skb);
+ if (avail == 0) {
+ avail = audit_expand(ab, AUDIT_BUFSIZ);
+ if (!avail)
+ goto out;
}
- len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+ va_copy(args2, args);
+ len = vsnprintf(skb->tail, avail, fmt, args);
if (len >= avail) {
/* The printk buffer is 1024 bytes long, so if we get
* here and AUDIT_BUFSIZ is at least 1024, then we can
* log everything that printk could have logged. */
- audit_log_move(ab);
- avail = sizeof(ab->tmp) - ab->len;
- len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+ avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
+ if (!avail)
+ goto out;
+ len = vsnprintf(skb->tail, avail, fmt, args2);
}
- ab->len += (len < avail) ? len : avail;
- ab->total += (len < avail) ? len : avail;
+ if (len > 0)
+ skb_put(skb, len);
+out:
+ return;
}
/* Format a message into the audit buffer. All the work is done in
@@ -721,20 +749,47 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
va_end(args);
}
-void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len)
+/* This function will take the passed buf and convert it into a string of
+ * ascii hex digits. The new string is placed onto the skb. */
+void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
+ size_t len)
{
- int i;
+ int i, avail, new_len;
+ unsigned char *ptr;
+ struct sk_buff *skb;
+ static const unsigned char *hex = "0123456789ABCDEF";
+
+ BUG_ON(!ab->skb);
+ skb = ab->skb;
+ avail = skb_tailroom(skb);
+ new_len = len<<1;
+ if (new_len >= avail) {
+ /* Round the buffer request up to the next multiple */
+ new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1);
+ avail = audit_expand(ab, new_len);
+ if (!avail)
+ return;
+ }
- for (i=0; i<len; i++)
- audit_log_format(ab, "%02x", buf[i]);
+ ptr = skb->tail;
+ for (i=0; i<len; i++) {
+ *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */
+ *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */
+ }
+ *ptr = 0;
+ skb_put(skb, len << 1); /* new string is twice the old string */
}
+/* This code will escape a string that is passed to it if the string
+ * contains a control character, unprintable character, double quote mark,
+ * or a space. Unescaped strings will start and end with a double quote mark.
+ * Strings that are escaped are printed in hex (2 digits per char). */
void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
{
const unsigned char *p = string;
while (*p) {
- if (*p == '"' || *p == ' ' || *p < 0x20 || *p > 0x7f) {
+ if (*p == '"' || *p < 0x21 || *p > 0x7f) {
audit_log_hex(ab, string, strlen(string));
return;
}
@@ -743,117 +798,63 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
audit_log_format(ab, "\"%s\"", string);
}
-
-/* This is a helper-function to print the d_path without using a static
- * buffer or allocating another buffer in addition to the one in
- * audit_buffer. */
+/* This is a helper-function to print the escaped d_path */
void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
struct dentry *dentry, struct vfsmount *vfsmnt)
{
- char *p;
- int len, avail;
+ char *p, *path;
- if (prefix) audit_log_format(ab, " %s", prefix);
-
- if (ab->len > 128)
- audit_log_move(ab);
- avail = sizeof(ab->tmp) - ab->len;
- p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail);
- if (IS_ERR(p)) {
- /* FIXME: can we save some information here? */
- audit_log_format(ab, "<toolong>");
- } else {
- /* path isn't at start of buffer */
- len = (ab->tmp + sizeof(ab->tmp) - 1) - p;
- memmove(ab->tmp + ab->len, p, len);
- ab->len += len;
- ab->total += len;
- }
-}
-
-/* Remove queued messages from the audit_txlist and send them to userspace. */
-static void audit_tasklet_handler(unsigned long arg)
-{
- LIST_HEAD(list);
- struct audit_buffer *ab;
- unsigned long flags;
+ if (prefix)
+ audit_log_format(ab, " %s", prefix);
- spin_lock_irqsave(&audit_txlist_lock, flags);
- list_splice_init(&audit_txlist, &list);
- spin_unlock_irqrestore(&audit_txlist_lock, flags);
-
- while (!list_empty(&list)) {
- ab = list_entry(list.next, struct audit_buffer, list);
- list_del(&ab->list);
- audit_log_end_fast(ab);
+ /* We will allow 11 spaces for ' (deleted)' to be appended */
+ path = kmalloc(PATH_MAX+11, GFP_KERNEL);
+ if (!path) {
+ audit_log_format(ab, "<no memory>");
+ return;
}
+ p = d_path(dentry, vfsmnt, path, PATH_MAX+11);
+ if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
+ /* FIXME: can we save some information here? */
+ audit_log_format(ab, "<too long>");
+ } else
+ audit_log_untrustedstring(ab, p);
+ kfree(path);
}
-static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0);
-
/* The netlink_* functions cannot be called inside an irq context, so
* the audit buffer is places on a queue and a tasklet is scheduled to
* remove them from the queue outside the irq context. May be called in
* any context. */
-static void audit_log_end_irq(struct audit_buffer *ab)
-{
- unsigned long flags;
-
- if (!ab)
- return;
- spin_lock_irqsave(&audit_txlist_lock, flags);
- list_add_tail(&ab->list, &audit_txlist);
- spin_unlock_irqrestore(&audit_txlist_lock, flags);
-
- tasklet_schedule(&audit_tasklet);
-}
-
-/* Send the message in the audit buffer directly to user space. May not
- * be called in an irq context. */
-static void audit_log_end_fast(struct audit_buffer *ab)
+void audit_log_end(struct audit_buffer *ab)
{
- unsigned long flags;
-
- BUG_ON(in_irq());
if (!ab)
return;
if (!audit_rate_check()) {
audit_log_lost("rate limit exceeded");
} else {
- audit_log_move(ab);
- if (audit_log_drain(ab))
- return;
+ if (audit_pid) {
+ struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
+ nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+ skb_queue_tail(&audit_skb_queue, ab->skb);
+ ab->skb = NULL;
+ wake_up_interruptible(&kauditd_wait);
+ } else {
+ printk("%s\n", ab->skb->data + NLMSG_SPACE(0));
+ }
}
-
- atomic_dec(&audit_backlog);
- spin_lock_irqsave(&audit_freelist_lock, flags);
- if (++audit_freelist_count > AUDIT_MAXFREE)
- kfree(ab);
- else
- list_add(&ab->list, &audit_freelist);
- spin_unlock_irqrestore(&audit_freelist_lock, flags);
-}
-
-/* Send or queue the message in the audit buffer, depending on the
- * current context. (A convenience function that may be called in any
- * context.) */
-void audit_log_end(struct audit_buffer *ab)
-{
- if (in_irq())
- audit_log_end_irq(ab);
- else
- audit_log_end_fast(ab);
+ audit_buffer_free(ab);
}
/* Log an audit record. This is a convenience function that calls
* audit_log_start, audit_log_vformat, and audit_log_end. It may be
* called in any context. */
-void audit_log(struct audit_context *ctx, const char *fmt, ...)
+void audit_log(struct audit_context *ctx, int type, const char *fmt, ...)
{
struct audit_buffer *ab;
va_list args;
- ab = audit_log_start(ctx);
+ ab = audit_log_start(ctx, type);
if (ab) {
va_start(args, fmt);
audit_log_vformat(ab, fmt, args);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 37b3ac9..e75f84e 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -34,7 +34,8 @@
#include <asm/types.h>
#include <linux/mm.h>
#include <linux/module.h>
-
+#include <linux/mount.h>
+#include <linux/socket.h>
#include <linux/audit.h>
#include <linux/personality.h>
#include <linux/time.h>
@@ -112,6 +113,23 @@ struct audit_aux_data_ipcctl {
mode_t mode;
};
+struct audit_aux_data_socketcall {
+ struct audit_aux_data d;
+ int nargs;
+ unsigned long args[0];
+};
+
+struct audit_aux_data_sockaddr {
+ struct audit_aux_data d;
+ int len;
+ char a[0];
+};
+
+struct audit_aux_data_path {
+ struct audit_aux_data d;
+ struct dentry *dentry;
+ struct vfsmount *mnt;
+};
/* The per-task audit context. */
struct audit_context {
@@ -127,6 +145,8 @@ struct audit_context {
int auditable; /* 1 if record should be written */
int name_count;
struct audit_names names[AUDIT_NAMES];
+ struct dentry * pwd;
+ struct vfsmount * pwdmnt;
struct audit_context *previous; /* For nested syscalls */
struct audit_aux_data *aux;
@@ -157,6 +177,8 @@ struct audit_entry {
struct audit_rule rule;
};
+extern int audit_pid;
+
/* Check to see if two rules are identical. It is called from
* audit_del_rule during AUDIT_DEL. */
static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
@@ -226,7 +248,6 @@ static inline int audit_del_rule(struct audit_rule *rule,
return -EFAULT; /* No matching rule */
}
-#ifdef CONFIG_NET
/* Copy rule from user-space to kernel-space. Called during
* AUDIT_ADD. */
static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
@@ -287,7 +308,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
err = audit_add_rule(entry, &audit_entlist);
if (!err && (flags & AUDIT_AT_EXIT))
err = audit_add_rule(entry, &audit_extlist);
- audit_log(NULL, "auid %u added an audit rule\n", loginuid);
+ audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ "auid=%u added an audit rule\n", loginuid);
break;
case AUDIT_DEL:
flags =((struct audit_rule *)data)->flags;
@@ -297,7 +319,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
err = audit_del_rule(data, &audit_entlist);
if (!err && (flags & AUDIT_AT_EXIT))
err = audit_del_rule(data, &audit_extlist);
- audit_log(NULL, "auid %u removed an audit rule\n", loginuid);
+ audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ "auid=%u removed an audit rule\n", loginuid);
break;
default:
return -EINVAL;
@@ -305,7 +328,6 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
return err;
}
-#endif
/* Compare a task_struct with an audit_rule. Return 1 on match, 0
* otherwise. */
@@ -444,7 +466,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
/* At syscall entry and exit time, this filter is called if the
* audit_state is not low enough that auditing cannot take place, but is
- * also not high enough that we already know we have to write and audit
+ * also not high enough that we already know we have to write an audit
* record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
*/
static enum audit_state audit_filter_syscall(struct task_struct *tsk,
@@ -532,6 +554,12 @@ static inline void audit_free_names(struct audit_context *context)
if (context->names[i].name)
__putname(context->names[i].name);
context->name_count = 0;
+ if (context->pwd)
+ dput(context->pwd);
+ if (context->pwdmnt)
+ mntput(context->pwdmnt);
+ context->pwd = NULL;
+ context->pwdmnt = NULL;
}
static inline void audit_free_aux(struct audit_context *context)
@@ -539,6 +567,11 @@ static inline void audit_free_aux(struct audit_context *context)
struct audit_aux_data *aux;
while ((aux = context->aux)) {
+ if (aux->type == AUDIT_AVC_PATH) {
+ struct audit_aux_data_path *axi = (void *)aux;
+ dput(axi->dentry);
+ mntput(axi->mnt);
+ }
context->aux = aux->next;
kfree(aux);
}
@@ -625,7 +658,8 @@ static void audit_log_task_info(struct audit_buffer *ab)
struct vm_area_struct *vma;
get_task_comm(name, current);
- audit_log_format(ab, " comm=%s", name);
+ audit_log_format(ab, " comm=");
+ audit_log_untrustedstring(ab, name);
if (!mm)
return;
@@ -649,23 +683,24 @@ static void audit_log_exit(struct audit_context *context)
{
int i;
struct audit_buffer *ab;
+ struct audit_aux_data *aux;
- ab = audit_log_start(context);
+ ab = audit_log_start(context, AUDIT_SYSCALL);
if (!ab)
return; /* audit_panic has been called */
- audit_log_format(ab, "syscall=%d", context->major);
+ audit_log_format(ab, "arch=%x syscall=%d",
+ context->arch, context->major);
if (context->personality != PER_LINUX)
audit_log_format(ab, " per=%lx", context->personality);
- audit_log_format(ab, " arch=%x", context->arch);
if (context->return_valid)
audit_log_format(ab, " success=%s exit=%ld",
(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
context->return_code);
audit_log_format(ab,
" a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
- " pid=%d loginuid=%d uid=%d gid=%d"
- " euid=%d suid=%d fsuid=%d"
- " egid=%d sgid=%d fsgid=%d",
+ " pid=%d auid=%u uid=%u gid=%u"
+ " euid=%u suid=%u fsuid=%u"
+ " egid=%u sgid=%u fsgid=%u",
context->argv[0],
context->argv[1],
context->argv[2],
@@ -679,33 +714,57 @@ static void audit_log_exit(struct audit_context *context)
context->egid, context->sgid, context->fsgid);
audit_log_task_info(ab);
audit_log_end(ab);
- while (context->aux) {
- struct audit_aux_data *aux;
- ab = audit_log_start(context);
+ for (aux = context->aux; aux; aux = aux->next) {
+
+ ab = audit_log_start(context, aux->type);
if (!ab)
continue; /* audit_panic has been called */
- aux = context->aux;
- context->aux = aux->next;
-
- audit_log_format(ab, "auxitem=%d", aux->type);
switch (aux->type) {
- case AUDIT_AUX_IPCPERM: {
+ case AUDIT_IPC: {
struct audit_aux_data_ipcctl *axi = (void *)aux;
audit_log_format(ab,
- " qbytes=%lx uid=%d gid=%d mode=%x",
+ " qbytes=%lx iuid=%u igid=%u mode=%x",
axi->qbytes, axi->uid, axi->gid, axi->mode);
- }
+ break; }
+
+ case AUDIT_SOCKETCALL: {
+ int i;
+ struct audit_aux_data_socketcall *axs = (void *)aux;
+ audit_log_format(ab, "nargs=%d", axs->nargs);
+ for (i=0; i<axs->nargs; i++)
+ audit_log_format(ab, " a%d=%lx", i, axs->args[i]);
+ break; }
+
+ case AUDIT_SOCKADDR: {
+ struct audit_aux_data_sockaddr *axs = (void *)aux;
+
+ audit_log_format(ab, "saddr=");
+ audit_log_hex(ab, axs->a, axs->len);
+ break; }
+
+ case AUDIT_AVC_PATH: {
+ struct audit_aux_data_path *axi = (void *)aux;
+ audit_log_d_path(ab, "path=", axi->dentry, axi->mnt);
+ break; }
+
}
audit_log_end(ab);
- kfree(aux);
}
+ if (context->pwd && context->pwdmnt) {
+ ab = audit_log_start(context, AUDIT_CWD);
+ if (ab) {
+ audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt);
+ audit_log_end(ab);
+ }
+ }
for (i = 0; i < context->name_count; i++) {
- ab = audit_log_start(context);
+ ab = audit_log_start(context, AUDIT_PATH);
if (!ab)
continue; /* audit_panic has been called */
+
audit_log_format(ab, "item=%d", i);
if (context->names[i].name) {
audit_log_format(ab, " name=");
@@ -713,7 +772,7 @@ static void audit_log_exit(struct audit_context *context)
}
if (context->names[i].ino != (unsigned long)-1)
audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o"
- " uid=%d gid=%d rdev=%02x:%02x",
+ " ouid=%u ogid=%u rdev=%02x:%02x",
context->names[i].ino,
MAJOR(context->names[i].dev),
MINOR(context->names[i].dev),
@@ -741,42 +800,12 @@ void audit_free(struct task_struct *tsk)
/* Check for system calls that do not go through the exit
* function (e.g., exit_group), then free context block. */
- if (context->in_syscall && context->auditable)
+ if (context->in_syscall && context->auditable && context->pid != audit_pid)
audit_log_exit(context);
audit_free_context(context);
}
-/* Compute a serial number for the audit record. Audit records are
- * written to user-space as soon as they are generated, so a complete
- * audit record may be written in several pieces. The timestamp of the
- * record and this serial number are used by the user-space daemon to
- * determine which pieces belong to the same audit record. The
- * (timestamp,serial) tuple is unique for each syscall and is live from
- * syscall entry to syscall exit.
- *
- * Atomic values are only guaranteed to be 24-bit, so we count down.
- *
- * NOTE: Another possibility is to store the formatted records off the
- * audit context (for those records that have a context), and emit them
- * all at syscall exit. However, this could delay the reporting of
- * significant errors until syscall exit (or never, if the system
- * halts). */
-static inline unsigned int audit_serial(void)
-{
- static atomic_t serial = ATOMIC_INIT(0xffffff);
- unsigned int a, b;
-
- do {
- a = atomic_read(&serial);
- if (atomic_dec_and_test(&serial))
- atomic_set(&serial, 0xffffff);
- b = atomic_read(&serial);
- } while (b != a - 1);
-
- return 0xffffff - b;
-}
-
/* Fill in audit context at syscall entry. This only happens if the
* audit context was created when the task was created and the state or
* filters demand the audit context be built. If the state from the
@@ -876,7 +905,7 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
if (likely(!context))
return;
- if (context->in_syscall && context->auditable)
+ if (context->in_syscall && context->auditable && context->pid != audit_pid)
audit_log_exit(context);
context->in_syscall = 0;
@@ -916,6 +945,13 @@ void audit_getname(const char *name)
context->names[context->name_count].name = name;
context->names[context->name_count].ino = (unsigned long)-1;
++context->name_count;
+ if (!context->pwd) {
+ read_lock(&current->fs->lock);
+ context->pwd = dget(current->fs->pwd);
+ context->pwdmnt = mntget(current->fs->pwdmnt);
+ read_unlock(&current->fs->lock);
+ }
+
}
/* Intercept a putname request. Called from
@@ -994,34 +1030,26 @@ void audit_inode(const char *name, const struct inode *inode)
context->names[idx].rdev = inode->i_rdev;
}
-void audit_get_stamp(struct audit_context *ctx,
- struct timespec *t, unsigned int *serial)
+void auditsc_get_stamp(struct audit_context *ctx,
+ struct timespec *t, unsigned int *serial)
{
- if (ctx) {
- t->tv_sec = ctx->ctime.tv_sec;
- t->tv_nsec = ctx->ctime.tv_nsec;
- *serial = ctx->serial;
- ctx->auditable = 1;
- } else {
- *t = CURRENT_TIME;
- *serial = 0;
- }
+ t->tv_sec = ctx->ctime.tv_sec;
+ t->tv_nsec = ctx->ctime.tv_nsec;
+ *serial = ctx->serial;
+ ctx->auditable = 1;
}
-extern int audit_set_type(struct audit_buffer *ab, int type);
-
int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
{
if (task->audit_context) {
struct audit_buffer *ab;
- ab = audit_log_start(NULL);
+ ab = audit_log_start(NULL, AUDIT_LOGIN);
if (ab) {
audit_log_format(ab, "login pid=%d uid=%u "
- "old loginuid=%u new loginuid=%u",
+ "old auid=%u new auid=%u",
task->pid, task->uid,
task->audit_context->loginuid, loginuid);
- audit_set_type(ab, AUDIT_LOGIN);
audit_log_end(ab);
}
task->audit_context->loginuid = loginuid;
@@ -1051,8 +1079,89 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
ax->gid = gid;
ax->mode = mode;
- ax->d.type = AUDIT_AUX_IPCPERM;
+ ax->d.type = AUDIT_IPC;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+int audit_socketcall(int nargs, unsigned long *args)
+{
+ struct audit_aux_data_socketcall *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (likely(!context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL);
+ if (!ax)
+ return -ENOMEM;
+
+ ax->nargs = nargs;
+ memcpy(ax->args, args, nargs * sizeof(unsigned long));
+
+ ax->d.type = AUDIT_SOCKETCALL;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+int audit_sockaddr(int len, void *a)
+{
+ struct audit_aux_data_sockaddr *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (likely(!context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL);
+ if (!ax)
+ return -ENOMEM;
+
+ ax->len = len;
+ memcpy(ax->a, a, len);
+
+ ax->d.type = AUDIT_SOCKADDR;
ax->d.next = context->aux;
context->aux = (void *)ax;
return 0;
}
+
+int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
+{
+ struct audit_aux_data_path *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (likely(!context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+ if (!ax)
+ return -ENOMEM;
+
+ ax->dentry = dget(dentry);
+ ax->mnt = mntget(mnt);
+
+ ax->d.type = AUDIT_AVC_PATH;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+void audit_signal_info(int sig, struct task_struct *t)
+{
+ extern pid_t audit_sig_pid;
+ extern uid_t audit_sig_uid;
+
+ if (unlikely(audit_pid && t->pid == audit_pid)) {
+ if (sig == SIGTERM || sig == SIGHUP) {
+ struct audit_context *ctx = current->audit_context;
+ audit_sig_pid = current->pid;
+ if (ctx)
+ audit_sig_uid = ctx->loginuid;
+ else
+ audit_sig_uid = current->uid;
+ }
+ }
+}
+
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 961d740..00e8f25 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -166,9 +166,8 @@ static struct super_block *cpuset_sb = NULL;
* The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't
* (usually) grab cpuset_sem. These are the two most performance
* critical pieces of code here. The exception occurs on exit(),
- * if the last task using a cpuset exits, and the cpuset was marked
- * notify_on_release. In that case, the cpuset_sem is taken, the
- * path to the released cpuset calculated, and a usermode call made
+ * when a task in a notify_on_release cpuset exits. Then cpuset_sem
+ * is taken, and if the cpuset count is zero, a usermode call made
* to /sbin/cpuset_release_agent with the name of the cpuset (path
* relative to the root of cpuset file system) as the argument.
*
@@ -1404,6 +1403,18 @@ void cpuset_fork(struct task_struct *tsk)
*
* Description: Detach cpuset from @tsk and release it.
*
+ * Note that cpusets marked notify_on_release force every task
+ * in them to take the global cpuset_sem semaphore when exiting.
+ * This could impact scaling on very large systems. Be reluctant
+ * to use notify_on_release cpusets where very high task exit
+ * scaling is required on large systems.
+ *
+ * Don't even think about derefencing 'cs' after the cpuset use
+ * count goes to zero, except inside a critical section guarded
+ * by the cpuset_sem semaphore. If you don't hold cpuset_sem,
+ * then a zero cpuset use count is a license to any other task to
+ * nuke the cpuset immediately.
+ *
**/
void cpuset_exit(struct task_struct *tsk)
@@ -1415,10 +1426,13 @@ void cpuset_exit(struct task_struct *tsk)
tsk->cpuset = NULL;
task_unlock(tsk);
- if (atomic_dec_and_test(&cs->count)) {
+ if (notify_on_release(cs)) {
down(&cpuset_sem);
- check_for_release(cs);
+ if (atomic_dec_and_test(&cs->count))
+ check_for_release(cs);
up(&cpuset_sem);
+ } else {
+ atomic_dec(&cs->count);
}
}
diff --git a/kernel/exit.c b/kernel/exit.c
index edaa50b..2ef2ad5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -811,10 +811,8 @@ fastcall NORET_TYPE void do_exit(long code)
acct_update_integrals(tsk);
update_mem_hiwater(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
- if (group_dead) {
- del_timer_sync(&tsk->signal->real_timer);
+ if (group_dead)
acct_process(code);
- }
exit_mm(tsk);
exit_sem(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index f42a17f..a28d11e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -194,6 +194,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
mm->mmap = NULL;
mm->mmap_cache = NULL;
mm->free_area_cache = oldmm->mmap_base;
+ mm->cached_hole_size = ~0UL;
mm->map_count = 0;
set_mm_counter(mm, rss, 0);
set_mm_counter(mm, anon_rss, 0);
@@ -249,8 +250,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
/*
* Link in the new vma and copy the page table entries:
- * link in first so that swapoff can see swap entries,
- * and try_to_unmap_one's find_vma find the new vma.
+ * link in first so that swapoff can see swap entries.
+ * Note that, exceptionally, here the vma is inserted
+ * without holding mm->mmap_sem.
*/
spin_lock(&mm->page_table_lock);
*pprev = tmp;
@@ -322,6 +324,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
mm->ioctx_list = NULL;
mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
mm->free_area_cache = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = ~0UL;
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 2fb0e46..436c7d9 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -30,6 +30,7 @@
*/
irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
[0 ... NR_IRQS-1] = {
+ .status = IRQ_DISABLED,
.handler = &no_irq_type,
.lock = SPIN_LOCK_UNLOCKED
}
@@ -118,8 +119,6 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
*/
desc->handler->ack(irq);
action_ret = handle_IRQ_event(irq, regs, desc->action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
desc->handler->end(irq);
return 1;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5202e4c..ac67009 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -6,6 +6,7 @@
* This file contains driver APIs to the irq subsystem.
*/
+#include <linux/config.h>
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/random.h>
@@ -255,6 +256,13 @@ void free_irq(unsigned int irq, void *dev_id)
/* Found it - now remove it from the list of entries */
*pp = action->next;
+
+ /* Currently used only by UML, might disappear one day.*/
+#ifdef CONFIG_IRQ_RELEASE_METHOD
+ if (desc->handler->release)
+ desc->handler->release(irq, dev_id);
+#endif
+
if (!desc->action) {
desc->status |= IRQ_DISABLED;
if (desc->handler->shutdown)
diff --git a/kernel/module.c b/kernel/module.c
index 5734ab0..a566745 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -379,7 +379,7 @@ static void module_unload_init(struct module *mod)
for (i = 0; i < NR_CPUS; i++)
local_set(&mod->ref[i].count, 0);
/* Hold reference count during initialization. */
- local_set(&mod->ref[_smp_processor_id()].count, 1);
+ local_set(&mod->ref[raw_smp_processor_id()].count, 1);
/* Backwards compatibility macros put refcount during init. */
mod->waiter = current;
}
@@ -1758,6 +1758,7 @@ sys_init_module(void __user *umod,
const char __user *uargs)
{
struct module *mod;
+ mm_segment_t old_fs = get_fs();
int ret = 0;
/* Must have permission */
@@ -1775,6 +1776,9 @@ sys_init_module(void __user *umod,
return PTR_ERR(mod);
}
+ /* flush the icache in correct context */
+ set_fs(KERNEL_DS);
+
/* Flush the instruction cache, since we've played with text */
if (mod->module_init)
flush_icache_range((unsigned long)mod->module_init,
@@ -1783,6 +1787,8 @@ sys_init_module(void __user *umod,
flush_icache_range((unsigned long)mod->module_core,
(unsigned long)mod->module_core + mod->core_size);
+ set_fs(old_fs);
+
/* Now sew it into the lists. They won't access us, since
strong_try_module_get() will fail. */
stop_machine_run(__link_module, mod, NR_CPUS);
diff --git a/kernel/params.c b/kernel/params.c
index 5513844..d586c35ef 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -629,7 +629,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
mk = to_module_kobject(kobj);
if (!attribute->show)
- return -EPERM;
+ return -EIO;
if (!try_module_get(mk->mod))
return -ENODEV;
@@ -653,7 +653,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
mk = to_module_kobject(kobj);
if (!attribute->store)
- return -EPERM;
+ return -EIO;
if (!try_module_get(mk->mod))
return -ENODEV;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index fd316c2..cabb63f 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -1197,6 +1197,7 @@ void exit_itimers(struct signal_struct *sig)
tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
itimer_delete(tmr);
}
+ del_timer_sync(&sig->real_timer);
}
/*
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7960ddf..4cdebc9 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -156,14 +156,14 @@ static int enter_state(suspend_state_t state)
goto Unlock;
}
- pr_debug("PM: Preparing system for suspend\n");
+ pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
if ((error = suspend_prepare(state)))
goto Unlock;
- pr_debug("PM: Entering state.\n");
+ pr_debug("PM: Entering %s sleep\n", pm_states[state]);
error = suspend_enter(state);
- pr_debug("PM: Finishing up.\n");
+ pr_debug("PM: Finishing wakeup.\n");
suspend_finish(state);
Unlock:
up(&pm_sem);
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index cba3584b..457c230 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -48,11 +48,11 @@ void disable_nonboot_cpus(void)
{
oldmask = current->cpus_allowed;
set_cpus_allowed(current, cpumask_of_cpu(0));
- printk("Freezing CPUs (at %d)", _smp_processor_id());
+ printk("Freezing CPUs (at %d)", raw_smp_processor_id());
current->state = TASK_INTERRUPTIBLE;
schedule_timeout(HZ);
printk("...");
- BUG_ON(_smp_processor_id() != 0);
+ BUG_ON(raw_smp_processor_id() != 0);
/* FIXME: for this to work, all the CPUs must be running
* "idle" thread (or we deadlock). Is that guaranteed? */
diff --git a/kernel/printk.c b/kernel/printk.c
index 290a07c..01b58d7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -160,42 +160,6 @@ static int __init console_setup(char *str)
__setup("console=", console_setup);
-/**
- * add_preferred_console - add a device to the list of preferred consoles.
- *
- * The last preferred console added will be used for kernel messages
- * and stdin/out/err for init. Normally this is used by console_setup
- * above to handle user-supplied console arguments; however it can also
- * be used by arch-specific code either to override the user or more
- * commonly to provide a default console (ie from PROM variables) when
- * the user has not supplied one.
- */
-int __init add_preferred_console(char *name, int idx, char *options)
-{
- struct console_cmdline *c;
- int i;
-
- /*
- * See if this tty is not yet registered, and
- * if we have a slot free.
- */
- for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
- if (strcmp(console_cmdline[i].name, name) == 0 &&
- console_cmdline[i].index == idx) {
- selected_console = i;
- return 0;
- }
- if (i == MAX_CMDLINECONSOLES)
- return -E2BIG;
- selected_console = i;
- c = &console_cmdline[i];
- memcpy(c->name, name, sizeof(c->name));
- c->name[sizeof(c->name) - 1] = 0;
- c->options = options;
- c->index = idx;
- return 0;
-}
-
static int __init log_buf_len_setup(char *str)
{
unsigned long size = memparse(str, &str);
@@ -671,6 +635,42 @@ static void call_console_drivers(unsigned long start, unsigned long end) {}
#endif
/**
+ * add_preferred_console - add a device to the list of preferred consoles.
+ *
+ * The last preferred console added will be used for kernel messages
+ * and stdin/out/err for init. Normally this is used by console_setup
+ * above to handle user-supplied console arguments; however it can also
+ * be used by arch-specific code either to override the user or more
+ * commonly to provide a default console (ie from PROM variables) when
+ * the user has not supplied one.
+ */
+int __init add_preferred_console(char *name, int idx, char *options)
+{
+ struct console_cmdline *c;
+ int i;
+
+ /*
+ * See if this tty is not yet registered, and
+ * if we have a slot free.
+ */
+ for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
+ if (strcmp(console_cmdline[i].name, name) == 0 &&
+ console_cmdline[i].index == idx) {
+ selected_console = i;
+ return 0;
+ }
+ if (i == MAX_CMDLINECONSOLES)
+ return -E2BIG;
+ selected_console = i;
+ c = &console_cmdline[i];
+ memcpy(c->name, name, sizeof(c->name));
+ c->name[sizeof(c->name) - 1] = 0;
+ c->options = options;
+ c->index = idx;
+ return 0;
+}
+
+/**
* acquire_console_sem - lock the console system for exclusive use.
*
* Acquires a semaphore which guarantees that the caller has
diff --git a/kernel/profile.c b/kernel/profile.c
index 0221a50..ad8cbb7 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -49,15 +49,19 @@ static DECLARE_MUTEX(profile_flip_mutex);
static int __init profile_setup(char * str)
{
+ static char __initdata schedstr[] = "schedule";
int par;
- if (!strncmp(str, "schedule", 8)) {
+ if (!strncmp(str, schedstr, strlen(schedstr))) {
prof_on = SCHED_PROFILING;
- printk(KERN_INFO "kernel schedule profiling enabled\n");
- if (str[7] == ',')
- str += 8;
- }
- if (get_option(&str,&par)) {
+ if (str[strlen(schedstr)] == ',')
+ str += strlen(schedstr) + 1;
+ if (get_option(&str, &par))
+ prof_shift = par;
+ printk(KERN_INFO
+ "kernel schedule profiling enabled (shift: %ld)\n",
+ prof_shift);
+ } else if (get_option(&str, &par)) {
prof_shift = par;
prof_on = CPU_PROFILING;
printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
diff --git a/kernel/sched.c b/kernel/sched.c
index 0dc3158..deca041 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3755,19 +3755,22 @@ EXPORT_SYMBOL(cond_resched);
*/
int cond_resched_lock(spinlock_t * lock)
{
+ int ret = 0;
+
if (need_lockbreak(lock)) {
spin_unlock(lock);
cpu_relax();
+ ret = 1;
spin_lock(lock);
}
if (need_resched()) {
_raw_spin_unlock(lock);
preempt_enable_no_resched();
__cond_resched();
+ ret = 1;
spin_lock(lock);
- return 1;
}
- return 0;
+ return ret;
}
EXPORT_SYMBOL(cond_resched_lock);
@@ -3811,7 +3814,7 @@ EXPORT_SYMBOL(yield);
*/
void __sched io_schedule(void)
{
- struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
+ struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
atomic_inc(&rq->nr_iowait);
schedule();
@@ -3822,7 +3825,7 @@ EXPORT_SYMBOL(io_schedule);
long __sched io_schedule_timeout(long timeout)
{
- struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
+ struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
long ret;
atomic_inc(&rq->nr_iowait);
@@ -4243,7 +4246,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
/* No more Mr. Nice Guy. */
if (dest_cpu == NR_CPUS) {
- tsk->cpus_allowed = cpuset_cpus_allowed(tsk);
+ cpus_setall(tsk->cpus_allowed);
dest_cpu = any_online_cpu(tsk->cpus_allowed);
/*
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f3debc..c89821b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -24,6 +24,7 @@
#include <linux/ptrace.h>
#include <linux/posix-timers.h>
#include <linux/signal.h>
+#include <linux/audit.h>
#include <asm/param.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -522,7 +523,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
{
int sig = 0;
- sig = next_signal(pending, mask);
+ /* SIGKILL must have priority, otherwise it is quite easy
+ * to create an unkillable process, sending sig < SIGKILL
+ * to self */
+ if (unlikely(sigismember(&pending->signal, SIGKILL))) {
+ if (!sigismember(mask, SIGKILL))
+ sig = SIGKILL;
+ }
+
+ if (likely(!sig))
+ sig = next_signal(pending, mask);
if (sig) {
if (current->notifier) {
if (sigismember(current->notifier_mask, sig)) {
@@ -658,7 +668,11 @@ static int check_kill_permission(int sig, struct siginfo *info,
&& (current->uid ^ t->suid) && (current->uid ^ t->uid)
&& !capable(CAP_KILL))
return error;
- return security_task_kill(t, info, sig);
+
+ error = security_task_kill(t, info, sig);
+ if (!error)
+ audit_signal_info(sig, t); /* Let audit system see the signal */
+ return error;
}
/* forward decl */
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index e15ed17..0c3f9d8 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -294,7 +294,7 @@ EXPORT_SYMBOL(_spin_unlock_irq);
void __lockfunc _spin_unlock_bh(spinlock_t *lock)
{
_raw_spin_unlock(lock);
- preempt_enable();
+ preempt_enable_no_resched();
local_bh_enable();
}
EXPORT_SYMBOL(_spin_unlock_bh);
@@ -318,7 +318,7 @@ EXPORT_SYMBOL(_read_unlock_irq);
void __lockfunc _read_unlock_bh(rwlock_t *lock)
{
_raw_read_unlock(lock);
- preempt_enable();
+ preempt_enable_no_resched();
local_bh_enable();
}
EXPORT_SYMBOL(_read_unlock_bh);
@@ -342,7 +342,7 @@ EXPORT_SYMBOL(_write_unlock_irq);
void __lockfunc _write_unlock_bh(rwlock_t *lock)
{
_raw_write_unlock(lock);
- preempt_enable();
+ preempt_enable_no_resched();
local_bh_enable();
}
EXPORT_SYMBOL(_write_unlock_bh);
@@ -354,7 +354,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
if (_raw_spin_trylock(lock))
return 1;
- preempt_enable();
+ preempt_enable_no_resched();
local_bh_enable();
return 0;
}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 6116b25..84a9d18 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -100,7 +100,7 @@ static int stop_machine(void)
stopmachine_state = STOPMACHINE_WAIT;
for_each_online_cpu(i) {
- if (i == _smp_processor_id())
+ if (i == raw_smp_processor_id())
continue;
ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
if (ret < 0)
@@ -182,7 +182,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
/* If they don't care which CPU fn runs on, bind to any online one. */
if (cpu == NR_CPUS)
- cpu = _smp_processor_id();
+ cpu = raw_smp_processor_id();
p = kthread_create(do_stop, &smdata, "kstopmachine");
if (!IS_ERR(p)) {
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0dda70e..6f15bea 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -77,6 +77,7 @@ cond_syscall(sys_request_key);
cond_syscall(sys_keyctl);
cond_syscall(compat_sys_keyctl);
cond_syscall(compat_sys_socketcall);
+cond_syscall(sys_set_zone_reclaim);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
OpenPOWER on IntegriCloud