diff options
author | Thomas Gleixner <tglx@tglx.tec.linutronix.de> | 2005-06-26 23:20:36 +0200 |
---|---|---|
committer | Thomas Gleixner <tglx@mtd.linutronix.de> | 2005-06-26 23:20:36 +0200 |
commit | 7ca6448dbfb398bba36eda3c01bc14b86c3675be (patch) | |
tree | 82d934ebf07f22a2c64c3b6d82ec24082878b43a /kernel | |
parent | f1f67a9874f1a4bba1adff6d694aa52e5f52ff1a (diff) | |
parent | 7d681b23d6cc14a8c026ea6756242cb522cbbcae (diff) | |
download | op-kernel-dev-7ca6448dbfb398bba36eda3c01bc14b86c3675be.zip op-kernel-dev-7ca6448dbfb398bba36eda3c01bc14b86c3675be.tar.gz |
Merge with rsync://fileserver/linux
Update to Linus latest
Diffstat (limited to 'kernel')
37 files changed, 3367 insertions, 1269 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz new file mode 100644 index 0000000..248e1c3 --- /dev/null +++ b/kernel/Kconfig.hz @@ -0,0 +1,46 @@ +# +# Timer Interrupt Frequency Configuration +# + +choice + prompt "Timer frequency" + default HZ_250 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 HZ but 100 HZ may be more + beneficial for servers and NUMA systems that do not need to have + a fast response for user interaction and that may experience bus + contention and cacheline bounces as a result of timer interrupts. + Note that the timer interrupt occurs on each processor in an SMP + environment leading to NR_CPUS * HZ number of timer interrupts + per second. + + + config HZ_100 + bool "100 HZ" + help + 100 HZ is a typical choice for servers, SMP and NUMA systems + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + + config HZ_250 + bool "250 HZ" + help + 250 HZ is a good compromise choice allowing server performance + while also showing good interactive responsiveness even + on SMP and NUMA systems. + + config HZ_1000 + bool "1000 HZ" + help + 1000 HZ is the preferred choice for desktop systems and other + systems requiring fast interactive responses to events. + +endchoice + +config HZ + int + default 100 if HZ_100 + default 250 if HZ_250 + default 1000 if HZ_1000 + diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt new file mode 100644 index 0000000..0b46a5d --- /dev/null +++ b/kernel/Kconfig.preempt @@ -0,0 +1,65 @@ + +choice + prompt "Preemption Model" + default PREEMPT_NONE + +config PREEMPT_NONE + bool "No Forced Preemption (Server)" + help + This is the traditional Linux preemption model, geared towards + throughput. It will still provide good latencies most of the + time, but there are no guarantees and occasional longer delays + are possible. + + Select this option if you are building a kernel for a server or + scientific/computation system, or if you want to maximize the + raw processing power of the kernel, irrespective of scheduling + latencies. + +config PREEMPT_VOLUNTARY + bool "Voluntary Kernel Preemption (Desktop)" + help + This option reduces the latency of the kernel by adding more + "explicit preemption points" to the kernel code. These new + preemption points have been selected to reduce the maximum + latency of rescheduling, providing faster application reactions, + at the cost of slighly lower throughput. + + This allows reaction to interactive events by allowing a + low priority process to voluntarily preempt itself even if it + is in kernel mode executing a system call. This allows + applications to run more 'smoothly' even when the system is + under load. + + Select this if you are building a kernel for a desktop system. + +config PREEMPT + bool "Preemptible Kernel (Low-Latency Desktop)" + help + This option reduces the latency of the kernel by making + all kernel code (that is not executing in a critical section) + preemptible. This allows reaction to interactive events by + permitting a low priority process to be preempted involuntarily + even if it is in kernel mode executing a system call and would + otherwise not be about to reach a natural preemption point. + This allows applications to run more 'smoothly' even when the + system is under load, at the cost of slighly lower throughput + and a slight runtime overhead to kernel code. + + Select this if you are building a kernel for a desktop or + embedded system with latency requirements in the milliseconds + range. + +endchoice + +config PREEMPT_BKL + bool "Preempt The Big Kernel Lock" + depends on SMP || PREEMPT + default y + help + This option reduces the latency of the kernel by making the + big kernel lock preemptible. + + Say Y here if you are building a kernel for a desktop system. + Say N if you are unsure. + diff --git a/kernel/Makefile b/kernel/Makefile index b01d26f..cb05cd0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_IKCONFIG) += configs.o @@ -27,6 +28,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SECCOMP) += seccomp.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) diff --git a/kernel/audit.c b/kernel/audit.c index 9c4f1af..ef35166 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -46,6 +46,8 @@ #include <asm/types.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/err.h> +#include <linux/kthread.h> #include <linux/audit.h> @@ -68,7 +70,7 @@ static int audit_failure = AUDIT_FAIL_PRINTK; /* If audit records are to be written to the netlink socket, audit_pid * contains the (non-zero) pid. */ -static int audit_pid; +int audit_pid; /* If audit_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in @@ -77,7 +79,10 @@ static int audit_rate_limit; /* Number of outstanding audit_buffers allowed. */ static int audit_backlog_limit = 64; -static atomic_t audit_backlog = ATOMIC_INIT(0); + +/* The identity of the user shutting down the audit system. */ +uid_t audit_sig_uid = -1; +pid_t audit_sig_pid = -1; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] @@ -91,19 +96,17 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; -/* There are two lists of audit buffers. The txlist contains audit - * buffers that cannot be sent immediately to the netlink device because - * we are in an irq context (these are sent later in a tasklet). - * - * The second list is a list of pre-allocated audit buffers (if more +/* The audit_freelist is a list of pre-allocated audit buffers (if more * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of * being placed on the freelist). */ -static DEFINE_SPINLOCK(audit_txlist_lock); static DEFINE_SPINLOCK(audit_freelist_lock); static int audit_freelist_count = 0; -static LIST_HEAD(audit_txlist); static LIST_HEAD(audit_freelist); +static struct sk_buff_head audit_skb_queue; +static struct task_struct *kauditd_task; +static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); + /* There are three lists of rules -- one to search at task creation * time, one to search at syscall entry time, and another to search at * syscall exit time. */ @@ -112,7 +115,7 @@ static LIST_HEAD(audit_entlist); static LIST_HEAD(audit_extlist); /* The netlink socket is only to be read by 1 CPU, which lets us assume - * that list additions and deletions never happen simultaneiously in + * that list additions and deletions never happen simultaneously in * auditsc.c */ static DECLARE_MUTEX(audit_netlink_sem); @@ -132,21 +135,14 @@ static DECLARE_MUTEX(audit_netlink_sem); * use simultaneously. */ struct audit_buffer { struct list_head list; - struct sk_buff_head sklist; /* formatted skbs ready to send */ + struct sk_buff *skb; /* formatted skb ready to send */ struct audit_context *ctx; /* NULL or associated context */ - int len; /* used area of tmp */ - char tmp[AUDIT_BUFSIZ]; - - /* Pointer to header and contents */ - struct nlmsghdr *nlh; - int total; - int type; - int pid; }; -void audit_set_type(struct audit_buffer *ab, int type) +static void audit_set_pid(struct audit_buffer *ab, pid_t pid) { - ab->type = type; + struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data; + nlh->nlmsg_pid = pid; } struct audit_entry { @@ -154,9 +150,6 @@ struct audit_entry { struct audit_rule rule; }; -static void audit_log_end_irq(struct audit_buffer *ab); -static void audit_log_end_fast(struct audit_buffer *ab); - static void audit_panic(const char *message) { switch (audit_failure) @@ -227,10 +220,8 @@ void audit_log_lost(const char *message) if (print) { printk(KERN_WARNING - "audit: audit_lost=%d audit_backlog=%d" - " audit_rate_limit=%d audit_backlog_limit=%d\n", + "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n", atomic_read(&audit_lost), - atomic_read(&audit_backlog), audit_rate_limit, audit_backlog_limit); audit_panic(message); @@ -242,7 +233,8 @@ static int audit_set_rate_limit(int limit, uid_t loginuid) { int old = audit_rate_limit; audit_rate_limit = limit; - audit_log(NULL, "audit_rate_limit=%d old=%d by auid %u", + audit_log(NULL, AUDIT_CONFIG_CHANGE, + "audit_rate_limit=%d old=%d by auid=%u", audit_rate_limit, old, loginuid); return old; } @@ -251,7 +243,8 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid) { int old = audit_backlog_limit; audit_backlog_limit = limit; - audit_log(NULL, "audit_backlog_limit=%d old=%d by auid %u", + audit_log(NULL, AUDIT_CONFIG_CHANGE, + "audit_backlog_limit=%d old=%d by auid=%u", audit_backlog_limit, old, loginuid); return old; } @@ -262,8 +255,9 @@ static int audit_set_enabled(int state, uid_t loginuid) if (state != 0 && state != 1) return -EINVAL; audit_enabled = state; - audit_log(NULL, "audit_enabled=%d old=%d by auid %u", - audit_enabled, old, loginuid); + audit_log(NULL, AUDIT_CONFIG_CHANGE, + "audit_enabled=%d old=%d by auid=%u", + audit_enabled, old, loginuid); return old; } @@ -275,12 +269,44 @@ static int audit_set_failure(int state, uid_t loginuid) && state != AUDIT_FAIL_PANIC) return -EINVAL; audit_failure = state; - audit_log(NULL, "audit_failure=%d old=%d by auid %u", - audit_failure, old, loginuid); + audit_log(NULL, AUDIT_CONFIG_CHANGE, + "audit_failure=%d old=%d by auid=%u", + audit_failure, old, loginuid); return old; } -#ifdef CONFIG_NET +int kauditd_thread(void *dummy) +{ + struct sk_buff *skb; + + while (1) { + skb = skb_dequeue(&audit_skb_queue); + if (skb) { + if (audit_pid) { + int err = netlink_unicast(audit_sock, skb, audit_pid, 0); + if (err < 0) { + BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ + printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); + audit_pid = 0; + } + } else { + printk(KERN_ERR "%s\n", skb->data + NLMSG_SPACE(0)); + kfree_skb(skb); + } + } else { + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kauditd_wait, &wait); + + if (!skb_queue_len(&audit_skb_queue)) + schedule(); + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&kauditd_wait, &wait); + } + } +} + void audit_send_reply(int pid, int seq, int type, int done, int multi, void *payload, int size) { @@ -293,13 +319,16 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi, skb = alloc_skb(len, GFP_KERNEL); if (!skb) - goto nlmsg_failure; + return; - nlh = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh)); + nlh = NLMSG_PUT(skb, pid, seq, t, size); nlh->nlmsg_flags = flags; data = NLMSG_DATA(nlh); memcpy(data, payload, size); - netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT); + + /* Ignore failure. It'll only happen if the sender goes away, + because our timeout is set to infinite. */ + netlink_unicast(audit_sock, skb, pid, 0); return; nlmsg_failure: /* Used by NLMSG_PUT */ @@ -321,10 +350,12 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) case AUDIT_SET: case AUDIT_ADD: case AUDIT_DEL: + case AUDIT_SIGNAL_INFO: if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: + case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) err = -EPERM; break; @@ -344,11 +375,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; uid_t loginuid; /* loginuid of sender */ + struct audit_sig_info sig_data; err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); if (err) return err; + /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */ + if (!kauditd_task) + kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); + if (IS_ERR(kauditd_task)) { + err = PTR_ERR(kauditd_task); + kauditd_task = NULL; + return err; + } + pid = NETLINK_CREDS(skb)->pid; uid = NETLINK_CREDS(skb)->uid; loginuid = NETLINK_CB(skb).loginuid; @@ -363,7 +404,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) status_set.rate_limit = audit_rate_limit; status_set.backlog_limit = audit_backlog_limit; status_set.lost = atomic_read(&audit_lost); - status_set.backlog = atomic_read(&audit_backlog); + status_set.backlog = skb_queue_len(&audit_skb_queue); audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, &status_set, sizeof(status_set)); break; @@ -382,7 +423,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_PID) { int old = audit_pid; audit_pid = status_get->pid; - audit_log(NULL, "audit_pid=%d old=%d by auid %u", + audit_log(NULL, AUDIT_CONFIG_CHANGE, + "audit_pid=%d old=%d by auid=%u", audit_pid, old, loginuid); } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) @@ -392,18 +434,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) loginuid); break; case AUDIT_USER: - ab = audit_log_start(NULL); + case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: + ab = audit_log_start(NULL, msg_type); if (!ab) break; /* audit_panic has been called */ audit_log_format(ab, - "user pid=%d uid=%d length=%d loginuid=%u" + "user pid=%d uid=%u auid=%u" " msg='%.1024s'", - pid, uid, - (int)(nlh->nlmsg_len - - ((char *)data - (char *)nlh)), - loginuid, (char *)data); - ab->type = AUDIT_USER; - ab->pid = pid; + pid, uid, loginuid, (char *)data); + audit_set_pid(ab, pid); audit_log_end(ab); break; case AUDIT_ADD: @@ -412,12 +451,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; /* fallthrough */ case AUDIT_LIST: -#ifdef CONFIG_AUDITSYSCALL err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, uid, seq, data, loginuid); -#else - err = -EOPNOTSUPP; -#endif + break; + case AUDIT_SIGNAL_INFO: + sig_data.uid = audit_sig_uid; + sig_data.pid = audit_sig_pid; + audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, + 0, 0, &sig_data, sizeof(sig_data)); break; default: err = -EINVAL; @@ -467,87 +508,6 @@ static void audit_receive(struct sock *sk, int length) up(&audit_netlink_sem); } -/* Move data from tmp buffer into an skb. This is an extra copy, and - * that is unfortunate. However, the copy will only occur when a record - * is being written to user space, which is already a high-overhead - * operation. (Elimination of the copy is possible, for example, by - * writing directly into a pre-allocated skb, at the cost of wasting - * memory. */ -static void audit_log_move(struct audit_buffer *ab) -{ - struct sk_buff *skb; - char *start; - int extra = ab->nlh ? 0 : NLMSG_SPACE(0); - - /* possible resubmission */ - if (ab->len == 0) - return; - - skb = skb_peek_tail(&ab->sklist); - if (!skb || skb_tailroom(skb) <= ab->len + extra) { - skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC); - if (!skb) { - ab->len = 0; /* Lose information in ab->tmp */ - audit_log_lost("out of memory in audit_log_move"); - return; - } - __skb_queue_tail(&ab->sklist, skb); - if (!ab->nlh) - ab->nlh = (struct nlmsghdr *)skb_put(skb, - NLMSG_SPACE(0)); - } - start = skb_put(skb, ab->len); - memcpy(start, ab->tmp, ab->len); - ab->len = 0; -} - -/* Iterate over the skbuff in the audit_buffer, sending their contents - * to user space. */ -static inline int audit_log_drain(struct audit_buffer *ab) -{ - struct sk_buff *skb; - - while ((skb = skb_dequeue(&ab->sklist))) { - int retval = 0; - - if (audit_pid) { - if (ab->nlh) { - ab->nlh->nlmsg_len = ab->total; - ab->nlh->nlmsg_type = ab->type; - ab->nlh->nlmsg_flags = 0; - ab->nlh->nlmsg_seq = 0; - ab->nlh->nlmsg_pid = ab->pid; - } - skb_get(skb); /* because netlink_* frees */ - retval = netlink_unicast(audit_sock, skb, audit_pid, - MSG_DONTWAIT); - } - if (retval == -EAGAIN && - (atomic_read(&audit_backlog)) < audit_backlog_limit) { - skb_queue_head(&ab->sklist, skb); - audit_log_end_irq(ab); - return 1; - } - if (retval < 0) { - if (retval == -ECONNREFUSED) { - printk(KERN_ERR - "audit: *NO* daemon at audit_pid=%d\n", - audit_pid); - audit_pid = 0; - } else - audit_log_lost("netlink socket too busy"); - } - if (!audit_pid) { /* No daemon */ - int offset = ab->nlh ? NLMSG_SPACE(0) : 0; - int len = skb->len - offset; - skb->data[offset + len] = '\0'; - printk(KERN_ERR "%s\n", skb->data + offset); - } - kfree_skb(skb); - ab->nlh = NULL; - } - return 0; -} /* Initialize audit support at boot time. */ static int __init audit_init(void) @@ -558,40 +518,13 @@ static int __init audit_init(void) if (!audit_sock) audit_panic("cannot initialize netlink socket"); + audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + skb_queue_head_init(&audit_skb_queue); audit_initialized = 1; audit_enabled = audit_default; - audit_log(NULL, "initialized"); - return 0; -} - -#else -/* Without CONFIG_NET, we have no skbuffs. For now, print what we have - * in the buffer. */ -static void audit_log_move(struct audit_buffer *ab) -{ - printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp); - ab->len = 0; -} - -static inline int audit_log_drain(struct audit_buffer *ab) -{ - return 0; -} - -/* Initialize audit support at boot time. */ -int __init audit_init(void) -{ - printk(KERN_INFO "audit: initializing WITHOUT netlink support\n"); - audit_sock = NULL; - audit_pid = 0; - - audit_initialized = 1; - audit_enabled = audit_default; - audit_log(NULL, "initialized"); + audit_log(NULL, AUDIT_KERNEL, "initialized"); return 0; } -#endif - __initcall(audit_init); /* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ @@ -608,6 +541,102 @@ static int __init audit_enable(char *str) __setup("audit=", audit_enable); +static void audit_buffer_free(struct audit_buffer *ab) +{ + unsigned long flags; + + if (!ab) + return; + + if (ab->skb) + kfree_skb(ab->skb); + + spin_lock_irqsave(&audit_freelist_lock, flags); + if (++audit_freelist_count > AUDIT_MAXFREE) + kfree(ab); + else + list_add(&ab->list, &audit_freelist); + spin_unlock_irqrestore(&audit_freelist_lock, flags); +} + +static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, + int gfp_mask, int type) +{ + unsigned long flags; + struct audit_buffer *ab = NULL; + struct nlmsghdr *nlh; + + spin_lock_irqsave(&audit_freelist_lock, flags); + if (!list_empty(&audit_freelist)) { + ab = list_entry(audit_freelist.next, + struct audit_buffer, list); + list_del(&ab->list); + --audit_freelist_count; + } + spin_unlock_irqrestore(&audit_freelist_lock, flags); + + if (!ab) { + ab = kmalloc(sizeof(*ab), gfp_mask); + if (!ab) + goto err; + } + + ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask); + if (!ab->skb) + goto err; + + ab->ctx = ctx; + nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); + nlh->nlmsg_type = type; + nlh->nlmsg_flags = 0; + nlh->nlmsg_pid = 0; + nlh->nlmsg_seq = 0; + return ab; +err: + audit_buffer_free(ab); + return NULL; +} + +/* Compute a serial number for the audit record. Audit records are + * written to user-space as soon as they are generated, so a complete + * audit record may be written in several pieces. The timestamp of the + * record and this serial number are used by the user-space tools to + * determine which pieces belong to the same audit record. The + * (timestamp,serial) tuple is unique for each syscall and is live from + * syscall entry to syscall exit. + * + * Atomic values are only guaranteed to be 24-bit, so we count down. + * + * NOTE: Another possibility is to store the formatted records off the + * audit context (for those records that have a context), and emit them + * all at syscall exit. However, this could delay the reporting of + * significant errors until syscall exit (or never, if the system + * halts). */ +unsigned int audit_serial(void) +{ + static atomic_t serial = ATOMIC_INIT(0xffffff); + unsigned int a, b; + + do { + a = atomic_read(&serial); + if (atomic_dec_and_test(&serial)) + atomic_set(&serial, 0xffffff); + b = atomic_read(&serial); + } while (b != a - 1); + + return 0xffffff - b; +} + +static inline void audit_get_stamp(struct audit_context *ctx, + struct timespec *t, unsigned int *serial) +{ + if (ctx) + auditsc_get_stamp(ctx, t, serial); + else { + *t = CURRENT_TIME; + *serial = audit_serial(); + } +} /* Obtain an audit buffer. This routine does locking to obtain the * audit buffer, but then no locking is required for calls to @@ -615,10 +644,9 @@ __setup("audit=", audit_enable); * syscall, then the syscall is marked as auditable and an audit record * will be written at syscall exit. If there is no associated task, tsk * should be NULL. */ -struct audit_buffer *audit_log_start(struct audit_context *ctx) +struct audit_buffer *audit_log_start(struct audit_context *ctx, int type) { struct audit_buffer *ab = NULL; - unsigned long flags; struct timespec t; unsigned int serial; @@ -626,57 +654,48 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx) return NULL; if (audit_backlog_limit - && atomic_read(&audit_backlog) > audit_backlog_limit) { + && skb_queue_len(&audit_skb_queue) > audit_backlog_limit) { if (audit_rate_check()) printk(KERN_WARNING "audit: audit_backlog=%d > " "audit_backlog_limit=%d\n", - atomic_read(&audit_backlog), + skb_queue_len(&audit_skb_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); return NULL; } - spin_lock_irqsave(&audit_freelist_lock, flags); - if (!list_empty(&audit_freelist)) { - ab = list_entry(audit_freelist.next, - struct audit_buffer, list); - list_del(&ab->list); - --audit_freelist_count; - } - spin_unlock_irqrestore(&audit_freelist_lock, flags); - - if (!ab) - ab = kmalloc(sizeof(*ab), GFP_ATOMIC); + ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type); if (!ab) { audit_log_lost("out of memory in audit_log_start"); return NULL; } - atomic_inc(&audit_backlog); - skb_queue_head_init(&ab->sklist); - - ab->ctx = ctx; - ab->len = 0; - ab->nlh = NULL; - ab->total = 0; - ab->type = AUDIT_KERNEL; - ab->pid = 0; + audit_get_stamp(ab->ctx, &t, &serial); -#ifdef CONFIG_AUDITSYSCALL - if (ab->ctx) - audit_get_stamp(ab->ctx, &t, &serial); - else -#endif - { - t = CURRENT_TIME; - serial = 0; - } audit_log_format(ab, "audit(%lu.%03lu:%u): ", t.tv_sec, t.tv_nsec/1000000, serial); return ab; } +/** + * audit_expand - expand skb in the audit buffer + * @ab: audit_buffer + * + * Returns 0 (no space) on failed expansion, or available space if + * successful. + */ +static inline int audit_expand(struct audit_buffer *ab, int extra) +{ + struct sk_buff *skb = ab->skb; + int ret = pskb_expand_head(skb, skb_headroom(skb), extra, + GFP_ATOMIC); + if (ret < 0) { + audit_log_lost("out of memory in audit_expand"); + return 0; + } + return skb_tailroom(skb); +} /* Format an audit message into the audit buffer. If there isn't enough * room in the audit buffer, more room will be allocated and vsnprint @@ -686,26 +705,35 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args) { int len, avail; + struct sk_buff *skb; + va_list args2; if (!ab) return; - avail = sizeof(ab->tmp) - ab->len; - if (avail <= 0) { - audit_log_move(ab); - avail = sizeof(ab->tmp) - ab->len; + BUG_ON(!ab->skb); + skb = ab->skb; + avail = skb_tailroom(skb); + if (avail == 0) { + avail = audit_expand(ab, AUDIT_BUFSIZ); + if (!avail) + goto out; } - len = vsnprintf(ab->tmp + ab->len, avail, fmt, args); + va_copy(args2, args); + len = vsnprintf(skb->tail, avail, fmt, args); if (len >= avail) { /* The printk buffer is 1024 bytes long, so if we get * here and AUDIT_BUFSIZ is at least 1024, then we can * log everything that printk could have logged. */ - audit_log_move(ab); - avail = sizeof(ab->tmp) - ab->len; - len = vsnprintf(ab->tmp + ab->len, avail, fmt, args); + avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); + if (!avail) + goto out; + len = vsnprintf(skb->tail, avail, fmt, args2); } - ab->len += (len < avail) ? len : avail; - ab->total += (len < avail) ? len : avail; + if (len > 0) + skb_put(skb, len); +out: + return; } /* Format a message into the audit buffer. All the work is done in @@ -721,20 +749,47 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) va_end(args); } -void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) +/* This function will take the passed buf and convert it into a string of + * ascii hex digits. The new string is placed onto the skb. */ +void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, + size_t len) { - int i; + int i, avail, new_len; + unsigned char *ptr; + struct sk_buff *skb; + static const unsigned char *hex = "0123456789ABCDEF"; + + BUG_ON(!ab->skb); + skb = ab->skb; + avail = skb_tailroom(skb); + new_len = len<<1; + if (new_len >= avail) { + /* Round the buffer request up to the next multiple */ + new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1); + avail = audit_expand(ab, new_len); + if (!avail) + return; + } - for (i=0; i<len; i++) - audit_log_format(ab, "%02x", buf[i]); + ptr = skb->tail; + for (i=0; i<len; i++) { + *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */ + *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */ + } + *ptr = 0; + skb_put(skb, len << 1); /* new string is twice the old string */ } +/* This code will escape a string that is passed to it if the string + * contains a control character, unprintable character, double quote mark, + * or a space. Unescaped strings will start and end with a double quote mark. + * Strings that are escaped are printed in hex (2 digits per char). */ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { const unsigned char *p = string; while (*p) { - if (*p == '"' || *p == ' ' || *p < 0x20 || *p > 0x7f) { + if (*p == '"' || *p < 0x21 || *p > 0x7f) { audit_log_hex(ab, string, strlen(string)); return; } @@ -743,117 +798,63 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) audit_log_format(ab, "\"%s\"", string); } - -/* This is a helper-function to print the d_path without using a static - * buffer or allocating another buffer in addition to the one in - * audit_buffer. */ +/* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, struct dentry *dentry, struct vfsmount *vfsmnt) { - char *p; - int len, avail; + char *p, *path; - if (prefix) audit_log_format(ab, " %s", prefix); - - if (ab->len > 128) - audit_log_move(ab); - avail = sizeof(ab->tmp) - ab->len; - p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail); - if (IS_ERR(p)) { - /* FIXME: can we save some information here? */ - audit_log_format(ab, "<toolong>"); - } else { - /* path isn't at start of buffer */ - len = (ab->tmp + sizeof(ab->tmp) - 1) - p; - memmove(ab->tmp + ab->len, p, len); - ab->len += len; - ab->total += len; - } -} - -/* Remove queued messages from the audit_txlist and send them to userspace. */ -static void audit_tasklet_handler(unsigned long arg) -{ - LIST_HEAD(list); - struct audit_buffer *ab; - unsigned long flags; + if (prefix) + audit_log_format(ab, " %s", prefix); - spin_lock_irqsave(&audit_txlist_lock, flags); - list_splice_init(&audit_txlist, &list); - spin_unlock_irqrestore(&audit_txlist_lock, flags); - - while (!list_empty(&list)) { - ab = list_entry(list.next, struct audit_buffer, list); - list_del(&ab->list); - audit_log_end_fast(ab); + /* We will allow 11 spaces for ' (deleted)' to be appended */ + path = kmalloc(PATH_MAX+11, GFP_KERNEL); + if (!path) { + audit_log_format(ab, "<no memory>"); + return; } + p = d_path(dentry, vfsmnt, path, PATH_MAX+11); + if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ + /* FIXME: can we save some information here? */ + audit_log_format(ab, "<too long>"); + } else + audit_log_untrustedstring(ab, p); + kfree(path); } -static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0); - /* The netlink_* functions cannot be called inside an irq context, so * the audit buffer is places on a queue and a tasklet is scheduled to * remove them from the queue outside the irq context. May be called in * any context. */ -static void audit_log_end_irq(struct audit_buffer *ab) -{ - unsigned long flags; - - if (!ab) - return; - spin_lock_irqsave(&audit_txlist_lock, flags); - list_add_tail(&ab->list, &audit_txlist); - spin_unlock_irqrestore(&audit_txlist_lock, flags); - - tasklet_schedule(&audit_tasklet); -} - -/* Send the message in the audit buffer directly to user space. May not - * be called in an irq context. */ -static void audit_log_end_fast(struct audit_buffer *ab) +void audit_log_end(struct audit_buffer *ab) { - unsigned long flags; - - BUG_ON(in_irq()); if (!ab) return; if (!audit_rate_check()) { audit_log_lost("rate limit exceeded"); } else { - audit_log_move(ab); - if (audit_log_drain(ab)) - return; + if (audit_pid) { + struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data; + nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); + skb_queue_tail(&audit_skb_queue, ab->skb); + ab->skb = NULL; + wake_up_interruptible(&kauditd_wait); + } else { + printk("%s\n", ab->skb->data + NLMSG_SPACE(0)); + } } - - atomic_dec(&audit_backlog); - spin_lock_irqsave(&audit_freelist_lock, flags); - if (++audit_freelist_count > AUDIT_MAXFREE) - kfree(ab); - else - list_add(&ab->list, &audit_freelist); - spin_unlock_irqrestore(&audit_freelist_lock, flags); -} - -/* Send or queue the message in the audit buffer, depending on the - * current context. (A convenience function that may be called in any - * context.) */ -void audit_log_end(struct audit_buffer *ab) -{ - if (in_irq()) - audit_log_end_irq(ab); - else - audit_log_end_fast(ab); + audit_buffer_free(ab); } /* Log an audit record. This is a convenience function that calls * audit_log_start, audit_log_vformat, and audit_log_end. It may be * called in any context. */ -void audit_log(struct audit_context *ctx, const char *fmt, ...) +void audit_log(struct audit_context *ctx, int type, const char *fmt, ...) { struct audit_buffer *ab; va_list args; - ab = audit_log_start(ctx); + ab = audit_log_start(ctx, type); if (ab) { va_start(args, fmt); audit_log_vformat(ab, fmt, args); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 37b3ac9..e75f84e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -34,7 +34,8 @@ #include <asm/types.h> #include <linux/mm.h> #include <linux/module.h> - +#include <linux/mount.h> +#include <linux/socket.h> #include <linux/audit.h> #include <linux/personality.h> #include <linux/time.h> @@ -112,6 +113,23 @@ struct audit_aux_data_ipcctl { mode_t mode; }; +struct audit_aux_data_socketcall { + struct audit_aux_data d; + int nargs; + unsigned long args[0]; +}; + +struct audit_aux_data_sockaddr { + struct audit_aux_data d; + int len; + char a[0]; +}; + +struct audit_aux_data_path { + struct audit_aux_data d; + struct dentry *dentry; + struct vfsmount *mnt; +}; /* The per-task audit context. */ struct audit_context { @@ -127,6 +145,8 @@ struct audit_context { int auditable; /* 1 if record should be written */ int name_count; struct audit_names names[AUDIT_NAMES]; + struct dentry * pwd; + struct vfsmount * pwdmnt; struct audit_context *previous; /* For nested syscalls */ struct audit_aux_data *aux; @@ -157,6 +177,8 @@ struct audit_entry { struct audit_rule rule; }; +extern int audit_pid; + /* Check to see if two rules are identical. It is called from * audit_del_rule during AUDIT_DEL. */ static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) @@ -226,7 +248,6 @@ static inline int audit_del_rule(struct audit_rule *rule, return -EFAULT; /* No matching rule */ } -#ifdef CONFIG_NET /* Copy rule from user-space to kernel-space. Called during * AUDIT_ADD. */ static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) @@ -287,7 +308,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_add_rule(entry, &audit_entlist); if (!err && (flags & AUDIT_AT_EXIT)) err = audit_add_rule(entry, &audit_extlist); - audit_log(NULL, "auid %u added an audit rule\n", loginuid); + audit_log(NULL, AUDIT_CONFIG_CHANGE, + "auid=%u added an audit rule\n", loginuid); break; case AUDIT_DEL: flags =((struct audit_rule *)data)->flags; @@ -297,7 +319,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_del_rule(data, &audit_entlist); if (!err && (flags & AUDIT_AT_EXIT)) err = audit_del_rule(data, &audit_extlist); - audit_log(NULL, "auid %u removed an audit rule\n", loginuid); + audit_log(NULL, AUDIT_CONFIG_CHANGE, + "auid=%u removed an audit rule\n", loginuid); break; default: return -EINVAL; @@ -305,7 +328,6 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, return err; } -#endif /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. */ @@ -444,7 +466,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) /* At syscall entry and exit time, this filter is called if the * audit_state is not low enough that auditing cannot take place, but is - * also not high enough that we already know we have to write and audit + * also not high enough that we already know we have to write an audit * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT). */ static enum audit_state audit_filter_syscall(struct task_struct *tsk, @@ -532,6 +554,12 @@ static inline void audit_free_names(struct audit_context *context) if (context->names[i].name) __putname(context->names[i].name); context->name_count = 0; + if (context->pwd) + dput(context->pwd); + if (context->pwdmnt) + mntput(context->pwdmnt); + context->pwd = NULL; + context->pwdmnt = NULL; } static inline void audit_free_aux(struct audit_context *context) @@ -539,6 +567,11 @@ static inline void audit_free_aux(struct audit_context *context) struct audit_aux_data *aux; while ((aux = context->aux)) { + if (aux->type == AUDIT_AVC_PATH) { + struct audit_aux_data_path *axi = (void *)aux; + dput(axi->dentry); + mntput(axi->mnt); + } context->aux = aux->next; kfree(aux); } @@ -625,7 +658,8 @@ static void audit_log_task_info(struct audit_buffer *ab) struct vm_area_struct *vma; get_task_comm(name, current); - audit_log_format(ab, " comm=%s", name); + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, name); if (!mm) return; @@ -649,23 +683,24 @@ static void audit_log_exit(struct audit_context *context) { int i; struct audit_buffer *ab; + struct audit_aux_data *aux; - ab = audit_log_start(context); + ab = audit_log_start(context, AUDIT_SYSCALL); if (!ab) return; /* audit_panic has been called */ - audit_log_format(ab, "syscall=%d", context->major); + audit_log_format(ab, "arch=%x syscall=%d", + context->arch, context->major); if (context->personality != PER_LINUX) audit_log_format(ab, " per=%lx", context->personality); - audit_log_format(ab, " arch=%x", context->arch); if (context->return_valid) audit_log_format(ab, " success=%s exit=%ld", (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" - " pid=%d loginuid=%d uid=%d gid=%d" - " euid=%d suid=%d fsuid=%d" - " egid=%d sgid=%d fsgid=%d", + " pid=%d auid=%u uid=%u gid=%u" + " euid=%u suid=%u fsuid=%u" + " egid=%u sgid=%u fsgid=%u", context->argv[0], context->argv[1], context->argv[2], @@ -679,33 +714,57 @@ static void audit_log_exit(struct audit_context *context) context->egid, context->sgid, context->fsgid); audit_log_task_info(ab); audit_log_end(ab); - while (context->aux) { - struct audit_aux_data *aux; - ab = audit_log_start(context); + for (aux = context->aux; aux; aux = aux->next) { + + ab = audit_log_start(context, aux->type); if (!ab) continue; /* audit_panic has been called */ - aux = context->aux; - context->aux = aux->next; - - audit_log_format(ab, "auxitem=%d", aux->type); switch (aux->type) { - case AUDIT_AUX_IPCPERM: { + case AUDIT_IPC: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - " qbytes=%lx uid=%d gid=%d mode=%x", + " qbytes=%lx iuid=%u igid=%u mode=%x", axi->qbytes, axi->uid, axi->gid, axi->mode); - } + break; } + + case AUDIT_SOCKETCALL: { + int i; + struct audit_aux_data_socketcall *axs = (void *)aux; + audit_log_format(ab, "nargs=%d", axs->nargs); + for (i=0; i<axs->nargs; i++) + audit_log_format(ab, " a%d=%lx", i, axs->args[i]); + break; } + + case AUDIT_SOCKADDR: { + struct audit_aux_data_sockaddr *axs = (void *)aux; + + audit_log_format(ab, "saddr="); + audit_log_hex(ab, axs->a, axs->len); + break; } + + case AUDIT_AVC_PATH: { + struct audit_aux_data_path *axi = (void *)aux; + audit_log_d_path(ab, "path=", axi->dentry, axi->mnt); + break; } + } audit_log_end(ab); - kfree(aux); } + if (context->pwd && context->pwdmnt) { + ab = audit_log_start(context, AUDIT_CWD); + if (ab) { + audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); + audit_log_end(ab); + } + } for (i = 0; i < context->name_count; i++) { - ab = audit_log_start(context); + ab = audit_log_start(context, AUDIT_PATH); if (!ab) continue; /* audit_panic has been called */ + audit_log_format(ab, "item=%d", i); if (context->names[i].name) { audit_log_format(ab, " name="); @@ -713,7 +772,7 @@ static void audit_log_exit(struct audit_context *context) } if (context->names[i].ino != (unsigned long)-1) audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" - " uid=%d gid=%d rdev=%02x:%02x", + " ouid=%u ogid=%u rdev=%02x:%02x", context->names[i].ino, MAJOR(context->names[i].dev), MINOR(context->names[i].dev), @@ -741,42 +800,12 @@ void audit_free(struct task_struct *tsk) /* Check for system calls that do not go through the exit * function (e.g., exit_group), then free context block. */ - if (context->in_syscall && context->auditable) + if (context->in_syscall && context->auditable && context->pid != audit_pid) audit_log_exit(context); audit_free_context(context); } -/* Compute a serial number for the audit record. Audit records are - * written to user-space as soon as they are generated, so a complete - * audit record may be written in several pieces. The timestamp of the - * record and this serial number are used by the user-space daemon to - * determine which pieces belong to the same audit record. The - * (timestamp,serial) tuple is unique for each syscall and is live from - * syscall entry to syscall exit. - * - * Atomic values are only guaranteed to be 24-bit, so we count down. - * - * NOTE: Another possibility is to store the formatted records off the - * audit context (for those records that have a context), and emit them - * all at syscall exit. However, this could delay the reporting of - * significant errors until syscall exit (or never, if the system - * halts). */ -static inline unsigned int audit_serial(void) -{ - static atomic_t serial = ATOMIC_INIT(0xffffff); - unsigned int a, b; - - do { - a = atomic_read(&serial); - if (atomic_dec_and_test(&serial)) - atomic_set(&serial, 0xffffff); - b = atomic_read(&serial); - } while (b != a - 1); - - return 0xffffff - b; -} - /* Fill in audit context at syscall entry. This only happens if the * audit context was created when the task was created and the state or * filters demand the audit context be built. If the state from the @@ -876,7 +905,7 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) if (likely(!context)) return; - if (context->in_syscall && context->auditable) + if (context->in_syscall && context->auditable && context->pid != audit_pid) audit_log_exit(context); context->in_syscall = 0; @@ -916,6 +945,13 @@ void audit_getname(const char *name) context->names[context->name_count].name = name; context->names[context->name_count].ino = (unsigned long)-1; ++context->name_count; + if (!context->pwd) { + read_lock(¤t->fs->lock); + context->pwd = dget(current->fs->pwd); + context->pwdmnt = mntget(current->fs->pwdmnt); + read_unlock(¤t->fs->lock); + } + } /* Intercept a putname request. Called from @@ -994,34 +1030,26 @@ void audit_inode(const char *name, const struct inode *inode) context->names[idx].rdev = inode->i_rdev; } -void audit_get_stamp(struct audit_context *ctx, - struct timespec *t, unsigned int *serial) +void auditsc_get_stamp(struct audit_context *ctx, + struct timespec *t, unsigned int *serial) { - if (ctx) { - t->tv_sec = ctx->ctime.tv_sec; - t->tv_nsec = ctx->ctime.tv_nsec; - *serial = ctx->serial; - ctx->auditable = 1; - } else { - *t = CURRENT_TIME; - *serial = 0; - } + t->tv_sec = ctx->ctime.tv_sec; + t->tv_nsec = ctx->ctime.tv_nsec; + *serial = ctx->serial; + ctx->auditable = 1; } -extern int audit_set_type(struct audit_buffer *ab, int type); - int audit_set_loginuid(struct task_struct *task, uid_t loginuid) { if (task->audit_context) { struct audit_buffer *ab; - ab = audit_log_start(NULL); + ab = audit_log_start(NULL, AUDIT_LOGIN); if (ab) { audit_log_format(ab, "login pid=%d uid=%u " - "old loginuid=%u new loginuid=%u", + "old auid=%u new auid=%u", task->pid, task->uid, task->audit_context->loginuid, loginuid); - audit_set_type(ab, AUDIT_LOGIN); audit_log_end(ab); } task->audit_context->loginuid = loginuid; @@ -1051,8 +1079,89 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) ax->gid = gid; ax->mode = mode; - ax->d.type = AUDIT_AUX_IPCPERM; + ax->d.type = AUDIT_IPC; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +int audit_socketcall(int nargs, unsigned long *args) +{ + struct audit_aux_data_socketcall *ax; + struct audit_context *context = current->audit_context; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL); + if (!ax) + return -ENOMEM; + + ax->nargs = nargs; + memcpy(ax->args, args, nargs * sizeof(unsigned long)); + + ax->d.type = AUDIT_SOCKETCALL; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +int audit_sockaddr(int len, void *a) +{ + struct audit_aux_data_sockaddr *ax; + struct audit_context *context = current->audit_context; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); + if (!ax) + return -ENOMEM; + + ax->len = len; + memcpy(ax->a, a, len); + + ax->d.type = AUDIT_SOCKADDR; ax->d.next = context->aux; context->aux = (void *)ax; return 0; } + +int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) +{ + struct audit_aux_data_path *ax; + struct audit_context *context = current->audit_context; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + ax->dentry = dget(dentry); + ax->mnt = mntget(mnt); + + ax->d.type = AUDIT_AVC_PATH; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +void audit_signal_info(int sig, struct task_struct *t) +{ + extern pid_t audit_sig_pid; + extern uid_t audit_sig_uid; + + if (unlikely(audit_pid && t->pid == audit_pid)) { + if (sig == SIGTERM || sig == SIGHUP) { + struct audit_context *ctx = current->audit_context; + audit_sig_pid = current->pid; + if (ctx) + audit_sig_uid = ctx->loginuid; + else + audit_sig_uid = current->uid; + } + } +} + diff --git a/kernel/cpu.c b/kernel/cpu.c index 628f4cc..53d8263 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -63,19 +63,15 @@ static int take_cpu_down(void *unused) { int err; - /* Take offline: makes arch_cpu_down somewhat easier. */ - cpu_clear(smp_processor_id(), cpu_online_map); - /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) - cpu_set(smp_processor_id(), cpu_online_map); - else - /* Force idle task to run as soon as we yield: it should - immediately notice cpu is offline and die quickly. */ - sched_idle_next(); + return err; - return err; + /* Force idle task to run as soon as we yield: it should + immediately notice cpu is offline and die quickly. */ + sched_idle_next(); + return 0; } int cpu_down(unsigned int cpu) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 961d740..984c0bf 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -166,9 +166,8 @@ static struct super_block *cpuset_sb = NULL; * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't * (usually) grab cpuset_sem. These are the two most performance * critical pieces of code here. The exception occurs on exit(), - * if the last task using a cpuset exits, and the cpuset was marked - * notify_on_release. In that case, the cpuset_sem is taken, the - * path to the released cpuset calculated, and a usermode call made + * when a task in a notify_on_release cpuset exits. Then cpuset_sem + * is taken, and if the cpuset count is zero, a usermode call made * to /sbin/cpuset_release_agent with the name of the cpuset (path * relative to the root of cpuset file system) as the argument. * @@ -229,13 +228,7 @@ static struct dentry_operations cpuset_dops = { static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name) { - struct qstr qstr; - struct dentry *d; - - qstr.name = name; - qstr.len = strlen(name); - qstr.hash = full_name_hash(name, qstr.len); - d = lookup_hash(&qstr, parent); + struct dentry *d = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(d)) d->d_op = &cpuset_dops; return d; @@ -602,10 +595,62 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) return 0; } +/* + * For a given cpuset cur, partition the system as follows + * a. All cpus in the parent cpuset's cpus_allowed that are not part of any + * exclusive child cpusets + * b. All cpus in the current cpuset's cpus_allowed that are not part of any + * exclusive child cpusets + * Build these two partitions by calling partition_sched_domains + * + * Call with cpuset_sem held. May nest a call to the + * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. + */ +static void update_cpu_domains(struct cpuset *cur) +{ + struct cpuset *c, *par = cur->parent; + cpumask_t pspan, cspan; + + if (par == NULL || cpus_empty(cur->cpus_allowed)) + return; + + /* + * Get all cpus from parent's cpus_allowed not part of exclusive + * children + */ + pspan = par->cpus_allowed; + list_for_each_entry(c, &par->children, sibling) { + if (is_cpu_exclusive(c)) + cpus_andnot(pspan, pspan, c->cpus_allowed); + } + if (is_removed(cur) || !is_cpu_exclusive(cur)) { + cpus_or(pspan, pspan, cur->cpus_allowed); + if (cpus_equal(pspan, cur->cpus_allowed)) + return; + cspan = CPU_MASK_NONE; + } else { + if (cpus_empty(pspan)) + return; + cspan = cur->cpus_allowed; + /* + * Get all cpus from current cpuset's cpus_allowed not part + * of exclusive children + */ + list_for_each_entry(c, &cur->children, sibling) { + if (is_cpu_exclusive(c)) + cpus_andnot(cspan, cspan, c->cpus_allowed); + } + } + + lock_cpu_hotplug(); + partition_sched_domains(&pspan, &cspan); + unlock_cpu_hotplug(); +} + static int update_cpumask(struct cpuset *cs, char *buf) { struct cpuset trialcs; - int retval; + int retval, cpus_unchanged; trialcs = *cs; retval = cpulist_parse(buf, trialcs.cpus_allowed); @@ -615,9 +660,13 @@ static int update_cpumask(struct cpuset *cs, char *buf) if (cpus_empty(trialcs.cpus_allowed)) return -ENOSPC; retval = validate_change(cs, &trialcs); - if (retval == 0) - cs->cpus_allowed = trialcs.cpus_allowed; - return retval; + if (retval < 0) + return retval; + cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); + cs->cpus_allowed = trialcs.cpus_allowed; + if (is_cpu_exclusive(cs) && !cpus_unchanged) + update_cpu_domains(cs); + return 0; } static int update_nodemask(struct cpuset *cs, char *buf) @@ -653,7 +702,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) { int turning_on; struct cpuset trialcs; - int err; + int err, cpu_exclusive_changed; turning_on = (simple_strtoul(buf, NULL, 10) != 0); @@ -664,13 +713,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) clear_bit(bit, &trialcs.flags); err = validate_change(cs, &trialcs); - if (err == 0) { - if (turning_on) - set_bit(bit, &cs->flags); - else - clear_bit(bit, &cs->flags); - } - return err; + if (err < 0) + return err; + cpu_exclusive_changed = + (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); + if (turning_on) + set_bit(bit, &cs->flags); + else + clear_bit(bit, &cs->flags); + + if (cpu_exclusive_changed) + update_cpu_domains(cs); + return 0; } static int attach_task(struct cpuset *cs, char *buf) @@ -1316,12 +1370,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) up(&cpuset_sem); return -EBUSY; } - spin_lock(&cs->dentry->d_lock); parent = cs->parent; set_bit(CS_REMOVED, &cs->flags); + if (is_cpu_exclusive(cs)) + update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ if (list_empty(&parent->children)) check_for_release(parent); + spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); cs->dentry = NULL; spin_unlock(&d->d_lock); @@ -1404,6 +1460,18 @@ void cpuset_fork(struct task_struct *tsk) * * Description: Detach cpuset from @tsk and release it. * + * Note that cpusets marked notify_on_release force every task + * in them to take the global cpuset_sem semaphore when exiting. + * This could impact scaling on very large systems. Be reluctant + * to use notify_on_release cpusets where very high task exit + * scaling is required on large systems. + * + * Don't even think about derefencing 'cs' after the cpuset use + * count goes to zero, except inside a critical section guarded + * by the cpuset_sem semaphore. If you don't hold cpuset_sem, + * then a zero cpuset use count is a license to any other task to + * nuke the cpuset immediately. + * **/ void cpuset_exit(struct task_struct *tsk) @@ -1415,10 +1483,13 @@ void cpuset_exit(struct task_struct *tsk) tsk->cpuset = NULL; task_unlock(tsk); - if (atomic_dec_and_test(&cs->count)) { + if (notify_on_release(cs)) { down(&cpuset_sem); - check_for_release(cs); + if (atomic_dec_and_test(&cs->count)) + check_for_release(cs); up(&cpuset_sem); + } else { + atomic_dec(&cs->count); } } diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c new file mode 100644 index 0000000..459ba49 --- /dev/null +++ b/kernel/crash_dump.c @@ -0,0 +1,52 @@ +/* + * kernel/crash_dump.c - Memory preserving reboot related code. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. All rights reserved + */ + +#include <linux/smp_lock.h> +#include <linux/errno.h> +#include <linux/proc_fs.h> +#include <linux/bootmem.h> +#include <linux/highmem.h> +#include <linux/crash_dump.h> + +#include <asm/io.h> +#include <asm/uaccess.h> + +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + +/* + * Copy a page from "oldmem". For this page, there is no pte mapped + * in the current kernel. We stitch up a pte, similar to kmap_atomic. + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, int userbuf) +{ + void *page, *vaddr; + + if (!csize) + return 0; + + page = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!page) + return -ENOMEM; + + vaddr = kmap_atomic_pfn(pfn, KM_PTE0); + copy_page(page, vaddr); + kunmap_atomic(vaddr, KM_PTE0); + + if (userbuf) { + if (copy_to_user(buf, (page + offset), csize)) { + kfree(page); + return -EFAULT; + } + } else { + memcpy(buf, (page + offset), csize); + } + + kfree(page); + return csize; +} diff --git a/kernel/exit.c b/kernel/exit.c index edaa50b..3ebcd60 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -72,6 +72,11 @@ repeat: BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); __exit_signal(p); __exit_sighand(p); + /* + * Note that the fastpath in sys_times depends on __exit_signal having + * updated the counters before a task is removed from the tasklist of + * the process by __unhash_process. + */ __unhash_process(p); /* @@ -793,6 +798,17 @@ fastcall NORET_TYPE void do_exit(long code) ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); } + /* + * We're taking recursive faults here in do_exit. Safest is to just + * leave this task alone and wait for reboot. + */ + if (unlikely(tsk->flags & PF_EXITING)) { + printk(KERN_ALERT + "Fixing recursive fault but reboot is needed!\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + } + tsk->flags |= PF_EXITING; /* @@ -811,10 +827,8 @@ fastcall NORET_TYPE void do_exit(long code) acct_update_integrals(tsk); update_mem_hiwater(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); - if (group_dead) { - del_timer_sync(&tsk->signal->real_timer); + if (group_dead) acct_process(code); - } exit_mm(tsk); exit_sem(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index f42a17f..2c78068 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -194,6 +194,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) mm->mmap = NULL; mm->mmap_cache = NULL; mm->free_area_cache = oldmm->mmap_base; + mm->cached_hole_size = ~0UL; mm->map_count = 0; set_mm_counter(mm, rss, 0); set_mm_counter(mm, anon_rss, 0); @@ -249,8 +250,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) /* * Link in the new vma and copy the page table entries: - * link in first so that swapoff can see swap entries, - * and try_to_unmap_one's find_vma find the new vma. + * link in first so that swapoff can see swap entries. + * Note that, exceptionally, here the vma is inserted + * without holding mm->mmap_sem. */ spin_lock(&mm->page_table_lock); *pprev = tmp; @@ -322,6 +324,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm) mm->ioctx_list = NULL; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->cached_hole_size = ~0UL; if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -1000,9 +1003,6 @@ static task_t *copy_process(unsigned long clone_flags, p->pdeath_signal = 0; p->exit_state = 0; - /* Perform scheduler related setup */ - sched_fork(p); - /* * Ok, make it visible to the rest of the system. * We dont wake it up yet. @@ -1011,18 +1011,24 @@ static task_t *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->ptrace_children); INIT_LIST_HEAD(&p->ptrace_list); + /* Perform scheduler related setup. Assign this task to a CPU. */ + sched_fork(p, clone_flags); + /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); /* - * The task hasn't been attached yet, so cpus_allowed mask cannot - * have changed. The cpus_allowed mask of the parent may have - * changed after it was copied first time, and it may then move to - * another CPU - so we re-copy it here and set the child's CPU to - * the parent's CPU. This avoids alot of nasty races. + * The task hasn't been attached yet, so its cpus_allowed mask will + * not be changed, nor will its assigned CPU. + * + * The cpus_allowed mask of the parent may have changed after it was + * copied first time - so re-copy it here, then check the child's CPU + * to ensure it is on a valid CPU (and if not, just force it back to + * parent's CPU). This avoids alot of nasty races. */ p->cpus_allowed = current->cpus_allowed; - set_task_cpu(p, smp_processor_id()); + if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed))) + set_task_cpu(p, smp_processor_id()); /* * Check for pending SIGKILL! The new thread should not be allowed diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 06b5a63..436c7d9 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -119,8 +119,6 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) */ desc->handler->ack(irq); action_ret = handle_IRQ_event(irq, regs, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); desc->handler->end(irq); return 1; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5202e4c..ac67009 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -6,6 +6,7 @@ * This file contains driver APIs to the irq subsystem. */ +#include <linux/config.h> #include <linux/irq.h> #include <linux/module.h> #include <linux/random.h> @@ -255,6 +256,13 @@ void free_irq(unsigned int irq, void *dev_id) /* Found it - now remove it from the list of entries */ *pp = action->next; + + /* Currently used only by UML, might disappear one day.*/ +#ifdef CONFIG_IRQ_RELEASE_METHOD + if (desc->handler->release) + desc->handler->release(irq, dev_id); +#endif + if (!desc->action) { desc->status |= IRQ_DISABLED; if (desc->handler->shutdown) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index f6297c3..ba039e8 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -45,7 +45,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) } } -void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) +static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) { static int count = 100; diff --git a/kernel/kexec.c b/kernel/kexec.c new file mode 100644 index 0000000..7843548 --- /dev/null +++ b/kernel/kexec.c @@ -0,0 +1,1063 @@ +/* + * kexec.c - kexec system call + * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/mm.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/kexec.h> +#include <linux/spinlock.h> +#include <linux/list.h> +#include <linux/highmem.h> +#include <linux/syscalls.h> +#include <linux/reboot.h> +#include <linux/syscalls.h> +#include <linux/ioport.h> +#include <linux/hardirq.h> + +#include <asm/page.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/system.h> +#include <asm/semaphore.h> + +/* Location of the reserved area for the crash kernel */ +struct resource crashk_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +int kexec_should_crash(struct task_struct *p) +{ + if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops) + return 1; + return 0; +} + +/* + * When kexec transitions to the new kernel there is a one-to-one + * mapping between physical and virtual addresses. On processors + * where you can disable the MMU this is trivial, and easy. For + * others it is still a simple predictable page table to setup. + * + * In that environment kexec copies the new kernel to its final + * resting place. This means I can only support memory whose + * physical address can fit in an unsigned long. In particular + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. + * If the assembly stub has more restrictive requirements + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be + * defined more restrictively in <asm/kexec.h>. + * + * The code for the transition from the current kernel to the + * the new kernel is placed in the control_code_buffer, whose size + * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single + * page of memory is necessary, but some architectures require more. + * Because this memory must be identity mapped in the transition from + * virtual to physical addresses it must live in the range + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily + * modifiable. + * + * The assembly stub in the control code buffer is passed a linked list + * of descriptor pages detailing the source pages of the new kernel, + * and the destination addresses of those source pages. As this data + * structure is not used in the context of the current OS, it must + * be self-contained. + * + * The code has been made to work with highmem pages and will use a + * destination page in its final resting place (if it happens + * to allocate it). The end product of this is that most of the + * physical address space, and most of RAM can be used. + * + * Future directions include: + * - allocating a page table with the control code buffer identity + * mapped, to simplify machine_kexec and make kexec_on_panic more + * reliable. + */ + +/* + * KIMAGE_NO_DEST is an impossible destination address..., for + * allocating pages whose destination address we do not care about. + */ +#define KIMAGE_NO_DEST (-1UL) + +static int kimage_is_destination_range(struct kimage *image, + unsigned long start, unsigned long end); +static struct page *kimage_alloc_page(struct kimage *image, + unsigned int gfp_mask, + unsigned long dest); + +static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment __user *segments) +{ + size_t segment_bytes; + struct kimage *image; + unsigned long i; + int result; + + /* Allocate a controlling structure */ + result = -ENOMEM; + image = kmalloc(sizeof(*image), GFP_KERNEL); + if (!image) + goto out; + + memset(image, 0, sizeof(*image)); + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + image->control_page = ~0; /* By default this does not apply */ + image->start = entry; + image->type = KEXEC_TYPE_DEFAULT; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unuseable pages */ + INIT_LIST_HEAD(&image->unuseable_pages); + + /* Read in the segments */ + image->nr_segments = nr_segments; + segment_bytes = nr_segments * sizeof(*segments); + result = copy_from_user(image->segment, segments, segment_bytes); + if (result) + goto out; + + /* + * Verify we have good destination addresses. The caller is + * responsible for making certain we don't attempt to load + * the new image into invalid or reserved areas of RAM. This + * just verifies it is an address we can use. + * + * Since the kernel does everything in page size chunks ensure + * the destination addreses are page aligned. Too many + * special cases crop of when we don't do this. The most + * insidious is getting overlapping destination addresses + * simply because addresses are changed to page size + * granularity. + */ + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) + goto out; + if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) + goto out; + } + + /* Verify our destination addresses do not overlap. + * If we alloed overlapping destination addresses + * through very weird things can happen with no + * easy explanation as one segment stops on another. + */ + result = -EINVAL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + unsigned long j; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + for (j = 0; j < i; j++) { + unsigned long pstart, pend; + pstart = image->segment[j].mem; + pend = pstart + image->segment[j].memsz; + /* Do the segments overlap ? */ + if ((mend > pstart) && (mstart < pend)) + goto out; + } + } + + /* Ensure our buffer sizes are strictly less than + * our memory sizes. This should always be the case, + * and it is easier to check up front than to be surprised + * later on. + */ + result = -EINVAL; + for (i = 0; i < nr_segments; i++) { + if (image->segment[i].bufsz > image->segment[i].memsz) + goto out; + } + + result = 0; +out: + if (result == 0) + *rimage = image; + else + kfree(image); + + return result; + +} + +static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment __user *segments) +{ + int result; + struct kimage *image; + + /* Allocate and initialize a controlling structure */ + image = NULL; + result = do_kimage_alloc(&image, entry, nr_segments, segments); + if (result) + goto out; + + *rimage = image; + + /* + * Find a location for the control code buffer, and add it + * the vector of segments so that it's pages will also be + * counted as destination pages. + */ + result = -ENOMEM; + image->control_code_page = kimage_alloc_control_pages(image, + get_order(KEXEC_CONTROL_CODE_SIZE)); + if (!image->control_code_page) { + printk(KERN_ERR "Could not allocate control_code_buffer\n"); + goto out; + } + + result = 0; + out: + if (result == 0) + *rimage = image; + else + kfree(image); + + return result; +} + +static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment *segments) +{ + int result; + struct kimage *image; + unsigned long i; + + image = NULL; + /* Verify we have a valid entry point */ + if ((entry < crashk_res.start) || (entry > crashk_res.end)) { + result = -EADDRNOTAVAIL; + goto out; + } + + /* Allocate and initialize a controlling structure */ + result = do_kimage_alloc(&image, entry, nr_segments, segments); + if (result) + goto out; + + /* Enable the special crash kernel control page + * allocation policy. + */ + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + + /* + * Verify we have good destination addresses. Normally + * the caller is responsible for making certain we don't + * attempt to load the new image into invalid or reserved + * areas of RAM. But crash kernels are preloaded into a + * reserved area of ram. We must ensure the addresses + * are in the reserved area otherwise preloading the + * kernel could corrupt things. + */ + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + /* Ensure we are within the crash kernel limits */ + if ((mstart < crashk_res.start) || (mend > crashk_res.end)) + goto out; + } + + /* + * Find a location for the control code buffer, and add + * the vector of segments so that it's pages will also be + * counted as destination pages. + */ + result = -ENOMEM; + image->control_code_page = kimage_alloc_control_pages(image, + get_order(KEXEC_CONTROL_CODE_SIZE)); + if (!image->control_code_page) { + printk(KERN_ERR "Could not allocate control_code_buffer\n"); + goto out; + } + + result = 0; +out: + if (result == 0) + *rimage = image; + else + kfree(image); + + return result; +} + +static int kimage_is_destination_range(struct kimage *image, + unsigned long start, + unsigned long end) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + if ((end > mstart) && (start < mend)) + return 1; + } + + return 0; +} + +static struct page *kimage_alloc_pages(unsigned int gfp_mask, + unsigned int order) +{ + struct page *pages; + + pages = alloc_pages(gfp_mask, order); + if (pages) { + unsigned int count, i; + pages->mapping = NULL; + pages->private = order; + count = 1 << order; + for (i = 0; i < count; i++) + SetPageReserved(pages + i); + } + + return pages; +} + +static void kimage_free_pages(struct page *page) +{ + unsigned int order, count, i; + + order = page->private; + count = 1 << order; + for (i = 0; i < count; i++) + ClearPageReserved(page + i); + __free_pages(page, order); +} + +static void kimage_free_page_list(struct list_head *list) +{ + struct list_head *pos, *next; + + list_for_each_safe(pos, next, list) { + struct page *page; + + page = list_entry(pos, struct page, lru); + list_del(&page->lru); + kimage_free_pages(page); + } +} + +static struct page *kimage_alloc_normal_control_pages(struct kimage *image, + unsigned int order) +{ + /* Control pages are special, they are the intermediaries + * that are needed while we copy the rest of the pages + * to their final resting place. As such they must + * not conflict with either the destination addresses + * or memory the kernel is already using. + * + * The only case where we really need more than one of + * these are for architectures where we cannot disable + * the MMU and must instead generate an identity mapped + * page table for all of the memory. + * + * At worst this runs in O(N) of the image size. + */ + struct list_head extra_pages; + struct page *pages; + unsigned int count; + + count = 1 << order; + INIT_LIST_HEAD(&extra_pages); + + /* Loop while I can allocate a page and the page allocated + * is a destination page. + */ + do { + unsigned long pfn, epfn, addr, eaddr; + + pages = kimage_alloc_pages(GFP_KERNEL, order); + if (!pages) + break; + pfn = page_to_pfn(pages); + epfn = pfn + count; + addr = pfn << PAGE_SHIFT; + eaddr = epfn << PAGE_SHIFT; + if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || + kimage_is_destination_range(image, addr, eaddr)) { + list_add(&pages->lru, &extra_pages); + pages = NULL; + } + } while (!pages); + + if (pages) { + /* Remember the allocated page... */ + list_add(&pages->lru, &image->control_pages); + + /* Because the page is already in it's destination + * location we will never allocate another page at + * that address. Therefore kimage_alloc_pages + * will not return it (again) and we don't need + * to give it an entry in image->segment[]. + */ + } + /* Deal with the destination pages I have inadvertently allocated. + * + * Ideally I would convert multi-page allocations into single + * page allocations, and add everyting to image->dest_pages. + * + * For now it is simpler to just free the pages. + */ + kimage_free_page_list(&extra_pages); + + return pages; +} + +static struct page *kimage_alloc_crash_control_pages(struct kimage *image, + unsigned int order) +{ + /* Control pages are special, they are the intermediaries + * that are needed while we copy the rest of the pages + * to their final resting place. As such they must + * not conflict with either the destination addresses + * or memory the kernel is already using. + * + * Control pages are also the only pags we must allocate + * when loading a crash kernel. All of the other pages + * are specified by the segments and we just memcpy + * into them directly. + * + * The only case where we really need more than one of + * these are for architectures where we cannot disable + * the MMU and must instead generate an identity mapped + * page table for all of the memory. + * + * Given the low demand this implements a very simple + * allocator that finds the first hole of the appropriate + * size in the reserved memory region, and allocates all + * of the memory up to and including the hole. + */ + unsigned long hole_start, hole_end, size; + struct page *pages; + + pages = NULL; + size = (1 << order) << PAGE_SHIFT; + hole_start = (image->control_page + (size - 1)) & ~(size - 1); + hole_end = hole_start + size - 1; + while (hole_end <= crashk_res.end) { + unsigned long i; + + if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) + break; + if (hole_end > crashk_res.end) + break; + /* See if I overlap any of the segments */ + for (i = 0; i < image->nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + if ((hole_end >= mstart) && (hole_start <= mend)) { + /* Advance the hole to the end of the segment */ + hole_start = (mend + (size - 1)) & ~(size - 1); + hole_end = hole_start + size - 1; + break; + } + } + /* If I don't overlap any segments I have found my hole! */ + if (i == image->nr_segments) { + pages = pfn_to_page(hole_start >> PAGE_SHIFT); + break; + } + } + if (pages) + image->control_page = hole_end; + + return pages; +} + + +struct page *kimage_alloc_control_pages(struct kimage *image, + unsigned int order) +{ + struct page *pages = NULL; + + switch (image->type) { + case KEXEC_TYPE_DEFAULT: + pages = kimage_alloc_normal_control_pages(image, order); + break; + case KEXEC_TYPE_CRASH: + pages = kimage_alloc_crash_control_pages(image, order); + break; + } + + return pages; +} + +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) +{ + if (*image->entry != 0) + image->entry++; + + if (image->entry == image->last_entry) { + kimage_entry_t *ind_page; + struct page *page; + + page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); + if (!page) + return -ENOMEM; + + ind_page = page_address(page); + *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + image->entry = ind_page; + image->last_entry = ind_page + + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); + } + *image->entry = entry; + image->entry++; + *image->entry = 0; + + return 0; +} + +static int kimage_set_destination(struct kimage *image, + unsigned long destination) +{ + int result; + + destination &= PAGE_MASK; + result = kimage_add_entry(image, destination | IND_DESTINATION); + if (result == 0) + image->destination = destination; + + return result; +} + + +static int kimage_add_page(struct kimage *image, unsigned long page) +{ + int result; + + page &= PAGE_MASK; + result = kimage_add_entry(image, page | IND_SOURCE); + if (result == 0) + image->destination += PAGE_SIZE; + + return result; +} + + +static void kimage_free_extra_pages(struct kimage *image) +{ + /* Walk through and free any extra destination pages I may have */ + kimage_free_page_list(&image->dest_pages); + + /* Walk through and free any unuseable pages I have cached */ + kimage_free_page_list(&image->unuseable_pages); + +} +static int kimage_terminate(struct kimage *image) +{ + if (*image->entry != 0) + image->entry++; + + *image->entry = IND_DONE; + + return 0; +} + +#define for_each_kimage_entry(image, ptr, entry) \ + for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ + ptr = (entry & IND_INDIRECTION)? \ + phys_to_virt((entry & PAGE_MASK)): ptr +1) + +static void kimage_free_entry(kimage_entry_t entry) +{ + struct page *page; + + page = pfn_to_page(entry >> PAGE_SHIFT); + kimage_free_pages(page); +} + +static void kimage_free(struct kimage *image) +{ + kimage_entry_t *ptr, entry; + kimage_entry_t ind = 0; + + if (!image) + return; + + kimage_free_extra_pages(image); + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_INDIRECTION) { + /* Free the previous indirection page */ + if (ind & IND_INDIRECTION) + kimage_free_entry(ind); + /* Save this indirection page until we are + * done with it. + */ + ind = entry; + } + else if (entry & IND_SOURCE) + kimage_free_entry(entry); + } + /* Free the final indirection page */ + if (ind & IND_INDIRECTION) + kimage_free_entry(ind); + + /* Handle any machine specific cleanup */ + machine_kexec_cleanup(image); + + /* Free the kexec control pages... */ + kimage_free_page_list(&image->control_pages); + kfree(image); +} + +static kimage_entry_t *kimage_dst_used(struct kimage *image, + unsigned long page) +{ + kimage_entry_t *ptr, entry; + unsigned long destination = 0; + + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_DESTINATION) + destination = entry & PAGE_MASK; + else if (entry & IND_SOURCE) { + if (page == destination) + return ptr; + destination += PAGE_SIZE; + } + } + + return 0; +} + +static struct page *kimage_alloc_page(struct kimage *image, + unsigned int gfp_mask, + unsigned long destination) +{ + /* + * Here we implement safeguards to ensure that a source page + * is not copied to its destination page before the data on + * the destination page is no longer useful. + * + * To do this we maintain the invariant that a source page is + * either its own destination page, or it is not a + * destination page at all. + * + * That is slightly stronger than required, but the proof + * that no problems will not occur is trivial, and the + * implementation is simply to verify. + * + * When allocating all pages normally this algorithm will run + * in O(N) time, but in the worst case it will run in O(N^2) + * time. If the runtime is a problem the data structures can + * be fixed. + */ + struct page *page; + unsigned long addr; + + /* + * Walk through the list of destination pages, and see if I + * have a match. + */ + list_for_each_entry(page, &image->dest_pages, lru) { + addr = page_to_pfn(page) << PAGE_SHIFT; + if (addr == destination) { + list_del(&page->lru); + return page; + } + } + page = NULL; + while (1) { + kimage_entry_t *old; + + /* Allocate a page, if we run out of memory give up */ + page = kimage_alloc_pages(gfp_mask, 0); + if (!page) + return 0; + /* If the page cannot be used file it away */ + if (page_to_pfn(page) > + (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { + list_add(&page->lru, &image->unuseable_pages); + continue; + } + addr = page_to_pfn(page) << PAGE_SHIFT; + + /* If it is the destination page we want use it */ + if (addr == destination) + break; + + /* If the page is not a destination page use it */ + if (!kimage_is_destination_range(image, addr, + addr + PAGE_SIZE)) + break; + + /* + * I know that the page is someones destination page. + * See if there is already a source page for this + * destination page. And if so swap the source pages. + */ + old = kimage_dst_used(image, addr); + if (old) { + /* If so move it */ + unsigned long old_addr; + struct page *old_page; + + old_addr = *old & PAGE_MASK; + old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + copy_highpage(page, old_page); + *old = addr | (*old & ~PAGE_MASK); + + /* The old page I have found cannot be a + * destination page, so return it. + */ + addr = old_addr; + page = old_page; + break; + } + else { + /* Place the page on the destination list I + * will use it later. + */ + list_add(&page->lru, &image->dest_pages); + } + } + + return page; +} + +static int kimage_load_normal_segment(struct kimage *image, + struct kexec_segment *segment) +{ + unsigned long maddr; + unsigned long ubytes, mbytes; + int result; + unsigned char *buf; + + result = 0; + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + + result = kimage_set_destination(image, maddr); + if (result < 0) + goto out; + + while (mbytes) { + struct page *page; + char *ptr; + size_t uchunk, mchunk; + + page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); + if (page == 0) { + result = -ENOMEM; + goto out; + } + result = kimage_add_page(image, page_to_pfn(page) + << PAGE_SHIFT); + if (result < 0) + goto out; + + ptr = kmap(page); + /* Start with a clear page */ + memset(ptr, 0, PAGE_SIZE); + ptr += maddr & ~PAGE_MASK; + mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); + if (mchunk > mbytes) + mchunk = mbytes; + + uchunk = mchunk; + if (uchunk > ubytes) + uchunk = ubytes; + + result = copy_from_user(ptr, buf, uchunk); + kunmap(page); + if (result) { + result = (result < 0) ? result : -EIO; + goto out; + } + ubytes -= uchunk; + maddr += mchunk; + buf += mchunk; + mbytes -= mchunk; + } +out: + return result; +} + +static int kimage_load_crash_segment(struct kimage *image, + struct kexec_segment *segment) +{ + /* For crash dumps kernels we simply copy the data from + * user space to it's destination. + * We do things a page at a time for the sake of kmap. + */ + unsigned long maddr; + unsigned long ubytes, mbytes; + int result; + unsigned char *buf; + + result = 0; + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + while (mbytes) { + struct page *page; + char *ptr; + size_t uchunk, mchunk; + + page = pfn_to_page(maddr >> PAGE_SHIFT); + if (page == 0) { + result = -ENOMEM; + goto out; + } + ptr = kmap(page); + ptr += maddr & ~PAGE_MASK; + mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); + if (mchunk > mbytes) + mchunk = mbytes; + + uchunk = mchunk; + if (uchunk > ubytes) { + uchunk = ubytes; + /* Zero the trailing part of the page */ + memset(ptr + uchunk, 0, mchunk - uchunk); + } + result = copy_from_user(ptr, buf, uchunk); + kunmap(page); + if (result) { + result = (result < 0) ? result : -EIO; + goto out; + } + ubytes -= uchunk; + maddr += mchunk; + buf += mchunk; + mbytes -= mchunk; + } +out: + return result; +} + +static int kimage_load_segment(struct kimage *image, + struct kexec_segment *segment) +{ + int result = -ENOMEM; + + switch (image->type) { + case KEXEC_TYPE_DEFAULT: + result = kimage_load_normal_segment(image, segment); + break; + case KEXEC_TYPE_CRASH: + result = kimage_load_crash_segment(image, segment); + break; + } + + return result; +} + +/* + * Exec Kernel system call: for obvious reasons only root may call it. + * + * This call breaks up into three pieces. + * - A generic part which loads the new kernel from the current + * address space, and very carefully places the data in the + * allocated pages. + * + * - A generic part that interacts with the kernel and tells all of + * the devices to shut down. Preventing on-going dmas, and placing + * the devices in a consistent state so a later kernel can + * reinitialize them. + * + * - A machine specific part that includes the syscall number + * and the copies the image to it's final destination. And + * jumps into the image at entry. + * + * kexec does not sync, or unmount filesystems so if you need + * that to happen you need to do that yourself. + */ +struct kimage *kexec_image = NULL; +static struct kimage *kexec_crash_image = NULL; +/* + * A home grown binary mutex. + * Nothing can wait so this mutex is safe to use + * in interrupt context :) + */ +static int kexec_lock = 0; + +asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, + struct kexec_segment __user *segments, + unsigned long flags) +{ + struct kimage **dest_image, *image; + int locked; + int result; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT)) + return -EPERM; + + /* + * Verify we have a legal set of flags + * This leaves us room for future extensions. + */ + if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) + return -EINVAL; + + /* Verify we are on the appropriate architecture */ + if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && + ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) + return -EINVAL; + + /* Put an artificial cap on the number + * of segments passed to kexec_load. + */ + if (nr_segments > KEXEC_SEGMENT_MAX) + return -EINVAL; + + image = NULL; + result = 0; + + /* Because we write directly to the reserved memory + * region when loading crash kernels we need a mutex here to + * prevent multiple crash kernels from attempting to load + * simultaneously, and to prevent a crash kernel from loading + * over the top of a in use crash kernel. + * + * KISS: always take the mutex. + */ + locked = xchg(&kexec_lock, 1); + if (locked) + return -EBUSY; + + dest_image = &kexec_image; + if (flags & KEXEC_ON_CRASH) + dest_image = &kexec_crash_image; + if (nr_segments > 0) { + unsigned long i; + + /* Loading another kernel to reboot into */ + if ((flags & KEXEC_ON_CRASH) == 0) + result = kimage_normal_alloc(&image, entry, + nr_segments, segments); + /* Loading another kernel to switch to if this one crashes */ + else if (flags & KEXEC_ON_CRASH) { + /* Free any current crash dump kernel before + * we corrupt it. + */ + kimage_free(xchg(&kexec_crash_image, NULL)); + result = kimage_crash_alloc(&image, entry, + nr_segments, segments); + } + if (result) + goto out; + + result = machine_kexec_prepare(image); + if (result) + goto out; + + for (i = 0; i < nr_segments; i++) { + result = kimage_load_segment(image, &image->segment[i]); + if (result) + goto out; + } + result = kimage_terminate(image); + if (result) + goto out; + } + /* Install the new kernel, and Uninstall the old */ + image = xchg(dest_image, image); + +out: + xchg(&kexec_lock, 0); /* Release the mutex */ + kimage_free(image); + + return result; +} + +#ifdef CONFIG_COMPAT +asmlinkage long compat_sys_kexec_load(unsigned long entry, + unsigned long nr_segments, + struct compat_kexec_segment __user *segments, + unsigned long flags) +{ + struct compat_kexec_segment in; + struct kexec_segment out, __user *ksegments; + unsigned long i, result; + + /* Don't allow clients that don't understand the native + * architecture to do anything. + */ + if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) + return -EINVAL; + + if (nr_segments > KEXEC_SEGMENT_MAX) + return -EINVAL; + + ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); + for (i=0; i < nr_segments; i++) { + result = copy_from_user(&in, &segments[i], sizeof(in)); + if (result) + return -EFAULT; + + out.buf = compat_ptr(in.buf); + out.bufsz = in.bufsz; + out.mem = in.mem; + out.memsz = in.memsz; + + result = copy_to_user(&ksegments[i], &out, sizeof(out)); + if (result) + return -EFAULT; + } + + return sys_kexec_load(entry, nr_segments, ksegments, flags); +} +#endif + +void crash_kexec(struct pt_regs *regs) +{ + struct kimage *image; + int locked; + + + /* Take the kexec_lock here to prevent sys_kexec_load + * running on one cpu from replacing the crash kernel + * we are using after a panic on a different cpu. + * + * If the crash kernel was not located in a fixed area + * of memory the xchg(&kexec_crash_image) would be + * sufficient. But since I reuse the memory... + */ + locked = xchg(&kexec_lock, 1); + if (!locked) { + image = xchg(&kexec_crash_image, NULL); + if (image) { + machine_crash_shutdown(regs); + machine_kexec(image); + } + xchg(&kexec_lock, 0); + } +} diff --git a/kernel/kmod.c b/kernel/kmod.c index eed53d4..44166e3 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -120,6 +120,7 @@ struct subprocess_info { char *path; char **argv; char **envp; + struct key *ring; int wait; int retval; }; @@ -130,16 +131,21 @@ struct subprocess_info { static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; + struct key *old_session; int retval; - /* Unblock all signals. */ + /* Unblock all signals and set the session keyring. */ + key_get(sub_info->ring); flush_signals(current); spin_lock_irq(¤t->sighand->siglock); + old_session = __install_session_keyring(current, sub_info->ring); flush_signal_handlers(current, 1); sigemptyset(¤t->blocked); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); + key_put(old_session); + /* We can run anywhere, unlike our parent keventd(). */ set_cpus_allowed(current, CPU_MASK_ALL); @@ -211,10 +217,11 @@ static void __call_usermodehelper(void *data) } /** - * call_usermodehelper - start a usermode application + * call_usermodehelper_keys - start a usermode application * @path: pathname for the application * @argv: null-terminated argument list * @envp: null-terminated environment list + * @session_keyring: session keyring for process (NULL for an empty keyring) * @wait: wait for the application to finish and return status. * * Runs a user-space application. The application is started @@ -224,7 +231,8 @@ static void __call_usermodehelper(void *data) * Must be called from process context. Returns a negative error code * if program was not execed successfully, or 0. */ -int call_usermodehelper(char *path, char **argv, char **envp, int wait) +int call_usermodehelper_keys(char *path, char **argv, char **envp, + struct key *session_keyring, int wait) { DECLARE_COMPLETION(done); struct subprocess_info sub_info = { @@ -232,6 +240,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait) .path = path, .argv = argv, .envp = envp, + .ring = session_keyring, .wait = wait, .retval = 0, }; @@ -247,7 +256,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait) wait_for_completion(&done); return sub_info.retval; } -EXPORT_SYMBOL(call_usermodehelper); +EXPORT_SYMBOL(call_usermodehelper_keys); void __init usermodehelper_init(void) { diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 037142b7..334f374 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -27,6 +27,9 @@ * interface to access function arguments. * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes * exceptions notifier to be first on the priority list. + * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston + * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi + * <prasanna@in.ibm.com> added function-return probes. */ #include <linux/kprobes.h> #include <linux/spinlock.h> @@ -41,6 +44,7 @@ #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; +static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; unsigned int kprobe_cpu = NR_CPUS; static DEFINE_SPINLOCK(kprobe_lock); @@ -78,22 +82,23 @@ struct kprobe *get_kprobe(void *addr) * Aggregate handlers for multiple kprobes support - these handlers * take care of invoking the individual kprobe handlers on p->list */ -int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) +static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp; list_for_each_entry(kp, &p->list, list) { if (kp->pre_handler) { curr_kprobe = kp; - kp->pre_handler(kp, regs); - curr_kprobe = NULL; + if (kp->pre_handler(kp, regs)) + return 1; } + curr_kprobe = NULL; } return 0; } -void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) +static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) { struct kprobe *kp; @@ -107,7 +112,8 @@ void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, return; } -int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) +static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, + int trapnr) { /* * if we faulted "during" the execution of a user specified @@ -120,19 +126,191 @@ int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) return 0; } +static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe *kp = curr_kprobe; + if (curr_kprobe && kp->break_handler) { + if (kp->break_handler(kp, regs)) { + curr_kprobe = NULL; + return 1; + } + } + curr_kprobe = NULL; + return 0; +} + +struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler, + .post_handler = trampoline_post_handler +}; + +struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) +{ + struct hlist_node *node; + struct kretprobe_instance *ri; + hlist_for_each_entry(ri, node, &rp->free_instances, uflist) + return ri; + return NULL; +} + +static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) +{ + struct hlist_node *node; + struct kretprobe_instance *ri; + hlist_for_each_entry(ri, node, &rp->used_instances, uflist) + return ri; + return NULL; +} + +struct kretprobe_instance *get_rp_inst(void *sara) +{ + struct hlist_head *head; + struct hlist_node *node; + struct task_struct *tsk; + struct kretprobe_instance *ri; + + tsk = arch_get_kprobe_task(sara); + head = &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; + hlist_for_each_entry(ri, node, head, hlist) { + if (ri->stack_addr == sara) + return ri; + } + return NULL; +} + +void add_rp_inst(struct kretprobe_instance *ri) +{ + struct task_struct *tsk; + /* + * Remove rp inst off the free list - + * Add it back when probed function returns + */ + hlist_del(&ri->uflist); + tsk = arch_get_kprobe_task(ri->stack_addr); + /* Add rp inst onto table */ + INIT_HLIST_NODE(&ri->hlist); + hlist_add_head(&ri->hlist, + &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]); + + /* Also add this rp inst to the used list. */ + INIT_HLIST_NODE(&ri->uflist); + hlist_add_head(&ri->uflist, &ri->rp->used_instances); +} + +void recycle_rp_inst(struct kretprobe_instance *ri) +{ + /* remove rp inst off the rprobe_inst_table */ + hlist_del(&ri->hlist); + if (ri->rp) { + /* remove rp inst off the used list */ + hlist_del(&ri->uflist); + /* put rp inst back onto the free list */ + INIT_HLIST_NODE(&ri->uflist); + hlist_add_head(&ri->uflist, &ri->rp->free_instances); + } else + /* Unregistering */ + kfree(ri); +} + +struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) +{ + return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; +} + +struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk) +{ + struct task_struct *tsk; + struct hlist_head *head; + struct hlist_node *node; + struct kretprobe_instance *ri; + + head = &kretprobe_inst_table[hash_ptr(tk, KPROBE_HASH_BITS)]; + + hlist_for_each_entry(ri, node, head, hlist) { + tsk = arch_get_kprobe_task(ri->stack_addr); + if (tsk == tk) + return ri; + } + return NULL; +} + +/* + * This function is called from do_exit or do_execv when task tk's stack is + * about to be recycled. Recycle any function-return probe instances + * associated with this task. These represent probed functions that have + * been called but may never return. + */ +void kprobe_flush_task(struct task_struct *tk) +{ + unsigned long flags = 0; + spin_lock_irqsave(&kprobe_lock, flags); + arch_kprobe_flush_task(tk); + spin_unlock_irqrestore(&kprobe_lock, flags); +} + +/* + * This kprobe pre_handler is registered with every kretprobe. When probe + * hits it will set up the return probe. + */ +static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe *rp = container_of(p, struct kretprobe, kp); + + /*TODO: consider to only swap the RA after the last pre_handler fired */ + arch_prepare_kretprobe(rp, regs); + return 0; +} + +static inline void free_rp_inst(struct kretprobe *rp) +{ + struct kretprobe_instance *ri; + while ((ri = get_free_rp_inst(rp)) != NULL) { + hlist_del(&ri->uflist); + kfree(ri); + } +} + +/* + * Keep all fields in the kprobe consistent + */ +static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) +{ + memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); + memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); +} + +/* +* Add the new probe to old_p->list. Fail if this is the +* second jprobe at the address - two jprobes can't coexist +*/ +static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p) +{ + struct kprobe *kp; + + if (p->break_handler) { + list_for_each_entry(kp, &old_p->list, list) { + if (kp->break_handler) + return -EEXIST; + } + list_add_tail(&p->list, &old_p->list); + } else + list_add(&p->list, &old_p->list); + return 0; +} + /* * Fill in the required fields of the "manager kprobe". Replace the * earlier kprobe in the hlist with the manager kprobe */ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { + copy_kprobe(p, ap); ap->addr = p->addr; - ap->opcode = p->opcode; - memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn)); - ap->pre_handler = aggr_pre_handler; ap->post_handler = aggr_post_handler; ap->fault_handler = aggr_fault_handler; + ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); list_add(&p->list, &ap->list); @@ -153,16 +331,16 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) int ret = 0; struct kprobe *ap; - if (old_p->break_handler || p->break_handler) { - ret = -EEXIST; /* kprobe and jprobe can't (yet) coexist */ - } else if (old_p->pre_handler == aggr_pre_handler) { - list_add(&p->list, &old_p->list); + if (old_p->pre_handler == aggr_pre_handler) { + copy_kprobe(old_p, p); + ret = add_new_kprobe(old_p, p); } else { ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); if (!ap) return -ENOMEM; add_aggr_kprobe(ap, old_p); - list_add(&p->list, &ap->list); + copy_kprobe(ap, p); + ret = add_new_kprobe(ap, p); } return ret; } @@ -170,10 +348,8 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) /* kprobe removal house-keeping routines */ static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) { - *p->addr = p->opcode; + arch_disarm_kprobe(p); hlist_del(&p->hlist); - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); spin_unlock_irqrestore(&kprobe_lock, flags); arch_remove_kprobe(p); } @@ -200,6 +376,7 @@ int register_kprobe(struct kprobe *p) } spin_lock_irqsave(&kprobe_lock, flags); old_p = get_kprobe(p->addr); + p->nmissed = 0; if (old_p) { ret = register_aggr_kprobe(old_p, p); goto out; @@ -210,10 +387,8 @@ int register_kprobe(struct kprobe *p) hlist_add_head(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - p->opcode = *p->addr; - *p->addr = BREAKPOINT_INSTRUCTION; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + arch_arm_kprobe(p); + out: spin_unlock_irqrestore(&kprobe_lock, flags); rm_kprobe: @@ -257,16 +432,82 @@ void unregister_jprobe(struct jprobe *jp) unregister_kprobe(&jp->kp); } +#ifdef ARCH_SUPPORTS_KRETPROBES + +int register_kretprobe(struct kretprobe *rp) +{ + int ret = 0; + struct kretprobe_instance *inst; + int i; + + rp->kp.pre_handler = pre_handler_kretprobe; + + /* Pre-allocate memory for max kretprobe instances */ + if (rp->maxactive <= 0) { +#ifdef CONFIG_PREEMPT + rp->maxactive = max(10, 2 * NR_CPUS); +#else + rp->maxactive = NR_CPUS; +#endif + } + INIT_HLIST_HEAD(&rp->used_instances); + INIT_HLIST_HEAD(&rp->free_instances); + for (i = 0; i < rp->maxactive; i++) { + inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL); + if (inst == NULL) { + free_rp_inst(rp); + return -ENOMEM; + } + INIT_HLIST_NODE(&inst->uflist); + hlist_add_head(&inst->uflist, &rp->free_instances); + } + + rp->nmissed = 0; + /* Establish function entry probe point */ + if ((ret = register_kprobe(&rp->kp)) != 0) + free_rp_inst(rp); + return ret; +} + +#else /* ARCH_SUPPORTS_KRETPROBES */ + +int register_kretprobe(struct kretprobe *rp) +{ + return -ENOSYS; +} + +#endif /* ARCH_SUPPORTS_KRETPROBES */ + +void unregister_kretprobe(struct kretprobe *rp) +{ + unsigned long flags; + struct kretprobe_instance *ri; + + unregister_kprobe(&rp->kp); + /* No race here */ + spin_lock_irqsave(&kprobe_lock, flags); + free_rp_inst(rp); + while ((ri = get_used_rp_inst(rp)) != NULL) { + ri->rp = NULL; + hlist_del(&ri->uflist); + } + spin_unlock_irqrestore(&kprobe_lock, flags); +} + static int __init init_kprobes(void) { int i, err = 0; /* FIXME allocate the probe table, currently defined statically */ /* initialize all list heads */ - for (i = 0; i < KPROBE_TABLE_SIZE; i++) + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { INIT_HLIST_HEAD(&kprobe_table[i]); + INIT_HLIST_HEAD(&kretprobe_inst_table[i]); + } err = register_die_notifier(&kprobe_exceptions_nb); + /* Register the trampoline probe for return probe */ + register_kprobe(&trampoline_p); return err; } @@ -277,3 +518,6 @@ EXPORT_SYMBOL_GPL(unregister_kprobe); EXPORT_SYMBOL_GPL(register_jprobe); EXPORT_SYMBOL_GPL(unregister_jprobe); EXPORT_SYMBOL_GPL(jprobe_return); +EXPORT_SYMBOL_GPL(register_kretprobe); +EXPORT_SYMBOL_GPL(unregister_kretprobe); + diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 1f064a6..015fb69 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -30,6 +30,16 @@ static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page) KERNEL_ATTR_RO(hotplug_seqnum); #endif +#ifdef CONFIG_KEXEC +#include <asm/kexec.h> + +static ssize_t crash_notes_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%p\n", (void *)crash_notes); +} +KERNEL_ATTR_RO(crash_notes); +#endif + decl_subsys(kernel, NULL, NULL); EXPORT_SYMBOL_GPL(kernel_subsys); @@ -37,6 +47,9 @@ static struct attribute * kernel_attrs[] = { #ifdef CONFIG_HOTPLUG &hotplug_seqnum_attr.attr, #endif +#ifdef CONFIG_KEXEC + &crash_notes_attr.attr, +#endif NULL }; diff --git a/kernel/module.c b/kernel/module.c index 5734ab0..068e271 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -35,6 +35,7 @@ #include <linux/notifier.h> #include <linux/stop_machine.h> #include <linux/device.h> +#include <linux/string.h> #include <asm/uaccess.h> #include <asm/semaphore.h> #include <asm/cacheflush.h> @@ -370,6 +371,43 @@ static inline void percpu_modcopy(void *pcpudst, const void *src, #endif /* CONFIG_SMP */ #ifdef CONFIG_MODULE_UNLOAD +#define MODINFO_ATTR(field) \ +static void setup_modinfo_##field(struct module *mod, const char *s) \ +{ \ + mod->field = kstrdup(s, GFP_KERNEL); \ +} \ +static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ + struct module *mod, char *buffer) \ +{ \ + return sprintf(buffer, "%s\n", mod->field); \ +} \ +static int modinfo_##field##_exists(struct module *mod) \ +{ \ + return mod->field != NULL; \ +} \ +static void free_modinfo_##field(struct module *mod) \ +{ \ + kfree(mod->field); \ + mod->field = NULL; \ +} \ +static struct module_attribute modinfo_##field = { \ + .attr = { .name = __stringify(field), .mode = 0444, \ + .owner = THIS_MODULE }, \ + .show = show_modinfo_##field, \ + .setup = setup_modinfo_##field, \ + .test = modinfo_##field##_exists, \ + .free = free_modinfo_##field, \ +}; + +MODINFO_ATTR(version); +MODINFO_ATTR(srcversion); + +static struct module_attribute *modinfo_attrs[] = { + &modinfo_version, + &modinfo_srcversion, + NULL, +}; + /* Init the unload section of the module. */ static void module_unload_init(struct module *mod) { @@ -379,7 +417,7 @@ static void module_unload_init(struct module *mod) for (i = 0; i < NR_CPUS; i++) local_set(&mod->ref[i].count, 0); /* Hold reference count during initialization. */ - local_set(&mod->ref[_smp_processor_id()].count, 1); + local_set(&mod->ref[raw_smp_processor_id()].count, 1); /* Backwards compatibility macros put refcount during init. */ mod->waiter = current; } @@ -692,7 +730,7 @@ static int obsparm_copy_string(const char *val, struct kernel_param *kp) return 0; } -int set_obsolete(const char *val, struct kernel_param *kp) +static int set_obsolete(const char *val, struct kernel_param *kp) { unsigned int min, max; unsigned int size, maxsize; @@ -1031,6 +1069,32 @@ static void module_remove_refcnt_attr(struct module *mod) } #endif +#ifdef CONFIG_MODULE_UNLOAD +static int module_add_modinfo_attrs(struct module *mod) +{ + struct module_attribute *attr; + int error = 0; + int i; + + for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) { + if (!attr->test || + (attr->test && attr->test(mod))) + error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr); + } + return error; +} + +static void module_remove_modinfo_attrs(struct module *mod) +{ + struct module_attribute *attr; + int i; + + for (i = 0; (attr = modinfo_attrs[i]); i++) { + sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); + attr->free(mod); + } +} +#endif static int mod_sysfs_setup(struct module *mod, struct kernel_param *kparam, @@ -1056,6 +1120,12 @@ static int mod_sysfs_setup(struct module *mod, if (err) goto out_unreg; +#ifdef CONFIG_MODULE_UNLOAD + err = module_add_modinfo_attrs(mod); + if (err) + goto out_unreg; +#endif + return 0; out_unreg: @@ -1066,6 +1136,9 @@ out: static void mod_kobject_remove(struct module *mod) { +#ifdef CONFIG_MODULE_UNLOAD + module_remove_modinfo_attrs(mod); +#endif module_remove_refcnt_attr(mod); module_param_sysfs_remove(mod); @@ -1311,6 +1384,23 @@ static char *get_modinfo(Elf_Shdr *sechdrs, return NULL; } +#ifdef CONFIG_MODULE_UNLOAD +static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, + unsigned int infoindex) +{ + struct module_attribute *attr; + int i; + + for (i = 0; (attr = modinfo_attrs[i]); i++) { + if (attr->setup) + attr->setup(mod, + get_modinfo(sechdrs, + infoindex, + attr->attr.name)); + } +} +#endif + #ifdef CONFIG_KALLSYMS int is_exported(const char *name, const struct module *mod) { @@ -1615,6 +1705,11 @@ static struct module *load_module(void __user *umod, /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); +#ifdef CONFIG_MODULE_UNLOAD + /* Set up MODINFO_ATTR fields */ + setup_modinfo(mod, sechdrs, infoindex); +#endif + /* Fix up syms, so that st_value is a pointer to location. */ err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, mod); @@ -1758,6 +1853,7 @@ sys_init_module(void __user *umod, const char __user *uargs) { struct module *mod; + mm_segment_t old_fs = get_fs(); int ret = 0; /* Must have permission */ @@ -1775,6 +1871,9 @@ sys_init_module(void __user *umod, return PTR_ERR(mod); } + /* flush the icache in correct context */ + set_fs(KERNEL_DS); + /* Flush the instruction cache, since we've played with text */ if (mod->module_init) flush_icache_range((unsigned long)mod->module_init, @@ -1783,6 +1882,8 @@ sys_init_module(void __user *umod, flush_icache_range((unsigned long)mod->module_core, (unsigned long)mod->module_core + mod->core_size); + set_fs(old_fs); + /* Now sew it into the lists. They won't access us, since strong_try_module_get() will fail. */ stop_machine_run(__link_module, mod, NR_CPUS); diff --git a/kernel/panic.c b/kernel/panic.c index 081f746..74ba5f3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -18,6 +18,7 @@ #include <linux/sysrq.h> #include <linux/interrupt.h> #include <linux/nmi.h> +#include <linux/kexec.h> int panic_timeout; int panic_on_oops; @@ -63,6 +64,13 @@ NORET_TYPE void panic(const char * fmt, ...) unsigned long caller = (unsigned long) __builtin_return_address(0); #endif + /* + * It's possible to come here directly from a panic-assertion and not + * have preempt disabled. Some functions called from here want + * preempt to be disabled. No point enabling it later though... + */ + preempt_disable(); + bust_spinlocks(1); va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); @@ -70,7 +78,19 @@ NORET_TYPE void panic(const char * fmt, ...) printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); bust_spinlocks(0); + /* + * If we have crashed and we have a crash kernel loaded let it handle + * everything else. + * Do we want to call this before we try to display a message? + */ + crash_kexec(NULL); + #ifdef CONFIG_SMP + /* + * Note smp_send_stop is the usual smp shutdown function, which + * unfortunately means it may not be hardened to work in a panic + * situation. + */ smp_send_stop(); #endif @@ -79,8 +99,7 @@ NORET_TYPE void panic(const char * fmt, ...) if (!panic_blink) panic_blink = no_blink; - if (panic_timeout > 0) - { + if (panic_timeout > 0) { /* * Delay timeout seconds before rebooting the machine. * We can't use the "normal" timers since we just panicked.. diff --git a/kernel/params.c b/kernel/params.c index 5513844..d586c35ef 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -629,7 +629,7 @@ static ssize_t module_attr_show(struct kobject *kobj, mk = to_module_kobject(kobj); if (!attribute->show) - return -EPERM; + return -EIO; if (!try_module_get(mk->mod)) return -ENODEV; @@ -653,7 +653,7 @@ static ssize_t module_attr_store(struct kobject *kobj, mk = to_module_kobject(kobj); if (!attribute->store) - return -EPERM; + return -EIO; if (!try_module_get(mk->mod)) return -ENODEV; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index fd316c2..5b7b473 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -89,23 +89,6 @@ static struct idr posix_timers_id; static DEFINE_SPINLOCK(idr_lock); /* - * Just because the timer is not in the timer list does NOT mean it is - * inactive. It could be in the "fire" routine getting a new expire time. - */ -#define TIMER_INACTIVE 1 - -#ifdef CONFIG_SMP -# define timer_active(tmr) \ - ((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE) -# define set_timer_inactive(tmr) \ - do { \ - (tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \ - } while (0) -#else -# define timer_active(tmr) BARFY // error to use outside of SMP -# define set_timer_inactive(tmr) do { } while (0) -#endif -/* * we assume that the new SIGEV_THREAD_ID shares no bits with the other * SIGEV values. Here we put out an error if this assumption fails. */ @@ -226,7 +209,6 @@ static inline int common_timer_create(struct k_itimer *new_timer) init_timer(&new_timer->it.real.timer); new_timer->it.real.timer.data = (unsigned long) new_timer; new_timer->it.real.timer.function = posix_timer_fn; - set_timer_inactive(new_timer); return 0; } @@ -480,7 +462,6 @@ static void posix_timer_fn(unsigned long __data) int do_notify = 1; spin_lock_irqsave(&timr->it_lock, flags); - set_timer_inactive(timr); if (!list_empty(&timr->it.real.abs_timer_entry)) { spin_lock(&abs_list.lock); do { @@ -983,8 +964,8 @@ common_timer_set(struct k_itimer *timr, int flags, * careful here. If smp we could be in the "fire" routine which will * be spinning as we hold the lock. But this is ONLY an SMP issue. */ + if (try_to_del_timer_sync(&timr->it.real.timer) < 0) { #ifdef CONFIG_SMP - if (timer_active(timr) && !del_timer(&timr->it.real.timer)) /* * It can only be active if on an other cpu. Since * we have cleared the interval stuff above, it should @@ -994,11 +975,9 @@ common_timer_set(struct k_itimer *timr, int flags, * a "retry" exit status. */ return TIMER_RETRY; - - set_timer_inactive(timr); -#else - del_timer(&timr->it.real.timer); #endif + } + remove_from_abslist(timr); timr->it_requeue_pending = (timr->it_requeue_pending + 2) & @@ -1083,8 +1062,9 @@ retry: static inline int common_timer_del(struct k_itimer *timer) { timer->it.real.incr = 0; + + if (try_to_del_timer_sync(&timer->it.real.timer) < 0) { #ifdef CONFIG_SMP - if (timer_active(timer) && !del_timer(&timer->it.real.timer)) /* * It can only be active if on an other cpu. Since * we have cleared the interval stuff above, it should @@ -1094,9 +1074,9 @@ static inline int common_timer_del(struct k_itimer *timer) * a "retry" exit status. */ return TIMER_RETRY; -#else - del_timer(&timer->it.real.timer); #endif + } + remove_from_abslist(timer); return 0; @@ -1197,6 +1177,7 @@ void exit_itimers(struct signal_struct *sig) tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); itimer_delete(tmr); } + del_timer_sync(&sig->real_timer); } /* diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 696387f..2c7121d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -27,8 +27,8 @@ config PM_DEBUG like suspend support. config SOFTWARE_SUSPEND - bool "Software Suspend (EXPERIMENTAL)" - depends on EXPERIMENTAL && PM && SWAP + bool "Software Suspend" + depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP)) ---help--- Enable the possibility of suspending the machine. It doesn't need APM. @@ -72,3 +72,7 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. +config SUSPEND_SMP + bool + depends on HOTPLUG_CPU && X86 && PM + default y diff --git a/kernel/power/Makefile b/kernel/power/Makefile index fbdc634..2f438d0 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -3,9 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y) EXTRA_CFLAGS += -DDEBUG endif -swsusp-smp-$(CONFIG_SMP) += smp.o - obj-y := main.o process.o console.o pm.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o $(swsusp-smp-y) disk.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o + +obj-$(CONFIG_SUSPEND_SMP) += smp.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 02b6764..fb8de63 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -117,8 +117,8 @@ static void finish(void) { device_resume(); platform_finish(); - enable_nonboot_cpus(); thaw_processes(); + enable_nonboot_cpus(); pm_restore_console(); } @@ -131,28 +131,35 @@ static int prepare_processes(void) sys_sync(); + disable_nonboot_cpus(); + if (freeze_processes()) { error = -EBUSY; - return error; + goto thaw; } if (pm_disk_mode == PM_DISK_PLATFORM) { if (pm_ops && pm_ops->prepare) { if ((error = pm_ops->prepare(PM_SUSPEND_DISK))) - return error; + goto thaw; } } /* Free memory before shutting down devices. */ free_some_memory(); - return 0; +thaw: + thaw_processes(); + enable_nonboot_cpus(); + pm_restore_console(); + return error; } static void unprepare_processes(void) { - enable_nonboot_cpus(); + platform_finish(); thaw_processes(); + enable_nonboot_cpus(); pm_restore_console(); } @@ -160,15 +167,9 @@ static int prepare_devices(void) { int error; - disable_nonboot_cpus(); - if ((error = device_suspend(PMSG_FREEZE))) { + if ((error = device_suspend(PMSG_FREEZE))) printk("Some devices failed to suspend\n"); - platform_finish(); - enable_nonboot_cpus(); - return error; - } - - return 0; + return error; } /** @@ -185,9 +186,9 @@ int pm_suspend_disk(void) int error; error = prepare_processes(); - if (!error) { - error = prepare_devices(); - } + if (error) + return error; + error = prepare_devices(); if (error) { unprepare_processes(); @@ -250,7 +251,7 @@ static int software_resume(void) if ((error = prepare_processes())) { swsusp_close(); - goto Cleanup; + goto Done; } pr_debug("PM: Reading swsusp image.\n"); diff --git a/kernel/power/main.c b/kernel/power/main.c index 4cdebc9..c94cb9e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -55,6 +55,13 @@ static int suspend_prepare(suspend_state_t state) pm_prepare_console(); + disable_nonboot_cpus(); + + if (num_online_cpus() != 1) { + error = -EPERM; + goto Enable_cpu; + } + if (freeze_processes()) { error = -EAGAIN; goto Thaw; @@ -75,6 +82,8 @@ static int suspend_prepare(suspend_state_t state) pm_ops->finish(state); Thaw: thaw_processes(); + Enable_cpu: + enable_nonboot_cpus(); pm_restore_console(); return error; } @@ -113,6 +122,7 @@ static void suspend_finish(suspend_state_t state) if (pm_ops && pm_ops->finish) pm_ops->finish(state); thaw_processes(); + enable_nonboot_cpus(); pm_restore_console(); } @@ -150,12 +160,6 @@ static int enter_state(suspend_state_t state) goto Unlock; } - /* Suspend is hard to get right on SMP. */ - if (num_online_cpus() != 1) { - error = -EPERM; - goto Unlock; - } - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); if ((error = suspend_prepare(state))) goto Unlock; diff --git a/kernel/power/process.c b/kernel/power/process.c index 78d92dc..0a08664 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -32,7 +32,7 @@ static inline int freezeable(struct task_struct * p) } /* Refrigerator is place where frozen processes are stored :-). */ -void refrigerator(unsigned long flag) +void refrigerator(void) { /* Hmm, should we be allowed to suspend when there are realtime processes around? */ @@ -41,14 +41,13 @@ void refrigerator(unsigned long flag) current->state = TASK_UNINTERRUPTIBLE; pr_debug("%s entered refrigerator\n", current->comm); printk("="); - current->flags &= ~PF_FREEZE; + frozen_process(current); spin_lock_irq(¤t->sighand->siglock); recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(¤t->sighand->siglock); - current->flags |= PF_FROZEN; - while (current->flags & PF_FROZEN) + while (frozen(current)) schedule(); pr_debug("%s left refrigerator\n", current->comm); current->state = save; @@ -57,10 +56,10 @@ void refrigerator(unsigned long flag) /* 0 = success, else # of processes that we failed to stop */ int freeze_processes(void) { - int todo; - unsigned long start_time; + int todo; + unsigned long start_time; struct task_struct *g, *p; - + printk( "Stopping tasks: " ); start_time = jiffies; do { @@ -70,14 +69,12 @@ int freeze_processes(void) unsigned long flags; if (!freezeable(p)) continue; - if ((p->flags & PF_FROZEN) || + if ((frozen(p)) || (p->state == TASK_TRACED) || (p->state == TASK_STOPPED)) continue; - /* FIXME: smp problem here: we may not access other process' flags - without locking */ - p->flags |= PF_FREEZE; + freeze(p); spin_lock_irqsave(&p->sighand->siglock, flags); signal_wake_up(p, 0); spin_unlock_irqrestore(&p->sighand->siglock, flags); @@ -91,7 +88,7 @@ int freeze_processes(void) return todo; } } while(todo); - + printk( "|\n" ); BUG_ON(in_atomic()); return 0; @@ -106,10 +103,7 @@ void thaw_processes(void) do_each_thread(g, p) { if (!freezeable(p)) continue; - if (p->flags & PF_FROZEN) { - p->flags &= ~PF_FROZEN; - wake_up_process(p); - } else + if (!thaw_process(p)) printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); } while_each_thread(g, p); diff --git a/kernel/power/smp.c b/kernel/power/smp.c index cba3584b..bbe2307 100644 --- a/kernel/power/smp.c +++ b/kernel/power/smp.c @@ -13,73 +13,52 @@ #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/module.h> +#include <linux/cpu.h> #include <asm/atomic.h> #include <asm/tlbflush.h> -static atomic_t cpu_counter, freeze; - - -static void smp_pause(void * data) -{ - struct saved_context ctxt; - __save_processor_state(&ctxt); - printk("Sleeping in:\n"); - dump_stack(); - atomic_inc(&cpu_counter); - while (atomic_read(&freeze)) { - /* FIXME: restore takes place at random piece inside this. - This should probably be written in assembly, and - preserve general-purpose registers, too - - What about stack? We may need to move to new stack here. - - This should better be ran with interrupts disabled. - */ - cpu_relax(); - barrier(); - } - atomic_dec(&cpu_counter); - __restore_processor_state(&ctxt); -} - -static cpumask_t oldmask; +/* This is protected by pm_sem semaphore */ +static cpumask_t frozen_cpus; void disable_nonboot_cpus(void) { - oldmask = current->cpus_allowed; - set_cpus_allowed(current, cpumask_of_cpu(0)); - printk("Freezing CPUs (at %d)", _smp_processor_id()); - current->state = TASK_INTERRUPTIBLE; - schedule_timeout(HZ); - printk("..."); - BUG_ON(_smp_processor_id() != 0); - - /* FIXME: for this to work, all the CPUs must be running - * "idle" thread (or we deadlock). Is that guaranteed? */ + int cpu, error; - atomic_set(&cpu_counter, 0); - atomic_set(&freeze, 1); - smp_call_function(smp_pause, NULL, 0, 0); - while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) { - cpu_relax(); - barrier(); + error = 0; + cpus_clear(frozen_cpus); + printk("Freezing cpus ...\n"); + for_each_online_cpu(cpu) { + if (cpu == 0) + continue; + error = cpu_down(cpu); + if (!error) { + cpu_set(cpu, frozen_cpus); + printk("CPU%d is down\n", cpu); + continue; + } + printk("Error taking cpu %d down: %d\n", cpu, error); } - printk("ok\n"); + BUG_ON(smp_processor_id() != 0); + if (error) + panic("cpus not sleeping"); } void enable_nonboot_cpus(void) { - printk("Restarting CPUs"); - atomic_set(&freeze, 0); - while (atomic_read(&cpu_counter)) { - cpu_relax(); - barrier(); - } - printk("..."); - set_cpus_allowed(current, oldmask); - schedule(); - printk("ok\n"); + int cpu, error; + printk("Thawing cpus ...\n"); + for_each_cpu_mask(cpu, frozen_cpus) { + error = smp_prepare_cpu(cpu); + if (!error) + error = cpu_up(cpu); + if (!error) { + printk("CPU%d is up\n", cpu); + continue; + } + printk("Error taking cpu %d up: %d\n", cpu, error); + panic("Not enough cpus"); + } + cpus_clear(frozen_cpus); } - diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 90b3b68..c285fc5 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -10,12 +10,12 @@ * This file is released under the GPLv2. * * I'd like to thank the following people for their work: - * + * * Pavel Machek <pavel@ucw.cz>: * Modifications, defectiveness pointing, being with me at the very beginning, * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. * - * Steve Doddi <dirk@loth.demon.co.uk>: + * Steve Doddi <dirk@loth.demon.co.uk>: * Support the possibility of hardware state restoring. * * Raph <grey.havens@earthling.net>: @@ -81,14 +81,14 @@ static int nr_copy_pages_check; extern char resume_file[]; /* Local variables that should not be affected by save */ -unsigned int nr_copy_pages __nosavedata = 0; +static unsigned int nr_copy_pages __nosavedata = 0; /* Suspend pagedir is allocated before final copy, therefore it - must be freed after resume + must be freed after resume Warning: this is evil. There are actually two pagedirs at time of resume. One is "pagedir_save", which is empty frame allocated at - time of suspend, that must be freed. Second is "pagedir_nosave", + time of suspend, that must be freed. Second is "pagedir_nosave", allocated at time of resume, that travels through memory not to collide with anything. @@ -132,7 +132,7 @@ static int mark_swapfiles(swp_entry_t prev) { int error; - rw_swap_page_sync(READ, + rw_swap_page_sync(READ, swp_entry(root_swap, 0), virt_to_page((unsigned long)&swsusp_header)); if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || @@ -140,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev) memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); memcpy(swsusp_header.sig,SWSUSP_SIG, 10); swsusp_header.swsusp_info = prev; - error = rw_swap_page_sync(WRITE, + error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), virt_to_page((unsigned long) &swsusp_header)); @@ -174,22 +174,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info) static int swsusp_swap_check(void) /* This is called before saving image */ { int i, len; - + len=strlen(resume_file); root_swap = 0xFFFF; - + swap_list_lock(); - for(i=0; i<MAX_SWAPFILES; i++) { + for (i=0; i<MAX_SWAPFILES; i++) { if (swap_info[i].flags == 0) { swapfile_used[i]=SWAPFILE_UNUSED; } else { - if(!len) { + if (!len) { printk(KERN_WARNING "resume= option should be used to set suspend device" ); - if(root_swap == 0xFFFF) { + if (root_swap == 0xFFFF) { swapfile_used[i] = SWAPFILE_SUSPEND; root_swap = i; } else - swapfile_used[i] = SWAPFILE_IGNORED; + swapfile_used[i] = SWAPFILE_IGNORED; } else { /* we ignore all swap devices that are not the resume_file */ if (is_resume_device(&swap_info[i])) { @@ -209,15 +209,15 @@ static int swsusp_swap_check(void) /* This is called before saving image */ * This is called after saving image so modification * will be lost after resume... and that's what we want. * we make the device unusable. A new call to - * lock_swapdevices can unlock the devices. + * lock_swapdevices can unlock the devices. */ static void lock_swapdevices(void) { int i; swap_list_lock(); - for(i = 0; i< MAX_SWAPFILES; i++) - if(swapfile_used[i] == SWAPFILE_IGNORED) { + for (i = 0; i< MAX_SWAPFILES; i++) + if (swapfile_used[i] == SWAPFILE_IGNORED) { swap_info[i].flags ^= 0xFF; } swap_list_unlock(); @@ -229,7 +229,7 @@ static void lock_swapdevices(void) * @loc: Place to store the entry we used. * * Allocate a new swap entry and 'sync' it. Note we discard -EIO - * errors. That is an artifact left over from swsusp. It did not + * errors. That is an artifact left over from swsusp. It did not * check the return of rw_swap_page_sync() at all, since most pages * written back to swap would return -EIO. * This is a partial improvement, since we will at least return other @@ -241,7 +241,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc) int error = 0; entry = get_swap_page(); - if (swp_offset(entry) && + if (swp_offset(entry) && swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr)); @@ -257,7 +257,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc) /** * data_free - Free the swap entries used by the saved image. * - * Walk the list of used swap entries and free each one. + * Walk the list of used swap entries and free each one. * This is only used for cleanup when suspend fails. */ static void data_free(void) @@ -290,7 +290,7 @@ static int data_write(void) mod = 1; printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); - for_each_pbe(p, pagedir_nosave) { + for_each_pbe (p, pagedir_nosave) { if (!(i%mod)) printk( "\b\b\b\b%3d%%", i / mod ); if ((error = write_page(p->address, &(p->swap_address)))) @@ -335,7 +335,7 @@ static int close_swap(void) dump_info(); error = write_page((unsigned long)&swsusp_info, &entry); - if (!error) { + if (!error) { printk( "S" ); error = mark_swapfiles(entry); printk( "|\n" ); @@ -370,7 +370,7 @@ static int write_pagedir(void) struct pbe * pbe; printk( "Writing pagedir..."); - for_each_pb_page(pbe, pagedir_nosave) { + for_each_pb_page (pbe, pagedir_nosave) { if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) return error; } @@ -472,7 +472,7 @@ static int save_highmem(void) int res = 0; pr_debug("swsusp: Saving Highmem\n"); - for_each_zone(zone) { + for_each_zone (zone) { if (is_highmem(zone)) res = save_highmem_zone(zone); if (res) @@ -547,7 +547,7 @@ static void count_data_pages(void) nr_copy_pages = 0; - for_each_zone(zone) { + for_each_zone (zone) { if (is_highmem(zone)) continue; mark_free_pages(zone); @@ -562,9 +562,9 @@ static void copy_data_pages(void) struct zone *zone; unsigned long zone_pfn; struct pbe * pbe = pagedir_nosave; - + pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); - for_each_zone(zone) { + for_each_zone (zone) { if (is_highmem(zone)) continue; mark_free_pages(zone); @@ -702,7 +702,7 @@ static void free_image_pages(void) { struct pbe * p; - for_each_pbe(p, pagedir_save) { + for_each_pbe (p, pagedir_save) { if (p->address) { ClearPageNosave(virt_to_page(p->address)); free_page(p->address); @@ -719,7 +719,7 @@ static int alloc_image_pages(void) { struct pbe * p; - for_each_pbe(p, pagedir_save) { + for_each_pbe (p, pagedir_save) { p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); if (!p->address) return -ENOMEM; @@ -740,7 +740,7 @@ void swsusp_free(void) /** * enough_free_mem - Make sure we enough free memory to snapshot. * - * Returns TRUE or FALSE after checking the number of available + * Returns TRUE or FALSE after checking the number of available * free pages. */ @@ -758,11 +758,11 @@ static int enough_free_mem(void) /** * enough_swap - Make sure we have enough swap to save the image. * - * Returns TRUE or FALSE after checking the total amount of swap + * Returns TRUE or FALSE after checking the total amount of swap * space avaiable. * * FIXME: si_swapinfo(&i) returns all swap devices information. - * We should only consider resume_device. + * We should only consider resume_device. */ static int enough_swap(void) @@ -781,18 +781,18 @@ static int swsusp_alloc(void) { int error; + pagedir_nosave = NULL; + nr_copy_pages = calc_nr(nr_copy_pages); + pr_debug("suspend: (pages needed: %d + %d free: %d)\n", nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); - pagedir_nosave = NULL; if (!enough_free_mem()) return -ENOMEM; if (!enough_swap()) return -ENOSPC; - nr_copy_pages = calc_nr(nr_copy_pages); - if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); return -ENOMEM; @@ -827,8 +827,8 @@ static int suspend_prepare_image(void) error = swsusp_alloc(); if (error) return error; - - /* During allocating of suspend pagedir, new cold pages may appear. + + /* During allocating of suspend pagedir, new cold pages may appear. * Kill them. */ drain_local_pages(); @@ -929,21 +929,6 @@ int swsusp_resume(void) return error; } -/* More restore stuff */ - -/* - * Returns true if given address/order collides with any orig_address - */ -static int does_collide_order(unsigned long addr, int order) -{ - int i; - - for (i=0; i < (1<<order); i++) - if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE))) - return 1; - return 0; -} - /** * On resume, for storing the PBE list and the image, * we can only use memory pages that do not conflict with the pages @@ -973,7 +958,7 @@ static unsigned long get_usable_page(unsigned gfp_mask) unsigned long m; m = get_zeroed_page(gfp_mask); - while (does_collide_order(m, 0)) { + while (!PageNosaveFree(virt_to_page(m))) { eat_page((void *)m); m = get_zeroed_page(gfp_mask); if (!m) @@ -1045,7 +1030,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) /* Set page flags */ - for_each_zone(zone) { + for_each_zone (zone) { for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) SetPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); @@ -1061,7 +1046,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) /* Relocate colliding pages */ for_each_pb_page (pbpage, pblist) { - if (does_collide_order((unsigned long)pbpage, 0)) { + if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) { m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); if (!m) { error = -ENOMEM; @@ -1193,8 +1178,10 @@ static const char * sanity_check(void) return "version"; if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) return "machine"; +#if 0 if(swsusp_info.cpus != num_online_cpus()) return "number of cpus"; +#endif return NULL; } diff --git a/kernel/printk.c b/kernel/printk.c index 01b58d7..5092397 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -588,8 +588,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) log_level_unknown = 1; } - if (!cpu_online(smp_processor_id()) && - system_state != SYSTEM_RUNNING) { + if (!cpu_online(smp_processor_id())) { /* * Some console drivers may assume that per-cpu resources have * been allocated. So don't allow them to be called by this @@ -876,8 +875,10 @@ void register_console(struct console * console) break; console->flags |= CON_ENABLED; console->index = console_cmdline[i].index; - if (i == preferred_console) + if (i == selected_console) { console->flags |= CON_CONSDEV; + preferred_console = selected_console; + } break; } @@ -897,6 +898,8 @@ void register_console(struct console * console) if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { console->next = console_drivers; console_drivers = console; + if (console->next) + console->next->flags &= ~CON_CONSDEV; } else { console->next = console_drivers->next; console_drivers->next = console; @@ -937,10 +940,14 @@ int unregister_console(struct console * console) /* If last console is removed, we re-enable picking the first * one that gets registered. Without that, pmac early boot console * would prevent fbcon from taking over. + * + * If this isn't the last console and it has CON_CONSDEV set, we + * need to set it on the next preferred console. */ if (console_drivers == NULL) preferred_console = selected_console; - + else if (console->flags & CON_CONSDEV) + console_drivers->flags |= CON_CONSDEV; release_console_sem(); return res; diff --git a/kernel/resource.c b/kernel/resource.c index 52f696f..26967e0 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -263,7 +263,7 @@ static int find_resource(struct resource *root, struct resource *new, new->start = min; if (new->end > max) new->end = max; - new->start = (new->start + align - 1) & ~(align - 1); + new->start = ALIGN(new->start, align); if (alignf) alignf(alignf_data, new, size, align); if (new->start < new->end && new->end - new->start >= size - 1) { diff --git a/kernel/sched.c b/kernel/sched.c index 66b2ed7..a07cff9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -166,7 +166,7 @@ #define SCALE_PRIO(x, prio) \ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) -static inline unsigned int task_timeslice(task_t *p) +static unsigned int task_timeslice(task_t *p) { if (p->static_prio < NICE_TO_PRIO(0)) return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); @@ -206,7 +206,7 @@ struct runqueue { */ unsigned long nr_running; #ifdef CONFIG_SMP - unsigned long cpu_load; + unsigned long cpu_load[3]; #endif unsigned long long nr_switches; @@ -260,23 +260,87 @@ struct runqueue { static DEFINE_PER_CPU(struct runqueue, runqueues); +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ #define for_each_domain(cpu, domain) \ - for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) +for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -/* - * Default context-switch locking: - */ #ifndef prepare_arch_switch -# define prepare_arch_switch(rq, next) do { } while (0) -# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -# define task_running(rq, p) ((rq)->curr == (p)) +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) #endif +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline int task_running(runqueue_t *rq, task_t *p) +{ + return rq->curr == p; +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ + spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline int task_running(runqueue_t *rq, task_t *p) +{ +#ifdef CONFIG_SMP + return p->oncpu; +#else + return rq->curr == p; +#endif +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->oncpu = 1; +#endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + spin_unlock_irq(&rq->lock); +#else + spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->oncpu = 0; +#endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ + /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without @@ -309,7 +373,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) * bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 11 +#define SCHEDSTAT_VERSION 12 static int show_schedstat(struct seq_file *seq, void *v) { @@ -338,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v) #ifdef CONFIG_SMP /* domain-specific stats */ + preempt_disable(); for_each_domain(cpu, sd) { enum idle_type itype; char mask_str[NR_CPUS]; @@ -356,11 +421,13 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->lb_nobusyq[itype], sd->lb_nobusyg[itype]); } - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", sd->alb_cnt, sd->alb_failed, sd->alb_pushed, - sd->sbe_pushed, sd->sbe_attempts, + sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); } + preempt_enable(); #endif } return 0; @@ -414,22 +481,6 @@ static inline runqueue_t *this_rq_lock(void) return rq; } -#ifdef CONFIG_SCHED_SMT -static int cpu_and_siblings_are_idle(int cpu) -{ - int sib; - for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { - if (idle_cpu(sib)) - continue; - return 0; - } - - return 1; -} -#else -#define cpu_and_siblings_are_idle(A) idle_cpu(A) -#endif - #ifdef CONFIG_SCHEDSTATS /* * Called when a process is dequeued from the active array and given @@ -622,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) rq->nr_running++; } -static void recalc_task_prio(task_t *p, unsigned long long now) +static int recalc_task_prio(task_t *p, unsigned long long now) { /* Caller must always ensure 'now >= p->timestamp' */ unsigned long long __sleep_time = now - p->timestamp; @@ -681,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now) } } - p->prio = effective_prio(p); + return effective_prio(p); } /* @@ -704,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) } #endif - recalc_task_prio(p, now); + p->prio = recalc_task_prio(p, now); /* * This checks to make sure it's not an uninterruptible task @@ -782,22 +833,12 @@ inline int task_curr(const task_t *p) } #ifdef CONFIG_SMP -enum request_type { - REQ_MOVE_TASK, - REQ_SET_DOMAIN, -}; - typedef struct { struct list_head list; - enum request_type type; - /* For REQ_MOVE_TASK */ task_t *task; int dest_cpu; - /* For REQ_SET_DOMAIN */ - struct sched_domain *sd; - struct completion done; } migration_req_t; @@ -819,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) } init_completion(&req->done); - req->type = REQ_MOVE_TASK; req->task = p; req->dest_cpu = dest_cpu; list_add(&req->list, &rq->migration_queue); @@ -886,26 +926,154 @@ void kick_process(task_t *p) * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static inline unsigned long source_load(int cpu) +static inline unsigned long source_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) + return load_now; - return min(rq->cpu_load, load_now); + return min(rq->cpu_load[type-1], load_now); } /* * Return a high guess at the load of a migration-target cpu */ -static inline unsigned long target_load(int cpu) +static inline unsigned long target_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) + return load_now; - return max(rq->cpu_load, load_now); + return max(rq->cpu_load[type-1], load_now); } -#endif +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +{ + struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + unsigned long min_load = ULONG_MAX, this_load = 0; + int load_idx = sd->forkexec_idx; + int imbalance = 100 + (sd->imbalance_pct-100)/2; + + do { + unsigned long load, avg_load; + int local_group; + int i; + + local_group = cpu_isset(this_cpu, group->cpumask); + /* XXX: put a cpus allowed check */ + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu_mask(i, group->cpumask) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = source_load(i, load_idx); + else + load = target_load(i, load_idx); + + avg_load += load; + } + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + } else if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + group = group->next; + } while (group != sd->groups); + + if (!idlest || 100*this_load < imbalance*min_load) + return NULL; + return idlest; +} + +/* + * find_idlest_queue - find the idlest runqueue among the cpus in group. + */ +static int find_idlest_cpu(struct sched_group *group, int this_cpu) +{ + unsigned long load, min_load = ULONG_MAX; + int idlest = -1; + int i; + + for_each_cpu_mask(i, group->cpumask) { + load = source_load(i, 0); + + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + idlest = i; + } + } + + return idlest; +} + +/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. + * + * Balance, ie. select the least loaded group. + * + * Returns the target CPU number, or the same CPU if no balancing is needed. + * + * preempt must be disabled. + */ +static int sched_balance_self(int cpu, int flag) +{ + struct task_struct *t = current; + struct sched_domain *tmp, *sd = NULL; + + for_each_domain(cpu, tmp) + if (tmp->flags & flag) + sd = tmp; + + while (sd) { + cpumask_t span; + struct sched_group *group; + int new_cpu; + int weight; + + span = sd->span; + group = find_idlest_group(sd, t, cpu); + if (!group) + goto nextlevel; + + new_cpu = find_idlest_cpu(group, cpu); + if (new_cpu == -1 || new_cpu == cpu) + goto nextlevel; + + /* Now try balancing at a lower domain level */ + cpu = new_cpu; +nextlevel: + sd = NULL; + weight = cpus_weight(span); + for_each_domain(cpu, tmp) { + if (weight <= cpus_weight(tmp->span)) + break; + if (tmp->flags & flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + return cpu; +} + +#endif /* CONFIG_SMP */ /* * wake_idle() will wake a task on an idle cpu if task->cpu is @@ -927,14 +1095,14 @@ static int wake_idle(int cpu, task_t *p) for_each_domain(cpu, sd) { if (sd->flags & SD_WAKE_IDLE) { - cpus_and(tmp, sd->span, cpu_online_map); - cpus_and(tmp, tmp, p->cpus_allowed); + cpus_and(tmp, sd->span, p->cpus_allowed); for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) return i; } } - else break; + else + break; } return cpu; } @@ -967,7 +1135,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) runqueue_t *rq; #ifdef CONFIG_SMP unsigned long load, this_load; - struct sched_domain *sd; + struct sched_domain *sd, *this_sd = NULL; int new_cpu; #endif @@ -986,70 +1154,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) if (unlikely(task_running(rq, p))) goto out_activate; -#ifdef CONFIG_SCHEDSTATS + new_cpu = cpu; + schedstat_inc(rq, ttwu_cnt); if (cpu == this_cpu) { schedstat_inc(rq, ttwu_local); - } else { - for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } + goto out_set_cpu; + } + + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_wake_remote); + this_sd = sd; + break; } } -#endif - new_cpu = cpu; - if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) goto out_set_cpu; - load = source_load(cpu); - this_load = target_load(this_cpu); - /* - * If sync wakeup then subtract the (maximum possible) effect of - * the currently running task from the load of the current CPU: + * Check for affine wakeup and passive balancing possibilities. */ - if (sync) - this_load -= SCHED_LOAD_SCALE; + if (this_sd) { + int idx = this_sd->wake_idx; + unsigned int imbalance; - /* Don't pull the task off an idle CPU to a busy one */ - if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) - goto out_set_cpu; + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - new_cpu = this_cpu; /* Wake to this CPU if we can */ + load = source_load(cpu, idx); + this_load = target_load(this_cpu, idx); - /* - * Scan domains for affine wakeup and passive balancing - * possibilities. - */ - for_each_domain(this_cpu, sd) { - unsigned int imbalance; - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; + new_cpu = this_cpu; /* Wake to this CPU if we can */ - if ((sd->flags & SD_WAKE_AFFINE) && - !task_hot(p, rq->timestamp_last_tick, sd)) { + if (this_sd->flags & SD_WAKE_AFFINE) { + unsigned long tl = this_load; /* - * This domain has SD_WAKE_AFFINE and p is cache cold - * in this domain. + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: */ - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_move_affine); + if (sync) + tl -= SCHED_LOAD_SCALE; + + if ((tl <= load && + tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || + 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. + */ + schedstat_inc(this_sd, ttwu_move_affine); goto out_set_cpu; } - } else if ((sd->flags & SD_WAKE_BALANCE) && - imbalance*this_load <= 100*load) { - /* - * This domain has SD_WAKE_BALANCE and there is - * an imbalance. - */ - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_move_balance); + } + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + if (this_sd->flags & SD_WAKE_BALANCE) { + if (imbalance*this_load <= 100*load) { + schedstat_inc(this_sd, ttwu_move_balance); goto out_set_cpu; } } @@ -1120,17 +1287,19 @@ int fastcall wake_up_state(task_t *p, unsigned int state) return try_to_wake_up(p, state, 0); } -#ifdef CONFIG_SMP -static int find_idlest_cpu(struct task_struct *p, int this_cpu, - struct sched_domain *sd); -#endif - /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. */ -void fastcall sched_fork(task_t *p) +void fastcall sched_fork(task_t *p, int clone_flags) { + int cpu = get_cpu(); + +#ifdef CONFIG_SMP + cpu = sched_balance_self(cpu, SD_BALANCE_FORK); +#endif + set_task_cpu(p, cpu); + /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. This guarantees that @@ -1140,17 +1309,14 @@ void fastcall sched_fork(task_t *p) p->state = TASK_RUNNING; INIT_LIST_HEAD(&p->run_list); p->array = NULL; - spin_lock_init(&p->switch_lock); #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + p->oncpu = 0; +#endif #ifdef CONFIG_PREEMPT - /* - * During context-switch we hold precisely one spinlock, which - * schedule_tail drops. (in the common case it's this_rq()->lock, - * but it also can be p->switch_lock.) So we compensate with a count - * of 1. Also, we want to start with kernel preemption disabled. - */ + /* Want to start with kernel preemption disabled. */ p->thread_info->preempt_count = 1; #endif /* @@ -1174,12 +1340,10 @@ void fastcall sched_fork(task_t *p) * runqueue lock is not a problem. */ current->time_slice = 1; - preempt_disable(); scheduler_tick(); - local_irq_enable(); - preempt_enable(); - } else - local_irq_enable(); + } + local_irq_enable(); + put_cpu(); } /* @@ -1196,10 +1360,9 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) runqueue_t *rq, *this_rq; rq = task_rq_lock(p, &flags); - cpu = task_cpu(p); - this_cpu = smp_processor_id(); - BUG_ON(p->state != TASK_RUNNING); + this_cpu = smp_processor_id(); + cpu = task_cpu(p); /* * We decrease the sleep average of forking parents @@ -1296,22 +1459,40 @@ void fastcall sched_exit(task_t * p) } /** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch + * @next: the task we are going to switch to. + * + * This is called with the rq lock held and interrupts off. It must + * be paired with a subsequent finish_task_switch after the context + * switch. + * + * prepare_task_switch sets up locking and calls architecture specific + * hooks. + */ +static inline void prepare_task_switch(runqueue_t *rq, task_t *next) +{ + prepare_lock_switch(rq, next); + prepare_arch_switch(next); +} + +/** * finish_task_switch - clean up after a task-switch * @prev: the thread we just switched away from. * - * We enter this with the runqueue still locked, and finish_arch_switch() - * will unlock it along with doing any other architecture-specific cleanup - * actions. + * finish_task_switch must be called after the context switch, paired + * with a prepare_task_switch call before the context switch. + * finish_task_switch will reconcile locking set up by prepare_task_switch, + * and do any other architecture-specific cleanup actions. * * Note that we may have delayed dropping an mm in context_switch(). If * so, we finish that here outside of the runqueue lock. (Doing it * with the lock held can cause deadlocks; see schedule() for * details.) */ -static inline void finish_task_switch(task_t *prev) +static inline void finish_task_switch(runqueue_t *rq, task_t *prev) __releases(rq->lock) { - runqueue_t *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; unsigned long prev_task_flags; @@ -1329,7 +1510,8 @@ static inline void finish_task_switch(task_t *prev) * Manfred Spraul <manfred@colorfullife.com> */ prev_task_flags = prev->flags; - finish_arch_switch(rq, prev); + finish_arch_switch(prev); + finish_lock_switch(rq, prev); if (mm) mmdrop(mm); if (unlikely(prev_task_flags & PF_DEAD)) @@ -1343,8 +1525,12 @@ static inline void finish_task_switch(task_t *prev) asmlinkage void schedule_tail(task_t *prev) __releases(rq->lock) { - finish_task_switch(prev); - + runqueue_t *rq = this_rq(); + finish_task_switch(rq, prev); +#ifdef __ARCH_WANT_UNLOCKED_CTXSW + /* In this case, finish_task_switch does not reenable preemption */ + preempt_enable(); +#endif if (current->set_child_tid) put_user(current->pid, current->set_child_tid); } @@ -1494,51 +1680,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) } /* - * find_idlest_cpu - find the least busy runqueue. - */ -static int find_idlest_cpu(struct task_struct *p, int this_cpu, - struct sched_domain *sd) -{ - unsigned long load, min_load, this_load; - int i, min_cpu; - cpumask_t mask; - - min_cpu = UINT_MAX; - min_load = ULONG_MAX; - - cpus_and(mask, sd->span, p->cpus_allowed); - - for_each_cpu_mask(i, mask) { - load = target_load(i); - - if (load < min_load) { - min_cpu = i; - min_load = load; - - /* break out early on an idle CPU: */ - if (!min_load) - break; - } - } - - /* add +1 to account for the new task */ - this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; - - /* - * Would with the addition of the new task to the - * current CPU there be an imbalance between this - * CPU and the idlest CPU? - * - * Use half of the balancing threshold - new-context is - * a good opportunity to balance. - */ - if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) - return min_cpu; - - return this_cpu; -} - -/* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only * allow dest_cpu, which will force the cpu onto dest_cpu. Then @@ -1571,37 +1712,16 @@ out: } /* - * sched_exec(): find the highest-level, exec-balance-capable - * domain and try to migrate the task to the least loaded CPU. - * - * execve() is a valuable balancing opportunity, because at this point - * the task has the smallest effective memory and cache footprint. + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. */ void sched_exec(void) { - struct sched_domain *tmp, *sd = NULL; int new_cpu, this_cpu = get_cpu(); - - /* Prefer the current CPU if there's only this task running */ - if (this_rq()->nr_running <= 1) - goto out; - - for_each_domain(this_cpu, tmp) - if (tmp->flags & SD_BALANCE_EXEC) - sd = tmp; - - if (sd) { - schedstat_inc(sd, sbe_attempts); - new_cpu = find_idlest_cpu(current, this_cpu, sd); - if (new_cpu != this_cpu) { - schedstat_inc(sd, sbe_pushed); - put_cpu(); - sched_migrate_task(current, new_cpu); - return; - } - } -out: + new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); put_cpu(); + if (new_cpu != this_cpu) + sched_migrate_task(current, new_cpu); } /* @@ -1632,7 +1752,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, */ static inline int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, - struct sched_domain *sd, enum idle_type idle) + struct sched_domain *sd, enum idle_type idle, int *all_pinned) { /* * We do not migrate tasks that are: @@ -1640,23 +1760,24 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (task_running(rq, p)) - return 0; if (!cpu_isset(this_cpu, p->cpus_allowed)) return 0; + *all_pinned = 0; + + if (task_running(rq, p)) + return 0; /* * Aggressive migration if: - * 1) the [whole] cpu is idle, or + * 1) task is cache cold, or * 2) too many balance attempts have failed. */ - if (cpu_and_siblings_are_idle(this_cpu) || \ - sd->nr_balance_failed > sd->cache_nice_tries) + if (sd->nr_balance_failed > sd->cache_nice_tries) return 1; if (task_hot(p, rq->timestamp_last_tick, sd)) - return 0; + return 0; return 1; } @@ -1669,16 +1790,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, */ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) + enum idle_type idle, int *all_pinned) { prio_array_t *array, *dst_array; struct list_head *head, *curr; - int idx, pulled = 0; + int idx, pulled = 0, pinned = 0; task_t *tmp; - if (max_nr_move <= 0 || busiest->nr_running <= 1) + if (max_nr_move == 0) goto out; + pinned = 1; + /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to @@ -1717,7 +1840,7 @@ skip_queue: curr = curr->prev; - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { if (curr != head) goto skip_queue; idx++; @@ -1746,6 +1869,9 @@ out: * inside pull_task(). */ schedstat_add(sd, lb_gained[idle], pulled); + + if (all_pinned) + *all_pinned = pinned; return pulled; } @@ -1760,8 +1886,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; + int load_idx; max_load = this_load = total_load = total_pwr = 0; + if (idle == NOT_IDLE) + load_idx = sd->busy_idx; + else if (idle == NEWLY_IDLE) + load_idx = sd->newidle_idx; + else + load_idx = sd->idle_idx; do { unsigned long load; @@ -1776,9 +1909,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, for_each_cpu_mask(i, group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) - load = target_load(i); + load = target_load(i, load_idx); else - load = source_load(i); + load = source_load(i, load_idx); avg_load += load; } @@ -1792,12 +1925,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (local_group) { this_load = avg_load; this = group; - goto nextgroup; } else if (avg_load > max_load) { max_load = avg_load; busiest = group; } -nextgroup: group = group->next; } while (group != sd->groups); @@ -1870,15 +2001,9 @@ nextgroup: /* Get rid of the scaling factor, rounding down as we divide */ *imbalance = *imbalance / SCHED_LOAD_SCALE; - return busiest; out_balanced: - if (busiest && (idle == NEWLY_IDLE || - (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { - *imbalance = 1; - return busiest; - } *imbalance = 0; return NULL; @@ -1894,7 +2019,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) int i; for_each_cpu_mask(i, group->cpumask) { - load = source_load(i); + load = source_load(i, 0); if (load > max_load) { max_load = load; @@ -1906,6 +2031,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) } /* + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but + * so long as it is large enough. + */ +#define MAX_PINNED_INTERVAL 512 + +/* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. * @@ -1917,7 +2048,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, struct sched_group *group; runqueue_t *busiest; unsigned long imbalance; - int nr_moved; + int nr_moved, all_pinned = 0; + int active_balance = 0; spin_lock(&this_rq->lock); schedstat_inc(sd, lb_cnt[idle]); @@ -1934,15 +2066,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, goto out_balanced; } - /* - * This should be "impossible", but since load - * balancing is inherently racy and statistical, - * it could happen in theory. - */ - if (unlikely(busiest == this_rq)) { - WARN_ON(1); - goto out_balanced; - } + BUG_ON(busiest == this_rq); schedstat_add(sd, lb_imbalance[idle], imbalance); @@ -1956,9 +2080,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, */ double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle); + imbalance, sd, idle, + &all_pinned); spin_unlock(&busiest->lock); + + /* All tasks on this runqueue were pinned by CPU affinity */ + if (unlikely(all_pinned)) + goto out_balanced; } + spin_unlock(&this_rq->lock); if (!nr_moved) { @@ -1966,36 +2096,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, sd->nr_balance_failed++; if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - int wake = 0; spin_lock(&busiest->lock); if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; - wake = 1; + active_balance = 1; } spin_unlock(&busiest->lock); - if (wake) + if (active_balance) wake_up_process(busiest->migration_thread); /* * We've kicked active balancing, reset the failure * counter. */ - sd->nr_balance_failed = sd->cache_nice_tries; + sd->nr_balance_failed = sd->cache_nice_tries+1; } - - /* - * We were unbalanced, but unsuccessful in move_tasks(), - * so bump the balance_interval to lessen the lock contention. - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval++; - } else { + } else sd->nr_balance_failed = 0; + if (likely(!active_balance)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; + } else { + /* + * If we've begun active balancing, start to back off. This + * case may not be covered by the all_pinned logic if there + * is only 1 task on the busy runqueue (because we don't call + * move_tasks). + */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; } return nr_moved; @@ -2005,8 +2137,10 @@ out_balanced: schedstat_inc(sd, lb_balanced[idle]); + sd->nr_balance_failed = 0; /* tune up the balancing interval */ - if (sd->balance_interval < sd->max_interval) + if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || + (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; return 0; @@ -2030,31 +2164,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); if (!group) { - schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); - goto out; + goto out_balanced; } busiest = find_busiest_queue(group); - if (!busiest || busiest == this_rq) { - schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + if (!busiest) { schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); - goto out; + goto out_balanced; } + BUG_ON(busiest == this_rq); + /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, NEWLY_IDLE); + imbalance, sd, NEWLY_IDLE, NULL); if (!nr_moved) schedstat_inc(sd, lb_failed[NEWLY_IDLE]); + else + sd->nr_balance_failed = 0; spin_unlock(&busiest->lock); - -out: return nr_moved; + +out_balanced: + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + sd->nr_balance_failed = 0; + return 0; } /* @@ -2086,56 +2225,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq) static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) { struct sched_domain *sd; - struct sched_group *cpu_group; runqueue_t *target_rq; - cpumask_t visited_cpus; - int cpu; + int target_cpu = busiest_rq->push_cpu; + + if (busiest_rq->nr_running <= 1) + /* no task to move */ + return; + + target_rq = cpu_rq(target_cpu); /* - * Search for suitable CPUs to push tasks to in successively higher - * domains with SD_LOAD_BALANCE set. + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. */ - visited_cpus = CPU_MASK_NONE; - for_each_domain(busiest_cpu, sd) { - if (!(sd->flags & SD_LOAD_BALANCE)) - /* no more domains to search */ - break; + BUG_ON(busiest_rq == target_rq); - schedstat_inc(sd, alb_cnt); + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); - cpu_group = sd->groups; - do { - for_each_cpu_mask(cpu, cpu_group->cpumask) { - if (busiest_rq->nr_running <= 1) - /* no more tasks left to move */ - return; - if (cpu_isset(cpu, visited_cpus)) - continue; - cpu_set(cpu, visited_cpus); - if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) - continue; - - target_rq = cpu_rq(cpu); - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - if (move_tasks(target_rq, cpu, busiest_rq, - 1, sd, SCHED_IDLE)) { - schedstat_inc(sd, alb_pushed); - } else { - schedstat_inc(sd, alb_failed); - } - spin_unlock(&target_rq->lock); - } - cpu_group = cpu_group->next; - } while (cpu_group != sd->groups); - } + /* Search for an sd spanning us and the target CPU. */ + for_each_domain(target_cpu, sd) + if ((sd->flags & SD_LOAD_BALANCE) && + cpu_isset(busiest_cpu, sd->span)) + break; + + if (unlikely(sd == NULL)) + goto out; + + schedstat_inc(sd, alb_cnt); + + if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); +out: + spin_unlock(&target_rq->lock); } /* @@ -2156,18 +2281,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, unsigned long old_load, this_load; unsigned long j = jiffies + CPU_OFFSET(this_cpu); struct sched_domain *sd; + int i; - /* Update our load */ - old_load = this_rq->cpu_load; this_load = this_rq->nr_running * SCHED_LOAD_SCALE; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (this_load > old_load) - old_load++; - this_rq->cpu_load = (old_load + this_load) / 2; + /* Update our load */ + for (i = 0; i < 3; i++) { + unsigned long new_load = this_load; + int scale = 1 << i; + old_load = this_rq->cpu_load[i]; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale-1; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; + } for_each_domain(this_cpu, sd) { unsigned long interval; @@ -2447,11 +2577,15 @@ out: #ifdef CONFIG_SCHED_SMT static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) { - struct sched_domain *sd = this_rq->sd; + struct sched_domain *tmp, *sd = NULL; cpumask_t sibling_map; int i; - if (!(sd->flags & SD_SHARE_CPUPOWER)) + for_each_domain(this_cpu, tmp) + if (tmp->flags & SD_SHARE_CPUPOWER) + sd = tmp; + + if (!sd) return; /* @@ -2492,13 +2626,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) { - struct sched_domain *sd = this_rq->sd; + struct sched_domain *tmp, *sd = NULL; cpumask_t sibling_map; prio_array_t *array; int ret = 0, i; task_t *p; - if (!(sd->flags & SD_SHARE_CPUPOWER)) + for_each_domain(this_cpu, tmp) + if (tmp->flags & SD_SHARE_CPUPOWER) + sd = tmp; + + if (!sd) return 0; /* @@ -2576,7 +2714,7 @@ void fastcall add_preempt_count(int val) /* * Underflow? */ - BUG_ON(((int)preempt_count() < 0)); + BUG_ON((preempt_count() < 0)); preempt_count() += val; /* * Spinlock count overflowing soon? @@ -2613,7 +2751,7 @@ asmlinkage void __sched schedule(void) struct list_head *queue; unsigned long long now; unsigned long run_time; - int cpu, idx; + int cpu, idx, new_prio; /* * Test if we are atomic. Since do_exit() needs to call into @@ -2735,9 +2873,14 @@ go_idle: delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; array = next->array; - dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); - enqueue_task(next, array); + new_prio = recalc_task_prio(next, next->timestamp + delta); + + if (unlikely(next->prio != new_prio)) { + dequeue_task(next, array); + next->prio = new_prio; + enqueue_task(next, array); + } else + requeue_task(next, array); } next->activated = 0; switch_tasks: @@ -2761,11 +2904,15 @@ switch_tasks: rq->curr = next; ++*switch_count; - prepare_arch_switch(rq, next); + prepare_task_switch(rq, next); prev = context_switch(rq, prev, next); barrier(); - - finish_task_switch(prev); + /* + * this_rq must be evaluated again because prev may have moved + * CPUs since it called schedule(), thus the 'rq' on its stack + * frame will be invalid. + */ + finish_task_switch(this_rq(), prev); } else spin_unlock_irq(&rq->lock); @@ -2869,7 +3016,7 @@ need_resched: int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) { - task_t *p = curr->task; + task_t *p = curr->private; return try_to_wake_up(p, mode, sync); } @@ -3384,13 +3531,24 @@ recheck: if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) return -EINVAL; - if ((policy == SCHED_FIFO || policy == SCHED_RR) && - param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur && - !capable(CAP_SYS_NICE)) - return -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_NICE)) - return -EPERM; + /* + * Allow unprivileged RT tasks to decrease priority: + */ + if (!capable(CAP_SYS_NICE)) { + /* can't change policy */ + if (policy != p->policy) + return -EPERM; + /* can't increase priority */ + if (policy != SCHED_NORMAL && + param->sched_priority > p->rt_priority && + param->sched_priority > + p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) + return -EPERM; + /* can't change other user's priorities */ + if ((current->euid != p->euid) && + (current->euid != p->uid)) + return -EPERM; + } retval = security_task_setscheduler(p, policy, param); if (retval) @@ -3755,19 +3913,22 @@ EXPORT_SYMBOL(cond_resched); */ int cond_resched_lock(spinlock_t * lock) { + int ret = 0; + if (need_lockbreak(lock)) { spin_unlock(lock); cpu_relax(); + ret = 1; spin_lock(lock); } if (need_resched()) { _raw_spin_unlock(lock); preempt_enable_no_resched(); __cond_resched(); + ret = 1; spin_lock(lock); - return 1; } - return 0; + return ret; } EXPORT_SYMBOL(cond_resched_lock); @@ -3811,7 +3972,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); + struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); atomic_inc(&rq->nr_iowait); schedule(); @@ -3822,7 +3983,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); + struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); long ret; atomic_inc(&rq->nr_iowait); @@ -4027,6 +4188,9 @@ void __devinit init_idle(task_t *idle, int cpu) spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + idle->oncpu = 1; +#endif set_tsk_need_resched(idle); spin_unlock_irqrestore(&rq->lock, flags); @@ -4171,8 +4335,7 @@ static int migration_thread(void * data) struct list_head *head; migration_req_t *req; - if (current->flags & PF_FREEZE) - refrigerator(PF_FREEZE); + try_to_freeze(); spin_lock_irq(&rq->lock); @@ -4197,17 +4360,9 @@ static int migration_thread(void * data) req = list_entry(head->next, migration_req_t, list); list_del_init(head->next); - if (req->type == REQ_MOVE_TASK) { - spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); - local_irq_enable(); - } else if (req->type == REQ_SET_DOMAIN) { - rq->sd = req->sd; - spin_unlock_irq(&rq->lock); - } else { - spin_unlock_irq(&rq->lock); - WARN_ON(1); - } + spin_unlock(&rq->lock); + __migrate_task(req->task, cpu, req->dest_cpu); + local_irq_enable(); complete(&req->done); } @@ -4438,7 +4593,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, migration_req_t *req; req = list_entry(rq->migration_queue.next, migration_req_t, list); - BUG_ON(req->type != REQ_MOVE_TASK); list_del_init(&req->list); complete(&req->done); } @@ -4469,12 +4623,17 @@ int __init migration_init(void) #endif #ifdef CONFIG_SMP -#define SCHED_DOMAIN_DEBUG +#undef SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; + if (!sd) { + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); + return; + } + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); do { @@ -4557,37 +4716,81 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) #define sched_domain_debug(sd, cpu) {} #endif +static int sd_degenerate(struct sched_domain *sd) +{ + if (cpus_weight(sd->span) == 1) + return 1; + + /* Following flags need at least 2 groups */ + if (sd->flags & (SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC)) { + if (sd->groups != sd->groups->next) + return 0; + } + + /* Following flags don't use groups */ + if (sd->flags & (SD_WAKE_IDLE | + SD_WAKE_AFFINE | + SD_WAKE_BALANCE)) + return 0; + + return 1; +} + +static int sd_parent_degenerate(struct sched_domain *sd, + struct sched_domain *parent) +{ + unsigned long cflags = sd->flags, pflags = parent->flags; + + if (sd_degenerate(parent)) + return 1; + + if (!cpus_equal(sd->span, parent->span)) + return 0; + + /* Does parent contain flags not in child? */ + /* WAKE_BALANCE is a subset of WAKE_AFFINE */ + if (cflags & SD_WAKE_AFFINE) + pflags &= ~SD_WAKE_BALANCE; + /* Flags needing groups don't count if only 1 group in parent */ + if (parent->groups == parent->groups->next) { + pflags &= ~(SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC); + } + if (~cflags & pflags) + return 0; + + return 1; +} + /* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */ -void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) +void cpu_attach_domain(struct sched_domain *sd, int cpu) { - migration_req_t req; - unsigned long flags; runqueue_t *rq = cpu_rq(cpu); - int local = 1; - - sched_domain_debug(sd, cpu); + struct sched_domain *tmp; - spin_lock_irqsave(&rq->lock, flags); - - if (cpu == smp_processor_id() || !cpu_online(cpu)) { - rq->sd = sd; - } else { - init_completion(&req.done); - req.type = REQ_SET_DOMAIN; - req.sd = sd; - list_add(&req.list, &rq->migration_queue); - local = 0; + /* Remove the sched domains which do not contribute to scheduling. */ + for (tmp = sd; tmp; tmp = tmp->parent) { + struct sched_domain *parent = tmp->parent; + if (!parent) + break; + if (sd_parent_degenerate(tmp, parent)) + tmp->parent = parent->parent; } - spin_unlock_irqrestore(&rq->lock, flags); + if (sd && sd_degenerate(sd)) + sd = sd->parent; - if (!local) { - wake_up_process(rq->migration_thread); - wait_for_completion(&req.done); - } + sched_domain_debug(sd, cpu); + + rcu_assign_pointer(rq->sd, sd); } /* cpus with isolated domains */ @@ -4619,7 +4822,7 @@ __setup ("isolcpus=", isolated_cpu_setup); * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ -void __devinit init_sched_build_groups(struct sched_group groups[], +void init_sched_build_groups(struct sched_group groups[], cpumask_t span, int (*group_fn)(int cpu)) { struct sched_group *first = NULL, *last = NULL; @@ -4655,13 +4858,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[], #ifdef ARCH_HAS_SCHED_DOMAIN -extern void __devinit arch_init_sched_domains(void); -extern void __devinit arch_destroy_sched_domains(void); +extern void build_sched_domains(const cpumask_t *cpu_map); +extern void arch_init_sched_domains(const cpumask_t *cpu_map); +extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); #else #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; -static int __devinit cpu_to_cpu_group(int cpu) +static int cpu_to_cpu_group(int cpu) { return cpu; } @@ -4669,7 +4873,7 @@ static int __devinit cpu_to_cpu_group(int cpu) static DEFINE_PER_CPU(struct sched_domain, phys_domains); static struct sched_group sched_group_phys[NR_CPUS]; -static int __devinit cpu_to_phys_group(int cpu) +static int cpu_to_phys_group(int cpu) { #ifdef CONFIG_SCHED_SMT return first_cpu(cpu_sibling_map[cpu]); @@ -4682,7 +4886,7 @@ static int __devinit cpu_to_phys_group(int cpu) static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group sched_group_nodes[MAX_NUMNODES]; -static int __devinit cpu_to_node_group(int cpu) +static int cpu_to_node_group(int cpu) { return cpu_to_node(cpu); } @@ -4713,39 +4917,28 @@ static void check_sibling_maps(void) #endif /* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus */ -static void __devinit arch_init_sched_domains(void) +static void build_sched_domains(const cpumask_t *cpu_map) { int i; - cpumask_t cpu_default_map; - -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) - check_sibling_maps(); -#endif - /* - * Setup mask for cpus without special case scheduling requirements. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. - */ - cpus_complement(cpu_default_map, cpu_isolated_map); - cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); /* - * Set up domains. Isolated domains just stay on the dummy domain. + * Set up domains for cpus specified by the cpu_map. */ - for_each_cpu_mask(i, cpu_default_map) { + for_each_cpu_mask(i, *cpu_map) { int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); - cpus_and(nodemask, nodemask, cpu_default_map); + cpus_and(nodemask, nodemask, *cpu_map); #ifdef CONFIG_NUMA sd = &per_cpu(node_domains, i); group = cpu_to_node_group(i); *sd = SD_NODE_INIT; - sd->span = cpu_default_map; + sd->span = *cpu_map; sd->groups = &sched_group_nodes[group]; #endif @@ -4763,7 +4956,7 @@ static void __devinit arch_init_sched_domains(void) group = cpu_to_cpu_group(i); *sd = SD_SIBLING_INIT; sd->span = cpu_sibling_map[i]; - cpus_and(sd->span, sd->span, cpu_default_map); + cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; sd->groups = &sched_group_cpus[group]; #endif @@ -4773,7 +4966,7 @@ static void __devinit arch_init_sched_domains(void) /* Set up CPU (sibling) groups */ for_each_online_cpu(i) { cpumask_t this_sibling_map = cpu_sibling_map[i]; - cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); + cpus_and(this_sibling_map, this_sibling_map, *cpu_map); if (i != first_cpu(this_sibling_map)) continue; @@ -4786,7 +4979,7 @@ static void __devinit arch_init_sched_domains(void) for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); - cpus_and(nodemask, nodemask, cpu_default_map); + cpus_and(nodemask, nodemask, *cpu_map); if (cpus_empty(nodemask)) continue; @@ -4796,12 +4989,12 @@ static void __devinit arch_init_sched_domains(void) #ifdef CONFIG_NUMA /* Set up node groups */ - init_sched_build_groups(sched_group_nodes, cpu_default_map, + init_sched_build_groups(sched_group_nodes, *cpu_map, &cpu_to_node_group); #endif /* Calculate CPU power for physical packages and nodes */ - for_each_cpu_mask(i, cpu_default_map) { + for_each_cpu_mask(i, *cpu_map) { int power; struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT @@ -4825,7 +5018,7 @@ static void __devinit arch_init_sched_domains(void) } /* Attach the domains */ - for_each_online_cpu(i) { + for_each_cpu_mask(i, *cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); @@ -4835,41 +5028,85 @@ static void __devinit arch_init_sched_domains(void) cpu_attach_domain(sd, i); } } +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + */ +static void arch_init_sched_domains(cpumask_t *cpu_map) +{ + cpumask_t cpu_default_map; + +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) + check_sibling_maps(); +#endif + /* + * Setup mask for cpus without special case scheduling requirements. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. + */ + cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); + + build_sched_domains(&cpu_default_map); +} -#ifdef CONFIG_HOTPLUG_CPU -static void __devinit arch_destroy_sched_domains(void) +static void arch_destroy_sched_domains(const cpumask_t *cpu_map) { /* Do nothing: everything is statically allocated. */ } -#endif #endif /* ARCH_HAS_SCHED_DOMAIN */ /* - * Initial dummy domain for early boot and for hotplug cpu. Being static, - * it is initialized to zero, so all balancing flags are cleared which is - * what we want. + * Detach sched domains from a group of cpus specified in cpu_map + * These cpus will now be attached to the NULL domain */ -static struct sched_domain sched_domain_dummy; +static inline void detach_destroy_domains(const cpumask_t *cpu_map) +{ + int i; + + for_each_cpu_mask(i, *cpu_map) + cpu_attach_domain(NULL, i); + synchronize_sched(); + arch_destroy_sched_domains(cpu_map); +} + +/* + * Partition sched domains as specified by the cpumasks below. + * This attaches all cpus from the cpumasks to the NULL domain, + * waits for a RCU quiescent period, recalculates sched + * domain information and then attaches them back to the + * correct sched domains + * Call with hotplug lock held + */ +void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) +{ + cpumask_t change_map; + + cpus_and(*partition1, *partition1, cpu_online_map); + cpus_and(*partition2, *partition2, cpu_online_map); + cpus_or(change_map, *partition1, *partition2); + + /* Detach sched domains from all of the affected cpus */ + detach_destroy_domains(&change_map); + if (!cpus_empty(*partition1)) + build_sched_domains(partition1); + if (!cpus_empty(*partition2)) + build_sched_domains(partition2); +} #ifdef CONFIG_HOTPLUG_CPU /* * Force a reinitialization of the sched domains hierarchy. The domains * and groups cannot be updated in place without racing with the balancing - * code, so we temporarily attach all running cpus to a "dummy" domain + * code, so we temporarily attach all running cpus to the NULL domain * which will prevent rebalancing while the sched domains are recalculated. */ static int update_sched_domains(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int i; - switch (action) { case CPU_UP_PREPARE: case CPU_DOWN_PREPARE: - for_each_online_cpu(i) - cpu_attach_domain(&sched_domain_dummy, i); - arch_destroy_sched_domains(); + detach_destroy_domains(&cpu_online_map); return NOTIFY_OK; case CPU_UP_CANCELED: @@ -4885,7 +5122,7 @@ static int update_sched_domains(struct notifier_block *nfb, } /* The hotplug lock is already held by cpu_up/cpu_down */ - arch_init_sched_domains(); + arch_init_sched_domains(&cpu_online_map); return NOTIFY_OK; } @@ -4894,7 +5131,7 @@ static int update_sched_domains(struct notifier_block *nfb, void __init sched_init_smp(void) { lock_cpu_hotplug(); - arch_init_sched_domains(); + arch_init_sched_domains(&cpu_online_map); unlock_cpu_hotplug(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); @@ -4924,13 +5161,15 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); + rq->nr_running = 0; rq->active = rq->arrays; rq->expired = rq->arrays + 1; rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP - rq->sd = &sched_domain_dummy; - rq->cpu_load = 0; + rq->sd = NULL; + for (j = 1; j < 3; j++) + rq->cpu_load[j] = 0; rq->active_balance = 0; rq->push_cpu = 0; rq->migration_thread = NULL; diff --git a/kernel/signal.c b/kernel/signal.c index 8f3debc..ca1186e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -24,6 +24,7 @@ #include <linux/ptrace.h> #include <linux/posix-timers.h> #include <linux/signal.h> +#include <linux/audit.h> #include <asm/param.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -212,6 +213,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) fastcall void recalc_sigpending_tsk(struct task_struct *t) { if (t->signal->group_stop_count > 0 || + (freezing(t)) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked)) set_tsk_thread_flag(t, TIF_SIGPENDING); @@ -522,7 +524,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, { int sig = 0; - sig = next_signal(pending, mask); + /* SIGKILL must have priority, otherwise it is quite easy + * to create an unkillable process, sending sig < SIGKILL + * to self */ + if (unlikely(sigismember(&pending->signal, SIGKILL))) { + if (!sigismember(mask, SIGKILL)) + sig = SIGKILL; + } + + if (likely(!sig)) + sig = next_signal(pending, mask); if (sig) { if (current->notifier) { if (sigismember(current->notifier_mask, sig)) { @@ -658,7 +669,11 @@ static int check_kill_permission(int sig, struct siginfo *info, && (current->uid ^ t->suid) && (current->uid ^ t->uid) && !capable(CAP_KILL)) return error; - return security_task_kill(t, info, sig); + + error = security_task_kill(t, info, sig); + if (!error) + audit_signal_info(sig, t); /* Let audit system see the signal */ + return error; } /* forward decl */ @@ -2216,8 +2231,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, current->state = TASK_INTERRUPTIBLE; timeout = schedule_timeout(timeout); - if (current->flags & PF_FREEZE) - refrigerator(PF_FREEZE); + try_to_freeze(); spin_lock_irq(¤t->sighand->siglock); sig = dequeue_signal(current, &these, &info); current->blocked = current->real_blocked; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 6116b25..84a9d18 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -100,7 +100,7 @@ static int stop_machine(void) stopmachine_state = STOPMACHINE_WAIT; for_each_online_cpu(i) { - if (i == _smp_processor_id()) + if (i == raw_smp_processor_id()) continue; ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); if (ret < 0) @@ -182,7 +182,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, /* If they don't care which CPU fn runs on, bind to any online one. */ if (cpu == NR_CPUS) - cpu = _smp_processor_id(); + cpu = raw_smp_processor_id(); p = kthread_create(do_stop, &smdata, "kstopmachine"); if (!IS_ERR(p)) { diff --git a/kernel/sys.c b/kernel/sys.c index f006632..9a24374 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -16,6 +16,8 @@ #include <linux/init.h> #include <linux/highuid.h> #include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/kexec.h> #include <linux/workqueue.h> #include <linux/device.h> #include <linux/key.h> @@ -405,6 +407,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user case LINUX_REBOOT_CMD_HALT: notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); system_state = SYSTEM_HALT; + device_suspend(PMSG_SUSPEND); device_shutdown(); printk(KERN_EMERG "System halted.\n"); machine_halt(); @@ -415,6 +418,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user case LINUX_REBOOT_CMD_POWER_OFF: notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); system_state = SYSTEM_POWER_OFF; + device_suspend(PMSG_SUSPEND); device_shutdown(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); @@ -431,11 +435,30 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); system_state = SYSTEM_RESTART; + device_suspend(PMSG_FREEZE); device_shutdown(); printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer); machine_restart(buffer); break; +#ifdef CONFIG_KEXEC + case LINUX_REBOOT_CMD_KEXEC: + { + struct kimage *image; + image = xchg(&kexec_image, 0); + if (!image) { + unlock_kernel(); + return -EINVAL; + } + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); + system_state = SYSTEM_RESTART; + device_shutdown(); + printk(KERN_EMERG "Starting new kernel\n"); + machine_shutdown(); + machine_kexec(image); + break; + } +#endif #ifdef CONFIG_SOFTWARE_SUSPEND case LINUX_REBOOT_CMD_SW_SUSPEND: { @@ -525,7 +548,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) } if (new_egid != old_egid) { - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } if (rgid != (gid_t) -1 || @@ -556,7 +579,7 @@ asmlinkage long sys_setgid(gid_t gid) { if(old_egid != gid) { - current->mm->dumpable=0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->gid = current->egid = current->sgid = current->fsgid = gid; @@ -565,7 +588,7 @@ asmlinkage long sys_setgid(gid_t gid) { if(old_egid != gid) { - current->mm->dumpable=0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->egid = current->fsgid = gid; @@ -596,7 +619,7 @@ static int set_user(uid_t new_ruid, int dumpclear) if(dumpclear) { - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->uid = new_ruid; @@ -653,7 +676,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) if (new_euid != old_euid) { - current->mm->dumpable=0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->fsuid = current->euid = new_euid; @@ -703,7 +726,7 @@ asmlinkage long sys_setuid(uid_t uid) if (old_euid != uid) { - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->fsuid = current->euid = uid; @@ -748,7 +771,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) if (euid != (uid_t) -1) { if (euid != current->euid) { - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->euid = euid; @@ -798,7 +821,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) if (egid != (gid_t) -1) { if (egid != current->egid) { - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->egid = egid; @@ -845,7 +868,7 @@ asmlinkage long sys_setfsuid(uid_t uid) { if (uid != old_fsuid) { - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->fsuid = uid; @@ -875,7 +898,7 @@ asmlinkage long sys_setfsgid(gid_t gid) { if (gid != old_fsgid) { - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; smp_wmb(); } current->fsgid = gid; @@ -894,35 +917,69 @@ asmlinkage long sys_times(struct tms __user * tbuf) */ if (tbuf) { struct tms tmp; - struct task_struct *tsk = current; - struct task_struct *t; cputime_t utime, stime, cutime, cstime; - read_lock(&tasklist_lock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); - - /* - * While we have tasklist_lock read-locked, no dying thread - * can be updating current->signal->[us]time. Instead, - * we got their counts included in the live thread loop. - * However, another thread can come in right now and - * do a wait call that updates current->signal->c[us]time. - * To make sure we always see that pair updated atomically, - * we take the siglock around fetching them. - */ - spin_lock_irq(&tsk->sighand->siglock); - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); +#ifdef CONFIG_SMP + if (thread_group_empty(current)) { + /* + * Single thread case without the use of any locks. + * + * We may race with release_task if two threads are + * executing. However, release task first adds up the + * counters (__exit_signal) before removing the task + * from the process tasklist (__unhash_process). + * __exit_signal also acquires and releases the + * siglock which results in the proper memory ordering + * so that the list modifications are always visible + * after the counters have been updated. + * + * If the counters have been updated by the second thread + * but the thread has not yet been removed from the list + * then the other branch will be executing which will + * block on tasklist_lock until the exit handling of the + * other task is finished. + * + * This also implies that the sighand->siglock cannot + * be held by another processor. So we can also + * skip acquiring that lock. + */ + utime = cputime_add(current->signal->utime, current->utime); + stime = cputime_add(current->signal->utime, current->stime); + cutime = current->signal->cutime; + cstime = current->signal->cstime; + } else +#endif + { + /* Process with multiple threads */ + struct task_struct *tsk = current; + struct task_struct *t; + + read_lock(&tasklist_lock); + utime = tsk->signal->utime; + stime = tsk->signal->stime; + t = tsk; + do { + utime = cputime_add(utime, t->utime); + stime = cputime_add(stime, t->stime); + t = next_thread(t); + } while (t != tsk); + + /* + * While we have tasklist_lock read-locked, no dying thread + * can be updating current->signal->[us]time. Instead, + * we got their counts included in the live thread loop. + * However, another thread can come in right now and + * do a wait call that updates current->signal->c[us]time. + * To make sure we always see that pair updated atomically, + * we take the siglock around fetching them. + */ + spin_lock_irq(&tsk->sighand->siglock); + cutime = tsk->signal->cutime; + cstime = tsk->signal->cstime; + spin_unlock_irq(&tsk->sighand->siglock); + read_unlock(&tasklist_lock); + } tmp.tms_utime = cputime_to_clock_t(utime); tmp.tms_stime = cputime_to_clock_t(stime); tmp.tms_cutime = cputime_to_clock_t(cutime); @@ -1225,7 +1282,7 @@ static void groups_sort(struct group_info *group_info) } /* a simple bsearch */ -static int groups_search(struct group_info *group_info, gid_t grp) +int groups_search(struct group_info *group_info, gid_t grp) { int left, right; @@ -1652,7 +1709,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = 1; break; case PR_SET_DUMPABLE: - if (arg2 != 0 && arg2 != 1) { + if (arg2 < 0 || arg2 > 2) { error = -EINVAL; break; } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0dda70e..29196ce 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -18,6 +18,8 @@ cond_syscall(sys_acct); cond_syscall(sys_lookup_dcookie); cond_syscall(sys_swapon); cond_syscall(sys_swapoff); +cond_syscall(sys_kexec_load); +cond_syscall(compat_sys_kexec_load); cond_syscall(sys_init_module); cond_syscall(sys_delete_module); cond_syscall(sys_socketpair); @@ -77,6 +79,7 @@ cond_syscall(sys_request_key); cond_syscall(sys_keyctl); cond_syscall(compat_sys_keyctl); cond_syscall(compat_sys_socketcall); +cond_syscall(sys_set_zone_reclaim); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 701d12c..270ee7f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -58,6 +58,7 @@ extern int sysctl_overcommit_ratio; extern int max_threads; extern int sysrq_enabled; extern int core_uses_pid; +extern int suid_dumpable; extern char core_pattern[]; extern int cad_pid; extern int pid_max; @@ -950,6 +951,14 @@ static ctl_table fs_table[] = { .proc_handler = &proc_dointvec, }, #endif + { + .ctl_name = KERN_SETUID_DUMPABLE, + .procname = "suid_dumpable", + .data = &suid_dumpable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; @@ -991,8 +1000,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol int error = parse_table(name, nlen, oldval, oldlenp, newval, newlen, head->ctl_table, &context); - if (context) - kfree(context); + kfree(context); if (error != -ENOTDIR) return error; tmp = tmp->next; diff --git a/kernel/timer.c b/kernel/timer.c index 207aa4f0..f2a1188 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -57,6 +57,11 @@ static void time_interpolator_update(long delta_nsec); #define TVN_MASK (TVN_SIZE - 1) #define TVR_MASK (TVR_SIZE - 1) +struct timer_base_s { + spinlock_t lock; + struct timer_list *running_timer; +}; + typedef struct tvec_s { struct list_head vec[TVN_SIZE]; } tvec_t; @@ -66,9 +71,8 @@ typedef struct tvec_root_s { } tvec_root_t; struct tvec_t_base_s { - spinlock_t lock; + struct timer_base_s t_base; unsigned long timer_jiffies; - struct timer_list *running_timer; tvec_root_t tv1; tvec_t tv2; tvec_t tv3; @@ -77,18 +81,16 @@ struct tvec_t_base_s { } ____cacheline_aligned_in_smp; typedef struct tvec_t_base_s tvec_base_t; +static DEFINE_PER_CPU(tvec_base_t, tvec_bases); static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) { #ifdef CONFIG_SMP - base->running_timer = timer; + base->t_base.running_timer = timer; #endif } -/* Fake initialization */ -static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED }; - static void check_timer_failed(struct timer_list *timer) { static int whine_count; @@ -103,7 +105,6 @@ static void check_timer_failed(struct timer_list *timer) /* * Now fix it up */ - spin_lock_init(&timer->lock); timer->magic = TIMER_MAGIC; } @@ -156,65 +157,113 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) list_add_tail(&timer->entry, vec); } +typedef struct timer_base_s timer_base_t; +/* + * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases) + * at compile time, and we need timer->base to lock the timer. + */ +timer_base_t __init_timer_base + ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED }; +EXPORT_SYMBOL(__init_timer_base); + +/*** + * init_timer - initialize a timer. + * @timer: the timer to be initialized + * + * init_timer() must be done to a timer prior calling *any* of the + * other timer functions. + */ +void fastcall init_timer(struct timer_list *timer) +{ + timer->entry.next = NULL; + timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; + timer->magic = TIMER_MAGIC; +} +EXPORT_SYMBOL(init_timer); + +static inline void detach_timer(struct timer_list *timer, + int clear_pending) +{ + struct list_head *entry = &timer->entry; + + __list_del(entry->prev, entry->next); + if (clear_pending) + entry->next = NULL; + entry->prev = LIST_POISON2; +} + +/* + * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock + * means that all timers which are tied to this base via timer->base are + * locked, and the base itself is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could + * be found on ->tvX lists. + * + * When the timer's base is locked, and the timer removed from list, it is + * possible to set timer->base = NULL and drop the lock: the timer remains + * locked. + */ +static timer_base_t *lock_timer_base(struct timer_list *timer, + unsigned long *flags) +{ + timer_base_t *base; + + for (;;) { + base = timer->base; + if (likely(base != NULL)) { + spin_lock_irqsave(&base->lock, *flags); + if (likely(base == timer->base)) + return base; + /* The timer has migrated to another CPU */ + spin_unlock_irqrestore(&base->lock, *flags); + } + cpu_relax(); + } +} + int __mod_timer(struct timer_list *timer, unsigned long expires) { - tvec_base_t *old_base, *new_base; + timer_base_t *base; + tvec_base_t *new_base; unsigned long flags; int ret = 0; BUG_ON(!timer->function); - check_timer(timer); - spin_lock_irqsave(&timer->lock, flags); + base = lock_timer_base(timer, &flags); + + if (timer_pending(timer)) { + detach_timer(timer, 0); + ret = 1; + } + new_base = &__get_cpu_var(tvec_bases); -repeat: - old_base = timer->base; - /* - * Prevent deadlocks via ordering by old_base < new_base. - */ - if (old_base && (new_base != old_base)) { - if (old_base < new_base) { - spin_lock(&new_base->lock); - spin_lock(&old_base->lock); - } else { - spin_lock(&old_base->lock); - spin_lock(&new_base->lock); - } + if (base != &new_base->t_base) { /* - * The timer base might have been cancelled while we were - * trying to take the lock(s): + * We are trying to schedule the timer on the local CPU. + * However we can't change timer's base while it is running, + * otherwise del_timer_sync() can't detect that the timer's + * handler yet has not finished. This also guarantees that + * the timer is serialized wrt itself. */ - if (timer->base != old_base) { - spin_unlock(&new_base->lock); - spin_unlock(&old_base->lock); - goto repeat; - } - } else { - spin_lock(&new_base->lock); - if (timer->base != old_base) { - spin_unlock(&new_base->lock); - goto repeat; + if (unlikely(base->running_timer == timer)) { + /* The timer remains on a former base */ + new_base = container_of(base, tvec_base_t, t_base); + } else { + /* See the comment in lock_timer_base() */ + timer->base = NULL; + spin_unlock(&base->lock); + spin_lock(&new_base->t_base.lock); + timer->base = &new_base->t_base; } } - /* - * Delete the previous timeout (if there was any), and install - * the new one: - */ - if (old_base) { - list_del(&timer->entry); - ret = 1; - } timer->expires = expires; internal_add_timer(new_base, timer); - timer->base = new_base; - - if (old_base && (new_base != old_base)) - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - spin_unlock_irqrestore(&timer->lock, flags); + spin_unlock_irqrestore(&new_base->t_base.lock, flags); return ret; } @@ -232,15 +281,15 @@ void add_timer_on(struct timer_list *timer, int cpu) { tvec_base_t *base = &per_cpu(tvec_bases, cpu); unsigned long flags; - + BUG_ON(timer_pending(timer) || !timer->function); check_timer(timer); - spin_lock_irqsave(&base->lock, flags); + spin_lock_irqsave(&base->t_base.lock, flags); + timer->base = &base->t_base; internal_add_timer(base, timer); - timer->base = base; - spin_unlock_irqrestore(&base->lock, flags); + spin_unlock_irqrestore(&base->t_base.lock, flags); } @@ -295,109 +344,84 @@ EXPORT_SYMBOL(mod_timer); */ int del_timer(struct timer_list *timer) { + timer_base_t *base; unsigned long flags; - tvec_base_t *base; + int ret = 0; check_timer(timer); -repeat: - base = timer->base; - if (!base) - return 0; - spin_lock_irqsave(&base->lock, flags); - if (base != timer->base) { + if (timer_pending(timer)) { + base = lock_timer_base(timer, &flags); + if (timer_pending(timer)) { + detach_timer(timer, 1); + ret = 1; + } spin_unlock_irqrestore(&base->lock, flags); - goto repeat; } - list_del(&timer->entry); - /* Need to make sure that anybody who sees a NULL base also sees the list ops */ - smp_wmb(); - timer->base = NULL; - spin_unlock_irqrestore(&base->lock, flags); - return 1; + return ret; } EXPORT_SYMBOL(del_timer); #ifdef CONFIG_SMP -/*** - * del_timer_sync - deactivate a timer and wait for the handler to finish. - * @timer: the timer to be deactivated - * - * This function only differs from del_timer() on SMP: besides deactivating - * the timer it also makes sure the handler has finished executing on other - * CPUs. - * - * Synchronization rules: callers must prevent restarting of the timer, - * otherwise this function is meaningless. It must not be called from - * interrupt contexts. The caller must not hold locks which would prevent - * completion of the timer's handler. Upon exit the timer is not queued and - * the handler is not running on any CPU. - * - * The function returns whether it has deactivated a pending timer or not. +/* + * This function tries to deactivate a timer. Upon successful (ret >= 0) + * exit the timer is not queued and the handler is not running on any CPU. * - * del_timer_sync() is slow and complicated because it copes with timer - * handlers which re-arm the timer (periodic timers). If the timer handler - * is known to not do this (a single shot timer) then use - * del_singleshot_timer_sync() instead. + * It must not be called from interrupt contexts. */ -int del_timer_sync(struct timer_list *timer) +int try_to_del_timer_sync(struct timer_list *timer) { - tvec_base_t *base; - int i, ret = 0; + timer_base_t *base; + unsigned long flags; + int ret = -1; - check_timer(timer); + base = lock_timer_base(timer, &flags); -del_again: - ret += del_timer(timer); + if (base->running_timer == timer) + goto out; - for_each_online_cpu(i) { - base = &per_cpu(tvec_bases, i); - if (base->running_timer == timer) { - while (base->running_timer == timer) { - cpu_relax(); - preempt_check_resched(); - } - break; - } + ret = 0; + if (timer_pending(timer)) { + detach_timer(timer, 1); + ret = 1; } - smp_rmb(); - if (timer_pending(timer)) - goto del_again; +out: + spin_unlock_irqrestore(&base->lock, flags); return ret; } -EXPORT_SYMBOL(del_timer_sync); /*** - * del_singleshot_timer_sync - deactivate a non-recursive timer + * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated * - * This function is an optimization of del_timer_sync for the case where the - * caller can guarantee the timer does not reschedule itself in its timer - * function. + * This function only differs from del_timer() on SMP: besides deactivating + * the timer it also makes sure the handler has finished executing on other + * CPUs. * * Synchronization rules: callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from - * interrupt contexts. The caller must not hold locks which wold prevent - * completion of the timer's handler. Upon exit the timer is not queued and - * the handler is not running on any CPU. + * interrupt contexts. The caller must not hold locks which would prevent + * completion of the timer's handler. The timer's handler must not call + * add_timer_on(). Upon exit the timer is not queued and the handler is + * not running on any CPU. * * The function returns whether it has deactivated a pending timer or not. */ -int del_singleshot_timer_sync(struct timer_list *timer) +int del_timer_sync(struct timer_list *timer) { - int ret = del_timer(timer); + check_timer(timer); - if (!ret) { - ret = del_timer_sync(timer); - BUG_ON(ret); + for (;;) { + int ret = try_to_del_timer_sync(timer); + if (ret >= 0) + return ret; } - - return ret; } -EXPORT_SYMBOL(del_singleshot_timer_sync); + +EXPORT_SYMBOL(del_timer_sync); #endif static int cascade(tvec_base_t *base, tvec_t *tv, int index) @@ -415,7 +439,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) struct timer_list *tmp; tmp = list_entry(curr, struct timer_list, entry); - BUG_ON(tmp->base != base); + BUG_ON(tmp->base != &base->t_base); curr = curr->next; internal_add_timer(base, tmp); } @@ -437,7 +461,7 @@ static inline void __run_timers(tvec_base_t *base) { struct timer_list *timer; - spin_lock_irq(&base->lock); + spin_lock_irq(&base->t_base.lock); while (time_after_eq(jiffies, base->timer_jiffies)) { struct list_head work_list = LIST_HEAD_INIT(work_list); struct list_head *head = &work_list; @@ -453,8 +477,7 @@ static inline void __run_timers(tvec_base_t *base) cascade(base, &base->tv5, INDEX(3)); ++base->timer_jiffies; list_splice_init(base->tv1.vec + index, &work_list); -repeat: - if (!list_empty(head)) { + while (!list_empty(head)) { void (*fn)(unsigned long); unsigned long data; @@ -462,25 +485,26 @@ repeat: fn = timer->function; data = timer->data; - list_del(&timer->entry); set_running_timer(base, timer); - smp_wmb(); - timer->base = NULL; - spin_unlock_irq(&base->lock); + detach_timer(timer, 1); + spin_unlock_irq(&base->t_base.lock); { - u32 preempt_count = preempt_count(); + int preempt_count = preempt_count(); fn(data); if (preempt_count != preempt_count()) { - printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count()); + printk(KERN_WARNING "huh, entered %p " + "with preempt_count %08x, exited" + " with %08x?\n", + fn, preempt_count, + preempt_count()); BUG(); } } - spin_lock_irq(&base->lock); - goto repeat; + spin_lock_irq(&base->t_base.lock); } } set_running_timer(base, NULL); - spin_unlock_irq(&base->lock); + spin_unlock_irq(&base->t_base.lock); } #ifdef CONFIG_NO_IDLE_HZ @@ -499,7 +523,7 @@ unsigned long next_timer_interrupt(void) int i, j; base = &__get_cpu_var(tvec_bases); - spin_lock(&base->lock); + spin_lock(&base->t_base.lock); expires = base->timer_jiffies + (LONG_MAX >> 1); list = 0; @@ -547,7 +571,7 @@ found: expires = nte->expires; } } - spin_unlock(&base->lock); + spin_unlock(&base->t_base.lock); return expires; } #endif @@ -1286,9 +1310,9 @@ static void __devinit init_timers_cpu(int cpu) { int j; tvec_base_t *base; - + base = &per_cpu(tvec_bases, cpu); - spin_lock_init(&base->lock); + spin_lock_init(&base->t_base.lock); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); INIT_LIST_HEAD(base->tv4.vec + j); @@ -1302,22 +1326,16 @@ static void __devinit init_timers_cpu(int cpu) } #ifdef CONFIG_HOTPLUG_CPU -static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head) +static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) { struct timer_list *timer; while (!list_empty(head)) { timer = list_entry(head->next, struct timer_list, entry); - /* We're locking backwards from __mod_timer order here, - beware deadlock. */ - if (!spin_trylock(&timer->lock)) - return 0; - list_del(&timer->entry); + detach_timer(timer, 0); + timer->base = &new_base->t_base; internal_add_timer(new_base, timer); - timer->base = new_base; - spin_unlock(&timer->lock); } - return 1; } static void __devinit migrate_timers(int cpu) @@ -1331,39 +1349,24 @@ static void __devinit migrate_timers(int cpu) new_base = &get_cpu_var(tvec_bases); local_irq_disable(); -again: - /* Prevent deadlocks via ordering by old_base < new_base. */ - if (old_base < new_base) { - spin_lock(&new_base->lock); - spin_lock(&old_base->lock); - } else { - spin_lock(&old_base->lock); - spin_lock(&new_base->lock); - } + spin_lock(&new_base->t_base.lock); + spin_lock(&old_base->t_base.lock); - if (old_base->running_timer) + if (old_base->t_base.running_timer) BUG(); for (i = 0; i < TVR_SIZE; i++) - if (!migrate_timer_list(new_base, old_base->tv1.vec + i)) - goto unlock_again; - for (i = 0; i < TVN_SIZE; i++) - if (!migrate_timer_list(new_base, old_base->tv2.vec + i) - || !migrate_timer_list(new_base, old_base->tv3.vec + i) - || !migrate_timer_list(new_base, old_base->tv4.vec + i) - || !migrate_timer_list(new_base, old_base->tv5.vec + i)) - goto unlock_again; - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); + migrate_timer_list(new_base, old_base->tv1.vec + i); + for (i = 0; i < TVN_SIZE; i++) { + migrate_timer_list(new_base, old_base->tv2.vec + i); + migrate_timer_list(new_base, old_base->tv3.vec + i); + migrate_timer_list(new_base, old_base->tv4.vec + i); + migrate_timer_list(new_base, old_base->tv5.vec + i); + } + + spin_unlock(&old_base->t_base.lock); + spin_unlock(&new_base->t_base.lock); local_irq_enable(); put_cpu_var(tvec_bases); - return; - -unlock_again: - /* Avoid deadlock with __mod_timer, by backing off. */ - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - cpu_relax(); - goto again; } #endif /* CONFIG_HOTPLUG_CPU */ @@ -1594,7 +1597,7 @@ void msleep(unsigned int msecs) EXPORT_SYMBOL(msleep); /** - * msleep_interruptible - sleep waiting for waitqueue interruptions + * msleep_interruptible - sleep waiting for signals * @msecs: Time in milliseconds to sleep for */ unsigned long msleep_interruptible(unsigned int msecs) |