diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Kconfig.preempt | 65 | ||||
-rw-r--r-- | kernel/Makefile | 2 | ||||
-rw-r--r-- | kernel/cpu.c | 14 | ||||
-rw-r--r-- | kernel/cpuset.c | 89 | ||||
-rw-r--r-- | kernel/crash_dump.c | 52 | ||||
-rw-r--r-- | kernel/fork.c | 21 | ||||
-rw-r--r-- | kernel/kexec.c | 1063 | ||||
-rw-r--r-- | kernel/ksysfs.c | 13 | ||||
-rw-r--r-- | kernel/panic.c | 23 | ||||
-rw-r--r-- | kernel/power/Kconfig | 8 | ||||
-rw-r--r-- | kernel/power/Makefile | 6 | ||||
-rw-r--r-- | kernel/power/disk.c | 35 | ||||
-rw-r--r-- | kernel/power/main.c | 16 | ||||
-rw-r--r-- | kernel/power/smp.c | 89 | ||||
-rw-r--r-- | kernel/power/swsusp.c | 93 | ||||
-rw-r--r-- | kernel/printk.c | 3 | ||||
-rw-r--r-- | kernel/resource.c | 2 | ||||
-rw-r--r-- | kernel/sched.c | 1045 | ||||
-rw-r--r-- | kernel/sys.c | 23 | ||||
-rw-r--r-- | kernel/sys_ni.c | 2 | ||||
-rw-r--r-- | kernel/sysctl.c | 3 | ||||
-rw-r--r-- | kernel/timer.c | 2 |
22 files changed, 2090 insertions, 579 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt new file mode 100644 index 0000000..0b46a5d --- /dev/null +++ b/kernel/Kconfig.preempt @@ -0,0 +1,65 @@ + +choice + prompt "Preemption Model" + default PREEMPT_NONE + +config PREEMPT_NONE + bool "No Forced Preemption (Server)" + help + This is the traditional Linux preemption model, geared towards + throughput. It will still provide good latencies most of the + time, but there are no guarantees and occasional longer delays + are possible. + + Select this option if you are building a kernel for a server or + scientific/computation system, or if you want to maximize the + raw processing power of the kernel, irrespective of scheduling + latencies. + +config PREEMPT_VOLUNTARY + bool "Voluntary Kernel Preemption (Desktop)" + help + This option reduces the latency of the kernel by adding more + "explicit preemption points" to the kernel code. These new + preemption points have been selected to reduce the maximum + latency of rescheduling, providing faster application reactions, + at the cost of slighly lower throughput. + + This allows reaction to interactive events by allowing a + low priority process to voluntarily preempt itself even if it + is in kernel mode executing a system call. This allows + applications to run more 'smoothly' even when the system is + under load. + + Select this if you are building a kernel for a desktop system. + +config PREEMPT + bool "Preemptible Kernel (Low-Latency Desktop)" + help + This option reduces the latency of the kernel by making + all kernel code (that is not executing in a critical section) + preemptible. This allows reaction to interactive events by + permitting a low priority process to be preempted involuntarily + even if it is in kernel mode executing a system call and would + otherwise not be about to reach a natural preemption point. + This allows applications to run more 'smoothly' even when the + system is under load, at the cost of slighly lower throughput + and a slight runtime overhead to kernel code. + + Select this if you are building a kernel for a desktop or + embedded system with latency requirements in the milliseconds + range. + +endchoice + +config PREEMPT_BKL + bool "Preempt The Big Kernel Lock" + depends on SMP || PREEMPT + default y + help + This option reduces the latency of the kernel by making the + big kernel lock preemptible. + + Say Y here if you are building a kernel for a desktop system. + Say N if you are unsure. + diff --git a/kernel/Makefile b/kernel/Makefile index b01d26f..cb05cd0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_IKCONFIG) += configs.o @@ -27,6 +28,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SECCOMP) += seccomp.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) diff --git a/kernel/cpu.c b/kernel/cpu.c index 628f4cc..53d8263 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -63,19 +63,15 @@ static int take_cpu_down(void *unused) { int err; - /* Take offline: makes arch_cpu_down somewhat easier. */ - cpu_clear(smp_processor_id(), cpu_online_map); - /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) - cpu_set(smp_processor_id(), cpu_online_map); - else - /* Force idle task to run as soon as we yield: it should - immediately notice cpu is offline and die quickly. */ - sched_idle_next(); + return err; - return err; + /* Force idle task to run as soon as we yield: it should + immediately notice cpu is offline and die quickly. */ + sched_idle_next(); + return 0; } int cpu_down(unsigned int cpu) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 79dd929..984c0bf 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -595,10 +595,62 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) return 0; } +/* + * For a given cpuset cur, partition the system as follows + * a. All cpus in the parent cpuset's cpus_allowed that are not part of any + * exclusive child cpusets + * b. All cpus in the current cpuset's cpus_allowed that are not part of any + * exclusive child cpusets + * Build these two partitions by calling partition_sched_domains + * + * Call with cpuset_sem held. May nest a call to the + * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. + */ +static void update_cpu_domains(struct cpuset *cur) +{ + struct cpuset *c, *par = cur->parent; + cpumask_t pspan, cspan; + + if (par == NULL || cpus_empty(cur->cpus_allowed)) + return; + + /* + * Get all cpus from parent's cpus_allowed not part of exclusive + * children + */ + pspan = par->cpus_allowed; + list_for_each_entry(c, &par->children, sibling) { + if (is_cpu_exclusive(c)) + cpus_andnot(pspan, pspan, c->cpus_allowed); + } + if (is_removed(cur) || !is_cpu_exclusive(cur)) { + cpus_or(pspan, pspan, cur->cpus_allowed); + if (cpus_equal(pspan, cur->cpus_allowed)) + return; + cspan = CPU_MASK_NONE; + } else { + if (cpus_empty(pspan)) + return; + cspan = cur->cpus_allowed; + /* + * Get all cpus from current cpuset's cpus_allowed not part + * of exclusive children + */ + list_for_each_entry(c, &cur->children, sibling) { + if (is_cpu_exclusive(c)) + cpus_andnot(cspan, cspan, c->cpus_allowed); + } + } + + lock_cpu_hotplug(); + partition_sched_domains(&pspan, &cspan); + unlock_cpu_hotplug(); +} + static int update_cpumask(struct cpuset *cs, char *buf) { struct cpuset trialcs; - int retval; + int retval, cpus_unchanged; trialcs = *cs; retval = cpulist_parse(buf, trialcs.cpus_allowed); @@ -608,9 +660,13 @@ static int update_cpumask(struct cpuset *cs, char *buf) if (cpus_empty(trialcs.cpus_allowed)) return -ENOSPC; retval = validate_change(cs, &trialcs); - if (retval == 0) - cs->cpus_allowed = trialcs.cpus_allowed; - return retval; + if (retval < 0) + return retval; + cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); + cs->cpus_allowed = trialcs.cpus_allowed; + if (is_cpu_exclusive(cs) && !cpus_unchanged) + update_cpu_domains(cs); + return 0; } static int update_nodemask(struct cpuset *cs, char *buf) @@ -646,7 +702,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) { int turning_on; struct cpuset trialcs; - int err; + int err, cpu_exclusive_changed; turning_on = (simple_strtoul(buf, NULL, 10) != 0); @@ -657,13 +713,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) clear_bit(bit, &trialcs.flags); err = validate_change(cs, &trialcs); - if (err == 0) { - if (turning_on) - set_bit(bit, &cs->flags); - else - clear_bit(bit, &cs->flags); - } - return err; + if (err < 0) + return err; + cpu_exclusive_changed = + (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); + if (turning_on) + set_bit(bit, &cs->flags); + else + clear_bit(bit, &cs->flags); + + if (cpu_exclusive_changed) + update_cpu_domains(cs); + return 0; } static int attach_task(struct cpuset *cs, char *buf) @@ -1309,12 +1370,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) up(&cpuset_sem); return -EBUSY; } - spin_lock(&cs->dentry->d_lock); parent = cs->parent; set_bit(CS_REMOVED, &cs->flags); + if (is_cpu_exclusive(cs)) + update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ if (list_empty(&parent->children)) check_for_release(parent); + spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); cs->dentry = NULL; spin_unlock(&d->d_lock); diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c new file mode 100644 index 0000000..459ba49 --- /dev/null +++ b/kernel/crash_dump.c @@ -0,0 +1,52 @@ +/* + * kernel/crash_dump.c - Memory preserving reboot related code. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. All rights reserved + */ + +#include <linux/smp_lock.h> +#include <linux/errno.h> +#include <linux/proc_fs.h> +#include <linux/bootmem.h> +#include <linux/highmem.h> +#include <linux/crash_dump.h> + +#include <asm/io.h> +#include <asm/uaccess.h> + +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + +/* + * Copy a page from "oldmem". For this page, there is no pte mapped + * in the current kernel. We stitch up a pte, similar to kmap_atomic. + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, int userbuf) +{ + void *page, *vaddr; + + if (!csize) + return 0; + + page = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!page) + return -ENOMEM; + + vaddr = kmap_atomic_pfn(pfn, KM_PTE0); + copy_page(page, vaddr); + kunmap_atomic(vaddr, KM_PTE0); + + if (userbuf) { + if (copy_to_user(buf, (page + offset), csize)) { + kfree(page); + return -EFAULT; + } + } else { + memcpy(buf, (page + offset), csize); + } + + kfree(page); + return csize; +} diff --git a/kernel/fork.c b/kernel/fork.c index a28d11e..2c78068 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1003,9 +1003,6 @@ static task_t *copy_process(unsigned long clone_flags, p->pdeath_signal = 0; p->exit_state = 0; - /* Perform scheduler related setup */ - sched_fork(p); - /* * Ok, make it visible to the rest of the system. * We dont wake it up yet. @@ -1014,18 +1011,24 @@ static task_t *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->ptrace_children); INIT_LIST_HEAD(&p->ptrace_list); + /* Perform scheduler related setup. Assign this task to a CPU. */ + sched_fork(p, clone_flags); + /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); /* - * The task hasn't been attached yet, so cpus_allowed mask cannot - * have changed. The cpus_allowed mask of the parent may have - * changed after it was copied first time, and it may then move to - * another CPU - so we re-copy it here and set the child's CPU to - * the parent's CPU. This avoids alot of nasty races. + * The task hasn't been attached yet, so its cpus_allowed mask will + * not be changed, nor will its assigned CPU. + * + * The cpus_allowed mask of the parent may have changed after it was + * copied first time - so re-copy it here, then check the child's CPU + * to ensure it is on a valid CPU (and if not, just force it back to + * parent's CPU). This avoids alot of nasty races. */ p->cpus_allowed = current->cpus_allowed; - set_task_cpu(p, smp_processor_id()); + if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed))) + set_task_cpu(p, smp_processor_id()); /* * Check for pending SIGKILL! The new thread should not be allowed diff --git a/kernel/kexec.c b/kernel/kexec.c new file mode 100644 index 0000000..7843548 --- /dev/null +++ b/kernel/kexec.c @@ -0,0 +1,1063 @@ +/* + * kexec.c - kexec system call + * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/mm.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/kexec.h> +#include <linux/spinlock.h> +#include <linux/list.h> +#include <linux/highmem.h> +#include <linux/syscalls.h> +#include <linux/reboot.h> +#include <linux/syscalls.h> +#include <linux/ioport.h> +#include <linux/hardirq.h> + +#include <asm/page.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/system.h> +#include <asm/semaphore.h> + +/* Location of the reserved area for the crash kernel */ +struct resource crashk_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +int kexec_should_crash(struct task_struct *p) +{ + if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops) + return 1; + return 0; +} + +/* + * When kexec transitions to the new kernel there is a one-to-one + * mapping between physical and virtual addresses. On processors + * where you can disable the MMU this is trivial, and easy. For + * others it is still a simple predictable page table to setup. + * + * In that environment kexec copies the new kernel to its final + * resting place. This means I can only support memory whose + * physical address can fit in an unsigned long. In particular + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. + * If the assembly stub has more restrictive requirements + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be + * defined more restrictively in <asm/kexec.h>. + * + * The code for the transition from the current kernel to the + * the new kernel is placed in the control_code_buffer, whose size + * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single + * page of memory is necessary, but some architectures require more. + * Because this memory must be identity mapped in the transition from + * virtual to physical addresses it must live in the range + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily + * modifiable. + * + * The assembly stub in the control code buffer is passed a linked list + * of descriptor pages detailing the source pages of the new kernel, + * and the destination addresses of those source pages. As this data + * structure is not used in the context of the current OS, it must + * be self-contained. + * + * The code has been made to work with highmem pages and will use a + * destination page in its final resting place (if it happens + * to allocate it). The end product of this is that most of the + * physical address space, and most of RAM can be used. + * + * Future directions include: + * - allocating a page table with the control code buffer identity + * mapped, to simplify machine_kexec and make kexec_on_panic more + * reliable. + */ + +/* + * KIMAGE_NO_DEST is an impossible destination address..., for + * allocating pages whose destination address we do not care about. + */ +#define KIMAGE_NO_DEST (-1UL) + +static int kimage_is_destination_range(struct kimage *image, + unsigned long start, unsigned long end); +static struct page *kimage_alloc_page(struct kimage *image, + unsigned int gfp_mask, + unsigned long dest); + +static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment __user *segments) +{ + size_t segment_bytes; + struct kimage *image; + unsigned long i; + int result; + + /* Allocate a controlling structure */ + result = -ENOMEM; + image = kmalloc(sizeof(*image), GFP_KERNEL); + if (!image) + goto out; + + memset(image, 0, sizeof(*image)); + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + image->control_page = ~0; /* By default this does not apply */ + image->start = entry; + image->type = KEXEC_TYPE_DEFAULT; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unuseable pages */ + INIT_LIST_HEAD(&image->unuseable_pages); + + /* Read in the segments */ + image->nr_segments = nr_segments; + segment_bytes = nr_segments * sizeof(*segments); + result = copy_from_user(image->segment, segments, segment_bytes); + if (result) + goto out; + + /* + * Verify we have good destination addresses. The caller is + * responsible for making certain we don't attempt to load + * the new image into invalid or reserved areas of RAM. This + * just verifies it is an address we can use. + * + * Since the kernel does everything in page size chunks ensure + * the destination addreses are page aligned. Too many + * special cases crop of when we don't do this. The most + * insidious is getting overlapping destination addresses + * simply because addresses are changed to page size + * granularity. + */ + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) + goto out; + if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) + goto out; + } + + /* Verify our destination addresses do not overlap. + * If we alloed overlapping destination addresses + * through very weird things can happen with no + * easy explanation as one segment stops on another. + */ + result = -EINVAL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + unsigned long j; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + for (j = 0; j < i; j++) { + unsigned long pstart, pend; + pstart = image->segment[j].mem; + pend = pstart + image->segment[j].memsz; + /* Do the segments overlap ? */ + if ((mend > pstart) && (mstart < pend)) + goto out; + } + } + + /* Ensure our buffer sizes are strictly less than + * our memory sizes. This should always be the case, + * and it is easier to check up front than to be surprised + * later on. + */ + result = -EINVAL; + for (i = 0; i < nr_segments; i++) { + if (image->segment[i].bufsz > image->segment[i].memsz) + goto out; + } + + result = 0; +out: + if (result == 0) + *rimage = image; + else + kfree(image); + + return result; + +} + +static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment __user *segments) +{ + int result; + struct kimage *image; + + /* Allocate and initialize a controlling structure */ + image = NULL; + result = do_kimage_alloc(&image, entry, nr_segments, segments); + if (result) + goto out; + + *rimage = image; + + /* + * Find a location for the control code buffer, and add it + * the vector of segments so that it's pages will also be + * counted as destination pages. + */ + result = -ENOMEM; + image->control_code_page = kimage_alloc_control_pages(image, + get_order(KEXEC_CONTROL_CODE_SIZE)); + if (!image->control_code_page) { + printk(KERN_ERR "Could not allocate control_code_buffer\n"); + goto out; + } + + result = 0; + out: + if (result == 0) + *rimage = image; + else + kfree(image); + + return result; +} + +static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment *segments) +{ + int result; + struct kimage *image; + unsigned long i; + + image = NULL; + /* Verify we have a valid entry point */ + if ((entry < crashk_res.start) || (entry > crashk_res.end)) { + result = -EADDRNOTAVAIL; + goto out; + } + + /* Allocate and initialize a controlling structure */ + result = do_kimage_alloc(&image, entry, nr_segments, segments); + if (result) + goto out; + + /* Enable the special crash kernel control page + * allocation policy. + */ + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + + /* + * Verify we have good destination addresses. Normally + * the caller is responsible for making certain we don't + * attempt to load the new image into invalid or reserved + * areas of RAM. But crash kernels are preloaded into a + * reserved area of ram. We must ensure the addresses + * are in the reserved area otherwise preloading the + * kernel could corrupt things. + */ + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + /* Ensure we are within the crash kernel limits */ + if ((mstart < crashk_res.start) || (mend > crashk_res.end)) + goto out; + } + + /* + * Find a location for the control code buffer, and add + * the vector of segments so that it's pages will also be + * counted as destination pages. + */ + result = -ENOMEM; + image->control_code_page = kimage_alloc_control_pages(image, + get_order(KEXEC_CONTROL_CODE_SIZE)); + if (!image->control_code_page) { + printk(KERN_ERR "Could not allocate control_code_buffer\n"); + goto out; + } + + result = 0; +out: + if (result == 0) + *rimage = image; + else + kfree(image); + + return result; +} + +static int kimage_is_destination_range(struct kimage *image, + unsigned long start, + unsigned long end) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + if ((end > mstart) && (start < mend)) + return 1; + } + + return 0; +} + +static struct page *kimage_alloc_pages(unsigned int gfp_mask, + unsigned int order) +{ + struct page *pages; + + pages = alloc_pages(gfp_mask, order); + if (pages) { + unsigned int count, i; + pages->mapping = NULL; + pages->private = order; + count = 1 << order; + for (i = 0; i < count; i++) + SetPageReserved(pages + i); + } + + return pages; +} + +static void kimage_free_pages(struct page *page) +{ + unsigned int order, count, i; + + order = page->private; + count = 1 << order; + for (i = 0; i < count; i++) + ClearPageReserved(page + i); + __free_pages(page, order); +} + +static void kimage_free_page_list(struct list_head *list) +{ + struct list_head *pos, *next; + + list_for_each_safe(pos, next, list) { + struct page *page; + + page = list_entry(pos, struct page, lru); + list_del(&page->lru); + kimage_free_pages(page); + } +} + +static struct page *kimage_alloc_normal_control_pages(struct kimage *image, + unsigned int order) +{ + /* Control pages are special, they are the intermediaries + * that are needed while we copy the rest of the pages + * to their final resting place. As such they must + * not conflict with either the destination addresses + * or memory the kernel is already using. + * + * The only case where we really need more than one of + * these are for architectures where we cannot disable + * the MMU and must instead generate an identity mapped + * page table for all of the memory. + * + * At worst this runs in O(N) of the image size. + */ + struct list_head extra_pages; + struct page *pages; + unsigned int count; + + count = 1 << order; + INIT_LIST_HEAD(&extra_pages); + + /* Loop while I can allocate a page and the page allocated + * is a destination page. + */ + do { + unsigned long pfn, epfn, addr, eaddr; + + pages = kimage_alloc_pages(GFP_KERNEL, order); + if (!pages) + break; + pfn = page_to_pfn(pages); + epfn = pfn + count; + addr = pfn << PAGE_SHIFT; + eaddr = epfn << PAGE_SHIFT; + if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || + kimage_is_destination_range(image, addr, eaddr)) { + list_add(&pages->lru, &extra_pages); + pages = NULL; + } + } while (!pages); + + if (pages) { + /* Remember the allocated page... */ + list_add(&pages->lru, &image->control_pages); + + /* Because the page is already in it's destination + * location we will never allocate another page at + * that address. Therefore kimage_alloc_pages + * will not return it (again) and we don't need + * to give it an entry in image->segment[]. + */ + } + /* Deal with the destination pages I have inadvertently allocated. + * + * Ideally I would convert multi-page allocations into single + * page allocations, and add everyting to image->dest_pages. + * + * For now it is simpler to just free the pages. + */ + kimage_free_page_list(&extra_pages); + + return pages; +} + +static struct page *kimage_alloc_crash_control_pages(struct kimage *image, + unsigned int order) +{ + /* Control pages are special, they are the intermediaries + * that are needed while we copy the rest of the pages + * to their final resting place. As such they must + * not conflict with either the destination addresses + * or memory the kernel is already using. + * + * Control pages are also the only pags we must allocate + * when loading a crash kernel. All of the other pages + * are specified by the segments and we just memcpy + * into them directly. + * + * The only case where we really need more than one of + * these are for architectures where we cannot disable + * the MMU and must instead generate an identity mapped + * page table for all of the memory. + * + * Given the low demand this implements a very simple + * allocator that finds the first hole of the appropriate + * size in the reserved memory region, and allocates all + * of the memory up to and including the hole. + */ + unsigned long hole_start, hole_end, size; + struct page *pages; + + pages = NULL; + size = (1 << order) << PAGE_SHIFT; + hole_start = (image->control_page + (size - 1)) & ~(size - 1); + hole_end = hole_start + size - 1; + while (hole_end <= crashk_res.end) { + unsigned long i; + + if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) + break; + if (hole_end > crashk_res.end) + break; + /* See if I overlap any of the segments */ + for (i = 0; i < image->nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + if ((hole_end >= mstart) && (hole_start <= mend)) { + /* Advance the hole to the end of the segment */ + hole_start = (mend + (size - 1)) & ~(size - 1); + hole_end = hole_start + size - 1; + break; + } + } + /* If I don't overlap any segments I have found my hole! */ + if (i == image->nr_segments) { + pages = pfn_to_page(hole_start >> PAGE_SHIFT); + break; + } + } + if (pages) + image->control_page = hole_end; + + return pages; +} + + +struct page *kimage_alloc_control_pages(struct kimage *image, + unsigned int order) +{ + struct page *pages = NULL; + + switch (image->type) { + case KEXEC_TYPE_DEFAULT: + pages = kimage_alloc_normal_control_pages(image, order); + break; + case KEXEC_TYPE_CRASH: + pages = kimage_alloc_crash_control_pages(image, order); + break; + } + + return pages; +} + +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) +{ + if (*image->entry != 0) + image->entry++; + + if (image->entry == image->last_entry) { + kimage_entry_t *ind_page; + struct page *page; + + page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); + if (!page) + return -ENOMEM; + + ind_page = page_address(page); + *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + image->entry = ind_page; + image->last_entry = ind_page + + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); + } + *image->entry = entry; + image->entry++; + *image->entry = 0; + + return 0; +} + +static int kimage_set_destination(struct kimage *image, + unsigned long destination) +{ + int result; + + destination &= PAGE_MASK; + result = kimage_add_entry(image, destination | IND_DESTINATION); + if (result == 0) + image->destination = destination; + + return result; +} + + +static int kimage_add_page(struct kimage *image, unsigned long page) +{ + int result; + + page &= PAGE_MASK; + result = kimage_add_entry(image, page | IND_SOURCE); + if (result == 0) + image->destination += PAGE_SIZE; + + return result; +} + + +static void kimage_free_extra_pages(struct kimage *image) +{ + /* Walk through and free any extra destination pages I may have */ + kimage_free_page_list(&image->dest_pages); + + /* Walk through and free any unuseable pages I have cached */ + kimage_free_page_list(&image->unuseable_pages); + +} +static int kimage_terminate(struct kimage *image) +{ + if (*image->entry != 0) + image->entry++; + + *image->entry = IND_DONE; + + return 0; +} + +#define for_each_kimage_entry(image, ptr, entry) \ + for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ + ptr = (entry & IND_INDIRECTION)? \ + phys_to_virt((entry & PAGE_MASK)): ptr +1) + +static void kimage_free_entry(kimage_entry_t entry) +{ + struct page *page; + + page = pfn_to_page(entry >> PAGE_SHIFT); + kimage_free_pages(page); +} + +static void kimage_free(struct kimage *image) +{ + kimage_entry_t *ptr, entry; + kimage_entry_t ind = 0; + + if (!image) + return; + + kimage_free_extra_pages(image); + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_INDIRECTION) { + /* Free the previous indirection page */ + if (ind & IND_INDIRECTION) + kimage_free_entry(ind); + /* Save this indirection page until we are + * done with it. + */ + ind = entry; + } + else if (entry & IND_SOURCE) + kimage_free_entry(entry); + } + /* Free the final indirection page */ + if (ind & IND_INDIRECTION) + kimage_free_entry(ind); + + /* Handle any machine specific cleanup */ + machine_kexec_cleanup(image); + + /* Free the kexec control pages... */ + kimage_free_page_list(&image->control_pages); + kfree(image); +} + +static kimage_entry_t *kimage_dst_used(struct kimage *image, + unsigned long page) +{ + kimage_entry_t *ptr, entry; + unsigned long destination = 0; + + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_DESTINATION) + destination = entry & PAGE_MASK; + else if (entry & IND_SOURCE) { + if (page == destination) + return ptr; + destination += PAGE_SIZE; + } + } + + return 0; +} + +static struct page *kimage_alloc_page(struct kimage *image, + unsigned int gfp_mask, + unsigned long destination) +{ + /* + * Here we implement safeguards to ensure that a source page + * is not copied to its destination page before the data on + * the destination page is no longer useful. + * + * To do this we maintain the invariant that a source page is + * either its own destination page, or it is not a + * destination page at all. + * + * That is slightly stronger than required, but the proof + * that no problems will not occur is trivial, and the + * implementation is simply to verify. + * + * When allocating all pages normally this algorithm will run + * in O(N) time, but in the worst case it will run in O(N^2) + * time. If the runtime is a problem the data structures can + * be fixed. + */ + struct page *page; + unsigned long addr; + + /* + * Walk through the list of destination pages, and see if I + * have a match. + */ + list_for_each_entry(page, &image->dest_pages, lru) { + addr = page_to_pfn(page) << PAGE_SHIFT; + if (addr == destination) { + list_del(&page->lru); + return page; + } + } + page = NULL; + while (1) { + kimage_entry_t *old; + + /* Allocate a page, if we run out of memory give up */ + page = kimage_alloc_pages(gfp_mask, 0); + if (!page) + return 0; + /* If the page cannot be used file it away */ + if (page_to_pfn(page) > + (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { + list_add(&page->lru, &image->unuseable_pages); + continue; + } + addr = page_to_pfn(page) << PAGE_SHIFT; + + /* If it is the destination page we want use it */ + if (addr == destination) + break; + + /* If the page is not a destination page use it */ + if (!kimage_is_destination_range(image, addr, + addr + PAGE_SIZE)) + break; + + /* + * I know that the page is someones destination page. + * See if there is already a source page for this + * destination page. And if so swap the source pages. + */ + old = kimage_dst_used(image, addr); + if (old) { + /* If so move it */ + unsigned long old_addr; + struct page *old_page; + + old_addr = *old & PAGE_MASK; + old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + copy_highpage(page, old_page); + *old = addr | (*old & ~PAGE_MASK); + + /* The old page I have found cannot be a + * destination page, so return it. + */ + addr = old_addr; + page = old_page; + break; + } + else { + /* Place the page on the destination list I + * will use it later. + */ + list_add(&page->lru, &image->dest_pages); + } + } + + return page; +} + +static int kimage_load_normal_segment(struct kimage *image, + struct kexec_segment *segment) +{ + unsigned long maddr; + unsigned long ubytes, mbytes; + int result; + unsigned char *buf; + + result = 0; + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + + result = kimage_set_destination(image, maddr); + if (result < 0) + goto out; + + while (mbytes) { + struct page *page; + char *ptr; + size_t uchunk, mchunk; + + page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); + if (page == 0) { + result = -ENOMEM; + goto out; + } + result = kimage_add_page(image, page_to_pfn(page) + << PAGE_SHIFT); + if (result < 0) + goto out; + + ptr = kmap(page); + /* Start with a clear page */ + memset(ptr, 0, PAGE_SIZE); + ptr += maddr & ~PAGE_MASK; + mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); + if (mchunk > mbytes) + mchunk = mbytes; + + uchunk = mchunk; + if (uchunk > ubytes) + uchunk = ubytes; + + result = copy_from_user(ptr, buf, uchunk); + kunmap(page); + if (result) { + result = (result < 0) ? result : -EIO; + goto out; + } + ubytes -= uchunk; + maddr += mchunk; + buf += mchunk; + mbytes -= mchunk; + } +out: + return result; +} + +static int kimage_load_crash_segment(struct kimage *image, + struct kexec_segment *segment) +{ + /* For crash dumps kernels we simply copy the data from + * user space to it's destination. + * We do things a page at a time for the sake of kmap. + */ + unsigned long maddr; + unsigned long ubytes, mbytes; + int result; + unsigned char *buf; + + result = 0; + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + while (mbytes) { + struct page *page; + char *ptr; + size_t uchunk, mchunk; + + page = pfn_to_page(maddr >> PAGE_SHIFT); + if (page == 0) { + result = -ENOMEM; + goto out; + } + ptr = kmap(page); + ptr += maddr & ~PAGE_MASK; + mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); + if (mchunk > mbytes) + mchunk = mbytes; + + uchunk = mchunk; + if (uchunk > ubytes) { + uchunk = ubytes; + /* Zero the trailing part of the page */ + memset(ptr + uchunk, 0, mchunk - uchunk); + } + result = copy_from_user(ptr, buf, uchunk); + kunmap(page); + if (result) { + result = (result < 0) ? result : -EIO; + goto out; + } + ubytes -= uchunk; + maddr += mchunk; + buf += mchunk; + mbytes -= mchunk; + } +out: + return result; +} + +static int kimage_load_segment(struct kimage *image, + struct kexec_segment *segment) +{ + int result = -ENOMEM; + + switch (image->type) { + case KEXEC_TYPE_DEFAULT: + result = kimage_load_normal_segment(image, segment); + break; + case KEXEC_TYPE_CRASH: + result = kimage_load_crash_segment(image, segment); + break; + } + + return result; +} + +/* + * Exec Kernel system call: for obvious reasons only root may call it. + * + * This call breaks up into three pieces. + * - A generic part which loads the new kernel from the current + * address space, and very carefully places the data in the + * allocated pages. + * + * - A generic part that interacts with the kernel and tells all of + * the devices to shut down. Preventing on-going dmas, and placing + * the devices in a consistent state so a later kernel can + * reinitialize them. + * + * - A machine specific part that includes the syscall number + * and the copies the image to it's final destination. And + * jumps into the image at entry. + * + * kexec does not sync, or unmount filesystems so if you need + * that to happen you need to do that yourself. + */ +struct kimage *kexec_image = NULL; +static struct kimage *kexec_crash_image = NULL; +/* + * A home grown binary mutex. + * Nothing can wait so this mutex is safe to use + * in interrupt context :) + */ +static int kexec_lock = 0; + +asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, + struct kexec_segment __user *segments, + unsigned long flags) +{ + struct kimage **dest_image, *image; + int locked; + int result; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT)) + return -EPERM; + + /* + * Verify we have a legal set of flags + * This leaves us room for future extensions. + */ + if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) + return -EINVAL; + + /* Verify we are on the appropriate architecture */ + if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && + ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) + return -EINVAL; + + /* Put an artificial cap on the number + * of segments passed to kexec_load. + */ + if (nr_segments > KEXEC_SEGMENT_MAX) + return -EINVAL; + + image = NULL; + result = 0; + + /* Because we write directly to the reserved memory + * region when loading crash kernels we need a mutex here to + * prevent multiple crash kernels from attempting to load + * simultaneously, and to prevent a crash kernel from loading + * over the top of a in use crash kernel. + * + * KISS: always take the mutex. + */ + locked = xchg(&kexec_lock, 1); + if (locked) + return -EBUSY; + + dest_image = &kexec_image; + if (flags & KEXEC_ON_CRASH) + dest_image = &kexec_crash_image; + if (nr_segments > 0) { + unsigned long i; + + /* Loading another kernel to reboot into */ + if ((flags & KEXEC_ON_CRASH) == 0) + result = kimage_normal_alloc(&image, entry, + nr_segments, segments); + /* Loading another kernel to switch to if this one crashes */ + else if (flags & KEXEC_ON_CRASH) { + /* Free any current crash dump kernel before + * we corrupt it. + */ + kimage_free(xchg(&kexec_crash_image, NULL)); + result = kimage_crash_alloc(&image, entry, + nr_segments, segments); + } + if (result) + goto out; + + result = machine_kexec_prepare(image); + if (result) + goto out; + + for (i = 0; i < nr_segments; i++) { + result = kimage_load_segment(image, &image->segment[i]); + if (result) + goto out; + } + result = kimage_terminate(image); + if (result) + goto out; + } + /* Install the new kernel, and Uninstall the old */ + image = xchg(dest_image, image); + +out: + xchg(&kexec_lock, 0); /* Release the mutex */ + kimage_free(image); + + return result; +} + +#ifdef CONFIG_COMPAT +asmlinkage long compat_sys_kexec_load(unsigned long entry, + unsigned long nr_segments, + struct compat_kexec_segment __user *segments, + unsigned long flags) +{ + struct compat_kexec_segment in; + struct kexec_segment out, __user *ksegments; + unsigned long i, result; + + /* Don't allow clients that don't understand the native + * architecture to do anything. + */ + if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) + return -EINVAL; + + if (nr_segments > KEXEC_SEGMENT_MAX) + return -EINVAL; + + ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); + for (i=0; i < nr_segments; i++) { + result = copy_from_user(&in, &segments[i], sizeof(in)); + if (result) + return -EFAULT; + + out.buf = compat_ptr(in.buf); + out.bufsz = in.bufsz; + out.mem = in.mem; + out.memsz = in.memsz; + + result = copy_to_user(&ksegments[i], &out, sizeof(out)); + if (result) + return -EFAULT; + } + + return sys_kexec_load(entry, nr_segments, ksegments, flags); +} +#endif + +void crash_kexec(struct pt_regs *regs) +{ + struct kimage *image; + int locked; + + + /* Take the kexec_lock here to prevent sys_kexec_load + * running on one cpu from replacing the crash kernel + * we are using after a panic on a different cpu. + * + * If the crash kernel was not located in a fixed area + * of memory the xchg(&kexec_crash_image) would be + * sufficient. But since I reuse the memory... + */ + locked = xchg(&kexec_lock, 1); + if (!locked) { + image = xchg(&kexec_crash_image, NULL); + if (image) { + machine_crash_shutdown(regs); + machine_kexec(image); + } + xchg(&kexec_lock, 0); + } +} diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 1f064a6..015fb69 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -30,6 +30,16 @@ static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page) KERNEL_ATTR_RO(hotplug_seqnum); #endif +#ifdef CONFIG_KEXEC +#include <asm/kexec.h> + +static ssize_t crash_notes_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%p\n", (void *)crash_notes); +} +KERNEL_ATTR_RO(crash_notes); +#endif + decl_subsys(kernel, NULL, NULL); EXPORT_SYMBOL_GPL(kernel_subsys); @@ -37,6 +47,9 @@ static struct attribute * kernel_attrs[] = { #ifdef CONFIG_HOTPLUG &hotplug_seqnum_attr.attr, #endif +#ifdef CONFIG_KEXEC + &crash_notes_attr.attr, +#endif NULL }; diff --git a/kernel/panic.c b/kernel/panic.c index 081f746..74ba5f3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -18,6 +18,7 @@ #include <linux/sysrq.h> #include <linux/interrupt.h> #include <linux/nmi.h> +#include <linux/kexec.h> int panic_timeout; int panic_on_oops; @@ -63,6 +64,13 @@ NORET_TYPE void panic(const char * fmt, ...) unsigned long caller = (unsigned long) __builtin_return_address(0); #endif + /* + * It's possible to come here directly from a panic-assertion and not + * have preempt disabled. Some functions called from here want + * preempt to be disabled. No point enabling it later though... + */ + preempt_disable(); + bust_spinlocks(1); va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); @@ -70,7 +78,19 @@ NORET_TYPE void panic(const char * fmt, ...) printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); bust_spinlocks(0); + /* + * If we have crashed and we have a crash kernel loaded let it handle + * everything else. + * Do we want to call this before we try to display a message? + */ + crash_kexec(NULL); + #ifdef CONFIG_SMP + /* + * Note smp_send_stop is the usual smp shutdown function, which + * unfortunately means it may not be hardened to work in a panic + * situation. + */ smp_send_stop(); #endif @@ -79,8 +99,7 @@ NORET_TYPE void panic(const char * fmt, ...) if (!panic_blink) panic_blink = no_blink; - if (panic_timeout > 0) - { + if (panic_timeout > 0) { /* * Delay timeout seconds before rebooting the machine. * We can't use the "normal" timers since we just panicked.. diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 696387f..2c7121d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -27,8 +27,8 @@ config PM_DEBUG like suspend support. config SOFTWARE_SUSPEND - bool "Software Suspend (EXPERIMENTAL)" - depends on EXPERIMENTAL && PM && SWAP + bool "Software Suspend" + depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP)) ---help--- Enable the possibility of suspending the machine. It doesn't need APM. @@ -72,3 +72,7 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. +config SUSPEND_SMP + bool + depends on HOTPLUG_CPU && X86 && PM + default y diff --git a/kernel/power/Makefile b/kernel/power/Makefile index fbdc634..2f438d0 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -3,9 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y) EXTRA_CFLAGS += -DDEBUG endif -swsusp-smp-$(CONFIG_SMP) += smp.o - obj-y := main.o process.o console.o pm.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o $(swsusp-smp-y) disk.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o + +obj-$(CONFIG_SUSPEND_SMP) += smp.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 02b6764..fb8de63 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -117,8 +117,8 @@ static void finish(void) { device_resume(); platform_finish(); - enable_nonboot_cpus(); thaw_processes(); + enable_nonboot_cpus(); pm_restore_console(); } @@ -131,28 +131,35 @@ static int prepare_processes(void) sys_sync(); + disable_nonboot_cpus(); + if (freeze_processes()) { error = -EBUSY; - return error; + goto thaw; } if (pm_disk_mode == PM_DISK_PLATFORM) { if (pm_ops && pm_ops->prepare) { if ((error = pm_ops->prepare(PM_SUSPEND_DISK))) - return error; + goto thaw; } } /* Free memory before shutting down devices. */ free_some_memory(); - return 0; +thaw: + thaw_processes(); + enable_nonboot_cpus(); + pm_restore_console(); + return error; } static void unprepare_processes(void) { - enable_nonboot_cpus(); + platform_finish(); thaw_processes(); + enable_nonboot_cpus(); pm_restore_console(); } @@ -160,15 +167,9 @@ static int prepare_devices(void) { int error; - disable_nonboot_cpus(); - if ((error = device_suspend(PMSG_FREEZE))) { + if ((error = device_suspend(PMSG_FREEZE))) printk("Some devices failed to suspend\n"); - platform_finish(); - enable_nonboot_cpus(); - return error; - } - - return 0; + return error; } /** @@ -185,9 +186,9 @@ int pm_suspend_disk(void) int error; error = prepare_processes(); - if (!error) { - error = prepare_devices(); - } + if (error) + return error; + error = prepare_devices(); if (error) { unprepare_processes(); @@ -250,7 +251,7 @@ static int software_resume(void) if ((error = prepare_processes())) { swsusp_close(); - goto Cleanup; + goto Done; } pr_debug("PM: Reading swsusp image.\n"); diff --git a/kernel/power/main.c b/kernel/power/main.c index 4cdebc9..c94cb9e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -55,6 +55,13 @@ static int suspend_prepare(suspend_state_t state) pm_prepare_console(); + disable_nonboot_cpus(); + + if (num_online_cpus() != 1) { + error = -EPERM; + goto Enable_cpu; + } + if (freeze_processes()) { error = -EAGAIN; goto Thaw; @@ -75,6 +82,8 @@ static int suspend_prepare(suspend_state_t state) pm_ops->finish(state); Thaw: thaw_processes(); + Enable_cpu: + enable_nonboot_cpus(); pm_restore_console(); return error; } @@ -113,6 +122,7 @@ static void suspend_finish(suspend_state_t state) if (pm_ops && pm_ops->finish) pm_ops->finish(state); thaw_processes(); + enable_nonboot_cpus(); pm_restore_console(); } @@ -150,12 +160,6 @@ static int enter_state(suspend_state_t state) goto Unlock; } - /* Suspend is hard to get right on SMP. */ - if (num_online_cpus() != 1) { - error = -EPERM; - goto Unlock; - } - pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); if ((error = suspend_prepare(state))) goto Unlock; diff --git a/kernel/power/smp.c b/kernel/power/smp.c index 457c230..bbe2307 100644 --- a/kernel/power/smp.c +++ b/kernel/power/smp.c @@ -13,73 +13,52 @@ #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/module.h> +#include <linux/cpu.h> #include <asm/atomic.h> #include <asm/tlbflush.h> -static atomic_t cpu_counter, freeze; - - -static void smp_pause(void * data) -{ - struct saved_context ctxt; - __save_processor_state(&ctxt); - printk("Sleeping in:\n"); - dump_stack(); - atomic_inc(&cpu_counter); - while (atomic_read(&freeze)) { - /* FIXME: restore takes place at random piece inside this. - This should probably be written in assembly, and - preserve general-purpose registers, too - - What about stack? We may need to move to new stack here. - - This should better be ran with interrupts disabled. - */ - cpu_relax(); - barrier(); - } - atomic_dec(&cpu_counter); - __restore_processor_state(&ctxt); -} - -static cpumask_t oldmask; +/* This is protected by pm_sem semaphore */ +static cpumask_t frozen_cpus; void disable_nonboot_cpus(void) { - oldmask = current->cpus_allowed; - set_cpus_allowed(current, cpumask_of_cpu(0)); - printk("Freezing CPUs (at %d)", raw_smp_processor_id()); - current->state = TASK_INTERRUPTIBLE; - schedule_timeout(HZ); - printk("..."); - BUG_ON(raw_smp_processor_id() != 0); - - /* FIXME: for this to work, all the CPUs must be running - * "idle" thread (or we deadlock). Is that guaranteed? */ + int cpu, error; - atomic_set(&cpu_counter, 0); - atomic_set(&freeze, 1); - smp_call_function(smp_pause, NULL, 0, 0); - while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) { - cpu_relax(); - barrier(); + error = 0; + cpus_clear(frozen_cpus); + printk("Freezing cpus ...\n"); + for_each_online_cpu(cpu) { + if (cpu == 0) + continue; + error = cpu_down(cpu); + if (!error) { + cpu_set(cpu, frozen_cpus); + printk("CPU%d is down\n", cpu); + continue; + } + printk("Error taking cpu %d down: %d\n", cpu, error); } - printk("ok\n"); + BUG_ON(smp_processor_id() != 0); + if (error) + panic("cpus not sleeping"); } void enable_nonboot_cpus(void) { - printk("Restarting CPUs"); - atomic_set(&freeze, 0); - while (atomic_read(&cpu_counter)) { - cpu_relax(); - barrier(); - } - printk("..."); - set_cpus_allowed(current, oldmask); - schedule(); - printk("ok\n"); + int cpu, error; + printk("Thawing cpus ...\n"); + for_each_cpu_mask(cpu, frozen_cpus) { + error = smp_prepare_cpu(cpu); + if (!error) + error = cpu_up(cpu); + if (!error) { + printk("CPU%d is up\n", cpu); + continue; + } + printk("Error taking cpu %d up: %d\n", cpu, error); + panic("Not enough cpus"); + } + cpus_clear(frozen_cpus); } - diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 53f9f87..c285fc5 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -10,12 +10,12 @@ * This file is released under the GPLv2. * * I'd like to thank the following people for their work: - * + * * Pavel Machek <pavel@ucw.cz>: * Modifications, defectiveness pointing, being with me at the very beginning, * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. * - * Steve Doddi <dirk@loth.demon.co.uk>: + * Steve Doddi <dirk@loth.demon.co.uk>: * Support the possibility of hardware state restoring. * * Raph <grey.havens@earthling.net>: @@ -84,11 +84,11 @@ extern char resume_file[]; static unsigned int nr_copy_pages __nosavedata = 0; /* Suspend pagedir is allocated before final copy, therefore it - must be freed after resume + must be freed after resume Warning: this is evil. There are actually two pagedirs at time of resume. One is "pagedir_save", which is empty frame allocated at - time of suspend, that must be freed. Second is "pagedir_nosave", + time of suspend, that must be freed. Second is "pagedir_nosave", allocated at time of resume, that travels through memory not to collide with anything. @@ -132,7 +132,7 @@ static int mark_swapfiles(swp_entry_t prev) { int error; - rw_swap_page_sync(READ, + rw_swap_page_sync(READ, swp_entry(root_swap, 0), virt_to_page((unsigned long)&swsusp_header)); if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || @@ -140,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev) memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); memcpy(swsusp_header.sig,SWSUSP_SIG, 10); swsusp_header.swsusp_info = prev; - error = rw_swap_page_sync(WRITE, + error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), virt_to_page((unsigned long) &swsusp_header)); @@ -174,22 +174,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info) static int swsusp_swap_check(void) /* This is called before saving image */ { int i, len; - + len=strlen(resume_file); root_swap = 0xFFFF; - + swap_list_lock(); - for(i=0; i<MAX_SWAPFILES; i++) { + for (i=0; i<MAX_SWAPFILES; i++) { if (swap_info[i].flags == 0) { swapfile_used[i]=SWAPFILE_UNUSED; } else { - if(!len) { + if (!len) { printk(KERN_WARNING "resume= option should be used to set suspend device" ); - if(root_swap == 0xFFFF) { + if (root_swap == 0xFFFF) { swapfile_used[i] = SWAPFILE_SUSPEND; root_swap = i; } else - swapfile_used[i] = SWAPFILE_IGNORED; + swapfile_used[i] = SWAPFILE_IGNORED; } else { /* we ignore all swap devices that are not the resume_file */ if (is_resume_device(&swap_info[i])) { @@ -209,15 +209,15 @@ static int swsusp_swap_check(void) /* This is called before saving image */ * This is called after saving image so modification * will be lost after resume... and that's what we want. * we make the device unusable. A new call to - * lock_swapdevices can unlock the devices. + * lock_swapdevices can unlock the devices. */ static void lock_swapdevices(void) { int i; swap_list_lock(); - for(i = 0; i< MAX_SWAPFILES; i++) - if(swapfile_used[i] == SWAPFILE_IGNORED) { + for (i = 0; i< MAX_SWAPFILES; i++) + if (swapfile_used[i] == SWAPFILE_IGNORED) { swap_info[i].flags ^= 0xFF; } swap_list_unlock(); @@ -229,7 +229,7 @@ static void lock_swapdevices(void) * @loc: Place to store the entry we used. * * Allocate a new swap entry and 'sync' it. Note we discard -EIO - * errors. That is an artifact left over from swsusp. It did not + * errors. That is an artifact left over from swsusp. It did not * check the return of rw_swap_page_sync() at all, since most pages * written back to swap would return -EIO. * This is a partial improvement, since we will at least return other @@ -241,7 +241,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc) int error = 0; entry = get_swap_page(); - if (swp_offset(entry) && + if (swp_offset(entry) && swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr)); @@ -257,7 +257,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc) /** * data_free - Free the swap entries used by the saved image. * - * Walk the list of used swap entries and free each one. + * Walk the list of used swap entries and free each one. * This is only used for cleanup when suspend fails. */ static void data_free(void) @@ -290,7 +290,7 @@ static int data_write(void) mod = 1; printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); - for_each_pbe(p, pagedir_nosave) { + for_each_pbe (p, pagedir_nosave) { if (!(i%mod)) printk( "\b\b\b\b%3d%%", i / mod ); if ((error = write_page(p->address, &(p->swap_address)))) @@ -335,7 +335,7 @@ static int close_swap(void) dump_info(); error = write_page((unsigned long)&swsusp_info, &entry); - if (!error) { + if (!error) { printk( "S" ); error = mark_swapfiles(entry); printk( "|\n" ); @@ -370,7 +370,7 @@ static int write_pagedir(void) struct pbe * pbe; printk( "Writing pagedir..."); - for_each_pb_page(pbe, pagedir_nosave) { + for_each_pb_page (pbe, pagedir_nosave) { if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) return error; } @@ -472,7 +472,7 @@ static int save_highmem(void) int res = 0; pr_debug("swsusp: Saving Highmem\n"); - for_each_zone(zone) { + for_each_zone (zone) { if (is_highmem(zone)) res = save_highmem_zone(zone); if (res) @@ -547,7 +547,7 @@ static void count_data_pages(void) nr_copy_pages = 0; - for_each_zone(zone) { + for_each_zone (zone) { if (is_highmem(zone)) continue; mark_free_pages(zone); @@ -562,9 +562,9 @@ static void copy_data_pages(void) struct zone *zone; unsigned long zone_pfn; struct pbe * pbe = pagedir_nosave; - + pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); - for_each_zone(zone) { + for_each_zone (zone) { if (is_highmem(zone)) continue; mark_free_pages(zone); @@ -702,7 +702,7 @@ static void free_image_pages(void) { struct pbe * p; - for_each_pbe(p, pagedir_save) { + for_each_pbe (p, pagedir_save) { if (p->address) { ClearPageNosave(virt_to_page(p->address)); free_page(p->address); @@ -719,7 +719,7 @@ static int alloc_image_pages(void) { struct pbe * p; - for_each_pbe(p, pagedir_save) { + for_each_pbe (p, pagedir_save) { p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); if (!p->address) return -ENOMEM; @@ -740,7 +740,7 @@ void swsusp_free(void) /** * enough_free_mem - Make sure we enough free memory to snapshot. * - * Returns TRUE or FALSE after checking the number of available + * Returns TRUE or FALSE after checking the number of available * free pages. */ @@ -758,11 +758,11 @@ static int enough_free_mem(void) /** * enough_swap - Make sure we have enough swap to save the image. * - * Returns TRUE or FALSE after checking the total amount of swap + * Returns TRUE or FALSE after checking the total amount of swap * space avaiable. * * FIXME: si_swapinfo(&i) returns all swap devices information. - * We should only consider resume_device. + * We should only consider resume_device. */ static int enough_swap(void) @@ -781,18 +781,18 @@ static int swsusp_alloc(void) { int error; + pagedir_nosave = NULL; + nr_copy_pages = calc_nr(nr_copy_pages); + pr_debug("suspend: (pages needed: %d + %d free: %d)\n", nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); - pagedir_nosave = NULL; if (!enough_free_mem()) return -ENOMEM; if (!enough_swap()) return -ENOSPC; - nr_copy_pages = calc_nr(nr_copy_pages); - if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); return -ENOMEM; @@ -827,8 +827,8 @@ static int suspend_prepare_image(void) error = swsusp_alloc(); if (error) return error; - - /* During allocating of suspend pagedir, new cold pages may appear. + + /* During allocating of suspend pagedir, new cold pages may appear. * Kill them. */ drain_local_pages(); @@ -929,21 +929,6 @@ int swsusp_resume(void) return error; } -/* More restore stuff */ - -/* - * Returns true if given address/order collides with any orig_address - */ -static int does_collide_order(unsigned long addr, int order) -{ - int i; - - for (i=0; i < (1<<order); i++) - if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE))) - return 1; - return 0; -} - /** * On resume, for storing the PBE list and the image, * we can only use memory pages that do not conflict with the pages @@ -973,7 +958,7 @@ static unsigned long get_usable_page(unsigned gfp_mask) unsigned long m; m = get_zeroed_page(gfp_mask); - while (does_collide_order(m, 0)) { + while (!PageNosaveFree(virt_to_page(m))) { eat_page((void *)m); m = get_zeroed_page(gfp_mask); if (!m) @@ -1045,7 +1030,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) /* Set page flags */ - for_each_zone(zone) { + for_each_zone (zone) { for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) SetPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); @@ -1061,7 +1046,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) /* Relocate colliding pages */ for_each_pb_page (pbpage, pblist) { - if (does_collide_order((unsigned long)pbpage, 0)) { + if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) { m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); if (!m) { error = -ENOMEM; @@ -1193,8 +1178,10 @@ static const char * sanity_check(void) return "version"; if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) return "machine"; +#if 0 if(swsusp_info.cpus != num_online_cpus()) return "number of cpus"; +#endif return NULL; } diff --git a/kernel/printk.c b/kernel/printk.c index 3a442bf..5092397 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -588,8 +588,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) log_level_unknown = 1; } - if (!cpu_online(smp_processor_id()) && - system_state != SYSTEM_RUNNING) { + if (!cpu_online(smp_processor_id())) { /* * Some console drivers may assume that per-cpu resources have * been allocated. So don't allow them to be called by this diff --git a/kernel/resource.c b/kernel/resource.c index 52f696f..26967e0 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -263,7 +263,7 @@ static int find_resource(struct resource *root, struct resource *new, new->start = min; if (new->end > max) new->end = max; - new->start = (new->start + align - 1) & ~(align - 1); + new->start = ALIGN(new->start, align); if (alignf) alignf(alignf_data, new, size, align); if (new->start < new->end && new->end - new->start >= size - 1) { diff --git a/kernel/sched.c b/kernel/sched.c index 6fa9ea4..a07cff9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -166,7 +166,7 @@ #define SCALE_PRIO(x, prio) \ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) -static inline unsigned int task_timeslice(task_t *p) +static unsigned int task_timeslice(task_t *p) { if (p->static_prio < NICE_TO_PRIO(0)) return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); @@ -206,7 +206,7 @@ struct runqueue { */ unsigned long nr_running; #ifdef CONFIG_SMP - unsigned long cpu_load; + unsigned long cpu_load[3]; #endif unsigned long long nr_switches; @@ -260,22 +260,86 @@ struct runqueue { static DEFINE_PER_CPU(struct runqueue, runqueues); +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ #define for_each_domain(cpu, domain) \ - for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) +for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -/* - * Default context-switch locking: - */ #ifndef prepare_arch_switch -# define prepare_arch_switch(rq, next) do { } while (0) -# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -# define task_running(rq, p) ((rq)->curr == (p)) +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline int task_running(runqueue_t *rq, task_t *p) +{ + return rq->curr == p; +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ + spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline int task_running(runqueue_t *rq, task_t *p) +{ +#ifdef CONFIG_SMP + return p->oncpu; +#else + return rq->curr == p; +#endif +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->oncpu = 1; +#endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + spin_unlock_irq(&rq->lock); +#else + spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->oncpu = 0; #endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* * task_rq_lock - lock the runqueue a given task resides on and disable @@ -309,7 +373,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) * bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 11 +#define SCHEDSTAT_VERSION 12 static int show_schedstat(struct seq_file *seq, void *v) { @@ -338,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v) #ifdef CONFIG_SMP /* domain-specific stats */ + preempt_disable(); for_each_domain(cpu, sd) { enum idle_type itype; char mask_str[NR_CPUS]; @@ -356,11 +421,13 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->lb_nobusyq[itype], sd->lb_nobusyg[itype]); } - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", sd->alb_cnt, sd->alb_failed, sd->alb_pushed, - sd->sbe_pushed, sd->sbe_attempts, + sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); } + preempt_enable(); #endif } return 0; @@ -414,22 +481,6 @@ static inline runqueue_t *this_rq_lock(void) return rq; } -#ifdef CONFIG_SCHED_SMT -static int cpu_and_siblings_are_idle(int cpu) -{ - int sib; - for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { - if (idle_cpu(sib)) - continue; - return 0; - } - - return 1; -} -#else -#define cpu_and_siblings_are_idle(A) idle_cpu(A) -#endif - #ifdef CONFIG_SCHEDSTATS /* * Called when a process is dequeued from the active array and given @@ -622,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) rq->nr_running++; } -static void recalc_task_prio(task_t *p, unsigned long long now) +static int recalc_task_prio(task_t *p, unsigned long long now) { /* Caller must always ensure 'now >= p->timestamp' */ unsigned long long __sleep_time = now - p->timestamp; @@ -681,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now) } } - p->prio = effective_prio(p); + return effective_prio(p); } /* @@ -704,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) } #endif - recalc_task_prio(p, now); + p->prio = recalc_task_prio(p, now); /* * This checks to make sure it's not an uninterruptible task @@ -782,22 +833,12 @@ inline int task_curr(const task_t *p) } #ifdef CONFIG_SMP -enum request_type { - REQ_MOVE_TASK, - REQ_SET_DOMAIN, -}; - typedef struct { struct list_head list; - enum request_type type; - /* For REQ_MOVE_TASK */ task_t *task; int dest_cpu; - /* For REQ_SET_DOMAIN */ - struct sched_domain *sd; - struct completion done; } migration_req_t; @@ -819,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) } init_completion(&req->done); - req->type = REQ_MOVE_TASK; req->task = p; req->dest_cpu = dest_cpu; list_add(&req->list, &rq->migration_queue); @@ -886,26 +926,154 @@ void kick_process(task_t *p) * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static inline unsigned long source_load(int cpu) +static inline unsigned long source_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) + return load_now; - return min(rq->cpu_load, load_now); + return min(rq->cpu_load[type-1], load_now); } /* * Return a high guess at the load of a migration-target cpu */ -static inline unsigned long target_load(int cpu) +static inline unsigned long target_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) + return load_now; - return max(rq->cpu_load, load_now); + return max(rq->cpu_load[type-1], load_now); } -#endif +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +{ + struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + unsigned long min_load = ULONG_MAX, this_load = 0; + int load_idx = sd->forkexec_idx; + int imbalance = 100 + (sd->imbalance_pct-100)/2; + + do { + unsigned long load, avg_load; + int local_group; + int i; + + local_group = cpu_isset(this_cpu, group->cpumask); + /* XXX: put a cpus allowed check */ + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu_mask(i, group->cpumask) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = source_load(i, load_idx); + else + load = target_load(i, load_idx); + + avg_load += load; + } + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + } else if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + group = group->next; + } while (group != sd->groups); + + if (!idlest || 100*this_load < imbalance*min_load) + return NULL; + return idlest; +} + +/* + * find_idlest_queue - find the idlest runqueue among the cpus in group. + */ +static int find_idlest_cpu(struct sched_group *group, int this_cpu) +{ + unsigned long load, min_load = ULONG_MAX; + int idlest = -1; + int i; + + for_each_cpu_mask(i, group->cpumask) { + load = source_load(i, 0); + + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + idlest = i; + } + } + + return idlest; +} + +/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. + * + * Balance, ie. select the least loaded group. + * + * Returns the target CPU number, or the same CPU if no balancing is needed. + * + * preempt must be disabled. + */ +static int sched_balance_self(int cpu, int flag) +{ + struct task_struct *t = current; + struct sched_domain *tmp, *sd = NULL; + + for_each_domain(cpu, tmp) + if (tmp->flags & flag) + sd = tmp; + + while (sd) { + cpumask_t span; + struct sched_group *group; + int new_cpu; + int weight; + + span = sd->span; + group = find_idlest_group(sd, t, cpu); + if (!group) + goto nextlevel; + + new_cpu = find_idlest_cpu(group, cpu); + if (new_cpu == -1 || new_cpu == cpu) + goto nextlevel; + + /* Now try balancing at a lower domain level */ + cpu = new_cpu; +nextlevel: + sd = NULL; + weight = cpus_weight(span); + for_each_domain(cpu, tmp) { + if (weight <= cpus_weight(tmp->span)) + break; + if (tmp->flags & flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + return cpu; +} + +#endif /* CONFIG_SMP */ /* * wake_idle() will wake a task on an idle cpu if task->cpu is @@ -927,14 +1095,14 @@ static int wake_idle(int cpu, task_t *p) for_each_domain(cpu, sd) { if (sd->flags & SD_WAKE_IDLE) { - cpus_and(tmp, sd->span, cpu_online_map); - cpus_and(tmp, tmp, p->cpus_allowed); + cpus_and(tmp, sd->span, p->cpus_allowed); for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) return i; } } - else break; + else + break; } return cpu; } @@ -967,7 +1135,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) runqueue_t *rq; #ifdef CONFIG_SMP unsigned long load, this_load; - struct sched_domain *sd; + struct sched_domain *sd, *this_sd = NULL; int new_cpu; #endif @@ -986,70 +1154,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) if (unlikely(task_running(rq, p))) goto out_activate; -#ifdef CONFIG_SCHEDSTATS + new_cpu = cpu; + schedstat_inc(rq, ttwu_cnt); if (cpu == this_cpu) { schedstat_inc(rq, ttwu_local); - } else { - for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } + goto out_set_cpu; + } + + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_wake_remote); + this_sd = sd; + break; } } -#endif - new_cpu = cpu; - if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) goto out_set_cpu; - load = source_load(cpu); - this_load = target_load(this_cpu); - /* - * If sync wakeup then subtract the (maximum possible) effect of - * the currently running task from the load of the current CPU: + * Check for affine wakeup and passive balancing possibilities. */ - if (sync) - this_load -= SCHED_LOAD_SCALE; + if (this_sd) { + int idx = this_sd->wake_idx; + unsigned int imbalance; - /* Don't pull the task off an idle CPU to a busy one */ - if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) - goto out_set_cpu; + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - new_cpu = this_cpu; /* Wake to this CPU if we can */ + load = source_load(cpu, idx); + this_load = target_load(this_cpu, idx); - /* - * Scan domains for affine wakeup and passive balancing - * possibilities. - */ - for_each_domain(this_cpu, sd) { - unsigned int imbalance; - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; + new_cpu = this_cpu; /* Wake to this CPU if we can */ - if ((sd->flags & SD_WAKE_AFFINE) && - !task_hot(p, rq->timestamp_last_tick, sd)) { + if (this_sd->flags & SD_WAKE_AFFINE) { + unsigned long tl = this_load; /* - * This domain has SD_WAKE_AFFINE and p is cache cold - * in this domain. + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: */ - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_move_affine); + if (sync) + tl -= SCHED_LOAD_SCALE; + + if ((tl <= load && + tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || + 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. + */ + schedstat_inc(this_sd, ttwu_move_affine); goto out_set_cpu; } - } else if ((sd->flags & SD_WAKE_BALANCE) && - imbalance*this_load <= 100*load) { - /* - * This domain has SD_WAKE_BALANCE and there is - * an imbalance. - */ - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_move_balance); + } + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + if (this_sd->flags & SD_WAKE_BALANCE) { + if (imbalance*this_load <= 100*load) { + schedstat_inc(this_sd, ttwu_move_balance); goto out_set_cpu; } } @@ -1120,17 +1287,19 @@ int fastcall wake_up_state(task_t *p, unsigned int state) return try_to_wake_up(p, state, 0); } -#ifdef CONFIG_SMP -static int find_idlest_cpu(struct task_struct *p, int this_cpu, - struct sched_domain *sd); -#endif - /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. */ -void fastcall sched_fork(task_t *p) +void fastcall sched_fork(task_t *p, int clone_flags) { + int cpu = get_cpu(); + +#ifdef CONFIG_SMP + cpu = sched_balance_self(cpu, SD_BALANCE_FORK); +#endif + set_task_cpu(p, cpu); + /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. This guarantees that @@ -1140,17 +1309,14 @@ void fastcall sched_fork(task_t *p) p->state = TASK_RUNNING; INIT_LIST_HEAD(&p->run_list); p->array = NULL; - spin_lock_init(&p->switch_lock); #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + p->oncpu = 0; +#endif #ifdef CONFIG_PREEMPT - /* - * During context-switch we hold precisely one spinlock, which - * schedule_tail drops. (in the common case it's this_rq()->lock, - * but it also can be p->switch_lock.) So we compensate with a count - * of 1. Also, we want to start with kernel preemption disabled. - */ + /* Want to start with kernel preemption disabled. */ p->thread_info->preempt_count = 1; #endif /* @@ -1174,12 +1340,10 @@ void fastcall sched_fork(task_t *p) * runqueue lock is not a problem. */ current->time_slice = 1; - preempt_disable(); scheduler_tick(); - local_irq_enable(); - preempt_enable(); - } else - local_irq_enable(); + } + local_irq_enable(); + put_cpu(); } /* @@ -1196,10 +1360,9 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) runqueue_t *rq, *this_rq; rq = task_rq_lock(p, &flags); - cpu = task_cpu(p); - this_cpu = smp_processor_id(); - BUG_ON(p->state != TASK_RUNNING); + this_cpu = smp_processor_id(); + cpu = task_cpu(p); /* * We decrease the sleep average of forking parents @@ -1296,22 +1459,40 @@ void fastcall sched_exit(task_t * p) } /** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch + * @next: the task we are going to switch to. + * + * This is called with the rq lock held and interrupts off. It must + * be paired with a subsequent finish_task_switch after the context + * switch. + * + * prepare_task_switch sets up locking and calls architecture specific + * hooks. + */ +static inline void prepare_task_switch(runqueue_t *rq, task_t *next) +{ + prepare_lock_switch(rq, next); + prepare_arch_switch(next); +} + +/** * finish_task_switch - clean up after a task-switch * @prev: the thread we just switched away from. * - * We enter this with the runqueue still locked, and finish_arch_switch() - * will unlock it along with doing any other architecture-specific cleanup - * actions. + * finish_task_switch must be called after the context switch, paired + * with a prepare_task_switch call before the context switch. + * finish_task_switch will reconcile locking set up by prepare_task_switch, + * and do any other architecture-specific cleanup actions. * * Note that we may have delayed dropping an mm in context_switch(). If * so, we finish that here outside of the runqueue lock. (Doing it * with the lock held can cause deadlocks; see schedule() for * details.) */ -static inline void finish_task_switch(task_t *prev) +static inline void finish_task_switch(runqueue_t *rq, task_t *prev) __releases(rq->lock) { - runqueue_t *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; unsigned long prev_task_flags; @@ -1329,7 +1510,8 @@ static inline void finish_task_switch(task_t *prev) * Manfred Spraul <manfred@colorfullife.com> */ prev_task_flags = prev->flags; - finish_arch_switch(rq, prev); + finish_arch_switch(prev); + finish_lock_switch(rq, prev); if (mm) mmdrop(mm); if (unlikely(prev_task_flags & PF_DEAD)) @@ -1343,8 +1525,12 @@ static inline void finish_task_switch(task_t *prev) asmlinkage void schedule_tail(task_t *prev) __releases(rq->lock) { - finish_task_switch(prev); - + runqueue_t *rq = this_rq(); + finish_task_switch(rq, prev); +#ifdef __ARCH_WANT_UNLOCKED_CTXSW + /* In this case, finish_task_switch does not reenable preemption */ + preempt_enable(); +#endif if (current->set_child_tid) put_user(current->pid, current->set_child_tid); } @@ -1494,51 +1680,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) } /* - * find_idlest_cpu - find the least busy runqueue. - */ -static int find_idlest_cpu(struct task_struct *p, int this_cpu, - struct sched_domain *sd) -{ - unsigned long load, min_load, this_load; - int i, min_cpu; - cpumask_t mask; - - min_cpu = UINT_MAX; - min_load = ULONG_MAX; - - cpus_and(mask, sd->span, p->cpus_allowed); - - for_each_cpu_mask(i, mask) { - load = target_load(i); - - if (load < min_load) { - min_cpu = i; - min_load = load; - - /* break out early on an idle CPU: */ - if (!min_load) - break; - } - } - - /* add +1 to account for the new task */ - this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; - - /* - * Would with the addition of the new task to the - * current CPU there be an imbalance between this - * CPU and the idlest CPU? - * - * Use half of the balancing threshold - new-context is - * a good opportunity to balance. - */ - if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) - return min_cpu; - - return this_cpu; -} - -/* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only * allow dest_cpu, which will force the cpu onto dest_cpu. Then @@ -1571,37 +1712,16 @@ out: } /* - * sched_exec(): find the highest-level, exec-balance-capable - * domain and try to migrate the task to the least loaded CPU. - * - * execve() is a valuable balancing opportunity, because at this point - * the task has the smallest effective memory and cache footprint. + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. */ void sched_exec(void) { - struct sched_domain *tmp, *sd = NULL; int new_cpu, this_cpu = get_cpu(); - - /* Prefer the current CPU if there's only this task running */ - if (this_rq()->nr_running <= 1) - goto out; - - for_each_domain(this_cpu, tmp) - if (tmp->flags & SD_BALANCE_EXEC) - sd = tmp; - - if (sd) { - schedstat_inc(sd, sbe_attempts); - new_cpu = find_idlest_cpu(current, this_cpu, sd); - if (new_cpu != this_cpu) { - schedstat_inc(sd, sbe_pushed); - put_cpu(); - sched_migrate_task(current, new_cpu); - return; - } - } -out: + new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); put_cpu(); + if (new_cpu != this_cpu) + sched_migrate_task(current, new_cpu); } /* @@ -1632,7 +1752,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, */ static inline int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, - struct sched_domain *sd, enum idle_type idle) + struct sched_domain *sd, enum idle_type idle, int *all_pinned) { /* * We do not migrate tasks that are: @@ -1640,23 +1760,24 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (task_running(rq, p)) - return 0; if (!cpu_isset(this_cpu, p->cpus_allowed)) return 0; + *all_pinned = 0; + + if (task_running(rq, p)) + return 0; /* * Aggressive migration if: - * 1) the [whole] cpu is idle, or + * 1) task is cache cold, or * 2) too many balance attempts have failed. */ - if (cpu_and_siblings_are_idle(this_cpu) || \ - sd->nr_balance_failed > sd->cache_nice_tries) + if (sd->nr_balance_failed > sd->cache_nice_tries) return 1; if (task_hot(p, rq->timestamp_last_tick, sd)) - return 0; + return 0; return 1; } @@ -1669,16 +1790,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, */ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) + enum idle_type idle, int *all_pinned) { prio_array_t *array, *dst_array; struct list_head *head, *curr; - int idx, pulled = 0; + int idx, pulled = 0, pinned = 0; task_t *tmp; - if (max_nr_move <= 0 || busiest->nr_running <= 1) + if (max_nr_move == 0) goto out; + pinned = 1; + /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to @@ -1717,7 +1840,7 @@ skip_queue: curr = curr->prev; - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { if (curr != head) goto skip_queue; idx++; @@ -1746,6 +1869,9 @@ out: * inside pull_task(). */ schedstat_add(sd, lb_gained[idle], pulled); + + if (all_pinned) + *all_pinned = pinned; return pulled; } @@ -1760,8 +1886,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; + int load_idx; max_load = this_load = total_load = total_pwr = 0; + if (idle == NOT_IDLE) + load_idx = sd->busy_idx; + else if (idle == NEWLY_IDLE) + load_idx = sd->newidle_idx; + else + load_idx = sd->idle_idx; do { unsigned long load; @@ -1776,9 +1909,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, for_each_cpu_mask(i, group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) - load = target_load(i); + load = target_load(i, load_idx); else - load = source_load(i); + load = source_load(i, load_idx); avg_load += load; } @@ -1792,12 +1925,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (local_group) { this_load = avg_load; this = group; - goto nextgroup; } else if (avg_load > max_load) { max_load = avg_load; busiest = group; } -nextgroup: group = group->next; } while (group != sd->groups); @@ -1870,15 +2001,9 @@ nextgroup: /* Get rid of the scaling factor, rounding down as we divide */ *imbalance = *imbalance / SCHED_LOAD_SCALE; - return busiest; out_balanced: - if (busiest && (idle == NEWLY_IDLE || - (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { - *imbalance = 1; - return busiest; - } *imbalance = 0; return NULL; @@ -1894,7 +2019,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) int i; for_each_cpu_mask(i, group->cpumask) { - load = source_load(i); + load = source_load(i, 0); if (load > max_load) { max_load = load; @@ -1906,6 +2031,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) } /* + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but + * so long as it is large enough. + */ +#define MAX_PINNED_INTERVAL 512 + +/* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. * @@ -1917,7 +2048,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, struct sched_group *group; runqueue_t *busiest; unsigned long imbalance; - int nr_moved; + int nr_moved, all_pinned = 0; + int active_balance = 0; spin_lock(&this_rq->lock); schedstat_inc(sd, lb_cnt[idle]); @@ -1934,15 +2066,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, goto out_balanced; } - /* - * This should be "impossible", but since load - * balancing is inherently racy and statistical, - * it could happen in theory. - */ - if (unlikely(busiest == this_rq)) { - WARN_ON(1); - goto out_balanced; - } + BUG_ON(busiest == this_rq); schedstat_add(sd, lb_imbalance[idle], imbalance); @@ -1956,9 +2080,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, */ double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle); + imbalance, sd, idle, + &all_pinned); spin_unlock(&busiest->lock); + + /* All tasks on this runqueue were pinned by CPU affinity */ + if (unlikely(all_pinned)) + goto out_balanced; } + spin_unlock(&this_rq->lock); if (!nr_moved) { @@ -1966,36 +2096,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, sd->nr_balance_failed++; if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - int wake = 0; spin_lock(&busiest->lock); if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; - wake = 1; + active_balance = 1; } spin_unlock(&busiest->lock); - if (wake) + if (active_balance) wake_up_process(busiest->migration_thread); /* * We've kicked active balancing, reset the failure * counter. */ - sd->nr_balance_failed = sd->cache_nice_tries; + sd->nr_balance_failed = sd->cache_nice_tries+1; } - - /* - * We were unbalanced, but unsuccessful in move_tasks(), - * so bump the balance_interval to lessen the lock contention. - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval++; - } else { + } else sd->nr_balance_failed = 0; + if (likely(!active_balance)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; + } else { + /* + * If we've begun active balancing, start to back off. This + * case may not be covered by the all_pinned logic if there + * is only 1 task on the busy runqueue (because we don't call + * move_tasks). + */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; } return nr_moved; @@ -2005,8 +2137,10 @@ out_balanced: schedstat_inc(sd, lb_balanced[idle]); + sd->nr_balance_failed = 0; /* tune up the balancing interval */ - if (sd->balance_interval < sd->max_interval) + if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || + (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; return 0; @@ -2030,31 +2164,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); if (!group) { - schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); - goto out; + goto out_balanced; } busiest = find_busiest_queue(group); - if (!busiest || busiest == this_rq) { - schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + if (!busiest) { schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); - goto out; + goto out_balanced; } + BUG_ON(busiest == this_rq); + /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, NEWLY_IDLE); + imbalance, sd, NEWLY_IDLE, NULL); if (!nr_moved) schedstat_inc(sd, lb_failed[NEWLY_IDLE]); + else + sd->nr_balance_failed = 0; spin_unlock(&busiest->lock); - -out: return nr_moved; + +out_balanced: + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + sd->nr_balance_failed = 0; + return 0; } /* @@ -2086,56 +2225,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq) static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) { struct sched_domain *sd; - struct sched_group *cpu_group; runqueue_t *target_rq; - cpumask_t visited_cpus; - int cpu; + int target_cpu = busiest_rq->push_cpu; + + if (busiest_rq->nr_running <= 1) + /* no task to move */ + return; + + target_rq = cpu_rq(target_cpu); /* - * Search for suitable CPUs to push tasks to in successively higher - * domains with SD_LOAD_BALANCE set. + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. */ - visited_cpus = CPU_MASK_NONE; - for_each_domain(busiest_cpu, sd) { - if (!(sd->flags & SD_LOAD_BALANCE)) - /* no more domains to search */ - break; + BUG_ON(busiest_rq == target_rq); - schedstat_inc(sd, alb_cnt); + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); - cpu_group = sd->groups; - do { - for_each_cpu_mask(cpu, cpu_group->cpumask) { - if (busiest_rq->nr_running <= 1) - /* no more tasks left to move */ - return; - if (cpu_isset(cpu, visited_cpus)) - continue; - cpu_set(cpu, visited_cpus); - if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) - continue; - - target_rq = cpu_rq(cpu); - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - if (move_tasks(target_rq, cpu, busiest_rq, - 1, sd, SCHED_IDLE)) { - schedstat_inc(sd, alb_pushed); - } else { - schedstat_inc(sd, alb_failed); - } - spin_unlock(&target_rq->lock); - } - cpu_group = cpu_group->next; - } while (cpu_group != sd->groups); - } + /* Search for an sd spanning us and the target CPU. */ + for_each_domain(target_cpu, sd) + if ((sd->flags & SD_LOAD_BALANCE) && + cpu_isset(busiest_cpu, sd->span)) + break; + + if (unlikely(sd == NULL)) + goto out; + + schedstat_inc(sd, alb_cnt); + + if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); +out: + spin_unlock(&target_rq->lock); } /* @@ -2156,18 +2281,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, unsigned long old_load, this_load; unsigned long j = jiffies + CPU_OFFSET(this_cpu); struct sched_domain *sd; + int i; - /* Update our load */ - old_load = this_rq->cpu_load; this_load = this_rq->nr_running * SCHED_LOAD_SCALE; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (this_load > old_load) - old_load++; - this_rq->cpu_load = (old_load + this_load) / 2; + /* Update our load */ + for (i = 0; i < 3; i++) { + unsigned long new_load = this_load; + int scale = 1 << i; + old_load = this_rq->cpu_load[i]; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale-1; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; + } for_each_domain(this_cpu, sd) { unsigned long interval; @@ -2447,11 +2577,15 @@ out: #ifdef CONFIG_SCHED_SMT static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) { - struct sched_domain *sd = this_rq->sd; + struct sched_domain *tmp, *sd = NULL; cpumask_t sibling_map; int i; - if (!(sd->flags & SD_SHARE_CPUPOWER)) + for_each_domain(this_cpu, tmp) + if (tmp->flags & SD_SHARE_CPUPOWER) + sd = tmp; + + if (!sd) return; /* @@ -2492,13 +2626,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) { - struct sched_domain *sd = this_rq->sd; + struct sched_domain *tmp, *sd = NULL; cpumask_t sibling_map; prio_array_t *array; int ret = 0, i; task_t *p; - if (!(sd->flags & SD_SHARE_CPUPOWER)) + for_each_domain(this_cpu, tmp) + if (tmp->flags & SD_SHARE_CPUPOWER) + sd = tmp; + + if (!sd) return 0; /* @@ -2613,7 +2751,7 @@ asmlinkage void __sched schedule(void) struct list_head *queue; unsigned long long now; unsigned long run_time; - int cpu, idx; + int cpu, idx, new_prio; /* * Test if we are atomic. Since do_exit() needs to call into @@ -2735,9 +2873,14 @@ go_idle: delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; array = next->array; - dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); - enqueue_task(next, array); + new_prio = recalc_task_prio(next, next->timestamp + delta); + + if (unlikely(next->prio != new_prio)) { + dequeue_task(next, array); + next->prio = new_prio; + enqueue_task(next, array); + } else + requeue_task(next, array); } next->activated = 0; switch_tasks: @@ -2761,11 +2904,15 @@ switch_tasks: rq->curr = next; ++*switch_count; - prepare_arch_switch(rq, next); + prepare_task_switch(rq, next); prev = context_switch(rq, prev, next); barrier(); - - finish_task_switch(prev); + /* + * this_rq must be evaluated again because prev may have moved + * CPUs since it called schedule(), thus the 'rq' on its stack + * frame will be invalid. + */ + finish_task_switch(this_rq(), prev); } else spin_unlock_irq(&rq->lock); @@ -3384,13 +3531,24 @@ recheck: if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) return -EINVAL; - if ((policy == SCHED_FIFO || policy == SCHED_RR) && - param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur && - !capable(CAP_SYS_NICE)) - return -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_NICE)) - return -EPERM; + /* + * Allow unprivileged RT tasks to decrease priority: + */ + if (!capable(CAP_SYS_NICE)) { + /* can't change policy */ + if (policy != p->policy) + return -EPERM; + /* can't increase priority */ + if (policy != SCHED_NORMAL && + param->sched_priority > p->rt_priority && + param->sched_priority > + p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) + return -EPERM; + /* can't change other user's priorities */ + if ((current->euid != p->euid) && + (current->euid != p->uid)) + return -EPERM; + } retval = security_task_setscheduler(p, policy, param); if (retval) @@ -4030,6 +4188,9 @@ void __devinit init_idle(task_t *idle, int cpu) spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + idle->oncpu = 1; +#endif set_tsk_need_resched(idle); spin_unlock_irqrestore(&rq->lock, flags); @@ -4199,17 +4360,9 @@ static int migration_thread(void * data) req = list_entry(head->next, migration_req_t, list); list_del_init(head->next); - if (req->type == REQ_MOVE_TASK) { - spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); - local_irq_enable(); - } else if (req->type == REQ_SET_DOMAIN) { - rq->sd = req->sd; - spin_unlock_irq(&rq->lock); - } else { - spin_unlock_irq(&rq->lock); - WARN_ON(1); - } + spin_unlock(&rq->lock); + __migrate_task(req->task, cpu, req->dest_cpu); + local_irq_enable(); complete(&req->done); } @@ -4440,7 +4593,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, migration_req_t *req; req = list_entry(rq->migration_queue.next, migration_req_t, list); - BUG_ON(req->type != REQ_MOVE_TASK); list_del_init(&req->list); complete(&req->done); } @@ -4471,12 +4623,17 @@ int __init migration_init(void) #endif #ifdef CONFIG_SMP -#define SCHED_DOMAIN_DEBUG +#undef SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; + if (!sd) { + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); + return; + } + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); do { @@ -4559,37 +4716,81 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) #define sched_domain_debug(sd, cpu) {} #endif +static int sd_degenerate(struct sched_domain *sd) +{ + if (cpus_weight(sd->span) == 1) + return 1; + + /* Following flags need at least 2 groups */ + if (sd->flags & (SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC)) { + if (sd->groups != sd->groups->next) + return 0; + } + + /* Following flags don't use groups */ + if (sd->flags & (SD_WAKE_IDLE | + SD_WAKE_AFFINE | + SD_WAKE_BALANCE)) + return 0; + + return 1; +} + +static int sd_parent_degenerate(struct sched_domain *sd, + struct sched_domain *parent) +{ + unsigned long cflags = sd->flags, pflags = parent->flags; + + if (sd_degenerate(parent)) + return 1; + + if (!cpus_equal(sd->span, parent->span)) + return 0; + + /* Does parent contain flags not in child? */ + /* WAKE_BALANCE is a subset of WAKE_AFFINE */ + if (cflags & SD_WAKE_AFFINE) + pflags &= ~SD_WAKE_BALANCE; + /* Flags needing groups don't count if only 1 group in parent */ + if (parent->groups == parent->groups->next) { + pflags &= ~(SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC); + } + if (~cflags & pflags) + return 0; + + return 1; +} + /* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */ -void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) +void cpu_attach_domain(struct sched_domain *sd, int cpu) { - migration_req_t req; - unsigned long flags; runqueue_t *rq = cpu_rq(cpu); - int local = 1; - - sched_domain_debug(sd, cpu); + struct sched_domain *tmp; - spin_lock_irqsave(&rq->lock, flags); - - if (cpu == smp_processor_id() || !cpu_online(cpu)) { - rq->sd = sd; - } else { - init_completion(&req.done); - req.type = REQ_SET_DOMAIN; - req.sd = sd; - list_add(&req.list, &rq->migration_queue); - local = 0; + /* Remove the sched domains which do not contribute to scheduling. */ + for (tmp = sd; tmp; tmp = tmp->parent) { + struct sched_domain *parent = tmp->parent; + if (!parent) + break; + if (sd_parent_degenerate(tmp, parent)) + tmp->parent = parent->parent; } - spin_unlock_irqrestore(&rq->lock, flags); + if (sd && sd_degenerate(sd)) + sd = sd->parent; - if (!local) { - wake_up_process(rq->migration_thread); - wait_for_completion(&req.done); - } + sched_domain_debug(sd, cpu); + + rcu_assign_pointer(rq->sd, sd); } /* cpus with isolated domains */ @@ -4621,7 +4822,7 @@ __setup ("isolcpus=", isolated_cpu_setup); * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ -void __devinit init_sched_build_groups(struct sched_group groups[], +void init_sched_build_groups(struct sched_group groups[], cpumask_t span, int (*group_fn)(int cpu)) { struct sched_group *first = NULL, *last = NULL; @@ -4657,13 +4858,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[], #ifdef ARCH_HAS_SCHED_DOMAIN -extern void __devinit arch_init_sched_domains(void); -extern void __devinit arch_destroy_sched_domains(void); +extern void build_sched_domains(const cpumask_t *cpu_map); +extern void arch_init_sched_domains(const cpumask_t *cpu_map); +extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); #else #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; -static int __devinit cpu_to_cpu_group(int cpu) +static int cpu_to_cpu_group(int cpu) { return cpu; } @@ -4671,7 +4873,7 @@ static int __devinit cpu_to_cpu_group(int cpu) static DEFINE_PER_CPU(struct sched_domain, phys_domains); static struct sched_group sched_group_phys[NR_CPUS]; -static int __devinit cpu_to_phys_group(int cpu) +static int cpu_to_phys_group(int cpu) { #ifdef CONFIG_SCHED_SMT return first_cpu(cpu_sibling_map[cpu]); @@ -4684,7 +4886,7 @@ static int __devinit cpu_to_phys_group(int cpu) static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group sched_group_nodes[MAX_NUMNODES]; -static int __devinit cpu_to_node_group(int cpu) +static int cpu_to_node_group(int cpu) { return cpu_to_node(cpu); } @@ -4715,39 +4917,28 @@ static void check_sibling_maps(void) #endif /* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus */ -static void __devinit arch_init_sched_domains(void) +static void build_sched_domains(const cpumask_t *cpu_map) { int i; - cpumask_t cpu_default_map; - -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) - check_sibling_maps(); -#endif - /* - * Setup mask for cpus without special case scheduling requirements. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. - */ - cpus_complement(cpu_default_map, cpu_isolated_map); - cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); /* - * Set up domains. Isolated domains just stay on the dummy domain. + * Set up domains for cpus specified by the cpu_map. */ - for_each_cpu_mask(i, cpu_default_map) { + for_each_cpu_mask(i, *cpu_map) { int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); - cpus_and(nodemask, nodemask, cpu_default_map); + cpus_and(nodemask, nodemask, *cpu_map); #ifdef CONFIG_NUMA sd = &per_cpu(node_domains, i); group = cpu_to_node_group(i); *sd = SD_NODE_INIT; - sd->span = cpu_default_map; + sd->span = *cpu_map; sd->groups = &sched_group_nodes[group]; #endif @@ -4765,7 +4956,7 @@ static void __devinit arch_init_sched_domains(void) group = cpu_to_cpu_group(i); *sd = SD_SIBLING_INIT; sd->span = cpu_sibling_map[i]; - cpus_and(sd->span, sd->span, cpu_default_map); + cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; sd->groups = &sched_group_cpus[group]; #endif @@ -4775,7 +4966,7 @@ static void __devinit arch_init_sched_domains(void) /* Set up CPU (sibling) groups */ for_each_online_cpu(i) { cpumask_t this_sibling_map = cpu_sibling_map[i]; - cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); + cpus_and(this_sibling_map, this_sibling_map, *cpu_map); if (i != first_cpu(this_sibling_map)) continue; @@ -4788,7 +4979,7 @@ static void __devinit arch_init_sched_domains(void) for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); - cpus_and(nodemask, nodemask, cpu_default_map); + cpus_and(nodemask, nodemask, *cpu_map); if (cpus_empty(nodemask)) continue; @@ -4798,12 +4989,12 @@ static void __devinit arch_init_sched_domains(void) #ifdef CONFIG_NUMA /* Set up node groups */ - init_sched_build_groups(sched_group_nodes, cpu_default_map, + init_sched_build_groups(sched_group_nodes, *cpu_map, &cpu_to_node_group); #endif /* Calculate CPU power for physical packages and nodes */ - for_each_cpu_mask(i, cpu_default_map) { + for_each_cpu_mask(i, *cpu_map) { int power; struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT @@ -4827,7 +5018,7 @@ static void __devinit arch_init_sched_domains(void) } /* Attach the domains */ - for_each_online_cpu(i) { + for_each_cpu_mask(i, *cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); @@ -4837,41 +5028,85 @@ static void __devinit arch_init_sched_domains(void) cpu_attach_domain(sd, i); } } +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + */ +static void arch_init_sched_domains(cpumask_t *cpu_map) +{ + cpumask_t cpu_default_map; + +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) + check_sibling_maps(); +#endif + /* + * Setup mask for cpus without special case scheduling requirements. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. + */ + cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); + + build_sched_domains(&cpu_default_map); +} -#ifdef CONFIG_HOTPLUG_CPU -static void __devinit arch_destroy_sched_domains(void) +static void arch_destroy_sched_domains(const cpumask_t *cpu_map) { /* Do nothing: everything is statically allocated. */ } -#endif #endif /* ARCH_HAS_SCHED_DOMAIN */ /* - * Initial dummy domain for early boot and for hotplug cpu. Being static, - * it is initialized to zero, so all balancing flags are cleared which is - * what we want. + * Detach sched domains from a group of cpus specified in cpu_map + * These cpus will now be attached to the NULL domain */ -static struct sched_domain sched_domain_dummy; +static inline void detach_destroy_domains(const cpumask_t *cpu_map) +{ + int i; + + for_each_cpu_mask(i, *cpu_map) + cpu_attach_domain(NULL, i); + synchronize_sched(); + arch_destroy_sched_domains(cpu_map); +} + +/* + * Partition sched domains as specified by the cpumasks below. + * This attaches all cpus from the cpumasks to the NULL domain, + * waits for a RCU quiescent period, recalculates sched + * domain information and then attaches them back to the + * correct sched domains + * Call with hotplug lock held + */ +void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) +{ + cpumask_t change_map; + + cpus_and(*partition1, *partition1, cpu_online_map); + cpus_and(*partition2, *partition2, cpu_online_map); + cpus_or(change_map, *partition1, *partition2); + + /* Detach sched domains from all of the affected cpus */ + detach_destroy_domains(&change_map); + if (!cpus_empty(*partition1)) + build_sched_domains(partition1); + if (!cpus_empty(*partition2)) + build_sched_domains(partition2); +} #ifdef CONFIG_HOTPLUG_CPU /* * Force a reinitialization of the sched domains hierarchy. The domains * and groups cannot be updated in place without racing with the balancing - * code, so we temporarily attach all running cpus to a "dummy" domain + * code, so we temporarily attach all running cpus to the NULL domain * which will prevent rebalancing while the sched domains are recalculated. */ static int update_sched_domains(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int i; - switch (action) { case CPU_UP_PREPARE: case CPU_DOWN_PREPARE: - for_each_online_cpu(i) - cpu_attach_domain(&sched_domain_dummy, i); - arch_destroy_sched_domains(); + detach_destroy_domains(&cpu_online_map); return NOTIFY_OK; case CPU_UP_CANCELED: @@ -4887,7 +5122,7 @@ static int update_sched_domains(struct notifier_block *nfb, } /* The hotplug lock is already held by cpu_up/cpu_down */ - arch_init_sched_domains(); + arch_init_sched_domains(&cpu_online_map); return NOTIFY_OK; } @@ -4896,7 +5131,7 @@ static int update_sched_domains(struct notifier_block *nfb, void __init sched_init_smp(void) { lock_cpu_hotplug(); - arch_init_sched_domains(); + arch_init_sched_domains(&cpu_online_map); unlock_cpu_hotplug(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); @@ -4926,13 +5161,15 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); + rq->nr_running = 0; rq->active = rq->arrays; rq->expired = rq->arrays + 1; rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP - rq->sd = &sched_domain_dummy; - rq->cpu_load = 0; + rq->sd = NULL; + for (j = 1; j < 3; j++) + rq->cpu_load[j] = 0; rq->active_balance = 0; rq->push_cpu = 0; rq->migration_thread = NULL; diff --git a/kernel/sys.c b/kernel/sys.c index da24bc1..9a24374 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -16,6 +16,8 @@ #include <linux/init.h> #include <linux/highuid.h> #include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/kexec.h> #include <linux/workqueue.h> #include <linux/device.h> #include <linux/key.h> @@ -405,6 +407,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user case LINUX_REBOOT_CMD_HALT: notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); system_state = SYSTEM_HALT; + device_suspend(PMSG_SUSPEND); device_shutdown(); printk(KERN_EMERG "System halted.\n"); machine_halt(); @@ -415,6 +418,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user case LINUX_REBOOT_CMD_POWER_OFF: notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); system_state = SYSTEM_POWER_OFF; + device_suspend(PMSG_SUSPEND); device_shutdown(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); @@ -431,11 +435,30 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); system_state = SYSTEM_RESTART; + device_suspend(PMSG_FREEZE); device_shutdown(); printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer); machine_restart(buffer); break; +#ifdef CONFIG_KEXEC + case LINUX_REBOOT_CMD_KEXEC: + { + struct kimage *image; + image = xchg(&kexec_image, 0); + if (!image) { + unlock_kernel(); + return -EINVAL; + } + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); + system_state = SYSTEM_RESTART; + device_shutdown(); + printk(KERN_EMERG "Starting new kernel\n"); + machine_shutdown(); + machine_kexec(image); + break; + } +#endif #ifdef CONFIG_SOFTWARE_SUSPEND case LINUX_REBOOT_CMD_SW_SUSPEND: { diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 6f15bea..29196ce 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -18,6 +18,8 @@ cond_syscall(sys_acct); cond_syscall(sys_lookup_dcookie); cond_syscall(sys_swapon); cond_syscall(sys_swapoff); +cond_syscall(sys_kexec_load); +cond_syscall(compat_sys_kexec_load); cond_syscall(sys_init_module); cond_syscall(sys_delete_module); cond_syscall(sys_socketpair); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 24a4d12..270ee7f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1000,8 +1000,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol int error = parse_table(name, nlen, oldval, oldlenp, newval, newlen, head->ctl_table, &context); - if (context) - kfree(context); + kfree(context); if (error != -ENOTDIR) return error; tmp = tmp->next; diff --git a/kernel/timer.c b/kernel/timer.c index 51ff917..f2a1188 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1597,7 +1597,7 @@ void msleep(unsigned int msecs) EXPORT_SYMBOL(msleep); /** - * msleep_interruptible - sleep waiting for waitqueue interruptions + * msleep_interruptible - sleep waiting for signals * @msecs: Time in milliseconds to sleep for */ unsigned long msleep_interruptible(unsigned int msecs) |