diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-28 17:19:27 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-28 17:19:28 -0700 |
commit | 532bfc851a7475fb6a36c1e953aa395798a7cca7 (patch) | |
tree | a7892e5a31330dd59f31959efbe9fda1803784fd | |
parent | 0195c00244dc2e9f522475868fa278c473ba7339 (diff) | |
parent | 8da00edc1069f01c34510fa405dc15d96c090a3f (diff) | |
download | op-kernel-dev-532bfc851a7475fb6a36c1e953aa395798a7cca7.zip op-kernel-dev-532bfc851a7475fb6a36c1e953aa395798a7cca7.tar.gz |
Merge branch 'akpm' (Andrew's patch-bomb)
Merge third batch of patches from Andrew Morton:
- Some MM stragglers
- core SMP library cleanups (on_each_cpu_mask)
- Some IPI optimisations
- kexec
- kdump
- IPMI
- the radix-tree iterator work
- various other misc bits.
"That'll do for -rc1. I still have ~10 patches for 3.4, will send
those along when they've baked a little more."
* emailed from Andrew Morton <akpm@linux-foundation.org>: (35 commits)
backlight: fix typo in tosa_lcd.c
crc32: add help text for the algorithm select option
mm: move hugepage test examples to tools/testing/selftests/vm
mm: move slabinfo.c to tools/vm
mm: move page-types.c from Documentation to tools/vm
selftests/Makefile: make `run_tests' depend on `all'
selftests: launch individual selftests from the main Makefile
radix-tree: use iterators in find_get_pages* functions
radix-tree: rewrite gang lookup using iterator
radix-tree: introduce bit-optimized iterator
fs/proc/namespaces.c: prevent crash when ns_entries[] is empty
nbd: rename the nbd_device variable from lo to nbd
pidns: add reboot_pid_ns() to handle the reboot syscall
sysctl: use bitmap library functions
ipmi: use locks on watchdog timeout set on reboot
ipmi: simplify locking
ipmi: fix message handling during panics
ipmi: use a tasklet for handling received messages
ipmi: increase KCS timeouts
ipmi: decrease the IPMI message transaction time in interrupt mode
...
55 files changed, 1225 insertions, 768 deletions
diff --git a/Documentation/Makefile b/Documentation/Makefile index 9b4bc5c..30b656e 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -1,3 +1,3 @@ obj-m := DocBook/ accounting/ auxdisplay/ connector/ \ filesystems/ filesystems/configfs/ ia64/ laptops/ networking/ \ - pcmcia/ spi/ timers/ vm/ watchdog/src/ + pcmcia/ spi/ timers/ watchdog/src/ diff --git a/Documentation/vm/Makefile b/Documentation/vm/Makefile deleted file mode 100644 index 3fa4d06..0000000 --- a/Documentation/vm/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# kbuild trick to avoid linker error. Can be omitted if a module is built. -obj- := dummy.o - -# List of programs to build -hostprogs-y := page-types hugepage-mmap hugepage-shm map_hugetlb - -# Tell kbuild to always build the programs -always := $(hostprogs-y) diff --git a/arch/arm/kernel/smp_tlb.c b/arch/arm/kernel/smp_tlb.c index 7dcb352..02c5d2c 100644 --- a/arch/arm/kernel/smp_tlb.c +++ b/arch/arm/kernel/smp_tlb.c @@ -13,18 +13,6 @@ #include <asm/smp_plat.h> #include <asm/tlbflush.h> -static void on_each_cpu_mask(void (*func)(void *), void *info, int wait, - const struct cpumask *mask) -{ - preempt_disable(); - - smp_call_function_many(mask, func, info, wait); - if (cpumask_test_cpu(smp_processor_id(), mask)) - func(info); - - preempt_enable(); -} - /**********************************************************************/ /* @@ -87,7 +75,7 @@ void flush_tlb_all(void) void flush_tlb_mm(struct mm_struct *mm) { if (tlb_ops_need_broadcast()) - on_each_cpu_mask(ipi_flush_tlb_mm, mm, 1, mm_cpumask(mm)); + on_each_cpu_mask(mm_cpumask(mm), ipi_flush_tlb_mm, mm, 1); else local_flush_tlb_mm(mm); } @@ -98,7 +86,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) struct tlb_args ta; ta.ta_vma = vma; ta.ta_start = uaddr; - on_each_cpu_mask(ipi_flush_tlb_page, &ta, 1, mm_cpumask(vma->vm_mm)); + on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page, + &ta, 1); } else local_flush_tlb_page(vma, uaddr); } @@ -121,7 +110,8 @@ void flush_tlb_range(struct vm_area_struct *vma, ta.ta_vma = vma; ta.ta_start = start; ta.ta_end = end; - on_each_cpu_mask(ipi_flush_tlb_range, &ta, 1, mm_cpumask(vma->vm_mm)); + on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_range, + &ta, 1); } else local_flush_tlb_range(vma, start, end); } diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index d1cc81e..ac795d3 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -843,7 +843,7 @@ early_param("additional_cpus", setup_additional_cpus); * are onlined, or offlined. The reason is per-cpu data-structures * are allocated by some modules at init time, and dont expect to * do this dynamically on cpu arrival/departure. - * cpu_present_map on the other hand can change dynamically. + * cpu_present_mask on the other hand can change dynamically. * In case when cpu_hotplug is not compiled, then we resort to current * behaviour, which is cpu_possible == cpu_present. * - Ashok Raj @@ -921,7 +921,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) acpi_map_cpu2node(handle, cpu, physid); - cpu_set(cpu, cpu_present_map); + set_cpu_present(cpu, true); ia64_cpu_to_sapicid[cpu] = physid; acpi_processor_set_pdc(handle); @@ -940,7 +940,7 @@ EXPORT_SYMBOL(acpi_map_lsapic); int acpi_unmap_lsapic(int cpu) { ia64_cpu_to_sapicid[cpu] = -1; - cpu_clear(cpu, cpu_present_map); + set_cpu_present(cpu, false); #ifdef CONFIG_ACPI_NUMA /* NUMA specific cleanup's */ diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 08113b1..5c3e088 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -117,7 +117,7 @@ static inline int find_unassigned_vector(cpumask_t domain) cpumask_t mask; int pos, vector; - cpus_and(mask, domain, cpu_online_map); + cpumask_and(&mask, &domain, cpu_online_mask); if (cpus_empty(mask)) return -EINVAL; @@ -140,7 +140,7 @@ static int __bind_irq_vector(int irq, int vector, cpumask_t domain) BUG_ON((unsigned)irq >= NR_IRQS); BUG_ON((unsigned)vector >= IA64_NUM_VECTORS); - cpus_and(mask, domain, cpu_online_map); + cpumask_and(&mask, &domain, cpu_online_mask); if (cpus_empty(mask)) return -EINVAL; if ((cfg->vector == vector) && cpus_equal(cfg->domain, domain)) @@ -178,7 +178,7 @@ static void __clear_irq_vector(int irq) BUG_ON(cfg->vector == IRQ_VECTOR_UNASSIGNED); vector = cfg->vector; domain = cfg->domain; - cpus_and(mask, cfg->domain, cpu_online_map); + cpumask_and(&mask, &cfg->domain, cpu_online_mask); for_each_cpu_mask(cpu, mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = IRQ_VECTOR_UNASSIGNED; @@ -321,7 +321,7 @@ void irq_complete_move(unsigned irq) if (unlikely(cpu_isset(smp_processor_id(), cfg->old_domain))) return; - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cpumask_and(&cleanup_mask, &cfg->old_domain, cpu_online_mask); cfg->move_cleanup_count = cpus_weight(cleanup_mask); for_each_cpu_mask(i, cleanup_mask) platform_send_ipi(i, IA64_IRQ_MOVE_VECTOR, IA64_IPI_DM_INT, 0); diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index a39fe09..65bf9cd 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1514,7 +1514,8 @@ static void ia64_mca_cmc_poll (unsigned long dummy) { /* Trigger a CMC interrupt cascade */ - platform_send_ipi(first_cpu(cpu_online_map), IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0); + platform_send_ipi(cpumask_first(cpu_online_mask), IA64_CMCP_VECTOR, + IA64_IPI_DM_INT, 0); } /* @@ -1590,7 +1591,8 @@ static void ia64_mca_cpe_poll (unsigned long dummy) { /* Trigger a CPE interrupt cascade */ - platform_send_ipi(first_cpu(cpu_online_map), IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0); + platform_send_ipi(cpumask_first(cpu_online_mask), IA64_CPEP_VECTOR, + IA64_IPI_DM_INT, 0); } #endif /* CONFIG_ACPI */ diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c index 94e0db7..fb2f1e6 100644 --- a/arch/ia64/kernel/msi_ia64.c +++ b/arch/ia64/kernel/msi_ia64.c @@ -57,7 +57,7 @@ int ia64_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc) return irq; irq_set_msi_desc(irq, desc); - cpus_and(mask, irq_to_domain(irq), cpu_online_map); + cpumask_and(&mask, &(irq_to_domain(irq)), cpu_online_mask); dest_phys_id = cpu_physical_id(first_cpu(mask)); vector = irq_to_vector(irq); @@ -179,7 +179,7 @@ msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) unsigned dest; cpumask_t mask; - cpus_and(mask, irq_to_domain(irq), cpu_online_map); + cpumask_and(&mask, &(irq_to_domain(irq)), cpu_online_mask); dest = cpu_physical_id(first_cpu(mask)); msg->address_hi = 0; diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index c45e6dd..aaefd9b 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -485,7 +485,7 @@ mark_bsp_online (void) { #ifdef CONFIG_SMP /* If we register an early console, allow CPU 0 to printk */ - cpu_set(smp_processor_id(), cpu_online_map); + set_cpu_online(smp_processor_id(), true); #endif } diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index e27f925..9fcd4e6 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -76,7 +76,7 @@ stop_this_cpu(void) /* * Remove this CPU: */ - cpu_clear(smp_processor_id(), cpu_online_map); + set_cpu_online(smp_processor_id(), false); max_xtp(); local_irq_disable(); cpu_halt(); diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index fb7927b..796f6a5 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -400,7 +400,7 @@ smp_callin (void) /* Setup the per cpu irq handling data structures */ __setup_vector_irq(cpuid); notify_cpu_starting(cpuid); - cpu_set(cpuid, cpu_online_map); + set_cpu_online(cpuid, true); per_cpu(cpu_state, cpuid) = CPU_ONLINE; spin_unlock(&vector_lock); ipi_call_unlock_irq(); @@ -547,7 +547,7 @@ do_rest: if (!cpu_isset(cpu, cpu_callin_map)) { printk(KERN_ERR "Processor 0x%x/0x%x is stuck.\n", cpu, sapicid); ia64_cpu_to_sapicid[cpu] = -1; - cpu_clear(cpu, cpu_online_map); /* was set in smp_callin() */ + set_cpu_online(cpu, false); /* was set in smp_callin() */ return -EINVAL; } return 0; @@ -577,8 +577,7 @@ smp_build_cpu_map (void) } ia64_cpu_to_sapicid[0] = boot_cpu_id; - cpus_clear(cpu_present_map); - set_cpu_present(0, true); + init_cpu_present(cpumask_of(0)); set_cpu_possible(0, true); for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) { sapicid = smp_boot_data.cpu_phys_id[i]; @@ -605,10 +604,6 @@ smp_prepare_cpus (unsigned int max_cpus) smp_setup_percpu_timer(); - /* - * We have the boot CPU online for sure. - */ - cpu_set(0, cpu_online_map); cpu_set(0, cpu_callin_map); local_cpu_data->loops_per_jiffy = loops_per_jiffy; @@ -632,7 +627,7 @@ smp_prepare_cpus (unsigned int max_cpus) void __devinit smp_prepare_boot_cpu(void) { - cpu_set(smp_processor_id(), cpu_online_map); + set_cpu_online(smp_processor_id(), true); cpu_set(smp_processor_id(), cpu_callin_map); set_numa_node(cpu_to_node_map[smp_processor_id()]); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; @@ -689,7 +684,7 @@ int migrate_platform_irqs(unsigned int cpu) /* * Now re-target the CPEI to a different processor */ - new_cpei_cpu = any_online_cpu(cpu_online_map); + new_cpei_cpu = cpumask_any(cpu_online_mask); mask = cpumask_of(new_cpei_cpu); set_cpei_target_cpu(new_cpei_cpu); data = irq_get_irq_data(ia64_cpe_irq); @@ -731,10 +726,10 @@ int __cpu_disable(void) return -EBUSY; } - cpu_clear(cpu, cpu_online_map); + set_cpu_online(cpu, false); if (migrate_platform_irqs(cpu)) { - cpu_set(cpu, cpu_online_map); + set_cpu_online(cpu, true); return -EBUSY; } diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index 9deb21d..c64460b 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -220,7 +220,8 @@ static ssize_t show_shared_cpu_map(struct cache_info *this_leaf, char *buf) ssize_t len; cpumask_t shared_cpu_map; - cpus_and(shared_cpu_map, this_leaf->shared_cpu_map, cpu_online_map); + cpumask_and(&shared_cpu_map, + &this_leaf->shared_cpu_map, cpu_online_mask); len = cpumask_scnprintf(buf, NR_CPUS+1, &shared_cpu_map); len += sprintf(buf+len, "\n"); return len; diff --git a/arch/tile/include/asm/smp.h b/arch/tile/include/asm/smp.h index 532124a..1aa759a 100644 --- a/arch/tile/include/asm/smp.h +++ b/arch/tile/include/asm/smp.h @@ -43,10 +43,6 @@ void evaluate_message(int tag); /* Boot a secondary cpu */ void online_secondary(void); -/* Call a function on a specified set of CPUs (may include this one). */ -extern void on_each_cpu_mask(const struct cpumask *mask, - void (*func)(void *), void *info, bool wait); - /* Topology of the supervisor tile grid, and coordinates of boot processor */ extern HV_Topology smp_topology; @@ -91,9 +87,6 @@ void print_disabled_cpus(void); #else /* !CONFIG_SMP */ -#define on_each_cpu_mask(mask, func, info, wait) \ - do { if (cpumask_test_cpu(0, (mask))) func(info); } while (0) - #define smp_master_cpu 0 #define smp_height 1 #define smp_width 1 diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c index c52224d..a44e103 100644 --- a/arch/tile/kernel/smp.c +++ b/arch/tile/kernel/smp.c @@ -87,25 +87,6 @@ void send_IPI_allbutself(int tag) send_IPI_many(&mask, tag); } - -/* - * Provide smp_call_function_mask, but also run function locally - * if specified in the mask. - */ -void on_each_cpu_mask(const struct cpumask *mask, void (*func)(void *), - void *info, bool wait) -{ - int cpu = get_cpu(); - smp_call_function_many(mask, func, info, wait); - if (cpumask_test_cpu(cpu, mask)) { - local_irq_disable(); - func(info); - local_irq_enable(); - } - put_cpu(); -} - - /* * Functions related to starting/stopping cpus. */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 8cbeb72..1a29015 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -508,15 +508,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) #ifdef CONFIG_KEXEC -static inline unsigned long long get_total_mem(void) -{ - unsigned long long total; - - total = max_pfn - min_low_pfn; - - return total << PAGE_SHIFT; -} - /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. @@ -535,7 +526,7 @@ static void __init reserve_crashkernel(void) unsigned long long crash_size, crash_base; int ret; - total_mem = get_total_mem(); + total_mem = memblock_phys_mem_size(); ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index c7ba11f..061427a 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -38,7 +38,7 @@ #include <linux/nbd.h> -#define LO_MAGIC 0x68797548 +#define NBD_MAGIC 0x68797548 #ifdef NDEBUG #define dprintk(flags, fmt...) @@ -115,7 +115,7 @@ static void nbd_end_request(struct request *req) spin_unlock_irqrestore(q->queue_lock, flags); } -static void sock_shutdown(struct nbd_device *lo, int lock) +static void sock_shutdown(struct nbd_device *nbd, int lock) { /* Forcibly shutdown the socket causing all listeners * to error @@ -124,14 +124,14 @@ static void sock_shutdown(struct nbd_device *lo, int lock) * there should be a more generic interface rather than * calling socket ops directly here */ if (lock) - mutex_lock(&lo->tx_lock); - if (lo->sock) { - dev_warn(disk_to_dev(lo->disk), "shutting down socket\n"); - kernel_sock_shutdown(lo->sock, SHUT_RDWR); - lo->sock = NULL; + mutex_lock(&nbd->tx_lock); + if (nbd->sock) { + dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); + kernel_sock_shutdown(nbd->sock, SHUT_RDWR); + nbd->sock = NULL; } if (lock) - mutex_unlock(&lo->tx_lock); + mutex_unlock(&nbd->tx_lock); } static void nbd_xmit_timeout(unsigned long arg) @@ -146,17 +146,17 @@ static void nbd_xmit_timeout(unsigned long arg) /* * Send or receive packet. */ -static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, +static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, int msg_flags) { - struct socket *sock = lo->sock; + struct socket *sock = nbd->sock; int result; struct msghdr msg; struct kvec iov; sigset_t blocked, oldset; if (unlikely(!sock)) { - dev_err(disk_to_dev(lo->disk), + dev_err(disk_to_dev(nbd->disk), "Attempted %s on closed socket in sock_xmit\n", (send ? "send" : "recv")); return -EINVAL; @@ -180,15 +180,15 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, if (send) { struct timer_list ti; - if (lo->xmit_timeout) { + if (nbd->xmit_timeout) { init_timer(&ti); ti.function = nbd_xmit_timeout; ti.data = (unsigned long)current; - ti.expires = jiffies + lo->xmit_timeout; + ti.expires = jiffies + nbd->xmit_timeout; add_timer(&ti); } result = kernel_sendmsg(sock, &msg, &iov, 1, size); - if (lo->xmit_timeout) + if (nbd->xmit_timeout) del_timer_sync(&ti); } else result = kernel_recvmsg(sock, &msg, &iov, 1, size, @@ -200,7 +200,7 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, task_pid_nr(current), current->comm, dequeue_signal_lock(current, ¤t->blocked, &info)); result = -EINTR; - sock_shutdown(lo, !send); + sock_shutdown(nbd, !send); break; } @@ -218,18 +218,19 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, return result; } -static inline int sock_send_bvec(struct nbd_device *lo, struct bio_vec *bvec, +static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec, int flags) { int result; void *kaddr = kmap(bvec->bv_page); - result = sock_xmit(lo, 1, kaddr + bvec->bv_offset, bvec->bv_len, flags); + result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset, + bvec->bv_len, flags); kunmap(bvec->bv_page); return result; } /* always call with the tx_lock held */ -static int nbd_send_req(struct nbd_device *lo, struct request *req) +static int nbd_send_req(struct nbd_device *nbd, struct request *req) { int result, flags; struct nbd_request request; @@ -242,14 +243,14 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req) memcpy(request.handle, &req, sizeof(req)); dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n", - lo->disk->disk_name, req, + nbd->disk->disk_name, req, nbdcmd_to_ascii(nbd_cmd(req)), (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); - result = sock_xmit(lo, 1, &request, sizeof(request), + result = sock_xmit(nbd, 1, &request, sizeof(request), (nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0); if (result <= 0) { - dev_err(disk_to_dev(lo->disk), + dev_err(disk_to_dev(nbd->disk), "Send control failed (result %d)\n", result); goto error_out; } @@ -266,10 +267,10 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req) if (!rq_iter_last(req, iter)) flags = MSG_MORE; dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n", - lo->disk->disk_name, req, bvec->bv_len); - result = sock_send_bvec(lo, bvec, flags); + nbd->disk->disk_name, req, bvec->bv_len); + result = sock_send_bvec(nbd, bvec, flags); if (result <= 0) { - dev_err(disk_to_dev(lo->disk), + dev_err(disk_to_dev(nbd->disk), "Send data failed (result %d)\n", result); goto error_out; @@ -282,25 +283,25 @@ error_out: return -EIO; } -static struct request *nbd_find_request(struct nbd_device *lo, +static struct request *nbd_find_request(struct nbd_device *nbd, struct request *xreq) { struct request *req, *tmp; int err; - err = wait_event_interruptible(lo->active_wq, lo->active_req != xreq); + err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq); if (unlikely(err)) goto out; - spin_lock(&lo->queue_lock); - list_for_each_entry_safe(req, tmp, &lo->queue_head, queuelist) { + spin_lock(&nbd->queue_lock); + list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) { if (req != xreq) continue; list_del_init(&req->queuelist); - spin_unlock(&lo->queue_lock); + spin_unlock(&nbd->queue_lock); return req; } - spin_unlock(&lo->queue_lock); + spin_unlock(&nbd->queue_lock); err = -ENOENT; @@ -308,78 +309,78 @@ out: return ERR_PTR(err); } -static inline int sock_recv_bvec(struct nbd_device *lo, struct bio_vec *bvec) +static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec) { int result; void *kaddr = kmap(bvec->bv_page); - result = sock_xmit(lo, 0, kaddr + bvec->bv_offset, bvec->bv_len, + result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len, MSG_WAITALL); kunmap(bvec->bv_page); return result; } /* NULL returned = something went wrong, inform userspace */ -static struct request *nbd_read_stat(struct nbd_device *lo) +static struct request *nbd_read_stat(struct nbd_device *nbd) { int result; struct nbd_reply reply; struct request *req; reply.magic = 0; - result = sock_xmit(lo, 0, &reply, sizeof(reply), MSG_WAITALL); + result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL); if (result <= 0) { - dev_err(disk_to_dev(lo->disk), + dev_err(disk_to_dev(nbd->disk), "Receive control failed (result %d)\n", result); goto harderror; } if (ntohl(reply.magic) != NBD_REPLY_MAGIC) { - dev_err(disk_to_dev(lo->disk), "Wrong magic (0x%lx)\n", + dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n", (unsigned long)ntohl(reply.magic)); result = -EPROTO; goto harderror; } - req = nbd_find_request(lo, *(struct request **)reply.handle); + req = nbd_find_request(nbd, *(struct request **)reply.handle); if (IS_ERR(req)) { result = PTR_ERR(req); if (result != -ENOENT) goto harderror; - dev_err(disk_to_dev(lo->disk), "Unexpected reply (%p)\n", + dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n", reply.handle); result = -EBADR; goto harderror; } if (ntohl(reply.error)) { - dev_err(disk_to_dev(lo->disk), "Other side returned error (%d)\n", + dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", ntohl(reply.error)); req->errors++; return req; } dprintk(DBG_RX, "%s: request %p: got reply\n", - lo->disk->disk_name, req); + nbd->disk->disk_name, req); if (nbd_cmd(req) == NBD_CMD_READ) { struct req_iterator iter; struct bio_vec *bvec; rq_for_each_segment(bvec, req, iter) { - result = sock_recv_bvec(lo, bvec); + result = sock_recv_bvec(nbd, bvec); if (result <= 0) { - dev_err(disk_to_dev(lo->disk), "Receive data failed (result %d)\n", + dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", result); req->errors++; return req; } dprintk(DBG_RX, "%s: request %p: got %d bytes data\n", - lo->disk->disk_name, req, bvec->bv_len); + nbd->disk->disk_name, req, bvec->bv_len); } } return req; harderror: - lo->harderror = result; + nbd->harderror = result; return NULL; } @@ -397,48 +398,48 @@ static struct device_attribute pid_attr = { .show = pid_show, }; -static int nbd_do_it(struct nbd_device *lo) +static int nbd_do_it(struct nbd_device *nbd) { struct request *req; int ret; - BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(nbd->magic != NBD_MAGIC); - lo->pid = task_pid_nr(current); - ret = device_create_file(disk_to_dev(lo->disk), &pid_attr); + nbd->pid = task_pid_nr(current); + ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); if (ret) { - dev_err(disk_to_dev(lo->disk), "device_create_file failed!\n"); - lo->pid = 0; + dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); + nbd->pid = 0; return ret; } - while ((req = nbd_read_stat(lo)) != NULL) + while ((req = nbd_read_stat(nbd)) != NULL) nbd_end_request(req); - device_remove_file(disk_to_dev(lo->disk), &pid_attr); - lo->pid = 0; + device_remove_file(disk_to_dev(nbd->disk), &pid_attr); + nbd->pid = 0; return 0; } -static void nbd_clear_que(struct nbd_device *lo) +static void nbd_clear_que(struct nbd_device *nbd) { struct request *req; - BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(nbd->magic != NBD_MAGIC); /* - * Because we have set lo->sock to NULL under the tx_lock, all + * Because we have set nbd->sock to NULL under the tx_lock, all * modifications to the list must have completed by now. For * the same reason, the active_req must be NULL. * * As a consequence, we don't need to take the spin lock while * purging the list here. */ - BUG_ON(lo->sock); - BUG_ON(lo->active_req); + BUG_ON(nbd->sock); + BUG_ON(nbd->active_req); - while (!list_empty(&lo->queue_head)) { - req = list_entry(lo->queue_head.next, struct request, + while (!list_empty(&nbd->queue_head)) { + req = list_entry(nbd->queue_head.next, struct request, queuelist); list_del_init(&req->queuelist); req->errors++; @@ -447,7 +448,7 @@ static void nbd_clear_que(struct nbd_device *lo) } -static void nbd_handle_req(struct nbd_device *lo, struct request *req) +static void nbd_handle_req(struct nbd_device *nbd, struct request *req) { if (req->cmd_type != REQ_TYPE_FS) goto error_out; @@ -455,8 +456,8 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req) nbd_cmd(req) = NBD_CMD_READ; if (rq_data_dir(req) == WRITE) { nbd_cmd(req) = NBD_CMD_WRITE; - if (lo->flags & NBD_READ_ONLY) { - dev_err(disk_to_dev(lo->disk), + if (nbd->flags & NBD_READ_ONLY) { + dev_err(disk_to_dev(nbd->disk), "Write on read-only\n"); goto error_out; } @@ -464,29 +465,29 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req) req->errors = 0; - mutex_lock(&lo->tx_lock); - if (unlikely(!lo->sock)) { - mutex_unlock(&lo->tx_lock); - dev_err(disk_to_dev(lo->disk), + mutex_lock(&nbd->tx_lock); + if (unlikely(!nbd->sock)) { + mutex_unlock(&nbd->tx_lock); + dev_err(disk_to_dev(nbd->disk), "Attempted send on closed socket\n"); goto error_out; } - lo->active_req = req; + nbd->active_req = req; - if (nbd_send_req(lo, req) != 0) { - dev_err(disk_to_dev(lo->disk), "Request send failed\n"); + if (nbd_send_req(nbd, req) != 0) { + dev_err(disk_to_dev(nbd->disk), "Request send failed\n"); req->errors++; nbd_end_request(req); } else { - spin_lock(&lo->queue_lock); - list_add(&req->queuelist, &lo->queue_head); - spin_unlock(&lo->queue_lock); + spin_lock(&nbd->queue_lock); + list_add(&req->queuelist, &nbd->queue_head); + spin_unlock(&nbd->queue_lock); } - lo->active_req = NULL; - mutex_unlock(&lo->tx_lock); - wake_up_all(&lo->active_wq); + nbd->active_req = NULL; + mutex_unlock(&nbd->tx_lock); + wake_up_all(&nbd->active_wq); return; @@ -497,28 +498,28 @@ error_out: static int nbd_thread(void *data) { - struct nbd_device *lo = data; + struct nbd_device *nbd = data; struct request *req; set_user_nice(current, -20); - while (!kthread_should_stop() || !list_empty(&lo->waiting_queue)) { + while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { /* wait for something to do */ - wait_event_interruptible(lo->waiting_wq, + wait_event_interruptible(nbd->waiting_wq, kthread_should_stop() || - !list_empty(&lo->waiting_queue)); + !list_empty(&nbd->waiting_queue)); /* extract request */ - if (list_empty(&lo->waiting_queue)) + if (list_empty(&nbd->waiting_queue)) continue; - spin_lock_irq(&lo->queue_lock); - req = list_entry(lo->waiting_queue.next, struct request, + spin_lock_irq(&nbd->queue_lock); + req = list_entry(nbd->waiting_queue.next, struct request, queuelist); list_del_init(&req->queuelist); - spin_unlock_irq(&lo->queue_lock); + spin_unlock_irq(&nbd->queue_lock); /* handle request */ - nbd_handle_req(lo, req); + nbd_handle_req(nbd, req); } return 0; } @@ -526,7 +527,7 @@ static int nbd_thread(void *data) /* * We always wait for result of write, for now. It would be nice to make it optional * in future - * if ((rq_data_dir(req) == WRITE) && (lo->flags & NBD_WRITE_NOCHK)) + * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK)) * { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); } */ @@ -535,19 +536,19 @@ static void do_nbd_request(struct request_queue *q) struct request *req; while ((req = blk_fetch_request(q)) != NULL) { - struct nbd_device *lo; + struct nbd_device *nbd; spin_unlock_irq(q->queue_lock); dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%x)\n", req->rq_disk->disk_name, req, req->cmd_type); - lo = req->rq_disk->private_data; + nbd = req->rq_disk->private_data; - BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(nbd->magic != NBD_MAGIC); - if (unlikely(!lo->sock)) { - dev_err(disk_to_dev(lo->disk), + if (unlikely(!nbd->sock)) { + dev_err(disk_to_dev(nbd->disk), "Attempted send on closed socket\n"); req->errors++; nbd_end_request(req); @@ -555,11 +556,11 @@ static void do_nbd_request(struct request_queue *q) continue; } - spin_lock_irq(&lo->queue_lock); - list_add_tail(&req->queuelist, &lo->waiting_queue); - spin_unlock_irq(&lo->queue_lock); + spin_lock_irq(&nbd->queue_lock); + list_add_tail(&req->queuelist, &nbd->waiting_queue); + spin_unlock_irq(&nbd->queue_lock); - wake_up(&lo->waiting_wq); + wake_up(&nbd->waiting_wq); spin_lock_irq(q->queue_lock); } @@ -567,32 +568,32 @@ static void do_nbd_request(struct request_queue *q) /* Must be called with tx_lock held */ -static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, +static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) { switch (cmd) { case NBD_DISCONNECT: { struct request sreq; - dev_info(disk_to_dev(lo->disk), "NBD_DISCONNECT\n"); + dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); blk_rq_init(NULL, &sreq); sreq.cmd_type = REQ_TYPE_SPECIAL; nbd_cmd(&sreq) = NBD_CMD_DISC; - if (!lo->sock) + if (!nbd->sock) return -EINVAL; - nbd_send_req(lo, &sreq); + nbd_send_req(nbd, &sreq); return 0; } case NBD_CLEAR_SOCK: { struct file *file; - lo->sock = NULL; - file = lo->file; - lo->file = NULL; - nbd_clear_que(lo); - BUG_ON(!list_empty(&lo->queue_head)); + nbd->sock = NULL; + file = nbd->file; + nbd->file = NULL; + nbd_clear_que(nbd); + BUG_ON(!list_empty(&nbd->queue_head)); if (file) fput(file); return 0; @@ -600,14 +601,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, case NBD_SET_SOCK: { struct file *file; - if (lo->file) + if (nbd->file) return -EBUSY; file = fget(arg); if (file) { struct inode *inode = file->f_path.dentry->d_inode; if (S_ISSOCK(inode->i_mode)) { - lo->file = file; - lo->sock = SOCKET_I(inode); + nbd->file = file; + nbd->sock = SOCKET_I(inode); if (max_part > 0) bdev->bd_invalidated = 1; return 0; @@ -619,29 +620,29 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, } case NBD_SET_BLKSIZE: - lo->blksize = arg; - lo->bytesize &= ~(lo->blksize-1); - bdev->bd_inode->i_size = lo->bytesize; - set_blocksize(bdev, lo->blksize); - set_capacity(lo->disk, lo->bytesize >> 9); + nbd->blksize = arg; + nbd->bytesize &= ~(nbd->blksize-1); + bdev->bd_inode->i_size = nbd->bytesize; + set_blocksize(bdev, nbd->blksize); + set_capacity(nbd->disk, nbd->bytesize >> 9); return 0; case NBD_SET_SIZE: - lo->bytesize = arg & ~(lo->blksize-1); - bdev->bd_inode->i_size = lo->bytesize; - set_blocksize(bdev, lo->blksize); - set_capacity(lo->disk, lo->bytesize >> 9); + nbd->bytesize = arg & ~(nbd->blksize-1); + bdev->bd_inode->i_size = nbd->bytesize; + set_blocksize(bdev, nbd->blksize); + set_capacity(nbd->disk, nbd->bytesize >> 9); return 0; case NBD_SET_TIMEOUT: - lo->xmit_timeout = arg * HZ; + nbd->xmit_timeout = arg * HZ; return 0; case NBD_SET_SIZE_BLOCKS: - lo->bytesize = ((u64) arg) * lo->blksize; - bdev->bd_inode->i_size = lo->bytesize; - set_blocksize(bdev, lo->blksize); - set_capacity(lo->disk, lo->bytesize >> 9); + nbd->bytesize = ((u64) arg) * nbd->blksize; + bdev->bd_inode->i_size = nbd->bytesize; + set_blocksize(bdev, nbd->blksize); + set_capacity(nbd->disk, nbd->bytesize >> 9); return 0; case NBD_DO_IT: { @@ -649,38 +650,38 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, struct file *file; int error; - if (lo->pid) + if (nbd->pid) return -EBUSY; - if (!lo->file) + if (!nbd->file) return -EINVAL; - mutex_unlock(&lo->tx_lock); + mutex_unlock(&nbd->tx_lock); - thread = kthread_create(nbd_thread, lo, lo->disk->disk_name); + thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name); if (IS_ERR(thread)) { - mutex_lock(&lo->tx_lock); + mutex_lock(&nbd->tx_lock); return PTR_ERR(thread); } wake_up_process(thread); - error = nbd_do_it(lo); + error = nbd_do_it(nbd); kthread_stop(thread); - mutex_lock(&lo->tx_lock); + mutex_lock(&nbd->tx_lock); if (error) return error; - sock_shutdown(lo, 0); - file = lo->file; - lo->file = NULL; - nbd_clear_que(lo); - dev_warn(disk_to_dev(lo->disk), "queue cleared\n"); + sock_shutdown(nbd, 0); + file = nbd->file; + nbd->file = NULL; + nbd_clear_que(nbd); + dev_warn(disk_to_dev(nbd->disk), "queue cleared\n"); if (file) fput(file); - lo->bytesize = 0; + nbd->bytesize = 0; bdev->bd_inode->i_size = 0; - set_capacity(lo->disk, 0); + set_capacity(nbd->disk, 0); if (max_part > 0) ioctl_by_bdev(bdev, BLKRRPART, 0); - return lo->harderror; + return nbd->harderror; } case NBD_CLEAR_QUE: @@ -688,14 +689,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, * This is for compatibility only. The queue is always cleared * by NBD_DO_IT or NBD_CLEAR_SOCK. */ - BUG_ON(!lo->sock && !list_empty(&lo->queue_head)); + BUG_ON(!nbd->sock && !list_empty(&nbd->queue_head)); return 0; case NBD_PRINT_DEBUG: - dev_info(disk_to_dev(lo->disk), + dev_info(disk_to_dev(nbd->disk), "next = %p, prev = %p, head = %p\n", - lo->queue_head.next, lo->queue_head.prev, - &lo->queue_head); + nbd->queue_head.next, nbd->queue_head.prev, + &nbd->queue_head); return 0; } return -ENOTTY; @@ -704,21 +705,21 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, static int nbd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - struct nbd_device *lo = bdev->bd_disk->private_data; + struct nbd_device *nbd = bdev->bd_disk->private_data; int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(nbd->magic != NBD_MAGIC); /* Anyone capable of this syscall can do *real bad* things */ dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n", - lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg); + nbd->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg); - mutex_lock(&lo->tx_lock); - error = __nbd_ioctl(bdev, lo, cmd, arg); - mutex_unlock(&lo->tx_lock); + mutex_lock(&nbd->tx_lock); + error = __nbd_ioctl(bdev, nbd, cmd, arg); + mutex_unlock(&nbd->tx_lock); return error; } @@ -804,7 +805,7 @@ static int __init nbd_init(void) for (i = 0; i < nbds_max; i++) { struct gendisk *disk = nbd_dev[i].disk; nbd_dev[i].file = NULL; - nbd_dev[i].magic = LO_MAGIC; + nbd_dev[i].magic = NBD_MAGIC; nbd_dev[i].flags = 0; INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); spin_lock_init(&nbd_dev[i].queue_lock); diff --git a/drivers/char/ipmi/ipmi_kcs_sm.c b/drivers/char/ipmi/ipmi_kcs_sm.c index cf82fed..e53fc24 100644 --- a/drivers/char/ipmi/ipmi_kcs_sm.c +++ b/drivers/char/ipmi/ipmi_kcs_sm.c @@ -118,8 +118,8 @@ enum kcs_states { #define MAX_KCS_WRITE_SIZE IPMI_MAX_MSG_LENGTH /* Timeouts in microseconds. */ -#define IBF_RETRY_TIMEOUT 1000000 -#define OBF_RETRY_TIMEOUT 1000000 +#define IBF_RETRY_TIMEOUT 5000000 +#define OBF_RETRY_TIMEOUT 5000000 #define MAX_ERROR_RETRIES 10 #define ERROR0_OBF_WAIT_JIFFIES (2*HZ) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index c90e939..2c29942 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -45,6 +45,7 @@ #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> +#include <linux/interrupt.h> #define PFX "IPMI message handler: " @@ -52,6 +53,8 @@ static struct ipmi_recv_msg *ipmi_alloc_recv_msg(void); static int ipmi_init_msghandler(void); +static void smi_recv_tasklet(unsigned long); +static void handle_new_recv_msgs(ipmi_smi_t intf); static int initialized; @@ -354,12 +357,15 @@ struct ipmi_smi { int curr_seq; /* - * Messages that were delayed for some reason (out of memory, - * for instance), will go in here to be processed later in a - * periodic timer interrupt. + * Messages queued for delivery. If delivery fails (out of memory + * for instance), They will stay in here to be processed later in a + * periodic timer interrupt. The tasklet is for handling received + * messages directly from the handler. */ spinlock_t waiting_msgs_lock; struct list_head waiting_msgs; + atomic_t watchdog_pretimeouts_to_deliver; + struct tasklet_struct recv_tasklet; /* * The list of command receivers that are registered for commands @@ -492,6 +498,8 @@ static void clean_up_interface_data(ipmi_smi_t intf) struct cmd_rcvr *rcvr, *rcvr2; struct list_head list; + tasklet_kill(&intf->recv_tasklet); + free_smi_msg_list(&intf->waiting_msgs); free_recv_msg_list(&intf->waiting_events); @@ -2785,12 +2793,17 @@ channel_handler(ipmi_smi_t intf, struct ipmi_recv_msg *msg) return; } -void ipmi_poll_interface(ipmi_user_t user) +static void ipmi_poll(ipmi_smi_t intf) { - ipmi_smi_t intf = user->intf; - if (intf->handlers->poll) intf->handlers->poll(intf->send_info); + /* In case something came in */ + handle_new_recv_msgs(intf); +} + +void ipmi_poll_interface(ipmi_user_t user) +{ + ipmi_poll(user->intf); } EXPORT_SYMBOL(ipmi_poll_interface); @@ -2859,6 +2872,10 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers, #endif spin_lock_init(&intf->waiting_msgs_lock); INIT_LIST_HEAD(&intf->waiting_msgs); + tasklet_init(&intf->recv_tasklet, + smi_recv_tasklet, + (unsigned long) intf); + atomic_set(&intf->watchdog_pretimeouts_to_deliver, 0); spin_lock_init(&intf->events_lock); INIT_LIST_HEAD(&intf->waiting_events); intf->waiting_events_count = 0; @@ -3621,11 +3638,11 @@ static int handle_bmc_rsp(ipmi_smi_t intf, } /* - * Handle a new message. Return 1 if the message should be requeued, + * Handle a received message. Return 1 if the message should be requeued, * 0 if the message should be freed, or -1 if the message should not * be freed or requeued. */ -static int handle_new_recv_msg(ipmi_smi_t intf, +static int handle_one_recv_msg(ipmi_smi_t intf, struct ipmi_smi_msg *msg) { int requeue; @@ -3783,12 +3800,72 @@ static int handle_new_recv_msg(ipmi_smi_t intf, return requeue; } +/* + * If there are messages in the queue or pretimeouts, handle them. + */ +static void handle_new_recv_msgs(ipmi_smi_t intf) +{ + struct ipmi_smi_msg *smi_msg; + unsigned long flags = 0; + int rv; + int run_to_completion = intf->run_to_completion; + + /* See if any waiting messages need to be processed. */ + if (!run_to_completion) + spin_lock_irqsave(&intf->waiting_msgs_lock, flags); + while (!list_empty(&intf->waiting_msgs)) { + smi_msg = list_entry(intf->waiting_msgs.next, + struct ipmi_smi_msg, link); + list_del(&smi_msg->link); + if (!run_to_completion) + spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags); + rv = handle_one_recv_msg(intf, smi_msg); + if (!run_to_completion) + spin_lock_irqsave(&intf->waiting_msgs_lock, flags); + if (rv == 0) { + /* Message handled */ + ipmi_free_smi_msg(smi_msg); + } else if (rv < 0) { + /* Fatal error on the message, del but don't free. */ + } else { + /* + * To preserve message order, quit if we + * can't handle a message. + */ + list_add(&smi_msg->link, &intf->waiting_msgs); + break; + } + } + if (!run_to_completion) + spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags); + + /* + * If the pretimout count is non-zero, decrement one from it and + * deliver pretimeouts to all the users. + */ + if (atomic_add_unless(&intf->watchdog_pretimeouts_to_deliver, -1, 0)) { + ipmi_user_t user; + + rcu_read_lock(); + list_for_each_entry_rcu(user, &intf->users, link) { + if (user->handler->ipmi_watchdog_pretimeout) + user->handler->ipmi_watchdog_pretimeout( + user->handler_data); + } + rcu_read_unlock(); + } +} + +static void smi_recv_tasklet(unsigned long val) +{ + handle_new_recv_msgs((ipmi_smi_t) val); +} + /* Handle a new message from the lower layer. */ void ipmi_smi_msg_received(ipmi_smi_t intf, struct ipmi_smi_msg *msg) { unsigned long flags = 0; /* keep us warning-free. */ - int rv; int run_to_completion; @@ -3842,31 +3919,11 @@ void ipmi_smi_msg_received(ipmi_smi_t intf, run_to_completion = intf->run_to_completion; if (!run_to_completion) spin_lock_irqsave(&intf->waiting_msgs_lock, flags); - if (!list_empty(&intf->waiting_msgs)) { - list_add_tail(&msg->link, &intf->waiting_msgs); - if (!run_to_completion) - spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags); - goto out; - } + list_add_tail(&msg->link, &intf->waiting_msgs); if (!run_to_completion) spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags); - rv = handle_new_recv_msg(intf, msg); - if (rv > 0) { - /* - * Could not handle the message now, just add it to a - * list to handle later. - */ - run_to_completion = intf->run_to_completion; - if (!run_to_completion) - spin_lock_irqsave(&intf->waiting_msgs_lock, flags); - list_add_tail(&msg->link, &intf->waiting_msgs); - if (!run_to_completion) - spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags); - } else if (rv == 0) { - ipmi_free_smi_msg(msg); - } - + tasklet_schedule(&intf->recv_tasklet); out: return; } @@ -3874,16 +3931,8 @@ EXPORT_SYMBOL(ipmi_smi_msg_received); void ipmi_smi_watchdog_pretimeout(ipmi_smi_t intf) { - ipmi_user_t user; - - rcu_read_lock(); - list_for_each_entry_rcu(user, &intf->users, link) { - if (!user->handler->ipmi_watchdog_pretimeout) - continue; - - user->handler->ipmi_watchdog_pretimeout(user->handler_data); - } - rcu_read_unlock(); + atomic_set(&intf->watchdog_pretimeouts_to_deliver, 1); + tasklet_schedule(&intf->recv_tasklet); } EXPORT_SYMBOL(ipmi_smi_watchdog_pretimeout); @@ -3997,28 +4046,12 @@ static void ipmi_timeout_handler(long timeout_period) ipmi_smi_t intf; struct list_head timeouts; struct ipmi_recv_msg *msg, *msg2; - struct ipmi_smi_msg *smi_msg, *smi_msg2; unsigned long flags; int i; rcu_read_lock(); list_for_each_entry_rcu(intf, &ipmi_interfaces, link) { - /* See if any waiting messages need to be processed. */ - spin_lock_irqsave(&intf->waiting_msgs_lock, flags); - list_for_each_entry_safe(smi_msg, smi_msg2, - &intf->waiting_msgs, link) { - if (!handle_new_recv_msg(intf, smi_msg)) { - list_del(&smi_msg->link); - ipmi_free_smi_msg(smi_msg); - } else { - /* - * To preserve message order, quit if we - * can't handle a message. - */ - break; - } - } - spin_unlock_irqrestore(&intf->waiting_msgs_lock, flags); + tasklet_schedule(&intf->recv_tasklet); /* * Go through the seq table and find any messages that @@ -4172,12 +4205,48 @@ EXPORT_SYMBOL(ipmi_free_recv_msg); #ifdef CONFIG_IPMI_PANIC_EVENT +static atomic_t panic_done_count = ATOMIC_INIT(0); + static void dummy_smi_done_handler(struct ipmi_smi_msg *msg) { + atomic_dec(&panic_done_count); } static void dummy_recv_done_handler(struct ipmi_recv_msg *msg) { + atomic_dec(&panic_done_count); +} + +/* + * Inside a panic, send a message and wait for a response. + */ +static void ipmi_panic_request_and_wait(ipmi_smi_t intf, + struct ipmi_addr *addr, + struct kernel_ipmi_msg *msg) +{ + struct ipmi_smi_msg smi_msg; + struct ipmi_recv_msg recv_msg; + int rv; + + smi_msg.done = dummy_smi_done_handler; + recv_msg.done = dummy_recv_done_handler; + atomic_add(2, &panic_done_count); + rv = i_ipmi_request(NULL, + intf, + addr, + 0, + msg, + intf, + &smi_msg, + &recv_msg, + 0, + intf->channels[0].address, + intf->channels[0].lun, + 0, 1); /* Don't retry, and don't wait. */ + if (rv) + atomic_sub(2, &panic_done_count); + while (atomic_read(&panic_done_count) != 0) + ipmi_poll(intf); } #ifdef CONFIG_IPMI_PANIC_STRING @@ -4216,8 +4285,6 @@ static void send_panic_events(char *str) unsigned char data[16]; struct ipmi_system_interface_addr *si; struct ipmi_addr addr; - struct ipmi_smi_msg smi_msg; - struct ipmi_recv_msg recv_msg; si = (struct ipmi_system_interface_addr *) &addr; si->addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE; @@ -4245,9 +4312,6 @@ static void send_panic_events(char *str) data[7] = str[2]; } - smi_msg.done = dummy_smi_done_handler; - recv_msg.done = dummy_recv_done_handler; - /* For every registered interface, send the event. */ list_for_each_entry_rcu(intf, &ipmi_interfaces, link) { if (!intf->handlers) @@ -4257,18 +4321,7 @@ static void send_panic_events(char *str) intf->run_to_completion = 1; /* Send the event announcing the panic. */ intf->handlers->set_run_to_completion(intf->send_info, 1); - i_ipmi_request(NULL, - intf, - &addr, - 0, - &msg, - intf, - &smi_msg, - &recv_msg, - 0, - intf->channels[0].address, - intf->channels[0].lun, - 0, 1); /* Don't retry, and don't wait. */ + ipmi_panic_request_and_wait(intf, &addr, &msg); } #ifdef CONFIG_IPMI_PANIC_STRING @@ -4316,18 +4369,7 @@ static void send_panic_events(char *str) msg.data = NULL; msg.data_len = 0; intf->null_user_handler = device_id_fetcher; - i_ipmi_request(NULL, - intf, - &addr, - 0, - &msg, - intf, - &smi_msg, - &recv_msg, - 0, - intf->channels[0].address, - intf->channels[0].lun, - 0, 1); /* Don't retry, and don't wait. */ + ipmi_panic_request_and_wait(intf, &addr, &msg); if (intf->local_event_generator) { /* Request the event receiver from the local MC. */ @@ -4336,18 +4378,7 @@ static void send_panic_events(char *str) msg.data = NULL; msg.data_len = 0; intf->null_user_handler = event_receiver_fetcher; - i_ipmi_request(NULL, - intf, - &addr, - 0, - &msg, - intf, - &smi_msg, - &recv_msg, - 0, - intf->channels[0].address, - intf->channels[0].lun, - 0, 1); /* no retry, and no wait. */ + ipmi_panic_request_and_wait(intf, &addr, &msg); } intf->null_user_handler = NULL; @@ -4404,18 +4435,7 @@ static void send_panic_events(char *str) strncpy(data+5, p, 11); p += size; - i_ipmi_request(NULL, - intf, - &addr, - 0, - &msg, - intf, - &smi_msg, - &recv_msg, - 0, - intf->channels[0].address, - intf->channels[0].lun, - 0, 1); /* no retry, and no wait. */ + ipmi_panic_request_and_wait(intf, &addr, &msg); } } #endif /* CONFIG_IPMI_PANIC_STRING */ diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index f9fdc11..1e638ff 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -170,7 +170,6 @@ struct smi_info { struct si_sm_handlers *handlers; enum si_type si_type; spinlock_t si_lock; - spinlock_t msg_lock; struct list_head xmit_msgs; struct list_head hp_xmit_msgs; struct ipmi_smi_msg *curr_msg; @@ -319,16 +318,8 @@ static int register_xaction_notifier(struct notifier_block *nb) static void deliver_recv_msg(struct smi_info *smi_info, struct ipmi_smi_msg *msg) { - /* Deliver the message to the upper layer with the lock - released. */ - - if (smi_info->run_to_completion) { - ipmi_smi_msg_received(smi_info->intf, msg); - } else { - spin_unlock(&(smi_info->si_lock)); - ipmi_smi_msg_received(smi_info->intf, msg); - spin_lock(&(smi_info->si_lock)); - } + /* Deliver the message to the upper layer. */ + ipmi_smi_msg_received(smi_info->intf, msg); } static void return_hosed_msg(struct smi_info *smi_info, int cCode) @@ -357,13 +348,6 @@ static enum si_sm_result start_next_msg(struct smi_info *smi_info) struct timeval t; #endif - /* - * No need to save flags, we aleady have interrupts off and we - * already hold the SMI lock. - */ - if (!smi_info->run_to_completion) - spin_lock(&(smi_info->msg_lock)); - /* Pick the high priority queue first. */ if (!list_empty(&(smi_info->hp_xmit_msgs))) { entry = smi_info->hp_xmit_msgs.next; @@ -401,9 +385,6 @@ static enum si_sm_result start_next_msg(struct smi_info *smi_info) rv = SI_SM_CALL_WITHOUT_DELAY; } out: - if (!smi_info->run_to_completion) - spin_unlock(&(smi_info->msg_lock)); - return rv; } @@ -480,9 +461,7 @@ static void handle_flags(struct smi_info *smi_info) start_clear_flags(smi_info); smi_info->msg_flags &= ~WDT_PRE_TIMEOUT_INT; - spin_unlock(&(smi_info->si_lock)); ipmi_smi_watchdog_pretimeout(smi_info->intf); - spin_lock(&(smi_info->si_lock)); } else if (smi_info->msg_flags & RECEIVE_MSG_AVAIL) { /* Messages available. */ smi_info->curr_msg = ipmi_alloc_smi_msg(); @@ -888,19 +867,6 @@ static void sender(void *send_info, printk("**Enqueue: %d.%9.9d\n", t.tv_sec, t.tv_usec); #endif - /* - * last_timeout_jiffies is updated here to avoid - * smi_timeout() handler passing very large time_diff - * value to smi_event_handler() that causes - * the send command to abort. - */ - smi_info->last_timeout_jiffies = jiffies; - - mod_timer(&smi_info->si_timer, jiffies + SI_TIMEOUT_JIFFIES); - - if (smi_info->thread) - wake_up_process(smi_info->thread); - if (smi_info->run_to_completion) { /* * If we are running to completion, then throw it in @@ -923,16 +889,29 @@ static void sender(void *send_info, return; } - spin_lock_irqsave(&smi_info->msg_lock, flags); + spin_lock_irqsave(&smi_info->si_lock, flags); if (priority > 0) list_add_tail(&msg->link, &smi_info->hp_xmit_msgs); else list_add_tail(&msg->link, &smi_info->xmit_msgs); - spin_unlock_irqrestore(&smi_info->msg_lock, flags); - spin_lock_irqsave(&smi_info->si_lock, flags); - if (smi_info->si_state == SI_NORMAL && smi_info->curr_msg == NULL) + if (smi_info->si_state == SI_NORMAL && smi_info->curr_msg == NULL) { + /* + * last_timeout_jiffies is updated here to avoid + * smi_timeout() handler passing very large time_diff + * value to smi_event_handler() that causes + * the send command to abort. + */ + smi_info->last_timeout_jiffies = jiffies; + + mod_timer(&smi_info->si_timer, jiffies + SI_TIMEOUT_JIFFIES); + + if (smi_info->thread) + wake_up_process(smi_info->thread); + start_next_msg(smi_info); + smi_event_handler(smi_info, 0); + } spin_unlock_irqrestore(&smi_info->si_lock, flags); } @@ -1033,16 +1012,19 @@ static int ipmi_thread(void *data) static void poll(void *send_info) { struct smi_info *smi_info = send_info; - unsigned long flags; + unsigned long flags = 0; + int run_to_completion = smi_info->run_to_completion; /* * Make sure there is some delay in the poll loop so we can * drive time forward and timeout things. */ udelay(10); - spin_lock_irqsave(&smi_info->si_lock, flags); + if (!run_to_completion) + spin_lock_irqsave(&smi_info->si_lock, flags); smi_event_handler(smi_info, 10); - spin_unlock_irqrestore(&smi_info->si_lock, flags); + if (!run_to_completion) + spin_unlock_irqrestore(&smi_info->si_lock, flags); } static void request_events(void *send_info) @@ -1679,10 +1661,8 @@ static struct smi_info *smi_info_alloc(void) { struct smi_info *info = kzalloc(sizeof(*info), GFP_KERNEL); - if (info) { + if (info) spin_lock_init(&info->si_lock); - spin_lock_init(&info->msg_lock); - } return info; } diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index 020a6ae..7ed356e 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -520,6 +520,7 @@ static void panic_halt_ipmi_heartbeat(void) msg.cmd = IPMI_WDOG_RESET_TIMER; msg.data = NULL; msg.data_len = 0; + atomic_add(2, &panic_done_count); rv = ipmi_request_supply_msgs(watchdog_user, (struct ipmi_addr *) &addr, 0, @@ -528,8 +529,8 @@ static void panic_halt_ipmi_heartbeat(void) &panic_halt_heartbeat_smi_msg, &panic_halt_heartbeat_recv_msg, 1); - if (!rv) - atomic_add(2, &panic_done_count); + if (rv) + atomic_sub(2, &panic_done_count); } static struct ipmi_smi_msg panic_halt_smi_msg = { @@ -553,16 +554,18 @@ static void panic_halt_ipmi_set_timeout(void) /* Wait for the messages to be free. */ while (atomic_read(&panic_done_count) != 0) ipmi_poll_interface(watchdog_user); + atomic_add(2, &panic_done_count); rv = i_ipmi_set_timeout(&panic_halt_smi_msg, &panic_halt_recv_msg, &send_heartbeat_now); - if (!rv) { - atomic_add(2, &panic_done_count); - if (send_heartbeat_now) - panic_halt_ipmi_heartbeat(); - } else + if (rv) { + atomic_sub(2, &panic_done_count); printk(KERN_WARNING PFX "Unable to extend the watchdog timeout."); + } else { + if (send_heartbeat_now) + panic_halt_ipmi_heartbeat(); + } while (atomic_read(&panic_done_count) != 0) ipmi_poll_interface(watchdog_user); } @@ -1164,7 +1167,7 @@ static int wdog_reboot_handler(struct notifier_block *this, if (code == SYS_POWER_OFF || code == SYS_HALT) { /* Disable the WDT if we are shutting down. */ ipmi_watchdog_state = WDOG_TIMEOUT_NONE; - panic_halt_ipmi_set_timeout(); + ipmi_set_timeout(IPMI_SET_TIMEOUT_NO_HB); } else if (ipmi_watchdog_state != WDOG_TIMEOUT_NONE) { /* Set a long timer to let the reboot happens, but reboot if it hangs, but only if the watchdog @@ -1172,7 +1175,7 @@ static int wdog_reboot_handler(struct notifier_block *this, timeout = 120; pretimeout = 0; ipmi_watchdog_state = WDOG_TIMEOUT_RESET; - panic_halt_ipmi_set_timeout(); + ipmi_set_timeout(IPMI_SET_TIMEOUT_NO_HB); } } return NOTIFY_OK; diff --git a/drivers/video/backlight/tosa_lcd.c b/drivers/video/backlight/tosa_lcd.c index a2161f6..2231aec 100644 --- a/drivers/video/backlight/tosa_lcd.c +++ b/drivers/video/backlight/tosa_lcd.c @@ -271,7 +271,7 @@ static int tosa_lcd_resume(struct spi_device *spi) } #else #define tosa_lcd_suspend NULL -#define tosa_lcd_reume NULL +#define tosa_lcd_resume NULL #endif static struct spi_driver tosa_lcd_driver = { diff --git a/fs/buffer.c b/fs/buffer.c index 70e2017..36d6665 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1384,10 +1384,23 @@ static void invalidate_bh_lru(void *arg) } put_cpu_var(bh_lrus); } + +static bool has_bh_in_lru(int cpu, void *dummy) +{ + struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu); + int i; + for (i = 0; i < BH_LRU_SIZE; i++) { + if (b->bhs[i]) + return 1; + } + + return 0; +} + void invalidate_bh_lrus(void) { - on_each_cpu(invalidate_bh_lru, NULL, 1); + on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL); } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); diff --git a/fs/proc/array.c b/fs/proc/array.c index fbb53c2..f9bd395 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -550,7 +550,7 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, ' ', shared); seq_put_decimal_ull(m, ' ', text); seq_put_decimal_ull(m, ' ', 0); - seq_put_decimal_ull(m, ' ', text); + seq_put_decimal_ull(m, ' ', data); seq_put_decimal_ull(m, ' ', 0); seq_putc(m, '\n'); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 3551f1f..0d9e23a 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -156,15 +156,15 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out; - last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; - for (entry = ns_entries; entry <= last; entry++) { + last = &ns_entries[ARRAY_SIZE(ns_entries)]; + for (entry = ns_entries; entry < last; entry++) { if (strlen((*entry)->name) != len) continue; if (!memcmp(dentry->d_name.name, (*entry)->name, len)) break; } error = ERR_PTR(-ENOENT); - if (entry > last) + if (entry == last) goto out; error = proc_ns_instantiate(dir, dentry, task, *entry); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9694cc2..c283832 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -781,9 +781,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, int err = 0; pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); - if (pmd_trans_unstable(pmd)) - return 0; - /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); spin_lock(&walk->mm->page_table_lock); @@ -802,6 +799,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return err; } + if (pmd_trans_unstable(pmd)) + return 0; for (; addr != end; addr += PAGE_SIZE) { /* check to see if we've left 'vma' behind diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 7b9b75a..1ffdb98 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -810,11 +810,10 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu) #else /* NR_CPUS > 1 */ int __first_cpu(const cpumask_t *srcp); int __next_cpu(int n, const cpumask_t *srcp); -int __any_online_cpu(const cpumask_t *mask); #define first_cpu(src) __first_cpu(&(src)) #define next_cpu(n, src) __next_cpu((n), &(src)) -#define any_online_cpu(mask) __any_online_cpu(&(mask)) +#define any_online_cpu(mask) cpumask_any_and(&mask, cpu_online_mask) #define for_each_cpu_mask(cpu, mask) \ for ((cpu) = -1; \ (cpu) = next_cpu((cpu), (mask)), \ diff --git a/include/linux/mm.h b/include/linux/mm.h index f2a60dd..d8738a4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -954,7 +954,7 @@ extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); extern int vmtruncate(struct inode *inode, loff_t offset); extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end); - +void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int truncate_inode_page(struct address_space *mapping, struct page *page); int generic_error_remove_page(struct address_space *mapping, struct page *page); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index f5bd679..b067bd8 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -33,6 +33,7 @@ struct pid_namespace { #endif gid_t pid_gid; int hide_pid; + int reboot; /* group exit code if this pidns was rebooted */ }; extern struct pid_namespace init_pid_ns; @@ -48,6 +49,7 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *ns); extern void free_pid_ns(struct kref *kref); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); +extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); static inline void put_pid_ns(struct pid_namespace *ns) { @@ -75,11 +77,15 @@ static inline void put_pid_ns(struct pid_namespace *ns) { } - static inline void zap_pid_ns_processes(struct pid_namespace *ns) { BUG(); } + +static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) +{ + return 0; +} #endif /* CONFIG_PID_NS */ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index e9a4823..0d04cd6 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -2,6 +2,7 @@ * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig * Copyright (C) 2006 Nick Piggin + * Copyright (C) 2012 Konstantin Khlebnikov * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -257,4 +258,199 @@ static inline void radix_tree_preload_end(void) preempt_enable(); } +/** + * struct radix_tree_iter - radix tree iterator state + * + * @index: index of current slot + * @next_index: next-to-last index for this chunk + * @tags: bit-mask for tag-iterating + * + * This radix tree iterator works in terms of "chunks" of slots. A chunk is a + * subinterval of slots contained within one radix tree leaf node. It is + * described by a pointer to its first slot and a struct radix_tree_iter + * which holds the chunk's position in the tree and its size. For tagged + * iteration radix_tree_iter also holds the slots' bit-mask for one chosen + * radix tree tag. + */ +struct radix_tree_iter { + unsigned long index; + unsigned long next_index; + unsigned long tags; +}; + +#define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */ +#define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */ +#define RADIX_TREE_ITER_CONTIG 0x0200 /* stop at first hole */ + +/** + * radix_tree_iter_init - initialize radix tree iterator + * + * @iter: pointer to iterator state + * @start: iteration starting index + * Returns: NULL + */ +static __always_inline void ** +radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start) +{ + /* + * Leave iter->tags uninitialized. radix_tree_next_chunk() will fill it + * in the case of a successful tagged chunk lookup. If the lookup was + * unsuccessful or non-tagged then nobody cares about ->tags. + * + * Set index to zero to bypass next_index overflow protection. + * See the comment in radix_tree_next_chunk() for details. + */ + iter->index = 0; + iter->next_index = start; + return NULL; +} + +/** + * radix_tree_next_chunk - find next chunk of slots for iteration + * + * @root: radix tree root + * @iter: iterator state + * @flags: RADIX_TREE_ITER_* flags and tag index + * Returns: pointer to chunk first slot, or NULL if there no more left + * + * This function looks up the next chunk in the radix tree starting from + * @iter->next_index. It returns a pointer to the chunk's first slot. + * Also it fills @iter with data about chunk: position in the tree (index), + * its end (next_index), and constructs a bit mask for tagged iterating (tags). + */ +void **radix_tree_next_chunk(struct radix_tree_root *root, + struct radix_tree_iter *iter, unsigned flags); + +/** + * radix_tree_chunk_size - get current chunk size + * + * @iter: pointer to radix tree iterator + * Returns: current chunk size + */ +static __always_inline unsigned +radix_tree_chunk_size(struct radix_tree_iter *iter) +{ + return iter->next_index - iter->index; +} + +/** + * radix_tree_next_slot - find next slot in chunk + * + * @slot: pointer to current slot + * @iter: pointer to interator state + * @flags: RADIX_TREE_ITER_*, should be constant + * Returns: pointer to next slot, or NULL if there no more left + * + * This function updates @iter->index in the case of a successful lookup. + * For tagged lookup it also eats @iter->tags. + */ +static __always_inline void ** +radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) +{ + if (flags & RADIX_TREE_ITER_TAGGED) { + iter->tags >>= 1; + if (likely(iter->tags & 1ul)) { + iter->index++; + return slot + 1; + } + if (!(flags & RADIX_TREE_ITER_CONTIG) && likely(iter->tags)) { + unsigned offset = __ffs(iter->tags); + + iter->tags >>= offset; + iter->index += offset + 1; + return slot + offset + 1; + } + } else { + unsigned size = radix_tree_chunk_size(iter) - 1; + + while (size--) { + slot++; + iter->index++; + if (likely(*slot)) + return slot; + if (flags & RADIX_TREE_ITER_CONTIG) + break; + } + } + return NULL; +} + +/** + * radix_tree_for_each_chunk - iterate over chunks + * + * @slot: the void** variable for pointer to chunk first slot + * @root: the struct radix_tree_root pointer + * @iter: the struct radix_tree_iter pointer + * @start: iteration starting index + * @flags: RADIX_TREE_ITER_* and tag index + * + * Locks can be released and reacquired between iterations. + */ +#define radix_tree_for_each_chunk(slot, root, iter, start, flags) \ + for (slot = radix_tree_iter_init(iter, start) ; \ + (slot = radix_tree_next_chunk(root, iter, flags)) ;) + +/** + * radix_tree_for_each_chunk_slot - iterate over slots in one chunk + * + * @slot: the void** variable, at the beginning points to chunk first slot + * @iter: the struct radix_tree_iter pointer + * @flags: RADIX_TREE_ITER_*, should be constant + * + * This macro is designed to be nested inside radix_tree_for_each_chunk(). + * @slot points to the radix tree slot, @iter->index contains its index. + */ +#define radix_tree_for_each_chunk_slot(slot, iter, flags) \ + for (; slot ; slot = radix_tree_next_slot(slot, iter, flags)) + +/** + * radix_tree_for_each_slot - iterate over non-empty slots + * + * @slot: the void** variable for pointer to slot + * @root: the struct radix_tree_root pointer + * @iter: the struct radix_tree_iter pointer + * @start: iteration starting index + * + * @slot points to radix tree slot, @iter->index contains its index. + */ +#define radix_tree_for_each_slot(slot, root, iter, start) \ + for (slot = radix_tree_iter_init(iter, start) ; \ + slot || (slot = radix_tree_next_chunk(root, iter, 0)) ; \ + slot = radix_tree_next_slot(slot, iter, 0)) + +/** + * radix_tree_for_each_contig - iterate over contiguous slots + * + * @slot: the void** variable for pointer to slot + * @root: the struct radix_tree_root pointer + * @iter: the struct radix_tree_iter pointer + * @start: iteration starting index + * + * @slot points to radix tree slot, @iter->index contains its index. + */ +#define radix_tree_for_each_contig(slot, root, iter, start) \ + for (slot = radix_tree_iter_init(iter, start) ; \ + slot || (slot = radix_tree_next_chunk(root, iter, \ + RADIX_TREE_ITER_CONTIG)) ; \ + slot = radix_tree_next_slot(slot, iter, \ + RADIX_TREE_ITER_CONTIG)) + +/** + * radix_tree_for_each_tagged - iterate over tagged slots + * + * @slot: the void** variable for pointer to slot + * @root: the struct radix_tree_root pointer + * @iter: the struct radix_tree_iter pointer + * @start: iteration starting index + * @tag: tag index + * + * @slot points to radix tree slot, @iter->index contains its index. + */ +#define radix_tree_for_each_tagged(slot, root, iter, start, tag) \ + for (slot = radix_tree_iter_init(iter, start) ; \ + slot || (slot = radix_tree_next_chunk(root, iter, \ + RADIX_TREE_ITER_TAGGED | tag)) ; \ + slot = radix_tree_next_slot(slot, iter, \ + RADIX_TREE_ITER_TAGGED)) + #endif /* _LINUX_RADIX_TREE_H */ diff --git a/include/linux/smp.h b/include/linux/smp.h index 8cc38d3..10530d9 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -102,6 +102,22 @@ static inline void call_function_init(void) { } int on_each_cpu(smp_call_func_t func, void *info, int wait); /* + * Call a function on processors specified by mask, which might include + * the local one. + */ +void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, + void *info, bool wait); + +/* + * Call a function on each processor for which the supplied function + * cond_func returns a positive value. This may include the local + * processor. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags); + +/* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. */ @@ -132,6 +148,36 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info) local_irq_enable(); \ 0; \ }) +/* + * Note we still need to test the mask even for UP + * because we actually can get an empty mask from + * code that on SMP might call us without the local + * CPU in the mask. + */ +#define on_each_cpu_mask(mask, func, info, wait) \ + do { \ + if (cpumask_test_cpu(0, (mask))) { \ + local_irq_disable(); \ + (func)(info); \ + local_irq_enable(); \ + } \ + } while (0) +/* + * Preemption is disabled here to make sure the cond_func is called under the + * same condtions in UP and SMP. + */ +#define on_each_cpu_cond(cond_func, func, info, wait, gfp_flags)\ + do { \ + void *__info = (info); \ + preempt_disable(); \ + if ((cond_func)(0, __info)) { \ + local_irq_disable(); \ + (func)(__info); \ + local_irq_enable(); \ + } \ + preempt_enable(); \ + } while (0) + static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) diff --git a/include/linux/swap.h b/include/linux/swap.h index b86b5c2..8dc0ea7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -21,6 +21,9 @@ struct bio; #define SWAP_FLAG_PRIO_SHIFT 0 #define SWAP_FLAG_DISCARD 0x10000 /* discard swap cluster after use */ +#define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \ + SWAP_FLAG_DISCARD) + static inline int current_is_kswapd(void) { return current->flags & PF_KSWAPD; diff --git a/kernel/kexec.c b/kernel/kexec.c index 2a0deff..4e2e472 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1358,6 +1358,10 @@ static int __init parse_crashkernel_simple(char *cmdline, if (*cur == '@') *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warning("crashkernel: unrecognized char\n"); + return -EINVAL; + } return 0; } @@ -1461,7 +1465,9 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(init_uts_ns); VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU VMCOREINFO_SYMBOL(swapper_pg_dir); +#endif VMCOREINFO_SYMBOL(_stext); VMCOREINFO_SYMBOL(vmlist); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 17b2328..57bc1fd 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -15,6 +15,7 @@ #include <linux/acct.h> #include <linux/slab.h> #include <linux/proc_fs.h> +#include <linux/reboot.h> #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -183,6 +184,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); + if (pid_ns->reboot) + current->signal->group_exit_code = pid_ns->reboot; + acct_exit_ns(pid_ns); return; } @@ -217,6 +221,35 @@ static struct ctl_table pid_ns_ctl_table[] = { static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; +int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) +{ + if (pid_ns == &init_pid_ns) + return 0; + + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART2: + case LINUX_REBOOT_CMD_RESTART: + pid_ns->reboot = SIGHUP; + break; + + case LINUX_REBOOT_CMD_POWER_OFF: + case LINUX_REBOOT_CMD_HALT: + pid_ns->reboot = SIGINT; + break; + default: + return -EINVAL; + } + + read_lock(&tasklist_lock); + force_sig(SIGKILL, pid_ns->child_reaper); + read_unlock(&tasklist_lock); + + do_exit(0); + + /* Not reached */ + return 0; +} + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); diff --git a/kernel/smp.c b/kernel/smp.c index db197d6..2f8b10e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -701,3 +701,93 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait) return ret; } EXPORT_SYMBOL(on_each_cpu); + +/** + * on_each_cpu_mask(): Run a function on processors specified by + * cpumask, which may include the local processor. + * @mask: The set of cpus to run on (only runs on online subset). + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. + * + * If @wait is true, then returns once @func has returned. + * + * You must not call this function with disabled interrupts or + * from a hardware interrupt handler or from a bottom half handler. + */ +void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, + void *info, bool wait) +{ + int cpu = get_cpu(); + + smp_call_function_many(mask, func, info, wait); + if (cpumask_test_cpu(cpu, mask)) { + local_irq_disable(); + func(info); + local_irq_enable(); + } + put_cpu(); +} +EXPORT_SYMBOL(on_each_cpu_mask); + +/* + * on_each_cpu_cond(): Call a function on each processor for which + * the supplied function cond_func returns true, optionally waiting + * for all the required CPUs to finish. This may include the local + * processor. + * @cond_func: A callback function that is passed a cpu id and + * the the info parameter. The function is called + * with preemption disabled. The function should + * return a blooean value indicating whether to IPI + * the specified CPU. + * @func: The function to run on all applicable CPUs. + * This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to both functions. + * @wait: If true, wait (atomically) until function has + * completed on other CPUs. + * @gfp_flags: GFP flags to use when allocating the cpumask + * used internally by the function. + * + * The function might sleep if the GFP flags indicates a non + * atomic allocation is allowed. + * + * Preemption is disabled to protect against CPUs going offline but not online. + * CPUs going online during the call will not be seen or sent an IPI. + * + * You must not call this function with disabled interrupts or + * from a hardware interrupt handler or from a bottom half handler. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + cpumask_var_t cpus; + int cpu, ret; + + might_sleep_if(gfp_flags & __GFP_WAIT); + + if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { + preempt_disable(); + for_each_online_cpu(cpu) + if (cond_func(cpu, info)) + cpumask_set_cpu(cpu, cpus); + on_each_cpu_mask(cpus, func, info, wait); + preempt_enable(); + free_cpumask_var(cpus); + } else { + /* + * No free cpumask, bother. No matter, we'll + * just have to IPI them one by one. + */ + preempt_disable(); + for_each_online_cpu(cpu) + if (cond_func(cpu, info)) { + ret = smp_call_function_single(cpu, func, + info, wait); + WARN_ON_ONCE(!ret); + } + preempt_enable(); + } +} +EXPORT_SYMBOL(on_each_cpu_cond); diff --git a/kernel/sys.c b/kernel/sys.c index 9eb7fca..e7006eb 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -444,6 +444,15 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; + /* + * If pid namespaces are enabled and the current task is in a child + * pid_namespace, the command is handled by reboot_pid_ns() which will + * call do_exit(). + */ + ret = reboot_pid_ns(task_active_pid_ns(current), cmd); + if (ret) + return ret; + /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 803a374..52b3a06 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -23,6 +23,7 @@ #include <linux/swap.h> #include <linux/slab.h> #include <linux/sysctl.h> +#include <linux/bitmap.h> #include <linux/signal.h> #include <linux/printk.h> #include <linux/proc_fs.h> @@ -2395,9 +2396,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } } - while (val_a <= val_b) - set_bit(val_a++, tmp_bitmap); - + bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); first = 0; proc_skip_char(&kbuf, &left, '\n'); } @@ -2440,8 +2439,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (*ppos) bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); else - memcpy(bitmap, tmp_bitmap, - BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long)); + bitmap_copy(bitmap, tmp_bitmap, bitmap_len); } kfree(tmp_bitmap); *lenp -= left; diff --git a/lib/Kconfig b/lib/Kconfig index a0e5900..4a8aba2 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -88,6 +88,10 @@ choice prompt "CRC32 implementation" depends on CRC32 default CRC32_SLICEBY8 + help + This option allows a kernel builder to override the default choice + of CRC32 algorithm. Choose the default ("slice by 8") unless you + know that you need one of the others. config CRC32_SLICEBY8 bool "Slice by 8 bytes" diff --git a/lib/cpumask.c b/lib/cpumask.c index 0b66011..402a54a 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -26,18 +26,6 @@ int __next_cpu_nr(int n, const cpumask_t *srcp) EXPORT_SYMBOL(__next_cpu_nr); #endif -int __any_online_cpu(const cpumask_t *mask) -{ - int cpu; - - for_each_cpu(cpu, mask) { - if (cpu_online(cpu)) - break; - } - return cpu; -} -EXPORT_SYMBOL(__any_online_cpu); - /** * cpumask_next_and - get the next cpu in *src1p & *src2p * @n: the cpu prior to the place to search (ie. return will be > @n) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 3e69c2b..86516f5 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -3,6 +3,7 @@ * Portions Copyright (C) 2001 Christoph Hellwig * Copyright (C) 2005 SGI, Christoph Lameter * Copyright (C) 2006 Nick Piggin + * Copyright (C) 2012 Konstantin Khlebnikov * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -146,6 +147,43 @@ static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) } return 0; } + +/** + * radix_tree_find_next_bit - find the next set bit in a memory region + * + * @addr: The address to base the search on + * @size: The bitmap size in bits + * @offset: The bitnumber to start searching at + * + * Unrollable variant of find_next_bit() for constant size arrays. + * Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero. + * Returns next bit offset, or size if nothing found. + */ +static __always_inline unsigned long +radix_tree_find_next_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + if (!__builtin_constant_p(size)) + return find_next_bit(addr, size, offset); + + if (offset < size) { + unsigned long tmp; + + addr += offset / BITS_PER_LONG; + tmp = *addr >> (offset % BITS_PER_LONG); + if (tmp) + return __ffs(tmp) + offset; + offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); + while (offset < size) { + tmp = *++addr; + if (tmp) + return __ffs(tmp) + offset; + offset += BITS_PER_LONG; + } + } + return size; +} + /* * This assumes that the caller has performed appropriate preallocation, and * that the caller has pinned this thread of control to the current CPU. @@ -613,6 +651,119 @@ int radix_tree_tag_get(struct radix_tree_root *root, EXPORT_SYMBOL(radix_tree_tag_get); /** + * radix_tree_next_chunk - find next chunk of slots for iteration + * + * @root: radix tree root + * @iter: iterator state + * @flags: RADIX_TREE_ITER_* flags and tag index + * Returns: pointer to chunk first slot, or NULL if iteration is over + */ +void **radix_tree_next_chunk(struct radix_tree_root *root, + struct radix_tree_iter *iter, unsigned flags) +{ + unsigned shift, tag = flags & RADIX_TREE_ITER_TAG_MASK; + struct radix_tree_node *rnode, *node; + unsigned long index, offset; + + if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) + return NULL; + + /* + * Catch next_index overflow after ~0UL. iter->index never overflows + * during iterating; it can be zero only at the beginning. + * And we cannot overflow iter->next_index in a single step, + * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. + */ + index = iter->next_index; + if (!index && iter->index) + return NULL; + + rnode = rcu_dereference_raw(root->rnode); + if (radix_tree_is_indirect_ptr(rnode)) { + rnode = indirect_to_ptr(rnode); + } else if (rnode && !index) { + /* Single-slot tree */ + iter->index = 0; + iter->next_index = 1; + iter->tags = 1; + return (void **)&root->rnode; + } else + return NULL; + +restart: + shift = (rnode->height - 1) * RADIX_TREE_MAP_SHIFT; + offset = index >> shift; + + /* Index outside of the tree */ + if (offset >= RADIX_TREE_MAP_SIZE) + return NULL; + + node = rnode; + while (1) { + if ((flags & RADIX_TREE_ITER_TAGGED) ? + !test_bit(offset, node->tags[tag]) : + !node->slots[offset]) { + /* Hole detected */ + if (flags & RADIX_TREE_ITER_CONTIG) + return NULL; + + if (flags & RADIX_TREE_ITER_TAGGED) + offset = radix_tree_find_next_bit( + node->tags[tag], + RADIX_TREE_MAP_SIZE, + offset + 1); + else + while (++offset < RADIX_TREE_MAP_SIZE) { + if (node->slots[offset]) + break; + } + index &= ~((RADIX_TREE_MAP_SIZE << shift) - 1); + index += offset << shift; + /* Overflow after ~0UL */ + if (!index) + return NULL; + if (offset == RADIX_TREE_MAP_SIZE) + goto restart; + } + + /* This is leaf-node */ + if (!shift) + break; + + node = rcu_dereference_raw(node->slots[offset]); + if (node == NULL) + goto restart; + shift -= RADIX_TREE_MAP_SHIFT; + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + } + + /* Update the iterator state */ + iter->index = index; + iter->next_index = (index | RADIX_TREE_MAP_MASK) + 1; + + /* Construct iter->tags bit-mask from node->tags[tag] array */ + if (flags & RADIX_TREE_ITER_TAGGED) { + unsigned tag_long, tag_bit; + + tag_long = offset / BITS_PER_LONG; + tag_bit = offset % BITS_PER_LONG; + iter->tags = node->tags[tag][tag_long] >> tag_bit; + /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ + if (tag_long < RADIX_TREE_TAG_LONGS - 1) { + /* Pick tags from next element */ + if (tag_bit) + iter->tags |= node->tags[tag][tag_long + 1] << + (BITS_PER_LONG - tag_bit); + /* Clip chunk size, here only BITS_PER_LONG tags */ + iter->next_index = index + BITS_PER_LONG; + } + } + + return node->slots + offset; +} +EXPORT_SYMBOL(radix_tree_next_chunk); + +/** * radix_tree_range_tag_if_tagged - for each item in given range set given * tag if item has another tag set * @root: radix tree root @@ -817,57 +968,6 @@ unsigned long radix_tree_prev_hole(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_prev_hole); -static unsigned int -__lookup(struct radix_tree_node *slot, void ***results, unsigned long *indices, - unsigned long index, unsigned int max_items, unsigned long *next_index) -{ - unsigned int nr_found = 0; - unsigned int shift, height; - unsigned long i; - - height = slot->height; - if (height == 0) - goto out; - shift = (height-1) * RADIX_TREE_MAP_SHIFT; - - for ( ; height > 1; height--) { - i = (index >> shift) & RADIX_TREE_MAP_MASK; - for (;;) { - if (slot->slots[i] != NULL) - break; - index &= ~((1UL << shift) - 1); - index += 1UL << shift; - if (index == 0) - goto out; /* 32-bit wraparound */ - i++; - if (i == RADIX_TREE_MAP_SIZE) - goto out; - } - - shift -= RADIX_TREE_MAP_SHIFT; - slot = rcu_dereference_raw(slot->slots[i]); - if (slot == NULL) - goto out; - } - - /* Bottom level: grab some items */ - for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { - if (slot->slots[i]) { - results[nr_found] = &(slot->slots[i]); - if (indices) - indices[nr_found] = index; - if (++nr_found == max_items) { - index++; - goto out; - } - } - index++; - } -out: - *next_index = index; - return nr_found; -} - /** * radix_tree_gang_lookup - perform multiple lookup on a radix tree * @root: radix tree root @@ -891,48 +991,19 @@ unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items) { - unsigned long max_index; - struct radix_tree_node *node; - unsigned long cur_index = first_index; - unsigned int ret; + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; - node = rcu_dereference_raw(root->rnode); - if (!node) + if (unlikely(!max_items)) return 0; - if (!radix_tree_is_indirect_ptr(node)) { - if (first_index > 0) - return 0; - results[0] = node; - return 1; - } - node = indirect_to_ptr(node); - - max_index = radix_tree_maxindex(node->height); - - ret = 0; - while (ret < max_items) { - unsigned int nr_found, slots_found, i; - unsigned long next_index; /* Index of next search */ - - if (cur_index > max_index) - break; - slots_found = __lookup(node, (void ***)results + ret, NULL, - cur_index, max_items - ret, &next_index); - nr_found = 0; - for (i = 0; i < slots_found; i++) { - struct radix_tree_node *slot; - slot = *(((void ***)results)[ret + i]); - if (!slot) - continue; - results[ret + nr_found] = - indirect_to_ptr(rcu_dereference_raw(slot)); - nr_found++; - } - ret += nr_found; - if (next_index == 0) + radix_tree_for_each_slot(slot, root, &iter, first_index) { + results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot)); + if (!results[ret]) + continue; + if (++ret == max_items) break; - cur_index = next_index; } return ret; @@ -962,112 +1033,25 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long *indices, unsigned long first_index, unsigned int max_items) { - unsigned long max_index; - struct radix_tree_node *node; - unsigned long cur_index = first_index; - unsigned int ret; + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; - node = rcu_dereference_raw(root->rnode); - if (!node) + if (unlikely(!max_items)) return 0; - if (!radix_tree_is_indirect_ptr(node)) { - if (first_index > 0) - return 0; - results[0] = (void **)&root->rnode; + radix_tree_for_each_slot(slot, root, &iter, first_index) { + results[ret] = slot; if (indices) - indices[0] = 0; - return 1; - } - node = indirect_to_ptr(node); - - max_index = radix_tree_maxindex(node->height); - - ret = 0; - while (ret < max_items) { - unsigned int slots_found; - unsigned long next_index; /* Index of next search */ - - if (cur_index > max_index) + indices[ret] = iter.index; + if (++ret == max_items) break; - slots_found = __lookup(node, results + ret, - indices ? indices + ret : NULL, - cur_index, max_items - ret, &next_index); - ret += slots_found; - if (next_index == 0) - break; - cur_index = next_index; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_slot); -/* - * FIXME: the two tag_get()s here should use find_next_bit() instead of - * open-coding the search. - */ -static unsigned int -__lookup_tag(struct radix_tree_node *slot, void ***results, unsigned long index, - unsigned int max_items, unsigned long *next_index, unsigned int tag) -{ - unsigned int nr_found = 0; - unsigned int shift, height; - - height = slot->height; - if (height == 0) - goto out; - shift = (height-1) * RADIX_TREE_MAP_SHIFT; - - while (height > 0) { - unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK ; - - for (;;) { - if (tag_get(slot, tag, i)) - break; - index &= ~((1UL << shift) - 1); - index += 1UL << shift; - if (index == 0) - goto out; /* 32-bit wraparound */ - i++; - if (i == RADIX_TREE_MAP_SIZE) - goto out; - } - height--; - if (height == 0) { /* Bottom level: grab some items */ - unsigned long j = index & RADIX_TREE_MAP_MASK; - - for ( ; j < RADIX_TREE_MAP_SIZE; j++) { - index++; - if (!tag_get(slot, tag, j)) - continue; - /* - * Even though the tag was found set, we need to - * recheck that we have a non-NULL node, because - * if this lookup is lockless, it may have been - * subsequently deleted. - * - * Similar care must be taken in any place that - * lookup ->slots[x] without a lock (ie. can't - * rely on its value remaining the same). - */ - if (slot->slots[j]) { - results[nr_found++] = &(slot->slots[j]); - if (nr_found == max_items) - goto out; - } - } - } - shift -= RADIX_TREE_MAP_SHIFT; - slot = rcu_dereference_raw(slot->slots[i]); - if (slot == NULL) - break; - } -out: - *next_index = index; - return nr_found; -} - /** * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree * based on a tag @@ -1086,52 +1070,19 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items, unsigned int tag) { - struct radix_tree_node *node; - unsigned long max_index; - unsigned long cur_index = first_index; - unsigned int ret; - - /* check the root's tag bit */ - if (!root_tag_get(root, tag)) - return 0; + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; - node = rcu_dereference_raw(root->rnode); - if (!node) + if (unlikely(!max_items)) return 0; - if (!radix_tree_is_indirect_ptr(node)) { - if (first_index > 0) - return 0; - results[0] = node; - return 1; - } - node = indirect_to_ptr(node); - - max_index = radix_tree_maxindex(node->height); - - ret = 0; - while (ret < max_items) { - unsigned int nr_found, slots_found, i; - unsigned long next_index; /* Index of next search */ - - if (cur_index > max_index) - break; - slots_found = __lookup_tag(node, (void ***)results + ret, - cur_index, max_items - ret, &next_index, tag); - nr_found = 0; - for (i = 0; i < slots_found; i++) { - struct radix_tree_node *slot; - slot = *(((void ***)results)[ret + i]); - if (!slot) - continue; - results[ret + nr_found] = - indirect_to_ptr(rcu_dereference_raw(slot)); - nr_found++; - } - ret += nr_found; - if (next_index == 0) + radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { + results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot)); + if (!results[ret]) + continue; + if (++ret == max_items) break; - cur_index = next_index; } return ret; @@ -1156,42 +1107,17 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, unsigned long first_index, unsigned int max_items, unsigned int tag) { - struct radix_tree_node *node; - unsigned long max_index; - unsigned long cur_index = first_index; - unsigned int ret; + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; - /* check the root's tag bit */ - if (!root_tag_get(root, tag)) - return 0; - - node = rcu_dereference_raw(root->rnode); - if (!node) + if (unlikely(!max_items)) return 0; - if (!radix_tree_is_indirect_ptr(node)) { - if (first_index > 0) - return 0; - results[0] = (void **)&root->rnode; - return 1; - } - node = indirect_to_ptr(node); - - max_index = radix_tree_maxindex(node->height); - - ret = 0; - while (ret < max_items) { - unsigned int slots_found; - unsigned long next_index; /* Index of next search */ - - if (cur_index > max_index) - break; - slots_found = __lookup_tag(node, results + ret, - cur_index, max_items - ret, &next_index, tag); - ret += slots_found; - if (next_index == 0) + radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { + results[ret] = slot; + if (++ret == max_items) break; - cur_index = next_index; } return ret; diff --git a/mm/filemap.c b/mm/filemap.c index c3811bc..79c4b2b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -813,20 +813,19 @@ EXPORT_SYMBOL(find_or_create_page); unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; - unsigned int nr_found, nr_skip; + struct radix_tree_iter iter; + void **slot; + unsigned ret = 0; + + if (unlikely(!nr_pages)) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, NULL, start, nr_pages); - ret = 0; - nr_skip = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); if (unlikely(!page)) continue; @@ -837,7 +836,7 @@ repeat: * when entry at index 0 moves out of or back * to root: none yet gotten, safe to restart. */ - WARN_ON(start | i); + WARN_ON(iter.index); goto restart; } /* @@ -845,7 +844,6 @@ repeat: * here as an exceptional entry: so skip over it - * we only reach this from invalidate_mapping_pages(). */ - nr_skip++; continue; } @@ -853,21 +851,16 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } pages[ret] = page; - ret++; + if (++ret == nr_pages) + break; } - /* - * If all entries were removed before we could secure them, - * try again, because callers stop trying once 0 is returned. - */ - if (unlikely(!ret && nr_found > nr_skip)) - goto restart; rcu_read_unlock(); return ret; } @@ -887,21 +880,22 @@ repeat: unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; - unsigned int nr_found; + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; + + if (unlikely(!nr_pages)) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, NULL, index, nr_pages); - ret = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); + /* The hole, there no reason to continue */ if (unlikely(!page)) - continue; + break; if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { @@ -924,7 +918,7 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } @@ -934,14 +928,14 @@ repeat: * otherwise we can get both false positives and false * negatives, which is just confusing to the caller. */ - if (page->mapping == NULL || page->index != index) { + if (page->mapping == NULL || page->index != iter.index) { page_cache_release(page); break; } pages[ret] = page; - ret++; - index++; + if (++ret == nr_pages) + break; } rcu_read_unlock(); return ret; @@ -962,19 +956,20 @@ EXPORT_SYMBOL(find_get_pages_contig); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; - unsigned int nr_found; + struct radix_tree_iter iter; + void **slot; + unsigned ret = 0; + + if (unlikely(!nr_pages)) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, - (void ***)pages, *index, nr_pages, tag); - ret = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_tagged(slot, &mapping->page_tree, + &iter, *index, tag) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); if (unlikely(!page)) continue; @@ -998,21 +993,16 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } pages[ret] = page; - ret++; + if (++ret == nr_pages) + break; } - /* - * If all entries were removed before we could secure them, - * try again, because callers stop trying once 0 is returned. - */ - if (unlikely(!ret && nr_found)) - goto restart; rcu_read_unlock(); if (ret) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b2ee6df..7d698df 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5306,6 +5306,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, return 0; } + if (pmd_trans_unstable(pmd)) + return 0; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) if (get_mctgt_type(vma, addr, *pte, NULL)) @@ -5502,6 +5504,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, return 0; } + if (pmd_trans_unstable(pmd)) + return 0; retry: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index caea788..a712fb9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1161,11 +1161,47 @@ void drain_local_pages(void *arg) } /* - * Spill all the per-cpu pages from all CPUs back into the buddy allocator + * Spill all the per-cpu pages from all CPUs back into the buddy allocator. + * + * Note that this code is protected against sending an IPI to an offline + * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: + * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but + * nothing keeps CPUs from showing up after we populated the cpumask and + * before the call to on_each_cpu_mask(). */ void drain_all_pages(void) { - on_each_cpu(drain_local_pages, NULL, 1); + int cpu; + struct per_cpu_pageset *pcp; + struct zone *zone; + + /* + * Allocate in the BSS so we wont require allocation in + * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y + */ + static cpumask_t cpus_with_pcps; + + /* + * We don't care about racing with CPU hotplug event + * as offline notification will cause the notified + * cpu to drain that CPU pcps and on_each_cpu_mask + * disables preemption as part of its processing + */ + for_each_online_cpu(cpu) { + bool has_pcps = false; + for_each_populated_zone(zone) { + pcp = per_cpu_ptr(zone->pageset, cpu); + if (pcp->pcp.count) { + has_pcps = true; + break; + } + } + if (has_pcps) + cpumask_set_cpu(cpu, &cpus_with_pcps); + else + cpumask_clear_cpu(cpu, &cpus_with_pcps); + } + on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); } #ifdef CONFIG_HIBERNATION @@ -2308,6 +2344,10 @@ rebalance: if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { if (oom_killer_disabled) goto nopage; + /* Coredumps can quickly deplete all memory reserves */ + if ((current->flags & PF_DUMPCORE) && + !(gfp_mask & __GFP_NOFAIL)) + goto nopage; page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, @@ -2035,9 +2035,17 @@ static void flush_cpu_slab(void *d) __flush_cpu_slab(s, smp_processor_id()); } +static bool has_cpu_slab(int cpu, void *info) +{ + struct kmem_cache *s = info; + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + return !!(c->page); +} + static void flush_all(struct kmem_cache *s) { - on_each_cpu(flush_cpu_slab, s, 1); + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); } /* diff --git a/mm/swapfile.c b/mm/swapfile.c index dae42f3..fafc26d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2022,6 +2022,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) struct page *page = NULL; struct inode *inode = NULL; + if (swap_flags & ~SWAP_FLAGS_VALID) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/mm/truncate.c b/mm/truncate.c index 18aded3..61a183b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -626,3 +626,43 @@ int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) return 0; } + +/** + * truncate_pagecache_range - unmap and remove pagecache that is hole-punched + * @inode: inode + * @lstart: offset of beginning of hole + * @lend: offset of last byte of hole + * + * This function should typically be called before the filesystem + * releases resources associated with the freed range (eg. deallocates + * blocks). This way, pagecache will always stay logically coherent + * with on-disk format, and the filesystem would not have to deal with + * situations such as writepage being called for a page that has already + * had its underlying blocks deallocated. + */ +void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + struct address_space *mapping = inode->i_mapping; + loff_t unmap_start = round_up(lstart, PAGE_SIZE); + loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1; + /* + * This rounding is currently just for example: unmap_mapping_range + * expands its hole outwards, whereas we want it to contract the hole + * inwards. However, existing callers of truncate_pagecache_range are + * doing their own page rounding first; and truncate_inode_pages_range + * currently BUGs if lend is not pagealigned-1 (it handles partial + * page at start of hole, but not partial page at end of hole). Note + * unmap_mapping_range allows holelen 0 for all, and we allow lend -1. + */ + + /* + * Unlike in truncate_pagecache, unmap_mapping_range is called only + * once (before truncating pagecache), and without "even_cows" flag: + * hole-punching should not remove private COWed pages from the hole. + */ + if ((u64)unmap_end > (u64)unmap_start) + unmap_mapping_range(mapping, unmap_start, + 1 + unmap_end - unmap_start, 0); + truncate_inode_pages_range(mapping, lstart, lend); +} +EXPORT_SYMBOL(truncate_pagecache_range); diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 4ec8401..28bc57e 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,10 +1,15 @@ -TARGETS = breakpoints +TARGETS = breakpoints vm all: for TARGET in $(TARGETS); do \ make -C $$TARGET; \ done; +run_tests: all + for TARGET in $(TARGETS); do \ + make -C $$TARGET run_tests; \ + done; + clean: for TARGET in $(TARGETS); do \ make -C $$TARGET clean; \ diff --git a/tools/testing/selftests/breakpoints/Makefile b/tools/testing/selftests/breakpoints/Makefile index f362722..9312780 100644 --- a/tools/testing/selftests/breakpoints/Makefile +++ b/tools/testing/selftests/breakpoints/Makefile @@ -11,10 +11,13 @@ endif all: ifeq ($(ARCH),x86) - gcc breakpoint_test.c -o run_test + gcc breakpoint_test.c -o breakpoint_test else echo "Not an x86 target, can't build breakpoints selftests" endif +run_tests: + ./breakpoint_test + clean: - rm -fr run_test + rm -fr breakpoint_test diff --git a/tools/testing/selftests/run_tests b/tools/testing/selftests/run_tests deleted file mode 100644 index 320718a..0000000 --- a/tools/testing/selftests/run_tests +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -TARGETS=breakpoints - -for TARGET in $TARGETS -do - $TARGET/run_test -done diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile new file mode 100644 index 0000000..b336b24 --- /dev/null +++ b/tools/testing/selftests/vm/Makefile @@ -0,0 +1,14 @@ +# Makefile for vm selftests + +CC = $(CROSS_COMPILE)gcc +CFLAGS = -Wall -Wextra + +all: hugepage-mmap hugepage-shm map_hugetlb +%: %.c + $(CC) $(CFLAGS) -o $@ $^ + +run_tests: all + /bin/sh ./run_vmtests + +clean: + $(RM) hugepage-mmap hugepage-shm map_hugetlb diff --git a/Documentation/vm/hugepage-mmap.c b/tools/testing/selftests/vm/hugepage-mmap.c index db0dd9a..a10f310 100644 --- a/Documentation/vm/hugepage-mmap.c +++ b/tools/testing/selftests/vm/hugepage-mmap.c @@ -22,7 +22,7 @@ #include <sys/mman.h> #include <fcntl.h> -#define FILE_NAME "/mnt/hugepagefile" +#define FILE_NAME "huge/hugepagefile" #define LENGTH (256UL*1024*1024) #define PROTECTION (PROT_READ | PROT_WRITE) @@ -48,7 +48,7 @@ static void write_bytes(char *addr) *(addr + i) = (char)i; } -static void read_bytes(char *addr) +static int read_bytes(char *addr) { unsigned long i; @@ -56,14 +56,15 @@ static void read_bytes(char *addr) for (i = 0; i < LENGTH; i++) if (*(addr + i) != (char)i) { printf("Mismatch at %lu\n", i); - break; + return 1; } + return 0; } int main(void) { void *addr; - int fd; + int fd, ret; fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755); if (fd < 0) { @@ -81,11 +82,11 @@ int main(void) printf("Returned address is %p\n", addr); check_bytes(addr); write_bytes(addr); - read_bytes(addr); + ret = read_bytes(addr); munmap(addr, LENGTH); close(fd); unlink(FILE_NAME); - return 0; + return ret; } diff --git a/Documentation/vm/hugepage-shm.c b/tools/testing/selftests/vm/hugepage-shm.c index 07956d8..0d0ef4f 100644 --- a/Documentation/vm/hugepage-shm.c +++ b/tools/testing/selftests/vm/hugepage-shm.c @@ -57,8 +57,8 @@ int main(void) unsigned long i; char *shmaddr; - if ((shmid = shmget(2, LENGTH, - SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) { + shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (shmid < 0) { perror("shmget"); exit(1); } @@ -82,14 +82,16 @@ int main(void) dprintf("Starting the Check..."); for (i = 0; i < LENGTH; i++) - if (shmaddr[i] != (char)i) + if (shmaddr[i] != (char)i) { printf("\nIndex %lu mismatched\n", i); + exit(3); + } dprintf("Done.\n"); if (shmdt((const void *)shmaddr) != 0) { perror("Detach failure"); shmctl(shmid, IPC_RMID, NULL); - exit(3); + exit(4); } shmctl(shmid, IPC_RMID, NULL); diff --git a/Documentation/vm/map_hugetlb.c b/tools/testing/selftests/vm/map_hugetlb.c index eda1a6d..ac56639 100644 --- a/Documentation/vm/map_hugetlb.c +++ b/tools/testing/selftests/vm/map_hugetlb.c @@ -44,7 +44,7 @@ static void write_bytes(char *addr) *(addr + i) = (char)i; } -static void read_bytes(char *addr) +static int read_bytes(char *addr) { unsigned long i; @@ -52,13 +52,15 @@ static void read_bytes(char *addr) for (i = 0; i < LENGTH; i++) if (*(addr + i) != (char)i) { printf("Mismatch at %lu\n", i); - break; + return 1; } + return 0; } int main(void) { void *addr; + int ret; addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, 0, 0); if (addr == MAP_FAILED) { @@ -69,9 +71,9 @@ int main(void) printf("Returned address is %p\n", addr); check_bytes(addr); write_bytes(addr); - read_bytes(addr); + ret = read_bytes(addr); munmap(addr, LENGTH); - return 0; + return ret; } diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests new file mode 100644 index 0000000..8b40bd5 --- /dev/null +++ b/tools/testing/selftests/vm/run_vmtests @@ -0,0 +1,77 @@ +#!/bin/bash +#please run as root + +#we need 256M, below is the size in kB +needmem=262144 +mnt=./huge + +#get pagesize and freepages from /proc/meminfo +while read name size unit; do + if [ "$name" = "HugePages_Free:" ]; then + freepgs=$size + fi + if [ "$name" = "Hugepagesize:" ]; then + pgsize=$size + fi +done < /proc/meminfo + +#set proper nr_hugepages +if [ -n "$freepgs" ] && [ -n "$pgsize" ]; then + nr_hugepgs=`cat /proc/sys/vm/nr_hugepages` + needpgs=`expr $needmem / $pgsize` + if [ $freepgs -lt $needpgs ]; then + lackpgs=$(( $needpgs - $freepgs )) + echo $(( $lackpgs + $nr_hugepgs )) > /proc/sys/vm/nr_hugepages + if [ $? -ne 0 ]; then + echo "Please run this test as root" + exit 1 + fi + fi +else + echo "no hugetlbfs support in kernel?" + exit 1 +fi + +mkdir $mnt +mount -t hugetlbfs none $mnt + +echo "--------------------" +echo "runing hugepage-mmap" +echo "--------------------" +./hugepage-mmap +if [ $? -ne 0 ]; then + echo "[FAIL]" +else + echo "[PASS]" +fi + +shmmax=`cat /proc/sys/kernel/shmmax` +shmall=`cat /proc/sys/kernel/shmall` +echo 268435456 > /proc/sys/kernel/shmmax +echo 4194304 > /proc/sys/kernel/shmall +echo "--------------------" +echo "runing hugepage-shm" +echo "--------------------" +./hugepage-shm +if [ $? -ne 0 ]; then + echo "[FAIL]" +else + echo "[PASS]" +fi +echo $shmmax > /proc/sys/kernel/shmmax +echo $shmall > /proc/sys/kernel/shmall + +echo "--------------------" +echo "runing map_hugetlb" +echo "--------------------" +./map_hugetlb +if [ $? -ne 0 ]; then + echo "[FAIL]" +else + echo "[PASS]" +fi + +#cleanup +umount $mnt +rm -rf $mnt +echo $nr_hugepgs > /proc/sys/vm/nr_hugepages diff --git a/tools/vm/Makefile b/tools/vm/Makefile new file mode 100644 index 0000000..8e30e5c --- /dev/null +++ b/tools/vm/Makefile @@ -0,0 +1,11 @@ +# Makefile for vm tools + +CC = $(CROSS_COMPILE)gcc +CFLAGS = -Wall -Wextra + +all: page-types slabinfo +%: %.c + $(CC) $(CFLAGS) -o $@ $^ + +clean: + $(RM) page-types slabinfo diff --git a/Documentation/vm/page-types.c b/tools/vm/page-types.c index 0b13f02..7dab7b25 100644 --- a/Documentation/vm/page-types.c +++ b/tools/vm/page-types.c @@ -124,7 +124,7 @@ #define BIT(name) (1ULL << KPF_##name) #define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL)) -static const char *page_flag_names[] = { +static const char * const page_flag_names[] = { [KPF_LOCKED] = "L:locked", [KPF_ERROR] = "E:error", [KPF_REFERENCED] = "R:referenced", @@ -166,7 +166,7 @@ static const char *page_flag_names[] = { }; -static const char *debugfs_known_mountpoints[] = { +static const char * const debugfs_known_mountpoints[] = { "/sys/kernel/debug", "/debug", 0, @@ -215,7 +215,7 @@ static int hwpoison_forget_fd; static unsigned long total_pages; static unsigned long nr_pages[HASH_SIZE]; -static uint64_t page_flags[HASH_SIZE]; +static uint64_t page_flags[HASH_SIZE]; /* diff --git a/tools/slub/slabinfo.c b/tools/vm/slabinfo.c index 164cbcf..164cbcf 100644 --- a/tools/slub/slabinfo.c +++ b/tools/vm/slabinfo.c |