diff options
63 files changed, 802 insertions, 318 deletions
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl index 0b1a3f9..a0d479d 100644 --- a/Documentation/DocBook/kernel-locking.tmpl +++ b/Documentation/DocBook/kernel-locking.tmpl @@ -1961,6 +1961,12 @@ machines due to caching. </sect1> </chapter> + <chapter id="apiref"> + <title>Mutex API reference</title> +!Iinclude/linux/mutex.h +!Ekernel/mutex.c + </chapter> + <chapter id="references"> <title>Further reading</title> diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt index c91ccc0..38c10fd 100644 --- a/Documentation/mutex-design.txt +++ b/Documentation/mutex-design.txt @@ -9,7 +9,7 @@ firstly, there's nothing wrong with semaphores. But if the simpler mutex semantics are sufficient for your code, then there are a couple of advantages of mutexes: - - 'struct mutex' is smaller on most architectures: .e.g on x86, + - 'struct mutex' is smaller on most architectures: E.g. on x86, 'struct semaphore' is 20 bytes, 'struct mutex' is 16 bytes. A smaller structure size means less RAM footprint, and better CPU-cache utilization. @@ -136,3 +136,4 @@ the APIs of 'struct mutex' have been streamlined: void mutex_lock_nested(struct mutex *lock, unsigned int subclass); int mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass); + int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); diff --git a/MAINTAINERS b/MAINTAINERS index 087912a..7e189b5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4810,6 +4810,7 @@ RCUTORTURE MODULE M: Josh Triplett <josh@freedesktop.org> M: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> S: Supported +T: git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-2.6-rcu.git F: Documentation/RCU/torture.txt F: kernel/rcutorture.c @@ -4834,6 +4835,7 @@ M: Dipankar Sarma <dipankar@in.ibm.com> M: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> W: http://www.rdrop.com/users/paulmck/rclock/ S: Supported +T: git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-2.6-rcu.git F: Documentation/RCU/ F: include/linux/rcu* F: include/linux/srcu* diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index f35eb45..c4191b3 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -26,11 +26,11 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> -void * +void __iomem * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); void -iounmap_atomic(void *kvaddr, enum km_type type); +iounmap_atomic(void __iomem *kvaddr, enum km_type type); int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 224392d..5e97529 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -530,7 +530,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) err = -ENOMEM; goto out; } - if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) { kfree(b); err = -ENOMEM; goto out; @@ -543,7 +543,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) #ifndef CONFIG_SMP cpumask_setall(b->cpus); #else - cpumask_copy(b->cpus, c->llc_shared_map); + cpumask_set_cpu(cpu, b->cpus); #endif per_cpu(threshold_banks, cpu)[bank] = b; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index c2a8b26..d9368ee 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -202,10 +202,11 @@ static int therm_throt_process(bool new_event, int event, int level) #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ -static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) +static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, + unsigned int cpu) { int err; - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + struct cpuinfo_x86 *c = &cpu_data(cpu); err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); if (err) @@ -251,7 +252,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: mutex_lock(&therm_cpu_lock); - err = thermal_throttle_add_dev(sys_dev); + err = thermal_throttle_add_dev(sys_dev, cpu); mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; @@ -287,7 +288,7 @@ static __init int thermal_throttle_init_device(void) #endif /* connect live CPUs to sysfs */ for_each_online_cpu(cpu) { - err = thermal_throttle_add_dev(get_cpu_sysdev(cpu)); + err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); WARN_ON(err); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index f2da20f..3efdf28 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1154,7 +1154,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) /* * event overflow */ - handled = 1; + handled++; data.period = event->hw.last_period; if (!x86_perf_event_set_period(event)) @@ -1200,12 +1200,20 @@ void perf_events_lapic_init(void) apic_write(APIC_LVTPC, APIC_DM_NMI); } +struct pmu_nmi_state { + unsigned int marked; + int handled; +}; + +static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi); + static int __kprobes perf_event_nmi_handler(struct notifier_block *self, unsigned long cmd, void *__args) { struct die_args *args = __args; - struct pt_regs *regs; + unsigned int this_nmi; + int handled; if (!atomic_read(&active_events)) return NOTIFY_DONE; @@ -1214,22 +1222,47 @@ perf_event_nmi_handler(struct notifier_block *self, case DIE_NMI: case DIE_NMI_IPI: break; - + case DIE_NMIUNKNOWN: + this_nmi = percpu_read(irq_stat.__nmi_count); + if (this_nmi != __get_cpu_var(pmu_nmi).marked) + /* let the kernel handle the unknown nmi */ + return NOTIFY_DONE; + /* + * This one is a PMU back-to-back nmi. Two events + * trigger 'simultaneously' raising two back-to-back + * NMIs. If the first NMI handles both, the latter + * will be empty and daze the CPU. So, we drop it to + * avoid false-positive 'unknown nmi' messages. + */ + return NOTIFY_STOP; default: return NOTIFY_DONE; } - regs = args->regs; - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* - * Can't rely on the handled return value to say it was our NMI, two - * events could trigger 'simultaneously' raising two back-to-back NMIs. - * - * If the first NMI handles both, the latter will be empty and daze - * the CPU. - */ - x86_pmu.handle_irq(regs); + + handled = x86_pmu.handle_irq(args->regs); + if (!handled) + return NOTIFY_DONE; + + this_nmi = percpu_read(irq_stat.__nmi_count); + if ((handled > 1) || + /* the next nmi could be a back-to-back nmi */ + ((__get_cpu_var(pmu_nmi).marked == this_nmi) && + (__get_cpu_var(pmu_nmi).handled > 1))) { + /* + * We could have two subsequent back-to-back nmis: The + * first handles more than one counter, the 2nd + * handles only one counter and the 3rd handles no + * counter. + * + * This is the 2nd nmi because the previous was + * handling more than one counter. We will mark the + * next (3rd) and then drop it if unhandled. + */ + __get_cpu_var(pmu_nmi).marked = this_nmi + 1; + __get_cpu_var(pmu_nmi).handled = handled; + } return NOTIFY_STOP; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index d8d86d0..ee05c90 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -712,7 +712,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) struct perf_sample_data data; struct cpu_hw_events *cpuc; int bit, loops; - u64 ack, status; + u64 status; + int handled = 0; perf_sample_data_init(&data, 0); @@ -728,6 +729,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) loops = 0; again: + intel_pmu_ack_status(status); if (++loops > 100) { WARN_ONCE(1, "perfevents: irq loop stuck!\n"); perf_event_print_debug(); @@ -736,19 +738,22 @@ again: } inc_irq_stat(apic_perf_irqs); - ack = status; intel_pmu_lbr_read(); /* * PEBS overflow sets bit 62 in the global status register */ - if (__test_and_clear_bit(62, (unsigned long *)&status)) + if (__test_and_clear_bit(62, (unsigned long *)&status)) { + handled++; x86_pmu.drain_pebs(regs); + } for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; + handled++; + if (!test_bit(bit, cpuc->active_mask)) continue; @@ -761,8 +766,6 @@ again: x86_pmu_stop(event); } - intel_pmu_ack_status(ack); - /* * Repeat if there is more work to be done: */ @@ -772,7 +775,7 @@ again: done: intel_pmu_enable_all(0); - return 1; + return handled; } static struct event_constraint * diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 7e578e9..b560db3 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -692,7 +692,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) inc_irq_stat(apic_perf_irqs); } - return handled > 0; + return handled; } /* diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index a874495..e2a5952 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -45,8 +45,7 @@ void __init setup_trampoline_page_table(void) /* Copy kernel address range */ clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min_t(unsigned long, KERNEL_PGD_PTRS, - KERNEL_PGD_BOUNDARY)); + KERNEL_PGD_PTRS); /* Initialize low mappings */ clone_pgd_range(trampoline_pg_dir, diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 84e236c..72fc70c 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -74,7 +74,7 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) /* * Map 'pfn' using fixed map 'type' and protections 'prot' */ -void * +void __iomem * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) { /* @@ -86,12 +86,12 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) prot = PAGE_KERNEL_UC_MINUS; - return kmap_atomic_prot_pfn(pfn, type, prot); + return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, type, prot); } EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); void -iounmap_atomic(void *kvaddr, enum km_type type) +iounmap_atomic(void __iomem *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index f6b48f6..cfe4faa 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -568,8 +568,13 @@ static int __init init_sysfs(void) int error; error = sysdev_class_register(&oprofile_sysclass); - if (!error) - error = sysdev_register(&device_oprofile); + if (error) + return error; + + error = sysdev_register(&device_oprofile); + if (error) + sysdev_class_unregister(&oprofile_sysclass); + return error; } @@ -580,8 +585,10 @@ static void exit_sysfs(void) } #else -#define init_sysfs() do { } while (0) -#define exit_sysfs() do { } while (0) + +static inline int init_sysfs(void) { return 0; } +static inline void exit_sysfs(void) { } + #endif /* CONFIG_PM */ static int __init p4_init(char **cpu_type) @@ -695,6 +702,8 @@ int __init op_nmi_init(struct oprofile_operations *ops) char *cpu_type = NULL; int ret = 0; + using_nmi = 0; + if (!cpu_has_apic) return -ENODEV; @@ -774,7 +783,10 @@ int __init op_nmi_init(struct oprofile_operations *ops) mux_init(ops); - init_sysfs(); + ret = init_sysfs(); + if (ret) + return ret; + using_nmi = 1; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); return 0; diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c index 2bbeaae..38df8c1 100644 --- a/drivers/char/vt_ioctl.c +++ b/drivers/char/vt_ioctl.c @@ -533,11 +533,14 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, case KIOCSOUND: if (!perm) goto eperm; - /* FIXME: This is an old broken API but we need to keep it - supported and somehow separate the historic advertised - tick rate from any real one */ + /* + * The use of PIT_TICK_RATE is historic, it used to be + * the platform-dependent CLOCK_TICK_RATE between 2.6.12 + * and 2.6.36, which was a minor but unfortunate ABI + * change. + */ if (arg) - arg = CLOCK_TICK_RATE / arg; + arg = PIT_TICK_RATE / arg; kd_mksound(arg, 0); break; @@ -553,11 +556,8 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, */ ticks = HZ * ((arg >> 16) & 0xffff) / 1000; count = ticks ? (arg & 0xffff) : 0; - /* FIXME: This is an old broken API but we need to keep it - supported and somehow separate the historic advertised - tick rate from any real one */ if (count) - count = CLOCK_TICK_RATE / count; + count = PIT_TICK_RATE / count; kd_mksound(count, ticks); break; } diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h index 8f0caf7..78fbe9f 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.h +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h @@ -53,7 +53,7 @@ #define T3_MAX_PBL_SIZE 256 #define T3_MAX_RQ_SIZE 1024 #define T3_MAX_QP_DEPTH (T3_MAX_RQ_SIZE-1) -#define T3_MAX_CQ_DEPTH 262144 +#define T3_MAX_CQ_DEPTH 65536 #define T3_MAX_NUM_STAG (1<<15) #define T3_MAX_MR_SIZE 0x100000000ULL #define T3_PAGESIZE_MASK 0xffff000 /* 4KB-128MB */ diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 443cea5..61e0efd 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -502,7 +502,9 @@ int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb, static void nes_retrans_expired(struct nes_cm_node *cm_node) { struct iw_cm_id *cm_id = cm_node->cm_id; - switch (cm_node->state) { + enum nes_cm_node_state state = cm_node->state; + cm_node->state = NES_CM_STATE_CLOSED; + switch (state) { case NES_CM_STATE_SYN_RCVD: case NES_CM_STATE_CLOSING: rem_ref_cm_node(cm_node->cm_core, cm_node); @@ -511,7 +513,6 @@ static void nes_retrans_expired(struct nes_cm_node *cm_node) case NES_CM_STATE_FIN_WAIT1: if (cm_node->cm_id) cm_id->rem_ref(cm_id); - cm_node->state = NES_CM_STATE_CLOSED; send_reset(cm_node, NULL); break; default: @@ -1439,9 +1440,6 @@ static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, break; case NES_CM_STATE_MPAREQ_RCVD: passive_state = atomic_add_return(1, &cm_node->passive_state); - if (passive_state == NES_SEND_RESET_EVENT) - create_event(cm_node, NES_CM_EVENT_RESET); - cm_node->state = NES_CM_STATE_CLOSED; dev_kfree_skb_any(skb); break; case NES_CM_STATE_ESTABLISHED: @@ -1456,6 +1454,7 @@ static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb, case NES_CM_STATE_CLOSED: drop_packet(skb); break; + case NES_CM_STATE_FIN_WAIT2: case NES_CM_STATE_FIN_WAIT1: case NES_CM_STATE_LAST_ACK: cm_node->cm_id->rem_ref(cm_node->cm_id); @@ -2777,6 +2776,12 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) return -EINVAL; } + passive_state = atomic_add_return(1, &cm_node->passive_state); + if (passive_state == NES_SEND_RESET_EVENT) { + rem_ref_cm_node(cm_node->cm_core, cm_node); + return -ECONNRESET; + } + /* associate the node with the QP */ nesqp->cm_node = (void *)cm_node; cm_node->nesqp = nesqp; @@ -2979,9 +2984,6 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) printk(KERN_ERR "%s[%u] OFA CM event_handler returned, " "ret=%d\n", __func__, __LINE__, ret); - passive_state = atomic_add_return(1, &cm_node->passive_state); - if (passive_state == NES_SEND_RESET_EVENT) - create_event(cm_node, NES_CM_EVENT_RESET); return 0; } diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index f8233c8..1980a46 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -3468,6 +3468,19 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, return; /* Ignore it, wait for close complete */ if (atomic_inc_return(&nesqp->close_timer_started) == 1) { + if ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) && + (nesqp->ibqp_state == IB_QPS_RTS) && + ((nesadapter->eeprom_version >> 16) != NES_A0)) { + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; + next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; + nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; + spin_unlock_irqrestore(&nesqp->lock, flags); + nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); + nes_cm_disconn(nesqp); + } nesqp->cm_id->add_ref(nesqp->cm_id); schedule_nes_timer(nesqp->cm_node, (struct sk_buff *)nesqp, NES_TIMER_TYPE_CLOSE, 1, 0); @@ -3477,7 +3490,6 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount), async_event_id, nesqp->last_aeq, tcp_state); } - break; case NES_AEQE_AEID_LLP_CLOSE_COMPLETE: if (nesqp->term_flags) { diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h index aa9183d..1204c34 100644 --- a/drivers/infiniband/hw/nes/nes_hw.h +++ b/drivers/infiniband/hw/nes/nes_hw.h @@ -45,6 +45,7 @@ #define NES_PHY_TYPE_KR 9 #define NES_MULTICAST_PF_MAX 8 +#define NES_A0 3 enum pci_regs { NES_INT_STAT = 0x0000, diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index 6dfdd49..10560c7 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -1446,14 +1446,14 @@ static int nes_netdev_set_pauseparam(struct net_device *netdev, NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200)); u32temp |= NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE; nes_write_indexed(nesdev, - NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE + (nesdev->mac_index*0x200), u32temp); + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp); nesdev->disable_tx_flow_control = 0; } else if ((et_pauseparam->tx_pause == 0) && (nesdev->disable_tx_flow_control == 0)) { u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200)); u32temp &= ~NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE; nes_write_indexed(nesdev, - NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE + (nesdev->mac_index*0x200), u32temp); + NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp); nesdev->disable_tx_flow_control = 1; } if ((et_pauseparam->rx_pause == 1) && (nesdev->disable_rx_flow_control == 1)) { diff --git a/drivers/input/input.c b/drivers/input/input.c index a9b025f..ab69820 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -1599,11 +1599,14 @@ EXPORT_SYMBOL(input_free_device); * @dev: input device supporting MT events and finger tracking * @num_slots: number of slots used by the device * - * This function allocates all necessary memory for MT slot handling - * in the input device, and adds ABS_MT_SLOT to the device capabilities. + * This function allocates all necessary memory for MT slot handling in the + * input device, and adds ABS_MT_SLOT to the device capabilities. All slots + * are initially marked as unused iby setting ABS_MT_TRACKING_ID to -1. */ int input_mt_create_slots(struct input_dev *dev, unsigned int num_slots) { + int i; + if (!num_slots) return 0; @@ -1614,6 +1617,10 @@ int input_mt_create_slots(struct input_dev *dev, unsigned int num_slots) dev->mtsize = num_slots; input_set_abs_params(dev, ABS_MT_SLOT, 0, num_slots - 1, 0, 0); + /* Mark slots as 'unused' */ + for (i = 0; i < num_slots; i++) + dev->mt[i].abs[ABS_MT_TRACKING_ID - ABS_MT_FIRST] = -1; + return 0; } EXPORT_SYMBOL(input_mt_create_slots); diff --git a/drivers/input/mouse/bcm5974.c b/drivers/input/mouse/bcm5974.c index ea67c49..b952317 100644 --- a/drivers/input/mouse/bcm5974.c +++ b/drivers/input/mouse/bcm5974.c @@ -337,10 +337,14 @@ static void report_finger_data(struct input_dev *input, const struct bcm5974_config *cfg, const struct tp_finger *f) { - input_report_abs(input, ABS_MT_TOUCH_MAJOR, raw2int(f->force_major)); - input_report_abs(input, ABS_MT_TOUCH_MINOR, raw2int(f->force_minor)); - input_report_abs(input, ABS_MT_WIDTH_MAJOR, raw2int(f->size_major)); - input_report_abs(input, ABS_MT_WIDTH_MINOR, raw2int(f->size_minor)); + input_report_abs(input, ABS_MT_TOUCH_MAJOR, + raw2int(f->force_major) << 1); + input_report_abs(input, ABS_MT_TOUCH_MINOR, + raw2int(f->force_minor) << 1); + input_report_abs(input, ABS_MT_WIDTH_MAJOR, + raw2int(f->size_major) << 1); + input_report_abs(input, ABS_MT_WIDTH_MINOR, + raw2int(f->size_minor) << 1); input_report_abs(input, ABS_MT_ORIENTATION, MAX_FINGER_ORIENTATION - raw2int(f->orientation)); input_report_abs(input, ABS_MT_POSITION_X, raw2int(f->abs_x)); diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c index 46e4ba0..f585131 100644 --- a/drivers/input/serio/i8042.c +++ b/drivers/input/serio/i8042.c @@ -1485,8 +1485,8 @@ static int __init i8042_init(void) static void __exit i8042_exit(void) { - platform_driver_unregister(&i8042_driver); platform_device_unregister(i8042_platform_device); + platform_driver_unregister(&i8042_driver); i8042_platform_exit(); panic_blink = NULL; diff --git a/drivers/input/tablet/wacom_wac.c b/drivers/input/tablet/wacom_wac.c index 40d77ba..6e29bad 100644 --- a/drivers/input/tablet/wacom_wac.c +++ b/drivers/input/tablet/wacom_wac.c @@ -243,10 +243,10 @@ static int wacom_graphire_irq(struct wacom_wac *wacom) if (features->type == WACOM_G4 || features->type == WACOM_MO) { input_report_abs(input, ABS_DISTANCE, data[6] & 0x3f); - rw = (signed)(data[7] & 0x04) - (data[7] & 0x03); + rw = (data[7] & 0x04) - (data[7] & 0x03); } else { input_report_abs(input, ABS_DISTANCE, data[7] & 0x3f); - rw = -(signed)data[6]; + rw = -(signed char)data[6]; } input_report_rel(input, REL_WHEEL, rw); } diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index a9352b2..b7e755f 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -141,16 +141,6 @@ static struct notifier_block module_load_nb = { .notifier_call = module_load_notify, }; - -static void end_sync(void) -{ - end_cpu_work(); - /* make sure we don't leak task structs */ - process_task_mortuary(); - process_task_mortuary(); -} - - int sync_start(void) { int err; @@ -158,7 +148,7 @@ int sync_start(void) if (!zalloc_cpumask_var(&marked_cpus, GFP_KERNEL)) return -ENOMEM; - start_cpu_work(); + mutex_lock(&buffer_mutex); err = task_handoff_register(&task_free_nb); if (err) @@ -173,7 +163,10 @@ int sync_start(void) if (err) goto out4; + start_cpu_work(); + out: + mutex_unlock(&buffer_mutex); return err; out4: profile_event_unregister(PROFILE_MUNMAP, &munmap_nb); @@ -182,7 +175,6 @@ out3: out2: task_handoff_unregister(&task_free_nb); out1: - end_sync(); free_cpumask_var(marked_cpus); goto out; } @@ -190,11 +182,20 @@ out1: void sync_stop(void) { + /* flush buffers */ + mutex_lock(&buffer_mutex); + end_cpu_work(); unregister_module_notifier(&module_load_nb); profile_event_unregister(PROFILE_MUNMAP, &munmap_nb); profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb); task_handoff_unregister(&task_free_nb); - end_sync(); + mutex_unlock(&buffer_mutex); + flush_scheduled_work(); + + /* make sure we don't leak task structs */ + process_task_mortuary(); + process_task_mortuary(); + free_cpumask_var(marked_cpus); } diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index 219f79e..f179ac2 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -120,8 +120,6 @@ void end_cpu_work(void) cancel_delayed_work(&b->work); } - - flush_scheduled_work(); } /* diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 69ad053..d367af1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -276,7 +276,7 @@ static void flush_bg_queue(struct fuse_conn *fc) * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) -__releases(&fc->lock) +__releases(fc->lock) { void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; @@ -306,8 +306,8 @@ __releases(&fc->lock) static void wait_answer_interruptible(struct fuse_conn *fc, struct fuse_req *req) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { if (signal_pending(current)) return; @@ -325,8 +325,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) } static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { if (!fc->no_interrupt) { /* Any signal may interrupt this */ @@ -905,8 +905,8 @@ static int request_pending(struct fuse_conn *fc) /* Wait until a request is available on the pending list */ static void request_wait(struct fuse_conn *fc) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { DECLARE_WAITQUEUE(wait, current); @@ -934,7 +934,7 @@ __acquires(&fc->lock) */ static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, size_t nbytes, struct fuse_req *req) -__releases(&fc->lock) +__releases(fc->lock) { struct fuse_in_header ih; struct fuse_interrupt_in arg; @@ -1720,8 +1720,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) * This function releases and reacquires fc->lock */ static void end_requests(struct fuse_conn *fc, struct list_head *head) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { while (!list_empty(head)) { struct fuse_req *req; @@ -1744,8 +1744,8 @@ __acquires(&fc->lock) * locked). */ static void end_io_requests(struct fuse_conn *fc) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { while (!list_empty(&fc->io)) { struct fuse_req *req = @@ -1769,6 +1769,16 @@ __acquires(&fc->lock) } } +static void end_queued_requests(struct fuse_conn *fc) +__releases(fc->lock) +__acquires(fc->lock) +{ + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + end_requests(fc, &fc->pending); + end_requests(fc, &fc->processing); +} + /* * Abort all requests. * @@ -1795,8 +1805,7 @@ void fuse_abort_conn(struct fuse_conn *fc) fc->connected = 0; fc->blocked = 0; end_io_requests(fc); - end_requests(fc, &fc->pending); - end_requests(fc, &fc->processing); + end_queued_requests(fc); wake_up_all(&fc->waitq); wake_up_all(&fc->blocked_waitq); kill_fasync(&fc->fasync, SIGIO, POLL_IN); @@ -1811,8 +1820,9 @@ int fuse_dev_release(struct inode *inode, struct file *file) if (fc) { spin_lock(&fc->lock); fc->connected = 0; - end_requests(fc, &fc->pending); - end_requests(fc, &fc->processing); + fc->blocked = 0; + end_queued_requests(fc); + wake_up_all(&fc->blocked_waitq); spin_unlock(&fc->lock); fuse_conn_put(fc); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 147c1f7..c822458 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1144,8 +1144,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) /* Called under fc->lock, may release and reacquire it */ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { struct fuse_inode *fi = get_fuse_inode(req->inode); loff_t size = i_size_read(req->inode); @@ -1183,8 +1183,8 @@ __acquires(&fc->lock) * Called with fc->lock */ void fuse_flush_writepages(struct inode *inode) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3dfef06..cf0d2ff 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -440,7 +440,7 @@ test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { static int nfs4_access_to_omode(u32 access) { - switch (access) { + switch (access & NFS4_SHARE_ACCESS_BOTH) { case NFS4_SHARE_ACCESS_READ: return O_RDONLY; case NFS4_SHARE_ACCESS_WRITE: diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 215e12c..592fae5 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6672,7 +6672,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, last_page_bytes = PAGE_ALIGN(end); index = start >> PAGE_CACHE_SHIFT; do { - pages[numpages] = grab_cache_page(mapping, index); + pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS); if (!pages[numpages]) { ret = -ENOMEM; mlog_errno(ret); diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index ec6d123..c7ee03c 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -439,7 +439,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize, ocfs2_blockcheck_inc_failure(stats); mlog(ML_ERROR, - "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", + "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); /* Ok, try ECC fixups */ @@ -453,7 +453,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize, goto out; } - mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", + mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); rc = -EIO; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 81296b4..9a03c15 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -36,6 +36,7 @@ #include <linux/writeback.h> #include <linux/falloc.h> #include <linux/quotaops.h> +#include <linux/blkdev.h> #define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> @@ -190,8 +191,16 @@ static int ocfs2_sync_file(struct file *file, int datasync) if (err) goto bail; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { + /* + * We still have to flush drive's caches to get data to the + * platter + */ + if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, + NULL, BLKDEV_IFL_WAIT); goto bail; + } journal = osb->journal->j_journal; err = jbd2_journal_force_commit(journal); @@ -774,7 +783,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); BUG_ON(abs_from & (inode->i_blkbits - 1)); - page = grab_cache_page(mapping, index); + page = find_or_create_page(mapping, index, GFP_NOFS); if (!page) { ret = -ENOMEM; mlog_errno(ret); @@ -2329,7 +2338,7 @@ out_dio: BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || - ((file->f_flags & O_DIRECT) && has_refcount)) { + ((file->f_flags & O_DIRECT) && !direct_io)) { ret = filemap_fdatawrite_range(file->f_mapping, pos, pos + count - 1); if (ret < 0) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 0492464..eece3e0 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -488,7 +488,11 @@ static int ocfs2_read_locked_inode(struct inode *inode, OCFS2_BH_IGNORE_CACHE); } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); - if (!status) + /* + * If buffer is in jbd, then its checksum may not have been + * computed as yet. + */ + if (!status && !buffer_jbd(bh)) status = ocfs2_validate_inode_block(osb->sb, bh); } if (status < 0) { diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index af2b8fe..4c18f4a 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -74,9 +74,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, /* * Another node might have truncated while we were waiting on * cluster locks. + * We don't check size == 0 before the shift. This is borrowed + * from do_generic_file_read. */ - last_index = size >> PAGE_CACHE_SHIFT; - if (page->index > last_index) { + last_index = (size - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!size || page->index > last_index)) { ret = -EINVAL; goto out; } @@ -107,7 +109,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, * because the "write" would invalidate their data. */ if (page->index == last_index) - len = size & ~PAGE_CACHE_MASK; + len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, &fsdata, di_bh, page); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f171b51..a00dda2 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -472,32 +472,23 @@ leave: return status; } -static int ocfs2_mknod_locked(struct ocfs2_super *osb, - struct inode *dir, - struct inode *inode, - dev_t dev, - struct buffer_head **new_fe_bh, - struct buffer_head *parent_fe_bh, - handle_t *handle, - struct ocfs2_alloc_context *inode_ac) +static int __ocfs2_mknod_locked(struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac, + u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit) { int status = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dinode *fe = NULL; struct ocfs2_extent_list *fel; - u64 suballoc_loc, fe_blkno = 0; - u16 suballoc_bit; u16 feat; *new_fe_bh = NULL; - status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh, - inode_ac, &suballoc_loc, - &suballoc_bit, &fe_blkno); - if (status < 0) { - mlog_errno(status); - goto leave; - } - /* populate as many fields early on as possible - many of * these are used by the support functions here and in * callers. */ @@ -591,6 +582,34 @@ leave: return status; } +static int ocfs2_mknod_locked(struct ocfs2_super *osb, + struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac) +{ + int status = 0; + u64 suballoc_loc, fe_blkno = 0; + u16 suballoc_bit; + + *new_fe_bh = NULL; + + status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh, + inode_ac, &suballoc_loc, + &suballoc_bit, &fe_blkno); + if (status < 0) { + mlog_errno(status); + return status; + } + + return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, + parent_fe_bh, handle, inode_ac, + fe_blkno, suballoc_loc, suballoc_bit); +} + static int ocfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) @@ -1852,61 +1871,117 @@ bail: return status; } -static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, - struct inode **ret_orphan_dir, - u64 blkno, - char *name, - struct ocfs2_dir_lookup_result *lookup) +static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + struct buffer_head **ret_orphan_dir_bh) { struct inode *orphan_dir_inode; struct buffer_head *orphan_dir_bh = NULL; - int status = 0; - - status = ocfs2_blkno_stringify(blkno, name); - if (status < 0) { - mlog_errno(status); - return status; - } + int ret = 0; orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, osb->slot_num); if (!orphan_dir_inode) { - status = -ENOENT; - mlog_errno(status); - return status; + ret = -ENOENT; + mlog_errno(ret); + return ret; } mutex_lock(&orphan_dir_inode->i_mutex); - status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); - if (status < 0) { - mlog_errno(status); - goto leave; + ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (ret < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + + mlog_errno(ret); + return ret; } - status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, - orphan_dir_bh, name, - OCFS2_ORPHAN_NAMELEN, lookup); - if (status < 0) { - ocfs2_inode_unlock(orphan_dir_inode, 1); + *ret_orphan_dir = orphan_dir_inode; + *ret_orphan_dir_bh = orphan_dir_bh; - mlog_errno(status); - goto leave; + return 0; +} + +static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, + struct buffer_head *orphan_dir_bh, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); + + ret = ocfs2_blkno_stringify(blkno, name); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, + orphan_dir_bh, name, + OCFS2_ORPHAN_NAMELEN, lookup); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + return 0; +} + +/** + * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for + * insertion of an orphan. + * @osb: ocfs2 file system + * @ret_orphan_dir: Orphan dir inode - returned locked! + * @blkno: Actual block number of the inode to be inserted into orphan dir. + * @lookup: dir lookup result, to be passed back into functions like + * ocfs2_orphan_add + * + * Returns zero on success and the ret_orphan_dir, name and lookup + * fields will be populated. + * + * Returns non-zero on failure. + */ +static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + int ret = 0; + + ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode, + &orphan_dir_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, + blkno, name, lookup); + if (ret < 0) { + mlog_errno(ret); + goto out; } *ret_orphan_dir = orphan_dir_inode; -leave: - if (status) { +out: + brelse(orphan_dir_bh); + + if (ret) { + ocfs2_inode_unlock(orphan_dir_inode, 1); mutex_unlock(&orphan_dir_inode->i_mutex); iput(orphan_dir_inode); } - brelse(orphan_dir_bh); - - mlog_exit(status); - return status; + mlog_exit(ret); + return ret; } static int ocfs2_orphan_add(struct ocfs2_super *osb, @@ -2053,6 +2128,99 @@ leave: return status; } +/** + * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to recieve a newly + * allocated file. This is different from the typical 'add to orphan dir' + * operation in that the inode does not yet exist. This is a problem because + * the orphan dir stringifies the inode block number to come up with it's + * dirent. Obviously if the inode does not yet exist we have a chicken and egg + * problem. This function works around it by calling deeper into the orphan + * and suballoc code than other callers. Use this only by necessity. + * @dir: The directory which this inode will ultimately wind up under - not the + * orphan dir! + * @dir_bh: buffer_head the @dir inode block + * @orphan_name: string of length (CFS2_ORPHAN_NAMELEN + 1). Will be filled + * with the string to be used for orphan dirent. Pass back to the orphan dir + * code. + * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan + * dir code. + * @ret_di_blkno: block number where the new inode will be allocated. + * @orphan_insert: Dir insert context to be passed back into orphan dir code. + * @ret_inode_ac: Inode alloc context to be passed back to the allocator. + * + * Returns zero on success and the ret_orphan_dir, name and lookup + * fields will be populated. + * + * Returns non-zero on failure. + */ +static int ocfs2_prep_new_orphaned_file(struct inode *dir, + struct buffer_head *dir_bh, + char *orphan_name, + struct inode **ret_orphan_dir, + u64 *ret_di_blkno, + struct ocfs2_dir_lookup_result *orphan_insert, + struct ocfs2_alloc_context **ret_inode_ac) +{ + int ret; + u64 di_blkno; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct inode *orphan_dir = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + + ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + /* reserve an inode spot */ + ret = ocfs2_reserve_new_inode(osb, &inode_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac, + &di_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, + di_blkno, orphan_name, orphan_insert); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out: + if (ret == 0) { + *ret_orphan_dir = orphan_dir; + *ret_di_blkno = di_blkno; + *ret_inode_ac = inode_ac; + /* + * orphan_name and orphan_insert are already up to + * date via prepare_orphan_dir + */ + } else { + /* Unroll reserve_new_inode* */ + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + /* Unroll orphan dir locking */ + mutex_unlock(&orphan_dir->i_mutex); + ocfs2_inode_unlock(orphan_dir, 1); + iput(orphan_dir); + } + + brelse(orphan_dir_bh); + + return 0; +} + int ocfs2_create_inode_in_orphan(struct inode *dir, int mode, struct inode **new_inode) @@ -2068,6 +2236,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, struct buffer_head *new_di_bh = NULL; struct ocfs2_alloc_context *inode_ac = NULL; struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + u64 uninitialized_var(di_blkno), suballoc_loc; + u16 suballoc_bit; status = ocfs2_inode_lock(dir, &parent_di_bh, 1); if (status < 0) { @@ -2076,20 +2246,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, return status; } - /* - * We give the orphan dir the root blkno to fake an orphan name, - * and allocate enough space for our insertion. - */ - status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, - osb->root_blkno, - orphan_name, &orphan_insert); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - /* reserve an inode spot */ - status = ocfs2_reserve_new_inode(osb, &inode_ac); + status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh, + orphan_name, &orphan_dir, + &di_blkno, &orphan_insert, &inode_ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -2116,17 +2275,20 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; did_quota_inode = 1; - inode->i_nlink = 0; - /* do the real work now. */ - status = ocfs2_mknod_locked(osb, dir, inode, - 0, &new_di_bh, parent_di_bh, handle, - inode_ac); + status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac, + &suballoc_loc, + &suballoc_bit, di_blkno); if (status < 0) { mlog_errno(status); goto leave; } - status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name); + inode->i_nlink = 0; + /* do the real work now. */ + status = __ocfs2_mknod_locked(dir, inode, + 0, &new_di_bh, parent_di_bh, handle, + inode_ac, di_blkno, suballoc_loc, + suballoc_bit); if (status < 0) { mlog_errno(status); goto leave; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 73a11cc..0afeda831 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2960,7 +2960,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (map_end & (PAGE_CACHE_SIZE - 1)) to = map_end & (PAGE_CACHE_SIZE - 1); - page = grab_cache_page(mapping, page_index); + page = find_or_create_page(mapping, page_index, GFP_NOFS); /* * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page @@ -3179,7 +3179,8 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb, if (map_end > end) map_end = end; - page = grab_cache_page(context->inode->i_mapping, page_index); + page = find_or_create_page(context->inode->i_mapping, + page_index, GFP_NOFS); BUG_ON(!page); wait_on_page_writeback(page); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index a8e6a95..8a286f5 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -57,11 +57,28 @@ struct ocfs2_suballoc_result { u64 sr_bg_blkno; /* The bg we allocated from. Set to 0 when a block group is contiguous. */ + u64 sr_bg_stable_blkno; /* + * Doesn't change, always + * set to target block + * group descriptor + * block. + */ u64 sr_blkno; /* The first allocated block */ unsigned int sr_bit_offset; /* The bit in the bg */ unsigned int sr_bits; /* How many bits we claimed */ }; +static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) +{ + if (res->sr_blkno == 0) + return 0; + + if (res->sr_bg_blkno) + return res->sr_bg_blkno; + + return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset); +} + static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); @@ -138,6 +155,10 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) brelse(ac->ac_bh); ac->ac_bh = NULL; ac->ac_resv = NULL; + if (ac->ac_find_loc_priv) { + kfree(ac->ac_find_loc_priv); + ac->ac_find_loc_priv = NULL; + } } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) @@ -1678,6 +1699,15 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, if (!ret) ocfs2_bg_discontig_fix_result(ac, gd, res); + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; + + if (ac->ac_find_loc_only) + goto out_loc_only; + ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, res->sr_bits, le16_to_cpu(gd->bg_chain)); @@ -1691,6 +1721,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, if (ret < 0) mlog_errno(ret); +out_loc_only: *bits_left = le16_to_cpu(gd->bg_free_bits_count); out: @@ -1708,7 +1739,6 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, { int status; u16 chain; - u32 tmp_used; u64 next_group; struct inode *alloc_inode = ac->ac_inode; struct buffer_head *group_bh = NULL; @@ -1770,6 +1800,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, if (!status) ocfs2_bg_discontig_fix_result(ac, bg, res); + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; /* * Keep track of previous block descriptor read. When @@ -1796,22 +1831,17 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, } } - /* Ok, claim our bits now: set the info on dinode, chainlist - * and then the group */ - status = ocfs2_journal_access_di(handle, - INODE_CACHE(alloc_inode), - ac->ac_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { + if (ac->ac_find_loc_only) + goto out_loc_only; + + status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (status) { mlog_errno(status); goto bail; } - tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); - fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used); - le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits); - ocfs2_journal_dirty(handle, ac->ac_bh); - status = ocfs2_block_group_set_bits(handle, alloc_inode, bg, @@ -1826,6 +1856,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, (unsigned long long)le64_to_cpu(fe->i_blkno)); +out_loc_only: *bits_left = le16_to_cpu(bg->bg_free_bits_count); bail: brelse(group_bh); @@ -1845,6 +1876,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, int status; u16 victim, i; u16 bits_left = 0; + u64 hint = ac->ac_last_group; struct ocfs2_chain_list *cl; struct ocfs2_dinode *fe; @@ -1872,7 +1904,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, goto bail; } - res->sr_bg_blkno = ac->ac_last_group; + res->sr_bg_blkno = hint; if (res->sr_bg_blkno) { /* Attempt to short-circuit the usual search mechanism * by jumping straight to the most recently used @@ -1896,8 +1928,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); - if (!status) + if (!status) { + hint = ocfs2_group_from_res(res); goto set_hint; + } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -1920,8 +1954,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, ac->ac_chain = i; status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); - if (!status) + if (!status) { + hint = ocfs2_group_from_res(res); break; + } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -1936,7 +1972,7 @@ set_hint: if (bits_left < min_bits) ac->ac_last_group = 0; else - ac->ac_last_group = res->sr_bg_blkno; + ac->ac_last_group = hint; } bail: @@ -2016,6 +2052,136 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir, OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; } +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_suballoc_result *res; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given != 0); + BUG_ON(ac->ac_bits_wanted != 1); + BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + + res = kzalloc(sizeof(*res), GFP_NOFS); + if (res == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); + + /* + * The handle started here is for chain relink. Alternatively, + * we could just disable relink for these calls. + */ + handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + /* + * This will instruct ocfs2_claim_suballoc_bits and + * ocfs2_search_one_group to search but save actual allocation + * for later. + */ + ac->ac_find_loc_only = 1; + + ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ac->ac_find_loc_priv = res; + *fe_blkno = res->sr_blkno; + +out: + if (handle) + ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); + + if (ret) + kfree(res); + + return ret; +} + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno) +{ + int ret; + u16 chain; + struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv; + struct buffer_head *bg_bh = NULL; + struct ocfs2_group_desc *bg; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data; + + /* + * Since di_blkno is being passed back in, we check for any + * inconsistencies which may have happened between + * calls. These are code bugs as di_blkno is not expected to + * change once returned from ocfs2_find_new_inode_loc() + */ + BUG_ON(res->sr_blkno != di_blkno); + + ret = ocfs2_read_group_descriptor(ac->ac_inode, di, + res->sr_bg_stable_blkno, &bg_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + bg = (struct ocfs2_group_desc *) bg_bh->b_data; + chain = le16_to_cpu(bg->bg_chain); + + ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_block_group_set_bits(handle, + ac->ac_inode, + bg, + bg_bh, + res->sr_bit_offset, + res->sr_bits); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, + (unsigned long long)di_blkno); + + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); + + BUG_ON(res->sr_bits != 1); + + *suballoc_loc = res->sr_bg_blkno; + *suballoc_bit = res->sr_bit_offset; + ac->ac_bits_given++; + ocfs2_save_inode_ac_group(dir, ac); + +out: + brelse(bg_bh); + + return ret; +} + int ocfs2_claim_new_inode(handle_t *handle, struct inode *dir, struct buffer_head *parent_fe_bh, @@ -2567,7 +2733,8 @@ out: * suballoc_bit. */ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, - u16 *suballoc_slot, u16 *suballoc_bit) + u16 *suballoc_slot, u64 *group_blkno, + u16 *suballoc_bit) { int status; struct buffer_head *inode_bh = NULL; @@ -2604,6 +2771,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); if (suballoc_bit) *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); + if (group_blkno) + *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc); bail: brelse(inode_bh); @@ -2621,7 +2790,8 @@ bail: */ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, struct inode *suballoc, - struct buffer_head *alloc_bh, u64 blkno, + struct buffer_head *alloc_bh, + u64 group_blkno, u64 blkno, u16 bit, int *res) { struct ocfs2_dinode *alloc_di; @@ -2642,10 +2812,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, goto bail; } - if (alloc_di->i_suballoc_loc) - bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc); - else - bg_blkno = ocfs2_which_suballoc_group(blkno, bit); + bg_blkno = group_blkno ? group_blkno : + ocfs2_which_suballoc_group(blkno, bit); status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, &group_bh); if (status < 0) { @@ -2680,6 +2848,7 @@ bail: int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) { int status; + u64 group_blkno = 0; u16 suballoc_bit = 0, suballoc_slot = 0; struct inode *inode_alloc_inode; struct buffer_head *alloc_bh = NULL; @@ -2687,7 +2856,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) mlog_entry("blkno: %llu", (unsigned long long)blkno); status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, - &suballoc_bit); + &group_blkno, &suballoc_bit); if (status < 0) { mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); goto bail; @@ -2715,7 +2884,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) } status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, - blkno, suballoc_bit, res); + group_blkno, blkno, suballoc_bit, res); if (status < 0) mlog(ML_ERROR, "test suballoc bit failed %d\n", status); diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index a017dd3..b8afabf 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -56,6 +56,9 @@ struct ocfs2_alloc_context { u64 ac_max_block; /* Highest block number to allocate. 0 is is the same as ~0 - unlimited */ + int ac_find_loc_only; /* hack for reflink operation ordering */ + struct ocfs2_suballoc_result *ac_find_loc_priv; /* */ + struct ocfs2_alloc_reservation *ac_resv; }; @@ -197,4 +200,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, struct ocfs2_alloc_context **meta_ac); int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res); + + + +/* + * The following two interfaces are for ocfs2_create_inode_in_orphan(). + */ +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno); + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno); + #endif /* _CHAINALLOC_H_ */ diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 439fc1f..271afc4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -224,7 +224,8 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) /* We don't show the stack guard page in /proc/maps */ start = vma->vm_start; if (vma->vm_flags & VM_GROWSDOWN) - start += PAGE_SIZE; + if (!vma_stack_continue(vma->vm_prev, vma->vm_start)) + start += PAGE_SIZE; seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", start, diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 0a6b3d5..7fb5927 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -79,7 +79,7 @@ io_mapping_free(struct io_mapping *mapping) } /* Atomic map/unmap */ -static inline void * +static inline void __iomem * io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset, int slot) @@ -94,12 +94,12 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping, } static inline void -io_mapping_unmap_atomic(void *vaddr, int slot) +io_mapping_unmap_atomic(void __iomem *vaddr, int slot) { iounmap_atomic(vaddr, slot); } -static inline void * +static inline void __iomem * io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset) { resource_size_t phys_addr; @@ -111,7 +111,7 @@ io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset) } static inline void -io_mapping_unmap(void *vaddr) +io_mapping_unmap(void __iomem *vaddr) { iounmap(vaddr); } @@ -125,38 +125,38 @@ struct io_mapping; static inline struct io_mapping * io_mapping_create_wc(resource_size_t base, unsigned long size) { - return (struct io_mapping *) ioremap_wc(base, size); + return (struct io_mapping __force *) ioremap_wc(base, size); } static inline void io_mapping_free(struct io_mapping *mapping) { - iounmap(mapping); + iounmap((void __force __iomem *) mapping); } /* Atomic map/unmap */ -static inline void * +static inline void __iomem * io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset, int slot) { - return ((char *) mapping) + offset; + return ((char __force __iomem *) mapping) + offset; } static inline void -io_mapping_unmap_atomic(void *vaddr, int slot) +io_mapping_unmap_atomic(void __iomem *vaddr, int slot) { } /* Non-atomic map/unmap */ -static inline void * +static inline void __iomem * io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset) { - return ((char *) mapping) + offset; + return ((char __force __iomem *) mapping) + offset; } static inline void -io_mapping_unmap(void *vaddr) +io_mapping_unmap(void __iomem *vaddr) { } diff --git a/include/linux/lglock.h b/include/linux/lglock.h index b288cb7..f549056 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -150,7 +150,7 @@ int i; \ preempt_disable(); \ rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_); \ - for_each_online_cpu(i) { \ + for_each_possible_cpu(i) { \ arch_spinlock_t *lock; \ lock = &per_cpu(name##_lock, i); \ arch_spin_lock(lock); \ @@ -161,7 +161,7 @@ void name##_global_unlock(void) { \ int i; \ rwlock_release(&name##_lock_dep_map, 1, _RET_IP_); \ - for_each_online_cpu(i) { \ + for_each_possible_cpu(i) { \ arch_spinlock_t *lock; \ lock = &per_cpu(name##_lock, i); \ arch_spin_unlock(lock); \ diff --git a/include/linux/mm.h b/include/linux/mm.h index e6b1210..74949fb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -864,6 +864,12 @@ int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); +/* Is the vma a continuation of the stack vma above it? */ +static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr) +{ + return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN); +} + extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len); diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 878cab4..f363bc8 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -78,6 +78,14 @@ struct mutex_waiter { # include <linux/mutex-debug.h> #else # define __DEBUG_MUTEX_INITIALIZER(lockname) +/** + * mutex_init - initialize the mutex + * @mutex: the mutex to be initialized + * + * Initialize the mutex to unlocked state. + * + * It is not allowed to initialize an already locked mutex. + */ # define mutex_init(mutex) \ do { \ static struct lock_class_key __key; \ diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index 7415839..5310d27 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -26,6 +26,9 @@ struct semaphore { .wait_list = LIST_HEAD_INIT((name).wait_list), \ } +#define DEFINE_SEMAPHORE(name) \ + struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1) + #define DECLARE_MUTEX(name) \ struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1) diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 75bd9b3..20059ef 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -274,7 +274,6 @@ static int kdb_bp(int argc, const char **argv) int i, bpno; kdb_bp_t *bp, *bp_check; int diag; - int free; char *symname = NULL; long offset = 0ul; int nextarg; @@ -305,7 +304,6 @@ static int kdb_bp(int argc, const char **argv) /* * Find an empty bp structure to allocate */ - free = KDB_MAXBPT; for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { if (bp->bp_free) break; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ce66917..1decafb 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1091,11 +1091,10 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); */ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { - struct hrtimer_clock_base *base; unsigned long flags; ktime_t rem; - base = lock_hrtimer_base(timer, &flags); + lock_hrtimer_base(timer, &flags); rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); diff --git a/kernel/mutex.c b/kernel/mutex.c index 4c0b7b3..200407c 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -36,15 +36,6 @@ # include <asm/mutex.h> #endif -/*** - * mutex_init - initialize the mutex - * @lock: the mutex to be initialized - * @key: the lock_class_key for the class; used by mutex lock debugging - * - * Initialize the mutex to unlocked state. - * - * It is not allowed to initialize an already locked mutex. - */ void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { @@ -68,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init); static __used noinline void __sched __mutex_lock_slowpath(atomic_t *lock_count); -/*** +/** * mutex_lock - acquire the mutex * @lock: the mutex to be acquired * @@ -105,7 +96,7 @@ EXPORT_SYMBOL(mutex_lock); static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); -/*** +/** * mutex_unlock - release the mutex * @lock: the mutex to be released * @@ -364,8 +355,8 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count); static noinline int __sched __mutex_lock_interruptible_slowpath(atomic_t *lock_count); -/*** - * mutex_lock_interruptible - acquire the mutex, interruptable +/** + * mutex_lock_interruptible - acquire the mutex, interruptible * @lock: the mutex to be acquired * * Lock the mutex like mutex_lock(), and return 0 if the mutex has @@ -456,15 +447,15 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) return prev == 1; } -/*** - * mutex_trylock - try acquire the mutex, without waiting +/** + * mutex_trylock - try to acquire the mutex, without waiting * @lock: the mutex to be acquired * * Try to acquire the mutex atomically. Returns 1 if the mutex * has been acquired successfully, and 0 on contention. * * NOTE: this function follows the spin_trylock() convention, so - * it is negated to the down_trylock() return values! Be careful + * it is negated from the down_trylock() return values! Be careful * about this when converting semaphore users to mutexes. * * This function must not be used in interrupt context. The diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 403d180..657555a 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -402,11 +402,31 @@ static void perf_group_detach(struct perf_event *event) } } +static inline int +event_filter_match(struct perf_event *event) +{ + return event->cpu == -1 || event->cpu == smp_processor_id(); +} + static void event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { + u64 delta; + /* + * An event which could not be activated because of + * filter mismatch still needs to have its timings + * maintained, otherwise bogus information is return + * via read() for time_enabled, time_running: + */ + if (event->state == PERF_EVENT_STATE_INACTIVE + && !event_filter_match(event)) { + delta = ctx->time - event->tstamp_stopped; + event->tstamp_running += delta; + event->tstamp_stopped = ctx->time; + } + if (event->state != PERF_EVENT_STATE_ACTIVE) return; @@ -432,9 +452,7 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; - - if (group_event->state != PERF_EVENT_STATE_ACTIVE) - return; + int state = group_event->state; event_sched_out(group_event, cpuctx, ctx); @@ -444,7 +462,7 @@ group_sched_out(struct perf_event *group_event, list_for_each_entry(event, &group_event->sibling_list, group_entry) event_sched_out(event, cpuctx, ctx); - if (group_event->attr.exclusive) + if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) cpuctx->exclusive = 0; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ab661eb..134f7ed 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1313,7 +1313,7 @@ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int load_idx) { - struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + struct sched_group *idlest = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -1348,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (local_group) { this_load = avg_load; - this = group; } else if (avg_load < min_load) { min_load = avg_load; idlest = group; diff --git a/kernel/sys.c b/kernel/sys.c index e9ad444..7f5a0cd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -931,6 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) pgid = pid; if (pgid < 0) return -EINVAL; + rcu_read_lock(); /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM @@ -984,6 +985,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) out: /* All paths lead to here, thus we are safe. -DaveM */ write_unlock_irq(&tasklist_lock); + rcu_read_unlock(); return err; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ca38e8e..f88552c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1713,10 +1713,7 @@ static __init int sysctl_init(void) { sysctl_set_parent(NULL, root_table); #ifdef CONFIG_SYSCTL_SYSCALL_CHECK - { - int err; - err = sysctl_check_table(current->nsproxy, root_table); - } + sysctl_check_table(current->nsproxy, root_table); #endif return 0; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0d88ce9..7cb1f45 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -381,12 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v) { struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; + int ret = 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - static DEFINE_MUTEX(mutex); static struct trace_seq s; unsigned long long avg; unsigned long long stddev; #endif + mutex_lock(&ftrace_profile_lock); + + /* we raced with function_profile_reset() */ + if (unlikely(rec->counter == 0)) { + ret = -EBUSY; + goto out; + } kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, " %-30.30s %10lu", str, rec->counter); @@ -408,7 +415,6 @@ static int function_stat_show(struct seq_file *m, void *v) do_div(stddev, (rec->counter - 1) * 1000); } - mutex_lock(&mutex); trace_seq_init(&s); trace_print_graph_duration(rec->time, &s); trace_seq_puts(&s, " "); @@ -416,11 +422,12 @@ static int function_stat_show(struct seq_file *m, void *v) trace_seq_puts(&s, " "); trace_print_graph_duration(stddev, &s); trace_print_seq(m, &s); - mutex_unlock(&mutex); #endif seq_putc(m, '\n'); +out: + mutex_unlock(&ftrace_profile_lock); - return 0; + return ret; } static void ftrace_profile_reset(struct ftrace_profile_stat *stat) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 19cccc3..492197e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2985,13 +2985,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) static void rb_advance_iter(struct ring_buffer_iter *iter) { - struct ring_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; unsigned length; cpu_buffer = iter->cpu_buffer; - buffer = cpu_buffer->buffer; /* * Check if we are at the end of the buffer. diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 0d53c8e..7f9c3c5 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -122,7 +122,7 @@ static void __touch_watchdog(void) void touch_softlockup_watchdog(void) { - __get_cpu_var(watchdog_touch_ts) = 0; + __raw_get_cpu_var(watchdog_touch_ts) = 0; } EXPORT_SYMBOL(touch_softlockup_watchdog); @@ -142,7 +142,14 @@ void touch_all_softlockup_watchdogs(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR void touch_nmi_watchdog(void) { - __get_cpu_var(watchdog_nmi_touch) = true; + if (watchdog_enabled) { + unsigned cpu; + + for_each_present_cpu(cpu) { + if (per_cpu(watchdog_nmi_touch, cpu) != true) + per_cpu(watchdog_nmi_touch, cpu) = true; + } + } touch_softlockup_watchdog(); } EXPORT_SYMBOL(touch_nmi_watchdog); @@ -433,6 +440,9 @@ static int watchdog_enable(int cpu) wake_up_process(p); } + /* if any cpu succeeds, watchdog is considered enabled for the system */ + watchdog_enabled = 1; + return 0; } @@ -455,9 +465,6 @@ static void watchdog_disable(int cpu) per_cpu(softlockup_watchdog, cpu) = NULL; kthread_stop(p); } - - /* if any cpu succeeds, watchdog is considered enabled for the system */ - watchdog_enabled = 1; } static void watchdog_enable_all_cpus(void) @@ -135,12 +135,6 @@ void munlock_vma_page(struct page *page) } } -/* Is the vma a continuation of the stack vma above it? */ -static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr) -{ - return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN); -} - static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) { return (vma->vm_flags & VM_GROWSDOWN) && diff --git a/security/apparmor/include/resource.h b/security/apparmor/include/resource.h index 3c88be9..02baec7 100644 --- a/security/apparmor/include/resource.h +++ b/security/apparmor/include/resource.h @@ -33,8 +33,8 @@ struct aa_rlimit { }; int aa_map_resource(int resource); -int aa_task_setrlimit(struct aa_profile *profile, unsigned int resource, - struct rlimit *new_rlim); +int aa_task_setrlimit(struct aa_profile *profile, struct task_struct *, + unsigned int resource, struct rlimit *new_rlim); void __aa_transition_rlimits(struct aa_profile *old, struct aa_profile *new); diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index 6e85cdb..506d2ba 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -40,6 +40,7 @@ char *aa_split_fqname(char *fqname, char **ns_name) *ns_name = NULL; if (name[0] == ':') { char *split = strchr(&name[1], ':'); + *ns_name = skip_spaces(&name[1]); if (split) { /* overwrite ':' with \0 */ *split = 0; @@ -47,7 +48,6 @@ char *aa_split_fqname(char *fqname, char **ns_name) } else /* a ns name without a following profile is allowed */ name = NULL; - *ns_name = &name[1]; } if (name && *name == 0) name = NULL; diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index f73e2c2..cf1de44 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -614,7 +614,7 @@ static int apparmor_task_setrlimit(struct task_struct *task, int error = 0; if (!unconfined(profile)) - error = aa_task_setrlimit(profile, resource, new_rlim); + error = aa_task_setrlimit(profile, task, resource, new_rlim); return error; } diff --git a/security/apparmor/path.c b/security/apparmor/path.c index 19358dc..8239605 100644 --- a/security/apparmor/path.c +++ b/security/apparmor/path.c @@ -59,8 +59,7 @@ static int d_namespace_path(struct path *path, char *buf, int buflen, { struct path root, tmp; char *res; - int deleted, connected; - int error = 0; + int connected, error = 0; /* Get the root we want to resolve too, released below */ if (flags & PATH_CHROOT_REL) { @@ -74,19 +73,8 @@ static int d_namespace_path(struct path *path, char *buf, int buflen, } spin_lock(&dcache_lock); - /* There is a race window between path lookup here and the - * need to strip the " (deleted) string that __d_path applies - * Detect the race and relookup the path - * - * The stripping of (deleted) is a hack that could be removed - * with an updated __d_path - */ - do { - tmp = root; - deleted = d_unlinked(path->dentry); - res = __d_path(path, &tmp, buf, buflen); - - } while (deleted != d_unlinked(path->dentry)); + tmp = root; + res = __d_path(path, &tmp, buf, buflen); spin_unlock(&dcache_lock); *name = res; @@ -98,21 +86,17 @@ static int d_namespace_path(struct path *path, char *buf, int buflen, *name = buf; goto out; } - if (deleted) { - /* On some filesystems, newly allocated dentries appear to the - * security_path hooks as a deleted dentry except without an - * inode allocated. - * - * Remove the appended deleted text and return as string for - * normal mediation, or auditing. The (deleted) string is - * guaranteed to be added in this case, so just strip it. - */ - buf[buflen - 11] = 0; /* - (len(" (deleted)") +\0) */ - if (path->dentry->d_inode && !(flags & PATH_MEDIATE_DELETED)) { + /* Handle two cases: + * 1. A deleted dentry && profile is not allowing mediation of deleted + * 2. On some filesystems, newly allocated dentries appear to the + * security_path hooks as a deleted dentry except without an inode + * allocated. + */ + if (d_unlinked(path->dentry) && path->dentry->d_inode && + !(flags & PATH_MEDIATE_DELETED)) { error = -ENOENT; goto out; - } } /* Determine if the path is connected to the expected root */ diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c index 3cdc1ad..52cc865 100644 --- a/security/apparmor/policy.c +++ b/security/apparmor/policy.c @@ -1151,12 +1151,14 @@ ssize_t aa_remove_profiles(char *fqname, size_t size) /* released below */ ns = aa_get_namespace(root); - write_lock(&ns->lock); if (!name) { /* remove namespace - can only happen if fqname[0] == ':' */ + write_lock(&ns->parent->lock); __remove_namespace(ns); + write_unlock(&ns->parent->lock); } else { /* remove profile */ + write_lock(&ns->lock); profile = aa_get_profile(__lookup_profile(&ns->base, name)); if (!profile) { error = -ENOENT; @@ -1165,8 +1167,8 @@ ssize_t aa_remove_profiles(char *fqname, size_t size) } name = profile->base.hname; __remove_profile(profile); + write_unlock(&ns->lock); } - write_unlock(&ns->lock); /* don't fail removal if audit fails */ (void) audit_policy(OP_PROF_RM, GFP_KERNEL, name, info, error); diff --git a/security/apparmor/resource.c b/security/apparmor/resource.c index 4a368f1..a4136c1 100644 --- a/security/apparmor/resource.c +++ b/security/apparmor/resource.c @@ -72,6 +72,7 @@ int aa_map_resource(int resource) /** * aa_task_setrlimit - test permission to set an rlimit * @profile - profile confining the task (NOT NULL) + * @task - task the resource is being set on * @resource - the resource being set * @new_rlim - the new resource limit (NOT NULL) * @@ -79,18 +80,21 @@ int aa_map_resource(int resource) * * Returns: 0 or error code if setting resource failed */ -int aa_task_setrlimit(struct aa_profile *profile, unsigned int resource, - struct rlimit *new_rlim) +int aa_task_setrlimit(struct aa_profile *profile, struct task_struct *task, + unsigned int resource, struct rlimit *new_rlim) { int error = 0; - if (profile->rlimits.mask & (1 << resource) && - new_rlim->rlim_max > profile->rlimits.limits[resource].rlim_max) - - error = audit_resource(profile, resource, new_rlim->rlim_max, - -EACCES); + /* TODO: extend resource control to handle other (non current) + * processes. AppArmor rules currently have the implicit assumption + * that the task is setting the resource of the current process + */ + if ((task != current->group_leader) || + (profile->rlimits.mask & (1 << resource) && + new_rlim->rlim_max > profile->rlimits.limits[resource].rlim_max)) + error = -EACCES; - return error; + return audit_resource(profile, resource, new_rlim->rlim_max, error); } /** diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 16d100d3..3fbcd1d 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -35,6 +35,7 @@ enum tpm_pcrs { TPM_PCR0 = 0, TPM_PCR8 = 8 }; #define IMA_MEASURE_HTABLE_SIZE (1 << IMA_HASH_BITS) /* set during initialization */ +extern int iint_initialized; extern int ima_initialized; extern int ima_used_chip; extern char *ima_hash; diff --git a/security/integrity/ima/ima_iint.c b/security/integrity/ima/ima_iint.c index 7625b85..afba4ae 100644 --- a/security/integrity/ima/ima_iint.c +++ b/security/integrity/ima/ima_iint.c @@ -22,9 +22,10 @@ RADIX_TREE(ima_iint_store, GFP_ATOMIC); DEFINE_SPINLOCK(ima_iint_lock); - static struct kmem_cache *iint_cache __read_mostly; +int iint_initialized = 0; + /* ima_iint_find_get - return the iint associated with an inode * * ima_iint_find_get gets a reference to the iint. Caller must @@ -141,6 +142,7 @@ static int __init ima_iintcache_init(void) iint_cache = kmem_cache_create("iint_cache", sizeof(struct ima_iint_cache), 0, SLAB_PANIC, init_once); + iint_initialized = 1; return 0; } security_initcall(ima_iintcache_init); diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index f936413..e662b89 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -148,12 +148,14 @@ void ima_counts_get(struct file *file) struct ima_iint_cache *iint; int rc; - if (!ima_initialized || !S_ISREG(inode->i_mode)) + if (!iint_initialized || !S_ISREG(inode->i_mode)) return; iint = ima_iint_find_get(inode); if (!iint) return; mutex_lock(&iint->mutex); + if (!ima_initialized) + goto out; rc = ima_must_measure(iint, inode, MAY_READ, FILE_CHECK); if (rc < 0) goto out; @@ -213,7 +215,7 @@ void ima_file_free(struct file *file) struct inode *inode = file->f_dentry->d_inode; struct ima_iint_cache *iint; - if (!ima_initialized || !S_ISREG(inode->i_mode)) + if (!iint_initialized || !S_ISREG(inode->i_mode)) return; iint = ima_iint_find_get(inode); if (!iint) @@ -230,7 +232,7 @@ static int process_measurement(struct file *file, const unsigned char *filename, { struct inode *inode = file->f_dentry->d_inode; struct ima_iint_cache *iint; - int rc; + int rc = 0; if (!ima_initialized || !S_ISREG(inode->i_mode)) return 0; diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 624a96c..6de4313 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -50,6 +50,7 @@ static inline void callchain_init(struct callchain_node *node) INIT_LIST_HEAD(&node->children); INIT_LIST_HEAD(&node->val); + node->children_hit = 0; node->parent = NULL; node->hit = 0; } |