From 3d9a854c2dac3e888393b23ba7adafcce4d6d4b9 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 20 Feb 2010 01:03:43 +0100 Subject: Rename .data[.percpu][.XXX] to .data[..percpu][..XXX]. Signed-off-by: Denys Vlasenko Signed-off-by: Michal Marek --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f82386b..5daf0ab 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -397,7 +397,7 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const char *secstrings) { - return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); + return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); } static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) -- cgit v1.1 From 54e88fad223c4e1d94289611a90c7fe3ebe5631b Mon Sep 17 00:00:00 2001 From: "Amit K. Arora" Date: Tue, 25 May 2010 18:53:46 +0530 Subject: sched: Make sure timers have migrated before killing the migration_thread Problem: In a stress test where some heavy tests were running along with regular CPU offlining and onlining, a hang was observed. The system seems to be hung at a point where migration_call() tries to kill the migration_thread of the dying CPU, which just got moved to the current CPU. This migration thread does not get a chance to run (and die) since rt_throttled is set to 1 on current, and it doesn't get cleared as the hrtimer which is supposed to reset the rt bandwidth (sched_rt_period_timer) is tied to the CPU which we just marked dead! Solution: This patch pushes the killing of migration thread to "CPU_POST_DEAD" event. By then all the timers (including sched_rt_period_timer) should have got migrated (along with other callbacks). Signed-off-by: Amit Arora Signed-off-by: Gautham R Shenoy Acked-by: Tejun Heo Signed-off-by: Peter Zijlstra Cc: Thomas Gleixner LKML-Reference: <20100525132346.GA14986@amitarora.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b4e7431..70f8d90 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -321,7 +321,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: - case CPU_DEAD: + case CPU_POST_DEAD: { struct cpu_stop_work *work; -- cgit v1.1 From ac9721f3f54b27a16c7e1afb2481e7ee95a70318 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 May 2010 12:54:41 +0200 Subject: perf_events: Fix races and clean up perf_event and perf_mmap_data interaction In order to move toward separate buffer objects, rework the whole perf_mmap_data construct to be a more self-sufficient entity, one with its own lifetime rules. This greatly sanitizes the whole output redirection code, which was riddled with bugs and races. 
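The rework boils down to the standard RCU-plus-refcount idiom: a reader may only pin the buffer while its reference count is still non-zero, and the final put defers the actual free until all RCU readers are done. A condensed sketch of the two helpers the patch introduces (simplified from the hunks below; the callers' locking context and error handling are omitted):

static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event)
{
        struct perf_mmap_data *data;

        rcu_read_lock();
        data = rcu_dereference(event->data);
        /* Only take a reference if the last one has not already been dropped. */
        if (data && !atomic_inc_not_zero(&data->refcount))
                data = NULL;
        rcu_read_unlock();

        return data;
}

static void perf_mmap_data_put(struct perf_mmap_data *data)
{
        if (!atomic_dec_and_test(&data->refcount))
                return;
        /* Last reference gone: free once concurrent RCU readers have finished. */
        call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
}

With that in place, mmap() takes a reference on an existing buffer, perf_mmap_close() drops it, and PERF_EVENT_IOC_SET_OUTPUT redirects output by grabbing a reference on the target event's buffer instead of pinning the target's struct file.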
Signed-off-by: Peter Zijlstra Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 224 +++++++++++++++++++++++++++++----------------------- 1 file changed, 126 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index bd7ce8c..848d49a 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1841,6 +1841,7 @@ static void free_event_rcu(struct rcu_head *head) } static void perf_pending_sync(struct perf_event *event); +static void perf_mmap_data_put(struct perf_mmap_data *data); static void free_event(struct perf_event *event) { @@ -1856,9 +1857,9 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_task_events); } - if (event->output) { - fput(event->output->filp); - event->output = NULL; + if (event->data) { + perf_mmap_data_put(event->data); + event->data = NULL; } if (event->destroy) @@ -2175,7 +2176,27 @@ unlock: return ret; } -static int perf_event_set_output(struct perf_event *event, int output_fd); +static const struct file_operations perf_fops; + +static struct perf_event *perf_fget_light(int fd, int *fput_needed) +{ + struct file *file; + + file = fget_light(fd, fput_needed); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &perf_fops) { + fput_light(file, *fput_needed); + *fput_needed = 0; + return ERR_PTR(-EBADF); + } + + return file->private_data; +} + +static int perf_event_set_output(struct perf_event *event, + struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -2202,7 +2223,23 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return perf_event_period(event, (u64 __user *)arg); case PERF_EVENT_IOC_SET_OUTPUT: - return perf_event_set_output(event, arg); + { + struct perf_event *output_event = NULL; + int fput_needed = 0; + int ret; + + if (arg != -1) { + output_event = perf_fget_light(arg, &fput_needed); + if (IS_ERR(output_event)) + return PTR_ERR(output_event); + } + + ret = perf_event_set_output(event, output_event); + if (output_event) + fput_light(output_event->filp, fput_needed); + + return ret; + } case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); @@ -2335,8 +2372,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) unsigned long size; int i; - WARN_ON(atomic_read(&event->mmap_count)); - size = sizeof(struct perf_mmap_data); size += nr_pages * sizeof(void *); @@ -2452,8 +2487,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) unsigned long size; void *all_buf; - WARN_ON(atomic_read(&event->mmap_count)); - size = sizeof(struct perf_mmap_data); size += sizeof(void *); @@ -2536,7 +2569,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) if (!data->watermark) data->watermark = max_size / 2; - + atomic_set(&data->refcount, 1); rcu_assign_pointer(event->data, data); } @@ -2548,13 +2581,26 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) perf_mmap_data_free(data); } -static void perf_mmap_data_release(struct perf_event *event) +static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) { - struct perf_mmap_data *data = event->data; + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (data) { + if (!atomic_inc_not_zero(&data->refcount)) + data = NULL; + } + rcu_read_unlock(); + + return data; +} - 
WARN_ON(atomic_read(&event->mmap_count)); +static void perf_mmap_data_put(struct perf_mmap_data *data) +{ + if (!atomic_dec_and_test(&data->refcount)) + return; - rcu_assign_pointer(event->data, NULL); call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); } @@ -2569,15 +2615,18 @@ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - WARN_ON_ONCE(event->ctx->parent_ctx); if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { unsigned long size = perf_data_size(event->data); - struct user_struct *user = current_user(); + struct user_struct *user = event->mmap_user; + struct perf_mmap_data *data = event->data; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= event->data->nr_locked; - perf_mmap_data_release(event); + vma->vm_mm->locked_vm -= event->mmap_locked; + rcu_assign_pointer(event->data, NULL); mutex_unlock(&event->mmap_mutex); + + perf_mmap_data_put(data); + free_uid(user); } } @@ -2629,13 +2678,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); - if (event->output) { - ret = -EINVAL; - goto unlock; - } - - if (atomic_inc_not_zero(&event->mmap_count)) { - if (nr_pages != event->data->nr_pages) + if (event->data) { + if (event->data->nr_pages == nr_pages) + atomic_inc(&event->data->refcount); + else ret = -EINVAL; goto unlock; } @@ -2667,21 +2713,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON(event->data); data = perf_mmap_data_alloc(event, nr_pages); - ret = -ENOMEM; - if (!data) + if (!data) { + ret = -ENOMEM; goto unlock; + } - ret = 0; perf_mmap_data_init(event, data); - - atomic_set(&event->mmap_count, 1); - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->locked_vm += extra; - event->data->nr_locked = extra; if (vma->vm_flags & VM_WRITE) event->data->writable = 1; + atomic_long_add(user_extra, &user->locked_vm); + event->mmap_locked = extra; + event->mmap_user = get_current_user(); + vma->vm_mm->locked_vm += event->mmap_locked; + unlock: + if (!ret) + atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); vma->vm_flags |= VM_RESERVED; @@ -2993,7 +3041,6 @@ int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, int nmi, int sample) { - struct perf_event *output_event; struct perf_mmap_data *data; unsigned long tail, offset, head; int have_lost; @@ -3010,10 +3057,6 @@ int perf_output_begin(struct perf_output_handle *handle, if (event->parent) event = event->parent; - output_event = rcu_dereference(event->output); - if (output_event) - event = output_event; - data = rcu_dereference(event->data); if (!data) goto out; @@ -4912,39 +4955,17 @@ err_size: goto out; } -static int perf_event_set_output(struct perf_event *event, int output_fd) +static int +perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct perf_event *output_event = NULL; - struct file *output_file = NULL; - struct perf_event *old_output; - int fput_needed = 0; + struct perf_mmap_data *data = NULL, *old_data = NULL; int ret = -EINVAL; - /* - * Don't allow output of inherited per-task events. This would - * create performance issues due to cross cpu access. 
- */ - if (event->cpu == -1 && event->attr.inherit) - return -EINVAL; - - if (!output_fd) + if (!output_event) goto set; - output_file = fget_light(output_fd, &fput_needed); - if (!output_file) - return -EBADF; - - if (output_file->f_op != &perf_fops) - goto out; - - output_event = output_file->private_data; - - /* Don't chain output fds */ - if (output_event->output) - goto out; - - /* Don't set an output fd when we already have an output channel */ - if (event->data) + /* don't allow circular references */ + if (event == output_event) goto out; /* @@ -4959,26 +4980,28 @@ static int perf_event_set_output(struct perf_event *event, int output_fd) if (output_event->cpu == -1 && output_event->ctx != event->ctx) goto out; - atomic_long_inc(&output_file->f_count); - set: mutex_lock(&event->mmap_mutex); - old_output = event->output; - rcu_assign_pointer(event->output, output_event); - mutex_unlock(&event->mmap_mutex); + /* Can't redirect output if we've got an active mmap() */ + if (atomic_read(&event->mmap_count)) + goto unlock; - if (old_output) { - /* - * we need to make sure no existing perf_output_*() - * is still referencing this event. - */ - synchronize_rcu(); - fput(old_output->filp); + if (output_event) { + /* get the buffer we want to redirect to */ + data = perf_mmap_data_get(output_event); + if (!data) + goto unlock; } + old_data = event->data; + rcu_assign_pointer(event->data, data); ret = 0; +unlock: + mutex_unlock(&event->mmap_mutex); + + if (old_data) + perf_mmap_data_put(old_data); out: - fput_light(output_file, fput_needed); return ret; } @@ -4994,7 +5017,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { - struct perf_event *event, *group_leader; + struct perf_event *event, *group_leader = NULL, *output_event = NULL; struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; @@ -5034,19 +5057,25 @@ SYSCALL_DEFINE5(perf_event_open, goto err_fd; } + if (group_fd != -1) { + group_leader = perf_fget_light(group_fd, &fput_needed); + if (IS_ERR(group_leader)) { + err = PTR_ERR(group_leader); + goto err_put_context; + } + group_file = group_leader->filp; + if (flags & PERF_FLAG_FD_OUTPUT) + output_event = group_leader; + if (flags & PERF_FLAG_FD_NO_GROUP) + group_leader = NULL; + } + /* * Look up the group leader (we will attach this event to it): */ - group_leader = NULL; - if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { + if (group_leader) { err = -EINVAL; - group_file = fget_light(group_fd, &fput_needed); - if (!group_file) - goto err_put_context; - if (group_file->f_op != &perf_fops) - goto err_put_context; - group_leader = group_file->private_data; /* * Do not allow a recursive hierarchy (this new sibling * becoming part of another group-sibling): @@ -5068,9 +5097,16 @@ SYSCALL_DEFINE5(perf_event_open, event = perf_event_alloc(&attr, cpu, ctx, group_leader, NULL, NULL, GFP_KERNEL); - err = PTR_ERR(event); - if (IS_ERR(event)) + if (IS_ERR(event)) { + err = PTR_ERR(event); goto err_put_context; + } + + if (output_event) { + err = perf_event_set_output(event, output_event); + if (err) + goto err_free_put_context; + } event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); if (IS_ERR(event_file)) { @@ -5078,12 +5114,6 @@ SYSCALL_DEFINE5(perf_event_open, goto err_free_put_context; } - if (flags & PERF_FLAG_FD_OUTPUT) { - err = perf_event_set_output(event, group_fd); - if (err) - goto err_fput_free_put_context; - } - event->filp 
= event_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); @@ -5101,8 +5131,6 @@ SYSCALL_DEFINE5(perf_event_open, fd_install(event_fd, event_file); return event_fd; -err_fput_free_put_context: - fput(event_file); err_free_put_context: free_event(event); err_put_context: -- cgit v1.1 From 8a49542c0554af7d0073aac0ee73ee65b807ef34 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 May 2010 15:47:49 +0200 Subject: perf_events: Fix races in group composition Group siblings don't pin each-other or the parent, so when we destroy events we must make sure to clean up all cross referencing pointers. In particular, for destruction of a group leader we must be able to find all its siblings and remove their reference to it. This means that detaching an event from its context must not detach it from the group, otherwise we can end up failing to clear all pointers. Solve this by clearly separating the attachment to a context and attachment to a group, and keep the group composed until we destroy the events. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 91 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 848d49a..10a1aee 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -283,14 +283,15 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_event *group_leader = event->group_leader; + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); + event->attach_state |= PERF_ATTACH_CONTEXT; /* - * Depending on whether it is a standalone or sibling event, - * add it straight to the context's event list, or to the group - * leader's sibling list: + * If we're a stand alone event or group leader, we go to the context + * list, group events are kept attached to the group so that + * perf_group_detach can, at all times, locate all siblings. */ - if (group_leader == event) { + if (event->group_leader == event) { struct list_head *list; if (is_software_event(event)) @@ -298,13 +299,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list = ctx_group_list(event, ctx); list_add_tail(&event->group_entry, list); - } else { - if (group_leader->group_flags & PERF_GROUP_SOFTWARE && - !is_software_event(event)) - group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; - - list_add_tail(&event->group_entry, &group_leader->sibling_list); - group_leader->nr_siblings++; } list_add_rcu(&event->event_entry, &ctx->event_list); @@ -313,6 +307,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_stat++; } +static void perf_group_attach(struct perf_event *event) +{ + struct perf_event *group_leader = event->group_leader; + + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); + event->attach_state |= PERF_ATTACH_GROUP; + + if (group_leader == event) + return; + + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && + !is_software_event(event)) + group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + + list_add_tail(&event->group_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; +} + /* * Remove a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. 
@@ -320,17 +332,22 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - if (list_empty(&event->group_entry)) + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_CONTEXT)) return; + + event->attach_state &= ~PERF_ATTACH_CONTEXT; + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; - list_del_init(&event->group_entry); list_del_rcu(&event->event_entry); - if (event->group_leader != event) - event->group_leader->nr_siblings--; + if (event->group_leader == event) + list_del_init(&event->group_entry); update_group_times(event); @@ -345,21 +362,39 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->state = PERF_EVENT_STATE_OFF; } -static void -perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx) +static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; + struct list_head *list = NULL; + + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_GROUP)) + return; + + event->attach_state &= ~PERF_ATTACH_GROUP; + + /* + * If this is a sibling, remove it from its group. + */ + if (event->group_leader != event) { + list_del_init(&event->group_entry); + event->group_leader->nr_siblings--; + return; + } + + if (!list_empty(&event->group_entry)) + list = &event->group_entry; /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them - * to the context list directly: + * to whatever list we are on. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - struct list_head *list; - - list = ctx_group_list(event, ctx); - list_move_tail(&sibling->group_entry, list); + if (list) + list_move_tail(&sibling->group_entry, list); sibling->group_leader = sibling; /* Inherit group flags from the previous leader */ @@ -727,6 +762,7 @@ static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { list_add_event(event, ctx); + perf_group_attach(event); event->tstamp_enabled = ctx->time; event->tstamp_running = ctx->time; event->tstamp_stopped = ctx->time; @@ -1894,8 +1930,8 @@ int perf_event_release_kernel(struct perf_event *event) */ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); raw_spin_lock_irq(&ctx->lock); + perf_group_detach(event); list_del_event(event, ctx); - perf_destroy_group(event, ctx); raw_spin_unlock_irq(&ctx->lock); mutex_unlock(&ctx->mutex); @@ -5127,6 +5163,12 @@ SYSCALL_DEFINE5(perf_event_open, list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); + /* + * Drop the reference on the group_event after placing the + * new event on the sibling_list. This ensures destruction + * of the group leader will find the pointer to itself in + * perf_group_detach(). + */ fput_light(group_file, fput_needed); fd_install(event_fd, event_file); return event_fd; @@ -5448,6 +5490,7 @@ static void perf_free_event(struct perf_event *event, fput(parent->filp); + perf_group_detach(event); list_del_event(event, ctx); free_event(event); } -- cgit v1.1 From 3771f0771154675d4a0ca780be2411f3cc357208 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 12:31:09 +0200 Subject: perf_events, trace: Fix probe unregister race tracepoint_probe_unregister() does not synchronize against the probe callbacks, so do that explicitly. 
This properly serializes the callbacks and the free of the data used therein. Also, use this_cpu_ptr() where possible. Acked-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1274438476.1674.1702.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 10 ++++++++-- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_syscalls.c | 4 ++-- 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index cb6f365..49c7abf 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -116,7 +116,7 @@ int perf_trace_enable(struct perf_event *p_event) if (WARN_ON_ONCE(!list)) return -EINVAL; - list = per_cpu_ptr(list, smp_processor_id()); + list = this_cpu_ptr(list); hlist_add_head_rcu(&p_event->hlist_entry, list); return 0; @@ -142,6 +142,12 @@ void perf_trace_destroy(struct perf_event *p_event) tp_event->class->perf_probe, tp_event); + /* + * Ensure our callback won't be called anymore. See + * tracepoint_probe_unregister() and __DO_TRACE(). + */ + synchronize_sched(); + free_percpu(tp_event->perf_events); tp_event->perf_events = NULL; @@ -169,7 +175,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, if (*rctxp < 0) return NULL; - raw_data = per_cpu_ptr(perf_trace_buf[*rctxp], smp_processor_id()); + raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); /* zero the dead bytes from align to not leak stack to user */ memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index faf7cef..f52b5f5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1359,7 +1359,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, for (i = 0; i < tp->nr_args; i++) call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); - head = per_cpu_ptr(call->perf_events, smp_processor_id()); + head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); } @@ -1392,7 +1392,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, for (i = 0; i < tp->nr_args; i++) call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); - head = per_cpu_ptr(call->perf_events, smp_processor_id()); + head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index d2c859c..34e3580 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -519,7 +519,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - head = per_cpu_ptr(sys_data->enter_event->perf_events, smp_processor_id()); + head = this_cpu_ptr(sys_data->enter_event->perf_events); perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); } @@ -595,7 +595,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - head = per_cpu_ptr(sys_data->exit_event->perf_events, smp_processor_id()); + head = this_cpu_ptr(sys_data->exit_event->perf_events); perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); } -- cgit v1.1 From 2e97942fe57864588774f173cf4cd7bb68968b76 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 16:22:33 +0200 Subject: perf_events, trace: 
Fix perf_trace_destroy(), mutex went missing Steve spotted I forgot to do the destroy under event_mutex. Reported-by: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: <1274451913.1674.1707.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 49c7abf..e6f6588 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -132,8 +132,9 @@ void perf_trace_destroy(struct perf_event *p_event) struct ftrace_event_call *tp_event = p_event->tp_event; int i; + mutex_lock(&event_mutex); if (--tp_event->perf_refcount > 0) - return; + goto out; if (tp_event->class->reg) tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); @@ -157,6 +158,8 @@ void perf_trace_destroy(struct perf_event *p_event) perf_trace_buf[i] = NULL; } } +out: + mutex_unlock(&event_mutex); } __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, -- cgit v1.1 From 90151c35b19633e0cab5a6c80f1ba4a51e7c913b Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 25 May 2010 16:23:10 +0200 Subject: perf_events: Fix event scheduling issues introduced by transactional API The transactional API patch between the generic and model-specific code introduced several important bugs with event scheduling, at least on X86. If you had pinned events, e.g., watchdog, and were over-committing the PMU, you would get bogus counts. The bug was showing up on Intel CPU because events would move around more often that on AMD. But the problem also existed on AMD, though harder to expose. The issues were: - group_sched_in() was missing a cancel_txn() in the error path - cpuc->n_added was not properly maintained, leading to missing actions in hw_perf_enable(), i.e., n_running being 0. You cannot update n_added until you know the transaction has succeeded. In case of failed transaction n_added was not adjusted back. - in case of failed transactions, event_sched_out() was called and eventually invoked x86_disable_event() to touch the HW reg. But with transactions, on X86, event_sched_in() does not touch HW registers, it simply collects events into a list. Thus, you could end up calling x86_disable_event() on a counter which did not correspond to the current event when idx != -1. The patch modifies the generic and X86 code to avoid all those problems. First, we keep track of the number of events added last. In case the transaction fails, we substract them from n_added. This approach is necessary (as opposed to delaying updates to n_added) because not all event updates use the transaction API, e.g., single events. Second, we encapsulate the event_sched_in() and event_sched_out() in group_sched_in() inside the transaction. That makes the operations symmetrical and you can also detect that you are inside a transaction and skip the HW reg access by checking cpuc->group_flag. With this patch, you can now overcommit the PMU even with pinned system-wide events present and still get valid counts. 
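Concretely, the generic-code part of the fix makes the transaction bracket the whole group: the leader's failure path now cancels the transaction before returning, and the sibling-failure path first undoes the partial group and only then cancels, so event_sched_out() still runs inside the open transaction (which is what lets the x86 code detect the transaction and skip touching hardware registers). A condensed sketch of the resulting group_sched_in() flow, paraphrasing the hunks below with the sibling loop elided:

        if (txn)
                pmu->start_txn(pmu);

        if (event_sched_in(group_event, cpuctx, ctx)) {
                if (txn)
                        pmu->cancel_txn(pmu);   /* leader failed: abort right away */
                return -EAGAIN;
        }

        /* ... schedule in each sibling, jumping to group_error on failure ... */

group_error:
        /* Undo the partial group while the transaction is still pending ... */
        event_sched_out(group_event, cpuctx, ctx);
        /* ... and only then abort it. */
        if (txn)
                pmu->cancel_txn(pmu);
        return -EAGAIN;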
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1274796225.5882.1389.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 10a1aee..42a0e91 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -687,8 +687,11 @@ group_sched_in(struct perf_event *group_event, if (txn) pmu->start_txn(pmu); - if (event_sched_in(group_event, cpuctx, ctx)) + if (event_sched_in(group_event, cpuctx, ctx)) { + if (txn) + pmu->cancel_txn(pmu); return -EAGAIN; + } /* * Schedule in siblings as one group (if any): @@ -710,9 +713,6 @@ group_sched_in(struct perf_event *group_event, } group_error: - if (txn) - pmu->cancel_txn(pmu); - /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: @@ -724,6 +724,9 @@ group_error: } event_sched_out(group_event, cpuctx, ctx); + if (txn) + pmu->cancel_txn(pmu); + return -EAGAIN; } -- cgit v1.1 From 74048f895fa8cbf8119b4999f1f44881a825f954 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 27 May 2010 21:34:58 +0200 Subject: perf_events: Fix unincremented buffer base on partial copy If a sample size crosses to the next page boundary, the copy will be made in more than one step. However we forget to advance the source offset for the next copy, leading to unexpected double copies that completely mess up the traces. This fixes various kinds of bad traces that have irrelevant data inside, as an example: geany-4979 [001] 5758.077775: sched_switch: prev_comm=! prev_pid=121 prev_prio=0 prev_state=S|D|Z|X|x ==> next_comm= next_pid=7497072 next_prio=0 Signed-off-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Signed-off-by: Peter Zijlstra LKML-Reference: <1274988898-5639-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 42a0e91..858f56fa 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3064,6 +3064,7 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, len -= size; handle->addr += size; + buf += size; handle->size -= size; if (!handle->size) { struct perf_mmap_data *data = handle->data; -- cgit v1.1 From 546cf44a1b507c1cbb5cf42bbe6169780567f36f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 29 May 2010 11:45:07 -0700 Subject: blktrace: Fix new kernel-doc warnings Fix blktrace.c kernel-doc warnings: Warning(kernel/trace/blktrace.c:858): No description found for parameter 'ignore' Warning(kernel/trace/blktrace.c:890): No description found for parameter 'ignore' Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20100529114507.c466fc1e.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 36ea2b6..638711c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -842,6 +842,7 @@ static void blk_add_trace_split(void *ignore, /** * blk_add_trace_remap - Add a trace for a remap operation + * @ignore: trace callback data parameter (not used) * @q: queue the io is for * @bio: the source bio * @dev: target device @@ -873,6 +874,7 @@ static void blk_add_trace_remap(void *ignore, /** * blk_add_trace_rq_remap - Add a 
trace for a request-remap operation + * @ignore: trace callback data parameter (not used) * @q: queue the io is for * @rq: the source request * @dev: target device -- cgit v1.1 From 293a7cfeedc2b2380a7c7274902323c3cf5f7575 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 31 May 2010 19:53:50 +0930 Subject: module: fix reference to mod->percpu after freeing module. Rafael sees a sometimes crash at precpu_modfree from kernel/module.c; it only occurred with another (since-reverted) patch, but that patch simply changed timing to uncover this bug, it was otherwise unrelated. The comment about the mod being freed is self-explanatory, but neither Tejun nor I read it. This bug was introduced in 259354deaa, after it had previously been fixed in 6e2b75740b. How embarrassing. Reported-by: "Rafael J. Wysocki" Signed-off-by: Rusty Russell Embarrassingly-Acked-by: Tejun Heo Cc: Masami Hiramatsu Tested-by: "Rafael J. Wysocki" Signed-off-by: Linus Torvalds --- kernel/module.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 333fbcc..d806e00 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2014,6 +2014,7 @@ static noinline struct module *load_module(void __user *umod, long err = 0; void *ptr = NULL; /* Stops spurious gcc warning */ unsigned long symoffs, stroffs, *strmap; + void __percpu *percpu; mm_segment_t old_fs; @@ -2158,6 +2159,8 @@ static noinline struct module *load_module(void __user *umod, goto free_mod; sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; } + /* Keep this around for failure path. */ + percpu = mod_percpu(mod); /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any @@ -2463,7 +2466,7 @@ static noinline struct module *load_module(void __user *umod, module_free(mod, mod->module_core); /* mod will be freed with core. Don't access it beyond this line! */ free_percpu: - percpu_modfree(mod); + free_percpu(percpu); free_mod: kfree(args); kfree(strmap); -- cgit v1.1 From e51fd5e22e12b39f49b1bb60b37b300b17378a43 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 31 May 2010 12:37:30 +0200 Subject: sched: Fix wake_affine() vs RT tasks Mike reports that since e9e9250b (sched: Scale down cpu_power due to RT tasks), wake_affine() goes funny on RT tasks due to them still having a !0 weight and wake_affine() still subtracts that from the rq weight. Since nobody should be using se->weight for RT tasks, set the value to zero. Also, since we now use ->cpu_power to normalize rq weights to account for RT cpu usage, add that factor into the imbalance computation. 
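Put as a single inequality, the wake_affine() test changes from comparing percentages of raw rq weights to a cross-multiplied comparison in which each side is scaled by the other CPU's remaining capacity (this is just a restatement of the hunk below):

        100 * power_of(prev_cpu) * (this_load + effective_load(tg, this_cpu, weight, weight))
                <= (100 + (imbalance_pct - 100) / 2) * power_of(this_cpu)
                        * (load + effective_load(tg, prev_cpu, 0, weight))

So a CPU whose cpu_power has been scaled down by RT activity becomes proportionally less attractive as an affine wakeup target, while RT tasks themselves no longer add se->load.weight to either side of the comparison.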
Reported-by: Mike Galbraith Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1275316109.27810.22969.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 24 ++++++------------------ kernel/sched_fair.c | 22 ++++++++++++++++------ 2 files changed, 22 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d484081..f8b8996 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -544,6 +544,8 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; + unsigned long cpu_power; + unsigned char idle_at_tick; /* For active balancing */ int post_schedule; @@ -1499,24 +1501,9 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static struct sched_group *group_of(int cpu) -{ - struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd); - - if (!sd) - return NULL; - - return sd->groups; -} - static unsigned long power_of(int cpu) { - struct sched_group *group = group_of(cpu); - - if (!group) - return SCHED_LOAD_SCALE; - - return group->cpu_power; + return cpu_rq(cpu)->cpu_power; } static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); @@ -1854,8 +1841,8 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { if (task_has_rt_policy(p)) { - p->se.load.weight = prio_to_weight[0] * 2; - p->se.load.inv_weight = prio_to_wmult[0] >> 1; + p->se.load.weight = 0; + p->se.load.inv_weight = WMULT_CONST; return; } @@ -7605,6 +7592,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; + rq->cpu_power = SCHED_LOAD_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 217e4a9..eed35ed 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1225,7 +1225,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) unsigned long this_load, load; int idx, this_cpu, prev_cpu; unsigned long tl_per_task; - unsigned int imbalance; struct task_group *tg; unsigned long weight; int balanced; @@ -1252,8 +1251,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) tg = task_group(p); weight = p->se.load.weight; - imbalance = 100 + (sd->imbalance_pct - 100) / 2; - /* * In low-load situations, where prev_cpu is idle and this_cpu is idle * due to the sync cause above having dropped this_load to 0, we'll @@ -1263,9 +1260,21 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. 
*/ - balanced = !this_load || - 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= - imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); + if (this_load) { + unsigned long this_eff_load, prev_eff_load; + + this_eff_load = 100; + this_eff_load *= power_of(prev_cpu); + this_eff_load *= this_load + + effective_load(tg, this_cpu, weight, weight); + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= power_of(this_cpu); + prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); + + balanced = this_eff_load <= prev_eff_load; + } else + balanced = true; /* * If the currently running task will sleep within @@ -2298,6 +2307,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) if (!power) power = 1; + cpu_rq(cpu)->cpu_power = power; sdg->cpu_power = power; } -- cgit v1.1 From 5c113fbeed7a5a192d8431a768965f8a45c16475 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Tue, 1 Jun 2010 12:15:11 +0100 Subject: fix cpu_chain section mismatch... In commit e9fb7631ebcd ("cpu-hotplug: introduce cpu_notify(), __cpu_notify(), cpu_notify_nofail()") the new helper functions access cpu_chain. As a result, it shouldn't be marked __cpuinitdata (via section mismatch warning). Alternatively, the helper functions should be forced inline, or marked __ref or __cpuinit. In the meantime, this patch silences the warning the trivial way. Signed-off-by: Daniel J Blueman Signed-off-by: Linus Torvalds --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 8b92539..97d1b42 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -34,7 +34,7 @@ void cpu_maps_update_done(void) mutex_unlock(&cpu_add_remove_lock); } -static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); +static RAW_NOTIFIER_HEAD(cpu_chain); /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. * Should always be manipulated under cpu_add_remove_lock -- cgit v1.1 From ff9da691c0498ff81fdd014e7a0731dab2337dac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Jun 2010 14:54:39 +0200 Subject: pipe: change /proc/sys/fs/pipe-max-pages to byte sized interface This changes the interface to be based on bytes instead. The API matches that of F_SETPIPE_SZ in that it rounds up the passed in size so that the resulting page array is a power-of-2 in size. The proc file is renamed to /proc/sys/fs/pipe-max-size to reflect this change. Signed-off-by: Jens Axboe --- kernel/sysctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 997080f..d24f761f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1471,12 +1471,12 @@ static struct ctl_table fs_table[] = { }, #endif { - .procname = "pipe-max-pages", - .data = &pipe_max_pages, + .procname = "pipe-max-size", + .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &two, + .proc_handler = &pipe_proc_fn, + .extra1 = &pipe_min_size, }, /* * NOTE: do not add new entries to this table unless you have read -- cgit v1.1 From c6df8d5ab87a246942d138321e1721edbb69f6e1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Jun 2010 11:21:20 +0200 Subject: perf: Fix crash in swevents Frederic reported that because swevents handling doesn't disable IRQs anymore, we can get a recursion of perf_adjust_period(), once from overflow handling and once from the tick. 
If both call ->disable, we get a double hlist_del_rcu() and trigger a LIST_POISON2 dereference. Since we don't actually need to stop/start a swevent to re-programm the hardware (lack of hardware to program), simply nop out these callbacks for the swevent pmu. Reported-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1275557609.27810.35218.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 858f56fa..31d6afe 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4055,13 +4055,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, } } -static void perf_swevent_unthrottle(struct perf_event *event) -{ - /* - * Nothing to do, we already reset hwc->interrupts. - */ -} - static void perf_swevent_add(struct perf_event *event, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) @@ -4276,11 +4269,22 @@ static void perf_swevent_disable(struct perf_event *event) hlist_del_rcu(&event->hlist_entry); } +static void perf_swevent_void(struct perf_event *event) +{ +} + +static int perf_swevent_int(struct perf_event *event) +{ + return 0; +} + static const struct pmu perf_ops_generic = { .enable = perf_swevent_enable, .disable = perf_swevent_disable, + .start = perf_swevent_int, + .stop = perf_swevent_void, .read = perf_swevent_read, - .unthrottle = perf_swevent_unthrottle, + .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ }; /* @@ -4561,8 +4565,10 @@ static int swevent_hlist_get(struct perf_event *event) static const struct pmu perf_ops_tracepoint = { .enable = perf_trace_enable, .disable = perf_trace_disable, + .start = perf_swevent_int, + .stop = perf_swevent_void, .read = perf_swevent_read, - .unthrottle = perf_swevent_unthrottle, + .unthrottle = perf_swevent_void, }; static int perf_tp_filter_match(struct perf_event *event, -- cgit v1.1 From 485d527686850d68a0e9006dd9904f19f122485e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 4 Jun 2010 14:14:58 -0700 Subject: sys_personality: change sys_personality() to accept "unsigned int" instead of u_long task_struct->pesonality is "unsigned int", but sys_personality() paths use "unsigned long pesonality". This means that every assignment or comparison is not right. In particular, if this argument does not fit into "unsigned int" __set_personality() changes the caller's personality and then sys_personality() returns -EINVAL. Turn this argument into "unsigned int" and avoid overflows. Obviously, this is the user-visible change, we just ignore the upper bits. But this can't break the sane application. There is another thing which can confuse the poorly written applications. User-space thinks that this syscall returns int, not long. This means that the returned value can be negative and look like the error code. But note that libc won't be confused and thus errno won't be set, and with this patch the user-space can never get -1 unless sys_personality() really fails. And, most importantly, the negative RET != -1 is only possible if that app previously called personality(RET). Pointed-out-by: Wenming Zhang Suggested-by: Linus Torvalds Signed-off-by: Oleg Nesterov Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exec_domain.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index c35452c..dd62f8e 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -27,7 +27,7 @@ static struct exec_domain *exec_domains = &default_exec_domain; static DEFINE_RWLOCK(exec_domains_lock); -static u_long ident_map[32] = { +static unsigned long ident_map[32] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, @@ -56,10 +56,10 @@ default_handler(int segment, struct pt_regs *regp) } static struct exec_domain * -lookup_exec_domain(u_long personality) +lookup_exec_domain(unsigned int personality) { - struct exec_domain * ep; - u_long pers = personality(personality); + unsigned int pers = personality(personality); + struct exec_domain *ep; read_lock(&exec_domains_lock); for (ep = exec_domains; ep; ep = ep->next) { @@ -70,7 +70,7 @@ lookup_exec_domain(u_long personality) #ifdef CONFIG_MODULES read_unlock(&exec_domains_lock); - request_module("personality-%ld", pers); + request_module("personality-%d", pers); read_lock(&exec_domains_lock); for (ep = exec_domains; ep; ep = ep->next) { @@ -135,7 +135,7 @@ unregister: } int -__set_personality(u_long personality) +__set_personality(unsigned int personality) { struct exec_domain *ep, *oep; @@ -188,9 +188,9 @@ static int __init proc_execdomains_init(void) module_init(proc_execdomains_init); #endif -SYSCALL_DEFINE1(personality, u_long, personality) +SYSCALL_DEFINE1(personality, unsigned int, personality) { - u_long old = current->personality; + unsigned int old = current->personality; if (personality != 0xffffffff) { set_personality(personality); @@ -198,7 +198,7 @@ SYSCALL_DEFINE1(personality, u_long, personality) return -EINVAL; } - return (long)old; + return old; } -- cgit v1.1 From 94b3dd0f7bb393d93e84a173b1df9b8b64c83ac4 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 4 Jun 2010 14:15:03 -0700 Subject: cgroups: alloc_css_id() increments hierarchy depth Child groups should have a greater depth than their parents. Prior to this change, the parent would incorrectly report zero memory usage for child cgroups when use_hierarchy is enabled. 
test script: mount -t cgroup none /cgroups -o memory cd /cgroups mkdir cg1 echo 1 > cg1/memory.use_hierarchy mkdir cg1/cg11 echo $$ > cg1/cg11/tasks dd if=/dev/zero of=/tmp/foo bs=1M count=1 echo echo CHILD grep cache cg1/cg11/memory.stat echo echo PARENT grep cache cg1/memory.stat echo $$ > tasks rmdir cg1/cg11 cg1 cd / umount /cgroups Using fae9c79, a recent patch that changed alloc_css_id() depth computation, the parent incorrectly reports zero usage: root@ubuntu:~# ./test 1+0 records in 1+0 records out 1048576 bytes (1.0 MB) copied, 0.0151844 s, 69.1 MB/s CHILD cache 1048576 total_cache 1048576 PARENT cache 0 total_cache 0 With this patch, the parent correctly includes child usage: root@ubuntu:~# ./test 1+0 records in 1+0 records out 1048576 bytes (1.0 MB) copied, 0.0136827 s, 76.6 MB/s CHILD cache 1052672 total_cache 1052672 PARENT cache 0 total_cache 1052672 Signed-off-by: Greg Thelen Acked-by: Paul Menage Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: [2.6.34.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 422cb19..3ac6f5b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4598,7 +4598,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, parent_css = parent->subsys[subsys_id]; child_css = child->subsys[subsys_id]; parent_id = parent_css->id; - depth = parent_id->depth; + depth = parent_id->depth + 1; child_id = get_new_cssid(ss, depth); if (IS_ERR(child_id)) -- cgit v1.1 From 9e506f7adce8e6165a104d3d78fddd8ff0cdccf8 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 4 Jun 2010 14:15:04 -0700 Subject: kernel/: fix BUG_ON checks for cpu notifier callbacks direct call The commit 80b5184cc537718122e036afe7e62d202b70d077 ("kernel/: convert cpu notifier to return encapsulate errno value") changed the return value of cpu notifier callbacks. Those callbacks don't return NOTIFY_BAD on failures anymore. But there are a few callbacks which are called directly at init time and checking the return value. I forgot to change BUG_ON checking by the direct callers in the commit. Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 2 +- kernel/timer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 825e112..07b4f1b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -850,7 +850,7 @@ static __init int spawn_ksoftirqd(void) void *cpu = (void *)(long)smp_processor_id(); int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); return 0; diff --git a/kernel/timer.c b/kernel/timer.c index 2454172..ee305c8 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1717,7 +1717,7 @@ void __init init_timers(void) init_timer_stats(); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } -- cgit v1.1 From 2c02dfe7fe3fba97a5665d329d039d2415ea5607 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 31 May 2010 12:19:37 -0700 Subject: module: Make the 'usage' lists be two-way When adding a module that depends on another one, we used to create a one-way list of "modules_which_use_me", so that module unloading could see who needs a module. 
It's actually quite simple to make that list go both ways: so that we not only can see "who uses me", but also see a list of modules that are "used by me". In fact, we always wanted that list in "module_unload_free()": when we unload a module, we want to also release all the other modules that are used by that module. But because we didn't have that list, we used to first iterate over all modules, and then iterate over each "used by me" list of that module. By making the list two-way, we simplify module_unload_free(), and it allows for some trivial fixes later too. Signed-off-by: Linus Torvalds Signed-off-by: Rusty Russell (cleaned & rebased) --- kernel/module.c | 79 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0129769..be18c3e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -523,7 +523,8 @@ static void module_unload_init(struct module *mod) { int cpu; - INIT_LIST_HEAD(&mod->modules_which_use_me); + INIT_LIST_HEAD(&mod->source_list); + INIT_LIST_HEAD(&mod->target_list); for_each_possible_cpu(cpu) { per_cpu_ptr(mod->refptr, cpu)->incs = 0; per_cpu_ptr(mod->refptr, cpu)->decs = 0; @@ -538,8 +539,9 @@ static void module_unload_init(struct module *mod) /* modules using other modules */ struct module_use { - struct list_head list; - struct module *module_which_uses; + struct list_head source_list; + struct list_head target_list; + struct module *source, *target; }; /* Does a already use b? */ @@ -547,8 +549,8 @@ static int already_uses(struct module *a, struct module *b) { struct module_use *use; - list_for_each_entry(use, &b->modules_which_use_me, list) { - if (use->module_which_uses == a) { + list_for_each_entry(use, &b->source_list, source_list) { + if (use->source == a) { DEBUGP("%s uses %s!\n", a->name, b->name); return 1; } @@ -557,6 +559,33 @@ static int already_uses(struct module *a, struct module *b) return 0; } +/* + * Module a uses b + * - we add 'a' as a "source", 'b' as a "target" of module use + * - the module_use is added to the list of 'b' sources (so + * 'b' can walk the list to see who sourced them), and of 'a' + * targets (so 'a' can see what modules it targets). + */ +static int add_module_usage(struct module *a, struct module *b) +{ + int no_warn; + struct module_use *use; + + DEBUGP("Allocating new usage for %s.\n", a->name); + use = kmalloc(sizeof(*use), GFP_ATOMIC); + if (!use) { + printk(KERN_WARNING "%s: out of memory loading\n", a->name); + return -ENOMEM; + } + + use->source = a; + use->target = b; + list_add(&use->source_list, &b->source_list); + list_add(&use->target_list, &a->target_list); + no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); + return 0; +} + /* Module a uses b */ int use_module(struct module *a, struct module *b) { @@ -578,17 +607,11 @@ int use_module(struct module *a, struct module *b) if (err) return 0; - DEBUGP("Allocating new usage for %s.\n", a->name); - use = kmalloc(sizeof(*use), GFP_ATOMIC); - if (!use) { - printk("%s: out of memory loading\n", a->name); + err = add_module_usage(a, b); + if (err) { module_put(b); return 0; } - - use->module_which_uses = a; - list_add(&use->list, &b->modules_which_use_me); - no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); return 1; } EXPORT_SYMBOL_GPL(use_module); @@ -596,22 +619,16 @@ EXPORT_SYMBOL_GPL(use_module); /* Clear the unload stuff of the module. 
*/ static void module_unload_free(struct module *mod) { - struct module *i; - - list_for_each_entry(i, &modules, list) { - struct module_use *use; + struct module_use *use, *tmp; - list_for_each_entry(use, &i->modules_which_use_me, list) { - if (use->module_which_uses == mod) { - DEBUGP("%s unusing %s\n", mod->name, i->name); - module_put(i); - list_del(&use->list); - kfree(use); - sysfs_remove_link(i->holders_dir, mod->name); - /* There can be at most one match. */ - break; - } - } + list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { + struct module *i = use->target; + DEBUGP("%s unusing %s\n", mod->name, i->name); + module_put(i); + list_del(&use->source_list); + list_del(&use->target_list); + kfree(use); + sysfs_remove_link(i->holders_dir, mod->name); } } @@ -735,7 +752,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, goto out; } - if (!list_empty(&mod->modules_which_use_me)) { + if (!list_empty(&mod->source_list)) { /* Other modules depend on us: get rid of them first. */ ret = -EWOULDBLOCK; goto out; @@ -799,9 +816,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) /* Always include a trailing , so userspace can differentiate between this and the old multi-field proc format. */ - list_for_each_entry(use, &mod->modules_which_use_me, list) { + list_for_each_entry(use, &mod->source_list, source_list) { printed_something = 1; - seq_printf(m, "%s,", use->module_which_uses->name); + seq_printf(m, "%s,", use->source->name); } if (mod->init != NULL && mod->exit == NULL) { -- cgit v1.1 From c8e21ced08b39ef8dfe7236fb2a923a95f645262 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:35 -0600 Subject: module: fix kdb's illicit use of struct module_use. Linus changed the structure, and luckily this didn't compile any more. Reported-by: Stephen Rothwell Signed-off-by: Rusty Russell Cc: Jason Wessel Cc: Martin Hicks --- kernel/debug/kdb/kdb_main.c | 12 +++--------- kernel/module.c | 11 +---------- 2 files changed, 4 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index b724c79..184cd82 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1857,12 +1857,6 @@ static int kdb_ef(int argc, const char **argv) } #if defined(CONFIG_MODULES) -/* modules using other modules */ -struct module_use { - struct list_head list; - struct module *module_which_uses; -}; - /* * kdb_lsmod - This function implements the 'lsmod' command. Lists * currently loaded kernel modules. @@ -1894,9 +1888,9 @@ static int kdb_lsmod(int argc, const char **argv) { struct module_use *use; kdb_printf(" [ "); - list_for_each_entry(use, &mod->modules_which_use_me, - list) - kdb_printf("%s ", use->module_which_uses->name); + list_for_each_entry(use, &mod->source_list, + source_list) + kdb_printf("%s ", use->target->name); kdb_printf("]\n"); } #endif diff --git a/kernel/module.c b/kernel/module.c index be18c3e..bbb1d81 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -536,14 +536,6 @@ static void module_unload_init(struct module *mod) mod->waiter = current; } -/* modules using other modules */ -struct module_use -{ - struct list_head source_list; - struct list_head target_list; - struct module *source, *target; -}; - /* Does a already use b? 
*/ static int already_uses(struct module *a, struct module *b) { @@ -589,8 +581,7 @@ static int add_module_usage(struct module *a, struct module *b) /* Module a uses b */ int use_module(struct module *a, struct module *b) { - struct module_use *use; - int no_warn, err; + int err; if (b == NULL || already_uses(a, b)) return 1; -- cgit v1.1 From 80a3d1bb410e000e176931a076cdf19a1e89a955 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:36 -0600 Subject: module: move sysfs exposure to end of load_module This means a little extra work, but is more logical: we don't put anything in sysfs until we're about to put the module into the global list an parse its parameters. This also gives us a logical place to put duplicate module detection in the next patch. Signed-off-by: Rusty Russell --- kernel/module.c | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index bbb1d81..c690d98 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -560,7 +560,6 @@ static int already_uses(struct module *a, struct module *b) */ static int add_module_usage(struct module *a, struct module *b) { - int no_warn; struct module_use *use; DEBUGP("Allocating new usage for %s.\n", a->name); @@ -574,7 +573,6 @@ static int add_module_usage(struct module *a, struct module *b) use->target = b; list_add(&use->source_list, &b->source_list); list_add(&use->target_list, &a->target_list); - no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); return 0; } @@ -619,7 +617,6 @@ static void module_unload_free(struct module *mod) list_del(&use->source_list); list_del(&use->target_list); kfree(use); - sysfs_remove_link(i->holders_dir, mod->name); } } @@ -1303,6 +1300,29 @@ static inline void remove_notes_attrs(struct module *mod) #endif #ifdef CONFIG_SYSFS +static void add_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + int nowarn; + + list_for_each_entry(use, &mod->target_list, target_list) { + nowarn = sysfs_create_link(use->target->holders_dir, + &mod->mkobj.kobj, mod->name); + } +#endif +} + +static void del_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + + list_for_each_entry(use, &mod->target_list, target_list) + sysfs_remove_link(use->target->holders_dir, mod->name); +#endif +} + int module_add_modinfo_attrs(struct module *mod) { struct module_attribute *attr; @@ -1385,6 +1405,10 @@ int mod_sysfs_setup(struct module *mod, { int err; + err = mod_sysfs_init(mod); + if (err) + goto out; + mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); if (!mod->holders_dir) { err = -ENOMEM; @@ -1399,6 +1423,8 @@ int mod_sysfs_setup(struct module *mod, if (err) goto out_unreg_param; + add_usage_links(mod); + kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; @@ -1408,6 +1434,7 @@ out_unreg_holders: kobject_put(mod->holders_dir); out_unreg: kobject_put(&mod->mkobj.kobj); +out: return err; } @@ -1422,10 +1449,15 @@ static void mod_sysfs_fini(struct module *mod) { } +static void del_usage_links(struct module *mod) +{ +} + #endif /* CONFIG_SYSFS */ static void mod_kobject_remove(struct module *mod) { + del_usage_links(mod); module_remove_modinfo_attrs(mod); module_param_sysfs_remove(mod); kobject_put(mod->mkobj.drivers_dir); @@ -2242,11 +2274,6 @@ static noinline struct module *load_module(void __user *umod, /* Now we've moved module, initialize linked lists, etc. 
*/ module_unload_init(mod); - /* add kobject, so we can reference it. */ - err = mod_sysfs_init(mod); - if (err) - goto free_unload; - /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); @@ -2443,6 +2470,7 @@ static noinline struct module *load_module(void __user *umod, err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); if (err < 0) goto unlink; + add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); @@ -2461,9 +2489,6 @@ static noinline struct module *load_module(void __user *umod, module_arch_cleanup(mod); cleanup: free_modinfo(mod); - kobject_del(&mod->mkobj.kobj); - kobject_put(&mod->mkobj.kobj); - free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) free_percpu(mod->refptr); -- cgit v1.1 From 6407ebb271fc34440b306f305e1efb7685eece26 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:36 -0600 Subject: module: Make module sysfs functions private. These were placed in the header in ef665c1a06 to get the various SYSFS/MODULE config combintations to compile. That may have been necessary then, but it's not now. These functions are all local to module.c. Signed-off-by: Rusty Russell Cc: Randy Dunlap --- kernel/module.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index c690d98..808aa18 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1323,7 +1323,7 @@ static void del_usage_links(struct module *mod) #endif } -int module_add_modinfo_attrs(struct module *mod) +static int module_add_modinfo_attrs(struct module *mod) { struct module_attribute *attr; struct module_attribute *temp_attr; @@ -1349,7 +1349,7 @@ int module_add_modinfo_attrs(struct module *mod) return error; } -void module_remove_modinfo_attrs(struct module *mod) +static void module_remove_modinfo_attrs(struct module *mod) { struct module_attribute *attr; int i; @@ -1365,7 +1365,7 @@ void module_remove_modinfo_attrs(struct module *mod) kfree(mod->modinfo_attrs); } -int mod_sysfs_init(struct module *mod) +static int mod_sysfs_init(struct module *mod) { int err; struct kobject *kobj; @@ -1399,7 +1399,7 @@ out: return err; } -int mod_sysfs_setup(struct module *mod, +static int mod_sysfs_setup(struct module *mod, struct kernel_param *kparam, unsigned int num_params) { @@ -1445,6 +1445,27 @@ static void mod_sysfs_fini(struct module *mod) #else /* CONFIG_SYSFS */ +static inline int mod_sysfs_init(struct module *mod) +{ + return 0; +} + +static inline int mod_sysfs_setup(struct module *mod, + struct kernel_param *kparam, + unsigned int num_params) +{ + return 0; +} + +static inline int module_add_modinfo_attrs(struct module *mod) +{ + return 0; +} + +static inline void module_remove_modinfo_attrs(struct module *mod) +{ +} + static void mod_sysfs_fini(struct module *mod) { } -- cgit v1.1 From 75676500f8298f0ee89db12db97294883c4b768e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:36 -0600 Subject: module: make locking more fine-grained. Kay Sievers reports that we still have some contention over module loading which is slowing boot. Linus also disliked a previous "drop lock and regrab" patch to fix the bne2 "gave up waiting for init of module libcrc32c" message. This is more ambitious: we only grab the lock where we need it. 
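To make the "only grab the lock where we need it" idea concrete, here is a minimal user-space sketch of the pattern, with a pthread mutex and a hand-rolled list standing in for module_mutex and the modules list (all names are illustrative, not the kernel code): the expensive work happens unlocked, and the mutex covers only the duplicate check plus the list insertion.

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct node {
	char name[64];
	struct node *next;
};

static struct node *head;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Caller must hold list_lock. */
static int name_on_list(const char *name)
{
	struct node *n;

	for (n = head; n; n = n->next)
		if (strcmp(n->name, name) == 0)
			return 1;
	return 0;
}

int add_node(const char *name)
{
	/* Slow work (allocation, setup) is done without the lock held. */
	struct node *n = calloc(1, sizeof(*n));

	if (!n)
		return -1;
	strncpy(n->name, name, sizeof(n->name) - 1);

	/* The lock protects only the shared list itself. */
	pthread_mutex_lock(&list_lock);
	if (name_on_list(name)) {
		pthread_mutex_unlock(&list_lock);
		free(n);
		return -1;		/* duplicate, like -EEXIST */
	}
	n->next = head;
	head = n;
	pthread_mutex_unlock(&list_lock);
	return 0;
}

The same shape appears in load_module() after this series: the heavy ELF processing runs unlocked, and module_mutex is taken only around the find_module() duplicate check and the list_add_rcu().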
Signed-off-by: Rusty Russell Cc: Brandon Philips Cc: Kay Sievers Cc: Linus Torvalds --- kernel/module.c | 65 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 808aa18..d293c21 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -72,7 +72,11 @@ /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) -/* List of modules, protected by module_mutex or preempt_disable +/* + * Mutex protects: + * 1) List of modules (also safely readable with preempt_disable), + * 2) module_use links, + * 3) module_addr_min/module_addr_max. * (delete uses stop_machine/add uses RCU list operations). */ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); @@ -90,7 +94,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq); static BLOCKING_NOTIFIER_HEAD(module_notify_list); -/* Bounds of module allocation, for speeding __module_address */ +/* Bounds of module allocation, for speeding __module_address. + * Protected by module_mutex. */ static unsigned long module_addr_min = -1UL, module_addr_max = 0; int register_module_notifier(struct notifier_block * nb) @@ -329,7 +334,7 @@ static bool find_symbol_in_section(const struct symsearch *syms, } /* Find a symbol and return it, along with, (optional) crc and - * (optional) module which owns it */ + * (optional) module which owns it. Needs preempt disabled or module_mutex. */ const struct kernel_symbol *find_symbol(const char *name, struct module **owner, const unsigned long **crc, @@ -576,7 +581,7 @@ static int add_module_usage(struct module *a, struct module *b) return 0; } -/* Module a uses b */ +/* Module a uses b: caller needs module_mutex() */ int use_module(struct module *a, struct module *b) { int err; @@ -610,6 +615,7 @@ static void module_unload_free(struct module *mod) { struct module_use *use, *tmp; + mutex_lock(&module_mutex); list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { struct module *i = use->target; DEBUGP("%s unusing %s\n", mod->name, i->name); @@ -618,6 +624,7 @@ static void module_unload_free(struct module *mod) list_del(&use->target_list); kfree(use); } + mutex_unlock(&module_mutex); } #ifdef CONFIG_MODULE_FORCE_UNLOAD @@ -784,13 +791,14 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); async_synchronize_full(); - mutex_lock(&module_mutex); + /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); ddebug_remove_module(mod->name); - free_module(mod); - out: + free_module(mod); + return 0; +out: mutex_unlock(&module_mutex); return ret; } @@ -1006,6 +1014,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, { const unsigned long *crc; + /* Since this should be found in kernel (which can't be removed), + * no locking is necessary. */ if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, &crc, true, false)) BUG(); @@ -1048,8 +1058,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, } #endif /* CONFIG_MODVERSIONS */ -/* Resolve a symbol for this module. I.e. if we find one, record usage. - Must be holding module_mutex. */ +/* Resolve a symbol for this module. I.e. if we find one, record usage. 
*/ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, unsigned int versindex, const char *name, @@ -1059,6 +1068,7 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, const struct kernel_symbol *sym; const unsigned long *crc; + mutex_lock(&module_mutex); sym = find_symbol(name, &owner, &crc, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); /* use_module can fail due to OOM, @@ -1068,6 +1078,7 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, || !use_module(mod, owner)) sym = NULL; } + mutex_unlock(&module_mutex); return sym; } @@ -1306,10 +1317,12 @@ static void add_usage_links(struct module *mod) struct module_use *use; int nowarn; + mutex_lock(&module_mutex); list_for_each_entry(use, &mod->target_list, target_list) { nowarn = sysfs_create_link(use->target->holders_dir, &mod->mkobj.kobj, mod->name); } + mutex_unlock(&module_mutex); #endif } @@ -1318,8 +1331,10 @@ static void del_usage_links(struct module *mod) #ifdef CONFIG_MODULE_UNLOAD struct module_use *use; + mutex_lock(&module_mutex); list_for_each_entry(use, &mod->target_list, target_list) sysfs_remove_link(use->target->holders_dir, mod->name); + mutex_unlock(&module_mutex); #endif } @@ -1497,13 +1512,15 @@ static int __unlink_module(void *_mod) return 0; } -/* Free a module, remove from lists, etc (must hold module_mutex). */ +/* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { trace_module_free(mod); /* Delete from various lists */ + mutex_lock(&module_mutex); stop_machine(__unlink_module, mod, NULL); + mutex_unlock(&module_mutex); remove_notes_attrs(mod); remove_sect_attrs(mod); mod_kobject_remove(mod); @@ -1575,7 +1592,14 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - if (find_symbol(s->name, &owner, NULL, true, false)) { + const struct kernel_symbol *sym; + + /* Stopping preemption makes find_symbol safe. */ + preempt_disable(); + sym = find_symbol(s->name, &owner, NULL, true, false); + preempt_enable(); + + if (sym) { printk(KERN_ERR "%s: exports duplicate symbol %s" " (owned by %s)\n", @@ -2021,11 +2045,13 @@ static void *module_alloc_update_bounds(unsigned long size) void *ret = module_alloc(size); if (ret) { + mutex_lock(&module_mutex); /* Update module bounds. */ if ((unsigned long)ret < module_addr_min) module_addr_min = (unsigned long)ret; if ((unsigned long)ret + size > module_addr_max) module_addr_max = (unsigned long)ret + size; + mutex_unlock(&module_mutex); } return ret; } @@ -2482,7 +2508,9 @@ static noinline struct module *load_module(void __user *umod, * function to insert in a way safe to concurrent readers. * The mutex protects against concurrent writers. */ + mutex_lock(&module_mutex); list_add_rcu(&mod->list, &modules); + mutex_unlock(&module_mutex); err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); if (err < 0) @@ -2504,8 +2532,10 @@ static noinline struct module *load_module(void __user *umod, return mod; unlink: + mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. 
*/ list_del_rcu(&mod->list); + mutex_unlock(&module_mutex); synchronize_sched(); module_arch_cleanup(mod); cleanup: @@ -2556,19 +2586,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; - /* Only one module load at a time, please */ - if (mutex_lock_interruptible(&module_mutex) != 0) - return -EINTR; - /* Do all the hard work */ mod = load_module(umod, len, uargs); - if (IS_ERR(mod)) { - mutex_unlock(&module_mutex); + if (IS_ERR(mod)) return PTR_ERR(mod); - } - - /* Drop lock so they can recurse */ - mutex_unlock(&module_mutex); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); @@ -2585,9 +2606,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); - mutex_lock(&module_mutex); free_module(mod); - mutex_unlock(&module_mutex); wake_up(&module_wq); return ret; } -- cgit v1.1 From 3bafeb6247042dcbb72b0141ec7c7107de9f0b99 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 5 Jun 2010 11:17:36 -0600 Subject: module: move find_module check to end I think Rusty may have made the lock a bit _too_ finegrained there, and didn't add it to some places that needed it. It looks, for example, like PATCH 1/2 actually drops the lock in places where it's needed ("find_module()" is documented to need it, but now load_module() didn't hold it at all when it did the find_module()). Rather than adding a new "module_loading" list, I think we should be able to just use the existing "modules" list, and just fix up the locking a bit. In fact, maybe we could just move the "look up existing module" a bit later - optimistically assuming that the module doesn't exist, and then just undoing the work if it turns out that we were wrong, just before adding ourselves to the list. Signed-off-by: Rusty Russell --- kernel/module.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d293c21..2845004 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2226,11 +2226,6 @@ static noinline struct module *load_module(void __user *umod, goto free_mod; } - if (find_module(mod->name)) { - err = -EEXIST; - goto free_mod; - } - mod->state = MODULE_STATE_COMING; /* Allow arches to frob section contents and sizes. */ @@ -2509,6 +2504,12 @@ static noinline struct module *load_module(void __user *umod, * The mutex protects against concurrent writers. */ mutex_lock(&module_mutex); + if (find_module(mod->name)) { + err = -EEXIST; + /* This will also unlock the mutex */ + goto already_exists; + } + list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -2535,6 +2536,7 @@ static noinline struct module *load_module(void __user *umod, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + already_exists: mutex_unlock(&module_mutex); synchronize_sched(); module_arch_cleanup(mod); -- cgit v1.1 From be593f4ce4eb1bd40e38fdc403371f149f6f12eb Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:37 -0600 Subject: module: verify_export_symbols under the lock It disabled preempt so it was "safe", but nothing stops another module slipping in before this module is added to the global list now we don't hold the lock the whole time. So we check this just after we check for duplicate modules, and just before we put the module in the global list. 
(find_symbol finds symbols in coming and going modules, too). Signed-off-by: Rusty Russell --- kernel/module.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 2845004..f99558e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1571,6 +1571,8 @@ EXPORT_SYMBOL_GPL(__symbol_get); /* * Ensure that an exported symbol [global namespace] does not already exist * in the kernel or in some other module's exported symbol table. + * + * You must hold the module_mutex. */ static int verify_export_symbols(struct module *mod) { @@ -1592,14 +1594,7 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - const struct kernel_symbol *sym; - - /* Stopping preemption makes find_symbol safe. */ - preempt_disable(); - sym = find_symbol(s->name, &owner, NULL, true, false); - preempt_enable(); - - if (sym) { + if (find_symbol(s->name, &owner, NULL, true, false)) { printk(KERN_ERR "%s: exports duplicate symbol %s" " (owned by %s)\n", @@ -2440,11 +2435,6 @@ static noinline struct module *load_module(void __user *umod, goto cleanup; } - /* Find duplicate symbols */ - err = verify_export_symbols(mod); - if (err < 0) - goto cleanup; - /* Set up and sort exception table */ mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); @@ -2506,10 +2496,14 @@ static noinline struct module *load_module(void __user *umod, mutex_lock(&module_mutex); if (find_module(mod->name)) { err = -EEXIST; - /* This will also unlock the mutex */ - goto already_exists; + goto unlock; } + /* Find duplicate symbols */ + err = verify_export_symbols(mod); + if (err < 0) + goto unlock; + list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -2536,7 +2530,7 @@ static noinline struct module *load_module(void __user *umod, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); - already_exists: + unlock: mutex_unlock(&module_mutex); synchronize_sched(); module_arch_cleanup(mod); -- cgit v1.1 From 9bea7f23952d5948f8e5dfdff4de09bb9981fb5f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 5 Jun 2010 11:17:37 -0600 Subject: module: fix bne2 "gave up waiting for init of module libcrc32c" Problem: it's hard to avoid an init routine stumbling over a request_module these days. And it's not clear it's always a bad idea: for example, a module like kvm with dynamic dependencies on kvm-intel or kvm-amd would be neater if it could simply request_module the right one. In this particular case, it's libcrc32c: libcrc32c_mod_init crypto_alloc_shash crypto_alloc_tfm crypto_find_alg crypto_alg_mod_lookup crypto_larval_lookup request_module If another module is waiting inside resolve_symbol() for libcrc32c to finish initializing (ie. bne2 depends on libcrc32c) then it does so holding the module lock, and our request_module() can't make progress until that is released. Waiting inside resolve_symbol() without the lock isn't all that hard: we just need to pass the -EBUSY up the call chain so we can sleep where we don't hold the lock. Error reporting is a bit trickier: we need to copy the name of the unfinished module before releasing the lock. Other notes: 1) This also fixes a theoretical issue where a weak dependency would allow symbol version mismatches to be ignored. 
2) We rename use_module to ref_module to make life easier for the only external user (the out-of-tree ksplice patches). Signed-off-by: Rusty Russell Cc: Linus Torvalds Cc: Tim Abbot Tested-by: Brandon Philips --- kernel/module.c | 91 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f99558e..8c6b428 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -582,33 +582,26 @@ static int add_module_usage(struct module *a, struct module *b) } /* Module a uses b: caller needs module_mutex() */ -int use_module(struct module *a, struct module *b) +int ref_module(struct module *a, struct module *b) { int err; - if (b == NULL || already_uses(a, b)) return 1; - - /* If we're interrupted or time out, we fail. */ - if (wait_event_interruptible_timeout( - module_wq, (err = strong_try_module_get(b)) != -EBUSY, - 30 * HZ) <= 0) { - printk("%s: gave up waiting for init of module %s.\n", - a->name, b->name); + if (b == NULL || already_uses(a, b)) return 0; - } - /* If strong_try_module_get() returned a different error, we fail. */ + /* If module isn't available, we fail. */ + err = strong_try_module_get(b); if (err) - return 0; + return err; err = add_module_usage(a, b); if (err) { module_put(b); - return 0; + return err; } - return 1; + return 0; } -EXPORT_SYMBOL_GPL(use_module); +EXPORT_SYMBOL_GPL(ref_module); /* Clear the unload stuff of the module. */ static void module_unload_free(struct module *mod) @@ -893,11 +886,11 @@ static inline void module_unload_free(struct module *mod) { } -int use_module(struct module *a, struct module *b) +int ref_module(struct module *a, struct module *b) { - return strong_try_module_get(b) == 0; + return strong_try_module_get(b); } -EXPORT_SYMBOL_GPL(use_module); +EXPORT_SYMBOL_GPL(ref_module); static inline void module_unload_init(struct module *mod) { @@ -1062,26 +1055,58 @@ static inline int same_magic(const char *amagic, const char *bmagic, static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, unsigned int versindex, const char *name, - struct module *mod) + struct module *mod, + char ownername[]) { struct module *owner; const struct kernel_symbol *sym; const unsigned long *crc; + int err; mutex_lock(&module_mutex); sym = find_symbol(name, &owner, &crc, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); - /* use_module can fail due to OOM, - or module initialization or unloading */ - if (sym) { - if (!check_version(sechdrs, versindex, name, mod, crc, owner) - || !use_module(mod, owner)) - sym = NULL; + if (!sym) + goto unlock; + + if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { + sym = ERR_PTR(-EINVAL); + goto getname; } + + err = ref_module(mod, owner); + if (err) { + sym = ERR_PTR(err); + goto getname; + } + +getname: + /* We must make copy under the lock if we failed to get ref. 
*/ + strncpy(ownername, module_name(owner), MODULE_NAME_LEN); +unlock: mutex_unlock(&module_mutex); return sym; } +static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, + unsigned int versindex, + const char *name, + struct module *mod) +{ + const struct kernel_symbol *ksym; + char ownername[MODULE_NAME_LEN]; + + if (wait_event_interruptible_timeout(module_wq, + !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, + mod, ownername)) || + PTR_ERR(ksym) != -EBUSY, + 30 * HZ) <= 0) { + printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", + mod->name, ownername); + } + return ksym; +} + /* * /sys/module/foo/sections stuff * J. Corbet @@ -1638,21 +1663,23 @@ static int simplify_symbols(Elf_Shdr *sechdrs, break; case SHN_UNDEF: - ksym = resolve_symbol(sechdrs, versindex, - strtab + sym[i].st_name, mod); + ksym = resolve_symbol_wait(sechdrs, versindex, + strtab + sym[i].st_name, + mod); /* Ok if resolved. */ - if (ksym) { + if (ksym && !IS_ERR(ksym)) { sym[i].st_value = ksym->value; break; } /* Ok if weak. */ - if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) + if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) break; - printk(KERN_WARNING "%s: Unknown symbol %s\n", - mod->name, strtab + sym[i].st_name); - ret = -ENOENT; + printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", + mod->name, strtab + sym[i].st_name, + PTR_ERR(ksym)); + ret = PTR_ERR(ksym) ?: -ENOENT; break; default: -- cgit v1.1 From f6ab91add6355e231e1c47897027b2a6ee4fa268 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 4 Jun 2010 15:18:01 +0200 Subject: perf: Fix signed comparison in perf_adjust_period() Frederic reported that frequency driven swevents didn't work properly and even caused a division-by-zero error. It turns out there are two bugs, the division-by-zero comes from a failure to deal with that in perf_calculate_period(). The other was more interesting and turned out to be a wrong comparison in perf_adjust_period(). The comparison was between an s64 and u64 and got implicitly converted to an unsigned comparison. The problem is that period_left is typically < 0, so it ended up being always true. Cure this by making the local period variables s64. Reported-by: Frederic Weisbecker Tested-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 31d6afe..ff86c55 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1507,6 +1507,9 @@ do { \ divisor = nsec * frequency; } + if (!divisor) + return dividend; + return div64_u64(dividend, divisor); } @@ -1529,7 +1532,7 @@ static int perf_event_start(struct perf_event *event) static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; - u64 period, sample_period; + s64 period, sample_period; s64 delta; period = perf_calculate_period(event, nsec, count); -- cgit v1.1 From dc61b1d65e353d638b2445f71fb8e5b5630f2415 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 8 Jun 2010 11:40:42 +0200 Subject: sched: Fix PROVE_RCU vs cpu_cgroup PROVE_RCU has a few issues with the cpu_cgroup because the scheduler typically holds rq->lock around the css rcu derefs but the generic cgroup code doesn't (and can't) know about that lock. Provide means to add extra checks to the css dereference and use that in the scheduler to annotate its users. 
The addition of rq->lock to these checks is correct because the cgroup_subsys::attach() method takes the rq->lock for each task it moves, therefore by holding that lock, we ensure the task is pinned to the current cgroup and the RCU derefence is valid. That leaves one genuine race in __sched_setscheduler() where we used task_group() without holding any of the required locks and thus raced with the cgroup code. Solve this by moving the check under the appropriate lock. Signed-off-by: Peter Zijlstra Cc: "Paul E. McKenney" LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 115 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 59 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f8b8996..2aaceeb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -306,52 +306,6 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD; */ struct task_group init_task_group; -/* return group to which a task belongs */ -static inline struct task_group *task_group(struct task_struct *p) -{ - struct task_group *tg; - -#ifdef CONFIG_CGROUP_SCHED - tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), - struct task_group, css); -#else - tg = &init_task_group; -#endif - return tg; -} - -/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) -{ - /* - * Strictly speaking this rcu_read_lock() is not needed since the - * task_group is tied to the cgroup, which in turn can never go away - * as long as there are tasks attached to it. - * - * However since task_group() uses task_subsys_state() which is an - * rcu_dereference() user, this quiets CONFIG_PROVE_RCU. - */ - rcu_read_lock(); -#ifdef CONFIG_FAIR_GROUP_SCHED - p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; - p->se.parent = task_group(p)->se[cpu]; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - p->rt.rt_rq = task_group(p)->rt_rq[cpu]; - p->rt.parent = task_group(p)->rt_se[cpu]; -#endif - rcu_read_unlock(); -} - -#else - -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline struct task_group *task_group(struct task_struct *p) -{ - return NULL; -} - #endif /* CONFIG_CGROUP_SCHED */ /* CFS-related fields in a runqueue */ @@ -644,6 +598,49 @@ static inline int cpu_of(struct rq *rq) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() (&__raw_get_cpu_var(runqueues)) +#ifdef CONFIG_CGROUP_SCHED + +/* + * Return the group to which this tasks belongs. + * + * We use task_subsys_state_check() and extend the RCU verification + * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() + * holds that lock for each task it moves into the cgroup. Therefore + * by holding that lock, we pin the task to the current cgroup. 
+ */ +static inline struct task_group *task_group(struct task_struct *p) +{ + struct cgroup_subsys_state *css; + + css = task_subsys_state_check(p, cpu_cgroup_subsys_id, + lockdep_is_held(&task_rq(p)->lock)); + return container_of(css, struct task_group, css); +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; + p->se.parent = task_group(p)->se[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = task_group(p)->rt_rq[cpu]; + p->rt.parent = task_group(p)->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + inline void update_rq_clock(struct rq *rq) { if (!rq->skip_clock_update) @@ -4465,16 +4462,6 @@ recheck: } if (user) { -#ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0) - return -EPERM; -#endif - retval = security_task_setscheduler(p, policy, param); if (retval) return retval; @@ -4490,6 +4477,22 @@ recheck: * runqueue lock must be held. */ rq = __task_rq_lock(p); + +#ifdef CONFIG_RT_GROUP_SCHED + if (user) { + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) { + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return -EPERM; + } + } +#endif + /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; -- cgit v1.1 From 4673247562e39a17e09440fa1400819522ccd446 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Jun 2010 17:53:51 +0200 Subject: genirq: Deal with desc->set_type() changing desc->chip The set_type() function can change the chip implementation when the trigger mode changes. That might result in using an non-initialized irq chip when called from __setup_irq() or when called via set_irq_type() on an already enabled irq. The set_irq_type() function should not be called on an enabled irq, but because we forgot to put a check into it, we have a bunch of users which grew the habit of doing that and it never blew up as the function is serialized via desc->lock against all users of desc->chip and they never hit the non-initialized irq chip issue. The easy fix for the __setup_irq() issue would be to move the irq_chip_set_defaults(desc->chip) call after the trigger setting to make sure that a chip change is covered. But as we have already users, which do the type setting after request_irq(), the safe fix for now is to call irq_chip_set_defaults() from __irq_set_trigger() when desc->set_type() changed the irq chip. It needs a deeper analysis whether we should refuse to change the chip on an already enabled irq, but that'd be a large scale change to fix all the existing users. So that's neither stable nor 2.6.35 material. 
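The hazard and the fix are easier to see in a stripped-down sketch (hypothetical struct and function names, heavily simplified from irq_desc/irq_chip): remember the chip pointer before invoking set_type(), and if the callback swapped it, re-apply the defaults to the replacement before anything calls through it.

#include <stddef.h>

struct chip {
	void (*enable)(void);
};

struct desc {
	struct chip *chip;
	int (*set_type)(struct desc *d, unsigned int type);
};

static void noop_enable(void) { }

/* Fill in any hooks the chip left NULL, like irq_chip_set_defaults(). */
static void chip_set_defaults(struct chip *c)
{
	if (!c->enable)
		c->enable = noop_enable;
}

int set_trigger(struct desc *desc, unsigned int type)
{
	struct chip *old = desc->chip;	/* chip before the callback runs */
	int ret = desc->set_type(desc, type);

	/*
	 * set_type() may have installed a different chip implementation;
	 * make sure the replacement also has sane defaults before use.
	 */
	if (desc->chip != old)
		chip_set_defaults(desc->chip);

	return ret;
}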
Reported-by: Esben Haabendal Signed-off-by: Thomas Gleixner Cc: Benjamin Herrenschmidt Cc: linuxppc-dev Cc: stable@kernel.org --- kernel/irq/manage.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3164ba7..e149748 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -456,6 +456,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); desc->status |= flags; + + if (chip != desc->chip) + irq_chip_set_defaults(desc->chip); } return ret; -- cgit v1.1 From dd4c4f17d722ffeb2515bf781400675a30fcead7 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Fri, 28 May 2010 16:32:14 -0400 Subject: suspend: Move NVS save/restore code to generic suspend functionality Saving platform non-volatile state may be required for suspend to RAM as well as hibernation. Move it to more generic code. Signed-off-by: Matthew Garrett Acked-by: Rafael J. Wysocki Tested-by: Maxim Levitsky Signed-off-by: Len Brown --- kernel/power/Kconfig | 9 +-- kernel/power/Makefile | 2 +- kernel/power/hibernate_nvs.c | 136 ------------------------------------------- kernel/power/nvs.c | 136 +++++++++++++++++++++++++++++++++++++++++++ kernel/power/suspend.c | 6 ++ 5 files changed, 148 insertions(+), 141 deletions(-) delete mode 100644 kernel/power/hibernate_nvs.c create mode 100644 kernel/power/nvs.c (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5c36ea9..ca6066a 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -99,9 +99,13 @@ config PM_SLEEP_ADVANCED_DEBUG depends on PM_ADVANCED_DEBUG default n +config SUSPEND_NVS + bool + config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE + select SUSPEND_NVS if HAS_IOMEM default y ---help--- Allow the system to enter sleep states in which main memory is @@ -130,13 +134,10 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. -config HIBERNATION_NVS - bool - config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE - select HIBERNATION_NVS if HAS_IOMEM + select SUSPEND_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 524e058..f9063c6 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -10,6 +10,6 @@ obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o -obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o +obj-$(CONFIG_SUSPEND_NVS) += nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c deleted file mode 100644 index fdcad9e..0000000 --- a/kernel/power/hibernate_nvs.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory - * - * Copyright (C) 2008,2009 Rafael J. Wysocki , Novell Inc. - * - * This file is released under the GPLv2. - */ - -#include -#include -#include -#include -#include -#include - -/* - * Platforms, like ACPI, may want us to save some memory used by them during - * hibernation and to restore the contents of this memory during the subsequent - * resume. 
The code below implements a mechanism allowing us to do that. - */ - -struct nvs_page { - unsigned long phys_start; - unsigned int size; - void *kaddr; - void *data; - struct list_head node; -}; - -static LIST_HEAD(nvs_list); - -/** - * hibernate_nvs_register - register platform NVS memory region to save - * @start - physical address of the region - * @size - size of the region - * - * The NVS region need not be page-aligned (both ends) and we arrange - * things so that the data from page-aligned addresses in this region will - * be copied into separate RAM pages. - */ -int hibernate_nvs_register(unsigned long start, unsigned long size) -{ - struct nvs_page *entry, *next; - - while (size > 0) { - unsigned int nr_bytes; - - entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); - if (!entry) - goto Error; - - list_add_tail(&entry->node, &nvs_list); - entry->phys_start = start; - nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); - entry->size = (size < nr_bytes) ? size : nr_bytes; - - start += entry->size; - size -= entry->size; - } - return 0; - - Error: - list_for_each_entry_safe(entry, next, &nvs_list, node) { - list_del(&entry->node); - kfree(entry); - } - return -ENOMEM; -} - -/** - * hibernate_nvs_free - free data pages allocated for saving NVS regions - */ -void hibernate_nvs_free(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - free_page((unsigned long)entry->data); - entry->data = NULL; - if (entry->kaddr) { - iounmap(entry->kaddr); - entry->kaddr = NULL; - } - } -} - -/** - * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions - */ -int hibernate_nvs_alloc(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) { - entry->data = (void *)__get_free_page(GFP_KERNEL); - if (!entry->data) { - hibernate_nvs_free(); - return -ENOMEM; - } - } - return 0; -} - -/** - * hibernate_nvs_save - save NVS memory regions - */ -void hibernate_nvs_save(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Saving platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - entry->kaddr = ioremap(entry->phys_start, entry->size); - memcpy(entry->data, entry->kaddr, entry->size); - } -} - -/** - * hibernate_nvs_restore - restore NVS memory regions - * - * This function is going to be called with interrupts disabled, so it - * cannot iounmap the virtual addresses used to access the NVS region. - */ -void hibernate_nvs_restore(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Restoring platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) - memcpy(entry->kaddr, entry->data, entry->size); -} diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c new file mode 100644 index 0000000..1836db6 --- /dev/null +++ b/kernel/power/nvs.c @@ -0,0 +1,136 @@ +/* + * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory + * + * Copyright (C) 2008,2009 Rafael J. Wysocki , Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * suspend and to restore the contents of this memory during the subsequent + * resume. The code below implements a mechanism allowing us to do that. 
+ */ + +struct nvs_page { + unsigned long phys_start; + unsigned int size; + void *kaddr; + void *data; + struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + * suspend_nvs_register - register platform NVS memory region to save + * @start - physical address of the region + * @size - size of the region + * + * The NVS region need not be page-aligned (both ends) and we arrange + * things so that the data from page-aligned addresses in this region will + * be copied into separate RAM pages. + */ +int suspend_nvs_register(unsigned long start, unsigned long size) +{ + struct nvs_page *entry, *next; + + while (size > 0) { + unsigned int nr_bytes; + + entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); + if (!entry) + goto Error; + + list_add_tail(&entry->node, &nvs_list); + entry->phys_start = start; + nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); + entry->size = (size < nr_bytes) ? size : nr_bytes; + + start += entry->size; + size -= entry->size; + } + return 0; + + Error: + list_for_each_entry_safe(entry, next, &nvs_list, node) { + list_del(&entry->node); + kfree(entry); + } + return -ENOMEM; +} + +/** + * suspend_nvs_free - free data pages allocated for saving NVS regions + */ +void suspend_nvs_free(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + free_page((unsigned long)entry->data); + entry->data = NULL; + if (entry->kaddr) { + iounmap(entry->kaddr); + entry->kaddr = NULL; + } + } +} + +/** + * suspend_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int suspend_nvs_alloc(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) { + entry->data = (void *)__get_free_page(GFP_KERNEL); + if (!entry->data) { + suspend_nvs_free(); + return -ENOMEM; + } + } + return 0; +} + +/** + * suspend_nvs_save - save NVS memory regions + */ +void suspend_nvs_save(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Saving platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + entry->kaddr = ioremap(entry->phys_start, entry->size); + memcpy(entry->data, entry->kaddr, entry->size); + } +} + +/** + * suspend_nvs_restore - restore NVS memory regions + * + * This function is going to be called with interrupts disabled, so it + * cannot iounmap the virtual addresses used to access the NVS region. + */ +void suspend_nvs_restore(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) + memcpy(entry->kaddr, entry->data, entry->size); +} diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 56e7dbb..f37cb7d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -16,6 +16,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include "power.h" -- cgit v1.1 From a8fb2608053547bc3152ea61a5ec7cdfce5d942c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jun 2010 14:53:16 -0400 Subject: perf/tracing: Fix regression of perf losing kprobe events With the addition of the code to shrink the kernel tracepoint infrastructure, we lost kprobes being traced by perf. The reason is that I tested if the "tp_event->class->perf_probe" existed before enabling it. This prevents "ftrace only" events (like the function trace events) from being enabled by perf. Unfortunately, kprobe events do not use perf_probe. This causes kprobes to be missed by perf. 
To fix this, we add the test to see if "tp_event->class->reg" exists as well as perf_probe. Normal trace events have only "perf_probe" but no "reg" function, and kprobes and syscalls have the "reg" but no "perf_probe". The ftrace unique events do not have either, so this is a valid test. If a kprobe or syscall is not to be probed by perf, the "reg" function is called anyway, and will return a failure and prevent perf from probing it. Reported-by: Srikar Dronamraju Tested-by: Srikar Dronamraju Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace_event_perf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index e6f6588..8a2b73f 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -96,7 +96,9 @@ int perf_trace_init(struct perf_event *p_event) mutex_lock(&event_mutex); list_for_each_entry(tp_event, &ftrace_events, list) { if (tp_event->event.type == event_id && - tp_event->class && tp_event->class->perf_probe && + tp_event->class && + (tp_event->class->perf_probe || + tp_event->class->reg) && try_module_get(tp_event->mod)) { ret = perf_trace_event_init(tp_event, p_event); break; -- cgit v1.1 From 3310d4d38fbc514e7b18bd3b1eea8effdd63b5aa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Jun 2010 18:02:37 +0200 Subject: nohz: Fix nohz ratelimit Chris Wedgwood reports that 39c0cbe (sched: Rate-limit nohz) causes a serial console regression, unresponsiveness, and indeed it does. The reason is that the nohz code is skipped even when the tick was already stopped before the nohz_ratelimit(cpu) condition changed. Move the nohz_ratelimit() check to the other conditions which prevent long idle sleeps. Reported-by: Chris Wedgwood Tested-by: Brian Bloniarz Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Jiri Kosina Cc: Linus Torvalds Cc: Greg KH Cc: Alan Cox Cc: OGAWA Hirofumi Cc: Jef Driesen LKML-Reference: <1276790557.27822.516.camel@twins> Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1d7b9bc..783fbad 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -315,9 +315,6 @@ void tick_nohz_stop_sched_tick(int inidle) goto end; } - if (nohz_ratelimit(cpu)) - goto end; - ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ do { @@ -328,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle) } while (read_seqretry(&xtime_lock, seq)); if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || - arch_needs_cpu(cpu)) { + arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { -- cgit v1.1 From 3c93717cfa51316e4dbb471e7c0f9d243359d5f8 Mon Sep 17 00:00:00 2001 From: "Alex,Shi" Date: Thu, 17 Jun 2010 14:08:13 +0800 Subject: sched: Fix over-scheduling bug Commit e70971591 ("sched: Optimize unused cgroup configuration") introduced an imbalanced scheduling bug. If we do not use CGROUP, function update_h_load won't update h_load. When the system has a large number of tasks far more than logical CPU number, the incorrect cfs_rq[cpu]->h_load value will cause load_balance() to pull too many tasks to the local CPU from the busiest CPU. So the busiest CPU keeps going in a round robin. That will hurt performance. 
The issue was found originally by a scientific calculation workload that developed by Yanmin. With that commit, the workload performance drops about 40%. CPU before after 00 : 2 : 7 01 : 1 : 7 02 : 11 : 6 03 : 12 : 7 04 : 6 : 6 05 : 11 : 7 06 : 10 : 6 07 : 12 : 7 08 : 11 : 6 09 : 12 : 6 10 : 1 : 6 11 : 1 : 6 12 : 6 : 6 13 : 2 : 6 14 : 2 : 6 15 : 1 : 6 Reviewed-by: Yanmin zhang Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra LKML-Reference: <1276754893.9452.5442.camel@debian> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2aaceeb..6c9e7c8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1657,9 +1657,6 @@ static void update_shares(struct sched_domain *sd) static void update_h_load(long cpu) { - if (root_task_group_empty()) - return; - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } -- cgit v1.1 From f3b577dec1f2ce32d2db6d2ca6badff7002512af Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Tue, 1 Jun 2010 14:06:13 +0100 Subject: rcu: apply RCU protection to wake_affine() The task_group() function returns a pointer that must be protected by either RCU, the ->alloc_lock, or the cgroup lock (see the rcu_dereference_check() in task_subsys_state(), which is invoked by task_group()). The wake_affine() function currently does none of these, which means that a concurrent update would be within its rights to free the structure returned by task_group(). Because wake_affine() uses this structure only to compute load-balancing heuristics, there is no reason to acquire either of the two locks. Therefore, this commit introduces an RCU read-side critical section that starts before the first call to task_group() and ends after the last use of the "tg" pointer returned from task_group(). Thanks to Li Zefan for pointing out the need to extend the RCU read-side critical section from that proposed by the original patch. Signed-off-by: Daniel J Blueman Signed-off-by: Paul E. McKenney --- kernel/sched_fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index eed35ed..a878b53 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1240,6 +1240,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * effect of the currently running task from the load * of the current CPU: */ + rcu_read_lock(); if (sync) { tg = task_group(current); weight = current->se.load.weight; @@ -1275,6 +1276,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) balanced = this_eff_load <= prev_eff_load; } else balanced = true; + rcu_read_unlock(); /* * If the currently running task will sleep within -- cgit v1.1 From 8695159967957015f8dfb49315d6f88e111d90e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Jun 2010 11:44:53 +0200 Subject: sched: silence PROVE_RCU in sched_fork() Because cgroup_fork() is ran before sched_fork() [ from copy_process() ] and the child's pid is not yet visible the child is pinned to its cgroup. Therefore we can silence this warning. A nicer solution would be moving cgroup_fork() to right after dup_task_struct() and exclude PF_STARTING from task_subsys_state(). Signed-off-by: Peter Zijlstra Reviewed-by: Li Zefan Signed-off-by: Paul E. 
McKenney --- kernel/sched.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f8b8996..a2d215d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2494,7 +2494,16 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->sched_class->task_fork) p->sched_class->task_fork(p); + /* + * The child is not yet in the pid-hash so no cgroup attach races, + * and the cgroup is pinned to this child due to cgroup_fork() + * is ran before sched_fork(). + * + * Silence PROVE_RCU. + */ + rcu_read_lock(); set_task_cpu(p, cpu); + rcu_read_unlock(); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) -- cgit v1.1 From 0d98bb2656e9bd2dfda2d089db1fe1dbdab41504 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 24 May 2010 12:11:43 -0700 Subject: sched: Prevent compiler from optimising the sched_avg_update() loop GCC 4.4.1 on ARM has been observed to replace the while loop in sched_avg_update with a call to uldivmod, resulting in the following build failure at link-time: kernel/built-in.o: In function `sched_avg_update': kernel/sched.c:1261: undefined reference to `__aeabi_uldivmod' kernel/sched.c:1261: undefined reference to `__aeabi_uldivmod' make: *** [.tmp_vmlinux1] Error 1 This patch introduces a fake data hazard to the loop body to prevent the compiler optimising the loop away. Signed-off-by: Will Deacon Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra Cc: Catalin Marinas Cc: Russell King Cc: Linus Torvalds Cc: Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6c9e7c8..a24d6d5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1254,6 +1254,12 @@ static void sched_avg_update(struct rq *rq) s64 period = sched_avg_period(); while ((s64)(rq->clock - rq->age_stamp) > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (rq->age_stamp)); rq->age_stamp += period; rq->rt_avg /= 2; } -- cgit v1.1 From e05bd3367bd3d88715b53766f95bb3a8ec7ab59e Mon Sep 17 00:00:00 2001 From: Pavan Naregundi Date: Tue, 29 Jun 2010 15:05:28 -0700 Subject: kexec: fix Oops in crash_shrink_memory() When crashkernel is not enabled, "echo 0 > /sys/kernel/kexec_crash_size" OOPSes the kernel in crash_shrink_memory. This happens when crash_shrink_memory tries to release the 'crashk_res' resource which are not reserved. Also value of "/sys/kernel/kexec_crash_size" shows as 1, which should be 0. This patch fixes the OOPS in crash_shrink_memory and shows "/sys/kernel/kexec_crash_size" as 0 when crash kernel memory is not reserved. 
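Both guards are small; a self-contained sketch of their effect, with a cut-down stand-in for the kernel's struct resource (names are illustrative, not the kexec code), would be:

#include <stddef.h>

struct res {
	unsigned long start, end;
	struct res *parent;	/* non-NULL only once the region is reserved */
};

static struct res crashk = { 0, 0, NULL };	/* nothing reserved */

/* Report 0 when no region was reserved, instead of end - start + 1 == 1. */
unsigned long crash_size(void)
{
	if (crashk.end == crashk.start)
		return 0;
	return crashk.end - crashk.start + 1;
}

/* Never hand an unreserved region back to the resource tree. */
int crash_release(void (*release)(struct res *))
{
	if (crashk.parent == NULL)
		return -1;	/* was never reserved, nothing to release */
	release(&crashk);
	return 0;
}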
Signed-off-by: Pavan Naregundi Reviewed-by: WANG Cong Cc: Simon Horman Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 474a847..131b170 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1089,9 +1089,10 @@ void crash_kexec(struct pt_regs *regs) size_t crash_get_memory_size(void) { - size_t size; + size_t size = 0; mutex_lock(&kexec_mutex); - size = crashk_res.end - crashk_res.start + 1; + if (crashk_res.end != crashk_res.start) + size = crashk_res.end - crashk_res.start + 1; mutex_unlock(&kexec_mutex); return size; } @@ -1134,7 +1135,7 @@ int crash_shrink_memory(unsigned long new_size) free_reserved_phys_range(end, crashk_res.end); - if (start == end) + if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); crashk_res.end = end - 1; -- cgit v1.1 From 7a0ea09ad5352efce8fe79ed853150449903b9f5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 30 Jun 2010 09:51:19 +0200 Subject: futex: futex_find_get_task remove credentails check futex_find_get_task is currently used (through lookup_pi_state) from two contexts, futex_requeue and futex_lock_pi_atomic. None of the paths looks it needs the credentials check, though. Different (e)uids shouldn't matter at all because the only thing that is important for shared futex is the accessibility of the shared memory. The credentail check results in glibc assert failure or process hang (if glibc is compiled without assert support) for shared robust pthread mutex with priority inheritance if a process tries to lock already held lock owned by a process with a different euid: pthread_mutex_lock.c:312: __pthread_mutex_lock_full: Assertion `(-(e)) != 3 || !robust' failed. The problem is that futex_lock_pi_atomic which is called when we try to lock already held lock checks the current holder (tid is stored in the futex value) to get the PI state. It uses lookup_pi_state which in turn gets task struct from futex_find_get_task. ESRCH is returned either when the task is not found or if credentials check fails. futex_lock_pi_atomic simply returns if it gets ESRCH. glibc code, however, doesn't expect that robust lock returns with ESRCH because it should get either success or owner died. 
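The resulting lookup contract is simple: either the task exists and a reference is taken, or NULL comes back and the caller maps that to -ESRCH; no credential comparison is involved. A compilable user-space sketch of that shape, with a flat table and a plain counter standing in for the pid hash, RCU and the task refcount (all names are illustrative):

#include <stddef.h>

struct task {
	int pid;
	int refcount;
};

#define NTASKS 4
static struct task table[NTASKS] = {
	{ 1, 1 }, { 2, 1 }, { 3, 1 }, { 4, 1 },
};

static void lookup_lock(void)   { /* stands in for rcu_read_lock()   */ }
static void lookup_unlock(void) { /* stands in for rcu_read_unlock() */ }

struct task *find_get_task(int pid)
{
	struct task *p = NULL;
	int i;

	lookup_lock();
	for (i = 0; i < NTASKS; i++) {
		if (table[i].pid == pid) {
			p = &table[i];
			p->refcount++;	/* pin the task while "locked" */
			break;
		}
	}
	lookup_unlock();

	return p;	/* NULL simply means "no such task" */
}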
Signed-off-by: Michal Hocko Acked-by: Darren Hart Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Nick Piggin Cc: Alexey Kuznetsov Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/futex.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e7a35f10..6a3a5fa 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -429,20 +429,11 @@ static void free_pi_state(struct futex_pi_state *pi_state) static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; - const struct cred *cred = current_cred(), *pcred; rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p) { - p = ERR_PTR(-ESRCH); - } else { - pcred = __task_cred(p); - if (cred->euid != pcred->euid && - cred->euid != pcred->uid) - p = ERR_PTR(-ESRCH); - else - get_task_struct(p); - } + if (p) + get_task_struct(p); rcu_read_unlock(); @@ -564,8 +555,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, if (!pid) return -ESRCH; p = futex_find_get_task(pid); - if (IS_ERR(p)) - return PTR_ERR(p); + if (!p) + return -ESRCH; /* * We need to look at the task state flags to figure out, -- cgit v1.1 From 8c215bd3890c347dfb6a2db4779755f8b9c298a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Jul 2010 09:07:17 +0200 Subject: sched: Cure nr_iowait_cpu() users Commit 0224cf4c5e (sched: Intoduce get_cpu_iowait_time_us()) broke things by not making sure preemption was indeed disabled by the callers of nr_iowait_cpu() which took the iowait value of the current cpu. This resulted in a heap of preempt warnings. Cure this by making nr_iowait_cpu() take a cpu number and fix up the callers to pass in the right number. Signed-off-by: Peter Zijlstra Cc: Arjan van de Ven Cc: Sergey Senozhatsky Cc: Rafael J. 
Wysocki Cc: Maxim Levitsky Cc: Len Brown Cc: Pavel Machek Cc: Jiri Slaby Cc: linux-pm@lists.linux-foundation.org LKML-Reference: <1277968037.1868.120.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- kernel/time/tick-sched.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a24d6d5..f87abe3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2864,9 +2864,9 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_iowait_cpu(void) +unsigned long nr_iowait_cpu(int cpu) { - struct rq *this = this_rq(); + struct rq *this = cpu_rq(cpu); return atomic_read(&this->nr_iowait); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1d7b9bc..1a6f828 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -154,14 +154,14 @@ static void tick_nohz_update_jiffies(ktime_t now) * Updates the per cpu time idle statistics counters */ static void -update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time) +update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) { ktime_t delta; if (ts->idle_active) { delta = ktime_sub(now, ts->idle_entrytime); ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - if (nr_iowait_cpu() > 0) + if (nr_iowait_cpu(cpu) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); ts->idle_entrytime = now; } @@ -175,19 +175,19 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - update_ts_time_stats(ts, now, NULL); + update_ts_time_stats(cpu, ts, now, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(0); } -static ktime_t tick_nohz_start_idle(struct tick_sched *ts) +static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) { ktime_t now; now = ktime_get(); - update_ts_time_stats(ts, now, NULL); + update_ts_time_stats(cpu, ts, now, NULL); ts->idle_entrytime = now; ts->idle_active = 1; @@ -216,7 +216,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) if (!tick_nohz_enabled) return -1; - update_ts_time_stats(ts, ktime_get(), last_update_time); + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); return ktime_to_us(ts->idle_sleeptime); } @@ -242,7 +242,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) if (!tick_nohz_enabled) return -1; - update_ts_time_stats(ts, ktime_get(), last_update_time); + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); return ktime_to_us(ts->iowait_sleeptime); } @@ -284,7 +284,7 @@ void tick_nohz_stop_sched_tick(int inidle) */ ts->inidle = 1; - now = tick_nohz_start_idle(ts); + now = tick_nohz_start_idle(cpu, ts); /* * If this cpu is offline and it is the one which updates -- cgit v1.1 From ff49d74ad383f54041378144ca1a229ee9aeaa59 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Sat, 3 Jul 2010 13:07:35 +1000 Subject: module: initialize module dynamic debug later We should initialize the module dynamic debug datastructures only after determining that the module is not loaded yet. This fixes a bug that introduced in 2.6.35-rc2, where when a trying to load a module twice, we also load it's dynamic printing data twice which causes all sorts of nasty issues. Also handle the dynamic debug cleanup later on failure. 
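The ordering the patch establishes follows the usual load_module() unwinding style: collect the debug table early, register it only after the duplicate-name check has passed, and tear it down again on any later failure. A simplified, self-contained sketch of that control flow (stub helpers, hypothetical names; the real code also drops the mutex before the later steps):

#include <stddef.h>

struct debug_table { int registered; };

static int names_collide(const char *name)           { (void)name; return 0; }
static void register_debug(struct debug_table *d)    { d->registered = 1; }
static void unregister_debug(struct debug_table *d)  { d->registered = 0; }
static int finish_load(const char *name)             { (void)name; return 0; }
static void lock_modules(void)   { }
static void unlock_modules(void) { }

int load(const char *name, struct debug_table *dbg)
{
	int err;

	lock_modules();
	if (names_collide(name)) {
		err = -1;		/* like -EEXIST: nothing registered yet */
		goto unlock;
	}

	/* Only now is it safe to publish the per-module debug entries. */
	if (dbg)
		register_debug(dbg);

	err = finish_load(name);
	if (err)
		goto unregister;

	unlock_modules();
	return 0;

 unregister:
	if (dbg)
		unregister_debug(dbg);	/* undo in reverse order on failure */
 unlock:
	unlock_modules();
	return err;
}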
Signed-off-by: Yehuda Sadeh Signed-off-by: Rusty Russell (removed a #ifdef) Signed-off-by: Linus Torvalds --- kernel/module.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 8c6b428..5d2d281 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2062,6 +2062,12 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) #endif } +static void dynamic_debug_remove(struct _ddebug *debug) +{ + if (debug) + ddebug_remove_module(debug->modname); +} + static void *module_alloc_update_bounds(unsigned long size) { void *ret = module_alloc(size); @@ -2124,6 +2130,8 @@ static noinline struct module *load_module(void __user *umod, void *ptr = NULL; /* Stops spurious gcc warning */ unsigned long symoffs, stroffs, *strmap; void __percpu *percpu; + struct _ddebug *debug = NULL; + unsigned int num_debug = 0; mm_segment_t old_fs; @@ -2476,15 +2484,9 @@ static noinline struct module *load_module(void __user *umod, kfree(strmap); strmap = NULL; - if (!mod->taints) { - struct _ddebug *debug; - unsigned int num_debug; - + if (!mod->taints) debug = section_objs(hdr, sechdrs, secstrings, "__verbose", sizeof(*debug), &num_debug); - if (debug) - dynamic_debug_setup(debug, num_debug); - } err = module_finalize(hdr, sechdrs, mod); if (err < 0) @@ -2526,10 +2528,13 @@ static noinline struct module *load_module(void __user *umod, goto unlock; } + if (debug) + dynamic_debug_setup(debug, num_debug); + /* Find duplicate symbols */ err = verify_export_symbols(mod); if (err < 0) - goto unlock; + goto ddebug; list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -2557,6 +2562,8 @@ static noinline struct module *load_module(void __user *umod, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + ddebug: + dynamic_debug_remove(debug); unlock: mutex_unlock(&module_mutex); synchronize_sched(); -- cgit v1.1