diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup.c | 14 | ||||
-rw-r--r-- | kernel/events/core.c | 4 | ||||
-rw-r--r-- | kernel/events/ring_buffer.c | 31 | ||||
-rw-r--r-- | kernel/mutex.c | 32 | ||||
-rw-r--r-- | kernel/power/hibernate.c | 2 | ||||
-rw-r--r-- | kernel/time/clockevents.c | 65 | ||||
-rw-r--r-- | kernel/trace/blktrace.c | 36 |
7 files changed, 132 insertions, 52 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2418b6e..8bd9cfd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2039,7 +2039,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* @tsk either already exited or can't exit until the end */ if (tsk->flags & PF_EXITING) - continue; + goto next; /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); @@ -2047,7 +2047,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, ent.cgrp = task_cgroup_from_root(tsk, root); /* nothing to do if this task is already in the cgroup */ if (ent.cgrp == cgrp) - continue; + goto next; /* * saying GFP_ATOMIC has no effect here because we did prealloc * earlier, but it's good form to communicate our expectations. @@ -2055,7 +2055,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; - + next: if (!threadgroup) break; } while_each_thread(leader, tsk); @@ -3188,11 +3188,9 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, WARN_ON_ONCE(!rcu_read_lock_held()); - /* if first iteration, visit the leftmost descendant */ - if (!pos) { - next = css_leftmost_descendant(root); - return next != root ? next : NULL; - } + /* if first iteration, visit leftmost descendant which may be @root */ + if (!pos) + return css_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) diff --git a/kernel/events/core.c b/kernel/events/core.c index d49a9d2..953c143 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6767,6 +6767,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (ret) return -EFAULT; + /* disabled for now */ + if (attr->mmap2) + return -EINVAL; + if (attr->__reserved_1) return -EINVAL; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index cd55144..9c2ddfb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -87,10 +87,31 @@ again: goto out; /* - * Publish the known good head. Rely on the full barrier implied - * by atomic_dec_and_test() order the rb->head read and this - * write. + * Since the mmap() consumer (userspace) can run on a different CPU: + * + * kernel user + * + * READ ->data_tail READ ->data_head + * smp_mb() (A) smp_rmb() (C) + * WRITE $data READ $data + * smp_wmb() (B) smp_mb() (D) + * STORE ->data_head WRITE ->data_tail + * + * Where A pairs with D, and B pairs with C. + * + * I don't think A needs to be a full barrier because we won't in fact + * write data until we see the store from userspace. So we simply don't + * issue the data WRITE until we observe it. Be conservative for now. + * + * OTOH, D needs to be a full barrier since it separates the data READ + * from the tail WRITE. + * + * For B a WMB is sufficient since it separates two WRITEs, and for C + * an RMB is sufficient since it separates two READs. + * + * See perf_output_begin(). */ + smp_wmb(); rb->user_page->data_head = head; /* @@ -154,9 +175,11 @@ int perf_output_begin(struct perf_output_handle *handle, * Userspace could choose to issue a mb() before updating the * tail pointer. So that all reads will be completed before the * write is issued. + * + * See perf_output_put_handle(). */ tail = ACCESS_ONCE(rb->user_page->data_tail); - smp_rmb(); + smp_mb(); offset = head = local_read(&rb->head); head += size; if (unlikely(!perf_output_space(rb, tail, offset, head))) diff --git a/kernel/mutex.c b/kernel/mutex.c index 6d647ae..d24105b 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -410,7 +410,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, static __always_inline int __sched __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, - struct ww_acquire_ctx *ww_ctx) + struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { struct task_struct *task = current; struct mutex_waiter waiter; @@ -450,7 +450,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct task_struct *owner; struct mspin_node node; - if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { + if (use_ww_ctx && ww_ctx->acquired > 0) { struct ww_mutex *ww; ww = container_of(lock, struct ww_mutex, base); @@ -480,7 +480,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if ((atomic_read(&lock->count) == 1) && (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { lock_acquired(&lock->dep_map, ip); - if (!__builtin_constant_p(ww_ctx == NULL)) { + if (use_ww_ctx) { struct ww_mutex *ww; ww = container_of(lock, struct ww_mutex, base); @@ -551,7 +551,7 @@ slowpath: goto err; } - if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { + if (use_ww_ctx && ww_ctx->acquired > 0) { ret = __mutex_lock_check_stamp(lock, ww_ctx); if (ret) goto err; @@ -575,7 +575,7 @@ skip_wait: lock_acquired(&lock->dep_map, ip); mutex_set_owner(lock); - if (!__builtin_constant_p(ww_ctx == NULL)) { + if (use_ww_ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct mutex_waiter *cur; @@ -615,7 +615,7 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL); + subclass, NULL, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -625,7 +625,7 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) { might_sleep(); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - 0, nest, _RET_IP_, NULL); + 0, nest, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); @@ -635,7 +635,7 @@ mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); return __mutex_lock_common(lock, TASK_KILLABLE, - subclass, NULL, _RET_IP_, NULL); + subclass, NULL, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); @@ -644,7 +644,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL); + subclass, NULL, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -682,7 +682,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx); + 0, &ctx->dep_map, _RET_IP_, ctx, 1); if (!ret && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -697,7 +697,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx); + 0, &ctx->dep_map, _RET_IP_, ctx, 1); if (!ret && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -809,28 +809,28 @@ __mutex_lock_slowpath(atomic_t *lock_count) struct mutex *lock = container_of(lock_count, struct mutex, count); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL); + NULL, _RET_IP_, NULL, 0); } static noinline int __sched __mutex_lock_killable_slowpath(struct mutex *lock) { return __mutex_lock_common(lock, TASK_KILLABLE, 0, - NULL, _RET_IP_, NULL); + NULL, _RET_IP_, NULL, 0); } static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock) { return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL); + NULL, _RET_IP_, NULL, 0); } static noinline int __sched __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx); + NULL, _RET_IP_, ctx, 1); } static noinline int __sched @@ -838,7 +838,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx); + NULL, _RET_IP_, ctx, 1); } #endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c9c759d..0121dab 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -846,7 +846,7 @@ static int software_resume(void) goto Finish; } -late_initcall(software_resume); +late_initcall_sync(software_resume); static const char * const hibernation_modes[] = { diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 38959c8..662c579 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -33,29 +33,64 @@ struct ce_unbind { int res; }; -/** - * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds - * @latch: value to convert - * @evt: pointer to clock event device descriptor - * - * Math helper, returns latch value converted to nanoseconds (bound checked) - */ -u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, + bool ismax) { u64 clc = (u64) latch << evt->shift; + u64 rnd; if (unlikely(!evt->mult)) { evt->mult = 1; WARN_ON(1); } + rnd = (u64) evt->mult - 1; + + /* + * Upper bound sanity check. If the backwards conversion is + * not equal latch, we know that the above shift overflowed. + */ + if ((clc >> evt->shift) != (u64)latch) + clc = ~0ULL; + + /* + * Scaled math oddities: + * + * For mult <= (1 << shift) we can safely add mult - 1 to + * prevent integer rounding loss. So the backwards conversion + * from nsec to device ticks will be correct. + * + * For mult > (1 << shift), i.e. device frequency is > 1GHz we + * need to be careful. Adding mult - 1 will result in a value + * which when converted back to device ticks can be larger + * than latch by up to (mult - 1) >> shift. For the min_delta + * calculation we still want to apply this in order to stay + * above the minimum device ticks limit. For the upper limit + * we would end up with a latch value larger than the upper + * limit of the device, so we omit the add to stay below the + * device upper boundary. + * + * Also omit the add if it would overflow the u64 boundary. + */ + if ((~0ULL - clc > rnd) && + (!ismax || evt->mult <= (1U << evt->shift))) + clc += rnd; do_div(clc, evt->mult); - if (clc < 1000) - clc = 1000; - if (clc > KTIME_MAX) - clc = KTIME_MAX; - return clc; + /* Deltas less than 1usec are pointless noise */ + return clc > 1000 ? clc : 1000; +} + +/** + * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds + * @latch: value to convert + * @evt: pointer to clock event device descriptor + * + * Math helper, returns latch value converted to nanoseconds (bound checked) + */ +u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +{ + return cev_delta2ns(latch, evt, false); } EXPORT_SYMBOL_GPL(clockevent_delta2ns); @@ -380,8 +415,8 @@ void clockevents_config(struct clock_event_device *dev, u32 freq) sec = 600; clockevents_calc_mult_shift(dev, freq, sec); - dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); - dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); + dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); + dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); } /** diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b8b8560..f785aef 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -26,6 +26,7 @@ #include <linux/export.h> #include <linux/time.h> #include <linux/uaccess.h> +#include <linux/list.h> #include <trace/events/block.h> @@ -38,6 +39,9 @@ static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; static bool blk_tracer_enabled __read_mostly; +static LIST_HEAD(running_trace_list); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); + /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 @@ -107,10 +111,18 @@ record_it: * Send out a notify for this process, if we haven't done so since a trace * started */ -static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) +static void trace_note_tsk(struct task_struct *tsk) { + unsigned long flags; + struct blk_trace *bt; + tsk->btrace_seq = blktrace_seq; - trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); + spin_lock_irqsave(&running_trace_lock, flags); + list_for_each_entry(bt, &running_trace_list, running_list) { + trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, + sizeof(tsk->comm)); + } + spin_unlock_irqrestore(&running_trace_lock, flags); } static void trace_note_time(struct blk_trace *bt) @@ -229,16 +241,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, goto record_it; } + if (unlikely(tsk->btrace_seq != blktrace_seq)) + trace_note_tsk(tsk); + /* * A word about the locking here - we disable interrupts to reserve * some space in the relay per-cpu buffer, to prevent an irq * from coming in and stepping on our toes. */ local_irq_save(flags); - - if (unlikely(tsk->btrace_seq != blktrace_seq)) - trace_note_tsk(bt, tsk); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); @@ -477,6 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->dir = dir; bt->dev = dev; atomic_set(&bt->dropped, 0); + INIT_LIST_HEAD(&bt->running_list); ret = -EIO; bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, @@ -567,13 +579,12 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, .end_lba = cbuts.end_lba, .pid = cbuts.pid, }; - memcpy(&buts.name, &cbuts.name, 32); ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; - if (copy_to_user(arg, &buts.name, 32)) { + if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { blk_trace_remove(q); return -EFAULT; } @@ -601,6 +612,9 @@ int blk_trace_startstop(struct request_queue *q, int start) blktrace_seq++; smp_mb(); bt->trace_state = Blktrace_running; + spin_lock_irq(&running_trace_lock); + list_add(&bt->running_list, &running_trace_list); + spin_unlock_irq(&running_trace_lock); trace_note_time(bt); ret = 0; @@ -608,6 +622,9 @@ int blk_trace_startstop(struct request_queue *q, int start) } else { if (bt->trace_state == Blktrace_running) { bt->trace_state = Blktrace_stopped; + spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); ret = 0; } @@ -1472,6 +1489,9 @@ static int blk_trace_remove_queue(struct request_queue *q) if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); + spin_lock_irq(&running_trace_lock); + list_del(&bt->running_list); + spin_unlock_irq(&running_trace_lock); blk_trace_free(bt); return 0; } |