From 42d35d48ce7cefb9429880af19d1c329d1554e7a Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Mon, 29 Dec 2008 15:49:53 -0800 Subject: futex: make futex_(get|put)_key() calls symmetric Impact: cleanup This patch makes the calls to futex_get_key_refs() and futex_drop_key_refs() explicitly symmetric by only "putting" keys we successfully "got". Also cleanup a couple return points that didn't "put" after a successful "get". Build and boot tested on an x86_64 system. Signed-off-by: Darren Hart Cc: Signed-off-by: Ingo Molnar --- kernel/futex.c | 67 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b4f87ba..c5ac55c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -723,8 +723,8 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) } spin_unlock(&hb->lock); -out: put_futex_key(fshared, &key); +out: return ret; } @@ -748,7 +748,7 @@ retryfull: goto out; ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) - goto out; + goto out_put_key1; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -770,12 +770,12 @@ retry: * but we might get them from range checking */ ret = op_ret; - goto out; + goto out_put_keys; #endif if (unlikely(op_ret != -EFAULT)) { ret = op_ret; - goto out; + goto out_put_keys; } /* @@ -789,7 +789,7 @@ retry: ret = futex_handle_fault((unsigned long)uaddr2, attempt); if (ret) - goto out; + goto out_put_keys; goto retry; } @@ -827,10 +827,11 @@ retry: spin_unlock(&hb1->lock); if (hb1 != hb2) spin_unlock(&hb2->lock); -out: +out_put_keys: put_futex_key(fshared, &key2); +out_put_key1: put_futex_key(fshared, &key1); - +out: return ret; } @@ -847,13 +848,13 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, struct futex_q *this, *next; int ret, drop_count = 0; - retry: +retry: ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; ret = get_futex_key(uaddr2, fshared, &key2); if (unlikely(ret != 0)) - goto out; + goto out_put_key1; hb1 = hash_futex(&key1); hb2 = hash_futex(&key2); @@ -875,7 +876,7 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, if (!ret) goto retry; - return ret; + goto out_put_keys; } if (curval != *cmpval) { ret = -EAGAIN; @@ -920,9 +921,11 @@ out_unlock: while (--drop_count >= 0) drop_futex_key_refs(&key1); -out: +out_put_keys: put_futex_key(fshared, &key2); +out_put_key1: put_futex_key(fshared, &key1); +out: return ret; } @@ -983,7 +986,7 @@ static int unqueue_me(struct futex_q *q) int ret = 0; /* In the common case we don't take the spinlock, which is nice. */ - retry: +retry: lock_ptr = q->lock_ptr; barrier(); if (lock_ptr != NULL) { @@ -1165,11 +1168,11 @@ static int futex_wait(u32 __user *uaddr, int fshared, q.pi_state = NULL; q.bitset = bitset; - retry: +retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) - goto out_release_sem; + goto out; hb = queue_lock(&q); @@ -1197,6 +1200,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, if (unlikely(ret)) { queue_unlock(&q, hb); + put_futex_key(fshared, &q.key); ret = get_user(uval, uaddr); @@ -1206,7 +1210,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, } ret = -EWOULDBLOCK; if (uval != val) - goto out_unlock_release_sem; + goto out_unlock_put_key; /* Only actually queue if *uaddr contained val. */ queue_me(&q, hb); @@ -1298,11 +1302,11 @@ static int futex_wait(u32 __user *uaddr, int fshared, return -ERESTART_RESTARTBLOCK; } - out_unlock_release_sem: +out_unlock_put_key: queue_unlock(&q, hb); - - out_release_sem: put_futex_key(fshared, &q.key); + +out: return ret; } @@ -1351,16 +1355,16 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, } q.pi_state = NULL; - retry: +retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) - goto out_release_sem; + goto out; - retry_unlocked: +retry_unlocked: hb = queue_lock(&q); - retry_locked: +retry_locked: ret = lock_taken = 0; /* @@ -1381,14 +1385,14 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, */ if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { ret = -EDEADLK; - goto out_unlock_release_sem; + goto out_unlock_put_key; } /* * Surprise - we got the lock. Just return to userspace: */ if (unlikely(!curval)) - goto out_unlock_release_sem; + goto out_unlock_put_key; uval = curval; @@ -1424,7 +1428,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, * We took the lock due to owner died take over. */ if (unlikely(lock_taken)) - goto out_unlock_release_sem; + goto out_unlock_put_key; /* * We dont have the lock. Look up the PI state (or create it if @@ -1463,7 +1467,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, goto retry_locked; } default: - goto out_unlock_release_sem; + goto out_unlock_put_key; } } @@ -1554,16 +1558,17 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, destroy_hrtimer_on_stack(&to->timer); return ret != -EINTR ? ret : -ERESTARTNOINTR; - out_unlock_release_sem: +out_unlock_put_key: queue_unlock(&q, hb); - out_release_sem: +out_put_key: put_futex_key(fshared, &q.key); +out: if (to) destroy_hrtimer_on_stack(&to->timer); return ret; - uaddr_faulted: +uaddr_faulted: /* * We have to r/w *(int __user *)uaddr, and we have to modify it * atomically. Therefore, if we continue to fault after get_user() @@ -1576,7 +1581,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, if (attempt++) { ret = futex_handle_fault((unsigned long)uaddr, attempt); if (ret) - goto out_release_sem; + goto out_put_key; goto retry_unlocked; } @@ -1668,9 +1673,9 @@ retry_unlocked: out_unlock: spin_unlock(&hb->lock); -out: put_futex_key(fshared, &key); +out: return ret; pi_faulted: -- cgit v1.1 From 1c5745aa380efb6417b5681104b007c8612fb496 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Dec 2008 23:05:28 +0100 Subject: sched_clock: prevent scd->clock from moving backwards, take #2 Redo: 5b7dba4: sched_clock: prevent scd->clock from moving backwards which had to be reverted due to s2ram hangs: ca7e716: Revert "sched_clock: prevent scd->clock from moving backwards" ... this time with resume restoring GTOD later in the sequence taken into account as well. The "timekeeping_suspended" flag is not very nice but we cannot call into GTOD before it has been properly resumed and the scheduler will run very early in the resume sequence. Cc: Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 5 ++++- kernel/time/timekeeping.c | 7 +++++-- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e8ab096..a0b0852 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -124,7 +124,7 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) clock = scd->tick_gtod + delta; min_clock = wrap_max(scd->tick_gtod, scd->clock); - max_clock = scd->tick_gtod + TICK_NSEC; + max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); @@ -227,6 +227,9 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); */ void sched_clock_idle_wakeup_event(u64 delta_ns) { + if (timekeeping_suspended) + return; + sched_clock_tick(); touch_softlockup_watchdog(); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fa05e88..900f1b6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -46,6 +46,9 @@ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static unsigned long total_sleep_time; /* seconds */ +/* flag for if timekeeping is suspended */ +int __read_mostly timekeeping_suspended; + static struct timespec xtime_cache __attribute__ ((aligned (16))); void update_xtime_cache(u64 nsec) { @@ -92,6 +95,8 @@ void getnstimeofday(struct timespec *ts) unsigned long seq; s64 nsecs; + WARN_ON(timekeeping_suspended); + do { seq = read_seqbegin(&xtime_lock); @@ -299,8 +304,6 @@ void __init timekeeping_init(void) write_sequnlock_irqrestore(&xtime_lock, flags); } -/* flag for if timekeeping is suspended */ -static int timekeeping_suspended; /* time in seconds when suspend began */ static unsigned long timekeeping_suspend_time; -- cgit v1.1 From 0a582440ff546e2c6610d1acec325e91b4efd313 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 2 Jan 2009 12:16:42 +0100 Subject: sched: fix sched_slice() Impact: fix bad-interactivity buglet Fix sched_slice() to emit a sane result whether a task is currently enqueued or not. Signed-off-by: Mike Galbraith Tested-by: Jayson King Signed-off-by: Ingo Molnar kernel/sched_fair.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) --- kernel/sched_fair.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5ad4440..b808563 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -386,20 +386,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, #endif /* - * delta *= P[w / rw] - */ -static inline unsigned long -calc_delta_weight(unsigned long delta, struct sched_entity *se) -{ - for_each_sched_entity(se) { - delta = calc_delta_mine(delta, - se->load.weight, &cfs_rq_of(se)->load); - } - - return delta; -} - -/* * delta /= w */ static inline unsigned long @@ -440,12 +426,20 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long nr_running = cfs_rq->nr_running; + u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); - if (unlikely(!se->on_rq)) - nr_running++; + for_each_sched_entity(se) { + struct load_weight *load = &cfs_rq->load; - return calc_delta_weight(__sched_period(nr_running), se); + if (unlikely(!se->on_rq)) { + struct load_weight lw = cfs_rq->load; + + update_load_add(&lw, se->load.weight); + load = &lw; + } + slice = calc_delta_mine(slice, se->load.weight, load); + } + return slice; } /* -- cgit v1.1 From 90621c40cc4ab7b0a414311ce37e7cc7173403b6 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Mon, 29 Dec 2008 19:43:21 -0800 Subject: futex: catch certain assymetric (get|put)_futex_key calls Impact: add debug check Following up on my previous key reference accounting patches, this patch will catch puts on keys that haven't been "got". This won't catch nested get/put mismatches though. Build and boot tested, with minimal desktop activity and a run of the open_posix_testsuite in LTP for testing. No warnings logged. Signed-off-by: Darren Hart Cc: Signed-off-by: Ingo Molnar --- kernel/futex.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index c5ac55c..206d4c90 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -170,8 +170,11 @@ static void get_futex_key_refs(union futex_key *key) */ static void drop_futex_key_refs(union futex_key *key) { - if (!key->both.ptr) + if (!key->both.ptr) { + /* If we're here then we tried to put a key we failed to get */ + WARN_ON_ONCE(1); return; + } switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: -- cgit v1.1 From 8916edef5888c5d8fe283714416a9ca95b4c3431 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Sun, 4 Jan 2009 05:40:37 +0900 Subject: getrusage: RUSAGE_THREAD should return ru_utime and ru_stime Impact: task stats regression fix Original getrusage(RUSAGE_THREAD) implementation can return ru_utime and ru_stime. But commit "f06febc: timers: fix itimer/many thread hang" broke it. this patch restores it. Signed-off-by: KOSAKI Motohiro Acked-by: Roland McGrath Signed-off-by: Ingo Molnar --- kernel/sys.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index d356d79..61dbfd4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1627,6 +1627,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) utime = stime = cputime_zero; if (who == RUSAGE_THREAD) { + utime = task_utime(current); + stime = task_stime(current); accumulate_thread_rusage(p, r); goto out; } -- cgit v1.1 From 14eaddc967b16017d4a1a24d2be6c28ecbe06ed8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 31 Dec 2008 15:15:42 +0000 Subject: CRED: Fix regression in cap_capable() as shown up by sys_faccessat() [ver #2] Fix a regression in cap_capable() due to: commit 5ff7711e635b32f0a1e558227d030c7e45b4a465 Author: David Howells Date: Wed Dec 31 02:52:28 2008 +0000 CRED: Differentiate objective and effective subjective credentials on a task The problem is that the above patch allows a process to have two sets of credentials, and for the most part uses the subjective credentials when accessing current's creds. There is, however, one exception: cap_capable(), and thus capable(), uses the real/objective credentials of the target task, whether or not it is the current task. Ordinarily this doesn't matter, since usually the two cred pointers in current point to the same set of creds. However, sys_faccessat() makes use of this facility to override the credentials of the calling process to make its test, without affecting the creds as seen from other processes. One of the things sys_faccessat() does is to make an adjustment to the effective capabilities mask, which cap_capable(), as it stands, then ignores. The affected capability check is in generic_permission(): if (!(mask & MAY_EXEC) || execute_ok(inode)) if (capable(CAP_DAC_OVERRIDE)) return 0; This change splits capable() from has_capability() down into the commoncap and SELinux code. The capable() security op now only deals with the current process, and uses the current process's subjective creds. A new security op - task_capable() - is introduced that can check any task's objective creds. strictly the capable() security op is superfluous with the presence of the task_capable() op, however it should be faster to call the capable() op since two fewer arguments need be passed down through the various layers. This can be tested by compiling the following program from the XFS testsuite: /* * t_access_root.c - trivial test program to show permission bug. * * Written by Michael Kerrisk - copyright ownership not pursued. * Sourced from: http://linux.derkeiler.com/Mailing-Lists/Kernel/2003-10/6030.html */ #include #include #include #include #include #include #define UID 500 #define GID 100 #define PERM 0 #define TESTPATH "/tmp/t_access" static void errExit(char *msg) { perror(msg); exit(EXIT_FAILURE); } /* errExit */ static void accessTest(char *file, int mask, char *mstr) { printf("access(%s, %s) returns %d\n", file, mstr, access(file, mask)); } /* accessTest */ int main(int argc, char *argv[]) { int fd, perm, uid, gid; char *testpath; char cmd[PATH_MAX + 20]; testpath = (argc > 1) ? argv[1] : TESTPATH; perm = (argc > 2) ? strtoul(argv[2], NULL, 8) : PERM; uid = (argc > 3) ? atoi(argv[3]) : UID; gid = (argc > 4) ? atoi(argv[4]) : GID; unlink(testpath); fd = open(testpath, O_RDWR | O_CREAT, 0); if (fd == -1) errExit("open"); if (fchown(fd, uid, gid) == -1) errExit("fchown"); if (fchmod(fd, perm) == -1) errExit("fchmod"); close(fd); snprintf(cmd, sizeof(cmd), "ls -l %s", testpath); system(cmd); if (seteuid(uid) == -1) errExit("seteuid"); accessTest(testpath, 0, "0"); accessTest(testpath, R_OK, "R_OK"); accessTest(testpath, W_OK, "W_OK"); accessTest(testpath, X_OK, "X_OK"); accessTest(testpath, R_OK | W_OK, "R_OK | W_OK"); accessTest(testpath, R_OK | X_OK, "R_OK | X_OK"); accessTest(testpath, W_OK | X_OK, "W_OK | X_OK"); accessTest(testpath, R_OK | W_OK | X_OK, "R_OK | W_OK | X_OK"); exit(EXIT_SUCCESS); } /* main */ This can be run against an Ext3 filesystem as well as against an XFS filesystem. If successful, it will show: [root@andromeda src]# ./t_access_root /tmp/xxx 0 4043 4043 ---------- 1 dhowells dhowells 0 2008-12-31 03:00 /tmp/xxx access(/tmp/xxx, 0) returns 0 access(/tmp/xxx, R_OK) returns 0 access(/tmp/xxx, W_OK) returns 0 access(/tmp/xxx, X_OK) returns -1 access(/tmp/xxx, R_OK | W_OK) returns 0 access(/tmp/xxx, R_OK | X_OK) returns -1 access(/tmp/xxx, W_OK | X_OK) returns -1 access(/tmp/xxx, R_OK | W_OK | X_OK) returns -1 If unsuccessful, it will show: [root@andromeda src]# ./t_access_root /tmp/xxx 0 4043 4043 ---------- 1 dhowells dhowells 0 2008-12-31 02:56 /tmp/xxx access(/tmp/xxx, 0) returns 0 access(/tmp/xxx, R_OK) returns -1 access(/tmp/xxx, W_OK) returns -1 access(/tmp/xxx, X_OK) returns -1 access(/tmp/xxx, R_OK | W_OK) returns -1 access(/tmp/xxx, R_OK | X_OK) returns -1 access(/tmp/xxx, W_OK | X_OK) returns -1 access(/tmp/xxx, R_OK | W_OK | X_OK) returns -1 I've also tested the fix with the SELinux and syscalls LTP testsuites. Signed-off-by: David Howells Signed-off-by: James Morris --- kernel/capability.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 36b4b4d..df62f53 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -308,7 +308,7 @@ int capable(int cap) BUG(); } - if (has_capability(current, cap)) { + if (security_capable(cap) == 0) { current->flags |= PF_SUPERPRIV; return 1; } -- cgit v1.1 From c12172c0251761c54260376eb29a5f6547495580 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 4 Jan 2009 20:30:06 -0800 Subject: rcu: fix rcutree grace-period-latency bug on small systems Impact: fix delays during bootup Kudos to Andi Kleen for finding a grace-period-latency problem! The problem was that the special-case code for small machines never updated the ->signaled field to indicate that grace-period initialization had completed, which prevented force_quiescent_state() from ever expediting grace periods. This problem resulted in grace periods extending for more than 20 seconds. Not subtle. I introduced this bug during my inspection process when I fixed a race between grace-period initialization and force_quiescent_state() execution. The following patch properly updates the ->signaled field for the "small"-system case (no more than 32 CPUs for 32-bit kernels and no more than 64 CPUs for 64-bit kernels). Reported-by: Andi Kleen Tested-by: Andi Kleen Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index a342b03..88d921c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -572,6 +572,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* Special-case the common single-level case. */ if (NUM_RCU_NODES == 1) { rnp->qsmask = rnp->qsmaskinit; + rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ spin_unlock_irqrestore(&rnp->lock, flags); return; } -- cgit v1.1 From 90a4d2c0106bb690f0b6af3d506febc35c658aa7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 4 Jan 2009 11:41:11 -0800 Subject: rcu: make treercu safe for suspend and resume Impact: fix kernel warnings [and potential crash] during suspend+resume Kudos to both Dhaval Giani and Jens Axboe for finding a bug in treercu that causes warnings after suspend-resume cycles in Dhaval's case and during stress tests in Jens's case. It would also probably cause failures if heavily stressed. The solution, ironically enough, is to revert to rcupreempt's code for initializing the dynticks state. And the patch even results in smaller code -- so what was I thinking??? This is 2.6.29 material, given that people really do suspend and resume Linux these days. ;-) Reported-by: Dhaval Giani Reported-by: Jens Axboe Tested-by: Dhaval Giani Tested-by: Jens Axboe Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 88d921c..f2d8638 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -79,7 +79,10 @@ struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); #ifdef CONFIG_NO_HZ -DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks); +DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { + .dynticks_nesting = 1, + .dynticks = 1, +}; #endif /* #ifdef CONFIG_NO_HZ */ static int blimit = 10; /* Maximum callbacks per softirq. */ @@ -1380,13 +1383,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) static void __cpuinit rcu_online_cpu(int cpu) { -#ifdef CONFIG_NO_HZ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - rdtp->dynticks_nesting = 1; - rdtp->dynticks |= 1; /* need consecutive #s even for hotplug. */ - rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1; -#endif /* #ifdef CONFIG_NO_HZ */ rcu_init_percpu_data(cpu, &rcu_state); rcu_init_percpu_data(cpu, &rcu_bh_state); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); -- cgit v1.1 From ea7d3fef4222cd98556a0b386598268d4dbf6670 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 4 Jan 2009 13:03:02 -0800 Subject: rcu: eliminate synchronize_rcu_xxx macro Impact: cleanup Expand macro into two files. The synchronize_rcu_xxx macro is quite ugly and it's only used by two callers, so expand it instead. This makes this code easier to change. Signed-off-by: Andi Kleen Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 11 +++++++++-- kernel/rcupreempt.c | 11 ++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index ad63af8..d92a76a 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -77,8 +77,15 @@ void wakeme_after_rcu(struct rcu_head *head) * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ -void synchronize_rcu(void); /* Makes kernel-doc tools happy */ -synchronize_rcu_xxx(synchronize_rcu, call_rcu) +void synchronize_rcu(void) +{ + struct rcu_synchronize rcu; + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} EXPORT_SYMBOL_GPL(synchronize_rcu); static void rcu_barrier_callback(struct rcu_head *notused) diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index f9dc8f3..33cfc50 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1177,7 +1177,16 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); * in -rt this does -not- necessarily result in all currently executing * interrupt -handlers- having completed. */ -synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched) +void __synchronize_sched(void) +{ + struct rcu_synchronize rcu; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_sched(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} EXPORT_SYMBOL_GPL(__synchronize_sched); /* -- cgit v1.1 From c59ab97e9ecdee9084d2da09e5a8ceea9a396508 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 4 Jan 2009 18:28:27 -0800 Subject: rcu: fix rcutorture bug Fix an rcutorture bug that prevents the shutdown notifier from ever actually having any effect, due to the fact that kthreads ignore all signals. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 3245b40..1cff28d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -136,7 +136,7 @@ static int stutter_pause_test = 0; #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -#define FULLSTOP_SIGNALED 1 /* Bail due to signal. */ +#define FULLSTOP_SHUTDOWN 1 /* Bail due to system shutdown/panic. */ #define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */ static int fullstop; /* stop generating callbacks at test end. */ DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */ @@ -151,12 +151,10 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, { if (fullstop) return NOTIFY_DONE; - if (signal_pending(current)) { - mutex_lock(&fullstop_mutex); - if (!ACCESS_ONCE(fullstop)) - fullstop = FULLSTOP_SIGNALED; - mutex_unlock(&fullstop_mutex); - } + mutex_lock(&fullstop_mutex); + if (!fullstop) + fullstop = FULLSTOP_SHUTDOWN; + mutex_unlock(&fullstop_mutex); return NOTIFY_DONE; } @@ -624,7 +622,7 @@ rcu_torture_writer(void *arg) rcu_stutter_wait(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) + while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) schedule_timeout_uninterruptible(1); return 0; } @@ -649,7 +647,7 @@ rcu_torture_fakewriter(void *arg) } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) + while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) schedule_timeout_uninterruptible(1); return 0; } @@ -759,7 +757,7 @@ rcu_torture_reader(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); if (irqreader && cur_ops->irqcapable) del_timer_sync(&t); - while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) + while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN) schedule_timeout_uninterruptible(1); return 0; } -- cgit v1.1 From 8bdec955b0da2ffbd10eb9b200651dd1f9e366f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 5 Jan 2009 11:28:19 +0100 Subject: hrtimer: splitout peek ahead functionality Impact: cleanup Provide a peek ahead function that assumes irqs disabled, allows for micro optimizations. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index eb2bfef..8f7001c 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1243,6 +1243,22 @@ void hrtimer_interrupt(struct clock_event_device *dev) } } +/* + * local version of hrtimer_peek_ahead_timers() called with interrupts + * disabled. + */ +static void __hrtimer_peek_ahead_timers(void) +{ + struct tick_device *td; + + if (!hrtimer_hres_active()) + return; + + td = &__get_cpu_var(tick_cpu_device); + if (td && td->evtdev) + hrtimer_interrupt(td->evtdev); +} + /** * hrtimer_peek_ahead_timers -- run soft-expired timers now * @@ -1254,16 +1270,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) */ void hrtimer_peek_ahead_timers(void) { - struct tick_device *td; unsigned long flags; - if (!hrtimer_hres_active()) - return; - local_irq_save(flags); - td = &__get_cpu_var(tick_cpu_device); - if (td && td->evtdev) - hrtimer_interrupt(td->evtdev); + __hrtimer_peek_ahead_timers(); local_irq_restore(flags); } -- cgit v1.1 From d5fd43c4ae04523e1dcd7794f9c511b289851350 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 5 Jan 2009 11:28:20 +0100 Subject: hrtimer: fix HOTPLUG_CPU=n compile warning Impact: cleanup kernel/hrtimer.c: In function 'hrtimer_cpu_notify': kernel/hrtimer.c:1574: warning: unused variable 'dcpu' Introduced by commit 37810659ea7d9572c5ac284ade272f806ef8f788 ("hrtimer: removing all ur callback modes, fix hotplug") from the timers. dcpu is only used if CONFIG_HOTPLUG_CPU is set. Reported-by: Stephen Rothwell Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 8f7001c..9c2bfa8 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1504,6 +1504,11 @@ static void __cpuinit init_hrtimers_cpu(int cpu) #ifdef CONFIG_HOTPLUG_CPU +static void tickle_timers(void *arg) +{ + hrtimer_peek_ahead_timers(); +} + static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { @@ -1539,7 +1544,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, } } -static int migrate_hrtimers(int scpu) +static void migrate_hrtimers(int scpu) { struct hrtimer_cpu_base *old_base, *new_base; int dcpu, i; @@ -1567,12 +1572,7 @@ static int migrate_hrtimers(int scpu) spin_unlock_irq(&new_base->lock); put_cpu_var(hrtimer_bases); - return dcpu; -} - -static void tickle_timers(void *arg) -{ - hrtimer_peek_ahead_timers(); + smp_call_function_single(dcpu, tickle_timers, NULL, 0); } #endif /* CONFIG_HOTPLUG_CPU */ @@ -1593,11 +1593,8 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, case CPU_DEAD: case CPU_DEAD_FROZEN: { - int dcpu; - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); - dcpu = migrate_hrtimers(scpu); - smp_call_function_single(dcpu, tickle_timers, NULL, 0); + migrate_hrtimers(scpu); break; } #endif -- cgit v1.1 From 731a55ba0f17064f85903b7bf8e24849ec6cfa20 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 5 Jan 2009 11:28:21 +0100 Subject: hrtimer: simplify hotplug migration Impact: cleanup No need for a smp function call, which is likely to run on the same CPU anyway. We can just call hrtimers_peek_ahead() in the interrupts disabled section of migrate_hrtimers(). Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 9c2bfa8..8010a67 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1504,11 +1504,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu) #ifdef CONFIG_HOTPLUG_CPU -static void tickle_timers(void *arg) -{ - hrtimer_peek_ahead_timers(); -} - static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { @@ -1547,20 +1542,19 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, static void migrate_hrtimers(int scpu) { struct hrtimer_cpu_base *old_base, *new_base; - int dcpu, i; + int i; BUG_ON(cpu_online(scpu)); - old_base = &per_cpu(hrtimer_bases, scpu); - new_base = &get_cpu_var(hrtimer_bases); - - dcpu = smp_processor_id(); - tick_cancel_sched_timer(scpu); + + local_irq_disable(); + old_base = &per_cpu(hrtimer_bases, scpu); + new_base = &__get_cpu_var(hrtimer_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - spin_lock_irq(&new_base->lock); + spin_lock(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { @@ -1569,10 +1563,11 @@ static void migrate_hrtimers(int scpu) } spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); - put_cpu_var(hrtimer_bases); + spin_unlock(&new_base->lock); - smp_call_function_single(dcpu, tickle_timers, NULL, 0); + /* Check, if we got expired work to do */ + __hrtimer_peek_ahead_timers(); + local_irq_enable(); } #endif /* CONFIG_HOTPLUG_CPU */ -- cgit v1.1 From a6037b61c2f5fc99c57c15b26d7cfa58bbb34008 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 Jan 2009 11:28:22 +0100 Subject: hrtimer: fix recursion deadlock by re-introducing the softirq Impact: fix rare runtime deadlock There are a few sites that do: spin_lock_irq(&foo) hrtimer_start(&bar) __run_hrtimer(&bar) func() spin_lock(&foo) which obviously deadlocks. In order to avoid this, never call __run_hrtimer() from hrtimer_start*() context, but instead defer this to softirq context. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 60 +++++++++++++++++++++++++------------------------------- 1 file changed, 27 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 8010a67..b68e98f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -634,7 +634,6 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } -static void __run_hrtimer(struct hrtimer *timer); /* * When High resolution timers are active, try to reprogram. Note, that in case @@ -646,13 +645,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { - /* - * XXX: recursion check? - * hrtimer_forward() should round up with timer granularity - * so that we never get into inf recursion here, - * it doesn't do that though - */ - __run_hrtimer(timer); + spin_unlock(&base->cpu_base->lock); + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + spin_lock(&base->cpu_base->lock); return 1; } return 0; @@ -705,11 +700,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } -static inline int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return 0; -} #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -780,9 +770,11 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. + * + * Returns 1 when the new timer is the leftmost timer in the tree. */ -static void enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, int reprogram) +static int enqueue_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base) { struct rb_node **link = &base->active.rb_node; struct rb_node *parent = NULL; @@ -814,20 +806,8 @@ static void enqueue_hrtimer(struct hrtimer *timer, * Insert the timer to the rbtree and check whether it * replaces the first pending timer */ - if (leftmost) { - /* - * Reprogram the clock event device. When the timer is already - * expired hrtimer_enqueue_reprogram has either called the - * callback or added it to the pending list and raised the - * softirq. - * - * This is a NOP for !HIGHRES - */ - if (reprogram && hrtimer_enqueue_reprogram(timer, base)) - return; - + if (leftmost) base->first = &timer->node; - } rb_link_node(&timer->node, parent, link); rb_insert_color(&timer->node, &base->active); @@ -836,6 +816,8 @@ static void enqueue_hrtimer(struct hrtimer *timer, * state of a possibly running callback. */ timer->state |= HRTIMER_STATE_ENQUEUED; + + return leftmost; } /* @@ -912,7 +894,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n { struct hrtimer_clock_base *base, *new_base; unsigned long flags; - int ret; + int ret, leftmost; base = lock_hrtimer_base(timer, &flags); @@ -940,12 +922,16 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n timer_stats_hrtimer_set_start_info(timer); + leftmost = enqueue_hrtimer(timer, new_base); + /* * Only allow reprogramming if the new base is on this CPU. * (it might still be on another CPU if the timer was pending) + * + * XXX send_remote_softirq() ? */ - enqueue_hrtimer(timer, new_base, - new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); + if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) + hrtimer_enqueue_reprogram(timer, new_base); unlock_hrtimer_base(timer, &flags); @@ -1163,7 +1149,7 @@ static void __run_hrtimer(struct hrtimer *timer) */ if (restart != HRTIMER_NORESTART) { BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); - enqueue_hrtimer(timer, base, 0); + enqueue_hrtimer(timer, base); } timer->state &= ~HRTIMER_STATE_CALLBACK; } @@ -1277,6 +1263,11 @@ void hrtimer_peek_ahead_timers(void) local_irq_restore(flags); } +static void run_hrtimer_softirq(struct softirq_action *h) +{ + hrtimer_peek_ahead_timers(); +} + #endif /* CONFIG_HIGH_RES_TIMERS */ /* @@ -1532,7 +1523,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * is done, which will run all expired timers and re-programm * the timer device. */ - enqueue_hrtimer(timer, new_base, 0); + enqueue_hrtimer(timer, new_base); /* Clear the migration state bit */ timer->state &= ~HRTIMER_STATE_MIGRATE; @@ -1610,6 +1601,9 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); +#ifdef CONFIG_HIGH_RES_TIMERS + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); +#endif } /** -- cgit v1.1 From e3f1d883740b09e5116d4d4e30a6a6987264a83c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 5 Jan 2009 11:28:23 +0100 Subject: hrtimer: fixup comments Clean up the comments Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b68e98f..aa024f2 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1143,9 +1143,9 @@ static void __run_hrtimer(struct hrtimer *timer) spin_lock(&cpu_base->lock); /* - * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid - * reprogramming of the event hardware. This happens at the end of this - * function anyway. + * Note: We clear the CALLBACK bit after enqueue_hrtimer and + * we do not reprogramm the event hardware. Happens either in + * hrtimer_start_range_ns() or in hrtimer_interrupt() */ if (restart != HRTIMER_NORESTART) { BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); @@ -1514,14 +1514,12 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); timer->base = new_base; /* - * Enqueue the timers on the new cpu, but do not reprogram - * the timer as that would enable a deadlock between - * hrtimer_enqueue_reprogramm() running the timer and us still - * holding a nested base lock. - * - * Instead we tickle the hrtimer interrupt after the migration - * is done, which will run all expired timers and re-programm - * the timer device. + * Enqueue the timers on the new cpu. This does not + * reprogram the event device in case the timer + * expires before the earliest on this CPU, but we run + * hrtimer_interrupt after we migrated everything to + * sort out already expired timers and reprogram the + * event device. */ enqueue_hrtimer(timer, new_base); -- cgit v1.1 From 39aac64812da70f0af262f4700e67637338cbb3b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 5 Jan 2009 19:18:02 +0800 Subject: sched: mark sched_create_sysfs_power_savings_entries() as __init Impact: cleanup The only caller is cpu_dev_init() which is marked as __init. Signed-off-by: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 545c6fc..9a8e296 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8060,7 +8060,7 @@ static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_store); #endif -int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) { int err = 0; -- cgit v1.1 From c70f22d203fc02c805b6ed4a3483b740dc36786b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 5 Jan 2009 19:07:50 +0800 Subject: sched: clean up arch_reinit_sched_domains() - Make arch_reinit_sched_domains() static. It was exported to be used in s390, but now rebuild_sched_domains() is used instead. - Make it return void. Signed-off-by: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9a8e296..c5019a5d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7987,7 +7987,7 @@ match2: } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -int arch_reinit_sched_domains(void) +static void arch_reinit_sched_domains(void) { get_online_cpus(); @@ -7996,13 +7996,10 @@ int arch_reinit_sched_domains(void) rebuild_sched_domains(); put_online_cpus(); - - return 0; } static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) { - int ret; unsigned int level = 0; if (sscanf(buf, "%u", &level) != 1) @@ -8023,9 +8020,9 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) else sched_mc_power_savings = level; - ret = arch_reinit_sched_domains(); + arch_reinit_sched_domains(); - return ret ? ret : count; + return count; } #ifdef CONFIG_SCHED_MC -- cgit v1.1 From 82c5b7b527ccc4b5d3cf832437e842f9d2920a79 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 5 Jan 2009 14:11:10 +0100 Subject: hrtimer: splitout peek ahead functionality, fix Impact: build fix on !CONFIG_HIGH_RES_TIMERS Fix: kernel/hrtimer.c:1586: error: implicit declaration of function '__hrtimer_peek_ahead_timers' Signen-off-by: Ingo Molnar --- kernel/hrtimer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index aa024f2..1455b76 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1268,7 +1268,11 @@ static void run_hrtimer_softirq(struct softirq_action *h) hrtimer_peek_ahead_timers(); } -#endif /* CONFIG_HIGH_RES_TIMERS */ +#else /* CONFIG_HIGH_RES_TIMERS */ + +static inline void __hrtimer_peek_ahead_timers(void) { } + +#endif /* !CONFIG_HIGH_RES_TIMERS */ /* * Called from timer softirq every jiffy, expire hrtimers: -- cgit v1.1 From 0c910d289567163dbe40ccc174b36afd1c7723bd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 6 Jan 2009 17:39:06 +0800 Subject: sched: fix double kfree in failure path It's not the responsibility of init_rootdomain() to free root_domain allocated by alloc_rootdomain(). Signed-off-by: Li Zefan Reviewed-by: Pekka Enberg Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c5019a5d..973f973 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6970,7 +6970,7 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem) } if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) - goto free_rd; + goto out; if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) @@ -6986,8 +6986,7 @@ free_online: free_cpumask_var(rd->online); free_span: free_cpumask_var(rd->span); -free_rd: - kfree(rd); +out: return -ENOMEM; } -- cgit v1.1 From db2f59c8c9b315f2b88b1dac159b988c6009034d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 6 Jan 2009 17:40:36 +0800 Subject: sched: fix section mismatch init_rootdomain() calls alloc_bootmem_cpumask_var() at system boot, so does cpupri_init(). Signed-off-by: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_cpupri.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 973f973..2e3545f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6957,7 +6957,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) spin_unlock_irqrestore(&rq->lock, flags); } -static int init_rootdomain(struct root_domain *rd, bool bootmem) +static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) { memset(rd, 0, sizeof(*rd)); diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 018b7be..1e00bfa 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -151,7 +151,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * * Returns: -ENOMEM if memory fails. */ -int cpupri_init(struct cpupri *cp, bool bootmem) +int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) { int i; -- cgit v1.1 From cd3772e6898c6386f21d2958346d6dd57d4204f5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 16 Nov 2008 18:22:09 +0800 Subject: kernel/ksysfs.c:fix dependence on CONFIG_NET Access to uevent_seqnum and uevent_helper does not need to depend on CONFIG_NET, so remove it. Signed-off-by: Ming Lei Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/ksysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 08dd8ed..528dd78 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -24,7 +24,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) static struct kobj_attribute _name##_attr = \ __ATTR(_name, 0644, _name##_show, _name##_store) -#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) +#if defined(CONFIG_HOTPLUG) /* current uevent sequence number */ static ssize_t uevent_seqnum_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -137,7 +137,7 @@ struct kobject *kernel_kobj; EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { -#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) +#if defined(CONFIG_HOTPLUG) &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, #endif -- cgit v1.1 From 81ff86a11f54c9e266c6a6bc3ecd2c9a0f1e11cc Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Tue, 6 Jan 2009 10:44:39 -0800 Subject: pm: struct device - replace bus_id with dev_name(), dev_set_name() Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/power/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 613f169..2399888 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -615,7 +615,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) /* this may fail if the RTC hasn't been initialized */ status = rtc_read_time(rtc, &alm.time); if (status < 0) { - printk(err_readtime, rtc->dev.bus_id, status); + printk(err_readtime, dev_name(&rtc->dev), status); return; } rtc_tm_to_time(&alm.time, &now); @@ -626,7 +626,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) status = rtc_set_alarm(rtc, &alm); if (status < 0) { - printk(err_wakealarm, rtc->dev.bus_id, status); + printk(err_wakealarm, dev_name(&rtc->dev), status); return; } @@ -660,7 +660,7 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr) if (!device_may_wakeup(candidate->dev.parent)) return 0; - *(char **)name_ptr = dev->bus_id; + *(const char **)name_ptr = dev_name(dev); return 1; } -- cgit v1.1 From 29881c4502ba05f46bc12ae8053d4e08d7e2615c Mon Sep 17 00:00:00 2001 From: James Morris Date: Wed, 7 Jan 2009 09:21:54 +1100 Subject: Revert "CRED: Fix regression in cap_capable() as shown up by sys_faccessat() [ver #2]" This reverts commit 14eaddc967b16017d4a1a24d2be6c28ecbe06ed8. David has a better version to come. --- kernel/capability.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index df62f53..36b4b4d 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -308,7 +308,7 @@ int capable(int cap) BUG(); } - if (security_capable(cap) == 0) { + if (has_capability(current, cap)) { current->flags |= PF_SUPERPRIV; return 1; } -- cgit v1.1 From 3699c53c485bf0168e6500d0ed18bf931584dd7c Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 6 Jan 2009 22:27:01 +0000 Subject: CRED: Fix regression in cap_capable() as shown up by sys_faccessat() [ver #3] Fix a regression in cap_capable() due to: commit 3b11a1decef07c19443d24ae926982bc8ec9f4c0 Author: David Howells Date: Fri Nov 14 10:39:26 2008 +1100 CRED: Differentiate objective and effective subjective credentials on a task The problem is that the above patch allows a process to have two sets of credentials, and for the most part uses the subjective credentials when accessing current's creds. There is, however, one exception: cap_capable(), and thus capable(), uses the real/objective credentials of the target task, whether or not it is the current task. Ordinarily this doesn't matter, since usually the two cred pointers in current point to the same set of creds. However, sys_faccessat() makes use of this facility to override the credentials of the calling process to make its test, without affecting the creds as seen from other processes. One of the things sys_faccessat() does is to make an adjustment to the effective capabilities mask, which cap_capable(), as it stands, then ignores. The affected capability check is in generic_permission(): if (!(mask & MAY_EXEC) || execute_ok(inode)) if (capable(CAP_DAC_OVERRIDE)) return 0; This change passes the set of credentials to be tested down into the commoncap and SELinux code. The security functions called by capable() and has_capability() select the appropriate set of credentials from the process being checked. This can be tested by compiling the following program from the XFS testsuite: /* * t_access_root.c - trivial test program to show permission bug. * * Written by Michael Kerrisk - copyright ownership not pursued. * Sourced from: http://linux.derkeiler.com/Mailing-Lists/Kernel/2003-10/6030.html */ #include #include #include #include #include #include #define UID 500 #define GID 100 #define PERM 0 #define TESTPATH "/tmp/t_access" static void errExit(char *msg) { perror(msg); exit(EXIT_FAILURE); } /* errExit */ static void accessTest(char *file, int mask, char *mstr) { printf("access(%s, %s) returns %d\n", file, mstr, access(file, mask)); } /* accessTest */ int main(int argc, char *argv[]) { int fd, perm, uid, gid; char *testpath; char cmd[PATH_MAX + 20]; testpath = (argc > 1) ? argv[1] : TESTPATH; perm = (argc > 2) ? strtoul(argv[2], NULL, 8) : PERM; uid = (argc > 3) ? atoi(argv[3]) : UID; gid = (argc > 4) ? atoi(argv[4]) : GID; unlink(testpath); fd = open(testpath, O_RDWR | O_CREAT, 0); if (fd == -1) errExit("open"); if (fchown(fd, uid, gid) == -1) errExit("fchown"); if (fchmod(fd, perm) == -1) errExit("fchmod"); close(fd); snprintf(cmd, sizeof(cmd), "ls -l %s", testpath); system(cmd); if (seteuid(uid) == -1) errExit("seteuid"); accessTest(testpath, 0, "0"); accessTest(testpath, R_OK, "R_OK"); accessTest(testpath, W_OK, "W_OK"); accessTest(testpath, X_OK, "X_OK"); accessTest(testpath, R_OK | W_OK, "R_OK | W_OK"); accessTest(testpath, R_OK | X_OK, "R_OK | X_OK"); accessTest(testpath, W_OK | X_OK, "W_OK | X_OK"); accessTest(testpath, R_OK | W_OK | X_OK, "R_OK | W_OK | X_OK"); exit(EXIT_SUCCESS); } /* main */ This can be run against an Ext3 filesystem as well as against an XFS filesystem. If successful, it will show: [root@andromeda src]# ./t_access_root /tmp/xxx 0 4043 4043 ---------- 1 dhowells dhowells 0 2008-12-31 03:00 /tmp/xxx access(/tmp/xxx, 0) returns 0 access(/tmp/xxx, R_OK) returns 0 access(/tmp/xxx, W_OK) returns 0 access(/tmp/xxx, X_OK) returns -1 access(/tmp/xxx, R_OK | W_OK) returns 0 access(/tmp/xxx, R_OK | X_OK) returns -1 access(/tmp/xxx, W_OK | X_OK) returns -1 access(/tmp/xxx, R_OK | W_OK | X_OK) returns -1 If unsuccessful, it will show: [root@andromeda src]# ./t_access_root /tmp/xxx 0 4043 4043 ---------- 1 dhowells dhowells 0 2008-12-31 02:56 /tmp/xxx access(/tmp/xxx, 0) returns 0 access(/tmp/xxx, R_OK) returns -1 access(/tmp/xxx, W_OK) returns -1 access(/tmp/xxx, X_OK) returns -1 access(/tmp/xxx, R_OK | W_OK) returns -1 access(/tmp/xxx, R_OK | X_OK) returns -1 access(/tmp/xxx, W_OK | X_OK) returns -1 access(/tmp/xxx, R_OK | W_OK | X_OK) returns -1 I've also tested the fix with the SELinux and syscalls LTP testsuites. Signed-off-by: David Howells Tested-by: J. Bruce Fields Acked-by: Serge Hallyn Signed-off-by: James Morris --- kernel/capability.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 36b4b4d..df62f53 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -308,7 +308,7 @@ int capable(int cap) BUG(); } - if (has_capability(current, cap)) { + if (security_capable(cap) == 0) { current->flags |= PF_SUPERPRIV; return 1; } -- cgit v1.1 From 75aa199410359dc5fbcf9025ff7af98a9d20f0d5 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 6 Jan 2009 14:39:01 -0800 Subject: oom: print triggering task's cpuset and mems allowed When cpusets are enabled, it's necessary to print the triggering task's set of allowable nodes so the subsequently printed meminfo can be interpreted correctly. We also print the task's cpuset name for informational purposes. [rientjes@google.com: task lock current before dereferencing cpuset] Cc: Paul Menage Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 39c1a4c..345ace5 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -240,6 +240,17 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(callback_mutex); /* + * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist + * buffers. They are statically allocated to prevent using excess stack + * when calling cpuset_print_task_mems_allowed(). + */ +#define CPUSET_NAME_LEN (128) +#define CPUSET_NODELIST_LEN (256) +static char cpuset_name[CPUSET_NAME_LEN]; +static char cpuset_nodelist[CPUSET_NODELIST_LEN]; +static DEFINE_SPINLOCK(cpuset_buffer_lock); + +/* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead @@ -2356,6 +2367,29 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); } +/** + * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed + * @task: pointer to task_struct of some task. + * + * Description: Prints @task's name, cpuset name, and cached copy of its + * mems_allowed to the kernel log. Must hold task_lock(task) to allow + * dereferencing task_cs(task). + */ +void cpuset_print_task_mems_allowed(struct task_struct *tsk) +{ + struct dentry *dentry; + + dentry = task_cs(tsk)->css.cgroup->dentry; + spin_lock(&cpuset_buffer_lock); + snprintf(cpuset_name, CPUSET_NAME_LEN, + dentry ? (const char *)dentry->d_name.name : "/"); + nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, + tsk->mems_allowed); + printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", + tsk->comm, cpuset_name, cpuset_nodelist); + spin_unlock(&cpuset_buffer_lock); +} + /* * Collection of memory_pressure is suppressed unless * this flag is enabled by writing "1" to the special -- cgit v1.1 From e5991371ee0d1c0ce19e133c6f9075b49c5b4ae8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:22 -0800 Subject: mm: remove cgroup_mm_owner_callbacks cgroup_mm_owner_callbacks() was brought in to support the memrlimit controller, but sneaked into mainline ahead of it. That controller has now been shelved, and the mm_owner_changed() args were inadequate for it anyway (they needed an mm pointer instead of a task pointer). Remove the dead code, and restore mm_update_next_owner() locking to how it was before: taking mmap_sem there does nothing for memcontrol.c, now the only user of mm->owner. Signed-off-by: Hugh Dickins Cc: Paul Menage Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 33 --------------------------------- kernel/exit.c | 16 ++++++---------- 2 files changed, 6 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 87bb025..f221446 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -116,7 +116,6 @@ static int root_count; * be called. */ static int need_forkexit_callback __read_mostly; -static int need_mm_owner_callback __read_mostly; /* convenient tests for these bits */ inline int cgroup_is_removed(const struct cgroup *cgrp) @@ -2539,7 +2538,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; need_forkexit_callback |= ss->fork || ss->exit; - need_mm_owner_callback |= !!ss->mm_owner_changed; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't @@ -2789,37 +2787,6 @@ void cgroup_fork_callbacks(struct task_struct *child) } } -#ifdef CONFIG_MM_OWNER -/** - * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes - * @p: the new owner - * - * Called on every change to mm->owner. mm_init_owner() does not - * invoke this routine, since it assigns the mm->owner the first time - * and does not change it. - * - * The callbacks are invoked with mmap_sem held in read mode. - */ -void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) -{ - struct cgroup *oldcgrp, *newcgrp = NULL; - - if (need_mm_owner_callback) { - int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - oldcgrp = task_cgroup(old, ss->subsys_id); - if (new) - newcgrp = task_cgroup(new, ss->subsys_id); - if (oldcgrp == newcgrp) - continue; - if (ss->mm_owner_changed) - ss->mm_owner_changed(ss, oldcgrp, newcgrp, new); - } - } -} -#endif /* CONFIG_MM_OWNER */ - /** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question diff --git a/kernel/exit.c b/kernel/exit.c index c9e5a1c..f923724 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -642,35 +642,31 @@ retry: /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or - * ptrace or page migration (get_task_mm()). Mark owner as NULL, - * so that subsystems can understand the callback and take action. + * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ - down_write(&mm->mmap_sem); - cgroup_mm_owner_callbacks(mm->owner, NULL); mm->owner = NULL; - up_write(&mm->mmap_sem); return; assign_new_owner: BUG_ON(c == p); get_task_struct(c); - read_unlock(&tasklist_lock); - down_write(&mm->mmap_sem); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); + /* + * Delay read_unlock() till we have the task_lock() + * to ensure that c does not slip away underneath us + */ + read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); - up_write(&mm->mmap_sem); put_task_struct(c); goto retry; } - cgroup_mm_owner_callbacks(mm->owner, c); mm->owner = c; task_unlock(c); - up_write(&mm->mmap_sem); put_task_struct(c); } #endif /* CONFIG_MM_OWNER */ -- cgit v1.1 From 2da02997e08d3efe8174c7a47696e6f7cbe69ba9 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 6 Jan 2009 14:39:31 -0800 Subject: mm: add dirty_background_bytes and dirty_bytes sysctls This change introduces two new sysctls to /proc/sys/vm: dirty_background_bytes and dirty_bytes. dirty_background_bytes is the counterpart to dirty_background_ratio and dirty_bytes is the counterpart to dirty_ratio. With growing memory capacities of individual machines, it's no longer sufficient to specify dirty thresholds as a percentage of the amount of dirtyable memory over the entire system. dirty_background_bytes and dirty_bytes specify quantities of memory, in bytes, that represent the dirty limits for the entire system. If either of these values is set, its value represents the amount of dirty memory that is needed to commence either background or direct writeback. When a `bytes' or `ratio' file is written, its counterpart becomes a function of the written value. For example, if dirty_bytes is written to be 8096, 8K of memory is required to commence direct writeback. dirty_ratio is then functionally equivalent to 8K / the amount of dirtyable memory: dirtyable_memory = free pages + mapped pages + file cache dirty_background_bytes = dirty_background_ratio * dirtyable_memory -or- dirty_background_ratio = dirty_background_bytes / dirtyable_memory AND dirty_bytes = dirty_ratio * dirtyable_memory -or- dirty_ratio = dirty_bytes / dirtyable_memory Only one of dirty_background_bytes and dirty_background_ratio may be specified at a time, and only one of dirty_bytes and dirty_ratio may be specified. When one sysctl is written, the other appears as 0 when read. The `bytes' files operate on a page size granularity since dirty limits are compared with ZVC values, which are in page units. Prior to this change, the minimum dirty_ratio was 5 as implemented by get_dirty_limits() although /proc/sys/vm/dirty_ratio would show any user written value between 0 and 100. This restriction is maintained, but dirty_bytes has a lower limit of only one page. Also prior to this change, the dirty_background_ratio could not equal or exceed dirty_ratio. This restriction is maintained in addition to restricting dirty_background_bytes. If either background threshold equals or exceeds that of the dirty threshold, it is implicitly set to half the dirty threshold. Acked-by: Peter Zijlstra Cc: Dave Chinner Cc: Christoph Lameter Signed-off-by: David Rientjes Cc: Andrea Righi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ff6d45c..92f6e5b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -87,10 +87,6 @@ extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ /* Constants used for minimum and maximum */ -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP) -static int one = 1; -#endif - #ifdef CONFIG_DETECT_SOFTLOCKUP static int sixty = 60; static int neg_one = -1; @@ -101,6 +97,7 @@ static int two = 2; #endif static int zero; +static int one = 1; static int one_hundred = 100; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ @@ -952,12 +949,22 @@ static struct ctl_table vm_table[] = { .data = &dirty_background_ratio, .maxlen = sizeof(dirty_background_ratio), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &dirty_background_ratio_handler, .strategy = &sysctl_intvec, .extra1 = &zero, .extra2 = &one_hundred, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = &dirty_background_bytes_handler, + .strategy = &sysctl_intvec, + .extra1 = &one, + }, + { .ctl_name = VM_DIRTY_RATIO, .procname = "dirty_ratio", .data = &vm_dirty_ratio, @@ -969,6 +976,16 @@ static struct ctl_table vm_table[] = { .extra2 = &one_hundred, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = &dirty_bytes_handler, + .strategy = &sysctl_intvec, + .extra1 = &one, + }, + { .procname = "dirty_writeback_centisecs", .data = &dirty_writeback_interval, .maxlen = sizeof(dirty_writeback_interval), -- cgit v1.1 From 901608d9045146aec6f14a7777ea4b1501c379f0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 6 Jan 2009 14:40:29 -0800 Subject: mm: introduce get_mm_hiwater_xxx(), fix taskstats->hiwater_xxx accounting xacct_add_tsk() relies on do_exit()->update_hiwater_xxx() and uses mm->hiwater_xxx directly, this leads to 2 problems: - taskstats_user_cmd() can call fill_pid()->xacct_add_tsk() at any moment before the task exits, so we should check the current values of rss/vm anyway. - do_exit()->update_hiwater_xxx() calls are racy. An exiting thread can be preempted right before mm->hiwater_xxx = new_val, and another thread can use A_LOT of memory and exit in between. When the first thread resumes it can be the last thread in the thread group, in that case we report the wrong hiwater_xxx values which do not take A_LOT into account. Introduce get_mm_hiwater_rss() and get_mm_hiwater_vm() helpers and change xacct_add_tsk() to use them. The first helper will also be used by rusage->ru_maxrss accounting. Kill do_exit()->update_hiwater_xxx() calls. Unless we are going to decrease rss/vm there is no point to update mm->hiwater_xxx, and nobody can look at this mm_struct when exit_mmap() actually unmaps the memory. Signed-off-by: Oleg Nesterov Acked-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 5 +---- kernel/tsacct.c | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f923724..c7740fa 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1051,10 +1051,7 @@ NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - if (tsk->mm) { - update_hiwater_rss(tsk->mm); - update_hiwater_vm(tsk->mm); - } + group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 2dc06ab..43f891b 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -92,8 +92,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ - stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; - stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; + stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB; + stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; mmput(mm); } stats->read_char = p->ioac.rchar; -- cgit v1.1 From f1883f86dea84fe47a71a39fc1afccc005915ed8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 6 Jan 2009 14:40:45 -0800 Subject: Remove remaining unwinder code Signed-off-by: Alexey Dobriyan Cc: Gabor Gombas Cc: Jan Beulich Cc: Andi Kleen Cc: Ingo Molnar , Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f47cce9..34b56cf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -1449,8 +1448,6 @@ static void free_module(struct module *mod) remove_sect_attrs(mod); mod_kobject_remove(mod); - unwind_remove_table(mod->unwind_info, 0); - /* Arch-specific cleanup. */ module_arch_cleanup(mod); @@ -1867,7 +1864,6 @@ static noinline struct module *load_module(void __user *umod, unsigned int symindex = 0; unsigned int strindex = 0; unsigned int modindex, versindex, infoindex, pcpuindex; - unsigned int unwindex = 0; unsigned int num_kp, num_mcount; struct kernel_param *kp; struct module *mod; @@ -1957,9 +1953,6 @@ static noinline struct module *load_module(void __user *umod, versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); -#ifdef ARCH_UNWIND_SECTION_NAME - unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); -#endif /* Don't keep modinfo and version sections. */ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; @@ -1969,8 +1962,6 @@ static noinline struct module *load_module(void __user *umod, sechdrs[symindex].sh_flags |= SHF_ALLOC; sechdrs[strindex].sh_flags |= SHF_ALLOC; #endif - if (unwindex) - sechdrs[unwindex].sh_flags |= SHF_ALLOC; /* Check module struct version now, before we try to use module. */ if (!check_modstruct_version(sechdrs, versindex, mod)) { @@ -2267,11 +2258,6 @@ static noinline struct module *load_module(void __user *umod, add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - /* Size of section 0 is 0, so this works well if no unwind info. */ - mod->unwind_info = unwind_add_table(mod, - (void *)sechdrs[unwindex].sh_addr, - sechdrs[unwindex].sh_size); - /* Get rid of temporary copy */ vfree(hdr); @@ -2370,7 +2356,6 @@ sys_init_module(void __user *umod, mutex_lock(&module_mutex); /* Drop initial reference. */ module_put(mod); - unwind_remove_table(mod->unwind_info, 1); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; -- cgit v1.1 From 60348802e9cb137ee86590c3e4c57c1ec2e8fc69 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Tue, 6 Jan 2009 14:40:46 -0800 Subject: fork.c: cleanup for copy_sighand() Check CLONE_SIGHAND only is enough, because combination of CLONE_THREAD and CLONE_SIGHAND is already done in copy_process(). Impact: cleanup, no functionality changed Signed-off-by: Zhao Lei Cc: Roland McGrath Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 43cbf30..23b9121 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -758,7 +758,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; - if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { + if (clone_flags & CLONE_SIGHAND) { atomic_inc(¤t->sighand->count); return 0; } -- cgit v1.1 From d6624f996ae539344e8d748cce1117ae7af06fbf Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 6 Jan 2009 14:40:54 -0800 Subject: oops: increment the oops UUID every time we oops ... because we do want repeated same-oops to be seen by automated tools like kerneloops.org Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 13f0634..2a2ff36 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -299,6 +299,8 @@ static int init_oops_id(void) { if (!oops_id) get_random_bytes(&oops_id, sizeof(oops_id)); + else + oops_id++; return 0; } -- cgit v1.1 From e3d5a27d5862b6425d0879272e24abecf7245105 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 6 Jan 2009 14:41:02 -0800 Subject: Allow times and time system calls to return small negative values At the moment, the times() system call will appear to fail for a period shortly after boot, while the value it want to return is between -4095 and -1. The same thing will also happen for the time() system call on 32-bit platforms some time in 2106 or so. On some platforms, such as x86, this is unavoidable because of the system call ABI, but other platforms such as powerpc have a separate error indication from the return value, so system calls can in fact return small negative values without indicating an error. On those platforms, force_successful_syscall_return() provides a way to indicate that the system call return value should not be treated as an error even if it is in the range which would normally be taken as a negative error number. This adds a force_successful_syscall_return() call to the time() and times() system calls plus their 32-bit compat versions, so that they don't erroneously indicate an error on those platforms whose system call ABI has a separate error indication. This will not affect anything on other platforms. Joakim Tjernlund added the fix for time() and the compat versions of time() and times(), after I did the fix for times(). Signed-off-by: Joakim Tjernlund Signed-off-by: Paul Mackerras Acked-by: David S. Miller Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 5 ++++- kernel/sys.c | 2 ++ kernel/time.c | 4 +++- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index d52e2ec..42d5654 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -229,6 +230,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } + force_successful_syscall_return(); return compat_jiffies_to_clock_t(jiffies); } @@ -894,8 +896,9 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc) if (tloc) { if (put_user(i,tloc)) - i = -EFAULT; + return -EFAULT; } + force_successful_syscall_return(); return i; } diff --git a/kernel/sys.c b/kernel/sys.c index d356d79..4a43617c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -927,6 +928,7 @@ asmlinkage long sys_times(struct tms __user * tbuf) if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } + force_successful_syscall_return(); return (long) jiffies_64_to_clock_t(get_jiffies_64()); } diff --git a/kernel/time.c b/kernel/time.c index d63a433..4886e3c 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -65,8 +66,9 @@ asmlinkage long sys_time(time_t __user * tloc) if (tloc) { if (put_user(i,tloc)) - i = -EFAULT; + return -EFAULT; } + force_successful_syscall_return(); return i; } -- cgit v1.1 From 26e5438e4b77f04a51870f9415ffed68004fac1d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 6 Jan 2009 14:41:10 -0800 Subject: profile: don't include twice. Currently, kernel/profile.c include twice. It can be removed. Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index d18e2d2..784933ac 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -445,7 +445,6 @@ void profile_tick(int type) #ifdef CONFIG_PROC_FS #include #include -#include static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) -- cgit v1.1 From bc2f70151fe7a117dbe8347edc5a877e749572a3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:46 -0800 Subject: kprobes: bugfix: try_module_get even if calling_mod is NULL When someone called register_*probe() from kernel-core code(not from module) and that probes a kernel module, users can remove the probed module because kprobe doesn't increment reference counter of the module. (on the other hand, if the kernel-module calls register_*probe, kprobe increments refcount of the probed module.) Currently, we have no register_*probe() calling from kernel-core(except smoke-test, but the smoke-test doesn't probe module), so there is no real bugs. But the logic is wrong(or not fair) and it can causes a problem when someone might want to probe module from kernel. After this patch is applied, even if someone put register_*probe() call in the kernel-core code, it increments the reference counter of the probed module, and it prevents user to remove the module until stopping probing it. Signed-off-by: Masami Hiramatsu Cc: Lai Jiangshan Cc: Ananth N Mavinakayanahalli Cc: Hiroshi Shimamoto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9f8a3f2..3afd354 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -634,7 +634,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, * avoid incrementing the module refcount, so as to allow * unloading of self probing modules. */ - if (calling_mod && calling_mod != probed_mod) { + if (calling_mod != probed_mod) { if (unlikely(!try_module_get(probed_mod))) { preempt_enable(); return -EINVAL; -- cgit v1.1 From 8e1144050e49dd4ef19c117dc5626f212cfe73cf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:47 -0800 Subject: kprobes: indirectly call kprobe_target Call kprobe_target indirectly. This prevents gcc to unroll a noinline function in caller function. I ported patches which had been discussed on http://sources.redhat.com/bugzilla/show_bug.cgi?id=3542 Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: David Miller Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/test_kprobes.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 06b6395..9c0127e 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -22,21 +22,10 @@ static u32 rand1, preh_val, posth_val, jph_val; static int errors, handler_errors, num_tests; +static u32 (*target)(u32 value); static noinline u32 kprobe_target(u32 value) { - /* - * gcc ignores noinline on some architectures unless we stuff - * sufficient lard into the function. The get_kprobe() here is - * just for that. - * - * NOTE: We aren't concerned about the correctness of get_kprobe() - * here; hence, this call is neither under !preempt nor with the - * kprobe_mutex held. This is fine(tm) - */ - if (get_kprobe((void *)0xdeadbeef)) - printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n"); - return (value / div_factor); } @@ -74,7 +63,7 @@ static int test_kprobe(void) return ret; } - ret = kprobe_target(rand1); + ret = target(rand1); unregister_kprobe(&kp); if (preh_val == 0) { @@ -121,7 +110,7 @@ static int test_jprobe(void) return ret; } - ret = kprobe_target(rand1); + ret = target(rand1); unregister_jprobe(&jp); if (jph_val == 0) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -177,7 +166,7 @@ static int test_kretprobe(void) return ret; } - ret = kprobe_target(rand1); + ret = target(rand1); unregister_kretprobe(&rp); if (krph_val != rand1) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -193,6 +182,8 @@ int init_test_probes(void) { int ret; + target = kprobe_target; + do { rand1 = random32(); } while (rand1 <= div_factor); -- cgit v1.1 From 12da3b888b2035bb0f106122f1cc1b6d357fad53 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:48 -0800 Subject: kprobes: add tests for register_kprobes Add testcases for *probe batch registration (register_kprobes) to kprobes sanity tests. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: David Miller Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/test_kprobes.c | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 189 insertions(+) (limited to 'kernel') diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 9c0127e..4f10451 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -23,6 +23,7 @@ static u32 rand1, preh_val, posth_val, jph_val; static int errors, handler_errors, num_tests; static u32 (*target)(u32 value); +static u32 (*target2)(u32 value); static noinline u32 kprobe_target(u32 value) { @@ -81,6 +82,84 @@ static int test_kprobe(void) return 0; } +static noinline u32 kprobe_target2(u32 value) +{ + return (value / div_factor) + 1; +} + +static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs) +{ + preh_val = (rand1 / div_factor) + 1; + return 0; +} + +static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ + if (preh_val != (rand1 / div_factor) + 1) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in post_handler2\n"); + } + posth_val = preh_val + div_factor; +} + +static struct kprobe kp2 = { + .symbol_name = "kprobe_target2", + .pre_handler = kp_pre_handler2, + .post_handler = kp_post_handler2 +}; + +static int test_kprobes(void) +{ + int ret; + struct kprobe *kps[2] = {&kp, &kp2}; + + kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + ret = register_kprobes(kps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_kprobes returned %d\n", ret); + return ret; + } + + preh_val = 0; + posth_val = 0; + ret = target(rand1); + + if (preh_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe pre_handler not called\n"); + handler_errors++; + } + + if (posth_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe post_handler not called\n"); + handler_errors++; + } + + preh_val = 0; + posth_val = 0; + ret = target2(rand1); + + if (preh_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe pre_handler2 not called\n"); + handler_errors++; + } + + if (posth_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe post_handler2 not called\n"); + handler_errors++; + } + + unregister_kprobes(kps, 2); + return 0; + +} + static u32 j_kprobe_target(u32 value) { if (value != rand1) { @@ -121,6 +200,43 @@ static int test_jprobe(void) return 0; } +static struct jprobe jp2 = { + .entry = j_kprobe_target, + .kp.symbol_name = "kprobe_target2" +}; + +static int test_jprobes(void) +{ + int ret; + struct jprobe *jps[2] = {&jp, &jp2}; + + jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + ret = register_jprobes(jps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_jprobes returned %d\n", ret); + return ret; + } + + jph_val = 0; + ret = target(rand1); + if (jph_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "jprobe handler not called\n"); + handler_errors++; + } + + jph_val = 0; + ret = target2(rand1); + if (jph_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "jprobe handler2 not called\n"); + handler_errors++; + } + unregister_jprobes(jps, 2); + + return 0; +} #ifdef CONFIG_KRETPROBES static u32 krph_val; @@ -176,6 +292,63 @@ static int test_kretprobe(void) return 0; } + +static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + unsigned long ret = regs_return_value(regs); + + if (ret != (rand1 / div_factor) + 1) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in kretprobe handler2\n"); + } + if (krph_val == 0) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "call to kretprobe entry handler failed\n"); + } + + krph_val = rand1; + return 0; +} + +static struct kretprobe rp2 = { + .handler = return_handler2, + .entry_handler = entry_handler, + .kp.symbol_name = "kprobe_target2" +}; + +static int test_kretprobes(void) +{ + int ret; + struct kretprobe *rps[2] = {&rp, &rp2}; + + rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + ret = register_kretprobes(rps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_kretprobe returned %d\n", ret); + return ret; + } + + krph_val = 0; + ret = target(rand1); + if (krph_val != rand1) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kretprobe handler not called\n"); + handler_errors++; + } + + krph_val = 0; + ret = target2(rand1); + if (krph_val != rand1) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kretprobe handler2 not called\n"); + handler_errors++; + } + unregister_kretprobes(rps, 2); + return 0; +} #endif /* CONFIG_KRETPROBES */ int init_test_probes(void) @@ -183,6 +356,7 @@ int init_test_probes(void) int ret; target = kprobe_target; + target2 = kprobe_target2; do { rand1 = random32(); @@ -195,15 +369,30 @@ int init_test_probes(void) errors++; num_tests++; + ret = test_kprobes(); + if (ret < 0) + errors++; + + num_tests++; ret = test_jprobe(); if (ret < 0) errors++; + num_tests++; + ret = test_jprobes(); + if (ret < 0) + errors++; + #ifdef CONFIG_KRETPROBES num_tests++; ret = test_kretprobe(); if (ret < 0) errors++; + + num_tests++; + ret = test_kretprobes(); + if (ret < 0) + errors++; #endif /* CONFIG_KRETPROBES */ if (errors) -- cgit v1.1 From a06f6211ef9b1785922f9d0e8766d63ac4e66de1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:49 -0800 Subject: module: add within_module_core() and within_module_init() This series of patches allows kprobes to probe module's __init and __exit functions. This means, you can probe driver initialization and terminating. Currently, kprobes can't probe __init function because these functions are freed after module initialization. And it also can't probe module __exit functions because kprobe increments reference count of target module and user can't unload it. this means __exit functions never be called unless removing probes from the module. To solve both cases, this series of patches introduces GONE flag and sets it when the target code is freed(for this purpose, kprobes hooks MODULE_STATE_* events). This also removes refcount incrementing for allowing user to unload target module. Users can check which probes are GONE by debugfs interface. For taking timing of freeing module's .init text, these also include a patch which adds module's notifier of MODULE_STATE_LIVE event. This patch: Add within_module_core() and within_module_init() for checking whether an address is in the module .init.text section or .text section, and replace within() local inline functions in kernel/module.c with them. kprobes uses these functions to check where the kprobe is inserted. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 34b56cf..cc79c94 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2390,7 +2390,7 @@ static const char *get_ksymbol(struct module *mod, unsigned long nextval; /* At worse, next value is at end of module */ - if (within(addr, mod->module_init, mod->init_size)) + if (within_module_init(addr, mod)) nextval = (unsigned long)mod->module_init+mod->init_text_size; else nextval = (unsigned long)mod->module_core+mod->core_text_size; @@ -2438,8 +2438,8 @@ const char *module_address_lookup(unsigned long addr, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) - || within(addr, mod->module_core, mod->core_size)) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { if (modname) *modname = mod->name; ret = get_ksymbol(mod, addr, size, offset); @@ -2461,8 +2461,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) || - within(addr, mod->module_core, mod->core_size)) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, NULL, NULL); @@ -2485,8 +2485,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_init, mod->init_size) || - within(addr, mod->module_core, mod->core_size)) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, size, offset); @@ -2705,7 +2705,7 @@ int is_module_address(unsigned long addr) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { - if (within(addr, mod->module_core, mod->core_size)) { + if (within_module_core(addr, mod)) { preempt_enable(); return 1; } -- cgit v1.1 From 129415607845d4daea11ddcba706005c69dcb942 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:50 -0800 Subject: kprobes: add kprobe_insn_mutex and cleanup arch_remove_kprobe() Add kprobe_insn_mutex for protecting kprobe_insn_pages hlist, and remove kprobe_mutex from architecture dependent code. This allows us to call arch_remove_kprobe() (and free_insn_slot) while holding kprobe_mutex. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Russell King Cc: "Luck, Tony" Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3afd354..29e8792 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -69,7 +69,7 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; /* NOTE: change this value only with kprobe_mutex held */ static bool kprobe_enabled; -DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ +static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { spinlock_t lock ____cacheline_aligned_in_smp; @@ -115,6 +115,7 @@ enum kprobe_slot_state { SLOT_USED = 2, }; +static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ static struct hlist_head kprobe_insn_pages; static int kprobe_garbage_slots; static int collect_garbage_slots(void); @@ -144,10 +145,10 @@ loop_end: } /** - * get_insn_slot() - Find a slot on an executable page for an instruction. + * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -kprobe_opcode_t __kprobes *get_insn_slot(void) +static kprobe_opcode_t __kprobes *__get_insn_slot(void) { struct kprobe_insn_page *kip; struct hlist_node *pos; @@ -196,6 +197,15 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) return kip->insns; } +kprobe_opcode_t __kprobes *get_insn_slot(void) +{ + kprobe_opcode_t *ret; + mutex_lock(&kprobe_insn_mutex); + ret = __get_insn_slot(); + mutex_unlock(&kprobe_insn_mutex); + return ret; +} + /* Return 1 if all garbages are collected, otherwise 0. */ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) { @@ -226,9 +236,13 @@ static int __kprobes collect_garbage_slots(void) { struct kprobe_insn_page *kip; struct hlist_node *pos, *next; + int safety; /* Ensure no-one is preepmted on the garbages */ - if (check_safety() != 0) + mutex_unlock(&kprobe_insn_mutex); + safety = check_safety(); + mutex_lock(&kprobe_insn_mutex); + if (safety != 0) return -EAGAIN; hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { @@ -251,6 +265,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) struct kprobe_insn_page *kip; struct hlist_node *pos; + mutex_lock(&kprobe_insn_mutex); hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { if (kip->insns <= slot && slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { @@ -267,6 +282,8 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) collect_garbage_slots(); + + mutex_unlock(&kprobe_insn_mutex); } #endif -- cgit v1.1 From 017c39bdb1b3ac1da6db339474a77b528043c05a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:51 -0800 Subject: kprobes: add __kprobes to kprobe internal functions Add __kprobes to kprobes internal functions for protecting from probing by kprobes itself. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 29e8792..a1e233a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -410,7 +410,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, hlist_add_head(&ri->hlist, head); } -void kretprobe_hash_lock(struct task_struct *tsk, +void __kprobes kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); @@ -421,13 +421,15 @@ void kretprobe_hash_lock(struct task_struct *tsk, spin_lock_irqsave(hlist_lock, *flags); } -static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) +static void __kprobes kretprobe_table_lock(unsigned long hash, + unsigned long *flags) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_lock_irqsave(hlist_lock, *flags); } -void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) +void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, + unsigned long *flags) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); spinlock_t *hlist_lock; @@ -436,7 +438,7 @@ void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) spin_unlock_irqrestore(hlist_lock, *flags); } -void kretprobe_table_unlock(unsigned long hash, unsigned long *flags) +void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_unlock_irqrestore(hlist_lock, *flags); @@ -762,7 +764,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) } } -static int __register_kprobes(struct kprobe **kps, int num, +static int __kprobes __register_kprobes(struct kprobe **kps, int num, unsigned long called_from) { int i, ret = 0; @@ -828,7 +830,7 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -static int __register_jprobes(struct jprobe **jps, int num, +static int __kprobes __register_jprobes(struct jprobe **jps, int num, unsigned long called_from) { struct jprobe *jp; @@ -990,7 +992,7 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp, return ret; } -static int __register_kretprobes(struct kretprobe **rps, int num, +static int __kprobes __register_kretprobes(struct kretprobe **rps, int num, unsigned long called_from) { int ret = 0, i; -- cgit v1.1 From e8386a0cb22f4a2d439384212c494ad0bda848fe Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:52 -0800 Subject: kprobes: support probing module __exit function Allows kprobes to probe __exit routine. This adds flags member to struct kprobe. When module is freed(kprobes hooks module_notifier to get this event), kprobes which probe the functions in that module are set to "Gone" flag to the flags member. These "Gone" probes are never be enabled. Users can check the GONE flag through debugfs. This also removes mod_refcounted, because we couldn't free a module if kprobe incremented the refcount of that module. [akpm@linux-foundation.org: document some locking] [mhiramat@redhat.com: bugfix: pass aggr_kprobe to arch_remove_kprobe] [mhiramat@redhat.com: bugfix: release old_p's insn_slot before error return] Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Signed-off-by: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 159 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 119 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a1e233a..cb732a9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -327,7 +327,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->pre_handler) { + if (kp->pre_handler && !kprobe_gone(kp)) { set_kprobe_instance(kp); if (kp->pre_handler(kp, regs)) return 1; @@ -343,7 +343,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->post_handler) { + if (kp->post_handler && !kprobe_gone(kp)) { set_kprobe_instance(kp); kp->post_handler(kp, regs, flags); reset_kprobe_instance(); @@ -545,9 +545,10 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) ap->addr = p->addr; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; - if (p->post_handler) + /* We don't care the kprobe which has gone. */ + if (p->post_handler && !kprobe_gone(p)) ap->post_handler = aggr_post_handler; - if (p->break_handler) + if (p->break_handler && !kprobe_gone(p)) ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); @@ -566,17 +567,41 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, int ret = 0; struct kprobe *ap; + if (kprobe_gone(old_p)) { + /* + * Attempting to insert new probe at the same location that + * had a probe in the module vaddr area which already + * freed. So, the instruction slot has already been + * released. We need a new slot for the new probe. + */ + ret = arch_prepare_kprobe(old_p); + if (ret) + return ret; + } if (old_p->pre_handler == aggr_pre_handler) { copy_kprobe(old_p, p); ret = add_new_kprobe(old_p, p); + ap = old_p; } else { ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); - if (!ap) + if (!ap) { + if (kprobe_gone(old_p)) + arch_remove_kprobe(old_p); return -ENOMEM; + } add_aggr_kprobe(ap, old_p); copy_kprobe(ap, p); ret = add_new_kprobe(ap, p); } + if (kprobe_gone(old_p)) { + /* + * If the old_p has gone, its breakpoint has been disarmed. + * We have to arm it again after preparing real kprobes. + */ + ap->flags &= ~KPROBE_FLAG_GONE; + if (kprobe_enabled) + arch_arm_kprobe(ap); + } return ret; } @@ -639,8 +664,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, return -EINVAL; } - p->mod_refcounted = 0; - + p->flags = 0; /* * Check if are we probing a module. */ @@ -649,16 +673,14 @@ static int __kprobes __register_kprobe(struct kprobe *p, struct module *calling_mod; calling_mod = __module_text_address(called_from); /* - * We must allow modules to probe themself and in this case - * avoid incrementing the module refcount, so as to allow - * unloading of self probing modules. + * We must hold a refcount of the probed module while updating + * its code to prohibit unexpected unloading. */ if (calling_mod != probed_mod) { if (unlikely(!try_module_get(probed_mod))) { preempt_enable(); return -EINVAL; } - p->mod_refcounted = 1; } else probed_mod = NULL; } @@ -687,8 +709,9 @@ static int __kprobes __register_kprobe(struct kprobe *p, out: mutex_unlock(&kprobe_mutex); - if (ret && probed_mod) + if (probed_mod) module_put(probed_mod); + return ret; } @@ -716,16 +739,16 @@ valid_p: list_is_singular(&old_p->list))) { /* * Only probe on the hash list. Disarm only if kprobes are - * enabled - otherwise, the breakpoint would already have - * been removed. We save on flushing icache. + * enabled and not gone - otherwise, the breakpoint would + * already have been removed. We save on flushing icache. */ - if (kprobe_enabled) + if (kprobe_enabled && !kprobe_gone(old_p)) arch_disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); } else { - if (p->break_handler) + if (p->break_handler && !kprobe_gone(p)) old_p->break_handler = NULL; - if (p->post_handler) { + if (p->post_handler && !kprobe_gone(p)) { list_for_each_entry_rcu(list_p, &old_p->list, list) { if ((list_p != p) && (list_p->post_handler)) goto noclean; @@ -740,27 +763,16 @@ noclean: static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) { - struct module *mod; struct kprobe *old_p; - if (p->mod_refcounted) { - /* - * Since we've already incremented refcount, - * we don't need to disable preemption. - */ - mod = module_text_address((unsigned long)p->addr); - if (mod) - module_put(mod); - } - - if (list_empty(&p->list) || list_is_singular(&p->list)) { - if (!list_empty(&p->list)) { - /* "p" is the last child of an aggr_kprobe */ - old_p = list_entry(p->list.next, struct kprobe, list); - list_del(&p->list); - kfree(old_p); - } + if (list_empty(&p->list)) arch_remove_kprobe(p); + else if (list_is_singular(&p->list)) { + /* "p" is the last child of an aggr_kprobe */ + old_p = list_entry(p->list.next, struct kprobe, list); + list_del(&p->list); + arch_remove_kprobe(old_p); + kfree(old_p); } } @@ -1074,6 +1086,67 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, #endif /* CONFIG_KRETPROBES */ +/* Set the kprobe gone and remove its instruction buffer. */ +static void __kprobes kill_kprobe(struct kprobe *p) +{ + struct kprobe *kp; + p->flags |= KPROBE_FLAG_GONE; + if (p->pre_handler == aggr_pre_handler) { + /* + * If this is an aggr_kprobe, we have to list all the + * chained probes and mark them GONE. + */ + list_for_each_entry_rcu(kp, &p->list, list) + kp->flags |= KPROBE_FLAG_GONE; + p->post_handler = NULL; + p->break_handler = NULL; + } + /* + * Here, we can remove insn_slot safely, because no thread calls + * the original probed function (which will be freed soon) any more. + */ + arch_remove_kprobe(p); +} + +/* Module notifier call back, checking kprobes on the module */ +static int __kprobes kprobes_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + if (val != MODULE_STATE_GOING) + return NOTIFY_DONE; + + /* + * module .text section will be freed. We need to + * disable kprobes which have been inserted in the section. + */ + mutex_lock(&kprobe_mutex); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) + if (within_module_core((unsigned long)p->addr, mod)) { + /* + * The vaddr this probe is installed will soon + * be vfreed buy not synced to disk. Hence, + * disarming the breakpoint isn't needed. + */ + kill_kprobe(p); + } + } + mutex_unlock(&kprobe_mutex); + return NOTIFY_DONE; +} + +static struct notifier_block kprobe_module_nb = { + .notifier_call = kprobes_module_callback, + .priority = 0 +}; + static int __init init_kprobes(void) { int i, err = 0; @@ -1130,6 +1203,9 @@ static int __init init_kprobes(void) err = arch_init_kprobes(); if (!err) err = register_die_notifier(&kprobe_exceptions_nb); + if (!err) + err = register_module_notifier(&kprobe_module_nb); + kprobes_initialized = (err == 0); if (!err) @@ -1150,10 +1226,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; if (sym) - seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type, - sym, offset, (modname ? modname : " ")); + seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type, + sym, offset, (modname ? modname : " "), + (kprobe_gone(p) ? "[GONE]" : "")); else - seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr); + seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr, + (kprobe_gone(p) ? "[GONE]" : "")); } static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) @@ -1234,7 +1312,8 @@ static void __kprobes enable_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) - arch_arm_kprobe(p); + if (!kprobe_gone(p)) + arch_arm_kprobe(p); } kprobe_enabled = true; @@ -1263,7 +1342,7 @@ static void __kprobes disable_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { - if (!arch_trampoline_kprobe(p)) + if (!arch_trampoline_kprobe(p) && !kprobe_gone(p)) arch_disarm_kprobe(p); } } -- cgit v1.1 From 49ad2fd76c97133fb396edc24ded7fe26093a578 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:53 -0800 Subject: kprobes: remove called_from argument Remove called_from argument from kprobes which had been used for preventing self-refering of kernel module. However, since we don't keep module's refcount after registering kprobe any more, there is no reason to check that. This patch also simplifies registering/unregistering functions because we don't need to use __builtin_return_address(0) which was passed to called_from. [ananth@in.ibm.com: build fix] Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 72 +++++++++++--------------------------------------------- 1 file changed, 14 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index cb732a9..ddefb9f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -644,8 +644,7 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) return (kprobe_opcode_t *)(((char *)addr) + p->offset); } -static int __kprobes __register_kprobe(struct kprobe *p, - unsigned long called_from) +int __kprobes register_kprobe(struct kprobe *p) { int ret = 0; struct kprobe *old_p; @@ -670,19 +669,14 @@ static int __kprobes __register_kprobe(struct kprobe *p, */ probed_mod = __module_text_address((unsigned long) p->addr); if (probed_mod) { - struct module *calling_mod; - calling_mod = __module_text_address(called_from); /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. */ - if (calling_mod != probed_mod) { - if (unlikely(!try_module_get(probed_mod))) { - preempt_enable(); - return -EINVAL; - } - } else - probed_mod = NULL; + if (unlikely(!try_module_get(probed_mod))) { + preempt_enable(); + return -EINVAL; + } } preempt_enable(); @@ -776,15 +770,14 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) } } -static int __kprobes __register_kprobes(struct kprobe **kps, int num, - unsigned long called_from) +int __kprobes register_kprobes(struct kprobe **kps, int num) { int i, ret = 0; if (num <= 0) return -EINVAL; for (i = 0; i < num; i++) { - ret = __register_kprobe(kps[i], called_from); + ret = register_kprobe(kps[i]); if (ret < 0) { if (i > 0) unregister_kprobes(kps, i); @@ -794,26 +787,11 @@ static int __kprobes __register_kprobes(struct kprobe **kps, int num, return ret; } -/* - * Registration and unregistration functions for kprobe. - */ -int __kprobes register_kprobe(struct kprobe *p) -{ - return __register_kprobes(&p, 1, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kprobe(struct kprobe *p) { unregister_kprobes(&p, 1); } -int __kprobes register_kprobes(struct kprobe **kps, int num) -{ - return __register_kprobes(kps, num, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kprobes(struct kprobe **kps, int num) { int i; @@ -842,8 +820,7 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -static int __kprobes __register_jprobes(struct jprobe **jps, int num, - unsigned long called_from) +int __kprobes register_jprobes(struct jprobe **jps, int num) { struct jprobe *jp; int ret = 0, i; @@ -861,7 +838,7 @@ static int __kprobes __register_jprobes(struct jprobe **jps, int num, /* Todo: Verify probepoint is a function entry point */ jp->kp.pre_handler = setjmp_pre_handler; jp->kp.break_handler = longjmp_break_handler; - ret = __register_kprobe(&jp->kp, called_from); + ret = register_kprobe(&jp->kp); } if (ret < 0) { if (i > 0) @@ -874,8 +851,7 @@ static int __kprobes __register_jprobes(struct jprobe **jps, int num, int __kprobes register_jprobe(struct jprobe *jp) { - return __register_jprobes(&jp, 1, - (unsigned long)__builtin_return_address(0)); + return register_jprobes(&jp, 1); } void __kprobes unregister_jprobe(struct jprobe *jp) @@ -883,12 +859,6 @@ void __kprobes unregister_jprobe(struct jprobe *jp) unregister_jprobes(&jp, 1); } -int __kprobes register_jprobes(struct jprobe **jps, int num) -{ - return __register_jprobes(jps, num, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_jprobes(struct jprobe **jps, int num) { int i; @@ -951,8 +921,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, return 0; } -static int __kprobes __register_kretprobe(struct kretprobe *rp, - unsigned long called_from) +int __kprobes register_kretprobe(struct kretprobe *rp) { int ret = 0; struct kretprobe_instance *inst; @@ -998,21 +967,20 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp, rp->nmissed = 0; /* Establish function entry probe point */ - ret = __register_kprobe(&rp->kp, called_from); + ret = register_kprobe(&rp->kp); if (ret != 0) free_rp_inst(rp); return ret; } -static int __kprobes __register_kretprobes(struct kretprobe **rps, int num, - unsigned long called_from) +int __kprobes register_kretprobes(struct kretprobe **rps, int num) { int ret = 0, i; if (num <= 0) return -EINVAL; for (i = 0; i < num; i++) { - ret = __register_kretprobe(rps[i], called_from); + ret = register_kretprobe(rps[i]); if (ret < 0) { if (i > 0) unregister_kretprobes(rps, i); @@ -1022,23 +990,11 @@ static int __kprobes __register_kretprobes(struct kretprobe **rps, int num, return ret; } -int __kprobes register_kretprobe(struct kretprobe *rp) -{ - return __register_kretprobes(&rp, 1, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kretprobe(struct kretprobe *rp) { unregister_kretprobes(&rp, 1); } -int __kprobes register_kretprobes(struct kretprobe **rps, int num) -{ - return __register_kretprobes(rps, num, - (unsigned long)__builtin_return_address(0)); -} - void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) { int i; -- cgit v1.1 From 0deddf436a37c18ceb26c6e3b632fb9b5f58a0c1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:54 -0800 Subject: module: add MODULE_STATE_LIVE notify Add a module notifier call which notifies that the state of a module changes from MODULE_STATE_COMING to MODULE_STATE_LIVE. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index cc79c94..496dcb5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2352,6 +2352,8 @@ sys_init_module(void __user *umod, /* Now it's a first class citizen! Wake up anyone waiting for it. */ mod->state = MODULE_STATE_LIVE; wake_up(&module_wq); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_LIVE, mod); mutex_lock(&module_mutex); /* Drop initial reference. */ -- cgit v1.1 From f24659d96f4e056125f14498285203d1427cb18e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 6 Jan 2009 14:41:55 -0800 Subject: kprobes: support probing module __init function Allow kprobes to probe module __init routines. When __init functions are freed, kprobes which probe those functions are set to "Gone" flag. These "Gone" probes are disarmed from the code and never be enabled. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ddefb9f..1b9cbdc 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -677,6 +677,16 @@ int __kprobes register_kprobe(struct kprobe *p) preempt_enable(); return -EINVAL; } + /* + * If the module freed .init.text, we couldn't insert + * kprobes in there. + */ + if (within_module_init((unsigned long)p->addr, probed_mod) && + probed_mod->state != MODULE_STATE_COMING) { + module_put(probed_mod); + preempt_enable(); + return -EINVAL; + } } preempt_enable(); @@ -1073,19 +1083,24 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb, struct hlist_node *node; struct kprobe *p; unsigned int i; + int checkcore = (val == MODULE_STATE_GOING); - if (val != MODULE_STATE_GOING) + if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; /* - * module .text section will be freed. We need to - * disable kprobes which have been inserted in the section. + * When MODULE_STATE_GOING was notified, both of module .text and + * .init.text sections would be freed. When MODULE_STATE_LIVE was + * notified, only .init.text section would be freed. We need to + * disable kprobes which have been inserted in the sections. */ mutex_lock(&kprobe_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) - if (within_module_core((unsigned long)p->addr, mod)) { + if (within_module_init((unsigned long)p->addr, mod) || + (checkcore && + within_module_core((unsigned long)p->addr, mod))) { /* * The vaddr this probe is installed will soon * be vfreed buy not synced to disk. Hence, -- cgit v1.1 From bd4207c9016749f0a212faf7f7f49e5317d96d9b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 6 Jan 2009 14:42:39 -0800 Subject: kmod: fix varargs kernel-doc Fix varargs kernel-doc format in kmod.c: Use @... instead of @varargs. Warning(kernel/kmod.c:67): Excess function parameter or struct member 'varargs' description in 'request_module' Signed-off-by: Randy Dunlap Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index b46dbb9..a27a5f6 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -51,8 +51,8 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; /** * request_module - try to load a kernel module - * @fmt: printf style format string for the name of the module - * @varargs: arguements as specified in the format string + * @fmt: printf style format string for the name of the module + * @...: arguments as specified in the format string * * Load a module using the user mode module loader. The function returns * zero on success or a negative errno code on failure. Note that a -- cgit v1.1 From 09bca05c90c639f57aae057e0c28f287e61f5a07 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 6 Jan 2009 14:42:45 -0800 Subject: SEND_SIG_NOINFO: masquerade si_pid when crossing pid-ns boundary For SEND_SIG_NOINFO, si_pid is currently set to the pid of sender in sender's active pid namespace. But if the receiver is in a Eg: when parent sends the 'pdeath_signal' to a child that is in a descendant pid namespace, we should set si_pid 0. Signed-off-by: Sukadev Bhattiprolu Acked-By: Roland McGrath Cc: "Eric W. Biederman" Cc: Oleg Nesterov Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 8e95855..31db63b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -858,7 +858,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; - q->info.si_pid = task_pid_vnr(current); + q->info.si_pid = task_pid_nr_ns(current, + task_active_pid_ns(t)); q->info.si_uid = current_uid(); break; case (unsigned long) SEND_SIG_PRIV: -- cgit v1.1 From 9cd4fd10437dda6b520cb1410b28f36967a34de8 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 6 Jan 2009 14:42:46 -0800 Subject: SEND_SIG_NOINFO: set si_pid to tgid instead of pid POSIX requires the si_pid to be the process id of the sender, so ->si_pid should really be set to 'tgid'. This change does have following changes in behavior: - When sending pdeath_signal on re-parent to a sub-thread, ->si_pid cannot be used to identify the thread that did the re-parent since it will now show the tgid instead of thread id. - A multi-threaded application that expects to find the specific thread that encountered a SIGPIPE using the ->si_pid will now break. Signed-off-by: Sukadev Bhattiprolu Acked-By: Roland McGrath Cc: "Eric W. Biederman" Cc: Oleg Nesterov Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 31db63b..3152ac3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -858,7 +858,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; - q->info.si_pid = task_pid_nr_ns(current, + q->info.si_pid = task_tgid_nr_ns(current, task_active_pid_ns(t)); q->info.si_uid = current_uid(); break; -- cgit v1.1 From 4cb0e11b15d2badad455fcd538af0cccf05dc012 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Tue, 6 Jan 2009 14:42:47 -0800 Subject: coredump_filter: permit changing of the default filter Introduce a new kernel parameter `coredump_filter'. Setting a value to this parameter causes the default bitmask of coredump_filter to be changed. It is useful for users to change coredump_filter settings for the whole system at boot time. Without this parameter, users have to change coredump_filter settings for each /proc// in an initializing script. Signed-off-by: Hidehiro Kawai Cc: Roland McGrath Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 23b9121..7b8f2a7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -400,6 +400,18 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) +static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; + +static int __init coredump_filter_setup(char *s) +{ + default_dump_filter = + (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) & + MMF_DUMP_FILTER_MASK; + return 1; +} + +__setup("coredump_filter=", coredump_filter_setup); + #include static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) @@ -408,8 +420,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? current->mm->flags - : MMF_DUMP_FILTER_DEFAULT; + mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); -- cgit v1.1 From 0bef3c2dc7d0c8238330785c8f4504761b0e370b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 6 Jan 2009 14:43:08 -0800 Subject: dma_alloc_from_coherent(): fix fallback to generic memory If bitmap_find_free_region() fails and DMA_MEMORY_EXCLUSIVE is not set, the function will fail to write anything to *ret and will return 1. This will cause dma_alloc_coherent() to return an uninitialised value, crashing the kernel, perhaps via DMA to a random address. Fix that by changing it to return zero in this case, so the caller will proceed to allocate the memory from the generic memory allocator. Cc: Tetsuo Handa Cc: Dmitry Baryshkov Cc: Ingo Molnar Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma-coherent.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index f013a0c..4bdcea8 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -116,11 +116,25 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, int page = bitmap_find_free_region(mem->bitmap, mem->size, order); if (page >= 0) { + /* + * Memory was found in the per-device arena. + */ *dma_handle = mem->device_base + (page << PAGE_SHIFT); *ret = mem->virt_base + (page << PAGE_SHIFT); memset(*ret, 0, size); - } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) + } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) { + /* + * The per-device arena is exhausted and we are not + * permitted to fall back to generic memory. + */ *ret = NULL; + } else { + /* + * The per-device arena is exhausted and we are + * permitted to fall back to generic memory. + */ + return 0; + } } return (mem != NULL); } -- cgit v1.1 From eccd83e116e7f414a1da3aae3745384b7b171883 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 6 Jan 2009 14:43:09 -0800 Subject: dma_alloc_coherent: clean it up This thing was rather stupidly coded. Rework it all prior to making changes. Also, rename local variable `page': kernel readers expect something called `page' to have type `struct page *'. Cc: Guennadi Liakhovetski Cc: Johannes Weiner Cc: Pekka Enberg Cc: Dmitry Baryshkov Cc: Jesse Barnes Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma-coherent.c | 54 +++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 4bdcea8..8056d08 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -109,34 +109,38 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied); int dma_alloc_from_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle, void **ret) { - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + struct dma_coherent_mem *mem; int order = get_order(size); + int pageno; - if (mem) { - int page = bitmap_find_free_region(mem->bitmap, mem->size, - order); - if (page >= 0) { - /* - * Memory was found in the per-device arena. - */ - *dma_handle = mem->device_base + (page << PAGE_SHIFT); - *ret = mem->virt_base + (page << PAGE_SHIFT); - memset(*ret, 0, size); - } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) { - /* - * The per-device arena is exhausted and we are not - * permitted to fall back to generic memory. - */ - *ret = NULL; - } else { - /* - * The per-device arena is exhausted and we are - * permitted to fall back to generic memory. - */ - return 0; - } + if (!dev) + return 0; + mem = dev->dma_mem; + if (!mem) + return 0; + + pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); + if (pageno >= 0) { + /* + * Memory was found in the per-device arena. + */ + *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); + *ret = mem->virt_base + (pageno << PAGE_SHIFT); + memset(*ret, 0, size); + } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) { + /* + * The per-device arena is exhausted and we are not + * permitted to fall back to generic memory. + */ + *ret = NULL; + } else { + /* + * The per-device arena is exhausted and we are + * permitted to fall back to generic memory. + */ + return 0; } - return (mem != NULL); + return 1; } EXPORT_SYMBOL(dma_alloc_from_coherent); -- cgit v1.1 From 58c6d3dfe436eb8cfb451981d8fdc9044eaf42da Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 6 Jan 2009 14:43:10 -0800 Subject: dma-coherent: catch oversized requests to dma_alloc_from_coherent() Prevent passing an order to bitmap_find_free_region() that is larger than the actual bitmap can represent. These requests can come from device drivers that have no idea how big the dma region is and need to rely on dma_alloc_from_coherent() to sort it out for them. Reported-by: Guennadi Liakhovetski Signed-off-by: Johannes Weiner Cc: Pekka Enberg Cc: Dmitry Baryshkov Cc: Jesse Barnes Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma-coherent.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 8056d08..0387074 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -118,6 +118,8 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, mem = dev->dma_mem; if (!mem) return 0; + if (unlikely(size > mem->size)) + return 0; pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); if (pageno >= 0) { -- cgit v1.1