diff options
Diffstat (limited to 'kernel')
65 files changed, 5782 insertions, 1435 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 14f4d45..ac6b27a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -47,6 +47,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o diff --git a/kernel/audit.c b/kernel/audit.c index d9b690a..76c9a11 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2,7 +2,7 @@ * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. * System-call specific features have moved to auditsc.c * - * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * * This program is free software; you can redistribute it and/or modify @@ -65,7 +65,9 @@ * (Initialization happens after skb_init is called.) */ static int audit_initialized; -/* No syscall auditing will take place unless audit_enabled != 0. */ +/* 0 - no auditing + * 1 - auditing enabled + * 2 - auditing enabled and configuration is locked/unchangeable. */ int audit_enabled; /* Default state when kernel boots without any parameters. */ @@ -239,102 +241,150 @@ void audit_log_lost(const char *message) static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) { - int old = audit_rate_limit; + int res, rc = 0, old = audit_rate_limit; + + /* check if we are locked */ + if (audit_enabled == 2) + res = 0; + else + res = 1; if (sid) { char *ctx = NULL; u32 len; - int rc; - if ((rc = selinux_sid_to_string(sid, &ctx, &len))) - return rc; - else + if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) { audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "audit_rate_limit=%d old=%d by auid=%u subj=%s", - limit, old, loginuid, ctx); - kfree(ctx); - } else - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "audit_rate_limit=%d old=%d by auid=%u", - limit, old, loginuid); - audit_rate_limit = limit; - return 0; + "audit_rate_limit=%d old=%d by auid=%u" + " subj=%s res=%d", + limit, old, loginuid, ctx, res); + kfree(ctx); + } else + res = 0; /* Something weird, deny request */ + } + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit_rate_limit=%d old=%d by auid=%u res=%d", + limit, old, loginuid, res); + + /* If we are allowed, make the change */ + if (res == 1) + audit_rate_limit = limit; + /* Not allowed, update reason */ + else if (rc == 0) + rc = -EPERM; + return rc; } static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) { - int old = audit_backlog_limit; + int res, rc = 0, old = audit_backlog_limit; + + /* check if we are locked */ + if (audit_enabled == 2) + res = 0; + else + res = 1; if (sid) { char *ctx = NULL; u32 len; - int rc; - if ((rc = selinux_sid_to_string(sid, &ctx, &len))) - return rc; - else + if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) { audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "audit_backlog_limit=%d old=%d by auid=%u subj=%s", - limit, old, loginuid, ctx); - kfree(ctx); - } else - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "audit_backlog_limit=%d old=%d by auid=%u", - limit, old, loginuid); - audit_backlog_limit = limit; - return 0; + "audit_backlog_limit=%d old=%d by auid=%u" + " subj=%s res=%d", + limit, old, loginuid, ctx, res); + kfree(ctx); + } else + res = 0; /* Something weird, deny request */ + } + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit_backlog_limit=%d old=%d by auid=%u res=%d", + limit, old, loginuid, res); + + /* If we are allowed, make the change */ + if (res == 1) + audit_backlog_limit = limit; + /* Not allowed, update reason */ + else if (rc == 0) + rc = -EPERM; + return rc; } static int audit_set_enabled(int state, uid_t loginuid, u32 sid) { - int old = audit_enabled; + int res, rc = 0, old = audit_enabled; - if (state != 0 && state != 1) + if (state < 0 || state > 2) return -EINVAL; + /* check if we are locked */ + if (audit_enabled == 2) + res = 0; + else + res = 1; + if (sid) { char *ctx = NULL; u32 len; - int rc; - if ((rc = selinux_sid_to_string(sid, &ctx, &len))) - return rc; - else + if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) { audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "audit_enabled=%d old=%d by auid=%u subj=%s", - state, old, loginuid, ctx); - kfree(ctx); - } else - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "audit_enabled=%d old=%d by auid=%u", - state, old, loginuid); - audit_enabled = state; - return 0; + "audit_enabled=%d old=%d by auid=%u" + " subj=%s res=%d", + state, old, loginuid, ctx, res); + kfree(ctx); + } else + res = 0; /* Something weird, deny request */ + } + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit_enabled=%d old=%d by auid=%u res=%d", + state, old, loginuid, res); + + /* If we are allowed, make the change */ + if (res == 1) + audit_enabled = state; + /* Not allowed, update reason */ + else if (rc == 0) + rc = -EPERM; + return rc; } static int audit_set_failure(int state, uid_t loginuid, u32 sid) { - int old = audit_failure; + int res, rc = 0, old = audit_failure; if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK && state != AUDIT_FAIL_PANIC) return -EINVAL; + /* check if we are locked */ + if (audit_enabled == 2) + res = 0; + else + res = 1; + if (sid) { char *ctx = NULL; u32 len; - int rc; - if ((rc = selinux_sid_to_string(sid, &ctx, &len))) - return rc; - else + if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) { audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "audit_failure=%d old=%d by auid=%u subj=%s", - state, old, loginuid, ctx); - kfree(ctx); - } else - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "audit_failure=%d old=%d by auid=%u", - state, old, loginuid); - audit_failure = state; - return 0; + "audit_failure=%d old=%d by auid=%u" + " subj=%s res=%d", + state, old, loginuid, ctx, res); + kfree(ctx); + } else + res = 0; /* Something weird, deny request */ + } + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit_failure=%d old=%d by auid=%u res=%d", + state, old, loginuid, res); + + /* If we are allowed, make the change */ + if (res == 1) + audit_failure = state; + /* Not allowed, update reason */ + else if (rc == 0) + rc = -EPERM; + return rc; } static int kauditd_thread(void *dummy) @@ -599,6 +649,30 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_DEL: if (nlmsg_len(nlh) < sizeof(struct audit_rule)) return -EINVAL; + if (audit_enabled == 2) { + ab = audit_log_start(NULL, GFP_KERNEL, + AUDIT_CONFIG_CHANGE); + if (ab) { + audit_log_format(ab, + "pid=%d uid=%u auid=%u", + pid, uid, loginuid); + if (sid) { + if (selinux_sid_to_string( + sid, &ctx, &len)) { + audit_log_format(ab, + " ssid=%u", sid); + /* Maybe call audit_panic? */ + } else + audit_log_format(ab, + " subj=%s", ctx); + kfree(ctx); + } + audit_log_format(ab, " audit_enabled=%d res=0", + audit_enabled); + audit_log_end(ab); + } + return -EPERM; + } /* fallthrough */ case AUDIT_LIST: err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, @@ -609,6 +683,30 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_DEL_RULE: if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) return -EINVAL; + if (audit_enabled == 2) { + ab = audit_log_start(NULL, GFP_KERNEL, + AUDIT_CONFIG_CHANGE); + if (ab) { + audit_log_format(ab, + "pid=%d uid=%u auid=%u", + pid, uid, loginuid); + if (sid) { + if (selinux_sid_to_string( + sid, &ctx, &len)) { + audit_log_format(ab, + " ssid=%u", sid); + /* Maybe call audit_panic? */ + } else + audit_log_format(ab, + " subj=%s", ctx); + kfree(ctx); + } + audit_log_format(ab, " audit_enabled=%d res=0", + audit_enabled); + audit_log_end(ab); + } + return -EPERM; + } /* fallthrough */ case AUDIT_LIST_RULES: err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 9c8c232..3749193 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -937,9 +937,10 @@ static void audit_update_watch(struct audit_parent *parent, } ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "audit updated rules specifying path="); + audit_log_format(ab, "op=updated rules specifying path="); audit_log_untrustedstring(ab, owatch->path); audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); + audit_log_format(ab, " list=%d res=1", r->listnr); audit_log_end(ab); audit_remove_watch(owatch); @@ -969,14 +970,14 @@ static void audit_remove_parent_watches(struct audit_parent *parent) e = container_of(r, struct audit_entry, rule); ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "audit implicitly removed rule path="); + audit_log_format(ab, "op=remove rule path="); audit_log_untrustedstring(ab, w->path); if (r->filterkey) { audit_log_format(ab, " key="); audit_log_untrustedstring(ab, r->filterkey); } else audit_log_format(ab, " key=(null)"); - audit_log_format(ab, " list=%d", r->listnr); + audit_log_format(ab, " list=%d res=1", r->listnr); audit_log_end(ab); list_del(&r->rlist); @@ -1410,7 +1411,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, audit_log_format(ab, " subj=%s", ctx); kfree(ctx); } - audit_log_format(ab, " %s rule key=", action); + audit_log_format(ab, " op=%s rule key=", action); if (rule->filterkey) audit_log_untrustedstring(ab, rule->filterkey); else @@ -1601,8 +1602,8 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, int audit_filter_user(struct netlink_skb_parms *cb, int type) { + enum audit_state state = AUDIT_DISABLED; struct audit_entry *e; - enum audit_state state; int ret = 1; rcu_read_lock(); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2988975..3599558 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -170,6 +170,11 @@ struct audit_aux_data_sockaddr { char a[0]; }; +struct audit_aux_data_fd_pair { + struct audit_aux_data d; + int fd[2]; +}; + struct audit_aux_data_path { struct audit_aux_data d; struct dentry *dentry; @@ -961,6 +966,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_d_path(ab, "path=", axi->dentry, axi->mnt); break; } + case AUDIT_FD_PAIR: { + struct audit_aux_data_fd_pair *axs = (void *)aux; + audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); + break; } + } audit_log_end(ab); } @@ -1815,6 +1825,36 @@ int audit_socketcall(int nargs, unsigned long *args) } /** + * __audit_fd_pair - record audit data for pipe and socketpair + * @fd1: the first file descriptor + * @fd2: the second file descriptor + * + * Returns 0 for success or NULL context or < 0 on error. + */ +int __audit_fd_pair(int fd1, int fd2) +{ + struct audit_context *context = current->audit_context; + struct audit_aux_data_fd_pair *ax; + + if (likely(!context)) { + return 0; + } + + ax = kmalloc(sizeof(*ax), GFP_KERNEL); + if (!ax) { + return -ENOMEM; + } + + ax->fd[0] = fd1; + ax->fd[1] = fd2; + + ax->d.type = AUDIT_FD_PAIR; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto * @len: data length in user space * @a: data address in kernel space diff --git a/kernel/capability.c b/kernel/capability.c index edb845a..c8d3c77 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -92,15 +92,17 @@ out: * cap_set_pg - set capabilities for all processes in a given process * group. We call this holding task_capability_lock and tasklist_lock. */ -static inline int cap_set_pg(int pgrp, kernel_cap_t *effective, +static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { struct task_struct *g, *target; int ret = -EPERM; int found = 0; + struct pid *pgrp; - do_each_task_pid(pgrp, PIDTYPE_PGID, g) { + pgrp = find_pid(pgrp_nr); + do_each_pid_task(pgrp, PIDTYPE_PGID, g) { target = g; while_each_thread(g, target) { if (!security_capset_check(target, effective, @@ -113,7 +115,7 @@ static inline int cap_set_pg(int pgrp, kernel_cap_t *effective, } found = 1; } - } while_each_task_pid(pgrp, PIDTYPE_PGID, g); + } while_each_pid_task(pgrp, PIDTYPE_PGID, g); if (!found) ret = 0; diff --git a/kernel/compat.c b/kernel/compat.c index 6952dd0..cebb4c2 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1016,3 +1016,69 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, return sys_migrate_pages(pid, nr_bits + 1, old, new); } #endif + +struct compat_sysinfo { + s32 uptime; + u32 loads[3]; + u32 totalram; + u32 freeram; + u32 sharedram; + u32 bufferram; + u32 totalswap; + u32 freeswap; + u16 procs; + u16 pad; + u32 totalhigh; + u32 freehigh; + u32 mem_unit; + char _f[20-2*sizeof(u32)-sizeof(int)]; +}; + +asmlinkage long +compat_sys_sysinfo(struct compat_sysinfo __user *info) +{ + struct sysinfo s; + + do_sysinfo(&s); + + /* Check to see if any memory value is too large for 32-bit and scale + * down if needed + */ + if ((s.totalram >> 32) || (s.totalswap >> 32)) { + int bitcount = 0; + + while (s.mem_unit < PAGE_SIZE) { + s.mem_unit <<= 1; + bitcount++; + } + + s.totalram >>= bitcount; + s.freeram >>= bitcount; + s.sharedram >>= bitcount; + s.bufferram >>= bitcount; + s.totalswap >>= bitcount; + s.freeswap >>= bitcount; + s.totalhigh >>= bitcount; + s.freehigh >>= bitcount; + } + + if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) || + __put_user (s.uptime, &info->uptime) || + __put_user (s.loads[0], &info->loads[0]) || + __put_user (s.loads[1], &info->loads[1]) || + __put_user (s.loads[2], &info->loads[2]) || + __put_user (s.totalram, &info->totalram) || + __put_user (s.freeram, &info->freeram) || + __put_user (s.sharedram, &info->sharedram) || + __put_user (s.bufferram, &info->bufferram) || + __put_user (s.totalswap, &info->totalswap) || + __put_user (s.freeswap, &info->freeswap) || + __put_user (s.procs, &info->procs) || + __put_user (s.totalhigh, &info->totalhigh) || + __put_user (s.freehigh, &info->freehigh) || + __put_user (s.mem_unit, &info->mem_unit)) + return -EFAULT; + + return 0; +} + diff --git a/kernel/cpu.c b/kernel/cpu.c index 7406fe6..3d4206a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -309,6 +309,8 @@ void enable_nonboot_cpus(void) mutex_lock(&cpu_add_remove_lock); cpu_hotplug_disabled = 0; mutex_unlock(&cpu_add_remove_lock); + if (cpus_empty(frozen_cpus)) + return; printk("Enabling non-boot CPUs ...\n"); for_each_cpu_mask(cpu, frozen_cpus) { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6b05dc6..f382b0f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1540,7 +1540,7 @@ static const struct file_operations cpuset_file_operations = { .release = cpuset_file_release, }; -static struct inode_operations cpuset_dir_inode_operations = { +static const struct inode_operations cpuset_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cpuset_mkdir, .rmdir = cpuset_rmdir, @@ -2656,7 +2656,7 @@ static int cpuset_open(struct inode *inode, struct file *file) return single_open(file, proc_cpuset_show, pid); } -struct file_operations proc_cpuset_operations = { +const struct file_operations proc_cpuset_operations = { .open = cpuset_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/exit.c b/kernel/exit.c index fec12eb..f132349 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -185,21 +185,19 @@ repeat: * This checks not only the pgrp, but falls back on the pid if no * satisfactory pgrp is found. I dunno - gdb doesn't work correctly * without this... + * + * The caller must hold rcu lock or the tasklist lock. */ -int session_of_pgrp(int pgrp) +struct pid *session_of_pgrp(struct pid *pgrp) { struct task_struct *p; - int sid = 0; - - read_lock(&tasklist_lock); + struct pid *sid = NULL; - p = find_task_by_pid_type(PIDTYPE_PGID, pgrp); + p = pid_task(pgrp, PIDTYPE_PGID); if (p == NULL) - p = find_task_by_pid(pgrp); + p = pid_task(pgrp, PIDTYPE_PID); if (p != NULL) - sid = process_session(p); - - read_unlock(&tasklist_lock); + sid = task_session(p); return sid; } @@ -212,53 +210,52 @@ int session_of_pgrp(int pgrp) * * "I ask you, have you ever known what it is to be an orphan?" */ -static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) +static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; int ret = 1; - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p == ignored_task || p->exit_state || is_init(p->real_parent)) continue; - if (process_group(p->real_parent) != pgrp && - process_session(p->real_parent) == process_session(p)) { + if (task_pgrp(p->real_parent) != pgrp && + task_session(p->real_parent) == task_session(p)) { ret = 0; break; } - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return ret; /* (sighing) "Often!" */ } -int is_orphaned_pgrp(int pgrp) +int is_current_pgrp_orphaned(void) { int retval; read_lock(&tasklist_lock); - retval = will_become_orphaned_pgrp(pgrp, NULL); + retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); read_unlock(&tasklist_lock); return retval; } -static int has_stopped_jobs(int pgrp) +static int has_stopped_jobs(struct pid *pgrp) { int retval = 0; struct task_struct *p; - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p->state != TASK_STOPPED) continue; retval = 1; break; - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return retval; } /** - * reparent_to_init - Reparent the calling kernel thread to the init task - * of the pid space that the thread belongs to. + * reparent_to_init - Reparent the calling kernel thread to the init task of the pid space that the thread belongs to. * * If a kernel thread is launched as a result of a system call, or if * it ever exits, it should generally reparent itself to init so that @@ -431,8 +428,10 @@ static void close_files(struct files_struct * files) while (set) { if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); - if (file) + if (file) { filp_close(file, files); + cond_resched(); + } } i++; set >>= 1; @@ -649,14 +648,14 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) * than we are, and it was the only connection * outside, so the child pgrp is now orphaned. */ - if ((process_group(p) != process_group(father)) && - (process_session(p) == process_session(father))) { - int pgrp = process_group(p); + if ((task_pgrp(p) != task_pgrp(father)) && + (task_session(p) == task_session(father))) { + struct pid *pgrp = task_pgrp(p); if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { - __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); - __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); + __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } } @@ -736,6 +735,7 @@ static void exit_notify(struct task_struct *tsk) int state; struct task_struct *t; struct list_head ptrace_dead, *_p, *_n; + struct pid *pgrp; if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT) && !thread_group_empty(tsk)) { @@ -788,12 +788,13 @@ static void exit_notify(struct task_struct *tsk) t = tsk->real_parent; - if ((process_group(t) != process_group(tsk)) && - (process_session(t) == process_session(tsk)) && - will_become_orphaned_pgrp(process_group(tsk), tsk) && - has_stopped_jobs(process_group(tsk))) { - __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); - __kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk)); + pgrp = task_pgrp(tsk); + if ((task_pgrp(t) != pgrp) && + (task_session(t) != task_session(tsk)) && + will_become_orphaned_pgrp(pgrp, tsk) && + has_stopped_jobs(pgrp)) { + __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } /* Let father know we died diff --git a/kernel/fork.c b/kernel/fork.c index d57118d..d154cc7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -858,7 +858,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); - hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL); + hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; sig->tsk = tsk; @@ -869,7 +869,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->it_prof_incr = cputime_zero; sig->leader = 0; /* session leadership doesn't inherit */ - sig->tty_old_pgrp = 0; + sig->tty_old_pgrp = NULL; sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; @@ -1038,10 +1038,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = cputime_zero; p->stime = cputime_zero; p->sched_time = 0; +#ifdef CONFIG_TASK_XACCT p->rchar = 0; /* I/O counter: bytes read */ p->wchar = 0; /* I/O counter: bytes written */ p->syscr = 0; /* I/O counter: read syscalls */ p->syscw = 0; /* I/O counter: write syscalls */ +#endif task_io_accounting_init(p); acct_clear_integrals(p); diff --git a/kernel/futex.c b/kernel/futex.c index 5a737de..e749e7d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1134,7 +1134,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, if (sec != MAX_SCHEDULE_TIMEOUT) { to = &timeout; - hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); to->timer.expires = ktime_set(sec, nsec); } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d0ba190..476cb0c 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1,8 +1,9 @@ /* * linux/kernel/hrtimer.c * - * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> - * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * High-resolution kernel timers * @@ -31,12 +32,17 @@ */ #include <linux/cpu.h> +#include <linux/irq.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/hrtimer.h> #include <linux/notifier.h> #include <linux/syscalls.h> +#include <linux/kallsyms.h> #include <linux/interrupt.h> +#include <linux/tick.h> +#include <linux/seq_file.h> +#include <linux/err.h> #include <asm/uaccess.h> @@ -45,7 +51,7 @@ * * returns the time in ktime_t format */ -static ktime_t ktime_get(void) +ktime_t ktime_get(void) { struct timespec now; @@ -59,7 +65,7 @@ static ktime_t ktime_get(void) * * returns the time in ktime_t format */ -static ktime_t ktime_get_real(void) +ktime_t ktime_get_real(void) { struct timespec now; @@ -79,21 +85,22 @@ EXPORT_SYMBOL_GPL(ktime_get_real); * This ensures that we capture erroneous accesses to these clock ids * rather than moving them into the range of valid clock id's. */ - -#define MAX_HRTIMER_BASES 2 - -static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = +DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { + + .clock_base = { - .index = CLOCK_REALTIME, - .get_time = &ktime_get_real, - .resolution = KTIME_REALTIME_RES, - }, - { - .index = CLOCK_MONOTONIC, - .get_time = &ktime_get, - .resolution = KTIME_MONOTONIC_RES, - }, + { + .index = CLOCK_REALTIME, + .get_time = &ktime_get_real, + .resolution = KTIME_LOW_RES, + }, + { + .index = CLOCK_MONOTONIC, + .get_time = &ktime_get, + .resolution = KTIME_LOW_RES, + }, + } }; /** @@ -102,7 +109,7 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = * * The function calculates the monotonic clock from the realtime * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by ts. + * in normalized timespec format in the variable pointed to by @ts. */ void ktime_get_ts(struct timespec *ts) { @@ -125,20 +132,35 @@ EXPORT_SYMBOL_GPL(ktime_get_ts); * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. */ -static void hrtimer_get_softirq_time(struct hrtimer_base *base) +static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, tomono; + struct timespec xts; unsigned long seq; do { seq = read_seqbegin(&xtime_lock); - xtim = timespec_to_ktime(xtime); - tomono = timespec_to_ktime(wall_to_monotonic); - +#ifdef CONFIG_NO_HZ + getnstimeofday(&xts); +#else + xts = xtime; +#endif } while (read_seqretry(&xtime_lock, seq)); - base[CLOCK_REALTIME].softirq_time = xtim; - base[CLOCK_MONOTONIC].softirq_time = ktime_add(xtim, tomono); + xtim = timespec_to_ktime(xts); + tomono = timespec_to_ktime(wall_to_monotonic); + base->clock_base[CLOCK_REALTIME].softirq_time = xtim; + base->clock_base[CLOCK_MONOTONIC].softirq_time = + ktime_add(xtim, tomono); +} + +/* + * Helper function to check, whether the timer is running the callback + * function + */ +static inline int hrtimer_callback_running(struct hrtimer *timer) +{ + return timer->state & HRTIMER_STATE_CALLBACK; } /* @@ -147,8 +169,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base) */ #ifdef CONFIG_SMP -#define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0) - /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are @@ -161,19 +181,20 @@ static void hrtimer_get_softirq_time(struct hrtimer_base *base) * possible to set timer->base = NULL and drop the lock: the timer remains * locked. */ -static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, - unsigned long *flags) +static +struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) { - struct hrtimer_base *base; + struct hrtimer_clock_base *base; for (;;) { base = timer->base; if (likely(base != NULL)) { - spin_lock_irqsave(&base->lock, *flags); + spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ - spin_unlock_irqrestore(&base->lock, *flags); + spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } @@ -182,12 +203,14 @@ static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, /* * Switch the timer base to the current CPU when possible. */ -static inline struct hrtimer_base * -switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) +static inline struct hrtimer_clock_base * +switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base) { - struct hrtimer_base *new_base; + struct hrtimer_clock_base *new_base; + struct hrtimer_cpu_base *new_cpu_base; - new_base = &__get_cpu_var(hrtimer_bases)[base->index]; + new_cpu_base = &__get_cpu_var(hrtimer_bases); + new_base = &new_cpu_base->clock_base[base->index]; if (base != new_base) { /* @@ -199,13 +222,13 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) * completed. There is no conflict as we hold the lock until * the timer is enqueued. */ - if (unlikely(base->curr_timer == timer)) + if (unlikely(hrtimer_callback_running(timer))) return base; /* See the comment in lock_timer_base() */ timer->base = NULL; - spin_unlock(&base->lock); - spin_lock(&new_base->lock); + spin_unlock(&base->cpu_base->lock); + spin_lock(&new_base->cpu_base->lock); timer->base = new_base; } return new_base; @@ -213,19 +236,17 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) #else /* CONFIG_SMP */ -#define set_curr_timer(b, t) do { } while (0) - -static inline struct hrtimer_base * +static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { - struct hrtimer_base *base = timer->base; + struct hrtimer_clock_base *base = timer->base; - spin_lock_irqsave(&base->lock, *flags); + spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } -#define switch_hrtimer_base(t, b) (b) +# define switch_hrtimer_base(t, b) (b) #endif /* !CONFIG_SMP */ @@ -256,15 +277,12 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) return ktime_add(kt, tmp); } - -#else /* CONFIG_KTIME_SCALAR */ - # endif /* !CONFIG_KTIME_SCALAR */ /* * Divide a ktime value by a nanosecond value */ -static unsigned long ktime_divns(const ktime_t kt, s64 div) +unsigned long ktime_divns(const ktime_t kt, s64 div) { u64 dclc, inc, dns; int sft = 0; @@ -281,18 +299,311 @@ static unsigned long ktime_divns(const ktime_t kt, s64 div) return (unsigned long) dclc; } - -#else /* BITS_PER_LONG < 64 */ -# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div)) #endif /* BITS_PER_LONG >= 64 */ +/* High resolution timer related functions */ +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer enabled ? + */ +static int hrtimer_hres_enabled __read_mostly = 1; + +/* + * Enable / Disable high resolution mode + */ +static int __init setup_hrtimer_hres(char *str) +{ + if (!strcmp(str, "off")) + hrtimer_hres_enabled = 0; + else if (!strcmp(str, "on")) + hrtimer_hres_enabled = 1; + else + return 0; + return 1; +} + +__setup("highres=", setup_hrtimer_hres); + +/* + * hrtimer_high_res_enabled - query, if the highres mode is enabled + */ +static inline int hrtimer_is_hres_enabled(void) +{ + return hrtimer_hres_enabled; +} + +/* + * Is the high resolution mode active ? + */ +static inline int hrtimer_hres_active(void) +{ + return __get_cpu_var(hrtimer_bases).hres_active; +} + +/* + * Reprogram the event source with checking both queues for the + * next event + * Called with interrupts disabled and base->lock held + */ +static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) +{ + int i; + struct hrtimer_clock_base *base = cpu_base->clock_base; + ktime_t expires; + + cpu_base->expires_next.tv64 = KTIME_MAX; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct hrtimer *timer; + + if (!base->first) + continue; + timer = rb_entry(base->first, struct hrtimer, node); + expires = ktime_sub(timer->expires, base->offset); + if (expires.tv64 < cpu_base->expires_next.tv64) + cpu_base->expires_next = expires; + } + + if (cpu_base->expires_next.tv64 != KTIME_MAX) + tick_program_event(cpu_base->expires_next, 1); +} + +/* + * Shared reprogramming for clock_realtime and clock_monotonic + * + * When a timer is enqueued and expires earlier than the already enqueued + * timers, we have to check, whether it expires earlier than the timer for + * which the clock event device was armed. + * + * Called with interrupts disabled and base->cpu_base.lock held + */ +static int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; + ktime_t expires = ktime_sub(timer->expires, base->offset); + int res; + + /* + * When the callback is running, we do not reprogram the clock event + * device. The timer callback is either running on a different CPU or + * the callback is executed in the hrtimer_interupt context. The + * reprogramming is handled either by the softirq, which called the + * callback or at the end of the hrtimer_interrupt. + */ + if (hrtimer_callback_running(timer)) + return 0; + + if (expires.tv64 >= expires_next->tv64) + return 0; + + /* + * Clockevents returns -ETIME, when the event was in the past. + */ + res = tick_program_event(expires, 0); + if (!IS_ERR_VALUE(res)) + *expires_next = expires; + return res; +} + + +/* + * Retrigger next event is called after clock was set + * + * Called with interrupts disabled via on_each_cpu() + */ +static void retrigger_next_event(void *arg) +{ + struct hrtimer_cpu_base *base; + struct timespec realtime_offset; + unsigned long seq; + + if (!hrtimer_hres_active()) + return; + + do { + seq = read_seqbegin(&xtime_lock); + set_normalized_timespec(&realtime_offset, + -wall_to_monotonic.tv_sec, + -wall_to_monotonic.tv_nsec); + } while (read_seqretry(&xtime_lock, seq)); + + base = &__get_cpu_var(hrtimer_bases); + + /* Adjust CLOCK_REALTIME offset */ + spin_lock(&base->lock); + base->clock_base[CLOCK_REALTIME].offset = + timespec_to_ktime(realtime_offset); + + hrtimer_force_reprogram(base); + spin_unlock(&base->lock); +} + +/* + * Clock realtime was set + * + * Change the offset of the realtime clock vs. the monotonic + * clock. + * + * We might have to reprogram the high resolution timer interrupt. On + * SMP we call the architecture specific code to retrigger _all_ high + * resolution timer interrupts. On UP we just disable interrupts and + * call the high resolution interrupt code. + */ +void clock_was_set(void) +{ + /* Retrigger the CPU local events everywhere */ + on_each_cpu(retrigger_next_event, NULL, 0, 1); +} + +/* + * Check, whether the timer is on the callback pending list + */ +static inline int hrtimer_cb_pending(const struct hrtimer *timer) +{ + return timer->state & HRTIMER_STATE_PENDING; +} + +/* + * Remove a timer from the callback pending list + */ +static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) +{ + list_del_init(&timer->cb_entry); +} + +/* + * Initialize the high resolution related parts of cpu_base + */ +static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) +{ + base->expires_next.tv64 = KTIME_MAX; + base->hres_active = 0; + INIT_LIST_HEAD(&base->cb_pending); +} + +/* + * Initialize the high resolution related parts of a hrtimer + */ +static inline void hrtimer_init_timer_hres(struct hrtimer *timer) +{ + INIT_LIST_HEAD(&timer->cb_entry); +} + +/* + * When High resolution timers are active, try to reprogram. Note, that in case + * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry + * check happens. The timer gets enqueued into the rbtree. The reprogramming + * and expiry check is done in the hrtimer_interrupt or in the softirq. + */ +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { + + /* Timer is expired, act upon the callback mode */ + switch(timer->cb_mode) { + case HRTIMER_CB_IRQSAFE_NO_RESTART: + /* + * We can call the callback from here. No restart + * happens, so no danger of recursion + */ + BUG_ON(timer->function(timer) != HRTIMER_NORESTART); + return 1; + case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: + /* + * This is solely for the sched tick emulation with + * dynamic tick support to ensure that we do not + * restart the tick right on the edge and end up with + * the tick timer in the softirq ! The calling site + * takes care of this. + */ + return 1; + case HRTIMER_CB_IRQSAFE: + case HRTIMER_CB_SOFTIRQ: + /* + * Move everything else into the softirq pending list ! + */ + list_add_tail(&timer->cb_entry, + &base->cpu_base->cb_pending); + timer->state = HRTIMER_STATE_PENDING; + raise_softirq(HRTIMER_SOFTIRQ); + return 1; + default: + BUG(); + } + } + return 0; +} + +/* + * Switch to high resolution mode + */ +static void hrtimer_switch_to_hres(void) +{ + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + unsigned long flags; + + if (base->hres_active) + return; + + local_irq_save(flags); + + if (tick_init_highres()) { + local_irq_restore(flags); + return; + } + base->hres_active = 1; + base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; + base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; + + tick_setup_sched_timer(); + + /* "Retrigger" the interrupt to get things going */ + retrigger_next_event(NULL); + local_irq_restore(flags); + printk(KERN_INFO "Switched to high resolution mode on CPU %d\n", + smp_processor_id()); +} + +#else + +static inline int hrtimer_hres_active(void) { return 0; } +static inline int hrtimer_is_hres_enabled(void) { return 0; } +static inline void hrtimer_switch_to_hres(void) { } +static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + return 0; +} +static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; } +static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { } +static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } + +#endif /* CONFIG_HIGH_RES_TIMERS */ + +#ifdef CONFIG_TIMER_STATS +void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr) +{ + if (timer->start_site) + return; + + timer->start_site = addr; + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +} +#endif + /* * Counterpart to lock_timer_base above: */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { - spin_unlock_irqrestore(&timer->base->lock, *flags); + spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** @@ -342,7 +653,8 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. */ -static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) +static void enqueue_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base, int reprogram) { struct rb_node **link = &base->active.rb_node; struct rb_node *parent = NULL; @@ -368,39 +680,85 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) * Insert the timer to the rbtree and check whether it * replaces the first pending timer */ - rb_link_node(&timer->node, parent, link); - rb_insert_color(&timer->node, &base->active); - if (!base->first || timer->expires.tv64 < - rb_entry(base->first, struct hrtimer, node)->expires.tv64) + rb_entry(base->first, struct hrtimer, node)->expires.tv64) { + /* + * Reprogram the clock event device. When the timer is already + * expired hrtimer_enqueue_reprogram has either called the + * callback or added it to the pending list and raised the + * softirq. + * + * This is a NOP for !HIGHRES + */ + if (reprogram && hrtimer_enqueue_reprogram(timer, base)) + return; + base->first = &timer->node; + } + + rb_link_node(&timer->node, parent, link); + rb_insert_color(&timer->node, &base->active); + /* + * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the + * state of a possibly running callback. + */ + timer->state |= HRTIMER_STATE_ENQUEUED; } /* * __remove_hrtimer - internal function to remove a timer * * Caller must hold the base lock. + * + * High resolution timer mode reprograms the clock event device when the + * timer is the one which expires next. The caller can disable this by setting + * reprogram to zero. This is useful, when the context does a reprogramming + * anyway (e.g. timer interrupt) */ -static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) +static void __remove_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base, + unsigned long newstate, int reprogram) { - /* - * Remove the timer from the rbtree and replace the - * first entry pointer if necessary. - */ - if (base->first == &timer->node) - base->first = rb_next(&timer->node); - rb_erase(&timer->node, &base->active); - rb_set_parent(&timer->node, &timer->node); + /* High res. callback list. NOP for !HIGHRES */ + if (hrtimer_cb_pending(timer)) + hrtimer_remove_cb_pending(timer); + else { + /* + * Remove the timer from the rbtree and replace the + * first entry pointer if necessary. + */ + if (base->first == &timer->node) { + base->first = rb_next(&timer->node); + /* Reprogram the clock event device. if enabled */ + if (reprogram && hrtimer_hres_active()) + hrtimer_force_reprogram(base->cpu_base); + } + rb_erase(&timer->node, &base->active); + } + timer->state = newstate; } /* * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) { - if (hrtimer_active(timer)) { - __remove_hrtimer(timer, base); + if (hrtimer_is_queued(timer)) { + int reprogram; + + /* + * Remove the timer and force reprogramming when high + * resolution mode is active and the timer is on the current + * CPU. If we remove a timer on another CPU, reprogramming is + * skipped. The interrupt event on this CPU is fired and + * reprogramming happens in the interrupt handler. This is a + * rare case and less expensive than a smp call. + */ + timer_stats_hrtimer_clear_start_info(timer); + reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, + reprogram); return 1; } return 0; @@ -419,7 +777,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { - struct hrtimer_base *base, *new_base; + struct hrtimer_clock_base *base, *new_base; unsigned long flags; int ret; @@ -431,7 +789,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) /* Switch the timer base, if necessary: */ new_base = switch_hrtimer_base(timer, base); - if (mode == HRTIMER_REL) { + if (mode == HRTIMER_MODE_REL) { tim = ktime_add(tim, new_base->get_time()); /* * CONFIG_TIME_LOW_RES is a temporary way for architectures @@ -446,7 +804,9 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) } timer->expires = tim; - enqueue_hrtimer(timer, new_base); + timer_stats_hrtimer_set_start_info(timer); + + enqueue_hrtimer(timer, new_base, base == new_base); unlock_hrtimer_base(timer, &flags); @@ -466,13 +826,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start); */ int hrtimer_try_to_cancel(struct hrtimer *timer) { - struct hrtimer_base *base; + struct hrtimer_clock_base *base; unsigned long flags; int ret = -1; base = lock_hrtimer_base(timer, &flags); - if (base->curr_timer != timer) + if (!hrtimer_callback_running(timer)) ret = remove_hrtimer(timer, base); unlock_hrtimer_base(timer, &flags); @@ -508,19 +868,19 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); */ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { - struct hrtimer_base *base; + struct hrtimer_clock_base *base; unsigned long flags; ktime_t rem; base = lock_hrtimer_base(timer, &flags); - rem = ktime_sub(timer->expires, timer->base->get_time()); + rem = ktime_sub(timer->expires, base->get_time()); unlock_hrtimer_base(timer, &flags); return rem; } EXPORT_SYMBOL_GPL(hrtimer_get_remaining); -#ifdef CONFIG_NO_IDLE_HZ +#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) /** * hrtimer_get_next_event - get the time until next expiry event * @@ -529,26 +889,31 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); */ ktime_t hrtimer_get_next_event(void) { - struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; unsigned long flags; int i; - for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { - struct hrtimer *timer; + spin_lock_irqsave(&cpu_base->lock, flags); - spin_lock_irqsave(&base->lock, flags); - if (!base->first) { - spin_unlock_irqrestore(&base->lock, flags); - continue; + if (!hrtimer_hres_active()) { + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct hrtimer *timer; + + if (!base->first) + continue; + + timer = rb_entry(base->first, struct hrtimer, node); + delta.tv64 = timer->expires.tv64; + delta = ktime_sub(delta, base->get_time()); + if (delta.tv64 < mindelta.tv64) + mindelta.tv64 = delta.tv64; } - timer = rb_entry(base->first, struct hrtimer, node); - delta.tv64 = timer->expires.tv64; - spin_unlock_irqrestore(&base->lock, flags); - delta = ktime_sub(delta, base->get_time()); - if (delta.tv64 < mindelta.tv64) - mindelta.tv64 = delta.tv64; } + + spin_unlock_irqrestore(&cpu_base->lock, flags); + if (mindelta.tv64 < 0) mindelta.tv64 = 0; return mindelta; @@ -564,17 +929,23 @@ ktime_t hrtimer_get_next_event(void) void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { - struct hrtimer_base *bases; + struct hrtimer_cpu_base *cpu_base; memset(timer, 0, sizeof(struct hrtimer)); - bases = __raw_get_cpu_var(hrtimer_bases); + cpu_base = &__raw_get_cpu_var(hrtimer_bases); - if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) + if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) clock_id = CLOCK_MONOTONIC; - timer->base = &bases[clock_id]; - rb_set_parent(&timer->node, &timer->node); + timer->base = &cpu_base->clock_base[clock_id]; + hrtimer_init_timer_hres(timer); + +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif } EXPORT_SYMBOL_GPL(hrtimer_init); @@ -583,26 +954,164 @@ EXPORT_SYMBOL_GPL(hrtimer_init); * @which_clock: which clock to query * @tp: pointer to timespec variable to store the resolution * - * Store the resolution of the clock selected by which_clock in the - * variable pointed to by tp. + * Store the resolution of the clock selected by @which_clock in the + * variable pointed to by @tp. */ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { - struct hrtimer_base *bases; + struct hrtimer_cpu_base *cpu_base; - bases = __raw_get_cpu_var(hrtimer_bases); - *tp = ktime_to_timespec(bases[which_clock].resolution); + cpu_base = &__raw_get_cpu_var(hrtimer_bases); + *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); return 0; } EXPORT_SYMBOL_GPL(hrtimer_get_res); +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +void hrtimer_interrupt(struct clock_event_device *dev) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_clock_base *base; + ktime_t expires_next, now; + int i, raise = 0; + + BUG_ON(!cpu_base->hres_active); + cpu_base->nr_events++; + dev->next_event.tv64 = KTIME_MAX; + + retry: + now = ktime_get(); + + expires_next.tv64 = KTIME_MAX; + + base = cpu_base->clock_base; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + ktime_t basenow; + struct rb_node *node; + + spin_lock(&cpu_base->lock); + + basenow = ktime_add(now, base->offset); + + while ((node = base->first)) { + struct hrtimer *timer; + + timer = rb_entry(node, struct hrtimer, node); + + if (basenow.tv64 < timer->expires.tv64) { + ktime_t expires; + + expires = ktime_sub(timer->expires, + base->offset); + if (expires.tv64 < expires_next.tv64) + expires_next = expires; + break; + } + + /* Move softirq callbacks to the pending list */ + if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { + __remove_hrtimer(timer, base, + HRTIMER_STATE_PENDING, 0); + list_add_tail(&timer->cb_entry, + &base->cpu_base->cb_pending); + raise = 1; + continue; + } + + __remove_hrtimer(timer, base, + HRTIMER_STATE_CALLBACK, 0); + timer_stats_account_hrtimer(timer); + + /* + * Note: We clear the CALLBACK bit after + * enqueue_hrtimer to avoid reprogramming of + * the event hardware. This happens at the end + * of this function anyway. + */ + if (timer->function(timer) != HRTIMER_NORESTART) { + BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); + enqueue_hrtimer(timer, base, 0); + } + timer->state &= ~HRTIMER_STATE_CALLBACK; + } + spin_unlock(&cpu_base->lock); + base++; + } + + cpu_base->expires_next = expires_next; + + /* Reprogramming necessary ? */ + if (expires_next.tv64 != KTIME_MAX) { + if (tick_program_event(expires_next, 0)) + goto retry; + } + + /* Raise softirq ? */ + if (raise) + raise_softirq(HRTIMER_SOFTIRQ); +} + +static void run_hrtimer_softirq(struct softirq_action *h) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + spin_lock_irq(&cpu_base->lock); + + while (!list_empty(&cpu_base->cb_pending)) { + enum hrtimer_restart (*fn)(struct hrtimer *); + struct hrtimer *timer; + int restart; + + timer = list_entry(cpu_base->cb_pending.next, + struct hrtimer, cb_entry); + + timer_stats_account_hrtimer(timer); + + fn = timer->function; + __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); + spin_unlock_irq(&cpu_base->lock); + + restart = fn(timer); + + spin_lock_irq(&cpu_base->lock); + + timer->state &= ~HRTIMER_STATE_CALLBACK; + if (restart == HRTIMER_RESTART) { + BUG_ON(hrtimer_active(timer)); + /* + * Enqueue the timer, allow reprogramming of the event + * device + */ + enqueue_hrtimer(timer, timer->base, 1); + } else if (hrtimer_active(timer)) { + /* + * If the timer was rearmed on another CPU, reprogram + * the event device. + */ + if (timer->base->first == &timer->node) + hrtimer_reprogram(timer, timer->base); + } + } + spin_unlock_irq(&cpu_base->lock); +} + +#endif /* CONFIG_HIGH_RES_TIMERS */ + /* * Expire the per base hrtimer-queue: */ -static inline void run_hrtimer_queue(struct hrtimer_base *base) +static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, + int index) { struct rb_node *node; + struct hrtimer_clock_base *base = &cpu_base->clock_base[index]; if (!base->first) return; @@ -610,53 +1119,72 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base) if (base->get_softirq_time) base->softirq_time = base->get_softirq_time(); - spin_lock_irq(&base->lock); + spin_lock_irq(&cpu_base->lock); while ((node = base->first)) { struct hrtimer *timer; - int (*fn)(struct hrtimer *); + enum hrtimer_restart (*fn)(struct hrtimer *); int restart; timer = rb_entry(node, struct hrtimer, node); if (base->softirq_time.tv64 <= timer->expires.tv64) break; + timer_stats_account_hrtimer(timer); + fn = timer->function; - set_curr_timer(base, timer); - __remove_hrtimer(timer, base); - spin_unlock_irq(&base->lock); + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + spin_unlock_irq(&cpu_base->lock); restart = fn(timer); - spin_lock_irq(&base->lock); + spin_lock_irq(&cpu_base->lock); + timer->state &= ~HRTIMER_STATE_CALLBACK; if (restart != HRTIMER_NORESTART) { BUG_ON(hrtimer_active(timer)); - enqueue_hrtimer(timer, base); + enqueue_hrtimer(timer, base, 0); } } - set_curr_timer(base, NULL); - spin_unlock_irq(&base->lock); + spin_unlock_irq(&cpu_base->lock); } /* * Called from timer softirq every jiffy, expire hrtimers: + * + * For HRT its the fall back code to run the softirq in the timer + * softirq context in case the hrtimer initialization failed or has + * not been done yet. */ void hrtimer_run_queues(void) { - struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); int i; - hrtimer_get_softirq_time(base); + if (hrtimer_hres_active()) + return; + + /* + * This _is_ ugly: We have to check in the softirq context, + * whether we can switch to highres and / or nohz mode. The + * clocksource switch happens in the timer interrupt with + * xtime_lock held. Notification from there only sets the + * check bit in the tick_oneshot code, otherwise we might + * deadlock vs. xtime_lock. + */ + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) + hrtimer_switch_to_hres(); - for (i = 0; i < MAX_HRTIMER_BASES; i++) - run_hrtimer_queue(&base[i]); + hrtimer_get_softirq_time(cpu_base); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + run_hrtimer_queue(cpu_base, i); } /* * Sleep related functions: */ -static int hrtimer_wakeup(struct hrtimer *timer) +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) { struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer); @@ -673,6 +1201,9 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) { sl->timer.function = hrtimer_wakeup; sl->task = task; +#ifdef CONFIG_HIGH_RES_TIMERS + sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; +#endif } static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) @@ -683,10 +1214,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod set_current_state(TASK_INTERRUPTIBLE); hrtimer_start(&t->timer, t->timer.expires, mode); - schedule(); + if (likely(t->task)) + schedule(); hrtimer_cancel(&t->timer); - mode = HRTIMER_ABS; + mode = HRTIMER_MODE_ABS; } while (t->task && !signal_pending(current)); @@ -702,10 +1234,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) restart->fn = do_no_restart_syscall; - hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS); + hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS); t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; - if (do_nanosleep(&t, HRTIMER_ABS)) + if (do_nanosleep(&t, HRTIMER_MODE_ABS)) return 0; rmtp = (struct timespec __user *) restart->arg1; @@ -738,7 +1270,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, return 0; /* Absolute timers do not update the rmtp value and restart: */ - if (mode == HRTIMER_ABS) + if (mode == HRTIMER_MODE_ABS) return -ERESTARTNOHAND; if (rmtp) { @@ -771,7 +1303,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) if (!timespec_valid(&tu)) return -EINVAL; - return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } /* @@ -779,56 +1311,60 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) */ static void __devinit init_hrtimers_cpu(int cpu) { - struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { - spin_lock_init(&base->lock); - lockdep_set_class(&base->lock, &base->lock_key); - } + spin_lock_init(&cpu_base->lock); + lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + cpu_base->clock_base[i].cpu_base = cpu_base; + + hrtimer_init_hres(cpu_base); } #ifdef CONFIG_HOTPLUG_CPU -static void migrate_hrtimer_list(struct hrtimer_base *old_base, - struct hrtimer_base *new_base) +static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct rb_node *node; while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); - __remove_hrtimer(timer, old_base); + BUG_ON(hrtimer_callback_running(timer)); + __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); timer->base = new_base; - enqueue_hrtimer(timer, new_base); + /* + * Enqueue the timer. Allow reprogramming of the event device + */ + enqueue_hrtimer(timer, new_base, 1); } } static void migrate_hrtimers(int cpu) { - struct hrtimer_base *old_base, *new_base; + struct hrtimer_cpu_base *old_base, *new_base; int i; BUG_ON(cpu_online(cpu)); - old_base = per_cpu(hrtimer_bases, cpu); - new_base = get_cpu_var(hrtimer_bases); - - local_irq_disable(); + old_base = &per_cpu(hrtimer_bases, cpu); + new_base = &get_cpu_var(hrtimer_bases); - for (i = 0; i < MAX_HRTIMER_BASES; i++) { + tick_cancel_sched_timer(cpu); - spin_lock(&new_base->lock); - spin_lock(&old_base->lock); - - BUG_ON(old_base->curr_timer); + local_irq_disable(); - migrate_hrtimer_list(old_base, new_base); + spin_lock(&new_base->lock); + spin_lock(&old_base->lock); - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - old_base++; - new_base++; + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i]); } + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); local_irq_enable(); put_cpu_var(hrtimer_bases); @@ -848,6 +1384,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); migrate_hrtimers(cpu); break; #endif @@ -868,5 +1405,8 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); +#ifdef CONFIG_HIGH_RES_TIMERS + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL); +#endif } diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 1dab0ac..681c52d 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,5 +1,5 @@ -obj-y := handle.o manage.o spurious.o resend.o chip.o +obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d27b258..0133f4f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -39,6 +39,7 @@ void dynamic_irq_init(unsigned int irq) desc->chip = &no_irq_chip; desc->handle_irq = handle_bad_irq; desc->depth = 1; + desc->msi_desc = NULL; desc->handler_data = NULL; desc->chip_data = NULL; desc->action = NULL; @@ -74,6 +75,9 @@ void dynamic_irq_cleanup(unsigned int irq) WARN_ON(1); return; } + desc->msi_desc = NULL; + desc->handler_data = NULL; + desc->chip_data = NULL; desc->handle_irq = handle_bad_irq; desc->chip = &no_irq_chip; spin_unlock_irqrestore(&desc->lock, flags); @@ -162,6 +166,30 @@ int set_irq_data(unsigned int irq, void *data) EXPORT_SYMBOL(set_irq_data); /** + * set_irq_data - set irq type data for an irq + * @irq: Interrupt number + * @entry: Pointer to MSI descriptor data + * + * Set the hardware irq controller data for an irq + */ +int set_irq_msi(unsigned int irq, struct msi_desc *entry) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR + "Trying to install msi data for IRQ%d\n", irq); + return -EINVAL; + } + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock, flags); + desc->msi_desc = entry; + spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} + +/** * set_irq_chip_data - set irq chip data for an irq * @irq: Interrupt number * @data: Pointer to chip specific data @@ -202,10 +230,6 @@ static void default_enable(unsigned int irq) */ static void default_disable(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; - - if (!(desc->status & IRQ_DELAYED_DISABLE)) - desc->chip->mask(irq); } /* @@ -270,13 +294,18 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_cpu(cpu).irqs[irq]++; action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!action || (desc->status & IRQ_DISABLED))) { + if (desc->chip->mask) + desc->chip->mask(irq); + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + desc->status |= IRQ_PENDING; goto out_unlock; + } + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING); desc->status |= IRQ_INPROGRESS; spin_unlock(&desc->lock); @@ -368,11 +397,13 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) /* * If its disabled or no action available - * keep it masked and get out of here + * then mask it and get out of here: */ action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) { desc->status |= IRQ_PENDING; + if (desc->chip->mask) + desc->chip->mask(irq); goto out; } @@ -534,10 +565,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, /* Uninstall? */ if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) { - desc->chip->mask(irq); - desc->chip->ack(irq); - } + if (desc->chip != &no_irq_chip) + mask_ack_irq(desc, irq); desc->status |= IRQ_DISABLED; desc->depth = 1; } diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c new file mode 100644 index 0000000..85a430d --- /dev/null +++ b/kernel/irq/devres.c @@ -0,0 +1,88 @@ +#include <linux/module.h> +#include <linux/interrupt.h> + +/* + * Device resource management aware IRQ request/free implementation. + */ +struct irq_devres { + unsigned int irq; + void *dev_id; +}; + +static void devm_irq_release(struct device *dev, void *res) +{ + struct irq_devres *this = res; + + free_irq(this->irq, this->dev_id); +} + +static int devm_irq_match(struct device *dev, void *res, void *data) +{ + struct irq_devres *this = res, *match = data; + + return this->irq == match->irq && this->dev_id == match->dev_id; +} + +/** + * devm_request_irq - allocate an interrupt line for a managed device + * @dev: device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * Except for the extra @dev argument, this function takes the + * same arguments and performs the same function as + * request_irq(). IRQs requested with this function will be + * automatically freed on driver detach. + * + * If an IRQ allocated with this function needs to be freed + * separately, dev_free_irq() must be used. + */ +int devm_request_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + struct irq_devres *dr; + int rc; + + dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres), + GFP_KERNEL); + if (!dr) + return -ENOMEM; + + rc = request_irq(irq, handler, irqflags, devname, dev_id); + if (rc) { + kfree(dr); + return rc; + } + + dr->irq = irq; + dr->dev_id = dev_id; + devres_add(dev, dr); + + return 0; +} +EXPORT_SYMBOL(devm_request_irq); + +/** + * devm_free_irq - free an interrupt + * @dev: device to free interrupt for + * @irq: Interrupt line to free + * @dev_id: Device identity to free + * + * Except for the extra @dev argument, this function takes the + * same arguments and performs the same function as free_irq(). + * This function instead of free_irq() should be used to manually + * free IRQs allocated with dev_request_irq(). + */ +void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) +{ + struct irq_devres match_data = { irq, dev_id }; + + free_irq(irq, dev_id); + WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, + &match_data)); +} +EXPORT_SYMBOL(devm_free_irq); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8b961ad..5597c15 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -38,6 +38,46 @@ void synchronize_irq(unsigned int irq) } EXPORT_SYMBOL(synchronize_irq); +/** + * irq_can_set_affinity - Check if the affinity of a given irq can be set + * @irq: Interrupt to check + * + */ +int irq_can_set_affinity(unsigned int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || + !desc->chip->set_affinity) + return 0; + + return 1; +} + +/** + * irq_set_affinity - Set the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @cpumask: cpumask + * + */ +int irq_set_affinity(unsigned int irq, cpumask_t cpumask) +{ + struct irq_desc *desc = irq_desc + irq; + + if (!desc->chip->set_affinity) + return -EINVAL; + + set_balance_irq_affinity(irq, cpumask); + +#ifdef CONFIG_GENERIC_PENDING_IRQ + set_pending_irq(irq, cpumask); +#else + desc->affinity = cpumask; + desc->chip->set_affinity(irq, cpumask); +#endif + return 0; +} + #endif /** @@ -281,6 +321,10 @@ int setup_irq(unsigned int irq, struct irqaction *new) if (new->flags & IRQF_PERCPU) desc->status |= IRQ_PER_CPU; #endif + /* Exclude IRQ from balancing */ + if (new->flags & IRQF_NOBALANCING) + desc->status |= IRQ_NO_BALANCING; + if (!shared) { irq_chip_set_defaults(desc->chip); @@ -328,12 +372,14 @@ int setup_irq(unsigned int irq, struct irqaction *new) return 0; mismatch: +#ifdef CONFIG_DEBUG_SHIRQ if (!(new->flags & IRQF_PROBE_SHARED)) { printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); if (old_name) printk(KERN_ERR "current handler: %s\n", old_name); dump_stack(); } +#endif spin_unlock_irqrestore(&desc->lock, flags); return -EBUSY; } @@ -357,6 +403,7 @@ void free_irq(unsigned int irq, void *dev_id) struct irq_desc *desc; struct irqaction **p; unsigned long flags; + irqreturn_t (*handler)(int, void *) = NULL; WARN_ON(in_interrupt()); if (irq >= NR_IRQS) @@ -396,6 +443,8 @@ void free_irq(unsigned int irq, void *dev_id) /* Make sure it's not being used on another CPU */ synchronize_irq(irq); + if (action->flags & IRQF_SHARED) + handler = action->handler; kfree(action); return; } @@ -403,6 +452,17 @@ void free_irq(unsigned int irq, void *dev_id) spin_unlock_irqrestore(&desc->lock, flags); return; } +#ifdef CONFIG_DEBUG_SHIRQ + if (handler) { + /* + * It's a shared IRQ -- the driver ought to be prepared for it + * to happen even now it's being freed, so let's make sure.... + * We do this after actually deregistering it, to make sure that + * a 'real' IRQ doesn't run in parallel with our fake + */ + handler(irq, dev_id); + } +#endif } EXPORT_SYMBOL(free_irq); @@ -445,7 +505,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, /* * Lockdep wants atomic interrupt handlers: */ - irqflags |= SA_INTERRUPT; + irqflags |= IRQF_DISABLED; #endif /* * Sanity-check: shared interrupts must pass in a real dev-ID, @@ -475,6 +535,25 @@ int request_irq(unsigned int irq, irq_handler_t handler, select_smp_affinity(irq); +#ifdef CONFIG_DEBUG_SHIRQ + if (irqflags & IRQF_SHARED) { + /* + * It's a shared IRQ -- the driver ought to be prepared for it + * to happen immediately, so let's make sure.... + * We do this before actually registering it, to make sure that + * a 'real' IRQ doesn't run in parallel with our fake + */ + if (irqflags & IRQF_DISABLED) { + unsigned long flags; + + local_irq_save(flags); + handler(irq, dev_id); + local_irq_restore(flags); + } else + handler(irq, dev_id); + } +#endif + retval = setup_irq(irq, action); if (retval) kfree(action); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 4baa3bb..77b7acc 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -65,12 +65,11 @@ void move_native_irq(int irq) if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; - if (likely(!(desc->status & IRQ_DISABLED))) - desc->chip->disable(irq); + if (unlikely(desc->status & IRQ_DISABLED)) + return; + desc->chip->mask(irq); move_masked_irq(irq); - - if (likely(!(desc->status & IRQ_DISABLED))) - desc->chip->enable(irq); + desc->chip->unmask(irq); } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 61f5c71..2db91eb 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -16,26 +16,6 @@ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP -#ifdef CONFIG_GENERIC_PENDING_IRQ -void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) -{ - set_balance_irq_affinity(irq, mask_val); - - /* - * Save these away for later use. Re-progam when the - * interrupt is pending - */ - set_pending_irq(irq, mask_val); -} -#else -void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) -{ - set_balance_irq_affinity(irq, mask_val); - irq_desc[irq].affinity = mask_val; - irq_desc[irq].chip->set_affinity(irq, mask_val); -} -#endif - static int irq_affinity_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -55,7 +35,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, cpumask_t new_value, tmp; if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || - CHECK_IRQ_PER_CPU(irq_desc[irq].status)) + irq_balancing_disabled(irq)) return -EIO; err = cpumask_parse_user(buffer, count, new_value); @@ -73,7 +53,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, code to set default SMP affinity. */ return select_smp_affinity(irq) ? -EINVAL : full_count; - proc_set_irq_affinity(irq, new_value); + irq_set_affinity(irq, new_value); return full_count; } @@ -136,7 +116,6 @@ void register_irq_proc(unsigned int irq) entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); if (entry) { - entry->nlink = 1; entry->data = (void *)(long)irq; entry->read_proc = irq_affinity_read_proc; entry->write_proc = irq_affinity_write_proc; diff --git a/kernel/itimer.c b/kernel/itimer.c index 204ed79..307c6a6 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -128,18 +128,13 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) /* * The timer is automagically restarted, when interval != 0 */ -int it_real_fn(struct hrtimer *timer) +enum hrtimer_restart it_real_fn(struct hrtimer *timer) { struct signal_struct *sig = container_of(timer, struct signal_struct, real_timer); send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); - if (sig->it_real_incr.tv64 != 0) { - hrtimer_forward(timer, timer->base->softirq_time, - sig->it_real_incr); - return HRTIMER_RESTART; - } return HRTIMER_NORESTART; } @@ -231,11 +226,14 @@ again: spin_unlock_irq(&tsk->sighand->siglock); goto again; } - tsk->signal->it_real_incr = - timeval_to_ktime(value->it_interval); expires = timeval_to_ktime(value->it_value); - if (expires.tv64 != 0) - hrtimer_start(timer, expires, HRTIMER_REL); + if (expires.tv64 != 0) { + tsk->signal->it_real_incr = + timeval_to_ktime(value->it_interval); + hrtimer_start(timer, expires, HRTIMER_MODE_REL); + } else + tsk->signal->it_real_incr.tv64 = 0; + spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 5d1d9073..cee4191 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -32,8 +32,8 @@ * @gfp_mask: get_free_pages mask, passed to kmalloc() * @lock: the lock to be used to protect the fifo buffer * - * Do NOT pass the kfifo to kfifo_free() after use ! Simply free the - * struct kfifo with kfree(). + * Do NOT pass the kfifo to kfifo_free() after use! Simply free the + * &struct kfifo with kfree(). */ struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, gfp_t gfp_mask, spinlock_t *lock) @@ -108,7 +108,7 @@ EXPORT_SYMBOL(kfifo_free); * @buffer: the data to be added. * @len: the length of the data to be added. * - * This function copies at most 'len' bytes from the 'buffer' into + * This function copies at most @len bytes from the @buffer into * the FIFO depending on the free space, and returns the number of * bytes copied. * @@ -155,8 +155,8 @@ EXPORT_SYMBOL(__kfifo_put); * @buffer: where the data must be copied. * @len: the size of the destination buffer. * - * This function copies at most 'len' bytes from the FIFO into the - * 'buffer' and returns the number of copied bytes. + * This function copies at most @len bytes from the FIFO into the + * @buffer and returns the number of copied bytes. * * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. diff --git a/kernel/kmod.c b/kernel/kmod.c index 3a7379a..7962761 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -217,7 +217,10 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - complete(sub_info->complete); + if (sub_info->wait < 0) + kfree(sub_info); + else + complete(sub_info->complete); return 0; } @@ -239,6 +242,9 @@ static void __call_usermodehelper(struct work_struct *work) pid = kernel_thread(____call_usermodehelper, sub_info, CLONE_VFORK | SIGCHLD); + if (wait < 0) + return; + if (pid < 0) { sub_info->retval = pid; complete(sub_info->complete); @@ -253,6 +259,9 @@ static void __call_usermodehelper(struct work_struct *work) * @envp: null-terminated environment list * @session_keyring: session keyring for process (NULL for an empty keyring) * @wait: wait for the application to finish and return status. + * when -1 don't wait at all, but you get no useful error back when + * the program couldn't be exec'ed. This makes it safe to call + * from interrupt context. * * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of keventd. @@ -265,17 +274,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, struct key *session_keyring, int wait) { DECLARE_COMPLETION_ONSTACK(done); - struct subprocess_info sub_info = { - .work = __WORK_INITIALIZER(sub_info.work, - __call_usermodehelper), - .complete = &done, - .path = path, - .argv = argv, - .envp = envp, - .ring = session_keyring, - .wait = wait, - .retval = 0, - }; + struct subprocess_info *sub_info; + int retval; if (!khelper_wq) return -EBUSY; @@ -283,9 +283,25 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, if (path[0] == '\0') return 0; - queue_work(khelper_wq, &sub_info.work); + sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); + if (!sub_info) + return -ENOMEM; + + INIT_WORK(&sub_info->work, __call_usermodehelper); + sub_info->complete = &done; + sub_info->path = path; + sub_info->argv = argv; + sub_info->envp = envp; + sub_info->ring = session_keyring; + sub_info->wait = wait; + + queue_work(khelper_wq, &sub_info->work); + if (wait < 0) /* task has freed sub_info */ + return 0; wait_for_completion(&done); - return sub_info.retval; + retval = sub_info->retval; + kfree(sub_info); + return retval; } EXPORT_SYMBOL(call_usermodehelper_keys); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6fcf8dd..d25a9ad 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -39,6 +39,8 @@ #include <linux/moduleloader.h> #include <linux/kallsyms.h> #include <linux/freezer.h> +#include <linux/seq_file.h> +#include <linux/debugfs.h> #include <asm-generic/sections.h> #include <asm/cacheflush.h> #include <asm/errno.h> @@ -778,6 +780,12 @@ int __kprobes register_kretprobe(struct kretprobe *rp) return -ENOSYS; } +static int __kprobes pre_handler_kretprobe(struct kprobe *p, + struct pt_regs *regs) +{ + return 0; +} + #endif /* ARCH_SUPPORTS_KRETPROBES */ void __kprobes unregister_kretprobe(struct kretprobe *rp) @@ -815,7 +823,109 @@ static int __init init_kprobes(void) return err; } -__initcall(init_kprobes); +#ifdef CONFIG_DEBUG_FS +static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, + const char *sym, int offset,char *modname) +{ + char *kprobe_type; + + if (p->pre_handler == pre_handler_kretprobe) + kprobe_type = "r"; + else if (p->pre_handler == setjmp_pre_handler) + kprobe_type = "j"; + else + kprobe_type = "k"; + if (sym) + seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type, + sym, offset, (modname ? modname : " ")); + else + seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr); +} + +static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) +{ + return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL; +} + +static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + if (*pos >= KPROBE_TABLE_SIZE) + return NULL; + return pos; +} + +static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p, *kp; + const char *sym = NULL; + unsigned int i = *(loff_t *) v; + unsigned long size, offset = 0; + char *modname, namebuf[128]; + + head = &kprobe_table[i]; + preempt_disable(); + hlist_for_each_entry_rcu(p, node, head, hlist) { + sym = kallsyms_lookup((unsigned long)p->addr, &size, + &offset, &modname, namebuf); + if (p->pre_handler == aggr_pre_handler) { + list_for_each_entry_rcu(kp, &p->list, list) + report_probe(pi, kp, sym, offset, modname); + } else + report_probe(pi, p, sym, offset, modname); + } + preempt_enable(); + return 0; +} + +static struct seq_operations kprobes_seq_ops = { + .start = kprobe_seq_start, + .next = kprobe_seq_next, + .stop = kprobe_seq_stop, + .show = show_kprobe_addr +}; + +static int __kprobes kprobes_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &kprobes_seq_ops); +} + +static struct file_operations debugfs_kprobes_operations = { + .open = kprobes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __kprobes debugfs_kprobe_init(void) +{ + struct dentry *dir, *file; + + dir = debugfs_create_dir("kprobes", NULL); + if (!dir) + return -ENOMEM; + + file = debugfs_create_file("list", 0444, dir , 0 , + &debugfs_kprobes_operations); + if (!file) { + debugfs_remove(dir); + return -ENOMEM; + } + + return 0; +} + +late_initcall(debugfs_kprobe_init); +#endif /* CONFIG_DEBUG_FS */ + +module_init(init_kprobes); EXPORT_SYMBOL_GPL(register_kprobe); EXPORT_SYMBOL_GPL(unregister_kprobe); @@ -824,4 +934,3 @@ EXPORT_SYMBOL_GPL(unregister_jprobe); EXPORT_SYMBOL_GPL(jprobe_return); EXPORT_SYMBOL_GPL(register_kretprobe); EXPORT_SYMBOL_GPL(unregister_kretprobe); - diff --git a/kernel/kthread.c b/kernel/kthread.c index 1db8c72..87c50cc 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -50,7 +50,7 @@ static struct kthread_stop_info kthread_stop_info; /** * kthread_should_stop - should this kthread return now? * - * When someone calls kthread_stop on your kthread, it will be woken + * When someone calls kthread_stop() on your kthread, it will be woken * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ @@ -143,7 +143,7 @@ static void keventd_create_kthread(struct work_struct *work) * it. See also kthread_run(), kthread_create_on_cpu(). * * When woken, the thread will run @threadfn() with @data as its - * argument. @threadfn can either call do_exit() directly if it is a + * argument. @threadfn() can either call do_exit() directly if it is a * standalone thread for which noone will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero @@ -192,7 +192,7 @@ EXPORT_SYMBOL(kthread_create); * * Description: This function is equivalent to set_cpus_allowed(), * except that @cpu doesn't need to be online, and the thread must be - * stopped (i.e., just returned from kthread_create(). + * stopped (i.e., just returned from kthread_create()). */ void kthread_bind(struct task_struct *k, unsigned int cpu) { diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 509efd4..a08a172 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -70,6 +70,9 @@ static int graph_lock(void) static inline int graph_unlock(void) { + if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) + return DEBUG_LOCKS_WARN_ON(1); + __raw_spin_unlock(&lockdep_lock); return 0; } @@ -487,7 +490,7 @@ static void print_lock_dependencies(struct lock_class *class, int depth) * Add a new dependency to the head of the list: */ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, - struct list_head *head, unsigned long ip) + struct list_head *head, unsigned long ip, int distance) { struct lock_list *entry; /* @@ -499,6 +502,7 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, return 0; entry->class = this; + entry->distance = distance; if (!save_trace(&entry->trace)) return 0; @@ -712,6 +716,9 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) struct lock_list *entry; int ret; + if (!__raw_spin_is_locked(&lockdep_lock)) + return DEBUG_LOCKS_WARN_ON(1); + if (depth > max_recursion_depth) max_recursion_depth = depth; if (depth >= RECURSION_LIMIT) @@ -900,7 +907,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) + struct held_lock *next, int distance) { struct lock_list *entry; int ret; @@ -978,8 +985,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * L2 added to its dependency list, due to the first chain.) */ list_for_each_entry(entry, &prev->class->locks_after, entry) { - if (entry->class == next->class) + if (entry->class == next->class) { + if (distance == 1) + entry->distance = 1; return 2; + } } /* @@ -987,12 +997,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * to the previous lock's dependency list: */ ret = add_lock_to_list(prev->class, next->class, - &prev->class->locks_after, next->acquire_ip); + &prev->class->locks_after, next->acquire_ip, distance); + if (!ret) return 0; ret = add_lock_to_list(next->class, prev->class, - &next->class->locks_before, next->acquire_ip); + &next->class->locks_before, next->acquire_ip, distance); if (!ret) return 0; @@ -1040,13 +1051,14 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) goto out_bug; for (;;) { + int distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth-1; /* * Only non-recursive-read entries get new dependencies * added: */ if (hlock->read != 2) { - if (!check_prev_add(curr, hlock, next)) + if (!check_prev_add(curr, hlock, next, distance)) return 0; /* * Stop after the first non-trylock entry, @@ -1293,7 +1305,8 @@ out_unlock_set: if (!subclass || force) lock->class_cache = class; - DEBUG_LOCKS_WARN_ON(class->subclass != subclass); + if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) + return NULL; return class; } @@ -1308,7 +1321,8 @@ static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return 0; /* * We can walk it lock-free, because entries only get added * to the hash: @@ -1394,7 +1408,9 @@ static void check_chain_key(struct task_struct *curr) return; } id = hlock->class - lock_classes; - DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS); + if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + return; + if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) chain_key = 0; @@ -2205,15 +2221,24 @@ out_calc_hash: if (!check_prevs_add(curr, hlock)) return 0; graph_unlock(); - } + } else + /* after lookup_chain_cache(): */ + if (unlikely(!debug_locks)) + return 0; + curr->lockdep_depth++; check_chain_key(curr); +#ifdef CONFIG_DEBUG_LOCKDEP + if (unlikely(!debug_locks)) + return 0; +#endif if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { debug_locks_off(); printk("BUG: MAX_LOCK_DEPTH too low!\n"); printk("turning off the locking correctness validator.\n"); return 0; } + if (unlikely(curr->lockdep_depth > max_lockdep_depth)) max_lockdep_depth = curr->lockdep_depth; @@ -2764,4 +2789,3 @@ void debug_show_held_locks(struct task_struct *task) } EXPORT_SYMBOL_GPL(debug_show_held_locks); - diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index b554b40..58f35e5 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -10,7 +10,6 @@ * Code for /proc/lockdep and /proc/lockdep_stats: * */ -#include <linux/sched.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -77,12 +76,29 @@ static unsigned long count_backward_deps(struct lock_class *class) return ret; } +static void print_name(struct seq_file *m, struct lock_class *class) +{ + char str[128]; + const char *name = class->name; + + if (!name) { + name = __get_key_name(class->key, str); + seq_printf(m, "%s", name); + } else{ + seq_printf(m, "%s", name); + if (class->name_version > 1) + seq_printf(m, "#%d", class->name_version); + if (class->subclass) + seq_printf(m, "/%d", class->subclass); + } +} + static int l_show(struct seq_file *m, void *v) { unsigned long nr_forward_deps, nr_backward_deps; struct lock_class *class = m->private; - char str[128], c1, c2, c3, c4; - const char *name; + struct lock_list *entry; + char c1, c2, c3, c4; seq_printf(m, "%p", class->key); #ifdef CONFIG_DEBUG_LOCKDEP @@ -97,16 +113,16 @@ static int l_show(struct seq_file *m, void *v) get_usage_chars(class, &c1, &c2, &c3, &c4); seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); - name = class->name; - if (!name) { - name = __get_key_name(class->key, str); - seq_printf(m, ": %s", name); - } else{ - seq_printf(m, ": %s", name); - if (class->name_version > 1) - seq_printf(m, "#%d", class->name_version); - if (class->subclass) - seq_printf(m, "/%d", class->subclass); + seq_printf(m, ": "); + print_name(m, class); + seq_puts(m, "\n"); + + list_for_each_entry(entry, &class->locks_after, entry) { + if (entry->distance == 1) { + seq_printf(m, " -> [%p] ", entry->class); + print_name(m, entry->class); + seq_puts(m, "\n"); + } } seq_puts(m, "\n"); @@ -227,7 +243,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) sum_forward_deps += count_forward_deps(class); } -#ifdef CONFIG_LOCKDEP_DEBUG +#ifdef CONFIG_DEBUG_LOCKDEP DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); #endif seq_printf(m, " lock-classes: %11lu [max: %lu]\n", diff --git a/kernel/module.c b/kernel/module.c index d0f2260..f77e893 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -537,6 +537,8 @@ static int already_uses(struct module *a, struct module *b) static int use_module(struct module *a, struct module *b) { struct module_use *use; + int no_warn; + if (b == NULL || already_uses(a, b)) return 1; if (!strong_try_module_get(b)) @@ -552,6 +554,7 @@ static int use_module(struct module *a, struct module *b) use->module_which_uses = a; list_add(&use->list, &b->modules_which_use_me); + no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); return 1; } @@ -569,6 +572,7 @@ static void module_unload_free(struct module *mod) module_put(i); list_del(&use->list); kfree(use); + sysfs_remove_link(i->holders_dir, mod->name); /* There can be at most one match. */ break; } @@ -1064,7 +1068,8 @@ static inline void remove_sect_attrs(struct module *mod) } #endif /* CONFIG_KALLSYMS */ -static int module_add_modinfo_attrs(struct module *mod) +#ifdef CONFIG_SYSFS +int module_add_modinfo_attrs(struct module *mod) { struct module_attribute *attr; struct module_attribute *temp_attr; @@ -1090,7 +1095,7 @@ static int module_add_modinfo_attrs(struct module *mod) return error; } -static void module_remove_modinfo_attrs(struct module *mod) +void module_remove_modinfo_attrs(struct module *mod) { struct module_attribute *attr; int i; @@ -1105,10 +1110,10 @@ static void module_remove_modinfo_attrs(struct module *mod) } kfree(mod->modinfo_attrs); } +#endif -static int mod_sysfs_setup(struct module *mod, - struct kernel_param *kparam, - unsigned int num_params) +#ifdef CONFIG_SYSFS +int mod_sysfs_init(struct module *mod) { int err; @@ -1125,21 +1130,30 @@ static int mod_sysfs_setup(struct module *mod, kobj_set_kset_s(&mod->mkobj, module_subsys); mod->mkobj.mod = mod; - /* delay uevent until full sysfs population */ kobject_init(&mod->mkobj.kobj); + +out: + return err; +} + +int mod_sysfs_setup(struct module *mod, + struct kernel_param *kparam, + unsigned int num_params) +{ + int err; + + /* delay uevent until full sysfs population */ err = kobject_add(&mod->mkobj.kobj); if (err) goto out; - mod->drivers_dir = kobject_add_dir(&mod->mkobj.kobj, "drivers"); - if (!mod->drivers_dir) { - err = -ENOMEM; + mod->holders_dir = kobject_add_dir(&mod->mkobj.kobj, "holders"); + if (!mod->holders_dir) goto out_unreg; - } err = module_param_sysfs_setup(mod, kparam, num_params); if (err) - goto out_unreg_drivers; + goto out_unreg_holders; err = module_add_modinfo_attrs(mod); if (err) @@ -1150,21 +1164,22 @@ static int mod_sysfs_setup(struct module *mod, out_unreg_param: module_param_sysfs_remove(mod); -out_unreg_drivers: - kobject_unregister(mod->drivers_dir); +out_unreg_holders: + kobject_unregister(mod->holders_dir); out_unreg: kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); out: return err; } +#endif static void mod_kobject_remove(struct module *mod) { module_remove_modinfo_attrs(mod); module_param_sysfs_remove(mod); - kobject_unregister(mod->drivers_dir); - + kobject_unregister(mod->mkobj.drivers_dir); + kobject_unregister(mod->holders_dir); kobject_unregister(&mod->mkobj.kobj); } @@ -1768,6 +1783,10 @@ static struct module *load_module(void __user *umod, /* Now we've moved module, initialize linked lists, etc. */ module_unload_init(mod); + /* Initialize kobject, so we can reference it. */ + if (mod_sysfs_init(mod) != 0) + goto cleanup; + /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); @@ -2327,6 +2346,7 @@ void print_modules(void) printk("\n"); } +#ifdef CONFIG_SYSFS static char *make_driver_name(struct device_driver *drv) { char *driver_name; @@ -2340,19 +2360,43 @@ static char *make_driver_name(struct device_driver *drv) return driver_name; } +static void module_create_drivers_dir(struct module_kobject *mk) +{ + if (!mk || mk->drivers_dir) + return; + + mk->drivers_dir = kobject_add_dir(&mk->kobj, "drivers"); +} + void module_add_driver(struct module *mod, struct device_driver *drv) { char *driver_name; int no_warn; + struct module_kobject *mk = NULL; - if (!mod || !drv) + if (!drv) + return; + + if (mod) + mk = &mod->mkobj; + else if (drv->mod_name) { + struct kobject *mkobj; + + /* Lookup built-in module entry in /sys/modules */ + mkobj = kset_find_obj(&module_subsys.kset, drv->mod_name); + if (mkobj) + mk = container_of(mkobj, struct module_kobject, kobj); + } + + if (!mk) return; /* Don't check return codes; these calls are idempotent */ - no_warn = sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); + no_warn = sysfs_create_link(&drv->kobj, &mk->kobj, "module"); driver_name = make_driver_name(drv); if (driver_name) { - no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj, + module_create_drivers_dir(mk); + no_warn = sysfs_create_link(mk->drivers_dir, &drv->kobj, driver_name); kfree(driver_name); } @@ -2367,16 +2411,23 @@ void module_remove_driver(struct device_driver *drv) return; sysfs_remove_link(&drv->kobj, "module"); - if (drv->owner && drv->owner->drivers_dir) { + if (drv->owner && drv->owner->mkobj.drivers_dir) { driver_name = make_driver_name(drv); if (driver_name) { - sysfs_remove_link(drv->owner->drivers_dir, + sysfs_remove_link(drv->owner->mkobj.drivers_dir, driver_name); kfree(driver_name); } } + /* + * Undo the additional reference we added in module_add_driver() + * via kset_find_obj() + */ + if (drv->mod_name) + kobject_put(&drv->kobj); } EXPORT_SYMBOL(module_remove_driver); +#endif #ifdef CONFIG_MODVERSIONS /* Generate the signature for struct module here, too, for modversions. */ diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 841539d..d17436c 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -13,7 +13,6 @@ * Released under the General Public License (GPL). */ #include <linux/mutex.h> -#include <linux/sched.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/poison.h> diff --git a/kernel/panic.c b/kernel/panic.c index 525e365..623d182 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -150,6 +150,7 @@ EXPORT_SYMBOL(panic); * 'R' - User forced a module unload. * 'M' - Machine had a machine check experience. * 'B' - System has hit bad_page. + * 'U' - Userspace-defined naughtiness. * * The string is overwritten by the next call to print_taint(). */ @@ -158,13 +159,14 @@ const char *print_tainted(void) { static char buf[20]; if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c", + snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c", tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', tainted & TAINT_FORCED_MODULE ? 'F' : ' ', tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', - tainted & TAINT_BAD_PAGE ? 'B' : ' '); + tainted & TAINT_BAD_PAGE ? 'B' : ' ', + tainted & TAINT_USER ? 'U' : ' '); } else snprintf(buf, sizeof(buf), "Not tainted"); diff --git a/kernel/params.c b/kernel/params.c index 718945d..e265b13 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -389,6 +389,7 @@ struct module_param_attrs struct param_attribute attrs[0]; }; +#ifdef CONFIG_SYSFS #define to_param_attr(n) container_of(n, struct param_attribute, mattr); static ssize_t param_attr_show(struct module_attribute *mattr, @@ -424,6 +425,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr, return len; return err; } +#endif #ifdef CONFIG_MODULES #define __modinit @@ -431,6 +433,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr, #define __modinit __init #endif +#ifdef CONFIG_SYSFS /* * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME * @mk: struct module_kobject (contains parent kobject) @@ -498,9 +501,7 @@ param_sysfs_setup(struct module_kobject *mk, return mp; } - #ifdef CONFIG_MODULES - /* * module_param_sysfs_setup - setup sysfs support for one module * @mod: module @@ -561,14 +562,11 @@ static void __init kernel_param_sysfs_setup(const char *name, mk->mod = THIS_MODULE; kobj_set_kset_s(mk, module_subsys); kobject_set_name(&mk->kobj, name); - ret = kobject_register(&mk->kobj); + kobject_init(&mk->kobj); + ret = kobject_add(&mk->kobj); BUG_ON(ret < 0); - - /* no need to keep the kobject if no parameter is exported */ - if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) { - kobject_unregister(&mk->kobj); - kfree(mk); - } + param_sysfs_setup(mk, kparam, num_params, name_skip); + kobject_uevent(&mk->kobj, KOBJ_ADD); } /* @@ -626,7 +624,6 @@ static void __init param_sysfs_builtin(void) /* module-related sysfs stuff */ -#ifdef CONFIG_SYSFS #define to_module_attr(n) container_of(n, struct module_attribute, attr); #define to_module_kobject(n) container_of(n, struct module_kobject, kobj); @@ -674,19 +671,27 @@ static struct sysfs_ops module_sysfs_ops = { .store = module_attr_store, }; -#else -static struct sysfs_ops module_sysfs_ops = { - .show = NULL, - .store = NULL, +static struct kobj_type module_ktype; + +static int uevent_filter(struct kset *kset, struct kobject *kobj) +{ + struct kobj_type *ktype = get_ktype(kobj); + + if (ktype == &module_ktype) + return 1; + return 0; +} + +static struct kset_uevent_ops module_uevent_ops = { + .filter = uevent_filter, }; -#endif + +decl_subsys(module, &module_ktype, &module_uevent_ops); static struct kobj_type module_ktype = { .sysfs_ops = &module_sysfs_ops, }; -decl_subsys(module, &module_ktype, NULL); - /* * param_sysfs_init - wrapper for built-in params support */ @@ -707,6 +712,15 @@ static int __init param_sysfs_init(void) } subsys_initcall(param_sysfs_init); +#else +#if 0 +static struct sysfs_ops module_sysfs_ops = { + .show = NULL, + .store = NULL, +}; +#endif +#endif + EXPORT_SYMBOL(param_set_byte); EXPORT_SYMBOL(param_get_byte); EXPORT_SYMBOL(param_set_short); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 7c3e1e6..657f776 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -304,7 +304,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) * should be able to see it. */ struct task_struct *p; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_task_by_pid(pid); if (p) { if (CPUCLOCK_PERTHREAD(which_clock)) { @@ -312,12 +312,17 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) error = cpu_clock_sample(which_clock, p, &rtn); } - } else if (p->tgid == pid && p->signal) { - error = cpu_clock_sample_group(which_clock, - p, &rtn); + } else { + read_lock(&tasklist_lock); + if (p->tgid == pid && p->signal) { + error = + cpu_clock_sample_group(which_clock, + p, &rtn); + } + read_unlock(&tasklist_lock); } } - read_unlock(&tasklist_lock); + rcu_read_unlock(); } if (error) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5fe87de..44318ca 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -145,7 +145,7 @@ static int common_timer_set(struct k_itimer *, int, struct itimerspec *, struct itimerspec *); static int common_timer_del(struct k_itimer *timer); -static int posix_timer_fn(struct hrtimer *data); +static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); @@ -334,12 +334,12 @@ EXPORT_SYMBOL_GPL(posix_timer_event); * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. */ -static int posix_timer_fn(struct hrtimer *timer) +static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { struct k_itimer *timr; unsigned long flags; int si_private = 0; - int ret = HRTIMER_NORESTART; + enum hrtimer_restart ret = HRTIMER_NORESTART; timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); @@ -356,7 +356,7 @@ static int posix_timer_fn(struct hrtimer *timer) if (timr->it.real.interval.tv64 != 0) { timr->it_overrun += hrtimer_forward(timer, - timer->base->softirq_time, + hrtimer_cb_get_time(timer), timr->it.real.interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; @@ -399,10 +399,9 @@ EXPORT_SYMBOL_GPL(register_posix_clock); static struct k_itimer * alloc_posix_timer(void) { struct k_itimer *tmr; - tmr = kmem_cache_alloc(posix_timers_cache, GFP_KERNEL); + tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); if (!tmr) return tmr; - memset(tmr, 0, sizeof (struct k_itimer)); if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { kmem_cache_free(posix_timers_cache, tmr); tmr = NULL; @@ -723,7 +722,7 @@ common_timer_set(struct k_itimer *timr, int flags, if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; - mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; + mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); timr->it.real.timer.function = posix_timer_fn; @@ -735,7 +734,7 @@ common_timer_set(struct k_itimer *timr, int flags, /* SIGEV_NONE timers are not queued ! See common_timer_get */ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { /* Setup correct expiry time for relative timers */ - if (mode == HRTIMER_REL) + if (mode == HRTIMER_MODE_REL) timer->expires = ktime_add(timer->expires, timer->base->get_time()); return 0; @@ -951,7 +950,8 @@ static int common_nsleep(const clockid_t which_clock, int flags, struct timespec *tsave, struct timespec __user *rmtp) { return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? - HRTIMER_ABS : HRTIMER_REL, which_clock); + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); } asmlinkage long diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ed29622..95f6657 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -131,3 +131,29 @@ config SUSPEND_SMP bool depends on HOTPLUG_CPU && X86 && PM default y + +config APM_EMULATION + tristate "Advanced Power Management Emulation" + depends on PM && SYS_SUPPORTS_APM_EMULATION + help + APM is a BIOS specification for saving power using several different + techniques. This is mostly useful for battery powered laptops with + APM compliant BIOSes. If you say Y here, the system time will be + reset after a RESUME operation, the /proc/apm device will provide + battery status information, and user-space programs will receive + notification of APM "events" (e.g. battery status change). + + In order to use APM, you will need supporting software. For location + and more information, read <file:Documentation/pm.txt> and the + Battery Powered Linux mini-HOWTO, available from + <http://www.tldp.org/docs.html#howto>. + + This driver does not spin down disk drives (see the hdparm(8) + manpage ("man 8 hdparm") for that), and it doesn't turn off + VESA-compliant "green" monitors. + + Generally, if you don't have a battery in your machine, there isn't + much point in using this driver and you should say N. If you get + random kernel OOPSes or reboots that don't seem to be related to + anything, try disabling/enabling this option (or disabling/enabling + APM in your BIOS). diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 88fc5d7..406b20a 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -87,52 +87,24 @@ static inline void platform_finish(void) } } +static void unprepare_processes(void) +{ + thaw_processes(); + pm_restore_console(); +} + static int prepare_processes(void) { int error = 0; pm_prepare_console(); - - error = disable_nonboot_cpus(); - if (error) - goto enable_cpus; - if (freeze_processes()) { error = -EBUSY; - goto thaw; + unprepare_processes(); } - - if (pm_disk_mode == PM_DISK_TESTPROC) { - printk("swsusp debug: Waiting for 5 seconds.\n"); - mdelay(5000); - goto thaw; - } - - error = platform_prepare(); - if (error) - goto thaw; - - /* Free memory before shutting down devices. */ - if (!(error = swsusp_shrink_memory())) - return 0; - - platform_finish(); - thaw: - thaw_processes(); - enable_cpus: - enable_nonboot_cpus(); - pm_restore_console(); return error; } -static void unprepare_processes(void) -{ - platform_finish(); - thaw_processes(); - enable_nonboot_cpus(); - pm_restore_console(); -} - /** * pm_suspend_disk - The granpappy of hibernation power management. * @@ -150,29 +122,45 @@ int pm_suspend_disk(void) if (error) return error; - if (pm_disk_mode == PM_DISK_TESTPROC) - return 0; + if (pm_disk_mode == PM_DISK_TESTPROC) { + printk("swsusp debug: Waiting for 5 seconds.\n"); + mdelay(5000); + goto Thaw; + } + /* Free memory before shutting down devices. */ + error = swsusp_shrink_memory(); + if (error) + goto Thaw; + + error = platform_prepare(); + if (error) + goto Thaw; suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) { - resume_console(); - printk("Some devices failed to suspend\n"); - goto Thaw; + printk(KERN_ERR "PM: Some devices failed to suspend\n"); + goto Resume_devices; } + error = disable_nonboot_cpus(); + if (error) + goto Enable_cpus; if (pm_disk_mode == PM_DISK_TEST) { printk("swsusp debug: Waiting for 5 seconds.\n"); mdelay(5000); - goto Done; + goto Enable_cpus; } pr_debug("PM: snapshotting memory.\n"); in_suspend = 1; - if ((error = swsusp_suspend())) - goto Done; + error = swsusp_suspend(); + if (error) + goto Enable_cpus; if (in_suspend) { + enable_nonboot_cpus(); + platform_finish(); device_resume(); resume_console(); pr_debug("PM: writing image.\n"); @@ -188,7 +176,10 @@ int pm_suspend_disk(void) } swsusp_free(); - Done: + Enable_cpus: + enable_nonboot_cpus(); + Resume_devices: + platform_finish(); device_resume(); resume_console(); Thaw: @@ -237,19 +228,28 @@ static int software_resume(void) pr_debug("PM: Checking swsusp image.\n"); - if ((error = swsusp_check())) + error = swsusp_check(); + if (error) goto Done; pr_debug("PM: Preparing processes for restore.\n"); - if ((error = prepare_processes())) { + error = prepare_processes(); + if (error) { swsusp_close(); goto Done; } + error = platform_prepare(); + if (error) { + swsusp_free(); + goto Thaw; + } + pr_debug("PM: Reading swsusp image.\n"); - if ((error = swsusp_read())) { + error = swsusp_read(); + if (error) { swsusp_free(); goto Thaw; } @@ -257,21 +257,22 @@ static int software_resume(void) pr_debug("PM: Preparing devices for restore.\n"); suspend_console(); - if ((error = device_suspend(PMSG_PRETHAW))) { - resume_console(); - printk("Some devices failed to suspend\n"); - swsusp_free(); - goto Thaw; - } + error = device_suspend(PMSG_PRETHAW); + if (error) + goto Free; - mb(); + error = disable_nonboot_cpus(); + if (!error) + swsusp_resume(); - pr_debug("PM: Restoring saved image.\n"); - swsusp_resume(); - pr_debug("PM: Restore failed, recovering.n"); + enable_nonboot_cpus(); + Free: + swsusp_free(); + platform_finish(); device_resume(); resume_console(); Thaw: + printk(KERN_ERR "PM: Restore failed, recovering.\n"); unprepare_processes(); Done: /* For success case, the suspend path will release the lock */ diff --git a/kernel/power/main.c b/kernel/power/main.c index ff3a618..a064dfd 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -20,6 +20,7 @@ #include <linux/cpu.h> #include <linux/resume-trace.h> #include <linux/freezer.h> +#include <linux/vmstat.h> #include "power.h" @@ -43,6 +44,11 @@ void pm_set_ops(struct pm_ops * ops) mutex_unlock(&pm_mutex); } +static inline void pm_finish(suspend_state_t state) +{ + if (pm_ops->finish) + pm_ops->finish(state); +} /** * suspend_prepare - Do prep work before entering low-power state. @@ -63,16 +69,13 @@ static int suspend_prepare(suspend_state_t state) pm_prepare_console(); - error = disable_nonboot_cpus(); - if (error) - goto Enable_cpu; - if (freeze_processes()) { error = -EAGAIN; goto Thaw; } - if ((free_pages = nr_free_pages()) < FREE_PAGE_NUMBER) { + if ((free_pages = global_page_state(NR_FREE_PAGES)) + < FREE_PAGE_NUMBER) { pr_debug("PM: free some memory\n"); shrink_all_memory(FREE_PAGE_NUMBER - free_pages); if (nr_free_pages() < FREE_PAGE_NUMBER) { @@ -88,18 +91,22 @@ static int suspend_prepare(suspend_state_t state) } suspend_console(); - if ((error = device_suspend(PMSG_SUSPEND))) { + error = device_suspend(PMSG_SUSPEND); + if (error) { printk(KERN_ERR "Some devices failed to suspend\n"); - goto Finish; + goto Resume_devices; } - return 0; - Finish: - if (pm_ops->finish) - pm_ops->finish(state); + error = disable_nonboot_cpus(); + if (!error) + return 0; + + enable_nonboot_cpus(); + Resume_devices: + pm_finish(state); + device_resume(); + resume_console(); Thaw: thaw_processes(); - Enable_cpu: - enable_nonboot_cpus(); pm_restore_console(); return error; } @@ -134,12 +141,11 @@ int suspend_enter(suspend_state_t state) static void suspend_finish(suspend_state_t state) { + enable_nonboot_cpus(); + pm_finish(state); device_resume(); resume_console(); thaw_processes(); - enable_nonboot_cpus(); - if (pm_ops && pm_ops->finish) - pm_ops->finish(state); pm_restore_console(); } @@ -161,7 +167,10 @@ static inline int valid_state(suspend_state_t state) if (state == PM_SUSPEND_DISK) return 1; - if (pm_ops && pm_ops->valid && !pm_ops->valid(state)) + /* all other states need lowlevel support and need to be + * valid to the lowlevel implementation, no valid callback + * implies that all are valid. */ + if (!pm_ops || (pm_ops->valid && !pm_ops->valid(state))) return 0; return 1; } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index c024606..fc53ad0 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -591,7 +591,7 @@ static unsigned int count_free_highmem_pages(void) for_each_zone(zone) if (populated_zone(zone) && is_highmem(zone)) - cnt += zone->free_pages; + cnt += zone_page_state(zone, NR_FREE_PAGES); return cnt; } @@ -869,7 +869,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) for_each_zone(zone) { meta += snapshot_additional_pages(zone); if (!is_highmem(zone)) - free += zone->free_pages; + free += zone_page_state(zone, NR_FREE_PAGES); } nr_pages += count_pages_for_highmem(nr_highmem); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 31aa039..7fb8343 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -230,9 +230,10 @@ int swsusp_shrink_memory(void) for_each_zone (zone) if (populated_zone(zone)) { if (is_highmem(zone)) { - highmem_size -= zone->free_pages; + highmem_size -= + zone_page_state(zone, NR_FREE_PAGES); } else { - tmp -= zone->free_pages; + tmp -= zone_page_state(zone, NR_FREE_PAGES); tmp += zone->lowmem_reserve[ZONE_NORMAL]; tmp += snapshot_additional_pages(zone); } diff --git a/kernel/power/user.c b/kernel/power/user.c index f7b7a78..dd09efe 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -37,6 +37,7 @@ static struct snapshot_data { int mode; char frozen; char ready; + char platform_suspend; } snapshot_state; static atomic_t device_available = ATOMIC_INIT(1); @@ -66,6 +67,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->bitmap = NULL; data->frozen = 0; data->ready = 0; + data->platform_suspend = 0; return 0; } @@ -122,6 +124,92 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, return res; } +static inline int platform_prepare(void) +{ + int error = 0; + + if (pm_ops && pm_ops->prepare) + error = pm_ops->prepare(PM_SUSPEND_DISK); + + return error; +} + +static inline void platform_finish(void) +{ + if (pm_ops && pm_ops->finish) + pm_ops->finish(PM_SUSPEND_DISK); +} + +static inline int snapshot_suspend(int platform_suspend) +{ + int error; + + mutex_lock(&pm_mutex); + /* Free memory before shutting down devices. */ + error = swsusp_shrink_memory(); + if (error) + goto Finish; + + if (platform_suspend) { + error = platform_prepare(); + if (error) + goto Finish; + } + suspend_console(); + error = device_suspend(PMSG_FREEZE); + if (error) + goto Resume_devices; + + error = disable_nonboot_cpus(); + if (!error) { + in_suspend = 1; + error = swsusp_suspend(); + } + enable_nonboot_cpus(); + Resume_devices: + if (platform_suspend) + platform_finish(); + + device_resume(); + resume_console(); + Finish: + mutex_unlock(&pm_mutex); + return error; +} + +static inline int snapshot_restore(int platform_suspend) +{ + int error; + + mutex_lock(&pm_mutex); + pm_prepare_console(); + if (platform_suspend) { + error = platform_prepare(); + if (error) + goto Finish; + } + suspend_console(); + error = device_suspend(PMSG_PRETHAW); + if (error) + goto Resume_devices; + + error = disable_nonboot_cpus(); + if (!error) + error = swsusp_resume(); + + enable_nonboot_cpus(); + Resume_devices: + if (platform_suspend) + platform_finish(); + + device_resume(); + resume_console(); + Finish: + pm_restore_console(); + mutex_unlock(&pm_mutex); + return error; +} + static int snapshot_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { @@ -145,14 +233,9 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, if (data->frozen) break; mutex_lock(&pm_mutex); - error = disable_nonboot_cpus(); - if (!error) { - error = freeze_processes(); - if (error) { - thaw_processes(); - enable_nonboot_cpus(); - error = -EBUSY; - } + if (freeze_processes()) { + thaw_processes(); + error = -EBUSY; } mutex_unlock(&pm_mutex); if (!error) @@ -164,7 +247,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; mutex_lock(&pm_mutex); thaw_processes(); - enable_nonboot_cpus(); mutex_unlock(&pm_mutex); data->frozen = 0; break; @@ -174,20 +256,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - mutex_lock(&pm_mutex); - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); - if (!error) { - suspend_console(); - error = device_suspend(PMSG_FREEZE); - if (!error) { - in_suspend = 1; - error = swsusp_suspend(); - device_resume(); - } - resume_console(); - } - mutex_unlock(&pm_mutex); + error = snapshot_suspend(data->platform_suspend); if (!error) error = put_user(in_suspend, (unsigned int __user *)arg); if (!error) @@ -201,17 +270,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - mutex_lock(&pm_mutex); - pm_prepare_console(); - suspend_console(); - error = device_suspend(PMSG_PRETHAW); - if (!error) { - error = swsusp_resume(); - device_resume(); - } - resume_console(); - pm_restore_console(); - mutex_unlock(&pm_mutex); + error = snapshot_restore(data->platform_suspend); break; case SNAPSHOT_FREE: @@ -282,6 +341,11 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; case SNAPSHOT_S2RAM: + if (!pm_ops) { + error = -ENOSYS; + break; + } + if (!data->frozen) { error = -EPERM; break; @@ -319,28 +383,35 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; case SNAPSHOT_PMOPS: + error = -EINVAL; + switch (arg) { case PMOPS_PREPARE: - if (pm_ops->prepare) { - error = pm_ops->prepare(PM_SUSPEND_DISK); + if (pm_ops && pm_ops->enter) { + data->platform_suspend = 1; + error = 0; + } else { + error = -ENOSYS; } break; case PMOPS_ENTER: - kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - error = pm_ops->enter(PM_SUSPEND_DISK); + if (data->platform_suspend) { + kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); + error = pm_ops->enter(PM_SUSPEND_DISK); + error = 0; + } break; case PMOPS_FINISH: - if (pm_ops && pm_ops->finish) { - pm_ops->finish(PM_SUSPEND_DISK); - } + if (data->platform_suspend) + error = 0; + break; default: printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); - error = -EINVAL; } break; diff --git a/kernel/printk.c b/kernel/printk.c index c770e1a..4b47e59 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -54,7 +54,7 @@ int console_printk[4] = { }; /* - * Low lever drivers may need that to know if they can schedule in + * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. */ int oops_in_progress; @@ -483,7 +483,7 @@ static int have_callable_console(void) * printk - print a kernel message * @fmt: format string * - * This is printk. It can be called from any context. We want it to work. + * This is printk(). It can be called from any context. We want it to work. * * We try to grab the console_sem. If we succeed, it's easy - we log the output and * call the console drivers. If we fail to get the semaphore we place the output @@ -529,7 +529,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) zap_locks(); /* This stops the holder of console_sem just where we want him */ - local_irq_save(flags); + raw_local_irq_save(flags); lockdep_off(); spin_lock(&logbuf_lock); printk_cpu = smp_processor_id(); @@ -618,7 +618,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) up(&console_sem); } lockdep_on(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } else { /* * Someone else owns the drivers. We drop the spinlock, which @@ -628,7 +628,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) printk_cpu = UINT_MAX; spin_unlock(&logbuf_lock); lockdep_on(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } preempt_enable(); @@ -783,6 +783,12 @@ int is_console_locked(void) return console_locked; } +void wake_up_klogd(void) +{ + if (!oops_in_progress && waitqueue_active(&log_wait)) + wake_up_interruptible(&log_wait); +} + /** * release_console_sem - unlock the console system * @@ -825,8 +831,8 @@ void release_console_sem(void) console_locked = 0; up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); - if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) - wake_up_interruptible(&log_wait); + if (wake_klogd) + wake_up_klogd(); } EXPORT_SYMBOL(release_console_sem); diff --git a/kernel/profile.c b/kernel/profile.c index d6579d5..9bfadb2 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -449,7 +449,6 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) /* create /proc/irq/prof_cpu_mask */ if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) return; - entry->nlink = 1; entry->data = (void *)&prof_cpu_mask; entry->read_proc = prof_cpu_mask_read_proc; entry->write_proc = prof_cpu_mask_write_proc; diff --git a/kernel/relay.c b/kernel/relay.c index 284e2e8..ef8a935 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -7,6 +7,8 @@ * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) * * Moved to kernel/relay.c by Paul Mundt, 2006. + * November 2006 - CPU hotplug support by Mathieu Desnoyers + * (mathieu.desnoyers@polymtl.ca) * * This file is released under the GPL. */ @@ -18,6 +20,11 @@ #include <linux/relay.h> #include <linux/vmalloc.h> #include <linux/mm.h> +#include <linux/cpu.h> + +/* list of open channels, for cpu hotplug */ +static DEFINE_MUTEX(relay_channels_mutex); +static LIST_HEAD(relay_channels); /* * close() vm_op implementation for relay file mapping. @@ -187,6 +194,7 @@ void relay_destroy_buf(struct rchan_buf *buf) __free_page(buf->page_array[i]); kfree(buf->page_array); } + chan->buf[buf->cpu] = NULL; kfree(buf->padding); kfree(buf); kref_put(&chan->kref, relay_destroy_channel); @@ -320,7 +328,7 @@ static void wakeup_readers(struct work_struct *work) * @buf: the channel buffer * @init: 1 if this is a first-time initialization * - * See relay_reset for description of effect. + * See relay_reset() for description of effect. */ static void __relay_reset(struct rchan_buf *buf, unsigned int init) { @@ -356,57 +364,75 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) * and restarting the channel in its initial state. The buffers * are not freed, so any mappings are still in effect. * - * NOTE: Care should be taken that the channel isn't actually + * NOTE. Care should be taken that the channel isn't actually * being used by anything when this call is made. */ void relay_reset(struct rchan *chan) { unsigned int i; - struct rchan_buf *prev = NULL; if (!chan) return; - for (i = 0; i < NR_CPUS; i++) { - if (!chan->buf[i] || chan->buf[i] == prev) - break; - __relay_reset(chan->buf[i], 0); - prev = chan->buf[i]; + if (chan->is_global && chan->buf[0]) { + __relay_reset(chan->buf[0], 0); + return; } + + mutex_lock(&relay_channels_mutex); + for_each_online_cpu(i) + if (chan->buf[i]) + __relay_reset(chan->buf[i], 0); + mutex_unlock(&relay_channels_mutex); } EXPORT_SYMBOL_GPL(relay_reset); /* * relay_open_buf - create a new relay channel buffer * - * Internal - used by relay_open(). + * used by relay_open() and CPU hotplug. */ -static struct rchan_buf *relay_open_buf(struct rchan *chan, - const char *filename, - struct dentry *parent, - int *is_global) +static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) { - struct rchan_buf *buf; + struct rchan_buf *buf = NULL; struct dentry *dentry; + char *tmpname; - if (*is_global) + if (chan->is_global) return chan->buf[0]; + tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); + if (!tmpname) + goto end; + snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); + buf = relay_create_buf(chan); if (!buf) - return NULL; + goto free_name; + + buf->cpu = cpu; + __relay_reset(buf, 1); /* Create file in fs */ - dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR, - buf, is_global); - if (!dentry) { - relay_destroy_buf(buf); - return NULL; - } + dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR, + buf, &chan->is_global); + if (!dentry) + goto free_buf; buf->dentry = dentry; - __relay_reset(buf, 1); + if(chan->is_global) { + chan->buf[0] = buf; + buf->cpu = 0; + } + + goto free_name; + +free_buf: + relay_destroy_buf(buf); +free_name: + kfree(tmpname); +end: return buf; } @@ -448,31 +474,71 @@ static void setup_callbacks(struct rchan *chan, } /** + * + * relay_hotcpu_callback - CPU hotplug callback + * @nb: notifier block + * @action: hotplug action to take + * @hcpu: CPU number + * + * Returns the success/failure of the operation. (NOTIFY_OK, NOTIFY_BAD) + */ +static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + unsigned int hotcpu = (unsigned long)hcpu; + struct rchan *chan; + + switch(action) { + case CPU_UP_PREPARE: + mutex_lock(&relay_channels_mutex); + list_for_each_entry(chan, &relay_channels, list) { + if (chan->buf[hotcpu]) + continue; + chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); + if(!chan->buf[hotcpu]) { + printk(KERN_ERR + "relay_hotcpu_callback: cpu %d buffer " + "creation failed\n", hotcpu); + mutex_unlock(&relay_channels_mutex); + return NOTIFY_BAD; + } + } + mutex_unlock(&relay_channels_mutex); + break; + case CPU_DEAD: + /* No need to flush the cpu : will be flushed upon + * final relay_flush() call. */ + break; + } + return NOTIFY_OK; +} + +/** * relay_open - create a new relay channel * @base_filename: base name of files to create * @parent: dentry of parent directory, %NULL for root directory * @subbuf_size: size of sub-buffers * @n_subbufs: number of sub-buffers * @cb: client callback functions + * @private_data: user-defined data * * Returns channel pointer if successful, %NULL otherwise. * * Creates a channel buffer for each cpu using the sizes and * attributes specified. The created channel buffer files * will be named base_filename0...base_filenameN-1. File - * permissions will be S_IRUSR. + * permissions will be %S_IRUSR. */ struct rchan *relay_open(const char *base_filename, struct dentry *parent, size_t subbuf_size, size_t n_subbufs, - struct rchan_callbacks *cb) + struct rchan_callbacks *cb, + void *private_data) { unsigned int i; struct rchan *chan; - char *tmpname; - int is_global = 0; - if (!base_filename) return NULL; @@ -487,38 +553,32 @@ struct rchan *relay_open(const char *base_filename, chan->n_subbufs = n_subbufs; chan->subbuf_size = subbuf_size; chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); + chan->parent = parent; + chan->private_data = private_data; + strlcpy(chan->base_filename, base_filename, NAME_MAX); setup_callbacks(chan, cb); kref_init(&chan->kref); - tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!tmpname) - goto free_chan; - + mutex_lock(&relay_channels_mutex); for_each_online_cpu(i) { - sprintf(tmpname, "%s%d", base_filename, i); - chan->buf[i] = relay_open_buf(chan, tmpname, parent, - &is_global); + chan->buf[i] = relay_open_buf(chan, i); if (!chan->buf[i]) goto free_bufs; - - chan->buf[i]->cpu = i; } + list_add(&chan->list, &relay_channels); + mutex_unlock(&relay_channels_mutex); - kfree(tmpname); return chan; free_bufs: - for (i = 0; i < NR_CPUS; i++) { + for_each_online_cpu(i) { if (!chan->buf[i]) break; relay_close_buf(chan->buf[i]); - if (is_global) - break; } - kfree(tmpname); -free_chan: kref_put(&chan->kref, relay_destroy_channel); + mutex_unlock(&relay_channels_mutex); return NULL; } EXPORT_SYMBOL_GPL(relay_open); @@ -588,7 +648,7 @@ EXPORT_SYMBOL_GPL(relay_switch_subbuf); * subbufs_consumed should be the number of sub-buffers newly consumed, * not the total consumed. * - * NOTE: Kernel clients don't need to call this function if the channel + * NOTE. Kernel clients don't need to call this function if the channel * mode is 'overwrite'. */ void relay_subbufs_consumed(struct rchan *chan, @@ -619,24 +679,26 @@ EXPORT_SYMBOL_GPL(relay_subbufs_consumed); void relay_close(struct rchan *chan) { unsigned int i; - struct rchan_buf *prev = NULL; if (!chan) return; - for (i = 0; i < NR_CPUS; i++) { - if (!chan->buf[i] || chan->buf[i] == prev) - break; - relay_close_buf(chan->buf[i]); - prev = chan->buf[i]; - } + mutex_lock(&relay_channels_mutex); + if (chan->is_global && chan->buf[0]) + relay_close_buf(chan->buf[0]); + else + for_each_possible_cpu(i) + if (chan->buf[i]) + relay_close_buf(chan->buf[i]); if (chan->last_toobig) printk(KERN_WARNING "relay: one or more items not logged " "[item size (%Zd) > sub-buffer size (%Zd)]\n", chan->last_toobig, chan->subbuf_size); + list_del(&chan->list); kref_put(&chan->kref, relay_destroy_channel); + mutex_unlock(&relay_channels_mutex); } EXPORT_SYMBOL_GPL(relay_close); @@ -649,17 +711,20 @@ EXPORT_SYMBOL_GPL(relay_close); void relay_flush(struct rchan *chan) { unsigned int i; - struct rchan_buf *prev = NULL; if (!chan) return; - for (i = 0; i < NR_CPUS; i++) { - if (!chan->buf[i] || chan->buf[i] == prev) - break; - relay_switch_subbuf(chan->buf[i], 0); - prev = chan->buf[i]; + if (chan->is_global && chan->buf[0]) { + relay_switch_subbuf(chan->buf[0], 0); + return; } + + mutex_lock(&relay_channels_mutex); + for_each_possible_cpu(i) + if (chan->buf[i]) + relay_switch_subbuf(chan->buf[i], 0); + mutex_unlock(&relay_channels_mutex); } EXPORT_SYMBOL_GPL(relay_flush); @@ -684,7 +749,7 @@ static int relay_file_open(struct inode *inode, struct file *filp) * @filp: the file * @vma: the vma describing what to map * - * Calls upon relay_mmap_buf to map the file into user space. + * Calls upon relay_mmap_buf() to map the file into user space. */ static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) { @@ -826,7 +891,7 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos, * @read_pos: file read position * @buf: relay channel buffer * - * If the read_pos is in the middle of padding, return the + * If the @read_pos is in the middle of padding, return the * position of the first actually available byte, otherwise * return the original value. */ @@ -1022,3 +1087,12 @@ const struct file_operations relay_file_operations = { .sendfile = relay_file_sendfile, }; EXPORT_SYMBOL_GPL(relay_file_operations); + +static __init int relay_init(void) +{ + + hotcpu_notifier(relay_hotcpu_callback, 0); + return 0; +} + +module_init(relay_init); diff --git a/kernel/resource.c b/kernel/resource.c index 7b9a497..bdb55a3 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -8,7 +8,6 @@ */ #include <linux/module.h> -#include <linux/sched.h> #include <linux/errno.h> #include <linux/ioport.h> #include <linux/init.h> @@ -17,6 +16,7 @@ #include <linux/fs.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/device.h> #include <asm/io.h> @@ -618,6 +618,67 @@ void __release_region(struct resource *parent, resource_size_t start, EXPORT_SYMBOL(__release_region); /* + * Managed region resource + */ +struct region_devres { + struct resource *parent; + resource_size_t start; + resource_size_t n; +}; + +static void devm_region_release(struct device *dev, void *res) +{ + struct region_devres *this = res; + + __release_region(this->parent, this->start, this->n); +} + +static int devm_region_match(struct device *dev, void *res, void *match_data) +{ + struct region_devres *this = res, *match = match_data; + + return this->parent == match->parent && + this->start == match->start && this->n == match->n; +} + +struct resource * __devm_request_region(struct device *dev, + struct resource *parent, resource_size_t start, + resource_size_t n, const char *name) +{ + struct region_devres *dr = NULL; + struct resource *res; + + dr = devres_alloc(devm_region_release, sizeof(struct region_devres), + GFP_KERNEL); + if (!dr) + return NULL; + + dr->parent = parent; + dr->start = start; + dr->n = n; + + res = __request_region(parent, start, n, name); + if (res) + devres_add(dev, dr); + else + devres_free(dr); + + return res; +} +EXPORT_SYMBOL(__devm_request_region); + +void __devm_release_region(struct device *dev, struct resource *parent, + resource_size_t start, resource_size_t n) +{ + struct region_devres match_data = { parent, start, n }; + + __release_region(parent, start, n); + WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match, + &match_data)); +} +EXPORT_SYMBOL(__devm_release_region); + +/* * Called from init/main.c to reserve IO ports. */ #define MAXRESERVE 4 diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 4ab17da..180978c 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -625,7 +625,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, /* Setup the timer, when timeout != NULL */ if (unlikely(timeout)) hrtimer_start(&timeout->timer, timeout->timer.expires, - HRTIMER_ABS); + HRTIMER_MODE_ABS); for (;;) { /* Try to acquire the lock: */ diff --git a/kernel/sched.c b/kernel/sched.c index cca93cc..0dc7572 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -57,6 +57,16 @@ #include <asm/unistd.h> /* + * Scheduler clock - returns current time in nanosec units. + * This is default implementation. + * Architectures and sub-architectures can override this. + */ +unsigned long long __attribute__((weak)) sched_clock(void) +{ + return (unsigned long long)jiffies * (1000000000 / HZ); +} + +/* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. @@ -1843,6 +1853,13 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm = next->mm; struct mm_struct *oldmm = prev->active_mm; + /* + * For paravirt, this is coupled with an exit in switch_to to + * combine the page table reload and the switch backend into + * one hypercall. + */ + arch_enter_lazy_cpu_mode(); + if (!mm) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); @@ -2887,14 +2904,16 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) static void update_load(struct rq *this_rq) { unsigned long this_load; - int i, scale; + unsigned int i, scale; this_load = this_rq->raw_weighted_load; /* Update our load: */ - for (i = 0, scale = 1; i < 3; i++, scale <<= 1) { + for (i = 0, scale = 1; i < 3; i++, scale += scale) { unsigned long old_load, new_load; + /* scale is effectively 1 << i now, and >> i divides by scale */ + old_load = this_rq->cpu_load[i]; new_load = this_load; /* @@ -2904,7 +2923,7 @@ static void update_load(struct rq *this_rq) */ if (new_load > old_load) new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } } @@ -4193,13 +4212,12 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) } /** - * sched_setscheduler - change the scheduling policy and/or RT priority of - * a thread. + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. * @p: the task in question. * @policy: new policy. * @param: structure containing the new RT priority. * - * NOTE: the task may be already dead + * NOTE that the task may be already dead. */ int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) @@ -4567,7 +4585,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, /** * sys_sched_yield - yield the current processor to other threads. * - * this function yields the current CPU by moving the calling thread + * This function yields the current CPU by moving the calling thread * to the expired array. If there are no other threads running on this * CPU then this function will return. */ @@ -4694,7 +4712,7 @@ EXPORT_SYMBOL(cond_resched_softirq); /** * yield - yield the current processor to other threads. * - * this is a shortcut for kernel-space yielding - it marks the + * This is a shortcut for kernel-space yielding - it marks the * thread runnable and calls sys_sched_yield(). */ void __sched yield(void) diff --git a/kernel/signal.c b/kernel/signal.c index 8a04869..3670225 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -456,26 +456,50 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) { int signr = __dequeue_signal(&tsk->pending, mask, info); - if (!signr) + if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info); + /* + * itimer signal ? + * + * itimers are process shared and we restart periodic + * itimers in the signal delivery path to prevent DoS + * attacks in the high resolution timer case. This is + * compliant with the old way of self restarting + * itimers, as the SIGALRM is a legacy signal and only + * queued once. Changing the restart behaviour to + * restart the timer in the signal dequeue path is + * reducing the timer noise on heavy loaded !highres + * systems too. + */ + if (unlikely(signr == SIGALRM)) { + struct hrtimer *tmr = &tsk->signal->real_timer; + + if (!hrtimer_is_queued(tmr) && + tsk->signal->it_real_incr.tv64 != 0) { + hrtimer_forward(tmr, tmr->base->get_time(), + tsk->signal->it_real_incr); + hrtimer_restart(tmr); + } + } + } recalc_sigpending_tsk(tsk); - if (signr && unlikely(sig_kernel_stop(signr))) { - /* - * Set a marker that we have dequeued a stop signal. Our - * caller might release the siglock and then the pending - * stop signal it is about to process is no longer in the - * pending bitmasks, but must still be cleared by a SIGCONT - * (and overruled by a SIGKILL). So those cases clear this - * shared flag after we've set it. Note that this flag may - * remain set after the signal we return is ignored or - * handled. That doesn't matter because its only purpose - * is to alert stop-signal processing code when another - * processor has come along and cleared the flag. - */ - if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) - tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; - } + if (signr && unlikely(sig_kernel_stop(signr))) { + /* + * Set a marker that we have dequeued a stop signal. Our + * caller might release the siglock and then the pending + * stop signal it is about to process is no longer in the + * pending bitmasks, but must still be cleared by a SIGCONT + * (and overruled by a SIGKILL). So those cases clear this + * shared flag after we've set it. Note that this flag may + * remain set after the signal we return is ignored or + * handled. That doesn't matter because its only purpose + * is to alert stop-signal processing code when another + * processor has come along and cleared the flag. + */ + if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) + tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; + } if ( signr && ((info->si_code & __SI_MASK) == __SI_TIMER) && info->si_sys_private){ @@ -1096,42 +1120,21 @@ int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) return retval; } -int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) -{ - if (pgrp <= 0) - return -EINVAL; - - return __kill_pgrp_info(sig, info, find_pid(pgrp)); -} - -int -kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) -{ - int retval; - - read_lock(&tasklist_lock); - retval = __kill_pg_info(sig, info, pgrp); - read_unlock(&tasklist_lock); - - return retval; -} - int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) { int error; - int acquired_tasklist_lock = 0; struct task_struct *p; rcu_read_lock(); - if (unlikely(sig_needs_tasklist(sig))) { + if (unlikely(sig_needs_tasklist(sig))) read_lock(&tasklist_lock); - acquired_tasklist_lock = 1; - } + p = pid_task(pid, PIDTYPE_PID); error = -ESRCH; if (p) error = group_send_sig_info(sig, info, p); - if (unlikely(acquired_tasklist_lock)) + + if (unlikely(sig_needs_tasklist(sig))) read_unlock(&tasklist_lock); rcu_read_unlock(); return error; @@ -1193,8 +1196,10 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); static int kill_something_info(int sig, struct siginfo *info, int pid) { + int ret; + rcu_read_lock(); if (!pid) { - return kill_pg_info(sig, info, process_group(current)); + ret = kill_pgrp_info(sig, info, task_pgrp(current)); } else if (pid == -1) { int retval = 0, count = 0; struct task_struct * p; @@ -1209,12 +1214,14 @@ static int kill_something_info(int sig, struct siginfo *info, int pid) } } read_unlock(&tasklist_lock); - return count ? retval : -ESRCH; + ret = count ? retval : -ESRCH; } else if (pid < 0) { - return kill_pg_info(sig, info, -pid); + ret = kill_pgrp_info(sig, info, find_pid(-pid)); } else { - return kill_proc_info(sig, info, pid); + ret = kill_pid_info(sig, info, find_pid(pid)); } + rcu_read_unlock(); + return ret; } /* @@ -1313,12 +1320,6 @@ int kill_pid(struct pid *pid, int sig, int priv) EXPORT_SYMBOL(kill_pid); int -kill_pg(pid_t pgrp, int sig, int priv) -{ - return kill_pg_info(sig, __si_special(priv), pgrp); -} - -int kill_proc(pid_t pid, int sig, int priv) { return kill_proc_info(sig, __si_special(priv), pid); @@ -1907,7 +1908,7 @@ relock: /* signals can be posted during this window */ - if (is_orphaned_pgrp(process_group(current))) + if (is_current_pgrp_orphaned()) goto relock; spin_lock_irq(¤t->sighand->siglock); @@ -1957,7 +1958,6 @@ EXPORT_SYMBOL(recalc_sigpending); EXPORT_SYMBOL_GPL(dequeue_signal); EXPORT_SYMBOL(flush_signals); EXPORT_SYMBOL(force_sig); -EXPORT_SYMBOL(kill_pg); EXPORT_SYMBOL(kill_proc); EXPORT_SYMBOL(ptrace_notify); EXPORT_SYMBOL(send_sig); @@ -2284,7 +2284,7 @@ static int do_tkill(int tgid, int pid, int sig) * @pid: the PID of the thread * @sig: signal to be sent * - * This syscall also checks the tgid and returns -ESRCH even if the PID + * This syscall also checks the @tgid and returns -ESRCH even if the PID * exists but it's not belonging to the target process anymore. This * method solves the problem of threads exiting and PIDs getting reused. */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 918e52d..8b75008 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -17,6 +17,7 @@ #include <linux/kthread.h> #include <linux/rcupdate.h> #include <linux/smp.h> +#include <linux/tick.h> #include <asm/irq.h> /* @@ -273,6 +274,18 @@ EXPORT_SYMBOL(do_softirq); #endif +/* + * Enter an interrupt context. + */ +void irq_enter(void) +{ + __irq_enter(); +#ifdef CONFIG_NO_HZ + if (idle_cpu(smp_processor_id())) + tick_nohz_update_jiffies(); +#endif +} + #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED # define invoke_softirq() __do_softirq() #else @@ -289,6 +302,12 @@ void irq_exit(void) sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); + +#ifdef CONFIG_NO_HZ + /* Make sure that timer wheel updates are propagated */ + if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) + tick_nohz_stop_sched_tick(); +#endif preempt_enable_no_resched(); } diff --git a/kernel/sys.c b/kernel/sys.c index 6e2101d..123b165 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -215,7 +215,7 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); * This routine uses RCU to synchronize with changes to the chain. * * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain + * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value @@ -313,7 +313,7 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain + * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value @@ -393,7 +393,7 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); * All locking must be provided by the caller. * * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then raw_notifier_call_chain + * with %NOTIFY_STOP_MASK then raw_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value @@ -487,7 +487,7 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain + * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value @@ -538,7 +538,7 @@ EXPORT_SYMBOL_GPL(srcu_init_notifier_head); * Registers a function with the list of functions * to be called at reboot time. * - * Currently always returns zero, as blocking_notifier_chain_register + * Currently always returns zero, as blocking_notifier_chain_register() * always returns zero. */ @@ -596,6 +596,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) struct task_struct *g, *p; struct user_struct *user; int error = -EINVAL; + struct pid *pgrp; if (which > 2 || which < 0) goto out; @@ -610,18 +611,21 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: - if (!who) - who = current->pid; - p = find_task_by_pid(who); + if (who) + p = find_task_by_pid(who); + else + p = current; if (p) error = set_one_prio(p, niceval, error); break; case PRIO_PGRP: - if (!who) - who = process_group(current); - do_each_task_pid(who, PIDTYPE_PGID, p) { + if (who) + pgrp = find_pid(who); + else + pgrp = task_pgrp(current); + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); - } while_each_task_pid(who, PIDTYPE_PGID, p); + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: user = current->user; @@ -656,6 +660,7 @@ asmlinkage long sys_getpriority(int which, int who) struct task_struct *g, *p; struct user_struct *user; long niceval, retval = -ESRCH; + struct pid *pgrp; if (which > 2 || which < 0) return -EINVAL; @@ -663,9 +668,10 @@ asmlinkage long sys_getpriority(int which, int who) read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: - if (!who) - who = current->pid; - p = find_task_by_pid(who); + if (who) + p = find_task_by_pid(who); + else + p = current; if (p) { niceval = 20 - task_nice(p); if (niceval > retval) @@ -673,13 +679,15 @@ asmlinkage long sys_getpriority(int which, int who) } break; case PRIO_PGRP: - if (!who) - who = process_group(current); - do_each_task_pid(who, PIDTYPE_PGID, p) { + if (who) + pgrp = find_pid(who); + else + pgrp = task_pgrp(current); + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; - } while_each_task_pid(who, PIDTYPE_PGID, p); + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: user = current->user; @@ -1388,7 +1396,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (p->real_parent == group_leader) { err = -EPERM; - if (process_session(p) != process_session(group_leader)) + if (task_session(p) != task_session(group_leader)) goto out; err = -EACCES; if (p->did_exec) @@ -1407,7 +1415,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) struct task_struct *g = find_task_by_pid_type(PIDTYPE_PGID, pgid); - if (!g || process_session(g) != process_session(group_leader)) + if (!g || task_session(g) != task_session(group_leader)) goto out; } @@ -1510,7 +1518,6 @@ asmlinkage long sys_setsid(void) spin_lock(&group_leader->sighand->siglock); group_leader->signal->tty = NULL; - group_leader->signal->tty_old_pgrp = 0; spin_unlock(&group_leader->sighand->siglock); err = process_group(group_leader); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 600b333..3ca1d5f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -90,12 +90,6 @@ extern char modprobe_path[]; #ifdef CONFIG_CHR_DEV_SG extern int sg_big_buff; #endif -#ifdef CONFIG_SYSVIPC -static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos); -#endif #ifdef __sparc__ extern char reboot_command []; @@ -135,22 +129,12 @@ static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *); #endif -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos); - -static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen); - -#ifdef CONFIG_SYSVIPC -static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen); -#endif #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_dointvec_taint(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); #endif static ctl_table root_table[]; @@ -174,59 +158,6 @@ extern ctl_table inotify_table[]; int sysctl_legacy_va_layout; #endif -static void *get_uts(ctl_table *table, int write) -{ - char *which = table->data; -#ifdef CONFIG_UTS_NS - struct uts_namespace *uts_ns = current->nsproxy->uts_ns; - which = (which - (char *)&init_uts_ns) + (char *)uts_ns; -#endif - if (!write) - down_read(&uts_sem); - else - down_write(&uts_sem); - return which; -} - -static void put_uts(ctl_table *table, int write, void *which) -{ - if (!write) - up_read(&uts_sem); - else - up_write(&uts_sem); -} - -#ifdef CONFIG_SYSVIPC -static void *get_ipc(ctl_table *table, int write) -{ - char *which = table->data; - struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; - which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; - return which; -} -#else -#define get_ipc(T,W) ((T)->data) -#endif - -/* /proc declarations: */ - -#ifdef CONFIG_PROC_SYSCTL - -static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); -static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); -static int proc_opensys(struct inode *, struct file *); - -const struct file_operations proc_sys_file_operations = { - .open = proc_opensys, - .read = proc_readsys, - .write = proc_writesys, -}; - -extern struct proc_dir_entry *proc_sys_root; - -static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *); -static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); -#endif /* The default sysctl tables: */ @@ -275,51 +206,6 @@ static ctl_table root_table[] = { static ctl_table kern_table[] = { { - .ctl_name = KERN_OSTYPE, - .procname = "ostype", - .data = init_uts_ns.name.sysname, - .maxlen = sizeof(init_uts_ns.name.sysname), - .mode = 0444, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_uts_string, - }, - { - .ctl_name = KERN_OSRELEASE, - .procname = "osrelease", - .data = init_uts_ns.name.release, - .maxlen = sizeof(init_uts_ns.name.release), - .mode = 0444, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_uts_string, - }, - { - .ctl_name = KERN_VERSION, - .procname = "version", - .data = init_uts_ns.name.version, - .maxlen = sizeof(init_uts_ns.name.version), - .mode = 0444, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_uts_string, - }, - { - .ctl_name = KERN_NODENAME, - .procname = "hostname", - .data = init_uts_ns.name.nodename, - .maxlen = sizeof(init_uts_ns.name.nodename), - .mode = 0644, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_uts_string, - }, - { - .ctl_name = KERN_DOMAINNAME, - .procname = "domainname", - .data = init_uts_ns.name.domainname, - .maxlen = sizeof(init_uts_ns.name.domainname), - .mode = 0644, - .proc_handler = &proc_do_uts_string, - .strategy = &sysctl_uts_string, - }, - { .ctl_name = KERN_PANIC, .procname = "panic", .data = &panic_timeout, @@ -344,14 +230,16 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, +#ifdef CONFIG_PROC_SYSCTL { .ctl_name = KERN_TAINTED, .procname = "tainted", .data = &tainted, .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec, + .mode = 0644, + .proc_handler = &proc_dointvec_taint, }, +#endif { .ctl_name = KERN_CAP_BSET, .procname = "cap-bound", @@ -473,71 +361,6 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#ifdef CONFIG_SYSVIPC - { - .ctl_name = KERN_SHMMAX, - .procname = "shmmax", - .data = &init_ipc_ns.shm_ctlmax, - .maxlen = sizeof (init_ipc_ns.shm_ctlmax), - .mode = 0644, - .proc_handler = &proc_ipc_doulongvec_minmax, - .strategy = sysctl_ipc_data, - }, - { - .ctl_name = KERN_SHMALL, - .procname = "shmall", - .data = &init_ipc_ns.shm_ctlall, - .maxlen = sizeof (init_ipc_ns.shm_ctlall), - .mode = 0644, - .proc_handler = &proc_ipc_doulongvec_minmax, - .strategy = sysctl_ipc_data, - }, - { - .ctl_name = KERN_SHMMNI, - .procname = "shmmni", - .data = &init_ipc_ns.shm_ctlmni, - .maxlen = sizeof (init_ipc_ns.shm_ctlmni), - .mode = 0644, - .proc_handler = &proc_ipc_dointvec, - .strategy = sysctl_ipc_data, - }, - { - .ctl_name = KERN_MSGMAX, - .procname = "msgmax", - .data = &init_ipc_ns.msg_ctlmax, - .maxlen = sizeof (init_ipc_ns.msg_ctlmax), - .mode = 0644, - .proc_handler = &proc_ipc_dointvec, - .strategy = sysctl_ipc_data, - }, - { - .ctl_name = KERN_MSGMNI, - .procname = "msgmni", - .data = &init_ipc_ns.msg_ctlmni, - .maxlen = sizeof (init_ipc_ns.msg_ctlmni), - .mode = 0644, - .proc_handler = &proc_ipc_dointvec, - .strategy = sysctl_ipc_data, - }, - { - .ctl_name = KERN_MSGMNB, - .procname = "msgmnb", - .data = &init_ipc_ns.msg_ctlmnb, - .maxlen = sizeof (init_ipc_ns.msg_ctlmnb), - .mode = 0644, - .proc_handler = &proc_ipc_dointvec, - .strategy = sysctl_ipc_data, - }, - { - .ctl_name = KERN_SEM, - .procname = "sem", - .data = &init_ipc_ns.sem_ctls, - .maxlen = 4*sizeof (int), - .mode = 0644, - .proc_handler = &proc_ipc_dointvec, - .strategy = sysctl_ipc_data, - }, -#endif #ifdef CONFIG_MAGIC_SYSRQ { .ctl_name = KERN_SYSRQ, @@ -1038,6 +861,12 @@ static ctl_table vm_table[] = { { .ctl_name = 0 } }; +#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) +static ctl_table binfmt_misc_table[] = { + { .ctl_name = 0 } +}; +#endif + static ctl_table fs_table[] = { { .ctl_name = FS_NRINODE, @@ -1161,6 +990,14 @@ static ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) + { + .ctl_name = CTL_UNNUMBERED, + .procname = "binfmt_misc", + .mode = 0555, + .child = binfmt_misc_table, + }, +#endif { .ctl_name = 0 } }; @@ -1172,8 +1009,6 @@ static ctl_table dev_table[] = { { .ctl_name = 0 } }; -extern void init_irq_proc (void); - static DEFINE_SPINLOCK(sysctl_lock); /* called under sysctl_lock */ @@ -1215,19 +1050,47 @@ static void start_unregistering(struct ctl_table_header *p) list_del_init(&p->ctl_entry); } -void __init sysctl_init(void) +void sysctl_head_finish(struct ctl_table_header *head) { -#ifdef CONFIG_PROC_SYSCTL - register_proc_table(root_table, proc_sys_root, &root_table_header); - init_irq_proc(); -#endif + if (!head) + return; + spin_lock(&sysctl_lock); + unuse_table(head); + spin_unlock(&sysctl_lock); +} + +struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) +{ + struct ctl_table_header *head; + struct list_head *tmp; + spin_lock(&sysctl_lock); + if (prev) { + tmp = &prev->ctl_entry; + unuse_table(prev); + goto next; + } + tmp = &root_table_header.ctl_entry; + for (;;) { + head = list_entry(tmp, struct ctl_table_header, ctl_entry); + + if (!use_table(head)) + goto next; + spin_unlock(&sysctl_lock); + return head; + next: + tmp = tmp->next; + if (tmp == &root_table_header.ctl_entry) + break; + } + spin_unlock(&sysctl_lock); + return NULL; } #ifdef CONFIG_SYSCTL_SYSCALL int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { - struct list_head *tmp; + struct ctl_table_header *head; int error = -ENOTDIR; if (nlen <= 0 || nlen >= CTL_MAXNAME) @@ -1237,26 +1100,16 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol if (!oldlenp || get_user(old_len, oldlenp)) return -EFAULT; } - spin_lock(&sysctl_lock); - tmp = &root_table_header.ctl_entry; - do { - struct ctl_table_header *head = - list_entry(tmp, struct ctl_table_header, ctl_entry); - - if (!use_table(head)) - continue; - - spin_unlock(&sysctl_lock); + for (head = sysctl_head_next(NULL); head; + head = sysctl_head_next(head)) { error = parse_table(name, nlen, oldval, oldlenp, newval, newlen, head->ctl_table); - - spin_lock(&sysctl_lock); - unuse_table(head); - if (error != -ENOTDIR) + if (error != -ENOTDIR) { + sysctl_head_finish(head); break; - } while ((tmp = tmp->next) != &root_table_header.ctl_entry); - spin_unlock(&sysctl_lock); + } + } return error; } @@ -1277,7 +1130,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args) #endif /* CONFIG_SYSCTL_SYSCALL */ /* - * ctl_perm does NOT grant the superuser all rights automatically, because + * sysctl_perm does NOT grant the superuser all rights automatically, because * some sysctl variables are readonly even to root. */ @@ -1292,7 +1145,7 @@ static int test_perm(int mode, int op) return -EACCES; } -static inline int ctl_perm(ctl_table *table, int op) +int sysctl_perm(ctl_table *table, int op) { int error; error = security_sysctl(table, op); @@ -1316,19 +1169,11 @@ repeat: for ( ; table->ctl_name || table->procname; table++) { if (!table->ctl_name) continue; - if (n == table->ctl_name || table->ctl_name == CTL_ANY) { + if (n == table->ctl_name) { int error; if (table->child) { - if (ctl_perm(table, 001)) + if (sysctl_perm(table, 001)) return -EPERM; - if (table->strategy) { - error = table->strategy( - table, name, nlen, - oldval, oldlenp, - newval, newlen); - if (error) - return error; - } name++; nlen--; table = table->child; @@ -1356,7 +1201,7 @@ int do_sysctl_strategy (ctl_table *table, op |= 004; if (newval) op |= 002; - if (ctl_perm(table, op)) + if (sysctl_perm(table, op)) return -EPERM; if (table->strategy) { @@ -1395,10 +1240,26 @@ int do_sysctl_strategy (ctl_table *table, } #endif /* CONFIG_SYSCTL_SYSCALL */ +static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) +{ + for (; table->ctl_name || table->procname; table++) { + table->parent = parent; + if (table->child) + sysctl_set_parent(table, table->child); + } +} + +static __init int sysctl_init(void) +{ + sysctl_set_parent(NULL, root_table); + return 0; +} + +core_initcall(sysctl_init); + /** * register_sysctl_table - register a sysctl hierarchy * @table: the top-level table structure - * @insert_at_head: whether the entry should be inserted in front or at the end * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. An entry with a ctl_name of 0 terminates the table. @@ -1464,8 +1325,7 @@ int do_sysctl_strategy (ctl_table *table, * This routine returns %NULL on a failure to register, and a pointer * to the table header on success. */ -struct ctl_table_header *register_sysctl_table(ctl_table * table, - int insert_at_head) +struct ctl_table_header *register_sysctl_table(ctl_table * table) { struct ctl_table_header *tmp; tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); @@ -1475,15 +1335,10 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table, INIT_LIST_HEAD(&tmp->ctl_entry); tmp->used = 0; tmp->unregistering = NULL; + sysctl_set_parent(NULL, table); spin_lock(&sysctl_lock); - if (insert_at_head) - list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); - else - list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); + list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); spin_unlock(&sysctl_lock); -#ifdef CONFIG_PROC_SYSCTL - register_proc_table(table, proc_sys_root, tmp); -#endif return tmp; } @@ -1499,9 +1354,6 @@ void unregister_sysctl_table(struct ctl_table_header * header) might_sleep(); spin_lock(&sysctl_lock); start_unregistering(header); -#ifdef CONFIG_PROC_SYSCTL - unregister_proc_table(header->ctl_table, proc_sys_root); -#endif spin_unlock(&sysctl_lock); kfree(header); } @@ -1525,155 +1377,6 @@ void unregister_sysctl_table(struct ctl_table_header * table) #ifdef CONFIG_PROC_SYSCTL -/* Scan the sysctl entries in table and add them all into /proc */ -static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set) -{ - struct proc_dir_entry *de; - int len; - mode_t mode; - - for (; table->ctl_name || table->procname; table++) { - /* Can't do anything without a proc name. */ - if (!table->procname) - continue; - /* Maybe we can't do anything with it... */ - if (!table->proc_handler && !table->child) { - printk(KERN_WARNING "SYSCTL: Can't register %s\n", - table->procname); - continue; - } - - len = strlen(table->procname); - mode = table->mode; - - de = NULL; - if (table->proc_handler) - mode |= S_IFREG; - else { - mode |= S_IFDIR; - for (de = root->subdir; de; de = de->next) { - if (proc_match(len, table->procname, de)) - break; - } - /* If the subdir exists already, de is non-NULL */ - } - - if (!de) { - de = create_proc_entry(table->procname, mode, root); - if (!de) - continue; - de->set = set; - de->data = (void *) table; - if (table->proc_handler) - de->proc_fops = &proc_sys_file_operations; - } - table->de = de; - if (de->mode & S_IFDIR) - register_proc_table(table->child, de, set); - } -} - -/* - * Unregister a /proc sysctl table and any subdirectories. - */ -static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) -{ - struct proc_dir_entry *de; - for (; table->ctl_name || table->procname; table++) { - if (!(de = table->de)) - continue; - if (de->mode & S_IFDIR) { - if (!table->child) { - printk (KERN_ALERT "Help - malformed sysctl tree on free\n"); - continue; - } - unregister_proc_table(table->child, de); - - /* Don't unregister directories which still have entries.. */ - if (de->subdir) - continue; - } - - /* - * In any case, mark the entry as goner; we'll keep it - * around if it's busy, but we'll know to do nothing with - * its fields. We are under sysctl_lock here. - */ - de->data = NULL; - - /* Don't unregister proc entries that are still being used.. */ - if (atomic_read(&de->count)) - continue; - - table->de = NULL; - remove_proc_entry(table->procname, root); - } -} - -static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, - size_t count, loff_t *ppos) -{ - int op; - struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode); - struct ctl_table *table; - size_t res; - ssize_t error = -ENOTDIR; - - spin_lock(&sysctl_lock); - if (de && de->data && use_table(de->set)) { - /* - * at that point we know that sysctl was not unregistered - * and won't be until we finish - */ - spin_unlock(&sysctl_lock); - table = (struct ctl_table *) de->data; - if (!table || !table->proc_handler) - goto out; - error = -EPERM; - op = (write ? 002 : 004); - if (ctl_perm(table, op)) - goto out; - - /* careful: calling conventions are nasty here */ - res = count; - error = (*table->proc_handler)(table, write, file, - buf, &res, ppos); - if (!error) - error = res; - out: - spin_lock(&sysctl_lock); - unuse_table(de->set); - } - spin_unlock(&sysctl_lock); - return error; -} - -static int proc_opensys(struct inode *inode, struct file *file) -{ - if (file->f_mode & FMODE_WRITE) { - /* - * sysctl entries that are not writable, - * are _NOT_ writable, capabilities or not. - */ - if (!(inode->i_mode & S_IWUSR)) - return -EPERM; - } - - return 0; -} - -static ssize_t proc_readsys(struct file * file, char __user * buf, - size_t count, loff_t *ppos) -{ - return do_rw_proc(0, file, buf, count, ppos); -} - -static ssize_t proc_writesys(struct file * file, const char __user * buf, - size_t count, loff_t *ppos) -{ - return do_rw_proc(1, file, (char __user *) buf, count, ppos); -} - static int _proc_do_string(void* data, int maxlen, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -1681,13 +1384,12 @@ static int _proc_do_string(void* data, int maxlen, int write, size_t len; char __user *p; char c; - - if (!data || !maxlen || !*lenp || - (*ppos && !write)) { + + if (!data || !maxlen || !*lenp) { *lenp = 0; return 0; } - + if (write) { len = 0; p = buffer; @@ -1708,6 +1410,15 @@ static int _proc_do_string(void* data, int maxlen, int write, len = strlen(data); if (len > maxlen) len = maxlen; + + if (*ppos > len) { + *lenp = 0; + return 0; + } + + data += *ppos; + len -= *ppos; + if (len > *lenp) len = *lenp; if (len) @@ -1749,21 +1460,6 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, buffer, lenp, ppos); } -/* - * Special case of dostring for the UTS structure. This has locks - * to observe. Should this be in kernel/sys.c ???? - */ - -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int r; - void *which; - which = get_uts(table, write); - r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos); - put_uts(table, write, which); - return r; -} static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, int *valp, @@ -1927,6 +1623,7 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp, #define OP_SET 0 #define OP_AND 1 +#define OP_OR 2 static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, int *valp, @@ -1938,6 +1635,7 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, switch(op) { case OP_SET: *valp = val; break; case OP_AND: *valp &= val; break; + case OP_OR: *valp |= val; break; } } else { int val = *valp; @@ -1961,7 +1659,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, { int op; - if (!capable(CAP_SYS_MODULE)) { + if (write && !capable(CAP_SYS_MODULE)) { return -EPERM; } @@ -1970,6 +1668,22 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, do_proc_dointvec_bset_conv,&op); } +/* + * Taint values can only be increased + */ +static int proc_dointvec_taint(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int op; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + op = OP_OR; + return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + do_proc_dointvec_bset_conv,&op); +} + struct do_proc_dointvec_minmax_conv_param { int *min; int *max; @@ -2331,27 +2045,6 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, do_proc_dointvec_ms_jiffies_conv, NULL); } -#ifdef CONFIG_SYSVIPC -static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - void *which; - which = get_ipc(table, write); - return __do_proc_dointvec(which, table, write, filp, buffer, - lenp, ppos, NULL, NULL); -} - -static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) -{ - void *which; - which = get_ipc(table, write); - return __do_proc_doulongvec_minmax(which, table, write, filp, buffer, - lenp, ppos, 1l, 1l); -} - -#endif - static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2382,31 +2075,6 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, return -ENOSYS; } -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - -#ifdef CONFIG_SYSVIPC -static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} -static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} -static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, - struct file *filp, void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} -#endif - int proc_dointvec(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2553,17 +2221,23 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { - if (oldval) { + if (oldval && oldlenp) { size_t olen; - if (oldlenp) { - if (get_user(olen, oldlenp)) + + if (get_user(olen, oldlenp)) + return -EFAULT; + if (olen) { + int val; + + if (olen < sizeof(int)) + return -EINVAL; + + val = *(int *)(table->data) / HZ; + if (put_user(val, (int __user *)oldval)) + return -EFAULT; + if (put_user(sizeof(int), oldlenp)) return -EFAULT; - if (olen!=sizeof(int)) - return -EINVAL; } - if (put_user(*(int *)(table->data)/HZ, (int __user *)oldval) || - (oldlenp && put_user(sizeof(int),oldlenp))) - return -EFAULT; } if (newval && newlen) { int new; @@ -2581,17 +2255,23 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { - if (oldval) { + if (oldval && oldlenp) { size_t olen; - if (oldlenp) { - if (get_user(olen, oldlenp)) + + if (get_user(olen, oldlenp)) + return -EFAULT; + if (olen) { + int val; + + if (olen < sizeof(int)) + return -EINVAL; + + val = jiffies_to_msecs(*(int *)(table->data)); + if (put_user(val, (int __user *)oldval)) + return -EFAULT; + if (put_user(sizeof(int), oldlenp)) return -EFAULT; - if (olen!=sizeof(int)) - return -EINVAL; } - if (put_user(jiffies_to_msecs(*(int *)(table->data)), (int __user *)oldval) || - (oldlenp && put_user(sizeof(int),oldlenp))) - return -EFAULT; } if (newval && newlen) { int new; @@ -2605,62 +2285,6 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, } -/* The generic string strategy routine: */ -static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - struct ctl_table uts_table; - int r, write; - write = newval && newlen; - memcpy(&uts_table, table, sizeof(uts_table)); - uts_table.data = get_uts(table, write); - r = sysctl_string(&uts_table, name, nlen, - oldval, oldlenp, newval, newlen); - put_uts(table, write, uts_table.data); - return r; -} - -#ifdef CONFIG_SYSVIPC -/* The generic sysctl ipc data routine. */ -static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - size_t len; - void *data; - - /* Get out of I don't have a variable */ - if (!table->data || !table->maxlen) - return -ENOTDIR; - - data = get_ipc(table, 1); - if (!data) - return -ENOTDIR; - - if (oldval && oldlenp) { - if (get_user(len, oldlenp)) - return -EFAULT; - if (len) { - if (len > table->maxlen) - len = table->maxlen; - if (copy_to_user(oldval, data, len)) - return -EFAULT; - if (put_user(len, oldlenp)) - return -EFAULT; - } - } - - if (newval && newlen) { - if (newlen > table->maxlen) - newlen = table->maxlen; - - if (copy_from_user(data, newval, newlen)) - return -EFAULT; - } - return 1; -} -#endif #else /* CONFIG_SYSCTL_SYSCALL */ @@ -2726,18 +2350,6 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, return -ENOSYS; } -static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} -static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} #endif /* CONFIG_SYSCTL_SYSCALL */ /* diff --git a/kernel/time.c b/kernel/time.c index 0e017bff..c6c80ea5 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -470,6 +470,260 @@ struct timeval ns_to_timeval(const s64 nsec) return tv; } +/* + * Convert jiffies to milliseconds and back. + * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +unsigned int jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else + return (j * MSEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int jiffies_to_usecs(const unsigned long j) +{ +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); +#else + return (j * USEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_usecs); + +/* + * When we convert to jiffies then we interpret incoming values + * the following way: + * + * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. + * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor + * + * We must also be careful about 32-bit overflows. + */ +unsigned long msecs_to_jiffies(const unsigned int m) +{ + /* + * Negative value, means infinite timeout: + */ + if ((int)m < 0) + return MAX_JIFFY_OFFSET; + +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + /* + * HZ is equal to or smaller than 1000, and 1000 is a nice + * round multiple of HZ, divide with the factor between them, + * but round upwards: + */ + return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + /* + * HZ is larger than 1000, and HZ is a nice round multiple of + * 1000 - simply multiply with the factor between them. + * + * But first make sure the multiplication result cannot + * overflow: + */ + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return m * (HZ / MSEC_PER_SEC); +#else + /* + * Generic case - multiply, round and divide. But first + * check that if we are doing a net multiplication, that + * we wouldnt overflow: + */ + if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC; +#endif +} +EXPORT_SYMBOL(msecs_to_jiffies); + +unsigned long usecs_to_jiffies(const unsigned int u) +{ + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return u * (HZ / USEC_PER_SEC); +#else + return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; +#endif +} +EXPORT_SYMBOL(usecs_to_jiffies); + +/* + * The TICK_NSEC - 1 rounds up the value to the next resolution. Note + * that a remainder subtract here would not do the right thing as the + * resolution values don't fall on second boundries. I.e. the line: + * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * + * Rather, we just shift the bits off the right. + * + * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec + * value to a scaled second value. + */ +unsigned long +timespec_to_jiffies(const struct timespec *value) +{ + unsigned long sec = value->tv_sec; + long nsec = value->tv_nsec + TICK_NSEC - 1; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + nsec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)nsec * NSEC_CONVERSION) >> + (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + +} +EXPORT_SYMBOL(timespec_to_jiffies); + +void +jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u64 nsec = (u64)jiffies * TICK_NSEC; + value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); +} +EXPORT_SYMBOL(jiffies_to_timespec); + +/* Same for "timeval" + * + * Well, almost. The problem here is that the real system resolution is + * in nanoseconds and the value being converted is in micro seconds. + * Also for some machines (those that use HZ = 1024, in-particular), + * there is a LARGE error in the tick size in microseconds. + + * The solution we use is to do the rounding AFTER we convert the + * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. + * Instruction wise, this should cost only an additional add with carry + * instruction above the way it was done above. + */ +unsigned long +timeval_to_jiffies(const struct timeval *value) +{ + unsigned long sec = value->tv_sec; + long usec = value->tv_usec; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + usec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> + (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; +} + +void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u64 nsec = (u64)jiffies * TICK_NSEC; + long tv_usec; + + value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec); + tv_usec /= NSEC_PER_USEC; + value->tv_usec = tv_usec; +} + +/* + * Convert jiffies/jiffies_64 to clock_t and back. + */ +clock_t jiffies_to_clock_t(long x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 + return x / (HZ / USER_HZ); +#else + u64 tmp = (u64)x * TICK_NSEC; + do_div(tmp, (NSEC_PER_SEC / USER_HZ)); + return (long)tmp; +#endif +} +EXPORT_SYMBOL(jiffies_to_clock_t); + +unsigned long clock_t_to_jiffies(unsigned long x) +{ +#if (HZ % USER_HZ)==0 + if (x >= ~0UL / (HZ / USER_HZ)) + return ~0UL; + return x * (HZ / USER_HZ); +#else + u64 jif; + + /* Don't worry about loss of precision here .. */ + if (x >= ~0UL / HZ * USER_HZ) + return ~0UL; + + /* .. but do try to contain it here */ + jif = x * (u64) HZ; + do_div(jif, USER_HZ); + return jif; +#endif +} +EXPORT_SYMBOL(clock_t_to_jiffies); + +u64 jiffies_64_to_clock_t(u64 x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 + do_div(x, HZ / USER_HZ); +#else + /* + * There are better ways that don't overflow early, + * but even this doesn't overflow in hundreds of years + * in 64 bits, so.. + */ + x *= TICK_NSEC; + do_div(x, (NSEC_PER_SEC / USER_HZ)); +#endif + return x; +} + +EXPORT_SYMBOL(jiffies_64_to_clock_t); + +u64 nsec_to_clock_t(u64 x) +{ +#if (NSEC_PER_SEC % USER_HZ) == 0 + do_div(x, (NSEC_PER_SEC / USER_HZ)); +#elif (USER_HZ % 512) == 0 + x *= USER_HZ/512; + do_div(x, (NSEC_PER_SEC / 512)); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, + * overflow after 64.99 years. + * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... + */ + x *= 9; + do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) / + USER_HZ)); +#endif + return x; +} + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig new file mode 100644 index 0000000..f663511 --- /dev/null +++ b/kernel/time/Kconfig @@ -0,0 +1,25 @@ +# +# Timer subsystem related configuration options +# +config TICK_ONESHOT + bool + default n + +config NO_HZ + bool "Tickless System (Dynamic Ticks)" + depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + help + This option enables a tickless system: timer interrupts will + only trigger on an as-needed basis both when the system is + busy and when the system is idle. + +config HIGH_RES_TIMERS + bool "High Resolution Timer Support" + depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + help + This option enables high resolution timer support. If your + hardware is not capable then this option only increases + the size of the kernel image. + diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 61a3907..93bccba 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1 +1,8 @@ -obj-y += ntp.o clocksource.o jiffies.o +obj-y += ntp.o clocksource.o jiffies.o timer_list.o + +obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o +obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o +obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o +obj-$(CONFIG_TIMER_STATS) += timer_stats.o diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c new file mode 100644 index 0000000..67932ea --- /dev/null +++ b/kernel/time/clockevents.c @@ -0,0 +1,345 @@ +/* + * linux/kernel/time/clockevents.c + * + * This file contains functions which manage clock event devices. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ + +#include <linux/clockchips.h> +#include <linux/hrtimer.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/smp.h> +#include <linux/sysdev.h> + +/* The registered clock event devices */ +static LIST_HEAD(clockevent_devices); +static LIST_HEAD(clockevents_released); + +/* Notification for clock events */ +static RAW_NOTIFIER_HEAD(clockevents_chain); + +/* Protection for the above */ +static DEFINE_SPINLOCK(clockevents_lock); + +/** + * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds + * @latch: value to convert + * @evt: pointer to clock event device descriptor + * + * Math helper, returns latch value converted to nanoseconds (bound checked) + */ +unsigned long clockevent_delta2ns(unsigned long latch, + struct clock_event_device *evt) +{ + u64 clc = ((u64) latch << evt->shift); + + do_div(clc, evt->mult); + if (clc < 1000) + clc = 1000; + if (clc > LONG_MAX) + clc = LONG_MAX; + + return (unsigned long) clc; +} + +/** + * clockevents_set_mode - set the operating mode of a clock event device + * @dev: device to modify + * @mode: new mode + * + * Must be called with interrupts disabled ! + */ +void clockevents_set_mode(struct clock_event_device *dev, + enum clock_event_mode mode) +{ + if (dev->mode != mode) { + dev->set_mode(mode, dev); + dev->mode = mode; + } +} + +/** + * clockevents_program_event - Reprogram the clock event device. + * @expires: absolute expiry time (monotonic clock) + * + * Returns 0 on success, -ETIME when the event is in the past. + */ +int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, + ktime_t now) +{ + unsigned long long clc; + int64_t delta; + + delta = ktime_to_ns(ktime_sub(expires, now)); + + if (delta <= 0) + return -ETIME; + + dev->next_event = expires; + + if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + return 0; + + if (delta > dev->max_delta_ns) + delta = dev->max_delta_ns; + if (delta < dev->min_delta_ns) + delta = dev->min_delta_ns; + + clc = delta * dev->mult; + clc >>= dev->shift; + + return dev->set_next_event((unsigned long) clc, dev); +} + +/** + * clockevents_register_notifier - register a clock events change listener + */ +int clockevents_register_notifier(struct notifier_block *nb) +{ + int ret; + + spin_lock(&clockevents_lock); + ret = raw_notifier_chain_register(&clockevents_chain, nb); + spin_unlock(&clockevents_lock); + + return ret; +} + +/** + * clockevents_unregister_notifier - unregister a clock events change listener + */ +void clockevents_unregister_notifier(struct notifier_block *nb) +{ + spin_lock(&clockevents_lock); + raw_notifier_chain_unregister(&clockevents_chain, nb); + spin_unlock(&clockevents_lock); +} + +/* + * Notify about a clock event change. Called with clockevents_lock + * held. + */ +static void clockevents_do_notify(unsigned long reason, void *dev) +{ + raw_notifier_call_chain(&clockevents_chain, reason, dev); +} + +/* + * Called after a notify add to make devices availble which were + * released from the notifier call. + */ +static void clockevents_notify_released(void) +{ + struct clock_event_device *dev; + + while (!list_empty(&clockevents_released)) { + dev = list_entry(clockevents_released.next, + struct clock_event_device, list); + list_del(&dev->list); + list_add(&dev->list, &clockevent_devices); + clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + } +} + +/** + * clockevents_register_device - register a clock event device + * @dev: device to register + */ +void clockevents_register_device(struct clock_event_device *dev) +{ + BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + + spin_lock(&clockevents_lock); + + list_add(&dev->list, &clockevent_devices); + clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); + clockevents_notify_released(); + + spin_unlock(&clockevents_lock); +} + +/* + * Noop handler when we shut down an event device + */ +static void clockevents_handle_noop(struct clock_event_device *dev) +{ +} + +/** + * clockevents_exchange_device - release and request clock devices + * @old: device to release (can be NULL) + * @new: device to request (can be NULL) + * + * Called from the notifier chain. clockevents_lock is held already + */ +void clockevents_exchange_device(struct clock_event_device *old, + struct clock_event_device *new) +{ + unsigned long flags; + + local_irq_save(flags); + /* + * Caller releases a clock event device. We queue it into the + * released list and do a notify add later. + */ + if (old) { + old->event_handler = clockevents_handle_noop; + clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); + list_del(&old->list); + list_add(&old->list, &clockevents_released); + } + + if (new) { + BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); + clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); + } + local_irq_restore(flags); +} + +/** + * clockevents_request_device + */ +struct clock_event_device *clockevents_request_device(unsigned int features, + cpumask_t cpumask) +{ + struct clock_event_device *cur, *dev = NULL; + struct list_head *tmp; + + spin_lock(&clockevents_lock); + + list_for_each(tmp, &clockevent_devices) { + cur = list_entry(tmp, struct clock_event_device, list); + + if ((cur->features & features) == features && + cpus_equal(cpumask, cur->cpumask)) { + if (!dev || dev->rating < cur->rating) + dev = cur; + } + } + + clockevents_exchange_device(NULL, dev); + + spin_unlock(&clockevents_lock); + + return dev; +} + +/** + * clockevents_release_device + */ +void clockevents_release_device(struct clock_event_device *dev) +{ + spin_lock(&clockevents_lock); + + clockevents_exchange_device(dev, NULL); + clockevents_notify_released(); + + spin_unlock(&clockevents_lock); +} + +/** + * clockevents_notify - notification about relevant events + */ +void clockevents_notify(unsigned long reason, void *arg) +{ + spin_lock(&clockevents_lock); + clockevents_do_notify(reason, arg); + + switch (reason) { + case CLOCK_EVT_NOTIFY_CPU_DEAD: + /* + * Unregister the clock event devices which were + * released from the users in the notify chain. + */ + while (!list_empty(&clockevents_released)) { + struct clock_event_device *dev; + + dev = list_entry(clockevents_released.next, + struct clock_event_device, list); + list_del(&dev->list); + } + break; + default: + break; + } + spin_unlock(&clockevents_lock); +} +EXPORT_SYMBOL_GPL(clockevents_notify); + +#ifdef CONFIG_SYSFS + +/** + * clockevents_show_registered - sysfs interface for listing clockevents + * @dev: unused + * @buf: char buffer to be filled with clock events list + * + * Provides sysfs interface for listing registered clock event devices + */ +static ssize_t clockevents_show_registered(struct sys_device *dev, char *buf) +{ + struct list_head *tmp; + char *p = buf; + int cpu; + + spin_lock(&clockevents_lock); + + list_for_each(tmp, &clockevent_devices) { + struct clock_event_device *ce; + + ce = list_entry(tmp, struct clock_event_device, list); + p += sprintf(p, "%-20s F:%04x M:%d", ce->name, + ce->features, ce->mode); + p += sprintf(p, " C:"); + if (!cpus_equal(ce->cpumask, cpu_possible_map)) { + for_each_cpu_mask(cpu, ce->cpumask) + p += sprintf(p, " %d", cpu); + } else { + /* + * FIXME: Add the cpu which is handling this sucker + */ + } + p += sprintf(p, "\n"); + } + + spin_unlock(&clockevents_lock); + + return p - buf; +} + +/* + * Sysfs setup bits: + */ +static SYSDEV_ATTR(registered, 0600, + clockevents_show_registered, NULL); + +static struct sysdev_class clockevents_sysclass = { + set_kset_name("clockevents"), +}; + +static struct sys_device clockevents_sys_device = { + .id = 0, + .cls = &clockevents_sysclass, +}; + +static int __init clockevents_sysfs_init(void) +{ + int error = sysdev_class_register(&clockevents_sysclass); + + if (!error) + error = sysdev_register(&clockevents_sys_device); + if (!error) + error = sysdev_create_file( + &clockevents_sys_device, + &attr_registered); + return error; +} +device_initcall(clockevents_sysfs_init); +#endif diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 22504af..193a079 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -28,6 +28,8 @@ #include <linux/sysdev.h> #include <linux/init.h> #include <linux/module.h> +#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ +#include <linux/tick.h> /* XXX - Would like a better way for initializing curr_clocksource */ extern struct clocksource clocksource_jiffies; @@ -47,6 +49,7 @@ extern struct clocksource clocksource_jiffies; */ static struct clocksource *curr_clocksource = &clocksource_jiffies; static struct clocksource *next_clocksource; +static struct clocksource *clocksource_override; static LIST_HEAD(clocksource_list); static DEFINE_SPINLOCK(clocksource_lock); static char override_name[32]; @@ -61,9 +64,123 @@ static int __init clocksource_done_booting(void) finished_booting = 1; return 0; } - late_initcall(clocksource_done_booting); +#ifdef CONFIG_CLOCKSOURCE_WATCHDOG +static LIST_HEAD(watchdog_list); +static struct clocksource *watchdog; +static struct timer_list watchdog_timer; +static DEFINE_SPINLOCK(watchdog_lock); +static cycle_t watchdog_last; +/* + * Interval: 0.5sec Treshold: 0.0625s + */ +#define WATCHDOG_INTERVAL (HZ >> 1) +#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4) + +static void clocksource_ratewd(struct clocksource *cs, int64_t delta) +{ + if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD) + return; + + printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", + cs->name, delta); + cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); + clocksource_change_rating(cs, 0); + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; + list_del(&cs->wd_list); +} + +static void clocksource_watchdog(unsigned long data) +{ + struct clocksource *cs, *tmp; + cycle_t csnow, wdnow; + int64_t wd_nsec, cs_nsec; + + spin_lock(&watchdog_lock); + + wdnow = watchdog->read(); + wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); + watchdog_last = wdnow; + + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { + csnow = cs->read(); + /* Initialized ? */ + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { + if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && + (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + /* + * We just marked the clocksource as + * highres-capable, notify the rest of the + * system as well so that we transition + * into high-res mode: + */ + tick_clock_notify(); + } + cs->flags |= CLOCK_SOURCE_WATCHDOG; + cs->wd_last = csnow; + } else { + cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); + cs->wd_last = csnow; + /* Check the delta. Might remove from the list ! */ + clocksource_ratewd(cs, cs_nsec - wd_nsec); + } + } + + if (!list_empty(&watchdog_list)) { + __mod_timer(&watchdog_timer, + watchdog_timer.expires + WATCHDOG_INTERVAL); + } + spin_unlock(&watchdog_lock); +} +static void clocksource_check_watchdog(struct clocksource *cs) +{ + struct clocksource *cse; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + int started = !list_empty(&watchdog_list); + + list_add(&cs->wd_list, &watchdog_list); + if (!started && watchdog) { + watchdog_last = watchdog->read(); + watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; + add_timer(&watchdog_timer); + } + } else if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) { + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + + if (!watchdog || cs->rating > watchdog->rating) { + if (watchdog) + del_timer(&watchdog_timer); + watchdog = cs; + init_timer(&watchdog_timer); + watchdog_timer.function = clocksource_watchdog; + + /* Reset watchdog cycles */ + list_for_each_entry(cse, &watchdog_list, wd_list) + cse->flags &= ~CLOCK_SOURCE_WATCHDOG; + /* Start if list is not empty */ + if (!list_empty(&watchdog_list)) { + watchdog_last = watchdog->read(); + watchdog_timer.expires = + jiffies + WATCHDOG_INTERVAL; + add_timer(&watchdog_timer); + } + } + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} +#else +static void clocksource_check_watchdog(struct clocksource *cs) +{ + if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; +} +#endif + /** * clocksource_get_next - Returns the selected clocksource * @@ -83,60 +200,54 @@ struct clocksource *clocksource_get_next(void) } /** - * select_clocksource - Finds the best registered clocksource. + * select_clocksource - Selects the best registered clocksource. * * Private function. Must hold clocksource_lock when called. * - * Looks through the list of registered clocksources, returning - * the one with the highest rating value. If there is a clocksource - * name that matches the override string, it returns that clocksource. + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. */ static struct clocksource *select_clocksource(void) { - struct clocksource *best = NULL; - struct list_head *tmp; + struct clocksource *next; - list_for_each(tmp, &clocksource_list) { - struct clocksource *src; + if (list_empty(&clocksource_list)) + return NULL; - src = list_entry(tmp, struct clocksource, list); - if (!best) - best = src; - - /* check for override: */ - if (strlen(src->name) == strlen(override_name) && - !strcmp(src->name, override_name)) { - best = src; - break; - } - /* pick the highest rating: */ - if (src->rating > best->rating) - best = src; - } + if (clocksource_override) + next = clocksource_override; + else + next = list_entry(clocksource_list.next, struct clocksource, + list); + + if (next == curr_clocksource) + return NULL; - return best; + return next; } -/** - * is_registered_source - Checks if clocksource is registered - * @c: pointer to a clocksource - * - * Private helper function. Must hold clocksource_lock when called. - * - * Returns one if the clocksource is already registered, zero otherwise. +/* + * Enqueue the clocksource sorted by rating */ -static int is_registered_source(struct clocksource *c) +static int clocksource_enqueue(struct clocksource *c) { - int len = strlen(c->name); - struct list_head *tmp; + struct list_head *tmp, *entry = &clocksource_list; list_for_each(tmp, &clocksource_list) { - struct clocksource *src; - - src = list_entry(tmp, struct clocksource, list); - if (strlen(src->name) == len && !strcmp(src->name, c->name)) - return 1; + struct clocksource *cs; + + cs = list_entry(tmp, struct clocksource, list); + if (cs == c) + return -EBUSY; + /* Keep track of the place, where to insert */ + if (cs->rating >= c->rating) + entry = tmp; } + list_add(&c->list, entry); + + if (strlen(c->name) == strlen(override_name) && + !strcmp(c->name, override_name)) + clocksource_override = c; return 0; } @@ -149,42 +260,35 @@ static int is_registered_source(struct clocksource *c) */ int clocksource_register(struct clocksource *c) { - int ret = 0; unsigned long flags; + int ret; spin_lock_irqsave(&clocksource_lock, flags); - /* check if clocksource is already registered */ - if (is_registered_source(c)) { - printk("register_clocksource: Cannot register %s. " - "Already registered!", c->name); - ret = -EBUSY; - } else { - /* register it */ - list_add(&c->list, &clocksource_list); - /* scan the registered clocksources, and pick the best one */ + ret = clocksource_enqueue(c); + if (!ret) next_clocksource = select_clocksource(); - } spin_unlock_irqrestore(&clocksource_lock, flags); + if (!ret) + clocksource_check_watchdog(c); return ret; } EXPORT_SYMBOL(clocksource_register); /** - * clocksource_reselect - Rescan list for next clocksource + * clocksource_change_rating - Change the rating of a registered clocksource * - * A quick helper function to be used if a clocksource changes its - * rating. Forces the clocksource list to be re-scanned for the best - * clocksource. */ -void clocksource_reselect(void) +void clocksource_change_rating(struct clocksource *cs, int rating) { unsigned long flags; spin_lock_irqsave(&clocksource_lock, flags); + list_del(&cs->list); + cs->rating = rating; + clocksource_enqueue(cs); next_clocksource = select_clocksource(); spin_unlock_irqrestore(&clocksource_lock, flags); } -EXPORT_SYMBOL(clocksource_reselect); #ifdef CONFIG_SYSFS /** @@ -220,7 +324,11 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf) static ssize_t sysfs_override_clocksource(struct sys_device *dev, const char *buf, size_t count) { + struct clocksource *ovr = NULL; + struct list_head *tmp; size_t ret = count; + int len; + /* strings from sysfs write are not 0 terminated! */ if (count >= sizeof(override_name)) return -EINVAL; @@ -228,17 +336,32 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, /* strip of \n: */ if (buf[count-1] == '\n') count--; - if (count < 1) - return -EINVAL; spin_lock_irq(&clocksource_lock); - /* copy the name given: */ - memcpy(override_name, buf, count); + if (count > 0) + memcpy(override_name, buf, count); override_name[count] = 0; - /* try to select it: */ - next_clocksource = select_clocksource(); + len = strlen(override_name); + if (len) { + ovr = clocksource_override; + /* try to select it: */ + list_for_each(tmp, &clocksource_list) { + struct clocksource *cs; + + cs = list_entry(tmp, struct clocksource, list); + if (strlen(cs->name) == len && + !strcmp(cs->name, override_name)) + ovr = cs; + } + } + + /* Reselect, when the override name has changed */ + if (ovr != clocksource_override) { + clocksource_override = ovr; + next_clocksource = select_clocksource(); + } spin_unlock_irq(&clocksource_lock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a99b2a6..3be8da8 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -62,7 +62,6 @@ struct clocksource clocksource_jiffies = { .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, - .is_continuous = 0, /* tick based, not free running */ }; static int __init init_jiffies_clocksource(void) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 3afeaa3..eb12509 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -24,7 +24,7 @@ static u64 tick_length, tick_length_base; #define MAX_TICKADJ 500 /* microsecs */ #define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ - TICK_LENGTH_SHIFT) / HZ) + TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ) /* * phase-lock loop variables @@ -46,13 +46,17 @@ long time_adjust; static void ntp_update_frequency(void) { - tick_length_base = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; - tick_length_base += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; - tick_length_base += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); + u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) + << TICK_LENGTH_SHIFT; + second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; + second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); - do_div(tick_length_base, HZ); + tick_length_base = second_length; - tick_nsec = tick_length_base >> TICK_LENGTH_SHIFT; + do_div(second_length, HZ); + tick_nsec = second_length >> TICK_LENGTH_SHIFT; + + do_div(tick_length_base, NTP_INTERVAL_FREQ); } /** @@ -162,7 +166,7 @@ void second_overflow(void) tick_length -= MAX_TICKADJ_SCALED; } else { tick_length += (s64)(time_adjust * NSEC_PER_USEC / - HZ) << TICK_LENGTH_SHIFT; + NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT; time_adjust = 0; } } @@ -239,7 +243,8 @@ int do_adjtimex(struct timex *txc) result = -EINVAL; goto leave; } - time_freq = ((s64)txc->freq * NSEC_PER_USEC) >> (SHIFT_USEC - SHIFT_NSEC); + time_freq = ((s64)txc->freq * NSEC_PER_USEC) + >> (SHIFT_USEC - SHIFT_NSEC); } if (txc->modes & ADJ_MAXERROR) { @@ -309,7 +314,8 @@ int do_adjtimex(struct timex *txc) freq_adj += time_freq; freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); - time_offset = (time_offset / HZ) << SHIFT_UPDATE; + time_offset = (time_offset / NTP_INTERVAL_FREQ) + << SHIFT_UPDATE; } /* STA_PLL */ } /* txc->modes & ADJ_OFFSET */ if (txc->modes & ADJ_TICK) @@ -324,8 +330,10 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) txc->offset = save_adjust; else - txc->offset = shift_right(time_offset, SHIFT_UPDATE) * HZ / 1000; - txc->freq = (time_freq / NSEC_PER_USEC) << (SHIFT_USEC - SHIFT_NSEC); + txc->offset = shift_right(time_offset, SHIFT_UPDATE) + * NTP_INTERVAL_FREQ / 1000; + txc->freq = (time_freq / NSEC_PER_USEC) + << (SHIFT_USEC - SHIFT_NSEC); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c new file mode 100644 index 0000000..12b3efe --- /dev/null +++ b/kernel/time/tick-broadcast.c @@ -0,0 +1,480 @@ +/* + * linux/kernel/time/tick-broadcast.c + * + * This file contains functions which emulate a local clock-event + * device via a broadcast event source. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/irq.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/tick.h> + +#include "tick-internal.h" + +/* + * Broadcast support for broken x86 hardware, where the local apic + * timer stops in C3 state. + */ + +struct tick_device tick_broadcast_device; +static cpumask_t tick_broadcast_mask; +static DEFINE_SPINLOCK(tick_broadcast_lock); + +/* + * Debugging: see timer_list.c + */ +struct tick_device *tick_get_broadcast_device(void) +{ + return &tick_broadcast_device; +} + +cpumask_t *tick_get_broadcast_mask(void) +{ + return &tick_broadcast_mask; +} + +/* + * Start the device in periodic mode + */ +static void tick_broadcast_start_periodic(struct clock_event_device *bc) +{ + if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN) + tick_setup_periodic(bc, 1); +} + +/* + * Check, if the device can be utilized as broadcast device: + */ +int tick_check_broadcast_device(struct clock_event_device *dev) +{ + if (tick_broadcast_device.evtdev || + (dev->features & CLOCK_EVT_FEAT_C3STOP)) + return 0; + + clockevents_exchange_device(NULL, dev); + tick_broadcast_device.evtdev = dev; + if (!cpus_empty(tick_broadcast_mask)) + tick_broadcast_start_periodic(dev); + return 1; +} + +/* + * Check, if the device is the broadcast device + */ +int tick_is_broadcast_device(struct clock_event_device *dev) +{ + return (dev && tick_broadcast_device.evtdev == dev); +} + +/* + * Check, if the device is disfunctional and a place holder, which + * needs to be handled by the broadcast device. + */ +int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + /* + * Devices might be registered with both periodic and oneshot + * mode disabled. This signals, that the device needs to be + * operated from the broadcast device and is a placeholder for + * the cpu local device. + */ + if (!tick_device_is_functional(dev)) { + dev->event_handler = tick_handle_periodic; + cpu_set(cpu, tick_broadcast_mask); + tick_broadcast_start_periodic(tick_broadcast_device.evtdev); + ret = 1; + } + + spin_unlock_irqrestore(&tick_broadcast_lock, flags); + return ret; +} + +/* + * Broadcast the event to the cpus, which are set in the mask + */ +int tick_do_broadcast(cpumask_t mask) +{ + int ret = 0, cpu = smp_processor_id(); + struct tick_device *td; + + /* + * Check, if the current cpu is in the mask + */ + if (cpu_isset(cpu, mask)) { + cpu_clear(cpu, mask); + td = &per_cpu(tick_cpu_device, cpu); + td->evtdev->event_handler(td->evtdev); + ret = 1; + } + + if (!cpus_empty(mask)) { + /* + * It might be necessary to actually check whether the devices + * have different broadcast functions. For now, just use the + * one of the first device. This works as long as we have this + * misfeature only on x86 (lapic) + */ + cpu = first_cpu(mask); + td = &per_cpu(tick_cpu_device, cpu); + td->evtdev->broadcast(mask); + ret = 1; + } + return ret; +} + +/* + * Periodic broadcast: + * - invoke the broadcast handlers + */ +static void tick_do_periodic_broadcast(void) +{ + cpumask_t mask; + + spin_lock(&tick_broadcast_lock); + + cpus_and(mask, cpu_online_map, tick_broadcast_mask); + tick_do_broadcast(mask); + + spin_unlock(&tick_broadcast_lock); +} + +/* + * Event handler for periodic broadcast ticks + */ +static void tick_handle_periodic_broadcast(struct clock_event_device *dev) +{ + dev->next_event.tv64 = KTIME_MAX; + + tick_do_periodic_broadcast(); + + /* + * The device is in periodic mode. No reprogramming necessary: + */ + if (dev->mode == CLOCK_EVT_MODE_PERIODIC) + return; + + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + for (;;) { + ktime_t next = ktime_add(dev->next_event, tick_period); + + if (!clockevents_program_event(dev, next, ktime_get())) + return; + tick_do_periodic_broadcast(); + } +} + +/* + * Powerstate information: The system enters/leaves a state, where + * affected devices might stop + */ +static void tick_do_broadcast_on_off(void *why) +{ + struct clock_event_device *bc, *dev; + struct tick_device *td; + unsigned long flags, *reason = why; + int cpu; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + cpu = smp_processor_id(); + td = &per_cpu(tick_cpu_device, cpu); + dev = td->evtdev; + bc = tick_broadcast_device.evtdev; + + /* + * Is the device in broadcast mode forever or is it not + * affected by the powerstate ? + */ + if (!dev || !tick_device_is_functional(dev) || + !(dev->features & CLOCK_EVT_FEAT_C3STOP)) + goto out; + + if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) { + if (!cpu_isset(cpu, tick_broadcast_mask)) { + cpu_set(cpu, tick_broadcast_mask); + if (td->mode == TICKDEV_MODE_PERIODIC) + clockevents_set_mode(dev, + CLOCK_EVT_MODE_SHUTDOWN); + } + } else { + if (cpu_isset(cpu, tick_broadcast_mask)) { + cpu_clear(cpu, tick_broadcast_mask); + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(dev, 0); + } + } + + if (cpus_empty(tick_broadcast_mask)) + clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + else { + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); + } +out: + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +/* + * Powerstate information: The system enters/leaves a state, where + * affected devices might stop. + */ +void tick_broadcast_on_off(unsigned long reason, int *oncpu) +{ + int cpu = get_cpu(); + + if (cpu == *oncpu) + tick_do_broadcast_on_off(&reason); + else + smp_call_function_single(*oncpu, tick_do_broadcast_on_off, + &reason, 1, 1); + put_cpu(); +} + +/* + * Set the periodic handler depending on broadcast on/off + */ +void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) +{ + if (!broadcast) + dev->event_handler = tick_handle_periodic; + else + dev->event_handler = tick_handle_periodic_broadcast; +} + +/* + * Remove a CPU from broadcasting + */ +void tick_shutdown_broadcast(unsigned int *cpup) +{ + struct clock_event_device *bc; + unsigned long flags; + unsigned int cpu = *cpup; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + bc = tick_broadcast_device.evtdev; + cpu_clear(cpu, tick_broadcast_mask); + + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { + if (bc && cpus_empty(tick_broadcast_mask)) + clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + } + + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +#ifdef CONFIG_TICK_ONESHOT + +static cpumask_t tick_broadcast_oneshot_mask; + +/* + * Debugging: see timer_list.c + */ +cpumask_t *tick_get_broadcast_oneshot_mask(void) +{ + return &tick_broadcast_oneshot_mask; +} + +static int tick_broadcast_set_event(ktime_t expires, int force) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + ktime_t now = ktime_get(); + int res; + + for(;;) { + res = clockevents_program_event(bc, expires, now); + if (!res || !force) + return res; + now = ktime_get(); + expires = ktime_add(now, ktime_set(0, bc->min_delta_ns)); + } +} + +/* + * Reprogram the broadcast device: + * + * Called with tick_broadcast_lock held and interrupts disabled. + */ +static int tick_broadcast_reprogram(void) +{ + ktime_t expires = { .tv64 = KTIME_MAX }; + struct tick_device *td; + int cpu; + + /* + * Find the event which expires next: + */ + for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; + cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev->next_event.tv64 < expires.tv64) + expires = td->evtdev->next_event; + } + + if (expires.tv64 == KTIME_MAX) + return 0; + + return tick_broadcast_set_event(expires, 0); +} + +/* + * Handle oneshot mode broadcasting + */ +static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) +{ + struct tick_device *td; + cpumask_t mask; + ktime_t now; + int cpu; + + spin_lock(&tick_broadcast_lock); +again: + dev->next_event.tv64 = KTIME_MAX; + mask = CPU_MASK_NONE; + now = ktime_get(); + /* Find all expired events */ + for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; + cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev->next_event.tv64 <= now.tv64) + cpu_set(cpu, mask); + } + + /* + * Wakeup the cpus which have an expired event. The broadcast + * device is reprogrammed in the return from idle code. + */ + if (!tick_do_broadcast(mask)) { + /* + * The global event did not expire any CPU local + * events. This happens in dyntick mode, as the + * maximum PIT delta is quite small. + */ + if (tick_broadcast_reprogram()) + goto again; + } + spin_unlock(&tick_broadcast_lock); +} + +/* + * Powerstate information: The system enters/leaves a state, where + * affected devices might stop + */ +void tick_broadcast_oneshot_control(unsigned long reason) +{ + struct clock_event_device *bc, *dev; + struct tick_device *td; + unsigned long flags; + int cpu; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + /* + * Periodic mode does not care about the enter/exit of power + * states + */ + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + goto out; + + bc = tick_broadcast_device.evtdev; + cpu = smp_processor_id(); + td = &per_cpu(tick_cpu_device, cpu); + dev = td->evtdev; + + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + goto out; + + if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { + if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + cpu_set(cpu, tick_broadcast_oneshot_mask); + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + if (dev->next_event.tv64 < bc->next_event.tv64) + tick_broadcast_set_event(dev->next_event, 1); + } + } else { + if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + cpu_clear(cpu, tick_broadcast_oneshot_mask); + clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + if (dev->next_event.tv64 != KTIME_MAX) + tick_program_event(dev->next_event, 1); + } + } + +out: + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +/** + * tick_broadcast_setup_highres - setup the broadcast device for highres + */ +void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { + bc->event_handler = tick_handle_oneshot_broadcast; + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + bc->next_event.tv64 = KTIME_MAX; + } +} + +/* + * Select oneshot operating mode for the broadcast device + */ +void tick_broadcast_switch_to_oneshot(void) +{ + struct clock_event_device *bc; + unsigned long flags; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; + bc = tick_broadcast_device.evtdev; + if (bc) + tick_broadcast_setup_oneshot(bc); + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + + +/* + * Remove a dead CPU from broadcasting + */ +void tick_shutdown_broadcast_oneshot(unsigned int *cpup) +{ + struct clock_event_device *bc; + unsigned long flags; + unsigned int cpu = *cpup; + + spin_lock_irqsave(&tick_broadcast_lock, flags); + + bc = tick_broadcast_device.evtdev; + cpu_clear(cpu, tick_broadcast_oneshot_mask); + + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) { + if (bc && cpus_empty(tick_broadcast_oneshot_mask)) + clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + } + + spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +#endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c new file mode 100644 index 0000000..0986a2b --- /dev/null +++ b/kernel/time/tick-common.c @@ -0,0 +1,347 @@ +/* + * linux/kernel/time/tick-common.c + * + * This file contains the base functions to manage periodic tick + * related events. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/irq.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/tick.h> + +#include "tick-internal.h" + +/* + * Tick devices + */ +DEFINE_PER_CPU(struct tick_device, tick_cpu_device); +/* + * Tick next event: keeps track of the tick time + */ +ktime_t tick_next_period; +ktime_t tick_period; +static int tick_do_timer_cpu = -1; +DEFINE_SPINLOCK(tick_device_lock); + +/* + * Debugging: see timer_list.c + */ +struct tick_device *tick_get_device(int cpu) +{ + return &per_cpu(tick_cpu_device, cpu); +} + +/** + * tick_is_oneshot_available - check for a oneshot capable event device + */ +int tick_is_oneshot_available(void) +{ + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + + return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); +} + +/* + * Periodic tick + */ +static void tick_periodic(int cpu) +{ + if (tick_do_timer_cpu == cpu) { + write_seqlock(&xtime_lock); + + /* Keep track of the next tick event */ + tick_next_period = ktime_add(tick_next_period, tick_period); + + do_timer(1); + write_sequnlock(&xtime_lock); + } + + update_process_times(user_mode(get_irq_regs())); + profile_tick(CPU_PROFILING); +} + +/* + * Event handler for periodic ticks + */ +void tick_handle_periodic(struct clock_event_device *dev) +{ + int cpu = smp_processor_id(); + ktime_t next; + + tick_periodic(cpu); + + if (dev->mode != CLOCK_EVT_MODE_ONESHOT) + return; + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + next = ktime_add(dev->next_event, tick_period); + for (;;) { + if (!clockevents_program_event(dev, next, ktime_get())) + return; + tick_periodic(cpu); + next = ktime_add(next, tick_period); + } +} + +/* + * Setup the device for a periodic tick + */ +void tick_setup_periodic(struct clock_event_device *dev, int broadcast) +{ + tick_set_periodic_handler(dev, broadcast); + + /* Broadcast setup ? */ + if (!tick_device_is_functional(dev)) + return; + + if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { + clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); + } else { + unsigned long seq; + ktime_t next; + + do { + seq = read_seqbegin(&xtime_lock); + next = tick_next_period; + } while (read_seqretry(&xtime_lock, seq)); + + clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + + for (;;) { + if (!clockevents_program_event(dev, next, ktime_get())) + return; + next = ktime_add(next, tick_period); + } + } +} + +/* + * Setup the tick device + */ +static void tick_setup_device(struct tick_device *td, + struct clock_event_device *newdev, int cpu, + cpumask_t cpumask) +{ + ktime_t next_event; + void (*handler)(struct clock_event_device *) = NULL; + + /* + * First device setup ? + */ + if (!td->evtdev) { + /* + * If no cpu took the do_timer update, assign it to + * this cpu: + */ + if (tick_do_timer_cpu == -1) { + tick_do_timer_cpu = cpu; + tick_next_period = ktime_get(); + tick_period = ktime_set(0, NSEC_PER_SEC / HZ); + } + + /* + * Startup in periodic mode first. + */ + td->mode = TICKDEV_MODE_PERIODIC; + } else { + handler = td->evtdev->event_handler; + next_event = td->evtdev->next_event; + } + + td->evtdev = newdev; + + /* + * When the device is not per cpu, pin the interrupt to the + * current cpu: + */ + if (!cpus_equal(newdev->cpumask, cpumask)) + irq_set_affinity(newdev->irq, cpumask); + + /* + * When global broadcasting is active, check if the current + * device is registered as a placeholder for broadcast mode. + * This allows us to handle this x86 misfeature in a generic + * way. + */ + if (tick_device_uses_broadcast(newdev, cpu)) + return; + + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(newdev, 0); + else + tick_setup_oneshot(newdev, handler, next_event); +} + +/* + * Check, if the new registered device should be used. + */ +static int tick_check_new_device(struct clock_event_device *newdev) +{ + struct clock_event_device *curdev; + struct tick_device *td; + int cpu, ret = NOTIFY_OK; + unsigned long flags; + cpumask_t cpumask; + + spin_lock_irqsave(&tick_device_lock, flags); + + cpu = smp_processor_id(); + if (!cpu_isset(cpu, newdev->cpumask)) + goto out; + + td = &per_cpu(tick_cpu_device, cpu); + curdev = td->evtdev; + cpumask = cpumask_of_cpu(cpu); + + /* cpu local device ? */ + if (!cpus_equal(newdev->cpumask, cpumask)) { + + /* + * If the cpu affinity of the device interrupt can not + * be set, ignore it. + */ + if (!irq_can_set_affinity(newdev->irq)) + goto out_bc; + + /* + * If we have a cpu local device already, do not replace it + * by a non cpu local device + */ + if (curdev && cpus_equal(curdev->cpumask, cpumask)) + goto out_bc; + } + + /* + * If we have an active device, then check the rating and the oneshot + * feature. + */ + if (curdev) { + /* + * Prefer one shot capable devices ! + */ + if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + goto out_bc; + /* + * Check the rating + */ + if (curdev->rating >= newdev->rating) + goto out_bc; + } + + /* + * Replace the eventually existing device by the new + * device. If the current device is the broadcast device, do + * not give it back to the clockevents layer ! + */ + if (tick_is_broadcast_device(curdev)) { + clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); + curdev = NULL; + } + clockevents_exchange_device(curdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); + + spin_unlock_irqrestore(&tick_device_lock, flags); + return NOTIFY_STOP; + +out_bc: + /* + * Can the new device be used as a broadcast device ? + */ + if (tick_check_broadcast_device(newdev)) + ret = NOTIFY_STOP; +out: + spin_unlock_irqrestore(&tick_device_lock, flags); + + return ret; +} + +/* + * Shutdown an event device on a given cpu: + * + * This is called on a life CPU, when a CPU is dead. So we cannot + * access the hardware device itself. + * We just set the mode and remove it from the lists. + */ +static void tick_shutdown(unsigned int *cpup) +{ + struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); + struct clock_event_device *dev = td->evtdev; + unsigned long flags; + + spin_lock_irqsave(&tick_device_lock, flags); + td->mode = TICKDEV_MODE_PERIODIC; + if (dev) { + /* + * Prevent that the clock events layer tries to call + * the set mode function! + */ + dev->mode = CLOCK_EVT_MODE_UNUSED; + clockevents_exchange_device(dev, NULL); + td->evtdev = NULL; + } + spin_unlock_irqrestore(&tick_device_lock, flags); +} + +/* + * Notification about clock event devices + */ +static int tick_notify(struct notifier_block *nb, unsigned long reason, + void *dev) +{ + switch (reason) { + + case CLOCK_EVT_NOTIFY_ADD: + return tick_check_new_device(dev); + + case CLOCK_EVT_NOTIFY_BROADCAST_ON: + case CLOCK_EVT_NOTIFY_BROADCAST_OFF: + tick_broadcast_on_off(reason, dev); + break; + + case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: + case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: + tick_broadcast_oneshot_control(reason); + break; + + case CLOCK_EVT_NOTIFY_CPU_DEAD: + tick_shutdown_broadcast_oneshot(dev); + tick_shutdown_broadcast(dev); + tick_shutdown(dev); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block tick_notifier = { + .notifier_call = tick_notify, +}; + +/** + * tick_init - initialize the tick control + * + * Register the notifier with the clockevents framework + */ +void __init tick_init(void) +{ + clockevents_register_notifier(&tick_notifier); +} diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h new file mode 100644 index 0000000..54861a0 --- /dev/null +++ b/kernel/time/tick-internal.h @@ -0,0 +1,110 @@ +/* + * tick internal variable and functions used by low/high res code + */ +DECLARE_PER_CPU(struct tick_device, tick_cpu_device); +extern spinlock_t tick_device_lock; +extern ktime_t tick_next_period; +extern ktime_t tick_period; + +extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); +extern void tick_handle_periodic(struct clock_event_device *dev); + +/* + * NO_HZ / high resolution timer shared code + */ +#ifdef CONFIG_TICK_ONESHOT +extern void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt); +extern int tick_program_event(ktime_t expires, int force); +extern void tick_oneshot_notify(void); +extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); + +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); +extern void tick_broadcast_oneshot_control(unsigned long reason); +extern void tick_broadcast_switch_to_oneshot(void); +extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); +# else /* BROADCAST */ +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + BUG(); +} +static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline void tick_broadcast_switch_to_oneshot(void) { } +static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +# endif /* !BROADCAST */ + +#else /* !ONESHOT */ +static inline +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt) +{ + BUG(); +} +static inline int tick_program_event(ktime_t expires, int force) +{ + return 0; +} +static inline void tick_oneshot_notify(void) { } +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + BUG(); +} +static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +#endif /* !TICK_ONESHOT */ + +/* + * Broadcasting support + */ +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int tick_do_broadcast(cpumask_t mask); + +extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); +extern int tick_check_broadcast_device(struct clock_event_device *dev); +extern int tick_is_broadcast_device(struct clock_event_device *dev); +extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); +extern void tick_shutdown_broadcast(unsigned int *cpup); + +extern void +tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); + +#else /* !BROADCAST */ + +static inline int tick_check_broadcast_device(struct clock_event_device *dev) +{ + return 0; +} + +static inline int tick_is_broadcast_device(struct clock_event_device *dev) +{ + return 0; +} +static inline int tick_device_uses_broadcast(struct clock_event_device *dev, + int cpu) +{ + return 0; +} +static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } +static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } +static inline void tick_shutdown_broadcast(unsigned int *cpup) { } + +/* + * Set the periodic handler in non broadcast mode + */ +static inline void tick_set_periodic_handler(struct clock_event_device *dev, + int broadcast) +{ + dev->event_handler = tick_handle_periodic; +} +#endif /* !BROADCAST */ + +/* + * Check, if the device is functional or a dummy for broadcast + */ +static inline int tick_device_is_functional(struct clock_event_device *dev) +{ + return !(dev->features & CLOCK_EVT_FEAT_DUMMY); +} diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c new file mode 100644 index 0000000..2e8b7ff --- /dev/null +++ b/kernel/time/tick-oneshot.c @@ -0,0 +1,84 @@ +/* + * linux/kernel/time/tick-oneshot.c + * + * This file contains functions which manage high resolution tick + * related events. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/irq.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/tick.h> + +#include "tick-internal.h" + +/** + * tick_program_event + */ +int tick_program_event(ktime_t expires, int force) +{ + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + ktime_t now = ktime_get(); + + while (1) { + int ret = clockevents_program_event(dev, expires, now); + + if (!ret || !force) + return ret; + now = ktime_get(); + expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); + } +} + +/** + * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) + */ +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t next_event) +{ + newdev->event_handler = handler; + clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); + clockevents_program_event(newdev, next_event, ktime_get()); +} + +/** + * tick_switch_to_oneshot - switch to oneshot mode + */ +int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) +{ + struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct clock_event_device *dev = td->evtdev; + + if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || + !tick_device_is_functional(dev)) + return -EINVAL; + + td->mode = TICKDEV_MODE_ONESHOT; + dev->event_handler = handler; + clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + tick_broadcast_switch_to_oneshot(); + return 0; +} + +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * tick_init_highres - switch to high resolution mode + * + * Called with interrupts disabled. + */ +int tick_init_highres(void) +{ + return tick_switch_to_oneshot(hrtimer_interrupt); +} +#endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c new file mode 100644 index 0000000..51556b9 --- /dev/null +++ b/kernel/time/tick-sched.c @@ -0,0 +1,567 @@ +/* + * linux/kernel/time/tick-sched.c + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner + * + * No idle tick implementation for low and high resolution timers + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/tick.h> + +#include <asm/irq_regs.h> + +#include "tick-internal.h" + +/* + * Per cpu nohz control structure + */ +static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); + +/* + * The time, when the last jiffy update happened. Protected by xtime_lock. + */ +static ktime_t last_jiffies_update; + +struct tick_sched *tick_get_tick_sched(int cpu) +{ + return &per_cpu(tick_cpu_sched, cpu); +} + +/* + * Must be called with interrupts disabled ! + */ +static void tick_do_update_jiffies64(ktime_t now) +{ + unsigned long ticks = 0; + ktime_t delta; + + /* Reevalute with xtime_lock held */ + write_seqlock(&xtime_lock); + + delta = ktime_sub(now, last_jiffies_update); + if (delta.tv64 >= tick_period.tv64) { + + delta = ktime_sub(delta, tick_period); + last_jiffies_update = ktime_add(last_jiffies_update, + tick_period); + + /* Slow path for long timeouts */ + if (unlikely(delta.tv64 >= tick_period.tv64)) { + s64 incr = ktime_to_ns(tick_period); + + ticks = ktime_divns(delta, incr); + + last_jiffies_update = ktime_add_ns(last_jiffies_update, + incr * ticks); + } + do_timer(++ticks); + } + write_sequnlock(&xtime_lock); +} + +/* + * Initialize and return retrieve the jiffies update. + */ +static ktime_t tick_init_jiffy_update(void) +{ + ktime_t period; + + write_seqlock(&xtime_lock); + /* Did we start the jiffies update yet ? */ + if (last_jiffies_update.tv64 == 0) + last_jiffies_update = tick_next_period; + period = last_jiffies_update; + write_sequnlock(&xtime_lock); + return period; +} + +/* + * NOHZ - aka dynamic tick functionality + */ +#ifdef CONFIG_NO_HZ +/* + * NO HZ enabled ? + */ +static int tick_nohz_enabled __read_mostly = 1; + +/* + * Enable / Disable tickless mode + */ +static int __init setup_tick_nohz(char *str) +{ + if (!strcmp(str, "off")) + tick_nohz_enabled = 0; + else if (!strcmp(str, "on")) + tick_nohz_enabled = 1; + else + return 0; + return 1; +} + +__setup("nohz=", setup_tick_nohz); + +/** + * tick_nohz_update_jiffies - update jiffies when idle was interrupted + * + * Called from interrupt entry when the CPU was idle + * + * In case the sched_tick was stopped on this CPU, we have to check if jiffies + * must be updated. Otherwise an interrupt handler could use a stale jiffy + * value. We do this unconditionally on any cpu, as we don't know whether the + * cpu, which has the update task assigned is in a long sleep. + */ +void tick_nohz_update_jiffies(void) +{ + int cpu = smp_processor_id(); + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + unsigned long flags; + ktime_t now; + + if (!ts->tick_stopped) + return; + + cpu_clear(cpu, nohz_cpu_mask); + now = ktime_get(); + + local_irq_save(flags); + tick_do_update_jiffies64(now); + local_irq_restore(flags); +} + +/** + * tick_nohz_stop_sched_tick - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick + * Called either from the idle loop or from irq_exit() when an idle period was + * just interrupted by an interrupt which did not cause a reschedule. + */ +void tick_nohz_stop_sched_tick(void) +{ + unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; + struct tick_sched *ts; + ktime_t last_update, expires, now, delta; + int cpu; + + local_irq_save(flags); + + cpu = smp_processor_id(); + ts = &per_cpu(tick_cpu_sched, cpu); + + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + goto end; + + if (need_resched()) + goto end; + + cpu = smp_processor_id(); + if (unlikely(local_softirq_pending())) + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + local_softirq_pending()); + + now = ktime_get(); + /* + * When called from irq_exit we need to account the idle sleep time + * correctly. + */ + if (ts->tick_stopped) { + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + } + + ts->idle_entrytime = now; + ts->idle_calls++; + + /* Read jiffies and the time when jiffies were updated last */ + do { + seq = read_seqbegin(&xtime_lock); + last_update = last_jiffies_update; + last_jiffies = jiffies; + } while (read_seqretry(&xtime_lock, seq)); + + /* Get the next timer wheel timer */ + next_jiffies = get_next_timer_interrupt(last_jiffies); + delta_jiffies = next_jiffies - last_jiffies; + + if (rcu_needs_cpu(cpu)) + delta_jiffies = 1; + /* + * Do not stop the tick, if we are only one off + * or if the cpu is required for rcu + */ + if (!ts->tick_stopped && delta_jiffies == 1) + goto out; + + /* Schedule the tick, if we are at least one jiffie off */ + if ((long)delta_jiffies >= 1) { + + if (delta_jiffies > 1) + cpu_set(cpu, nohz_cpu_mask); + /* + * nohz_stop_sched_tick can be called several times before + * the nohz_restart_sched_tick is called. This happens when + * interrupts arrive which do not cause a reschedule. In the + * first call we save the current tick time, so we can restart + * the scheduler tick in nohz_restart_sched_tick. + */ + if (!ts->tick_stopped) { + ts->idle_tick = ts->sched_timer.expires; + ts->tick_stopped = 1; + ts->idle_jiffies = last_jiffies; + } + /* + * calculate the expiry time for the next timer wheel + * timer + */ + expires = ktime_add_ns(last_update, tick_period.tv64 * + delta_jiffies); + ts->idle_expires = expires; + ts->idle_sleeps++; + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + goto out; + } else if(!tick_program_event(expires, 0)) + goto out; + /* + * We are past the event already. So we crossed a + * jiffie boundary. Update jiffies and raise the + * softirq. + */ + tick_do_update_jiffies64(ktime_get()); + cpu_clear(cpu, nohz_cpu_mask); + } + raise_softirq_irqoff(TIMER_SOFTIRQ); +out: + ts->next_jiffies = next_jiffies; + ts->last_jiffies = last_jiffies; +end: + local_irq_restore(flags); +} + +/** + * nohz_restart_sched_tick - restart the idle tick from the idle task + * + * Restart the idle tick when the CPU is woken up from idle + */ +void tick_nohz_restart_sched_tick(void) +{ + int cpu = smp_processor_id(); + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + unsigned long ticks; + ktime_t now, delta; + + if (!ts->tick_stopped) + return; + + /* Update jiffies first */ + now = ktime_get(); + + local_irq_disable(); + tick_do_update_jiffies64(now); + cpu_clear(cpu, nohz_cpu_mask); + + /* Account the idle time */ + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + + /* + * We stopped the tick in idle. Update process times would miss the + * time we slept as update_process_times does only a 1 tick + * accounting. Enforce that this is accounted to idle ! + */ + ticks = jiffies - ts->idle_jiffies; + /* + * We might be one off. Do not randomly account a huge number of ticks! + */ + if (ticks && ticks < LONG_MAX) { + add_preempt_count(HARDIRQ_OFFSET); + account_system_time(current, HARDIRQ_OFFSET, + jiffies_to_cputime(ticks)); + sub_preempt_count(HARDIRQ_OFFSET); + } + + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + hrtimer_cancel(&ts->sched_timer); + ts->sched_timer.expires = ts->idle_tick; + + while (1) { + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, + ts->sched_timer.expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + } else { + if (!tick_program_event(ts->sched_timer.expires, 0)) + break; + } + /* Update jiffies and reread time */ + tick_do_update_jiffies64(now); + now = ktime_get(); + } + local_irq_enable(); +} + +static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) +{ + hrtimer_forward(&ts->sched_timer, now, tick_period); + return tick_program_event(ts->sched_timer.expires, 0); +} + +/* + * The nohz low res interrupt handler + */ +static void tick_nohz_handler(struct clock_event_device *dev) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct pt_regs *regs = get_irq_regs(); + ktime_t now = ktime_get(); + + dev->next_event.tv64 = KTIME_MAX; + + /* Check, if the jiffies need an update */ + tick_do_update_jiffies64(now); + + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on complete idle SMP systems while + * waiting on the login prompt. We also increment the "start + * of idle" jiffy stamp so the idle accounting adjustment we + * do when we go busy again does not account too much ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog(); + ts->idle_jiffies++; + } + + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); + + /* Do not restart, when we are in the idle loop */ + if (ts->tick_stopped) + return; + + while (tick_nohz_reprogram(ts, now)) { + now = ktime_get(); + tick_do_update_jiffies64(now); + } +} + +/** + * tick_nohz_switch_to_nohz - switch to nohz mode + */ +static void tick_nohz_switch_to_nohz(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + ktime_t next; + + if (!tick_nohz_enabled) + return; + + local_irq_disable(); + if (tick_switch_to_oneshot(tick_nohz_handler)) { + local_irq_enable(); + return; + } + + ts->nohz_mode = NOHZ_MODE_LOWRES; + + /* + * Recycle the hrtimer in ts, so we can share the + * hrtimer_forward with the highres code. + */ + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + /* Get the next period */ + next = tick_init_jiffy_update(); + + for (;;) { + ts->sched_timer.expires = next; + if (!tick_program_event(next, 0)) + break; + next = ktime_add(next, tick_period); + } + local_irq_enable(); + + printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", + smp_processor_id()); +} + +#else + +static inline void tick_nohz_switch_to_nohz(void) { } + +#endif /* NO_HZ */ + +/* + * High resolution timer specific code + */ +#ifdef CONFIG_HIGH_RES_TIMERS +/* + * We rearm the timer until we get disabled by the idle code + * Called with interrupts disabled and timer->base->cpu_base->lock held. + */ +static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) +{ + struct tick_sched *ts = + container_of(timer, struct tick_sched, sched_timer); + struct hrtimer_cpu_base *base = timer->base->cpu_base; + struct pt_regs *regs = get_irq_regs(); + ktime_t now = ktime_get(); + + /* Check, if the jiffies need an update */ + tick_do_update_jiffies64(now); + + /* + * Do not call, when we are not in irq context and have + * no valid regs pointer + */ + if (regs) { + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on complete idle SMP systems while + * waiting on the login prompt. We also increment the "start of + * idle" jiffy stamp so the idle accounting adjustment we do + * when we go busy again does not account too much ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog(); + ts->idle_jiffies++; + } + /* + * update_process_times() might take tasklist_lock, hence + * drop the base lock. sched-tick hrtimers are per-CPU and + * never accessible by userspace APIs, so this is safe to do. + */ + spin_unlock(&base->lock); + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); + spin_lock(&base->lock); + } + + /* Do not restart, when we are in the idle loop */ + if (ts->tick_stopped) + return HRTIMER_NORESTART; + + hrtimer_forward(timer, now, tick_period); + + return HRTIMER_RESTART; +} + +/** + * tick_setup_sched_timer - setup the tick emulation timer + */ +void tick_setup_sched_timer(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + ktime_t now = ktime_get(); + + /* + * Emulate tick processing via per-CPU hrtimers: + */ + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + ts->sched_timer.function = tick_sched_timer; + ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + + /* Get the next period */ + ts->sched_timer.expires = tick_init_jiffy_update(); + + for (;;) { + hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_start(&ts->sched_timer, ts->sched_timer.expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + now = ktime_get(); + } + +#ifdef CONFIG_NO_HZ + if (tick_nohz_enabled) + ts->nohz_mode = NOHZ_MODE_HIGHRES; +#endif +} + +void tick_cancel_sched_timer(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (ts->sched_timer.base) + hrtimer_cancel(&ts->sched_timer); + ts->tick_stopped = 0; + ts->nohz_mode = NOHZ_MODE_INACTIVE; +} +#endif /* HIGH_RES_TIMERS */ + +/** + * Async notification about clocksource changes + */ +void tick_clock_notify(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); +} + +/* + * Async notification about clock event changes + */ +void tick_oneshot_notify(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + set_bit(0, &ts->check_clocks); +} + +/** + * Check, if a change happened, which makes oneshot possible. + * + * Called cyclic from the hrtimer softirq (driven by the timer + * softirq) allow_nohz signals, that we can switch into low-res nohz + * mode, because high resolution timers are disabled (either compile + * or runtime). + */ +int tick_check_oneshot_change(int allow_nohz) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + if (!test_and_clear_bit(0, &ts->check_clocks)) + return 0; + + if (ts->nohz_mode != NOHZ_MODE_INACTIVE) + return 0; + + if (!timekeeping_is_continuous() || !tick_is_oneshot_available()) + return 0; + + if (!allow_nohz) + return 1; + + tick_nohz_switch_to_nohz(); + return 0; +} diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c new file mode 100644 index 0000000..f82c635 --- /dev/null +++ b/kernel/time/timer_list.c @@ -0,0 +1,287 @@ +/* + * kernel/time/timer_list.c + * + * List pending timers + * + * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/tick.h> + +#include <asm/uaccess.h> + +typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); + +DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); + +/* + * This allows printing both to /proc/timer_list and + * to the console (on SysRq-Q): + */ +#define SEQ_printf(m, x...) \ + do { \ + if (m) \ + seq_printf(m, x); \ + else \ + printk(x); \ + } while (0) + +static void print_name_offset(struct seq_file *m, void *sym) +{ + unsigned long addr = (unsigned long)sym; + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); + if (sym_name) + SEQ_printf(m, "%s", sym_name); + else + SEQ_printf(m, "<%p>", sym); +} + +static void +print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) +{ +#ifdef CONFIG_TIMER_STATS + char tmp[TASK_COMM_LEN + 1]; +#endif + SEQ_printf(m, " #%d: ", idx); + print_name_offset(m, timer); + SEQ_printf(m, ", "); + print_name_offset(m, timer->function); + SEQ_printf(m, ", S:%02lx", timer->state); +#ifdef CONFIG_TIMER_STATS + SEQ_printf(m, ", "); + print_name_offset(m, timer->start_site); + memcpy(tmp, timer->start_comm, TASK_COMM_LEN); + tmp[TASK_COMM_LEN] = 0; + SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); +#endif + SEQ_printf(m, "\n"); + SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n", + (unsigned long long)ktime_to_ns(timer->expires), + (unsigned long long)(ktime_to_ns(timer->expires) - now)); +} + +static void +print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, + u64 now) +{ + struct hrtimer *timer, tmp; + unsigned long next = 0, i; + struct rb_node *curr; + unsigned long flags; + +next_one: + i = 0; + spin_lock_irqsave(&base->cpu_base->lock, flags); + + curr = base->first; + /* + * Crude but we have to do this O(N*N) thing, because + * we have to unlock the base when printing: + */ + while (curr && i < next) { + curr = rb_next(curr); + i++; + } + + if (curr) { + + timer = rb_entry(curr, struct hrtimer, node); + tmp = *timer; + spin_unlock_irqrestore(&base->cpu_base->lock, flags); + + print_timer(m, &tmp, i, now); + next++; + goto next_one; + } + spin_unlock_irqrestore(&base->cpu_base->lock, flags); +} + +static void +print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) +{ + SEQ_printf(m, " .index: %d\n", + base->index); + SEQ_printf(m, " .resolution: %Ld nsecs\n", + (unsigned long long)ktime_to_ns(base->resolution)); + SEQ_printf(m, " .get_time: "); + print_name_offset(m, base->get_time); + SEQ_printf(m, "\n"); +#ifdef CONFIG_HIGH_RES_TIMERS + SEQ_printf(m, " .offset: %Ld nsecs\n", + ktime_to_ns(base->offset)); +#endif + SEQ_printf(m, "active timers:\n"); + print_active_timers(m, base, now); +} + +static void print_cpu(struct seq_file *m, int cpu, u64 now) +{ + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + int i; + + SEQ_printf(m, "\ncpu: %d\n", cpu); + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + SEQ_printf(m, " clock %d:\n", i); + print_base(m, cpu_base->clock_base + i, now); + } +#define P(x) \ + SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x)) +#define P_ns(x) \ + SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ + (u64)(ktime_to_ns(cpu_base->x))) + +#ifdef CONFIG_HIGH_RES_TIMERS + P_ns(expires_next); + P(hres_active); + P(nr_events); +#endif +#undef P +#undef P_ns + +#ifdef CONFIG_TICK_ONESHOT +# define P(x) \ + SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x)) +# define P_ns(x) \ + SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ + (u64)(ktime_to_ns(ts->x))) + { + struct tick_sched *ts = tick_get_tick_sched(cpu); + P(nohz_mode); + P_ns(idle_tick); + P(tick_stopped); + P(idle_jiffies); + P(idle_calls); + P(idle_sleeps); + P_ns(idle_entrytime); + P_ns(idle_sleeptime); + P(last_jiffies); + P(next_jiffies); + P_ns(idle_expires); + SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies); + } +#endif + +#undef P +#undef P_ns +} + +#ifdef CONFIG_GENERIC_CLOCKEVENTS +static void +print_tickdevice(struct seq_file *m, struct tick_device *td) +{ + struct clock_event_device *dev = td->evtdev; + + SEQ_printf(m, "\nTick Device: mode: %d\n", td->mode); + + SEQ_printf(m, "Clock Event Device: "); + if (!dev) { + SEQ_printf(m, "<NULL>\n"); + return; + } + SEQ_printf(m, "%s\n", dev->name); + SEQ_printf(m, " max_delta_ns: %ld\n", dev->max_delta_ns); + SEQ_printf(m, " min_delta_ns: %ld\n", dev->min_delta_ns); + SEQ_printf(m, " mult: %ld\n", dev->mult); + SEQ_printf(m, " shift: %d\n", dev->shift); + SEQ_printf(m, " mode: %d\n", dev->mode); + SEQ_printf(m, " next_event: %Ld nsecs\n", + (unsigned long long) ktime_to_ns(dev->next_event)); + + SEQ_printf(m, " set_next_event: "); + print_name_offset(m, dev->set_next_event); + SEQ_printf(m, "\n"); + + SEQ_printf(m, " set_mode: "); + print_name_offset(m, dev->set_mode); + SEQ_printf(m, "\n"); + + SEQ_printf(m, " event_handler: "); + print_name_offset(m, dev->event_handler); + SEQ_printf(m, "\n"); +} + +static void timer_list_show_tickdevices(struct seq_file *m) +{ + int cpu; + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST + print_tickdevice(m, tick_get_broadcast_device()); + SEQ_printf(m, "tick_broadcast_mask: %08lx\n", + tick_get_broadcast_mask()->bits[0]); +#ifdef CONFIG_TICK_ONESHOT + SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", + tick_get_broadcast_oneshot_mask()->bits[0]); +#endif + SEQ_printf(m, "\n"); +#endif + for_each_online_cpu(cpu) + print_tickdevice(m, tick_get_device(cpu)); + SEQ_printf(m, "\n"); +} +#else +static void timer_list_show_tickdevices(struct seq_file *m) { } +#endif + +static int timer_list_show(struct seq_file *m, void *v) +{ + u64 now = ktime_to_ns(ktime_get()); + int cpu; + + SEQ_printf(m, "Timer List Version: v0.3\n"); + SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); + SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + + for_each_online_cpu(cpu) + print_cpu(m, cpu, now); + + SEQ_printf(m, "\n"); + timer_list_show_tickdevices(m); + + return 0; +} + +void sysrq_timer_list_show(void) +{ + timer_list_show(NULL, NULL); +} + +static int timer_list_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timer_list_show, NULL); +} + +static struct file_operations timer_list_fops = { + .open = timer_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init init_timer_list_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = create_proc_entry("timer_list", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->proc_fops = &timer_list_fops; + + return 0; +} +__initcall(init_timer_list_procfs); diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c new file mode 100644 index 0000000..1bc4882 --- /dev/null +++ b/kernel/time/timer_stats.c @@ -0,0 +1,411 @@ +/* + * kernel/time/timer_stats.c + * + * Collect timer usage statistics. + * + * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * timer_stats is based on timer_top, a similar functionality which was part of + * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the + * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based + * on dynamic allocation of the statistics entries and linear search based + * lookup combined with a global lock, rather than the static array, hash + * and per-CPU locking which is used by timer_stats. It was written for the + * pre hrtimer kernel code and therefore did not take hrtimers into account. + * Nevertheless it provided the base for the timer_stats implementation and + * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks + * for this effort. + * + * timer_top.c is + * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus + * Written by Daniel Petrini <d.pensator@gmail.com> + * timer_top.c was released under the GNU General Public License version 2 + * + * We export the addresses and counting of timer functions being called, + * the pid and cmdline from the owner process if applicable. + * + * Start/stop data collection: + * # echo 1[0] >/proc/timer_stats + * + * Display the information collected so far: + * # cat /proc/timer_stats + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> + +#include <asm/uaccess.h> + +/* + * This is our basic unit of interest: a timer expiry event identified + * by the timer, its start/expire functions and the PID of the task that + * started the timer. We count the number of times an event happens: + */ +struct entry { + /* + * Hash list: + */ + struct entry *next; + + /* + * Hash keys: + */ + void *timer; + void *start_func; + void *expire_func; + pid_t pid; + + /* + * Number of timeout events: + */ + unsigned long count; + + /* + * We save the command-line string to preserve + * this information past task exit: + */ + char comm[TASK_COMM_LEN + 1]; + +} ____cacheline_aligned_in_smp; + +/* + * Spinlock protecting the tables - not taken during lookup: + */ +static DEFINE_SPINLOCK(table_lock); + +/* + * Per-CPU lookup locks for fast hash lookup: + */ +static DEFINE_PER_CPU(spinlock_t, lookup_lock); + +/* + * Mutex to serialize state changes with show-stats activities: + */ +static DEFINE_MUTEX(show_mutex); + +/* + * Collection status, active/inactive: + */ +static int __read_mostly active; + +/* + * Beginning/end timestamps of measurement: + */ +static ktime_t time_start, time_stop; + +/* + * tstat entry structs only get allocated while collection is + * active and never freed during that time - this simplifies + * things quite a bit. + * + * They get freed when a new collection period is started. + */ +#define MAX_ENTRIES_BITS 10 +#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS) + +static unsigned long nr_entries; +static struct entry entries[MAX_ENTRIES]; + +static atomic_t overflow_count; + +static void reset_entries(void) +{ + nr_entries = 0; + memset(entries, 0, sizeof(entries)); + atomic_set(&overflow_count, 0); +} + +static struct entry *alloc_entry(void) +{ + if (nr_entries >= MAX_ENTRIES) + return NULL; + + return entries + nr_entries++; +} + +/* + * The entries are in a hash-table, for fast lookup: + */ +#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1) +#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS) +#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1) + +#define __tstat_hashfn(entry) \ + (((unsigned long)(entry)->timer ^ \ + (unsigned long)(entry)->start_func ^ \ + (unsigned long)(entry)->expire_func ^ \ + (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK) + +#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry)) + +static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; + +static int match_entries(struct entry *entry1, struct entry *entry2) +{ + return entry1->timer == entry2->timer && + entry1->start_func == entry2->start_func && + entry1->expire_func == entry2->expire_func && + entry1->pid == entry2->pid; +} + +/* + * Look up whether an entry matching this item is present + * in the hash already. Must be called with irqs off and the + * lookup lock held: + */ +static struct entry *tstat_lookup(struct entry *entry, char *comm) +{ + struct entry **head, *curr, *prev; + + head = tstat_hashentry(entry); + curr = *head; + + /* + * The fastpath is when the entry is already hashed, + * we do this with the lookup lock held, but with the + * table lock not held: + */ + while (curr) { + if (match_entries(curr, entry)) + return curr; + + curr = curr->next; + } + /* + * Slowpath: allocate, set up and link a new hash entry: + */ + prev = NULL; + curr = *head; + + spin_lock(&table_lock); + /* + * Make sure we have not raced with another CPU: + */ + while (curr) { + if (match_entries(curr, entry)) + goto out_unlock; + + prev = curr; + curr = curr->next; + } + + curr = alloc_entry(); + if (curr) { + *curr = *entry; + curr->count = 0; + memcpy(curr->comm, comm, TASK_COMM_LEN); + if (prev) + prev->next = curr; + else + *head = curr; + curr->next = NULL; + } + out_unlock: + spin_unlock(&table_lock); + + return curr; +} + +/** + * timer_stats_update_stats - Update the statistics for a timer. + * @timer: pointer to either a timer_list or a hrtimer + * @pid: the pid of the task which set up the timer + * @startf: pointer to the function which did the timer setup + * @timerf: pointer to the timer callback function of the timer + * @comm: name of the process which set up the timer + * + * When the timer is already registered, then the event counter is + * incremented. Otherwise the timer is registered in a free slot. + */ +void timer_stats_update_stats(void *timer, pid_t pid, void *startf, + void *timerf, char * comm) +{ + /* + * It doesnt matter which lock we take: + */ + spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id()); + struct entry *entry, input; + unsigned long flags; + + input.timer = timer; + input.start_func = startf; + input.expire_func = timerf; + input.pid = pid; + + spin_lock_irqsave(lock, flags); + if (!active) + goto out_unlock; + + entry = tstat_lookup(&input, comm); + if (likely(entry)) + entry->count++; + else + atomic_inc(&overflow_count); + + out_unlock: + spin_unlock_irqrestore(lock, flags); +} + +static void print_name_offset(struct seq_file *m, unsigned long addr) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); + if (sym_name) + seq_printf(m, "%s", sym_name); + else + seq_printf(m, "<%p>", (void *)addr); +} + +static int tstats_show(struct seq_file *m, void *v) +{ + struct timespec period; + struct entry *entry; + unsigned long ms; + long events = 0; + ktime_t time; + int i; + + mutex_lock(&show_mutex); + /* + * If still active then calculate up to now: + */ + if (active) + time_stop = ktime_get(); + + time = ktime_sub(time_stop, time_start); + + period = ktime_to_timespec(time); + ms = period.tv_nsec / 1000000; + + seq_puts(m, "Timer Stats Version: v0.1\n"); + seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); + if (atomic_read(&overflow_count)) + seq_printf(m, "Overflow: %d entries\n", + atomic_read(&overflow_count)); + + for (i = 0; i < nr_entries; i++) { + entry = entries + i; + seq_printf(m, "%4lu, %5d %-16s ", + entry->count, entry->pid, entry->comm); + + print_name_offset(m, (unsigned long)entry->start_func); + seq_puts(m, " ("); + print_name_offset(m, (unsigned long)entry->expire_func); + seq_puts(m, ")\n"); + + events += entry->count; + } + + ms += period.tv_sec * 1000; + if (!ms) + ms = 1; + + if (events && period.tv_sec) + seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events, + events / period.tv_sec, events * 1000 / ms); + else + seq_printf(m, "%ld total events\n", events); + + mutex_unlock(&show_mutex); + + return 0; +} + +/* + * After a state change, make sure all concurrent lookup/update + * activities have stopped: + */ +static void sync_access(void) +{ + unsigned long flags; + int cpu; + + for_each_online_cpu(cpu) { + spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); + /* nothing */ + spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); + } +} + +static ssize_t tstats_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + char ctl[2]; + + if (count != 2 || *offs) + return -EINVAL; + + if (copy_from_user(ctl, buf, count)) + return -EFAULT; + + mutex_lock(&show_mutex); + switch (ctl[0]) { + case '0': + if (active) { + active = 0; + time_stop = ktime_get(); + sync_access(); + } + break; + case '1': + if (!active) { + reset_entries(); + time_start = ktime_get(); + active = 1; + } + break; + default: + count = -EINVAL; + } + mutex_unlock(&show_mutex); + + return count; +} + +static int tstats_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, tstats_show, NULL); +} + +static struct file_operations tstats_fops = { + .open = tstats_open, + .read = seq_read, + .write = tstats_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +void __init init_timer_stats(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + spin_lock_init(&per_cpu(lookup_lock, cpu)); +} + +static int __init init_tstats_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = create_proc_entry("timer_stats", 0644, NULL); + if (!pe) + return -ENOMEM; + + pe->proc_fops = &tstats_fops; + + return 0; +} +__initcall(init_tstats_procfs); diff --git a/kernel/timer.c b/kernel/timer.c index c2a8ccf..cb1b86a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -34,6 +34,8 @@ #include <linux/cpu.h> #include <linux/syscalls.h> #include <linux/delay.h> +#include <linux/tick.h> +#include <linux/kallsyms.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -85,7 +87,7 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; * @j: the time in (absolute) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * - * __round_jiffies rounds an absolute time in the future (in jiffies) + * __round_jiffies() rounds an absolute time in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. @@ -98,7 +100,7 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; * processors firing at the exact same time, which could lead * to lock contention or spurious cache line bouncing. * - * The return value is the rounded version of the "j" parameter. + * The return value is the rounded version of the @j parameter. */ unsigned long __round_jiffies(unsigned long j, int cpu) { @@ -142,7 +144,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies); * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * - * __round_jiffies_relative rounds a time delta in the future (in jiffies) + * __round_jiffies_relative() rounds a time delta in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. @@ -155,7 +157,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies); * processors firing at the exact same time, which could lead * to lock contention or spurious cache line bouncing. * - * The return value is the rounded version of the "j" parameter. + * The return value is the rounded version of the @j parameter. */ unsigned long __round_jiffies_relative(unsigned long j, int cpu) { @@ -173,7 +175,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative); * round_jiffies - function to round jiffies to a full second * @j: the time in (absolute) jiffies that should be rounded * - * round_jiffies rounds an absolute time in the future (in jiffies) + * round_jiffies() rounds an absolute time in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. @@ -182,7 +184,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative); * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * - * The return value is the rounded version of the "j" parameter. + * The return value is the rounded version of the @j parameter. */ unsigned long round_jiffies(unsigned long j) { @@ -194,7 +196,7 @@ EXPORT_SYMBOL_GPL(round_jiffies); * round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * - * round_jiffies_relative rounds a time delta in the future (in jiffies) + * round_jiffies_relative() rounds a time delta in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. @@ -203,7 +205,7 @@ EXPORT_SYMBOL_GPL(round_jiffies); * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * - * The return value is the rounded version of the "j" parameter. + * The return value is the rounded version of the @j parameter. */ unsigned long round_jiffies_relative(unsigned long j) { @@ -262,6 +264,18 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) list_add_tail(&timer->entry, vec); } +#ifdef CONFIG_TIMER_STATS +void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) +{ + if (timer->start_site) + return; + + timer->start_site = addr; + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +} +#endif + /** * init_timer - initialize a timer. * @timer: the timer to be initialized @@ -273,11 +287,16 @@ void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; timer->base = __raw_get_cpu_var(tvec_bases); +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif } EXPORT_SYMBOL(init_timer); static inline void detach_timer(struct timer_list *timer, - int clear_pending) + int clear_pending) { struct list_head *entry = &timer->entry; @@ -324,6 +343,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) unsigned long flags; int ret = 0; + timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); base = lock_timer_base(timer, &flags); @@ -374,6 +394,7 @@ void add_timer_on(struct timer_list *timer, int cpu) tvec_base_t *base = per_cpu(tvec_bases, cpu); unsigned long flags; + timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); timer->base = base; @@ -387,7 +408,7 @@ void add_timer_on(struct timer_list *timer, int cpu) * @timer: the timer to be modified * @expires: new timeout in jiffies * - * mod_timer is a more efficient way to update the expire field of an + * mod_timer() is a more efficient way to update the expire field of an * active timer (if the timer is inactive it will be activated) * * mod_timer(timer, expires) is equivalent to: @@ -406,6 +427,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) { BUG_ON(!timer->function); + timer_stats_timer_set_start_info(timer); /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -436,6 +458,7 @@ int del_timer(struct timer_list *timer) unsigned long flags; int ret = 0; + timer_stats_timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { @@ -490,7 +513,7 @@ out: * the timer it also makes sure the handler has finished executing on other * CPUs. * - * Synchronization rules: callers must prevent restarting of the timer, + * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from * interrupt contexts. The caller must not hold locks which would prevent * completion of the timer's handler. The timer's handler must not call @@ -569,6 +592,8 @@ static inline void __run_timers(tvec_base_t *base) fn = timer->function; data = timer->data; + timer_stats_account_timer(timer); + set_running_timer(base, timer); detach_timer(timer, 1); spin_unlock_irq(&base->lock); @@ -591,105 +616,124 @@ static inline void __run_timers(tvec_base_t *base) spin_unlock_irq(&base->lock); } -#ifdef CONFIG_NO_IDLE_HZ +#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) /* * Find out when the next timer event is due to happen. This * is used on S/390 to stop all activity when a cpus is idle. * This functions needs to be called disabled. */ -unsigned long next_timer_interrupt(void) +static unsigned long __next_timer_interrupt(tvec_base_t *base) { - tvec_base_t *base; - struct list_head *list; + unsigned long timer_jiffies = base->timer_jiffies; + unsigned long expires = timer_jiffies + (LONG_MAX >> 1); + int index, slot, array, found = 0; struct timer_list *nte; - unsigned long expires; - unsigned long hr_expires = MAX_JIFFY_OFFSET; - ktime_t hr_delta; tvec_t *varray[4]; - int i, j; - - hr_delta = hrtimer_get_next_event(); - if (hr_delta.tv64 != KTIME_MAX) { - struct timespec tsdelta; - tsdelta = ktime_to_timespec(hr_delta); - hr_expires = timespec_to_jiffies(&tsdelta); - if (hr_expires < 3) - return hr_expires + jiffies; - } - hr_expires += jiffies; - - base = __get_cpu_var(tvec_bases); - spin_lock(&base->lock); - expires = base->timer_jiffies + (LONG_MAX >> 1); - list = NULL; /* Look for timer events in tv1. */ - j = base->timer_jiffies & TVR_MASK; + index = slot = timer_jiffies & TVR_MASK; do { - list_for_each_entry(nte, base->tv1.vec + j, entry) { + list_for_each_entry(nte, base->tv1.vec + slot, entry) { + found = 1; expires = nte->expires; - if (j < (base->timer_jiffies & TVR_MASK)) - list = base->tv2.vec + (INDEX(0)); - goto found; + /* Look at the cascade bucket(s)? */ + if (!index || slot < index) + goto cascade; + return expires; } - j = (j + 1) & TVR_MASK; - } while (j != (base->timer_jiffies & TVR_MASK)); + slot = (slot + 1) & TVR_MASK; + } while (slot != index); + +cascade: + /* Calculate the next cascade event */ + if (index) + timer_jiffies += TVR_SIZE - index; + timer_jiffies >>= TVR_BITS; /* Check tv2-tv5. */ varray[0] = &base->tv2; varray[1] = &base->tv3; varray[2] = &base->tv4; varray[3] = &base->tv5; - for (i = 0; i < 4; i++) { - j = INDEX(i); + + for (array = 0; array < 4; array++) { + tvec_t *varp = varray[array]; + + index = slot = timer_jiffies & TVN_MASK; do { - if (list_empty(varray[i]->vec + j)) { - j = (j + 1) & TVN_MASK; - continue; - } - list_for_each_entry(nte, varray[i]->vec + j, entry) + list_for_each_entry(nte, varp->vec + slot, entry) { + found = 1; if (time_before(nte->expires, expires)) expires = nte->expires; - if (j < (INDEX(i)) && i < 3) - list = varray[i + 1]->vec + (INDEX(i + 1)); - goto found; - } while (j != (INDEX(i))); - } -found: - if (list) { - /* - * The search wrapped. We need to look at the next list - * from next tv element that would cascade into tv element - * where we found the timer element. - */ - list_for_each_entry(nte, list, entry) { - if (time_before(nte->expires, expires)) - expires = nte->expires; - } + } + /* + * Do we still search for the first timer or are + * we looking up the cascade buckets ? + */ + if (found) { + /* Look at the cascade bucket(s)? */ + if (!index || slot < index) + break; + return expires; + } + slot = (slot + 1) & TVN_MASK; + } while (slot != index); + + if (index) + timer_jiffies += TVN_SIZE - index; + timer_jiffies >>= TVN_BITS; } - spin_unlock(&base->lock); + return expires; +} - /* - * It can happen that other CPUs service timer IRQs and increment - * jiffies, but we have not yet got a local timer tick to process - * the timer wheels. In that case, the expiry time can be before - * jiffies, but since the high-resolution timer here is relative to - * jiffies, the default expression when high-resolution timers are - * not active, - * - * time_before(MAX_JIFFY_OFFSET + jiffies, expires) - * - * would falsely evaluate to true. If that is the case, just - * return jiffies so that we can immediately fire the local timer - */ - if (time_before(expires, jiffies)) - return jiffies; +/* + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +static unsigned long cmp_next_hrtimer_event(unsigned long now, + unsigned long expires) +{ + ktime_t hr_delta = hrtimer_get_next_event(); + struct timespec tsdelta; - if (time_before(hr_expires, expires)) - return hr_expires; + if (hr_delta.tv64 == KTIME_MAX) + return expires; + if (hr_delta.tv64 <= TICK_NSEC) + return now; + + tsdelta = ktime_to_timespec(hr_delta); + now += timespec_to_jiffies(&tsdelta); + if (time_before(now, expires)) + return now; return expires; } + +/** + * next_timer_interrupt - return the jiffy of the next pending timer + */ +unsigned long get_next_timer_interrupt(unsigned long now) +{ + tvec_base_t *base = __get_cpu_var(tvec_bases); + unsigned long expires; + + spin_lock(&base->lock); + expires = __next_timer_interrupt(base); + spin_unlock(&base->lock); + + if (time_before_eq(expires, now)) + return now; + + return cmp_next_hrtimer_event(now, expires); +} + +#ifdef CONFIG_NO_IDLE_HZ +unsigned long next_timer_interrupt(void) +{ + return get_next_timer_interrupt(jiffies); +} +#endif + #endif /******************************************************************/ @@ -832,32 +876,35 @@ EXPORT_SYMBOL(do_settimeofday); * * Accumulates current time interval and initializes new clocksource */ -static int change_clocksource(void) +static void change_clocksource(void) { struct clocksource *new; cycle_t now; u64 nsec; + new = clocksource_get_next(); - if (clock != new) { - now = clocksource_read(new); - nsec = __get_nsec_offset(); - timespec_add_ns(&xtime, nsec); - - clock = new; - clock->cycle_last = now; - printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); - return 1; - } else if (clock->update_callback) { - return clock->update_callback(); - } - return 0; + + if (clock == new) + return; + + now = clocksource_read(new); + nsec = __get_nsec_offset(); + timespec_add_ns(&xtime, nsec); + + clock = new; + clock->cycle_last = now; + + clock->error = 0; + clock->xtime_nsec = 0; + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); + + tick_clock_notify(); + + printk(KERN_INFO "Time: %s clocksource has been installed.\n", + clock->name); } #else -static inline int change_clocksource(void) -{ - return 0; -} +static inline void change_clocksource(void) { } #endif /** @@ -871,33 +918,56 @@ int timekeeping_is_continuous(void) do { seq = read_seqbegin(&xtime_lock); - ret = clock->is_continuous; + ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqretry(&xtime_lock, seq)); return ret; } +/** + * read_persistent_clock - Return time in seconds from the persistent clock. + * + * Weak dummy function for arches that do not yet support it. + * Returns seconds from epoch using the battery backed persistent clock. + * Returns zero if unsupported. + * + * XXX - Do be sure to remove it once all arches implement it. + */ +unsigned long __attribute__((weak)) read_persistent_clock(void) +{ + return 0; +} + /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ void __init timekeeping_init(void) { unsigned long flags; + unsigned long sec = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); ntp_clear(); clock = clocksource_get_next(); - clocksource_calculate_interval(clock, tick_nsec); + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); clock->cycle_last = clocksource_read(clock); + xtime.tv_sec = sec; + xtime.tv_nsec = 0; + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + write_sequnlock_irqrestore(&xtime_lock, flags); } - +/* flag for if timekeeping is suspended */ static int timekeeping_suspended; +/* time in seconds when suspend began */ +static unsigned long timekeeping_suspend_time; + /** * timekeeping_resume - Resumes the generic timekeeping subsystem. * @dev: unused @@ -909,13 +979,26 @@ static int timekeeping_suspended; static int timekeeping_resume(struct sys_device *dev) { unsigned long flags; + unsigned long now = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); - /* restart the last cycle value */ + + if (now && (now > timekeeping_suspend_time)) { + unsigned long sleep_length = now - timekeeping_suspend_time; + + xtime.tv_sec += sleep_length; + wall_to_monotonic.tv_sec -= sleep_length; + } + /* re-base the last cycle value */ clock->cycle_last = clocksource_read(clock); clock->error = 0; timekeeping_suspended = 0; write_sequnlock_irqrestore(&xtime_lock, flags); + + touch_softlockup_watchdog(); + /* Resume hrtimers */ + clock_was_set(); + return 0; } @@ -925,6 +1008,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) write_seqlock_irqsave(&xtime_lock, flags); timekeeping_suspended = 1; + timekeeping_suspend_time = read_persistent_clock(); write_sequnlock_irqrestore(&xtime_lock, flags); return 0; } @@ -1089,11 +1173,8 @@ static void update_wall_time(void) clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; /* check to see if there is a new clocksource to use */ - if (change_clocksource()) { - clock->error = 0; - clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, tick_nsec); - } + change_clocksource(); + update_vsyscall(&xtime, clock); } /* @@ -1162,11 +1243,9 @@ static inline void calc_load(unsigned long ticks) * This read-write spinlock protects us from races in SMP while * playing with xtime and avenrun. */ -#ifndef ARCH_HAVE_XTIME_LOCK -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); +__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); EXPORT_SYMBOL(xtime_lock); -#endif /* * This function runs timers and the timer-tq in bottom half context. @@ -1175,7 +1254,8 @@ static void run_timer_softirq(struct softirq_action *h) { tvec_base_t *base = __get_cpu_var(tvec_bases); - hrtimer_run_queues(); + hrtimer_run_queues(); + if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); } @@ -1392,17 +1472,16 @@ asmlinkage long sys_gettid(void) } /** - * sys_sysinfo - fill in sysinfo struct + * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill */ -asmlinkage long sys_sysinfo(struct sysinfo __user *info) +int do_sysinfo(struct sysinfo *info) { - struct sysinfo val; unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; unsigned long seq; - memset((char *)&val, 0, sizeof(struct sysinfo)); + memset(info, 0, sizeof(struct sysinfo)); do { struct timespec tp; @@ -1422,17 +1501,17 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; tp.tv_sec++; } - val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); - val.procs = nr_threads; + info->procs = nr_threads; } while (read_seqretry(&xtime_lock, seq)); - si_meminfo(&val); - si_swapinfo(&val); + si_meminfo(info); + si_swapinfo(info); /* * If the sum of all the available memory (i.e. ram + swap) @@ -1443,11 +1522,11 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) * -Erik Andersen <andersee@debian.org> */ - mem_total = val.totalram + val.totalswap; - if (mem_total < val.totalram || mem_total < val.totalswap) + mem_total = info->totalram + info->totalswap; + if (mem_total < info->totalram || mem_total < info->totalswap) goto out; bitcount = 0; - mem_unit = val.mem_unit; + mem_unit = info->mem_unit; while (mem_unit > 1) { bitcount++; mem_unit >>= 1; @@ -1459,22 +1538,31 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) /* * If mem_total did not overflow, multiply all memory values by - * val.mem_unit and set it to 1. This leaves things compatible + * info->mem_unit and set it to 1. This leaves things compatible * with 2.2.x, and also retains compatibility with earlier 2.4.x * kernels... */ - val.mem_unit = 1; - val.totalram <<= bitcount; - val.freeram <<= bitcount; - val.sharedram <<= bitcount; - val.bufferram <<= bitcount; - val.totalswap <<= bitcount; - val.freeswap <<= bitcount; - val.totalhigh <<= bitcount; - val.freehigh <<= bitcount; + info->mem_unit = 1; + info->totalram <<= bitcount; + info->freeram <<= bitcount; + info->sharedram <<= bitcount; + info->bufferram <<= bitcount; + info->totalswap <<= bitcount; + info->freeswap <<= bitcount; + info->totalhigh <<= bitcount; + info->freehigh <<= bitcount; + +out: + return 0; +} + +asmlinkage long sys_sysinfo(struct sysinfo __user *info) +{ + struct sysinfo val; + + do_sysinfo(&val); - out: if (copy_to_user(info, &val, sizeof(struct sysinfo))) return -EFAULT; @@ -1613,6 +1701,8 @@ void __init init_timers(void) int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + init_timer_stats(); + BUG_ON(err == NOTIFY_BAD); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); @@ -1624,7 +1714,7 @@ struct time_interpolator *time_interpolator __read_mostly; static struct time_interpolator *time_interpolator_list __read_mostly; static DEFINE_SPINLOCK(time_interpolator_lock); -static inline u64 time_interpolator_get_cycles(unsigned int src) +static inline cycles_t time_interpolator_get_cycles(unsigned int src) { unsigned long (*x)(void); @@ -1650,8 +1740,8 @@ static inline u64 time_interpolator_get_counter(int writelock) if (time_interpolator->jitter) { - u64 lcycle; - u64 now; + cycles_t lcycle; + cycles_t now; do { lcycle = time_interpolator->last_cycle; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index baacc36..658f638 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -22,8 +22,6 @@ #include <linux/acct.h> #include <linux/jiffies.h> - -#define USEC_PER_TICK (USEC_PER_SEC/HZ) /* * fill in basic accounting fields */ diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c new file mode 100644 index 0000000..f22b9db --- /dev/null +++ b/kernel/utsname_sysctl.c @@ -0,0 +1,146 @@ +/* + * Copyright (C) 2007 + * + * Author: Eric Biederman <ebiederm@xmision.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include <linux/module.h> +#include <linux/uts.h> +#include <linux/utsname.h> +#include <linux/version.h> +#include <linux/sysctl.h> + +static void *get_uts(ctl_table *table, int write) +{ + char *which = table->data; +#ifdef CONFIG_UTS_NS + struct uts_namespace *uts_ns = current->nsproxy->uts_ns; + which = (which - (char *)&init_uts_ns) + (char *)uts_ns; +#endif + if (!write) + down_read(&uts_sem); + else + down_write(&uts_sem); + return which; +} + +static void put_uts(ctl_table *table, int write, void *which) +{ + if (!write) + up_read(&uts_sem); + else + up_write(&uts_sem); +} + +#ifdef CONFIG_PROC_FS +/* + * Special case of dostring for the UTS structure. This has locks + * to observe. Should this be in kernel/sys.c ???? + */ +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table uts_table; + int r; + memcpy(&uts_table, table, sizeof(uts_table)); + uts_table.data = get_uts(table, write); + r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); + put_uts(table, write, uts_table.data); + return r; +} +#else +#define proc_do_uts_string NULL +#endif + + +#ifdef CONFIG_SYSCTL_SYSCALL +/* The generic string strategy routine: */ +static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + struct ctl_table uts_table; + int r, write; + write = newval && newlen; + memcpy(&uts_table, table, sizeof(uts_table)); + uts_table.data = get_uts(table, write); + r = sysctl_string(&uts_table, name, nlen, + oldval, oldlenp, newval, newlen); + put_uts(table, write, uts_table.data); + return r; +} +#else +#define sysctl_uts_string NULL +#endif + +static struct ctl_table uts_kern_table[] = { + { + .ctl_name = KERN_OSTYPE, + .procname = "ostype", + .data = init_uts_ns.name.sysname, + .maxlen = sizeof(init_uts_ns.name.sysname), + .mode = 0444, + .proc_handler = proc_do_uts_string, + .strategy = sysctl_uts_string, + }, + { + .ctl_name = KERN_OSRELEASE, + .procname = "osrelease", + .data = init_uts_ns.name.release, + .maxlen = sizeof(init_uts_ns.name.release), + .mode = 0444, + .proc_handler = proc_do_uts_string, + .strategy = sysctl_uts_string, + }, + { + .ctl_name = KERN_VERSION, + .procname = "version", + .data = init_uts_ns.name.version, + .maxlen = sizeof(init_uts_ns.name.version), + .mode = 0444, + .proc_handler = proc_do_uts_string, + .strategy = sysctl_uts_string, + }, + { + .ctl_name = KERN_NODENAME, + .procname = "hostname", + .data = init_uts_ns.name.nodename, + .maxlen = sizeof(init_uts_ns.name.nodename), + .mode = 0644, + .proc_handler = proc_do_uts_string, + .strategy = sysctl_uts_string, + }, + { + .ctl_name = KERN_DOMAINNAME, + .procname = "domainname", + .data = init_uts_ns.name.domainname, + .maxlen = sizeof(init_uts_ns.name.domainname), + .mode = 0644, + .proc_handler = proc_do_uts_string, + .strategy = sysctl_uts_string, + }, + {} +}; + +static struct ctl_table uts_root_table[] = { + { + .ctl_name = CTL_KERN, + .procname = "kernel", + .mode = 0555, + .child = uts_kern_table, + }, + {} +}; + +static int __init utsname_sysctl_init(void) +{ + register_sysctl_table(uts_root_table); + return 0; +} + +__initcall(utsname_sysctl_init); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a3da07c..b6fa5e6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -218,7 +218,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) } EXPORT_SYMBOL_GPL(queue_work); -static void delayed_work_timer_fn(unsigned long __data) +void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; struct workqueue_struct *wq = get_wq_data(&dwork->work); @@ -245,6 +245,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; + timer_stats_timer_set_start_info(timer); if (delay == 0) return queue_work(wq, work); @@ -593,8 +594,10 @@ EXPORT_SYMBOL(schedule_work); * After waiting for a given time this puts a job in the kernel-global * workqueue. */ -int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) +int fastcall schedule_delayed_work(struct delayed_work *dwork, + unsigned long delay) { + timer_stats_timer_set_start_info(&dwork->timer); return queue_delayed_work(keventd_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work); @@ -656,8 +659,7 @@ void flush_scheduled_work(void) EXPORT_SYMBOL(flush_scheduled_work); /** - * cancel_rearming_delayed_workqueue - reliably kill off a delayed - * work whose handler rearms the delayed work. + * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work. * @wq: the controlling workqueue structure * @dwork: the delayed work struct */ @@ -670,8 +672,7 @@ void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); /** - * cancel_rearming_delayed_work - reliably kill off a delayed keventd - * work whose handler rearms the delayed work. + * cancel_rearming_delayed_work - reliably kill off a delayed keventd work whose handler rearms the delayed work. * @dwork: the delayed work struct */ void cancel_rearming_delayed_work(struct delayed_work *dwork) |