diff options
Diffstat (limited to 'kernel')
87 files changed, 4633 insertions, 2728 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index bbde5f1..271fd31 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -24,6 +24,7 @@ endif obj-y += sched/ obj-y += power/ +obj-y += cpu/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o @@ -175,7 +176,7 @@ signing_key.priv signing_key.x509: x509.genkey openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \ -batch -x509 -config x509.genkey \ -outform DER -out signing_key.x509 \ - -keyout signing_key.priv + -keyout signing_key.priv 2>&1 @echo "###" @echo "### Key pair generated." @echo "###" diff --git a/kernel/acct.c b/kernel/acct.c index b9bd7f0..8d6e145 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -540,6 +540,12 @@ static void do_acct_process(struct bsd_acct_struct *acct, ac.ac_swaps = encode_comp_t(0); /* + * Get freeze protection. If the fs is frozen, just skip the write + * as we could deadlock the system otherwise. + */ + if (!file_start_write_trylock(file)) + goto out; + /* * Kernel segment override to datasegment and write it * to the accounting file. 
*/ @@ -554,6 +560,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, sizeof(acct_t), &file->f_pos); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; set_fs(fs); + file_end_write(file); out: revert_creds(orig_cred); } diff --git a/kernel/audit.c b/kernel/audit.c index 9816a1b..21c7fa6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -49,6 +49,8 @@ #include <linux/slab.h> #include <linux/err.h> #include <linux/kthread.h> +#include <linux/kernel.h> +#include <linux/syscalls.h> #include <linux/audit.h> @@ -58,7 +60,7 @@ #ifdef CONFIG_SECURITY #include <linux/security.h> #endif -#include <linux/netlink.h> +#include <net/netlink.h> #include <linux/freezer.h> #include <linux/tty.h> #include <linux/pid_namespace.h> @@ -265,7 +267,6 @@ void audit_log_lost(const char *message) } static int audit_log_config_change(char *function_name, int new, int old, - kuid_t loginuid, u32 sessionid, u32 sid, int allow_changes) { struct audit_buffer *ab; @@ -274,29 +275,17 @@ static int audit_log_config_change(char *function_name, int new, int old, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; - audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, - old, from_kuid(&init_user_ns, loginuid), sessionid); - if (sid) { - char *ctx = NULL; - u32 len; - - rc = security_secid_to_secctx(sid, &ctx, &len); - if (rc) { - audit_log_format(ab, " sid=%u", sid); - allow_changes = 0; /* Something weird, deny request */ - } else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } + audit_log_format(ab, "%s=%d old=%d", function_name, new, old); + audit_log_session_info(ab); + rc = audit_log_task_context(ab); + if (rc) + allow_changes = 0; /* Something weird, deny request */ audit_log_format(ab, " res=%d", allow_changes); audit_log_end(ab); return rc; } -static int audit_do_config_change(char *function_name, int *to_change, - int new, kuid_t loginuid, u32 sessionid, - u32 sid) +static int 
audit_do_config_change(char *function_name, int *to_change, int new) { int allow_changes, rc = 0, old = *to_change; @@ -307,8 +296,7 @@ static int audit_do_config_change(char *function_name, int *to_change, allow_changes = 1; if (audit_enabled != AUDIT_OFF) { - rc = audit_log_config_change(function_name, new, old, loginuid, - sessionid, sid, allow_changes); + rc = audit_log_config_change(function_name, new, old, allow_changes); if (rc) allow_changes = 0; } @@ -322,44 +310,37 @@ static int audit_do_config_change(char *function_name, int *to_change, return rc; } -static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid, - u32 sid) +static int audit_set_rate_limit(int limit) { - return audit_do_config_change("audit_rate_limit", &audit_rate_limit, - limit, loginuid, sessionid, sid); + return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit); } -static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid, - u32 sid) +static int audit_set_backlog_limit(int limit) { - return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, - limit, loginuid, sessionid, sid); + return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit); } -static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid) +static int audit_set_enabled(int state) { int rc; if (state < AUDIT_OFF || state > AUDIT_LOCKED) return -EINVAL; - rc = audit_do_config_change("audit_enabled", &audit_enabled, state, - loginuid, sessionid, sid); - + rc = audit_do_config_change("audit_enabled", &audit_enabled, state); if (!rc) audit_ever_enabled |= !!state; return rc; } -static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid) +static int audit_set_failure(int state) { if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK && state != AUDIT_FAIL_PANIC) return -EINVAL; - return audit_do_config_change("audit_failure", &audit_failure, state, - loginuid, sessionid, sid); + return 
audit_do_config_change("audit_failure", &audit_failure, state); } /* @@ -417,34 +398,53 @@ static void kauditd_send_skb(struct sk_buff *skb) consume_skb(skb); } -static int kauditd_thread(void *dummy) +/* + * flush_hold_queue - empty the hold queue if auditd appears + * + * If auditd just started, drain the queue of messages already + * sent to syslog/printk. Remember loss here is ok. We already + * called audit_log_lost() if it didn't go out normally. so the + * race between the skb_dequeue and the next check for audit_pid + * doesn't matter. + * + * If you ever find kauditd to be too slow we can get a perf win + * by doing our own locking and keeping better track if there + * are messages in this queue. I don't see the need now, but + * in 5 years when I want to play with this again I'll see this + * note and still have no friggin idea what i'm thinking today. + */ +static void flush_hold_queue(void) { struct sk_buff *skb; + if (!audit_default || !audit_pid) + return; + + skb = skb_dequeue(&audit_skb_hold_queue); + if (likely(!skb)) + return; + + while (skb && audit_pid) { + kauditd_send_skb(skb); + skb = skb_dequeue(&audit_skb_hold_queue); + } + + /* + * if auditd just disappeared but we + * dequeued an skb we need to drop ref + */ + if (skb) + consume_skb(skb); +} + +static int kauditd_thread(void *dummy) +{ set_freezable(); while (!kthread_should_stop()) { - /* - * if auditd just started drain the queue of messages already - * sent to syslog/printk. remember loss here is ok. we already - * called audit_log_lost() if it didn't go out normally. so the - * race between the skb_dequeue and the next check for audit_pid - * doesn't matter. - * - * if you ever find kauditd to be too slow we can get a perf win - * by doing our own locking and keeping better track if there - * are messages in this queue. I don't see the need now, but - * in 5 years when I want to play with this again I'll see this - * note and still have no friggin idea what i'm thinking today. 
- */ - if (audit_default && audit_pid) { - skb = skb_dequeue(&audit_skb_hold_queue); - if (unlikely(skb)) { - while (skb && audit_pid) { - kauditd_send_skb(skb); - skb = skb_dequeue(&audit_skb_hold_queue); - } - } - } + struct sk_buff *skb; + DECLARE_WAITQUEUE(wait, current); + + flush_hold_queue(); skb = skb_dequeue(&audit_skb_queue); wake_up(&audit_backlog_wait); @@ -453,19 +453,18 @@ static int kauditd_thread(void *dummy) kauditd_send_skb(skb); else audit_printk_skb(skb); - } else { - DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kauditd_wait, &wait); - - if (!skb_queue_len(&audit_skb_queue)) { - try_to_freeze(); - schedule(); - } + continue; + } + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&kauditd_wait, &wait); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&kauditd_wait, &wait); + if (!skb_queue_len(&audit_skb_queue)) { + try_to_freeze(); + schedule(); } + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&kauditd_wait, &wait); } return 0; } @@ -579,13 +578,14 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return -EPERM; switch (msg_type) { - case AUDIT_GET: case AUDIT_LIST: - case AUDIT_LIST_RULES: - case AUDIT_SET: case AUDIT_ADD: - case AUDIT_ADD_RULE: case AUDIT_DEL: + return -EOPNOTSUPP; + case AUDIT_GET: + case AUDIT_SET: + case AUDIT_LIST_RULES: + case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: case AUDIT_SIGNAL_INFO: case AUDIT_TTY_GET: @@ -608,12 +608,10 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return err; } -static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, - kuid_t auid, u32 ses, u32 sid) +static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) { int rc = 0; - char *ctx = NULL; - u32 len; + uid_t uid = from_kuid(&init_user_ns, current_uid()); if (!audit_enabled) { *ab = NULL; @@ -623,33 +621,21 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, *ab = 
audit_log_start(NULL, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return rc; - audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", - task_tgid_vnr(current), - from_kuid(&init_user_ns, current_uid()), - from_kuid(&init_user_ns, auid), ses); - if (sid) { - rc = security_secid_to_secctx(sid, &ctx, &len); - if (rc) - audit_log_format(*ab, " ssid=%u", sid); - else { - audit_log_format(*ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } + audit_log_format(*ab, "pid=%d uid=%u", task_tgid_vnr(current), uid); + audit_log_session_info(*ab); + audit_log_task_context(*ab); return rc; } static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { - u32 seq, sid; + u32 seq; void *data; struct audit_status *status_get, status_set; int err; struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; - kuid_t loginuid; /* loginuid of sender */ - u32 sessionid; struct audit_sig_info *sig_data; char *ctx = NULL; u32 len; @@ -668,9 +654,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } } - loginuid = audit_get_loginuid(current); - sessionid = audit_get_sessionid(current); - security_task_getsecid(current, &sid); seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); @@ -691,14 +674,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; status_get = (struct audit_status *)data; if (status_get->mask & AUDIT_STATUS_ENABLED) { - err = audit_set_enabled(status_get->enabled, - loginuid, sessionid, sid); + err = audit_set_enabled(status_get->enabled); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_FAILURE) { - err = audit_set_failure(status_get->failure, - loginuid, sessionid, sid); + err = audit_set_failure(status_get->failure); if (err < 0) return err; } @@ -706,22 +687,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) int new_pid = status_get->pid; if (audit_enabled != AUDIT_OFF) - audit_log_config_change("audit_pid", new_pid, - audit_pid, loginuid, 
- sessionid, sid, 1); - + audit_log_config_change("audit_pid", new_pid, audit_pid, 1); audit_pid = new_pid; audit_nlk_portid = NETLINK_CB(skb).portid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { - err = audit_set_rate_limit(status_get->rate_limit, - loginuid, sessionid, sid); + err = audit_set_rate_limit(status_get->rate_limit); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) - err = audit_set_backlog_limit(status_get->backlog_limit, - loginuid, sessionid, sid); + err = audit_set_backlog_limit(status_get->backlog_limit); break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: @@ -729,25 +705,22 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; - err = audit_filter_user(); + err = audit_filter_user(msg_type); if (err == 1) { err = 0; if (msg_type == AUDIT_USER_TTY) { - err = tty_audit_push_task(current, loginuid, - sessionid); + err = tty_audit_push_current(); if (err) break; } - audit_log_common_recv_msg(&ab, msg_type, - loginuid, sessionid, sid); - + audit_log_common_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) audit_log_format(ab, " msg='%.1024s'", (char *)data); else { int size; - audit_log_format(ab, " msg="); + audit_log_format(ab, " data="); size = nlmsg_len(nlh); if (size > 0 && ((unsigned char *)data)[size - 1] == '\0') @@ -758,50 +731,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_end(ab); } break; - case AUDIT_ADD: - case AUDIT_DEL: - if (nlmsg_len(nlh) < sizeof(struct audit_rule)) - return -EINVAL; - if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, - loginuid, sessionid, sid); - - audit_log_format(ab, " audit_enabled=%d res=0", - audit_enabled); - audit_log_end(ab); - return -EPERM; - } - /* fallthrough */ - case AUDIT_LIST: - err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, - seq, data, nlmsg_len(nlh), - 
loginuid, sessionid, sid); - break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, - loginuid, sessionid, sid); - - audit_log_format(ab, " audit_enabled=%d res=0", - audit_enabled); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); audit_log_end(ab); return -EPERM; } /* fallthrough */ case AUDIT_LIST_RULES: err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, - seq, data, nlmsg_len(nlh), - loginuid, sessionid, sid); + seq, data, nlmsg_len(nlh)); break; case AUDIT_TRIM: audit_trim_trees(); - - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, - loginuid, sessionid, sid); - + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); break; @@ -831,8 +778,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* OK, here comes... 
*/ err = audit_tag_tree(old, new); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, - loginuid, sessionid, sid); + audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); @@ -871,27 +817,30 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_tty_status s; struct task_struct *tsk = current; - spin_lock_irq(&tsk->sighand->siglock); + spin_lock(&tsk->sighand->siglock); s.enabled = tsk->signal->audit_tty != 0; - spin_unlock_irq(&tsk->sighand->siglock); + s.log_passwd = tsk->signal->audit_tty_log_passwd; + spin_unlock(&tsk->sighand->siglock); audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { - struct audit_tty_status *s; + struct audit_tty_status s; struct task_struct *tsk = current; - if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) - return -EINVAL; - s = data; - if (s->enabled != 0 && s->enabled != 1) + memset(&s, 0, sizeof(s)); + /* guard against past and future API changes */ + memcpy(&s, data, min(sizeof(s), (size_t)nlh->nlmsg_len)); + if ((s.enabled != 0 && s.enabled != 1) || + (s.log_passwd != 0 && s.log_passwd != 1)) return -EINVAL; - spin_lock_irq(&tsk->sighand->siglock); - tsk->signal->audit_tty = s->enabled != 0; - spin_unlock_irq(&tsk->sighand->siglock); + spin_lock(&tsk->sighand->siglock); + tsk->signal->audit_tty = s.enabled; + tsk->signal->audit_tty_log_passwd = s.log_passwd; + spin_unlock(&tsk->sighand->siglock); break; } default: @@ -910,7 +859,7 @@ static void audit_receive_skb(struct sk_buff *skb) { struct nlmsghdr *nlh; /* - * len MUST be signed for NLMSG_NEXT to be able to dec it below 0 + * len MUST be signed for nlmsg_next to be able to dec it below 0 * if the nlmsg_len was not aligned */ int len; @@ -919,13 +868,13 @@ static void audit_receive_skb(struct sk_buff *skb) nlh = nlmsg_hdr(skb); len = skb->len; - while (NLMSG_OK(nlh, len)) { + while (nlmsg_ok(nlh, 
len)) { err = audit_receive_msg(skb, nlh); /* if err or if this message says it wants a response */ if (err || (nlh->nlmsg_flags & NLM_F_ACK)) netlink_ack(skb, nlh, err); - nlh = NLMSG_NEXT(nlh, len); + nlh = nlmsg_next(nlh, &len); } } @@ -1434,6 +1383,14 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, kfree(pathname); } +void audit_log_session_info(struct audit_buffer *ab) +{ + u32 sessionid = audit_get_sessionid(current); + uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); + + audit_log_format(ab, " auid=%u ses=%u\n", auid, sessionid); +} + void audit_log_key(struct audit_buffer *ab, char *key) { audit_log_format(ab, " key="); @@ -1443,6 +1400,224 @@ void audit_log_key(struct audit_buffer *ab, char *key) audit_log_format(ab, "(null)"); } +void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) +{ + int i; + + audit_log_format(ab, " %s=", prefix); + CAP_FOR_EACH_U32(i) { + audit_log_format(ab, "%08x", + cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); + } +} + +void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) +{ + kernel_cap_t *perm = &name->fcap.permitted; + kernel_cap_t *inh = &name->fcap.inheritable; + int log = 0; + + if (!cap_isclear(*perm)) { + audit_log_cap(ab, "cap_fp", perm); + log = 1; + } + if (!cap_isclear(*inh)) { + audit_log_cap(ab, "cap_fi", inh); + log = 1; + } + + if (log) + audit_log_format(ab, " cap_fe=%d cap_fver=%x", + name->fcap.fE, name->fcap_ver); +} + +static inline int audit_copy_fcaps(struct audit_names *name, + const struct dentry *dentry) +{ + struct cpu_vfs_cap_data caps; + int rc; + + if (!dentry) + return 0; + + rc = get_vfs_caps_from_disk(dentry, &caps); + if (rc) + return rc; + + name->fcap.permitted = caps.permitted; + name->fcap.inheritable = caps.inheritable; + name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> + VFS_CAP_REVISION_SHIFT; + + return 0; +} + +/* Copy inode data 
into an audit_names. */ +void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, + const struct inode *inode) +{ + name->ino = inode->i_ino; + name->dev = inode->i_sb->s_dev; + name->mode = inode->i_mode; + name->uid = inode->i_uid; + name->gid = inode->i_gid; + name->rdev = inode->i_rdev; + security_inode_getsecid(inode, &name->osid); + audit_copy_fcaps(name, dentry); +} + +/** + * audit_log_name - produce AUDIT_PATH record from struct audit_names + * @context: audit_context for the task + * @n: audit_names structure with reportable details + * @path: optional path to report instead of audit_names->name + * @record_num: record number to report when handling a list of names + * @call_panic: optional pointer to int that will be updated if secid fails + */ +void audit_log_name(struct audit_context *context, struct audit_names *n, + struct path *path, int record_num, int *call_panic) +{ + struct audit_buffer *ab; + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + if (!ab) + return; + + audit_log_format(ab, "item=%d", record_num); + + if (path) + audit_log_d_path(ab, " name=", path); + else if (n->name) { + switch (n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd */ + audit_log_d_path(ab, " name=", &context->pwd); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name->name, + n->name_len); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != (unsigned long)-1) { + audit_log_format(ab, " inode=%lu" + " dev=%02x:%02x mode=%#ho" + " ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + from_kuid(&init_user_ns, n->uid), + from_kgid(&init_user_ns, n->gid), + MAJOR(n->rdev), + MINOR(n->rdev)); + } + if (n->osid 
!= 0) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx( + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); + if (call_panic) + *call_panic = 2; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + + audit_log_fcaps(ab, n); + audit_log_end(ab); +} + +int audit_log_task_context(struct audit_buffer *ab) +{ + char *ctx = NULL; + unsigned len; + int error; + u32 sid; + + security_task_getsecid(current, &sid); + if (!sid) + return 0; + + error = security_secid_to_secctx(sid, &ctx, &len); + if (error) { + if (error != -EINVAL) + goto error_path; + return 0; + } + + audit_log_format(ab, " subj=%s", ctx); + security_release_secctx(ctx, len); + return 0; + +error_path: + audit_panic("error in audit_log_task_context"); + return error; +} +EXPORT_SYMBOL(audit_log_task_context); + +void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) +{ + const struct cred *cred; + char name[sizeof(tsk->comm)]; + struct mm_struct *mm = tsk->mm; + char *tty; + + if (!ab) + return; + + /* tsk == current */ + cred = current_cred(); + + spin_lock_irq(&tsk->sighand->siglock); + if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) + tty = tsk->signal->tty->name; + else + tty = "(none)"; + spin_unlock_irq(&tsk->sighand->siglock); + + audit_log_format(ab, + " ppid=%ld pid=%d auid=%u uid=%u gid=%u" + " euid=%u suid=%u fsuid=%u" + " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", + sys_getppid(), + tsk->pid, + from_kuid(&init_user_ns, audit_get_loginuid(tsk)), + from_kuid(&init_user_ns, cred->uid), + from_kgid(&init_user_ns, cred->gid), + from_kuid(&init_user_ns, cred->euid), + from_kuid(&init_user_ns, cred->suid), + from_kuid(&init_user_ns, cred->fsuid), + from_kgid(&init_user_ns, cred->egid), + from_kgid(&init_user_ns, cred->sgid), + from_kgid(&init_user_ns, cred->fsgid), + audit_get_sessionid(tsk), tty); + + get_task_comm(name, tsk); + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, 
name); + + if (mm) { + down_read(&mm->mmap_sem); + if (mm->exe_file) + audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); + up_read(&mm->mmap_sem); + } + audit_log_task_context(ab); +} +EXPORT_SYMBOL(audit_log_task_info); + /** * audit_log_link_denied - report a link restriction denial * @operation: specific link opreation @@ -1451,19 +1626,28 @@ void audit_log_key(struct audit_buffer *ab, char *key) void audit_log_link_denied(const char *operation, struct path *link) { struct audit_buffer *ab; + struct audit_names *name; + + name = kzalloc(sizeof(*name), GFP_NOFS); + if (!name) + return; + /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */ ab = audit_log_start(current->audit_context, GFP_KERNEL, AUDIT_ANOM_LINK); if (!ab) - return; - audit_log_format(ab, "op=%s action=denied", operation); - audit_log_format(ab, " pid=%d comm=", current->pid); - audit_log_untrustedstring(ab, current->comm); - audit_log_d_path(ab, " path=", link); - audit_log_format(ab, " dev="); - audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino); + goto out; + audit_log_format(ab, "op=%s", operation); + audit_log_task_info(ab, current); + audit_log_format(ab, " res=0"); audit_log_end(ab); + + /* Generate AUDIT_PATH record with object. 
*/ + name->type = AUDIT_TYPE_NORMAL; + audit_copy_inode(name, link->dentry, link->dentry->d_inode); + audit_log_name(current->audit_context, name, link, 0, NULL); +out: + kfree(name); } /** @@ -1483,7 +1667,7 @@ void audit_log_end(struct audit_buffer *ab) audit_log_lost("rate limit exceeded"); } else { struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); + nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; if (audit_pid) { skb_queue_tail(&audit_skb_queue, ab->skb); diff --git a/kernel/audit.h b/kernel/audit.h index 11468d9..1c95131 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -22,6 +22,7 @@ #include <linux/fs.h> #include <linux/audit.h> #include <linux/skbuff.h> +#include <uapi/linux/mqueue.h> /* 0 = no checking 1 = put_count checking @@ -29,6 +30,11 @@ */ #define AUDIT_DEBUG 0 +/* AUDIT_NAMES is the number of slots we reserve in the audit_context + * for saving names from getname(). If we get more names we will allocate + * a name dynamically and also add those to the list anchored by names_list. */ +#define AUDIT_NAMES 5 + /* At task start time, the audit_state is set in the audit_context using a per-task filter. At syscall entry, the audit_state is augmented by the syscall filter. */ @@ -59,8 +65,158 @@ struct audit_entry { struct audit_krule rule; }; +struct audit_cap_data { + kernel_cap_t permitted; + kernel_cap_t inheritable; + union { + unsigned int fE; /* effective bit of file cap */ + kernel_cap_t effective; /* effective set of process */ + }; +}; + +/* When fs/namei.c:getname() is called, we store the pointer in name and + * we don't let putname() free it (instead we free all of the saved + * pointers at syscall exit time). + * + * Further, in fs/namei.c:path_lookup() we store the inode and device. + */ +struct audit_names { + struct list_head list; /* audit_context->names_list */ + + struct filename *name; + int name_len; /* number of chars to log */ + bool name_put; /* call __putname()? 
*/ + + unsigned long ino; + dev_t dev; + umode_t mode; + kuid_t uid; + kgid_t gid; + dev_t rdev; + u32 osid; + struct audit_cap_data fcap; + unsigned int fcap_ver; + unsigned char type; /* record type */ + /* + * This was an allocated audit_names and not from the array of + * names allocated in the task audit context. Thus this name + * should be freed on syscall exit. + */ + bool should_free; +}; + +/* The per-task audit context. */ +struct audit_context { + int dummy; /* must be the first element */ + int in_syscall; /* 1 if task is in a syscall */ + enum audit_state state, current_state; + unsigned int serial; /* serial number for record */ + int major; /* syscall number */ + struct timespec ctime; /* time of syscall entry */ + unsigned long argv[4]; /* syscall arguments */ + long return_code;/* syscall return code */ + u64 prio; + int return_valid; /* return code is valid */ + /* + * The names_list is the list of all audit_names collected during this + * syscall. The first AUDIT_NAMES entries in the names_list will + * actually be from the preallocated_names array for performance + * reasons. Except during allocation they should never be referenced + * through the preallocated_names array and should only be found/used + * by running the names_list. 
+ */ + struct audit_names preallocated_names[AUDIT_NAMES]; + int name_count; /* total records in names_list */ + struct list_head names_list; /* struct audit_names->list anchor */ + char *filterkey; /* key for rule that triggered record */ + struct path pwd; + struct audit_aux_data *aux; + struct audit_aux_data *aux_pids; + struct sockaddr_storage *sockaddr; + size_t sockaddr_len; + /* Save things to print about task_struct */ + pid_t pid, ppid; + kuid_t uid, euid, suid, fsuid; + kgid_t gid, egid, sgid, fsgid; + unsigned long personality; + int arch; + + pid_t target_pid; + kuid_t target_auid; + kuid_t target_uid; + unsigned int target_sessionid; + u32 target_sid; + char target_comm[TASK_COMM_LEN]; + + struct audit_tree_refs *trees, *first_trees; + struct list_head killed_trees; + int tree_count; + + int type; + union { + struct { + int nargs; + long args[6]; + } socketcall; + struct { + kuid_t uid; + kgid_t gid; + umode_t mode; + u32 osid; + int has_perm; + uid_t perm_uid; + gid_t perm_gid; + umode_t perm_mode; + unsigned long qbytes; + } ipc; + struct { + mqd_t mqdes; + struct mq_attr mqstat; + } mq_getsetattr; + struct { + mqd_t mqdes; + int sigev_signo; + } mq_notify; + struct { + mqd_t mqdes; + size_t msg_len; + unsigned int msg_prio; + struct timespec abs_timeout; + } mq_sendrecv; + struct { + int oflag; + umode_t mode; + struct mq_attr attr; + } mq_open; + struct { + pid_t pid; + struct audit_cap_data cap; + } capset; + struct { + int fd; + int flags; + } mmap; + }; + int fds[2]; + +#if AUDIT_DEBUG + int put_count; + int ino_count; +#endif +}; + extern int audit_ever_enabled; +extern void audit_copy_inode(struct audit_names *name, + const struct dentry *dentry, + const struct inode *inode); +extern void audit_log_cap(struct audit_buffer *ab, char *prefix, + kernel_cap_t *cap); +extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name); +extern void audit_log_name(struct audit_context *context, + struct audit_names *n, struct path *path, 
+ int record_num, int *call_panic); + extern int audit_pid; #define AUDIT_INODE_BUCKETS 32 diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 2674368..83a2970 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -310,121 +310,83 @@ static u32 audit_to_op(u32 op) return n; } - -/* Translate struct audit_rule to kernel's rule respresentation. - * Exists for backward compatibility with userspace. */ -static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) +/* check if an audit field is valid */ +static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) { - struct audit_entry *entry; - int err = 0; - int i; - - entry = audit_to_entry_common(rule); - if (IS_ERR(entry)) - goto exit_nofree; - - for (i = 0; i < rule->field_count; i++) { - struct audit_field *f = &entry->rule.fields[i]; - u32 n; - - n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); - - /* Support for legacy operators where - * AUDIT_NEGATE bit signifies != and otherwise assumes == */ - if (n & AUDIT_NEGATE) - f->op = Audit_not_equal; - else if (!n) - f->op = Audit_equal; - else - f->op = audit_to_op(n); - - entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 
2 : 1; - - f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); - f->val = rule->values[i]; - f->uid = INVALID_UID; - f->gid = INVALID_GID; - - err = -EINVAL; - if (f->op == Audit_bad) - goto exit_free; - - switch(f->type) { - default: - goto exit_free; - case AUDIT_UID: - case AUDIT_EUID: - case AUDIT_SUID: - case AUDIT_FSUID: - case AUDIT_LOGINUID: - /* bit ops not implemented for uid comparisons */ - if (f->op == Audit_bitmask || f->op == Audit_bittest) - goto exit_free; - - f->uid = make_kuid(current_user_ns(), f->val); - if (!uid_valid(f->uid)) - goto exit_free; - break; - case AUDIT_GID: - case AUDIT_EGID: - case AUDIT_SGID: - case AUDIT_FSGID: - /* bit ops not implemented for gid comparisons */ - if (f->op == Audit_bitmask || f->op == Audit_bittest) - goto exit_free; - - f->gid = make_kgid(current_user_ns(), f->val); - if (!gid_valid(f->gid)) - goto exit_free; - break; - case AUDIT_PID: - case AUDIT_PERS: - case AUDIT_MSGTYPE: - case AUDIT_PPID: - case AUDIT_DEVMAJOR: - case AUDIT_DEVMINOR: - case AUDIT_EXIT: - case AUDIT_SUCCESS: - /* bit ops are only useful on syscall args */ - if (f->op == Audit_bitmask || f->op == Audit_bittest) - goto exit_free; - break; - case AUDIT_ARG0: - case AUDIT_ARG1: - case AUDIT_ARG2: - case AUDIT_ARG3: - break; - /* arch is only allowed to be = or != */ - case AUDIT_ARCH: - if (f->op != Audit_not_equal && f->op != Audit_equal) - goto exit_free; - entry->rule.arch_f = f; - break; - case AUDIT_PERM: - if (f->val & ~15) - goto exit_free; - break; - case AUDIT_FILETYPE: - if (f->val & ~S_IFMT) - goto exit_free; - break; - case AUDIT_INODE: - err = audit_to_inode(&entry->rule, f); - if (err) - goto exit_free; - break; - } - } - - if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) - entry->rule.inode_f = NULL; - -exit_nofree: - return entry; + switch(f->type) { + case AUDIT_MSGTYPE: + if (entry->rule.listnr != AUDIT_FILTER_TYPE && + entry->rule.listnr != AUDIT_FILTER_USER) + return -EINVAL; + break; + }; 
-exit_free: - audit_free_rule(entry); - return ERR_PTR(err); + switch(f->type) { + default: + return -EINVAL; + case AUDIT_UID: + case AUDIT_EUID: + case AUDIT_SUID: + case AUDIT_FSUID: + case AUDIT_LOGINUID: + case AUDIT_OBJ_UID: + case AUDIT_GID: + case AUDIT_EGID: + case AUDIT_SGID: + case AUDIT_FSGID: + case AUDIT_OBJ_GID: + case AUDIT_PID: + case AUDIT_PERS: + case AUDIT_MSGTYPE: + case AUDIT_PPID: + case AUDIT_DEVMAJOR: + case AUDIT_DEVMINOR: + case AUDIT_EXIT: + case AUDIT_SUCCESS: + /* bit ops are only useful on syscall args */ + if (f->op == Audit_bitmask || f->op == Audit_bittest) + return -EINVAL; + break; + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: + case AUDIT_WATCH: + case AUDIT_DIR: + case AUDIT_FILTERKEY: + break; + case AUDIT_LOGINUID_SET: + if ((f->val != 0) && (f->val != 1)) + return -EINVAL; + /* FALL THROUGH */ + case AUDIT_ARCH: + if (f->op != Audit_not_equal && f->op != Audit_equal) + return -EINVAL; + break; + case AUDIT_PERM: + if (f->val & ~15) + return -EINVAL; + break; + case AUDIT_FILETYPE: + if (f->val & ~S_IFMT) + return -EINVAL; + break; + case AUDIT_FIELD_COMPARE: + if (f->val > AUDIT_MAX_FIELD_COMPARE) + return -EINVAL; + break; + }; + return 0; } /* Translate struct audit_rule_data to kernel's rule respresentation. 
*/ @@ -459,17 +421,25 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->gid = INVALID_GID; f->lsm_str = NULL; f->lsm_rule = NULL; - switch(f->type) { + + /* Support legacy tests for a valid loginuid */ + if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) { + f->type = AUDIT_LOGINUID_SET; + f->val = 0; + } + + err = audit_field_valid(entry, f); + if (err) + goto exit_free; + + err = -EINVAL; + switch (f->type) { + case AUDIT_LOGINUID: case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: case AUDIT_FSUID: - case AUDIT_LOGINUID: case AUDIT_OBJ_UID: - /* bit ops not implemented for uid comparisons */ - if (f->op == Audit_bitmask || f->op == Audit_bittest) - goto exit_free; - f->uid = make_kuid(current_user_ns(), f->val); if (!uid_valid(f->uid)) goto exit_free; @@ -479,27 +449,10 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_SGID: case AUDIT_FSGID: case AUDIT_OBJ_GID: - /* bit ops not implemented for gid comparisons */ - if (f->op == Audit_bitmask || f->op == Audit_bittest) - goto exit_free; - f->gid = make_kgid(current_user_ns(), f->val); if (!gid_valid(f->gid)) goto exit_free; break; - case AUDIT_PID: - case AUDIT_PERS: - case AUDIT_MSGTYPE: - case AUDIT_PPID: - case AUDIT_DEVMAJOR: - case AUDIT_DEVMINOR: - case AUDIT_EXIT: - case AUDIT_SUCCESS: - case AUDIT_ARG0: - case AUDIT_ARG1: - case AUDIT_ARG2: - case AUDIT_ARG3: - break; case AUDIT_ARCH: entry->rule.arch_f = f; break; @@ -570,20 +523,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, entry->rule.buflen += f->val; entry->rule.filterkey = str; break; - case AUDIT_PERM: - if (f->val & ~15) - goto exit_free; - break; - case AUDIT_FILETYPE: - if (f->val & ~S_IFMT) - goto exit_free; - break; - case AUDIT_FIELD_COMPARE: - if (f->val > AUDIT_MAX_FIELD_COMPARE) - goto exit_free; - break; - default: - goto exit_free; } } @@ -613,36 +552,6 @@ static inline size_t audit_pack_string(void **bufp, const char 
*str) return len; } -/* Translate kernel rule respresentation to struct audit_rule. - * Exists for backward compatibility with userspace. */ -static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) -{ - struct audit_rule *rule; - int i; - - rule = kzalloc(sizeof(*rule), GFP_KERNEL); - if (unlikely(!rule)) - return NULL; - - rule->flags = krule->flags | krule->listnr; - rule->action = krule->action; - rule->field_count = krule->field_count; - for (i = 0; i < rule->field_count; i++) { - rule->values[i] = krule->fields[i].val; - rule->fields[i] = krule->fields[i].type; - - if (krule->vers_ops == 1) { - if (krule->fields[i].op == Audit_not_equal) - rule->fields[i] |= AUDIT_NEGATE; - } else { - rule->fields[i] |= audit_ops[krule->fields[i].op]; - } - } - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; - - return rule; -} - /* Translate kernel rule respresentation to struct audit_rule_data. */ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) { @@ -1055,35 +964,6 @@ out: return ret; } -/* List rules using struct audit_rule. Exists for backward - * compatibility with userspace. */ -static void audit_list(int pid, int seq, struct sk_buff_head *q) -{ - struct sk_buff *skb; - struct audit_krule *r; - int i; - - /* This is a blocking read, so use audit_filter_mutex instead of rcu - * iterator to sync with list writers. */ - for (i=0; i<AUDIT_NR_FILTERS; i++) { - list_for_each_entry(r, &audit_rules_list[i], list) { - struct audit_rule *rule; - - rule = audit_krule_to_rule(r); - if (unlikely(!rule)) - break; - skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, - rule, sizeof(*rule)); - if (skb) - skb_queue_tail(q, skb); - kfree(rule); - } - } - skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); - if (skb) - skb_queue_tail(q, skb); -} - /* List rules using struct audit_rule_data. 
*/ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) { @@ -1113,11 +993,11 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) } /* Log rule additions and removals */ -static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, - char *action, struct audit_krule *rule, - int res) +static void audit_log_rule_change(char *action, struct audit_krule *rule, int res) { struct audit_buffer *ab; + uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); + u32 sessionid = audit_get_sessionid(current); if (!audit_enabled) return; @@ -1125,18 +1005,8 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (!ab) return; - audit_log_format(ab, "auid=%u ses=%u", - from_kuid(&init_user_ns, loginuid), sessionid); - if (sid) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx(sid, &ctx, &len)) - audit_log_format(ab, " ssid=%u", sid); - else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } + audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid); + audit_log_task_context(ab); audit_log_format(ab, " op="); audit_log_string(ab, action); audit_log_key(ab, rule->filterkey); @@ -1155,8 +1025,7 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, * @sessionid: sessionid for netlink audit message * @sid: SE Linux Security ID of sender */ -int audit_receive_filter(int type, int pid, int seq, void *data, - size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid) +int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz) { struct task_struct *tsk; struct audit_netlink_list *dest; @@ -1164,7 +1033,6 @@ int audit_receive_filter(int type, int pid, int seq, void *data, struct audit_entry *entry; switch (type) { - case AUDIT_LIST: case AUDIT_LIST_RULES: /* We can't just spew out the rules here because we might fill * the available socket 
buffer space and deadlock waiting for @@ -1179,10 +1047,7 @@ int audit_receive_filter(int type, int pid, int seq, void *data, skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); - if (type == AUDIT_LIST) - audit_list(pid, seq, &dest->q); - else - audit_list_rules(pid, seq, &dest->q); + audit_list_rules(pid, seq, &dest->q); mutex_unlock(&audit_filter_mutex); tsk = kthread_run(audit_send_list, dest, "audit_send_list"); @@ -1192,35 +1057,23 @@ int audit_receive_filter(int type, int pid, int seq, void *data, err = PTR_ERR(tsk); } break; - case AUDIT_ADD: case AUDIT_ADD_RULE: - if (type == AUDIT_ADD) - entry = audit_rule_to_entry(data); - else - entry = audit_data_to_entry(data, datasz); + entry = audit_data_to_entry(data, datasz); if (IS_ERR(entry)) return PTR_ERR(entry); err = audit_add_rule(entry); - audit_log_rule_change(loginuid, sessionid, sid, "add rule", - &entry->rule, !err); - + audit_log_rule_change("add rule", &entry->rule, !err); if (err) audit_free_rule(entry); break; - case AUDIT_DEL: case AUDIT_DEL_RULE: - if (type == AUDIT_DEL) - entry = audit_rule_to_entry(data); - else - entry = audit_data_to_entry(data, datasz); + entry = audit_data_to_entry(data, datasz); if (IS_ERR(entry)) return PTR_ERR(entry); err = audit_del_rule(entry); - audit_log_rule_change(loginuid, sessionid, sid, "remove rule", - &entry->rule, !err); - + audit_log_rule_change("remove rule", &entry->rule, !err); audit_free_rule(entry); break; default: @@ -1358,7 +1211,7 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen) return strncmp(p, dname, dlen); } -static int audit_filter_user_rules(struct audit_krule *rule, +static int audit_filter_user_rules(struct audit_krule *rule, int type, enum audit_state *state) { int i; @@ -1382,6 +1235,13 @@ static int audit_filter_user_rules(struct audit_krule *rule, result = audit_uid_comparator(audit_get_loginuid(current), f->op, f->uid); break; + case AUDIT_LOGINUID_SET: + result = 
audit_comparator(audit_loginuid_set(current), + f->op, f->val); + break; + case AUDIT_MSGTYPE: + result = audit_comparator(type, f->op, f->val); + break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: @@ -1408,7 +1268,7 @@ static int audit_filter_user_rules(struct audit_krule *rule, return 1; } -int audit_filter_user(void) +int audit_filter_user(int type) { enum audit_state state = AUDIT_DISABLED; struct audit_entry *e; @@ -1416,7 +1276,7 @@ int audit_filter_user(void) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { - if (audit_filter_user_rules(&e->rule, &state)) { + if (audit_filter_user_rules(&e->rule, type, &state)) { if (state == AUDIT_DISABLED) ret = 0; break; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index c682294..3c8a601 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -76,11 +76,6 @@ #define AUDITSC_SUCCESS 1 #define AUDITSC_FAILURE 2 -/* AUDIT_NAMES is the number of slots we reserve in the audit_context - * for saving names from getname(). If we get more names we will allocate - * a name dynamically and also add those to the list anchored by names_list. */ -#define AUDIT_NAMES 5 - /* no execve audit message should be longer than this (userspace limits) */ #define MAX_EXECVE_AUDIT_LEN 7500 @@ -90,44 +85,6 @@ int audit_n_rules; /* determines whether we collect data for signals sent */ int audit_signals; -struct audit_cap_data { - kernel_cap_t permitted; - kernel_cap_t inheritable; - union { - unsigned int fE; /* effective bit of a file capability */ - kernel_cap_t effective; /* effective set of a process */ - }; -}; - -/* When fs/namei.c:getname() is called, we store the pointer in name and - * we don't let putname() free it (instead we free all of the saved - * pointers at syscall exit time). - * - * Further, in fs/namei.c:path_lookup() we store the inode and device. 
- */ -struct audit_names { - struct list_head list; /* audit_context->names_list */ - struct filename *name; - unsigned long ino; - dev_t dev; - umode_t mode; - kuid_t uid; - kgid_t gid; - dev_t rdev; - u32 osid; - struct audit_cap_data fcap; - unsigned int fcap_ver; - int name_len; /* number of name's characters to log */ - unsigned char type; /* record type */ - bool name_put; /* call __putname() for this name */ - /* - * This was an allocated audit_names and not from the array of - * names allocated in the task audit context. Thus this name - * should be freed on syscall exit - */ - bool should_free; -}; - struct audit_aux_data { struct audit_aux_data *next; int type; @@ -175,106 +132,6 @@ struct audit_tree_refs { struct audit_chunk *c[31]; }; -/* The per-task audit context. */ -struct audit_context { - int dummy; /* must be the first element */ - int in_syscall; /* 1 if task is in a syscall */ - enum audit_state state, current_state; - unsigned int serial; /* serial number for record */ - int major; /* syscall number */ - struct timespec ctime; /* time of syscall entry */ - unsigned long argv[4]; /* syscall arguments */ - long return_code;/* syscall return code */ - u64 prio; - int return_valid; /* return code is valid */ - /* - * The names_list is the list of all audit_names collected during this - * syscall. The first AUDIT_NAMES entries in the names_list will - * actually be from the preallocated_names array for performance - * reasons. Except during allocation they should never be referenced - * through the preallocated_names array and should only be found/used - * by running the names_list. 
- */ - struct audit_names preallocated_names[AUDIT_NAMES]; - int name_count; /* total records in names_list */ - struct list_head names_list; /* anchor for struct audit_names->list */ - char * filterkey; /* key for rule that triggered record */ - struct path pwd; - struct audit_aux_data *aux; - struct audit_aux_data *aux_pids; - struct sockaddr_storage *sockaddr; - size_t sockaddr_len; - /* Save things to print about task_struct */ - pid_t pid, ppid; - kuid_t uid, euid, suid, fsuid; - kgid_t gid, egid, sgid, fsgid; - unsigned long personality; - int arch; - - pid_t target_pid; - kuid_t target_auid; - kuid_t target_uid; - unsigned int target_sessionid; - u32 target_sid; - char target_comm[TASK_COMM_LEN]; - - struct audit_tree_refs *trees, *first_trees; - struct list_head killed_trees; - int tree_count; - - int type; - union { - struct { - int nargs; - long args[6]; - } socketcall; - struct { - kuid_t uid; - kgid_t gid; - umode_t mode; - u32 osid; - int has_perm; - uid_t perm_uid; - gid_t perm_gid; - umode_t perm_mode; - unsigned long qbytes; - } ipc; - struct { - mqd_t mqdes; - struct mq_attr mqstat; - } mq_getsetattr; - struct { - mqd_t mqdes; - int sigev_signo; - } mq_notify; - struct { - mqd_t mqdes; - size_t msg_len; - unsigned int msg_prio; - struct timespec abs_timeout; - } mq_sendrecv; - struct { - int oflag; - umode_t mode; - struct mq_attr attr; - } mq_open; - struct { - pid_t pid; - struct audit_cap_data cap; - } capset; - struct { - int fd; - int flags; - } mmap; - }; - int fds[2]; - -#if AUDIT_DEBUG - int put_count; - int ino_count; -#endif -}; - static inline int open_arg(int flags, int mask) { int n = ACC_MODE(flags); @@ -633,9 +490,23 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_GID: result = audit_gid_comparator(cred->gid, f->op, f->gid); + if (f->op == Audit_equal) { + if (!result) + result = in_group_p(f->gid); + } else if (f->op == Audit_not_equal) { + if (result) + result = !in_group_p(f->gid); + } break; case 
AUDIT_EGID: result = audit_gid_comparator(cred->egid, f->op, f->gid); + if (f->op == Audit_equal) { + if (!result) + result = in_egroup_p(f->gid); + } else if (f->op == Audit_not_equal) { + if (result) + result = !in_egroup_p(f->gid); + } break; case AUDIT_SGID: result = audit_gid_comparator(cred->sgid, f->op, f->gid); @@ -742,6 +613,9 @@ static int audit_filter_rules(struct task_struct *tsk, if (ctx) result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); break; + case AUDIT_LOGINUID_SET: + result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); + break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: @@ -987,6 +861,8 @@ static inline void audit_free_names(struct audit_context *context) #if AUDIT_DEBUG == 2 if (context->put_count + context->ino_count != context->name_count) { + int i = 0; + printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" " name_count=%d put_count=%d" " ino_count=%d [NOT freeing]\n", @@ -995,7 +871,7 @@ static inline void audit_free_names(struct audit_context *context) context->name_count, context->put_count, context->ino_count); list_for_each_entry(n, &context->names_list, list) { - printk(KERN_ERR "names[%d] = %p = %s\n", i, + printk(KERN_ERR "names[%d] = %p = %s\n", i++, n->name, n->name->name ?: "(null)"); } dump_stack(); @@ -1010,7 +886,7 @@ static inline void audit_free_names(struct audit_context *context) list_for_each_entry_safe(n, next, &context->names_list, list) { list_del(&n->list); if (n->name && n->name_put) - __putname(n->name); + final_putname(n->name); if (n->should_free) kfree(n); } @@ -1093,88 +969,6 @@ static inline void audit_free_context(struct audit_context *context) kfree(context); } -void audit_log_task_context(struct audit_buffer *ab) -{ - char *ctx = NULL; - unsigned len; - int error; - u32 sid; - - security_task_getsecid(current, &sid); - if (!sid) - return; - - error = security_secid_to_secctx(sid, &ctx, &len); - if (error) { - if (error != -EINVAL) - goto error_path; - return; - } 
- - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - return; - -error_path: - audit_panic("error in audit_log_task_context"); - return; -} - -EXPORT_SYMBOL(audit_log_task_context); - -void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) -{ - const struct cred *cred; - char name[sizeof(tsk->comm)]; - struct mm_struct *mm = tsk->mm; - char *tty; - - if (!ab) - return; - - /* tsk == current */ - cred = current_cred(); - - spin_lock_irq(&tsk->sighand->siglock); - if (tsk->signal && tsk->signal->tty) - tty = tsk->signal->tty->name; - else - tty = "(none)"; - spin_unlock_irq(&tsk->sighand->siglock); - - - audit_log_format(ab, - " ppid=%ld pid=%d auid=%u uid=%u gid=%u" - " euid=%u suid=%u fsuid=%u" - " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", - sys_getppid(), - tsk->pid, - from_kuid(&init_user_ns, tsk->loginuid), - from_kuid(&init_user_ns, cred->uid), - from_kgid(&init_user_ns, cred->gid), - from_kuid(&init_user_ns, cred->euid), - from_kuid(&init_user_ns, cred->suid), - from_kuid(&init_user_ns, cred->fsuid), - from_kgid(&init_user_ns, cred->egid), - from_kgid(&init_user_ns, cred->sgid), - from_kgid(&init_user_ns, cred->fsgid), - tsk->sessionid, tty); - - get_task_comm(name, tsk); - audit_log_format(ab, " comm="); - audit_log_untrustedstring(ab, name); - - if (mm) { - down_read(&mm->mmap_sem); - if (mm->exe_file) - audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); - up_read(&mm->mmap_sem); - } - audit_log_task_context(ab); -} - -EXPORT_SYMBOL(audit_log_task_info); - static int audit_log_pid_context(struct audit_context *context, pid_t pid, kuid_t auid, kuid_t uid, unsigned int sessionid, u32 sid, char *comm) @@ -1191,12 +985,14 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); - if (security_secid_to_secctx(sid, &ctx, &len)) { - audit_log_format(ab, " 
obj=(none)"); - rc = 1; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); + if (sid) { + if (security_secid_to_secctx(sid, &ctx, &len)) { + audit_log_format(ab, " obj=(none)"); + rc = 1; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } } audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); @@ -1390,35 +1186,6 @@ static void audit_log_execve_info(struct audit_context *context, kfree(buf); } -static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) -{ - int i; - - audit_log_format(ab, " %s=", prefix); - CAP_FOR_EACH_U32(i) { - audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); - } -} - -static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) -{ - kernel_cap_t *perm = &name->fcap.permitted; - kernel_cap_t *inh = &name->fcap.inheritable; - int log = 0; - - if (!cap_isclear(*perm)) { - audit_log_cap(ab, "cap_fp", perm); - log = 1; - } - if (!cap_isclear(*inh)) { - audit_log_cap(ab, "cap_fi", inh); - log = 1; - } - - if (log) - audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); -} - static void show_special(struct audit_context *context, int *call_panic) { struct audit_buffer *ab; @@ -1516,68 +1283,6 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); } -static void audit_log_name(struct audit_context *context, struct audit_names *n, - int record_num, int *call_panic) -{ - struct audit_buffer *ab; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - return; /* audit_panic has been called */ - - audit_log_format(ab, "item=%d", record_num); - - if (n->name) { - switch (n->name_len) { - case AUDIT_NAME_FULL: - /* log the full path */ - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ 
- audit_log_d_path(ab, " name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != (unsigned long)-1) { - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#ho" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - from_kuid(&init_user_ns, n->uid), - from_kgid(&init_user_ns, n->gid), - MAJOR(n->rdev), - MINOR(n->rdev)); - } - if (n->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - audit_log_fcaps(ab, n); - - audit_log_end(ab); -} - static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -1695,7 +1400,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts i = 0; list_for_each_entry(n, &context->names_list, list) - audit_log_name(context, n, i++, &call_panic); + audit_log_name(context, n, NULL, i++, &call_panic); /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -2030,18 +1735,18 @@ void audit_putname(struct filename *name) BUG_ON(!context); if (!context->in_syscall) { #if AUDIT_DEBUG == 2 - printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", + printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n", __FILE__, __LINE__, context->serial, name); if (context->name_count) { struct audit_names *n; - int i; + int i = 0; list_for_each_entry(n, &context->names_list, list) - printk(KERN_ERR "name[%d] = %p = %s\n", i, + printk(KERN_ERR "name[%d] = %p = %s\n", i++, n->name, n->name->name ?: "(null)"); } #endif - __putname(name); + final_putname(name); } #if AUDIT_DEBUG 
else { @@ -2060,41 +1765,6 @@ void audit_putname(struct filename *name) #endif } -static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) -{ - struct cpu_vfs_cap_data caps; - int rc; - - if (!dentry) - return 0; - - rc = get_vfs_caps_from_disk(dentry, &caps); - if (rc) - return rc; - - name->fcap.permitted = caps.permitted; - name->fcap.inheritable = caps.inheritable; - name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); - name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; - - return 0; -} - - -/* Copy inode data into an audit_names. */ -static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - const struct inode *inode) -{ - name->ino = inode->i_ino; - name->dev = inode->i_sb->s_dev; - name->mode = inode->i_mode; - name->uid = inode->i_uid; - name->gid = inode->i_gid; - name->rdev = inode->i_rdev; - security_inode_getsecid(inode, &name->osid); - audit_copy_fcaps(name, dentry); -} - /** * __audit_inode - store the inode and device from a lookup * @name: name being audited @@ -2303,7 +1973,7 @@ int audit_set_loginuid(kuid_t loginuid) unsigned int sessionid; #ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE - if (uid_valid(task->loginuid)) + if (audit_loginuid_set(task)) return -EPERM; #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ if (!capable(CAP_AUDIT_CONTROL)) @@ -2471,17 +2141,20 @@ int __audit_bprm(struct linux_binprm *bprm) /** * audit_socketcall - record audit data for sys_socketcall - * @nargs: number of args + * @nargs: number of args, which should not be more than AUDITSC_ARGS. 
* @args: args array * */ -void __audit_socketcall(int nargs, unsigned long *args) +int __audit_socketcall(int nargs, unsigned long *args) { struct audit_context *context = current->audit_context; + if (nargs <= 0 || nargs > AUDITSC_ARGS || !args) + return -EINVAL; context->type = AUDIT_SOCKETCALL; context->socketcall.nargs = nargs; memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); + return 0; } /** diff --git a/kernel/cgroup.c b/kernel/cgroup.c index eeb7e49..2a99262 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4380,7 +4380,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - ss->active = 1; BUG_ON(online_css(ss, dummytop)); mutex_unlock(&cgroup_mutex); @@ -4485,7 +4484,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - ss->active = 1; ret = online_css(ss, dummytop); if (ret) goto err_unload; @@ -4526,7 +4524,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); offline_css(ss, dummytop); - ss->active = 0; if (ss->use_id) idr_destroy(&ss->idr); @@ -4681,7 +4678,7 @@ out: */ /* TODO: Use a proper seq_file iterator */ -static int proc_cgroup_show(struct seq_file *m, void *v) +int proc_cgroup_show(struct seq_file *m, void *v) { struct pid *pid; struct task_struct *tsk; @@ -4733,19 +4730,6 @@ out: return retval; } -static int cgroup_open(struct inode *inode, struct file *file) -{ - struct pid *pid = PROC_I(inode)->pid; - return single_open(file, proc_cgroup_show, pid); -} - -const struct file_operations proc_cgroup_operations = { - .open = cgroup_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /* Display information about each subsystem and each hierarchy */ static int proc_cgroupstats_show(struct seq_file *m, void *v) { diff --git a/kernel/compat.c b/kernel/compat.c index 19971d8..0a09e48 100644 --- a/kernel/compat.c +++ 
b/kernel/compat.c @@ -516,25 +516,6 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) return 0; } -asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru) -{ - struct rusage r; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_getrusage(who, (struct rusage __user *) &r); - set_fs(old_fs); - - if (ret) - return ret; - - if (put_compat_rusage(&r, ru)) - return -EFAULT; - - return 0; -} - COMPAT_SYSCALL_DEFINE4(wait4, compat_pid_t, pid, compat_uint_t __user *, stat_addr, @@ -1138,71 +1119,6 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, } #endif -struct compat_sysinfo { - s32 uptime; - u32 loads[3]; - u32 totalram; - u32 freeram; - u32 sharedram; - u32 bufferram; - u32 totalswap; - u32 freeswap; - u16 procs; - u16 pad; - u32 totalhigh; - u32 freehigh; - u32 mem_unit; - char _f[20-2*sizeof(u32)-sizeof(int)]; -}; - -asmlinkage long -compat_sys_sysinfo(struct compat_sysinfo __user *info) -{ - struct sysinfo s; - - do_sysinfo(&s); - - /* Check to see if any memory value is too large for 32-bit and scale - * down if needed - */ - if ((s.totalram >> 32) || (s.totalswap >> 32)) { - int bitcount = 0; - - while (s.mem_unit < PAGE_SIZE) { - s.mem_unit <<= 1; - bitcount++; - } - - s.totalram >>= bitcount; - s.freeram >>= bitcount; - s.sharedram >>= bitcount; - s.bufferram >>= bitcount; - s.totalswap >>= bitcount; - s.freeswap >>= bitcount; - s.totalhigh >>= bitcount; - s.freehigh >>= bitcount; - } - - if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) || - __put_user (s.uptime, &info->uptime) || - __put_user (s.loads[0], &info->loads[0]) || - __put_user (s.loads[1], &info->loads[1]) || - __put_user (s.loads[2], &info->loads[2]) || - __put_user (s.totalram, &info->totalram) || - __put_user (s.freeram, &info->freeram) || - __put_user (s.sharedram, &info->sharedram) || - __put_user (s.bufferram, &info->bufferram) || - __put_user (s.totalswap, &info->totalswap) || - 
__put_user (s.freeswap, &info->freeswap) || - __put_user (s.procs, &info->procs) || - __put_user (s.totalhigh, &info->totalhigh) || - __put_user (s.freehigh, &info->freehigh) || - __put_user (s.mem_unit, &info->mem_unit)) - return -EFAULT; - - return 0; -} - COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, compat_pid_t, pid, struct compat_timespec __user *, interval) diff --git a/kernel/configs.c b/kernel/configs.c index 42e8fa0..c18b1f1 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -79,7 +79,7 @@ static int __init ikconfig_init(void) if (!entry) return -ENOMEM; - entry->size = kernel_config_data_size; + proc_set_size(entry, kernel_config_data_size); return 0; } diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile new file mode 100644 index 0000000..59ab052 --- /dev/null +++ b/kernel/cpu/Makefile @@ -0,0 +1 @@ +obj-y = idle.o diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c new file mode 100644 index 0000000..8b86c0c --- /dev/null +++ b/kernel/cpu/idle.c @@ -0,0 +1,116 @@ +/* + * Generic entry point for the idle threads + */ +#include <linux/sched.h> +#include <linux/cpu.h> +#include <linux/tick.h> +#include <linux/mm.h> + +#include <asm/tlb.h> + +#include <trace/events/power.h> + +static int __read_mostly cpu_idle_force_poll; + +void cpu_idle_poll_ctrl(bool enable) +{ + if (enable) { + cpu_idle_force_poll++; + } else { + cpu_idle_force_poll--; + WARN_ON_ONCE(cpu_idle_force_poll < 0); + } +} + +#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP +static int __init cpu_idle_poll_setup(char *__unused) +{ + cpu_idle_force_poll = 1; + return 1; +} +__setup("nohlt", cpu_idle_poll_setup); + +static int __init cpu_idle_nopoll_setup(char *__unused) +{ + cpu_idle_force_poll = 0; + return 1; +} +__setup("hlt", cpu_idle_nopoll_setup); +#endif + +static inline int cpu_idle_poll(void) +{ + trace_cpu_idle_rcuidle(0, smp_processor_id()); + local_irq_enable(); + while (!need_resched()) + cpu_relax(); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + return 1; +} + +/* 
Weak implementations for optional arch specific functions */ +void __weak arch_cpu_idle_prepare(void) { } +void __weak arch_cpu_idle_enter(void) { } +void __weak arch_cpu_idle_exit(void) { } +void __weak arch_cpu_idle_dead(void) { } +void __weak arch_cpu_idle(void) +{ + cpu_idle_force_poll = 1; +} + +/* + * Generic idle loop implementation + */ +static void cpu_idle_loop(void) +{ + while (1) { + tick_nohz_idle_enter(); + + while (!need_resched()) { + check_pgt_cache(); + rmb(); + + if (cpu_is_offline(smp_processor_id())) + arch_cpu_idle_dead(); + + local_irq_disable(); + arch_cpu_idle_enter(); + + /* + * In poll mode we reenable interrupts and spin. + * + * Also if we detected in the wakeup from idle + * path that the tick broadcast device expired + * for us, we don't want to go deep idle as we + * know that the IPI is going to arrive right + * away + */ + if (cpu_idle_force_poll || tick_check_broadcast_expired()) { + cpu_idle_poll(); + } else { + current_clr_polling(); + if (!need_resched()) { + stop_critical_timings(); + rcu_idle_enter(); + arch_cpu_idle(); + WARN_ON_ONCE(irqs_disabled()); + rcu_idle_exit(); + start_critical_timings(); + } else { + local_irq_enable(); + } + current_set_polling(); + } + arch_cpu_idle_exit(); + } + tick_nohz_idle_exit(); + schedule_preempt_disabled(); + } +} + +void cpu_startup_entry(enum cpuhp_state state) +{ + current_set_polling(); + arch_cpu_idle_prepare(); + cpu_idle_loop(); +} diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1233112..64b3f79 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2609,7 +2609,7 @@ void __cpuset_memory_pressure_bump(void) * and we take cpuset_mutex, keeping cpuset_attach() from changing it * anyway. 
*/ -static int proc_cpuset_show(struct seq_file *m, void *unused_v) +int proc_cpuset_show(struct seq_file *m, void *unused_v) { struct pid *pid; struct task_struct *tsk; @@ -2643,19 +2643,6 @@ out_free: out: return retval; } - -static int cpuset_open(struct inode *inode, struct file *file) -{ - struct pid *pid = PROC_I(inode)->pid; - return single_open(file, proc_cpuset_show, pid); -} - -const struct file_operations proc_cpuset_operations = { - .open = cpuset_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif /* CONFIG_PROC_PID_CPUSET */ /* Display task mems_allowed in /proc/<pid>/status file. */ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index c26278f..0506d44 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -775,7 +775,7 @@ static void sysrq_handle_dbg(int key) static struct sysrq_key_op sysrq_dbg_op = { .handler = sysrq_handle_dbg, - .help_msg = "debug(G)", + .help_msg = "debug(g)", .action_msg = "DEBUG", }; #endif diff --git a/kernel/events/core.c b/kernel/events/core.c index dce6e13..6b41c18 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -18,6 +18,7 @@ #include <linux/poll.h> #include <linux/slab.h> #include <linux/hash.h> +#include <linux/tick.h> #include <linux/sysfs.h> #include <linux/dcache.h> #include <linux/percpu.h> @@ -37,6 +38,7 @@ #include <linux/ftrace_event.h> #include <linux/hw_breakpoint.h> #include <linux/mm_types.h> +#include <linux/cgroup.h> #include "internal.h" @@ -234,6 +236,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, #ifdef CONFIG_CGROUP_PERF /* + * perf_cgroup_info keeps track of time_enabled for a cgroup. + * This is a per-cpu dynamically allocated data structure. + */ +struct perf_cgroup_info { + u64 time; + u64 timestamp; +}; + +struct perf_cgroup { + struct cgroup_subsys_state css; + struct perf_cgroup_info __percpu *info; +}; + +/* * Must ensure cgroup is pinned (css_get) before calling * this function. 
In other words, we cannot call this function * if there is no cgroup event for the current CPU context. @@ -670,8 +686,12 @@ static void perf_pmu_rotate_start(struct pmu *pmu) WARN_ON(!irqs_disabled()); - if (list_empty(&cpuctx->rotation_list)) + if (list_empty(&cpuctx->rotation_list)) { + int was_empty = list_empty(head); list_add(&cpuctx->rotation_list, head); + if (was_empty) + tick_nohz_full_kick(); + } } static void get_ctx(struct perf_event_context *ctx) @@ -976,9 +996,15 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); + if (sample_type & PERF_SAMPLE_WEIGHT) + size += sizeof(data->weight); + if (sample_type & PERF_SAMPLE_READ) size += event->read_size; + if (sample_type & PERF_SAMPLE_DATA_SRC) + size += sizeof(data->data_src.val); + event->header_size = size; } @@ -2570,6 +2596,16 @@ done: list_del_init(&cpuctx->rotation_list); } +#ifdef CONFIG_NO_HZ_FULL +bool perf_event_can_stop_tick(void) +{ + if (list_empty(&__get_cpu_var(rotation_list))) + return true; + else + return false; +} +#endif + void perf_event_task_tick(void) { struct list_head *head = &__get_cpu_var(rotation_list); @@ -4193,6 +4229,12 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_sample_ustack(handle, data->stack_user_size, data->regs_user.regs); + + if (sample_type & PERF_SAMPLE_WEIGHT) + perf_output_put(handle, data->weight); + + if (sample_type & PERF_SAMPLE_DATA_SRC) + perf_output_put(handle, data->data_src.val); } void perf_prepare_sample(struct perf_event_header *header, @@ -4782,6 +4824,9 @@ got_name: mmap_event->file_name = name; mmap_event->file_size = size; + if (!(vma->vm_flags & VM_EXEC)) + mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; rcu_read_lock(); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 97fddb0..cd55144 100644 --- 
a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -326,11 +326,16 @@ void rb_free(struct ring_buffer *rb) } #else +static int data_page_nr(struct ring_buffer *rb) +{ + return rb->nr_pages << page_order(rb); +} struct page * perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { - if (pgoff > (1UL << page_order(rb))) + /* The '>' counts in the user page. */ + if (pgoff > data_page_nr(rb)) return NULL; return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); @@ -350,10 +355,11 @@ static void rb_free_work(struct work_struct *work) int i, nr; rb = container_of(work, struct ring_buffer, work); - nr = 1 << page_order(rb); + nr = data_page_nr(rb); base = rb->user_page; - for (i = 0; i < nr + 1; i++) + /* The '<=' counts in the user page. */ + for (i = 0; i <= nr; i++) perf_mmap_unmark_page(base + (i * PAGE_SIZE)); vfree(base); @@ -387,7 +393,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) rb->user_page = all_buf; rb->data_pages[0] = all_buf + PAGE_SIZE; rb->page_order = ilog2(nr_pages); - rb->nr_pages = 1; + rb->nr_pages = !!nr_pages; ring_buffer_init(rb, watermark, flags); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a567c8c..f356974 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -75,6 +75,15 @@ struct uprobe { struct arch_uprobe arch; }; +struct return_instance { + struct uprobe *uprobe; + unsigned long func; + unsigned long orig_ret_vaddr; /* original return address */ + bool chained; /* true, if instance is nested */ + + struct return_instance *next; /* keep as stack */ +}; + /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have @@ -173,10 +182,31 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn) return *insn == UPROBE_SWBP_INSN; } -static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) +/** + * is_trap_insn - check if instruction is breakpoint 
instruction. + * @insn: instruction to be checked. + * Default implementation of is_trap_insn + * Returns true if @insn is a breakpoint instruction. + * + * This function is needed for the case where an architecture has multiple + * trap instructions (like powerpc). + */ +bool __weak is_trap_insn(uprobe_opcode_t *insn) +{ + return is_swbp_insn(insn); +} + +static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); - memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE); + memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); + kunmap_atomic(kaddr); +} + +static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) +{ + void *kaddr = kmap_atomic(page); + memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len); kunmap_atomic(kaddr); } @@ -185,7 +215,16 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t uprobe_opcode_t old_opcode; bool is_swbp; - copy_opcode(page, vaddr, &old_opcode); + /* + * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here. + * We do not check if it is any other 'trap variant' which could + * be conditional trap instruction such as the one powerpc supports. + * + * The logic is that we do not care if the underlying instruction + * is a trap variant; uprobes always wins over any other (gdb) + * breakpoint. + */ + copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); if (is_swbp_insn(new_opcode)) { @@ -204,7 +243,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. 
If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction - * supported by that architecture then we need to modify is_swbp_at_addr and + * supported by that architecture then we need to modify is_trap_at_addr and * write_opcode accordingly. This would never be a problem for archs that * have fixed length instructions. */ @@ -225,7 +264,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; - void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; int ret; @@ -246,15 +284,8 @@ retry: __SetPageUptodate(new_page); - /* copy the page now that we've got it stable */ - vaddr_old = kmap_atomic(old_page); - vaddr_new = kmap_atomic(new_page); - - memcpy(vaddr_new, vaddr_old, PAGE_SIZE); - memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE); - - kunmap_atomic(vaddr_new); - kunmap_atomic(vaddr_old); + copy_highpage(new_page, old_page); + copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = anon_vma_prepare(vma); if (ret) @@ -477,30 +508,18 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, unsigned long nbytes, loff_t offset) { struct page *page; - void *vaddr; - unsigned long off; - pgoff_t idx; - - if (!filp) - return -EINVAL; if (!mapping->a_ops->readpage) return -EIO; - - idx = offset >> PAGE_CACHE_SHIFT; - off = offset & ~PAGE_MASK; - /* * Ensure that the page that has the original instruction is * populated and in page-cache. 
*/ - page = read_mapping_page(mapping, idx, filp); + page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); if (IS_ERR(page)) return PTR_ERR(page); - vaddr = kmap_atomic(page); - memcpy(insn, vaddr + off, nbytes); - kunmap_atomic(vaddr); + copy_from_page(page, offset, insn, nbytes); page_cache_release(page); return 0; @@ -550,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, goto out; ret = -ENOTSUPP; - if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) + if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); @@ -758,7 +777,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) down_write(&mm->mmap_sem); vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || - vma->vm_file->f_mapping->host != uprobe->inode) + file_inode(vma->vm_file) != uprobe->inode) goto unlock; if (vma->vm_start > info->vaddr || @@ -828,6 +847,10 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * struct uprobe *uprobe; int ret; + /* Uprobe must have at least one set consumer */ + if (!uc->handler && !uc->ret_handler) + return -EINVAL; + /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return -EINVAL; @@ -917,7 +940,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) loff_t offset; if (!valid_vma(vma, false) || - vma->vm_file->f_mapping->host != uprobe->inode) + file_inode(vma->vm_file) != uprobe->inode) continue; offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; @@ -1010,7 +1033,7 @@ int uprobe_mmap(struct vm_area_struct *vma) if (no_uprobe_events() || !valid_vma(vma, true)) return 0; - inode = vma->vm_file->f_mapping->host; + inode = file_inode(vma->vm_file); if (!inode) return 0; @@ -1041,7 +1064,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e struct inode *inode; struct rb_node *n; - inode = 
vma->vm_file->f_mapping->host; + inode = file_inode(vma->vm_file); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; @@ -1114,6 +1137,7 @@ static struct xol_area *get_xol_area(void) { struct mm_struct *mm = current->mm; struct xol_area *area; + uprobe_opcode_t insn = UPROBE_SWBP_INSN; area = mm->uprobes_state.xol_area; if (area) @@ -1131,7 +1155,12 @@ static struct xol_area *get_xol_area(void) if (!area->page) goto free_bitmap; + /* allocate first slot of task's xol_area for the return probes */ + set_bit(0, area->bitmap); + copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); + atomic_set(&area->slot_count, 1); init_waitqueue_head(&area->wq); + if (!xol_add_vma(area)) return area; @@ -1216,9 +1245,7 @@ static unsigned long xol_take_insn_slot(struct xol_area *area) static unsigned long xol_get_insn_slot(struct uprobe *uprobe) { struct xol_area *area; - unsigned long offset; unsigned long xol_vaddr; - void *vaddr; area = get_xol_area(); if (!area) @@ -1229,10 +1256,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) return 0; /* Initialize the slot */ - offset = xol_vaddr & ~PAGE_MASK; - vaddr = kmap_atomic(area->page); - memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); - kunmap_atomic(vaddr); + copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); /* * We probably need flush_icache_user_range() but it needs vma. * This should work on supported architectures too. 
@@ -1298,6 +1322,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; + struct return_instance *ri, *tmp; if (!utask) return; @@ -1305,6 +1330,15 @@ void uprobe_free_utask(struct task_struct *t) if (utask->active_uprobe) put_uprobe(utask->active_uprobe); + ri = utask->return_instances; + while (ri) { + tmp = ri; + ri = ri->next; + + put_uprobe(tmp->uprobe); + kfree(tmp); + } + xol_free_insn_slot(t); kfree(utask); t->utask = NULL; @@ -1333,6 +1367,93 @@ static struct uprobe_task *get_utask(void) return current->utask; } +/* + * Current area->vaddr notion assume the trampoline address is always + * equal area->vaddr. + * + * Returns -1 in case the xol_area is not allocated. + */ +static unsigned long get_trampoline_vaddr(void) +{ + struct xol_area *area; + unsigned long trampoline_vaddr = -1; + + area = current->mm->uprobes_state.xol_area; + smp_read_barrier_depends(); + if (area) + trampoline_vaddr = area->vaddr; + + return trampoline_vaddr; +} + +static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) +{ + struct return_instance *ri; + struct uprobe_task *utask; + unsigned long orig_ret_vaddr, trampoline_vaddr; + bool chained = false; + + if (!get_xol_area()) + return; + + utask = get_utask(); + if (!utask) + return; + + if (utask->depth >= MAX_URETPROBE_DEPTH) { + printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" + " nestedness limit pid/tgid=%d/%d\n", + current->pid, current->tgid); + return; + } + + ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); + if (!ri) + goto fail; + + trampoline_vaddr = get_trampoline_vaddr(); + orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); + if (orig_ret_vaddr == -1) + goto fail; + + /* + * We don't want to keep trampoline address in stack, rather keep the + * original return address of first caller thru all the consequent + * instances. 
This also makes breakpoint unwrapping easier. + */ + if (orig_ret_vaddr == trampoline_vaddr) { + if (!utask->return_instances) { + /* + * This situation is not possible. Likely we have an + * attack from user-space. + */ + pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", + current->pid, current->tgid); + goto fail; + } + + chained = true; + orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; + } + + atomic_inc(&uprobe->ref); + ri->uprobe = uprobe; + ri->func = instruction_pointer(regs); + ri->orig_ret_vaddr = orig_ret_vaddr; + ri->chained = chained; + + utask->depth++; + + /* add instance to the stack */ + ri->next = utask->return_instances; + utask->return_instances = ri; + + return; + + fail: + kfree(ri); +} + /* Prepare to single-step probed instruction out of line. */ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) @@ -1431,7 +1552,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) clear_bit(MMF_HAS_UPROBES, &mm->flags); } -static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) +static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) { struct page *page; uprobe_opcode_t opcode; @@ -1449,10 +1570,11 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) if (result < 0) return result; - copy_opcode(page, vaddr, &opcode); + copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: - return is_swbp_insn(&opcode); + /* This needs to return true for any variant of the trap insn */ + return is_trap_insn(&opcode); } static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) @@ -1465,14 +1587,14 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) vma = find_vma(mm, bp_vaddr); if (vma && vma->vm_start <= bp_vaddr) { if (valid_vma(vma, false)) { - struct inode *inode = vma->vm_file->f_mapping->host; + struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, 
bp_vaddr); uprobe = find_uprobe(inode, offset); } if (!uprobe) - *is_swbp = is_swbp_at_addr(mm, bp_vaddr); + *is_swbp = is_trap_at_addr(mm, bp_vaddr); } else { *is_swbp = -EFAULT; } @@ -1488,16 +1610,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; int remove = UPROBE_HANDLER_REMOVE; + bool need_prep = false; /* prepare return uprobe, when needed */ down_read(&uprobe->register_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) { - int rc = uc->handler(uc, regs); + int rc = 0; + + if (uc->handler) { + rc = uc->handler(uc, regs); + WARN(rc & ~UPROBE_HANDLER_MASK, + "bad rc=0x%x from %pf()\n", rc, uc->handler); + } + + if (uc->ret_handler) + need_prep = true; - WARN(rc & ~UPROBE_HANDLER_MASK, - "bad rc=0x%x from %pf()\n", rc, uc->handler); remove &= rc; } + if (need_prep && !remove) + prepare_uretprobe(uprobe, regs); /* put bp at return */ + if (remove && uprobe->consumers) { WARN_ON(!uprobe_is_active(uprobe)); unapply_uprobe(uprobe, current->mm); @@ -1505,6 +1638,64 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) up_read(&uprobe->register_rwsem); } +static void +handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) +{ + struct uprobe *uprobe = ri->uprobe; + struct uprobe_consumer *uc; + + down_read(&uprobe->register_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + if (uc->ret_handler) + uc->ret_handler(uc, ri->func, regs); + } + up_read(&uprobe->register_rwsem); +} + +static bool handle_trampoline(struct pt_regs *regs) +{ + struct uprobe_task *utask; + struct return_instance *ri, *tmp; + bool chained; + + utask = current->utask; + if (!utask) + return false; + + ri = utask->return_instances; + if (!ri) + return false; + + /* + * TODO: we should throw out return_instance's invalidated by + * longjmp(), currently we assume that the probed function always + * returns. 
+ */ + instruction_pointer_set(regs, ri->orig_ret_vaddr); + + for (;;) { + handle_uretprobe_chain(ri, regs); + + chained = ri->chained; + put_uprobe(ri->uprobe); + + tmp = ri; + ri = ri->next; + kfree(tmp); + + if (!chained) + break; + + utask->depth--; + + BUG_ON(!ri); + } + + utask->return_instances = ri; + + return true; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -1516,8 +1707,15 @@ static void handle_swbp(struct pt_regs *regs) int uninitialized_var(is_swbp); bp_vaddr = uprobe_get_swbp_addr(regs); - uprobe = find_active_uprobe(bp_vaddr, &is_swbp); + if (bp_vaddr == get_trampoline_vaddr()) { + if (handle_trampoline(regs)) + return; + + pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n", + current->pid, current->tgid); + } + uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ @@ -1616,7 +1814,11 @@ void uprobe_notify_resume(struct pt_regs *regs) */ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { - if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags)) + if (!current->mm) + return 0; + + if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) && + (!current->utask || !current->utask->return_instances)) return 0; set_thread_flag(TIF_UPROBE); diff --git a/kernel/exit.c b/kernel/exit.c index 60bc027..af2eb3c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -847,7 +847,7 @@ void do_exit(long code) exit_io_context(tsk); if (tsk->splice_pipe) - __free_pipe_info(tsk->splice_pipe); + free_pipe_info(tsk->splice_pipe); if (tsk->task_frag.page) put_page(tsk->task_frag.page); @@ -1629,9 +1629,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, } put_pid(pid); - - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(5, ret, which, upid, infop, options, ru); return ret; } @@ -1669,8 +1666,6 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, ret = do_wait(&wo); 
put_pid(pid); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(4, ret, upid, stat_addr, options, ru); return ret; } diff --git a/kernel/extable.c b/kernel/extable.c index fe35a63..67460b9 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -41,10 +41,10 @@ u32 __initdata main_extable_sort_needed = 1; /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) { - if (main_extable_sort_needed) + if (main_extable_sort_needed) { + pr_notice("Sorting __ex_table...\n"); sort_extable(__start___ex_table, __stop___ex_table); - else - pr_notice("__ex_table already sorted, skipping sort\n"); + } } /* Given an address, look for it in the exception tables. */ diff --git a/kernel/fork.c b/kernel/fork.c index 1766d32..987b28a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -70,6 +70,7 @@ #include <linux/khugepaged.h> #include <linux/signalfd.h> #include <linux/uprobes.h> +#include <linux/aio.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -1233,7 +1234,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN @@ -1303,6 +1304,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif +#ifdef CONFIG_BCACHE + p->sequential_io = 0; + p->sequential_io_avg = 0; +#endif /* Perform scheduler related setup. Assign this task to a CPU. 
*/ sched_fork(p); @@ -1677,10 +1682,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int, tls_val) #endif { - long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); - asmlinkage_protect(5, ret, clone_flags, newsp, - parent_tidptr, child_tidptr, tls_val); - return ret; + return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); } #endif diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 14be27f..fd4b13b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -84,6 +84,12 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get_boottime, .resolution = KTIME_LOW_RES, }, + { + .index = HRTIMER_BASE_TAI, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + .resolution = KTIME_LOW_RES, + }, } }; @@ -91,6 +97,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, + [CLOCK_TAI] = HRTIMER_BASE_TAI, }; static inline int hrtimer_clockid_to_base(clockid_t clock_id) @@ -107,8 +114,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, mono, boot; struct timespec xts, tom, slp; + s32 tai_offset; get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); + tai_offset = timekeeping_get_tai_offset(); xtim = timespec_to_ktime(xts); mono = ktime_add(xtim, timespec_to_ktime(tom)); @@ -116,6 +125,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; + base->clock_base[HRTIMER_BASE_TAI].softirq_time = + ktime_add(xtim, ktime_set(tai_offset, 0)); } /* @@ -161,7 +172,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, */ static int hrtimer_get_target(int this_cpu, int pinned) { -#ifdef CONFIG_NO_HZ 
+#ifdef CONFIG_NO_HZ_COMMON if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) return get_nohz_timer_target(); #endif @@ -276,6 +287,10 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) } else { unsigned long rem = do_div(nsec, NSEC_PER_SEC); + /* Make sure nsec fits into long */ + if (unlikely(nsec > KTIME_SEC_MAX)) + return (ktime_t){ .tv64 = KTIME_MAX }; + tmp = ktime_set((long)nsec, rem); } @@ -652,8 +667,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets(offs_real, offs_boot); + return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); } /* @@ -1011,7 +1027,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) * * Returns: * 0 on success @@ -1028,7 +1045,8 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); * hrtimer_start - (re)start an hrtimer on the current CPU * @timer: the timer to be added * @tim: expiry time - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) * * Returns: * 0 on success @@ -1107,7 +1125,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) } EXPORT_SYMBOL_GPL(hrtimer_get_remaining); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /** * hrtimer_get_next_event - get the time until next expiry event * @@ -1310,6 +1328,8 @@ retry: expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + if (expires.tv64 < 0) + expires.tv64 = KTIME_MAX; if (expires.tv64 < 
expires_next.tv64) expires_next = expires; break; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 96f3a1d..5a83dde 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -462,9 +462,23 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, if (domain->ops->map) { ret = domain->ops->map(domain, virq, hwirq); if (ret != 0) { - pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", - virq, hwirq, ret); - WARN_ON(1); + /* + * If map() returns -EPERM, this interrupt is protected + * by the firmware or some other service and shall not + * be mapped. + * + * Since on some platforms we blindly try to map everything + * we end up with a log full of backtraces. + * + * So instead, we silently fail on -EPERM, it is the + * responsibility of the PIC driver to display a relevant + * message if needed. + */ + if (ret != -EPERM) { + pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", + virq, hwirq, ret); + WARN_ON(1); + } irq_data->domain = NULL; irq_data->hwirq = 0; goto err_unmap; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 397db02..19ed5c4 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v) static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) { - unsigned int irq = (int)(long)PDE(file_inode(file))->data; + unsigned int irq = (int)(long)PDE_DATA(file_inode(file)); cpumask_var_t new_value; int err; @@ -131,17 +131,17 @@ static ssize_t irq_affinity_list_proc_write(struct file *file, static int irq_affinity_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_proc_show, PDE(inode)->data); + return single_open(file, irq_affinity_proc_show, PDE_DATA(inode)); } static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data); + return 
single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode)); } static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); + return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode)); } static const struct file_operations irq_affinity_proc_fops = { @@ -212,7 +212,7 @@ out: static int default_affinity_open(struct inode *inode, struct file *file) { - return single_open(file, default_affinity_show, PDE(inode)->data); + return single_open(file, default_affinity_show, PDE_DATA(inode)); } static const struct file_operations default_affinity_proc_fops = { @@ -233,7 +233,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v) static int irq_node_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_node_proc_show, PDE(inode)->data); + return single_open(file, irq_node_proc_show, PDE_DATA(inode)); } static const struct file_operations irq_node_proc_fops = { @@ -256,7 +256,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) static int irq_spurious_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_spurious_proc_show, PDE(inode)->data); + return single_open(file, irq_spurious_proc_show, PDE_DATA(inode)); } static const struct file_operations irq_spurious_proc_fops = { @@ -366,11 +366,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) void unregister_handler_proc(unsigned int irq, struct irqaction *action) { - if (action->dir) { - struct irq_desc *desc = irq_to_desc(irq); - - remove_proc_entry(action->dir->name, desc->dir); - } + proc_remove(action->dir); } static void register_default_affinity_proc(void) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 2169fee..3127ad5 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -84,9 +84,11 @@ static int is_ksym_addr(unsigned long addr) /* * Expand a compressed symbol data into the resulting uncompressed 
string, + * if uncompressed string is too long (>= maxlen), it will be truncated, * given the offset to where the symbol is in the compressed stream. */ -static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) +static unsigned int kallsyms_expand_symbol(unsigned int off, + char *result, size_t maxlen) { int len, skipped_first = 0; const u8 *tptr, *data; @@ -113,15 +115,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) while (*tptr) { if (skipped_first) { + if (maxlen <= 1) + goto tail; *result = *tptr; result++; + maxlen--; } else skipped_first = 1; tptr++; } } - *result = '\0'; +tail: + if (maxlen) + *result = '\0'; /* Return to offset to the next symbol. */ return off; @@ -176,7 +183,7 @@ unsigned long kallsyms_lookup_name(const char *name) unsigned int off; for (i = 0, off = 0; i < kallsyms_num_syms; i++) { - off = kallsyms_expand_symbol(off, namebuf); + off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (strcmp(namebuf, name) == 0) return kallsyms_addresses[i]; @@ -195,7 +202,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, int ret; for (i = 0, off = 0; i < kallsyms_num_syms; i++) { - off = kallsyms_expand_symbol(off, namebuf); + off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); if (ret != 0) return ret; @@ -294,7 +301,8 @@ const char *kallsyms_lookup(unsigned long addr, pos = get_symbol_pos(addr, symbolsize, offset); /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), namebuf); + kallsyms_expand_symbol(get_symbol_offset(pos), + namebuf, KSYM_NAME_LEN); if (modname) *modname = NULL; return namebuf; @@ -315,7 +323,8 @@ int lookup_symbol_name(unsigned long addr, char *symname) pos = get_symbol_pos(addr, NULL, NULL); /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), symname); + kallsyms_expand_symbol(get_symbol_offset(pos), + symname, KSYM_NAME_LEN); return 0; } /* See 
if it's in a module. */ @@ -333,7 +342,8 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, pos = get_symbol_pos(addr, size, offset); /* Grab name */ - kallsyms_expand_symbol(get_symbol_offset(pos), name); + kallsyms_expand_symbol(get_symbol_offset(pos), + name, KSYM_NAME_LEN); modname[0] = '\0'; return 0; } @@ -463,7 +473,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) iter->type = kallsyms_get_symbol_type(off); - off = kallsyms_expand_symbol(off, iter->name); + off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name)); return off - iter->nameoff; } diff --git a/kernel/kexec.c b/kernel/kexec.c index b574920..59f7b55 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -786,7 +786,7 @@ static int kimage_load_normal_segment(struct kimage *image, struct kexec_segment *segment) { unsigned long maddr; - unsigned long ubytes, mbytes; + size_t ubytes, mbytes; int result; unsigned char __user *buf; @@ -819,13 +819,9 @@ static int kimage_load_normal_segment(struct kimage *image, /* Start with a clear page */ clear_page(ptr); ptr += maddr & ~PAGE_MASK; - mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); - if (mchunk > mbytes) - mchunk = mbytes; - - uchunk = mchunk; - if (uchunk > ubytes) - uchunk = ubytes; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); result = copy_from_user(ptr, buf, uchunk); kunmap(page); @@ -850,7 +846,7 @@ static int kimage_load_crash_segment(struct kimage *image, * We do things a page at a time for the sake of kmap. 
*/ unsigned long maddr; - unsigned long ubytes, mbytes; + size_t ubytes, mbytes; int result; unsigned char __user *buf; @@ -871,13 +867,10 @@ static int kimage_load_crash_segment(struct kimage *image, } ptr = kmap(page); ptr += maddr & ~PAGE_MASK; - mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); - if (mchunk > mbytes) - mchunk = mbytes; - - uchunk = mchunk; - if (uchunk > ubytes) { - uchunk = ubytes; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + if (mchunk > uchunk) { /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } @@ -1540,14 +1533,13 @@ void vmcoreinfo_append_str(const char *fmt, ...) { va_list args; char buf[0x50]; - int r; + size_t r; va_start(args, fmt); r = vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - if (r + vmcoreinfo_size > vmcoreinfo_max_size) - r = vmcoreinfo_max_size - vmcoreinfo_size; + r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); diff --git a/kernel/kmod.c b/kernel/kmod.c index 56dd349..1296e72 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -77,6 +77,7 @@ static void free_modprobe_argv(struct subprocess_info *info) static int call_modprobe(char *module_name, int wait) { + struct subprocess_info *info; static char *envp[] = { "HOME=/", "TERM=linux", @@ -98,8 +99,15 @@ static int call_modprobe(char *module_name, int wait) argv[3] = module_name; /* check free_modprobe_argv() */ argv[4] = NULL; - return call_usermodehelper_fns(modprobe_path, argv, envp, - wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL); + info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL, + NULL, free_modprobe_argv, NULL); + if (!info) + goto free_module_name; + + return call_usermodehelper_exec(info, wait | UMH_KILLABLE); + +free_module_name: + kfree(module_name); free_argv: kfree(argv); out: @@ -502,14 +510,28 @@ static void helper_unlock(void) * @argv: arg vector for process * @envp: environment for 
process * @gfp_mask: gfp mask for memory allocation + * @cleanup: a cleanup function + * @init: an init function + * @data: arbitrary context sensitive data * * Returns either %NULL on allocation failure, or a subprocess_info * structure. This should be passed to call_usermodehelper_exec to * exec the process and free the structure. + * + * The init function is used to customize the helper process prior to + * exec. A non-zero return code causes the process to error out, exit, + * and return the failure to the calling process + * + * The cleanup function is just before ethe subprocess_info is about to + * be freed. This can be used for freeing the argv and envp. The + * Function must be runnable in either a process context or the + * context in which call_usermodehelper_exec is called. */ -static struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, - char **envp, gfp_t gfp_mask) + char **envp, gfp_t gfp_mask, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *info), + void *data) { struct subprocess_info *sub_info; sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); @@ -520,50 +542,27 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, sub_info->path = path; sub_info->argv = argv; sub_info->envp = envp; + + sub_info->cleanup = cleanup; + sub_info->init = init; + sub_info->data = data; out: return sub_info; } - -/** - * call_usermodehelper_setfns - set a cleanup/init function - * @info: a subprocess_info returned by call_usermodehelper_setup - * @cleanup: a cleanup function - * @init: an init function - * @data: arbitrary context sensitive data - * - * The init function is used to customize the helper process prior to - * exec. A non-zero return code causes the process to error out, exit, - * and return the failure to the calling process - * - * The cleanup function is just before ethe subprocess_info is about to - * be freed. 
This can be used for freeing the argv and envp. The - * Function must be runnable in either a process context or the - * context in which call_usermodehelper_exec is called. - */ -static -void call_usermodehelper_setfns(struct subprocess_info *info, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *info), - void *data) -{ - info->cleanup = cleanup; - info->init = init; - info->data = data; -} +EXPORT_SYMBOL(call_usermodehelper_setup); /** * call_usermodehelper_exec - start a usermode application * @sub_info: information about the subprocessa * @wait: wait for the application to finish and return status. - * when -1 don't wait at all, but you get no useful error back when - * the program couldn't be exec'ed. This makes it safe to call + * when UMH_NO_WAIT don't wait at all, but you get no useful error back + * when the program couldn't be exec'ed. This makes it safe to call * from interrupt context. * * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of keventd. * (ie. it runs with full root capabilities). */ -static int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { DECLARE_COMPLETION_ONSTACK(done); @@ -615,31 +614,34 @@ unlock: helper_unlock(); return retval; } +EXPORT_SYMBOL(call_usermodehelper_exec); -/* - * call_usermodehelper_fns() will not run the caller-provided cleanup function - * if a memory allocation failure is experienced. So the caller might need to - * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform - * the necessaary cleanup within the caller. +/** + * call_usermodehelper() - prepare and start a usermode application + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @wait: wait for the application to finish and return status. 
+ * when UMH_NO_WAIT don't wait at all, but you get no useful error back + * when the program couldn't be exec'ed. This makes it safe to call + * from interrupt context. + * + * This function is the equivalent to use call_usermodehelper_setup() and + * call_usermodehelper_exec(). */ -int call_usermodehelper_fns( - char *path, char **argv, char **envp, int wait, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *), void *data) +int call_usermodehelper(char *path, char **argv, char **envp, int wait) { struct subprocess_info *info; gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; - info = call_usermodehelper_setup(path, argv, envp, gfp_mask); - + info = call_usermodehelper_setup(path, argv, envp, gfp_mask, + NULL, NULL, NULL); if (info == NULL) return -ENOMEM; - call_usermodehelper_setfns(info, init, cleanup, data); - return call_usermodehelper_exec(info, wait); } -EXPORT_SYMBOL(call_usermodehelper_fns); +EXPORT_SYMBOL(call_usermodehelper); static int proc_cap_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) diff --git a/kernel/kthread.c b/kernel/kthread.c index 16d8ddd..760e86d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/freezer.h> #include <linux/ptrace.h> +#include <linux/uaccess.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -135,6 +136,24 @@ void *kthread_data(struct task_struct *task) return to_kthread(task)->data; } +/** + * probe_kthread_data - speculative version of kthread_data() + * @task: possible kthread task in question + * + * @task could be a kthread task. Return the data value specified when it + * was created if accessible. If @task isn't a kthread task or its data is + * inaccessible for any reason, %NULL is returned. This function requires + * that @task itself is safe to dereference. 
+ */ +void *probe_kthread_data(struct task_struct *task) +{ + struct kthread *kthread = to_kthread(task); + void *data = NULL; + + probe_kernel_read(&data, &kthread->data, sizeof(data)); + return data; +} + static void __kthread_parkme(struct kthread *self) { __set_current_state(TASK_PARKED); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 6a3bccb..1f3186b 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2998,6 +2998,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, EXPORT_SYMBOL_GPL(lockdep_init_map); struct lock_class_key __lockdep_no_validate__; +EXPORT_SYMBOL_GPL(__lockdep_no_validate__); static int print_lock_nested_lock_not_held(struct task_struct *curr, diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S index 246b4c6..4a9a86d 100644 --- a/kernel/modsign_certificate.S +++ b/kernel/modsign_certificate.S @@ -1,15 +1,8 @@ -/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */ -#ifndef SYMBOL_PREFIX -#define ASM_SYMBOL(sym) sym -#else -#define PASTE2(x,y) x##y -#define PASTE(x,y) PASTE2(x,y) -#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym) -#endif +#include <linux/export.h> #define GLOBAL(name) \ - .globl ASM_SYMBOL(name); \ - ASM_SYMBOL(name): + .globl VMLINUX_SYMBOL(name); \ + VMLINUX_SYMBOL(name): .section ".init.data","aw" diff --git a/kernel/module.c b/kernel/module.c index 0925c9a..b049939 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1209,10 +1209,11 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, /* Since this should be found in kernel (which can't be removed), * no locking is necessary. 
*/ - if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, + if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL, &crc, true, false)) BUG(); - return check_version(sechdrs, versindex, "module_layout", mod, crc, + return check_version(sechdrs, versindex, + VMLINUX_SYMBOL_STR(module_layout), mod, crc, NULL); } @@ -1861,12 +1862,12 @@ static void free_module(struct module *mod) { trace_module_free(mod); - /* Delete from various lists */ - mutex_lock(&module_mutex); - stop_machine(__unlink_module, mod, NULL); - mutex_unlock(&module_mutex); mod_sysfs_teardown(mod); + /* We leave it in list to prevent duplicate loads, but make sure + * that noone uses it while it's being deconstructed. */ + mod->state = MODULE_STATE_UNFORMED; + /* Remove dynamic debug info */ ddebug_remove_module(mod->name); @@ -1879,6 +1880,11 @@ static void free_module(struct module *mod) /* Free any allocated parameters. */ destroy_params(mod->kp, mod->num_kp); + /* Now we can delete it from the lists */ + mutex_lock(&module_mutex); + stop_machine(__unlink_module, mod, NULL); + mutex_unlock(&module_mutex); + /* This may be NULL, but that's OK */ unset_module_init_ro_nx(mod); module_free(mod, mod->module_init); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index afc0456..364ceab 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -22,7 +22,7 @@ #include <linux/pid_namespace.h> #include <net/net_namespace.h> #include <linux/ipc_namespace.h> -#include <linux/proc_fs.h> +#include <linux/proc_ns.h> #include <linux/file.h> #include <linux/syscalls.h> @@ -241,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) const struct proc_ns_operations *ops; struct task_struct *tsk = current; struct nsproxy *new_nsproxy; - struct proc_inode *ei; + struct proc_ns *ei; struct file *file; int err; @@ -250,7 +250,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) return PTR_ERR(file); err = -EINVAL; - ei = PROC_I(file_inode(file)); + ei = get_proc_ns(file_inode(file)); ops = ei->ns_ops; if (nstype && 
(ops->type != nstype)) goto out; diff --git a/kernel/panic.c b/kernel/panic.c index 7c57cc9..167ec097 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -22,7 +22,6 @@ #include <linux/sysrq.h> #include <linux/init.h> #include <linux/nmi.h> -#include <linux/dmi.h> #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -400,13 +399,8 @@ struct slowpath_args { static void warn_slowpath_common(const char *file, int line, void *caller, unsigned taint, struct slowpath_args *args) { - const char *board; - printk(KERN_WARNING "------------[ cut here ]------------\n"); printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (board) - printk(KERN_WARNING "Hardware name: %s\n", board); if (args) vprintk(args->fmt, args->args); diff --git a/kernel/params.c b/kernel/params.c index ed35345..53b958f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -613,10 +613,13 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), GFP_KERNEL); if (!new) { - kfree(mk->mp); + kfree(attrs); err = -ENOMEM; goto fail; } + /* Despite looking like the typical realloc() bug, this is safe. + * We *want* the old 'attrs' to be freed either way, and we'll store + * the new one in the success case. 
*/ attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL); if (!attrs) { err = -ENOMEM; diff --git a/kernel/pid.c b/kernel/pid.c index 047dc62..0db3e79 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,6 +36,7 @@ #include <linux/pid_namespace.h> #include <linux/init_task.h> #include <linux/syscalls.h> +#include <linux/proc_ns.h> #include <linux/proc_fs.h> #define pid_hashfn(nr, ns) \ @@ -51,9 +52,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -#define BITS_PER_PAGE (PAGE_SIZE*8) -#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) - static inline int mk_pid(struct pid_namespace *pid_ns, struct pidmap *map, int off) { @@ -183,15 +181,19 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) break; } if (likely(atomic_read(&map->nr_free))) { - do { + for ( ; ; ) { if (!test_and_set_bit(offset, map->page)) { atomic_dec(&map->nr_free); set_last_pid(pid_ns, last, pid); return pid; } offset = find_next_offset(map, offset); + if (offset >= BITS_PER_PAGE) + break; pid = mk_pid(pid_ns, map, offset); - } while (offset < BITS_PER_PAGE && pid < pid_max); + if (pid >= pid_max) + break; + } } if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { ++map; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index bea15bd..6917e8e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -15,12 +15,10 @@ #include <linux/err.h> #include <linux/acct.h> #include <linux/slab.h> -#include <linux/proc_fs.h> +#include <linux/proc_ns.h> #include <linux/reboot.h> #include <linux/export.h> -#define BITS_PER_PAGE (PAGE_SIZE*8) - struct pid_cache { int nr_ids; char name[16]; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 8fd709c..42670e9 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -10,6 +10,8 @@ #include <linux/kernel_stat.h> #include <trace/events/timer.h> #include <linux/random.h> +#include <linux/tick.h> +#include <linux/workqueue.h> /* * Called 
after updating RLIMIT_CPU to run cpu timer and update @@ -153,6 +155,21 @@ static void bump_cpu_timer(struct k_itimer *timer, } } +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) + return 1; + return 0; +} + static inline cputime_t prof_ticks(struct task_struct *p) { cputime_t utime, stime; @@ -636,6 +653,37 @@ static int cpu_timer_sample_group(const clockid_t which_clock, return 0; } +#ifdef CONFIG_NO_HZ_FULL +static void nohz_kick_work_fn(struct work_struct *work) +{ + tick_nohz_full_kick_all(); +} + +static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); + +/* + * We need the IPIs to be sent from sane process context. + * The posix cpu timers are always set with irqs disabled. + */ +static void posix_cpu_timer_kick_nohz(void) +{ + schedule_work(&nohz_kick_work); +} + +bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) +{ + if (!task_cputime_zero(&tsk->cputime_expires)) + return false; + + if (tsk->signal->cputimer.running) + return false; + + return true; +} +#else +static inline void posix_cpu_timer_kick_nohz(void) { } +#endif + /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. @@ -794,6 +842,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, sample_to_timespec(timer->it_clock, old_incr, &old->it_interval); } + if (!ret) + posix_cpu_timer_kick_nohz(); return ret; } @@ -1008,21 +1058,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, } } -/** - * task_cputime_zero - Check a task_cputime struct for all zero fields. - * - * @cputime: The struct to compare. 
- * - * Checks @cputime to see if all fields are zero. Returns true if all fields - * are zero, false if any field is nonzero. - */ -static inline int task_cputime_zero(const struct task_cputime *cputime) -{ - if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) - return 1; - return 0; -} - /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1336,6 +1371,13 @@ void run_posix_cpu_timers(struct task_struct *tsk) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } + + /* + * In case some timers were rescheduled after the queue got emptied, + * wake up full dynticks CPUs. + */ + if (tsk->signal->cputimer.running) + posix_cpu_timer_kick_nohz(); } /* @@ -1366,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } if (!*newval) - return; + goto out; *newval += now.cpu; } @@ -1384,6 +1426,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, tsk->signal->cputime_expires.virt_exp = *newval; break; } +out: + posix_cpu_timer_kick_nohz(); } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 6edbb2c..424c2d4 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -40,38 +40,31 @@ #include <linux/list.h> #include <linux/init.h> #include <linux/compiler.h> -#include <linux/idr.h> +#include <linux/hash.h> #include <linux/posix-clock.h> #include <linux/posix-timers.h> #include <linux/syscalls.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <linux/export.h> +#include <linux/hashtable.h> /* - * Management arrays for POSIX timers. Timers are kept in slab memory - * Timer ids are allocated by an external routine that keeps track of the - * id and the timer. 
The external interface is: - * - * void *idr_find(struct idr *idp, int id); to find timer_id <id> - * int idr_get_new(struct idr *idp, void *ptr); to get a new id and - * related it to <ptr> - * void idr_remove(struct idr *idp, int id); to release <id> - * void idr_init(struct idr *idp); to initialize <idp> - * which we supply. - * The idr_get_new *may* call slab for more memory so it must not be - * called under a spin lock. Likewise idr_remore may release memory - * (but it may be ok to do this under a lock...). - * idr_find is just a memory look up and is quite fast. A -1 return - * indicates that the requested id does not exist. + * Management arrays for POSIX timers. Timers are now kept in static hash table + * with 512 entries. + * Timer ids are allocated by local routine, which selects proper hash head by + * key, constructed from current->signal address and per signal struct counter. + * This keeps timer ids unique per process, but now they can intersect between + * processes. */ /* * Lets keep our timers in a slab cache :-) */ static struct kmem_cache *posix_timers_cache; -static struct idr posix_timers_id; -static DEFINE_SPINLOCK(idr_lock); + +static DEFINE_HASHTABLE(posix_timers_hashtable, 9); +static DEFINE_SPINLOCK(hash_lock); /* * we assume that the new SIGEV_THREAD_ID shares no bits with the other @@ -152,6 +145,56 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); __timr; \ }) +static int hash(struct signal_struct *sig, unsigned int nr) +{ + return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); +} + +static struct k_itimer *__posix_timers_find(struct hlist_head *head, + struct signal_struct *sig, + timer_t id) +{ + struct k_itimer *timer; + + hlist_for_each_entry_rcu(timer, head, t_hash) { + if ((timer->it_signal == sig) && (timer->it_id == id)) + return timer; + } + return NULL; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct 
hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + + return __posix_timers_find(head, sig, id); +} + +static int posix_timer_add(struct k_itimer *timer) +{ + struct signal_struct *sig = current->signal; + int first_free_id = sig->posix_timer_id; + struct hlist_head *head; + int ret = -ENOENT; + + do { + spin_lock(&hash_lock); + head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)]; + if (!__posix_timers_find(head, sig, sig->posix_timer_id)) { + hlist_add_head_rcu(&timer->t_hash, head); + ret = sig->posix_timer_id; + } + if (++sig->posix_timer_id < 0) + sig->posix_timer_id = 0; + if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT)) + /* Loop over all possible ids completed */ + ret = -EAGAIN; + spin_unlock(&hash_lock); + } while (ret == -ENOENT); + return ret; +} + static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) { spin_unlock_irqrestore(&timr->it_lock, flags); @@ -221,6 +264,11 @@ static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) return 0; } +static int posix_get_tai(clockid_t which_clock, struct timespec *tp) +{ + timekeeping_clocktai(tp); + return 0; +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) @@ -261,6 +309,16 @@ static __init int init_posix_timers(void) .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, }; + struct k_clock clock_tai = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; struct k_clock clock_boottime = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_boottime, @@ -278,11 +336,11 @@ static __init int init_posix_timers(void) posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, 
&clock_monotonic_coarse); posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); + posix_timers_register_clock(CLOCK_TAI, &clock_tai); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, NULL); - idr_init(&posix_timers_id); return 0; } @@ -504,9 +562,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) { if (it_id_set) { unsigned long flags; - spin_lock_irqsave(&idr_lock, flags); - idr_remove(&posix_timers_id, tmr->it_id); - spin_unlock_irqrestore(&idr_lock, flags); + spin_lock_irqsave(&hash_lock, flags); + hlist_del_rcu(&tmr->t_hash); + spin_unlock_irqrestore(&hash_lock, flags); } put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); @@ -552,22 +610,11 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, return -EAGAIN; spin_lock_init(&new_timer->it_lock); - - idr_preload(GFP_KERNEL); - spin_lock_irq(&idr_lock); - error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT); - spin_unlock_irq(&idr_lock); - idr_preload_end(); - if (error < 0) { - /* - * Weird looking, but we return EAGAIN if the IDR is - * full (proper POSIX return value for this) - */ - if (error == -ENOSPC) - error = -EAGAIN; + new_timer_id = posix_timer_add(new_timer); + if (new_timer_id < 0) { + error = new_timer_id; goto out; } - new_timer_id = error; it_id_set = IT_ID_SET; new_timer->it_id = (timer_t) new_timer_id; @@ -645,7 +692,7 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) return NULL; rcu_read_lock(); - timr = idr_find(&posix_timers_id, (int)timer_id); + timr = posix_timer_by_id(timer_id); if (timr) { spin_lock_irqsave(&timr->it_lock, *flags); if (timr->it_signal == current->signal) { diff --git a/kernel/power/console.c b/kernel/power/console.c index b1dc456..463aa673 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -4,6 +4,7 @@ * Originally from swsusp. 
*/ +#include <linux/console.h> #include <linux/vt_kern.h> #include <linux/kbd_kern.h> #include <linux/vt.h> @@ -14,8 +15,120 @@ static int orig_fgconsole, orig_kmsg; +static DEFINE_MUTEX(vt_switch_mutex); + +struct pm_vt_switch { + struct list_head head; + struct device *dev; + bool required; +}; + +static LIST_HEAD(pm_vt_switch_list); + + +/** + * pm_vt_switch_required - indicate VT switch at suspend requirements + * @dev: device + * @required: if true, caller needs VT switch at suspend/resume time + * + * The different console drivers may or may not require VT switches across + * suspend/resume, depending on how they handle restoring video state and + * what may be running. + * + * Drivers can indicate support for switchless suspend/resume, which can + * save time and flicker, by using this routine and passing 'false' as + * the argument. If any loaded driver needs VT switching, or the + * no_console_suspend argument has been passed on the command line, VT + * switches will occur. + */ +void pm_vt_switch_required(struct device *dev, bool required) +{ + struct pm_vt_switch *entry, *tmp; + + mutex_lock(&vt_switch_mutex); + list_for_each_entry(tmp, &pm_vt_switch_list, head) { + if (tmp->dev == dev) { + /* already registered, update requirement */ + tmp->required = required; + goto out; + } + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out; + + entry->required = required; + entry->dev = dev; + + list_add(&entry->head, &pm_vt_switch_list); +out: + mutex_unlock(&vt_switch_mutex); +} +EXPORT_SYMBOL(pm_vt_switch_required); + +/** + * pm_vt_switch_unregister - stop tracking a device's VT switching needs + * @dev: device + * + * Remove @dev from the vt switch list. 
+ */ +void pm_vt_switch_unregister(struct device *dev) +{ + struct pm_vt_switch *tmp; + + mutex_lock(&vt_switch_mutex); + list_for_each_entry(tmp, &pm_vt_switch_list, head) { + if (tmp->dev == dev) { + list_del(&tmp->head); + break; + } + } + mutex_unlock(&vt_switch_mutex); +} +EXPORT_SYMBOL(pm_vt_switch_unregister); + +/* + * There are three cases when a VT switch on suspend/resume are required: + * 1) no driver has indicated a requirement one way or another, so preserve + * the old behavior + * 2) console suspend is disabled, we want to see debug messages across + * suspend/resume + * 3) any registered driver indicates it needs a VT switch + * + * If none of these conditions is present, meaning we have at least one driver + * that doesn't need the switch, and none that do, we can avoid it to make + * resume look a little prettier (and suspend too, but that's usually hidden, + * e.g. when closing the lid on a laptop). + */ +static bool pm_vt_switch(void) +{ + struct pm_vt_switch *entry; + bool ret = true; + + mutex_lock(&vt_switch_mutex); + if (list_empty(&pm_vt_switch_list)) + goto out; + + if (!console_suspend_enabled) + goto out; + + list_for_each_entry(entry, &pm_vt_switch_list, head) { + if (entry->required) + goto out; + } + + ret = false; +out: + mutex_unlock(&vt_switch_mutex); + return ret; +} + int pm_prepare_console(void) { + if (!pm_vt_switch()) + return 0; + orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); if (orig_fgconsole < 0) return 1; @@ -26,6 +139,9 @@ int pm_prepare_console(void) void pm_restore_console(void) { + if (!pm_vt_switch()) + return; + if (orig_fgconsole >= 0) { vt_move_to_console(orig_fgconsole, 0); vt_kmsg_redirect(orig_kmsg); diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 68197a4..7ef6866 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -32,7 +32,7 @@ static void handle_poweroff(int key) static struct sysrq_key_op sysrq_poweroff_op = { .handler = handle_poweroff, - .help_msg = 
"powerOff", + .help_msg = "poweroff(o)", .action_msg = "Power Off", .enable_mask = SYSRQ_ENABLE_BOOT, }; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index d4feda0..bef86d1 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -76,8 +76,20 @@ EXPORT_SYMBOL_GPL(suspend_set_ops); bool valid_state(suspend_state_t state) { - if (state == PM_SUSPEND_FREEZE) - return true; + if (state == PM_SUSPEND_FREEZE) { +#ifdef CONFIG_PM_DEBUG + if (pm_test_level != TEST_NONE && + pm_test_level != TEST_FREEZER && + pm_test_level != TEST_DEVICES && + pm_test_level != TEST_PLATFORM) { + printk(KERN_WARNING "Unsupported pm_test mode for " + "freeze state, please choose " + "none/freezer/devices/platform.\n"); + return false; + } +#endif + return true; + } /* * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel * support and need to be valid to the lowlevel @@ -184,6 +196,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_wake; } + if (suspend_test(TEST_PLATFORM)) + goto Platform_wake; + /* * PM_SUSPEND_FREEZE equals * frozen processes + suspended devices + idle processors. 
@@ -195,9 +210,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_wake; } - if (suspend_test(TEST_PLATFORM)) - goto Platform_wake; - error = disable_nonboot_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; diff --git a/kernel/printk.c b/kernel/printk.c index 376914e..fa36e14 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -32,6 +32,7 @@ #include <linux/security.h> #include <linux/bootmem.h> #include <linux/memblock.h> +#include <linux/aio.h> #include <linux/syscalls.h> #include <linux/kexec.h> #include <linux/kdb.h> @@ -43,6 +44,7 @@ #include <linux/rculist.h> #include <linux/poll.h> #include <linux/irq_work.h> +#include <linux/utsname.h> #include <asm/uaccess.h> @@ -2849,4 +2851,65 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper) raw_spin_unlock_irqrestore(&logbuf_lock, flags); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + +static char dump_stack_arch_desc_str[128]; + +/** + * dump_stack_set_arch_desc - set arch-specific str to show with task dumps + * @fmt: printf-style format string + * @...: arguments for the format string + * + * The configured string will be printed right after utsname during task + * dumps. Usually used to add arch-specific system identifiers. If an + * arch wants to make use of such an ID string, it should initialize this + * as soon as possible during boot. + */ +void __init dump_stack_set_arch_desc(const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str), + fmt, args); + va_end(args); +} + +/** + * dump_stack_print_info - print generic debug info for dump_stack() + * @log_lvl: log level + * + * Arch-specific dump_stack() implementations can use this function to + * print out the same debug information as the generic dump_stack(). 
+ */ +void dump_stack_print_info(const char *log_lvl) +{ + printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n", + log_lvl, raw_smp_processor_id(), current->pid, current->comm, + print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + + if (dump_stack_arch_desc_str[0] != '\0') + printk("%sHardware name: %s\n", + log_lvl, dump_stack_arch_desc_str); + + print_worker_info(log_lvl, current); +} + +/** + * show_regs_print_info - print generic debug info for show_regs() + * @log_lvl: log level + * + * show_regs() implementations can use this function to print out generic + * debug information. + */ +void show_regs_print_info(const char *log_lvl) +{ + dump_stack_print_info(log_lvl); + + printk("%stask: %p ti: %p task.ti: %p\n", + log_lvl, current, current_thread_info(), + task_thread_info(current)); +} + #endif diff --git a/kernel/profile.c b/kernel/profile.c index dc3384e..0bf4007 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -462,10 +462,10 @@ static const struct file_operations prof_cpu_mask_proc_fops = { .write = prof_cpu_mask_proc_write, }; -void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) +void create_prof_cpu_mask(void) { /* create /proc/irq/prof_cpu_mask */ - proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops); + proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_fops); } /* @@ -600,7 +600,7 @@ int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ NULL, &proc_profile_operations); if (!entry) return 0; - entry->size = (1+prof_len) * sizeof(atomic_t); + proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); hotcpu_notifier(profile_cpu_callback, 0); return 0; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index acbd284..aed981a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -17,6 +17,7 @@ #include <linux/ptrace.h> #include <linux/security.h> #include <linux/signal.h> +#include <linux/uio.h> #include 
<linux/audit.h> #include <linux/pid_namespace.h> #include <linux/syscalls.h> @@ -24,6 +25,7 @@ #include <linux/regset.h> #include <linux/hw_breakpoint.h> #include <linux/cn_proc.h> +#include <linux/compat.h> static int ptrace_trapping_sleep_fn(void *flags) @@ -618,6 +620,81 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) return error; } +static int ptrace_peek_siginfo(struct task_struct *child, + unsigned long addr, + unsigned long data) +{ + struct ptrace_peeksiginfo_args arg; + struct sigpending *pending; + struct sigqueue *q; + int ret, i; + + ret = copy_from_user(&arg, (void __user *) addr, + sizeof(struct ptrace_peeksiginfo_args)); + if (ret) + return -EFAULT; + + if (arg.flags & ~PTRACE_PEEKSIGINFO_SHARED) + return -EINVAL; /* unknown flags */ + + if (arg.nr < 0) + return -EINVAL; + + if (arg.flags & PTRACE_PEEKSIGINFO_SHARED) + pending = &child->signal->shared_pending; + else + pending = &child->pending; + + for (i = 0; i < arg.nr; ) { + siginfo_t info; + s32 off = arg.off + i; + + spin_lock_irq(&child->sighand->siglock); + list_for_each_entry(q, &pending->list, list) { + if (!off--) { + copy_siginfo(&info, &q->info); + break; + } + } + spin_unlock_irq(&child->sighand->siglock); + + if (off >= 0) /* beyond the end of the list */ + break; + +#ifdef CONFIG_COMPAT + if (unlikely(is_compat_task())) { + compat_siginfo_t __user *uinfo = compat_ptr(data); + + ret = copy_siginfo_to_user32(uinfo, &info); + ret |= __put_user(info.si_code, &uinfo->si_code); + } else +#endif + { + siginfo_t __user *uinfo = (siginfo_t __user *) data; + + ret = copy_siginfo_to_user(uinfo, &info); + ret |= __put_user(info.si_code, &uinfo->si_code); + } + + if (ret) { + ret = -EFAULT; + break; + } + + data += sizeof(siginfo_t); + i++; + + if (signal_pending(current)) + break; + + cond_resched(); + } + + if (i > 0) + return i; + + return ret; +} #ifdef PTRACE_SINGLESTEP #define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) @@ -748,6 +825,10 @@ int 
ptrace_request(struct task_struct *child, long request, ret = put_user(child->ptrace_message, datalp); break; + case PTRACE_PEEKSIGINFO: + ret = ptrace_peek_siginfo(child, addr, data); + break; + case PTRACE_GETSIGINFO: ret = ptrace_getsiginfo(child, &siginfo); if (!ret) diff --git a/kernel/range.c b/kernel/range.c index 9b8ae2d..071b0ab 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -97,7 +97,8 @@ void subtract_range(struct range *range, int az, u64 start, u64 end) range[i].end = range[j].end; range[i].start = end; } else { - printk(KERN_ERR "run of slot in ranges\n"); + pr_err("%s: run out of slot in ranges\n", + __func__); } range[j].end = start; continue; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 2f8530b..16ea679 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -64,7 +64,7 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; -#define RCU_STATE_INITIALIZER(sname, cr) { \ +#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ .level = { &sname##_state.node[0] }, \ .call = cr, \ .fqs_state = RCU_GP_IDLE, \ @@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ .name = #sname, \ + .abbr = sabbr, \ } struct rcu_state rcu_sched_state = - RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); + RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; @@ -223,6 +224,8 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); +static void 
rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(int cpu); @@ -310,6 +313,8 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) if (rcu_gp_in_progress(rsp)) return 0; /* No, a grace period is already in progress. */ + if (rcu_nocb_needs_gp(rsp)) + return 1; /* Yes, a no-CBs CPU needs one. */ if (!rdp->nxttail[RCU_NEXT_TAIL]) return 0; /* No, this is a no-CBs (or offline) CPU. */ if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) @@ -794,6 +799,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) rdp->offline_fqs++; return 1; } + + /* + * There is a possibility that a CPU in adaptive-ticks state + * might run in the kernel with the scheduling-clock tick disabled + * for an extended time period. Invoke rcu_kick_nohz_cpu() to + * force the CPU to restart the scheduling-clock tick if this + * CPU is in this state. + */ + rcu_kick_nohz_cpu(rdp->cpu); + return 0; } @@ -1035,10 +1050,11 @@ static void init_callback_list(struct rcu_data *rdp) { int i; + if (init_nocb_callback_list(rdp)) + return; rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; - init_nocb_callback_list(rdp); } /* @@ -1071,6 +1087,120 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, } /* + * Trace-event helper function for rcu_start_future_gp() and + * rcu_nocb_wait_gp(). + */ +static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c, char *s) +{ + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, + rnp->completed, c, rnp->level, + rnp->grplo, rnp->grphi, s); +} + +/* + * Start some future grace period, as needed to handle newly arrived + * callbacks. The required future grace periods are recorded in each + * rcu_node structure's ->need_future_gp field. 
+ * + * The caller must hold the specified rcu_node structure's ->lock. + */ +static unsigned long __maybe_unused +rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) +{ + unsigned long c; + int i; + struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + + /* + * Pick up grace-period number for new callbacks. If this + * grace period is already marked as needed, return to the caller. + */ + c = rcu_cbs_completed(rdp->rsp, rnp); + trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); + if (rnp->need_future_gp[c & 0x1]) { + trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); + return c; + } + + /* + * If either this rcu_node structure or the root rcu_node structure + * believe that a grace period is in progress, then we must wait + * for the one following, which is in "c". Because our request + * will be noticed at the end of the current grace period, we don't + * need to explicitly start one. + */ + if (rnp->gpnum != rnp->completed || + ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { + rnp->need_future_gp[c & 0x1]++; + trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); + return c; + } + + /* + * There might be no grace period in progress. If we don't already + * hold it, acquire the root rcu_node structure's lock in order to + * start one (if needed). + */ + if (rnp != rnp_root) + raw_spin_lock(&rnp_root->lock); + + /* + * Get a new grace-period number. If there really is no grace + * period in progress, it will be smaller than the one we obtained + * earlier. Adjust callbacks as needed. Note that even no-CBs + * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. + */ + c = rcu_cbs_completed(rdp->rsp, rnp_root); + for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) + if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) + rdp->nxtcompleted[i] = c; + + /* + * If the need for the required grace period is already + * recorded, trace and leave. 
+ */ + if (rnp_root->need_future_gp[c & 0x1]) { + trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); + goto unlock_out; + } + + /* Record the need for the future grace period. */ + rnp_root->need_future_gp[c & 0x1]++; + + /* If a grace period is not already in progress, start one. */ + if (rnp_root->gpnum != rnp_root->completed) { + trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); + } else { + trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); + rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); + } +unlock_out: + if (rnp != rnp_root) + raw_spin_unlock(&rnp_root->lock); + return c; +} + +/* + * Clean up any old requests for the just-ended grace period. Also return + * whether any additional grace periods have been requested. Also invoke + * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads + * waiting for this grace period to complete. + */ +static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +{ + int c = rnp->completed; + int needmore; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + + rcu_nocb_gp_cleanup(rsp, rnp); + rnp->need_future_gp[c & 0x1] = 0; + needmore = rnp->need_future_gp[(c + 1) & 0x1]; + trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); + return needmore; +} + +/* * If there is room, assign a ->completed number to any callbacks on * this CPU that have not already been assigned. Also accelerate any * callbacks that were previously assigned a ->completed number that has @@ -1129,6 +1259,8 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; rdp->nxtcompleted[i] = c; } + /* Record any needed additional grace periods. */ + rcu_start_future_gp(rnp, rdp); /* Trace depending on how much we were able to accelerate. 
*/ if (!*rdp->nxttail[RCU_WAIT_TAIL]) @@ -1308,9 +1440,9 @@ static int rcu_gp_init(struct rcu_state *rsp) rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; - rnp->gpnum = rsp->gpnum; + ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; WARN_ON_ONCE(rnp->completed != rsp->completed); - rnp->completed = rsp->completed; + ACCESS_ONCE(rnp->completed) = rsp->completed; if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); @@ -1319,7 +1451,8 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); #ifdef CONFIG_PROVE_RCU_DELAY - if ((prandom_u32() % (rcu_num_nodes * 8)) == 0) + if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 && + system_state == SYSTEM_RUNNING) schedule_timeout_uninterruptible(2); #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ cond_resched(); @@ -1361,6 +1494,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; + int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1390,17 +1524,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); - rnp->completed = rsp->gpnum; + ACCESS_ONCE(rnp->completed) = rsp->gpnum; + rdp = this_cpu_ptr(rsp->rda); + if (rnp == rdp->mynode) + __rcu_process_gp_end(rsp, rnp, rdp); + nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched(); } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); + rcu_nocb_gp_set(rnp, nocb); rsp->completed = rsp->gpnum; /* Declare grace period done. */ trace_rcu_grace_period(rsp->name, rsp->completed, "end"); rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); + rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. 
*/ if (cpu_needs_another_gp(rsp, rdp)) rsp->gp_flags = 1; raw_spin_unlock_irq(&rnp->lock); @@ -1476,57 +1616,62 @@ static int __noreturn rcu_gp_kthread(void *arg) /* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. The caller must hold - * the root node's ->lock, which is released before return. Hard irqs must - * be disabled. + * the root node's ->lock and hard irqs must be disabled. * * Note that it is legal for a dying CPU (which is marked as offline) to * invoke this function. This can happen when the dying CPU reports its * quiescent state. */ static void -rcu_start_gp(struct rcu_state *rsp, unsigned long flags) - __releases(rcu_get_root(rsp)->lock) +rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) { - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_node *rnp = rcu_get_root(rsp); - - if (!rsp->gp_kthread || - !cpu_needs_another_gp(rsp, rdp)) { + if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period * task, this CPU does not need another grace period, * or a grace period is already in progress. * Either way, don't start a new grace period. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - - /* - * Because there is no grace period in progress right now, - * any callbacks we have up to this point will be satisfied - * by the next grace period. So this is a good place to - * assign a grace period number to recently posted callbacks. - */ - rcu_accelerate_cbs(rsp, rnp, rdp); - rsp->gp_flags = RCU_GP_FLAG_INIT; - raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ - - /* Ensure that CPU is aware of completion of last grace period. */ - rcu_process_gp_end(rsp, rdp); - local_irq_restore(flags); /* Wake up rcu_gp_kthread() to start the grace period. 
*/ wake_up(&rsp->gp_wq); } /* + * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's + * callbacks. Note that rcu_start_gp_advanced() cannot do this because it + * is invoked indirectly from rcu_advance_cbs(), which would result in + * endless recursion -- or would do so if it wasn't for the self-deadlock + * that is encountered beforehand. + */ +static void +rcu_start_gp(struct rcu_state *rsp) +{ + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_node *rnp = rcu_get_root(rsp); + + /* + * If there is no grace period in progress right now, any + * callbacks we have up to this point will be satisfied by the + * next grace period. Also, advancing the callbacks reduces the + * probability of false positives from cpu_needs_another_gp() + * resulting in pointless grace periods. So, advance callbacks + * then start the grace period! + */ + rcu_advance_cbs(rsp, rnp, rdp); + rcu_start_gp_advanced(rsp, rnp, rdp); +} + +/* * Report a full set of quiescent states to the specified rcu_state * data structure. This involves cleaning up after the prior grace * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, as - * required by rcu_start_gp(), which will release it. + * if one is needed. Note that the caller must hold rnp->lock, which + * is released before return. */ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) @@ -1685,7 +1830,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { /* No-CBs CPUs do not have orphanable callbacks. */ - if (is_nocb_cpu(rdp->cpu)) + if (rcu_is_nocb_cpu(rdp->cpu)) return; /* @@ -2124,7 +2269,8 @@ __rcu_process_callbacks(struct rcu_state *rsp) local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. 
*/ - rcu_start_gp(rsp, flags); /* releases above lock */ + rcu_start_gp(rsp); + raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); } else { local_irq_restore(flags); } @@ -2169,7 +2315,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) static void invoke_rcu_core(void) { - raise_softirq(RCU_SOFTIRQ); + if (cpu_online(smp_processor_id())) + raise_softirq(RCU_SOFTIRQ); } /* @@ -2204,11 +2351,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { - unsigned long nestflag; struct rcu_node *rnp_root = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp_root->lock, nestflag); - rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ + raw_spin_lock(&rnp_root->lock); + rcu_start_gp(rsp); + raw_spin_unlock(&rnp_root->lock); } else { /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; @@ -2628,19 +2775,27 @@ static int rcu_pending(int cpu) } /* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. + * Return true if the specified CPU has any callback. If all_lazy is + * non-NULL, store an indication of whether all callbacks are lazy. + * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int rcu_cpu_has_callbacks(int cpu) +static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) { + bool al = true; + bool hc = false; + struct rcu_data *rdp; struct rcu_state *rsp; - /* RCU callbacks either ready or pending? 
*/ - for_each_rcu_flavor(rsp) - if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) - return 1; - return 0; + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->qlen != rdp->qlen_lazy) + al = false; + if (rdp->nxtlist) + hc = true; + } + if (all_lazy) + *all_lazy = al; + return hc; } /* @@ -2747,10 +2902,10 @@ static void _rcu_barrier(struct rcu_state *rsp) * corresponding CPU's preceding callbacks have been invoked. */ for_each_possible_cpu(cpu) { - if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) + if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) continue; rdp = per_cpu_ptr(rsp->rda, cpu); - if (is_nocb_cpu(cpu)) { + if (rcu_is_nocb_cpu(cpu)) { _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, rsp->n_barrier_done); atomic_inc(&rsp->barrier_cpu_count); @@ -2859,7 +3014,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); - rcu_prepare_for_idle_init(cpu); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* Add CPU to rcu_node bitmasks. */ @@ -2909,7 +3063,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; - int ret = NOTIFY_OK; trace_rcu_utilization("Start CPU hotplug"); switch (action) { @@ -2923,21 +3076,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, rcu_boost_kthread_setaffinity(rnp, -1); break; case CPU_DOWN_PREPARE: - if (nocb_cpu_expendable(cpu)) - rcu_boost_kthread_setaffinity(rnp, cpu); - else - ret = NOTIFY_BAD; + rcu_boost_kthread_setaffinity(rnp, cpu); break; case CPU_DYING: case CPU_DYING_FROZEN: - /* - * The whole machine is "stopped" except this CPU, so we can - * touch any data without introducing corruption. We send the - * dying CPU's callbacks to an arbitrarily chosen online CPU. 
- */ for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); - rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: @@ -2950,7 +3094,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, break; } trace_rcu_utilization("End CPU hotplug"); - return ret; + return NOTIFY_OK; } /* @@ -3085,6 +3229,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); + rcu_init_one_nocb(rnp); } } @@ -3170,8 +3315,7 @@ void __init rcu_init(void) rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); - rcu_init_nocb(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcutree.h b/kernel/rcutree.h index c896b50..da77a8f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -88,18 +88,13 @@ struct rcu_dynticks { int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ #ifdef CONFIG_RCU_FAST_NO_HZ - int dyntick_drain; /* Prepare-for-idle state variable. */ - unsigned long dyntick_holdoff; - /* No retries for the jiffy of failure. */ - struct timer_list idle_gp_timer; - /* Wake up CPU sleeping with callbacks. */ - unsigned long idle_gp_timer_expires; - /* When to wake up CPU (for repost). */ - bool idle_first_pass; /* First pass of attempt to go idle? */ + bool all_lazy; /* Are all CPU's CBs lazy? */ unsigned long nonlazy_posted; /* # times non-lazy CBs posted to CPU. */ unsigned long nonlazy_posted_snap; /* idle-period nonlazy_posted snapshot. */ + unsigned long last_accelerate; + /* Last jiffy CBs were accelerated. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. 
*/ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; @@ -134,9 +129,6 @@ struct rcu_node { /* elements that need to drain to allow the */ /* current expedited grace period to */ /* complete (only for TREE_PREEMPT_RCU). */ - atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */ - /* Since this has meaning only for leaf */ - /* rcu_node structures, 32 bits suffices. */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask & expmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ @@ -196,6 +188,12 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ #endif /* #ifdef CONFIG_RCU_BOOST */ +#ifdef CONFIG_RCU_NOCB_CPU + wait_queue_head_t nocb_gp_wq[2]; + /* Place for rcu_nocb_kthread() to wait GP. */ +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + int need_future_gp[2]; + /* Counts of upcoming no-CB GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; } ____cacheline_internodealigned_in_smp; @@ -328,6 +326,11 @@ struct rcu_data { struct task_struct *nocb_kthread; #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + /* 8) RCU CPU stall data. */ +#ifdef CONFIG_RCU_CPU_STALL_INFO + unsigned int softirq_snap; /* Snapshot of softirq activity. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + int cpu; struct rcu_state *rsp; }; @@ -375,12 +378,6 @@ struct rcu_state { struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ void (*func)(struct rcu_head *head)); -#ifdef CONFIG_RCU_NOCB_CPU - void (*call_remote)(struct rcu_head *head, - void (*func)(struct rcu_head *head)); - /* call_rcu() flavor, but for */ - /* placing on remote CPU. */ -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* The following fields are guarded by the root rcu_node's lock. */ @@ -443,6 +440,7 @@ struct rcu_state { unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ char *name; /* Name of structure. 
*/ + char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. */ }; @@ -520,7 +518,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ static void __cpuinit rcu_prepare_kthreads(int cpu); -static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); static void rcu_idle_count_callbacks_posted(void); @@ -529,16 +526,18 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); -static bool is_nocb_cpu(int cpu); +static int rcu_nocb_needs_gp(struct rcu_state *rsp); +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); +static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy); static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, struct rcu_data *rdp); -static bool nocb_cpu_expendable(int cpu); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); -static void init_nocb_callback_list(struct rcu_data *rdp); -static void __init rcu_init_nocb(void); +static void rcu_kick_nohz_cpu(int cpu); +static bool init_nocb_callback_list(struct rcu_data *rdp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c1cc7e1..170814d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,6 +28,7 @@ #include <linux/gfp.h> #include <linux/oom.h> #include <linux/smpboot.h> +#include <linux/tick.h> #define RCU_KTHREAD_PRIO 1 @@ -85,11 +86,21 @@ static void __init rcu_bootup_announce_oddness(void) if (nr_cpu_ids != NR_CPUS) printk(KERN_INFO 
"\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_NOCB_CPU +#ifndef CONFIG_RCU_NOCB_CPU_NONE + if (!have_rcu_nocb_mask) { + alloc_bootmem_cpumask_var(&rcu_nocb_mask); + have_rcu_nocb_mask = true; + } +#ifdef CONFIG_RCU_NOCB_CPU_ZERO + pr_info("\tExperimental no-CBs CPU 0\n"); + cpumask_set_cpu(0, rcu_nocb_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ +#ifdef CONFIG_RCU_NOCB_CPU_ALL + pr_info("\tExperimental no-CBs for all CPUs\n"); + cpumask_setall(rcu_nocb_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ if (have_rcu_nocb_mask) { - if (cpumask_test_cpu(0, rcu_nocb_mask)) { - cpumask_clear_cpu(0, rcu_nocb_mask); - pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n"); - } cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); if (rcu_nocb_poll) @@ -101,7 +112,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_state rcu_preempt_state = - RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); + RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; @@ -1533,14 +1544,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; - return rcu_cpu_has_callbacks(cpu); -} - -/* - * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. - */ -static void rcu_prepare_for_idle_init(int cpu) -{ + return rcu_cpu_has_callbacks(cpu, NULL); } /* @@ -1577,16 +1581,6 @@ static void rcu_idle_count_callbacks_posted(void) * * The following three proprocessor symbols control this state machine: * - * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt - * to satisfy RCU. 
Beyond this point, it is better to incur a periodic - * scheduling-clock interrupt than to loop through the state machine - * at full power. - * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are - * optional if RCU does not need anything immediately from this - * CPU, even if this CPU still has RCU callbacks queued. The first - * times through the state machine are mandatory: we need to give - * the state machine a chance to communicate a quiescent state - * to the RCU core. * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted * to sleep in dyntick-idle mode with RCU callbacks pending. This * is sized to be roughly one RCU grace period. Those energy-efficiency @@ -1602,186 +1596,108 @@ static void rcu_idle_count_callbacks_posted(void) * adjustment, they can be converted into kernel config parameters, though * making the state machine smarter might be a better option. */ -#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ -#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ -extern int tick_nohz_enabled; - -/* - * Does the specified flavor of RCU have non-lazy callbacks pending on - * the specified CPU? Both RCU flavor and CPU are specified by the - * rcu_data structure. - */ -static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp) -{ - return rdp->qlen != rdp->qlen_lazy; -} +static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; +module_param(rcu_idle_gp_delay, int, 0644); +static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; +module_param(rcu_idle_lazy_gp_delay, int, 0644); -#ifdef CONFIG_TREE_PREEMPT_RCU +extern int tick_nohz_enabled; /* - * Are there non-lazy RCU-preempt callbacks? (There cannot be if there - * is no RCU-preempt in the kernel.) + * Try to advance callbacks for all flavors of RCU on the current CPU. 
+ * Afterwards, if there are any callbacks ready for immediate invocation, + * return true. */ -static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) +static bool rcu_try_advance_all_cbs(void) { - struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - - return __rcu_cpu_has_nonlazy_callbacks(rdp); -} - -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + bool cbs_ready = false; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; -static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) -{ - return 0; -} + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; -#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ + /* + * Don't bother checking unless a grace period has + * completed since we last checked and there are + * callbacks not yet ready to invoke. + */ + if (rdp->completed != rnp->completed && + rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) + rcu_process_gp_end(rsp, rdp); -/* - * Does any flavor of RCU have non-lazy callbacks on the specified CPU? - */ -static bool rcu_cpu_has_nonlazy_callbacks(int cpu) -{ - return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) || - __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) || - rcu_preempt_cpu_has_nonlazy_callbacks(cpu); + if (cpu_has_callbacks_ready_to_invoke(rdp)) + cbs_ready = true; + } + return cbs_ready; } /* - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no - * callbacks on this CPU, (2) this CPU has not yet attempted to enter - * dyntick-idle mode, or (3) this CPU is in the process of attempting to - * enter dyntick-idle mode. Otherwise, if we have recently tried and failed - * to enter dyntick-idle mode, we refuse to try to enter it. After all, - * it is better to incur scheduling-clock interrupts than to spin - * continuously for the same time duration! + * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready + * to invoke. If the CPU has callbacks, try to advance them. 
Tell the + * caller to set the timeout based on whether or not there are non-lazy + * callbacks. * - * The delta_jiffies argument is used to store the time when RCU is - * going to need the CPU again if it still has callbacks. The reason - * for this is that rcu_prepare_for_idle() might need to post a timer, - * but if so, it will do so after tick_nohz_stop_sched_tick() has set - * the wakeup time for this CPU. This means that RCU's timer can be - * delayed until the wakeup time, which defeats the purpose of posting - * a timer. + * The caller must have disabled interrupts. */ -int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +int rcu_needs_cpu(int cpu, unsigned long *dj) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - /* Flag a new idle sojourn to the idle-entry state machine. */ - rdtp->idle_first_pass = 1; + /* Snapshot to detect later posting of non-lazy callback. */ + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; + /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) { - *delta_jiffies = ULONG_MAX; + if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { + *dj = ULONG_MAX; return 0; } - if (rdtp->dyntick_holdoff == jiffies) { - /* RCU recently tried and failed, so don't try again. */ - *delta_jiffies = 1; + + /* Attempt to advance callbacks. */ + if (rcu_try_advance_all_cbs()) { + /* Some ready to invoke, so initiate later invocation. */ + invoke_rcu_core(); return 1; } - /* Set up for the possibility that RCU will post a timer. */ - if (rcu_cpu_has_nonlazy_callbacks(cpu)) { - *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, - RCU_IDLE_GP_DELAY) - jiffies; + rdtp->last_accelerate = jiffies; + + /* Request timer delay depending on laziness, and round. 
*/ + if (rdtp->all_lazy) { + *dj = round_up(rcu_idle_gp_delay + jiffies, + rcu_idle_gp_delay) - jiffies; } else { - *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; - *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies; + *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; } return 0; } /* - * Handler for smp_call_function_single(). The only point of this - * handler is to wake the CPU up, so the handler does only tracing. - */ -void rcu_idle_demigrate(void *unused) -{ - trace_rcu_prep_idle("Demigrate"); -} - -/* - * Timer handler used to force CPU to start pushing its remaining RCU - * callbacks in the case where it entered dyntick-idle mode with callbacks - * pending. The hander doesn't really need to do anything because the - * real work is done upon re-entry to idle, or by the next scheduling-clock - * interrupt should idle not be re-entered. - * - * One special case: the timer gets migrated without awakening the CPU - * on which the timer was scheduled on. In this case, we must wake up - * that CPU. We do so with smp_call_function_single(). - */ -static void rcu_idle_gp_timer_func(unsigned long cpu_in) -{ - int cpu = (int)cpu_in; - - trace_rcu_prep_idle("Timer"); - if (cpu != smp_processor_id()) - smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); - else - WARN_ON_ONCE(1); /* Getting here can hang the system... */ -} - -/* - * Initialize the timer used to pull CPUs out of dyntick-idle mode. - */ -static void rcu_prepare_for_idle_init(int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - rdtp->dyntick_holdoff = jiffies - 1; - setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); - rdtp->idle_gp_timer_expires = jiffies - 1; - rdtp->idle_first_pass = 1; -} - -/* - * Clean up for exit from idle. Because we are exiting from idle, there - * is no longer any point to ->idle_gp_timer, so cancel it. This will - * do nothing if this timer is not active, so just cancel it unconditionally. 
- */ -static void rcu_cleanup_after_idle(int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - del_timer(&rdtp->idle_gp_timer); - trace_rcu_prep_idle("Cleanup after idle"); - rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); -} - -/* - * Check to see if any RCU-related work can be done by the current CPU, - * and if so, schedule a softirq to get it done. This function is part - * of the RCU implementation; it is -not- an exported member of the RCU API. - * - * The idea is for the current CPU to clear out all work required by the - * RCU core for the current grace period, so that this CPU can be permitted - * to enter dyntick-idle mode. In some cases, it will need to be awakened - * at the end of the grace period by whatever CPU ends the grace period. - * This allows CPUs to go dyntick-idle more quickly, and to reduce the - * number of wakeups by a modest integer factor. - * - * Because it is not legal to invoke rcu_process_callbacks() with irqs - * disabled, we do one pass of force_quiescent_state(), then do a - * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked - * later. The ->dyntick_drain field controls the sequencing. + * Prepare a CPU for idle from an RCU perspective. The first major task + * is to sense whether nohz mode has been enabled or disabled via sysfs. + * The second major task is to check to see if a non-lazy callback has + * arrived at a CPU that previously had only lazy callbacks. The third + * major task is to accelerate (that is, assign grace-period numbers to) + * any recently arrived callbacks. * * The caller must have disabled interrupts. */ static void rcu_prepare_for_idle(int cpu) { - struct timer_list *tp; + struct rcu_data *rdp; struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct rcu_node *rnp; + struct rcu_state *rsp; int tne; /* Handle nohz enablement switches conservatively. 
*/ tne = ACCESS_ONCE(tick_nohz_enabled); if (tne != rdtp->tick_nohz_enabled_snap) { - if (rcu_cpu_has_callbacks(cpu)) + if (rcu_cpu_has_callbacks(cpu, NULL)) invoke_rcu_core(); /* force nohz to see update. */ rdtp->tick_nohz_enabled_snap = tne; return; @@ -1789,125 +1705,56 @@ static void rcu_prepare_for_idle(int cpu) if (!tne) return; - /* Adaptive-tick mode, where usermode execution is idle to RCU. */ - if (!is_idle_task(current)) { - rdtp->dyntick_holdoff = jiffies - 1; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) { - trace_rcu_prep_idle("User dyntick with callbacks"); - rdtp->idle_gp_timer_expires = - round_up(jiffies + RCU_IDLE_GP_DELAY, - RCU_IDLE_GP_DELAY); - } else if (rcu_cpu_has_callbacks(cpu)) { - rdtp->idle_gp_timer_expires = - round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); - trace_rcu_prep_idle("User dyntick with lazy callbacks"); - } else { - return; - } - tp = &rdtp->idle_gp_timer; - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); + /* If this is a no-CBs CPU, no callbacks, just return. */ + if (rcu_is_nocb_cpu(cpu)) return; - } /* - * If this is an idle re-entry, for example, due to use of - * RCU_NONIDLE() or the new idle-loop tracing API within the idle - * loop, then don't take any state-machine actions, unless the - * momentary exit from idle queued additional non-lazy callbacks. - * Instead, repost the ->idle_gp_timer if this CPU has callbacks - * pending. + * If a non-lazy callback arrived at a CPU having only lazy + * callbacks, invoke RCU core for the side-effect of recalculating + * idle duration on re-entry to idle. 
*/ - if (!rdtp->idle_first_pass && - (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { - if (rcu_cpu_has_callbacks(cpu)) { - tp = &rdtp->idle_gp_timer; - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); - } + if (rdtp->all_lazy && + rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { + invoke_rcu_core(); return; } - rdtp->idle_first_pass = 0; - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; /* - * If there are no callbacks on this CPU, enter dyntick-idle mode. - * Also reset state to avoid prejudicing later attempts. + * If we have not yet accelerated this jiffy, accelerate all + * callbacks on this CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) { - rdtp->dyntick_holdoff = jiffies - 1; - rdtp->dyntick_drain = 0; - trace_rcu_prep_idle("No callbacks"); + if (rdtp->last_accelerate == jiffies) return; + rdtp->last_accelerate = jiffies; + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (!*rdp->nxttail[RCU_DONE_TAIL]) + continue; + rnp = rdp->mynode; + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } +} - /* - * If in holdoff mode, just return. We will presumably have - * refrained from disabling the scheduling-clock tick. - */ - if (rdtp->dyntick_holdoff == jiffies) { - trace_rcu_prep_idle("In holdoff"); - return; - } +/* + * Clean up for exit from idle. Attempt to advance callbacks based on + * any grace periods that elapsed while the CPU was idle, and if any + * callbacks are now ready to invoke, initiate invocation. + */ +static void rcu_cleanup_after_idle(int cpu) +{ + struct rcu_data *rdp; + struct rcu_state *rsp; - /* Check and update the ->dyntick_drain sequencing. */ - if (rdtp->dyntick_drain <= 0) { - /* First time through, initialize the counter. 
*/ - rdtp->dyntick_drain = RCU_IDLE_FLUSHES; - } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && - !rcu_pending(cpu) && - !local_softirq_pending()) { - /* Can we go dyntick-idle despite still having callbacks? */ - rdtp->dyntick_drain = 0; - rdtp->dyntick_holdoff = jiffies; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) { - trace_rcu_prep_idle("Dyntick with callbacks"); - rdtp->idle_gp_timer_expires = - round_up(jiffies + RCU_IDLE_GP_DELAY, - RCU_IDLE_GP_DELAY); - } else { - rdtp->idle_gp_timer_expires = - round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); - trace_rcu_prep_idle("Dyntick with lazy callbacks"); - } - tp = &rdtp->idle_gp_timer; - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; - return; /* Nothing more to do immediately. */ - } else if (--(rdtp->dyntick_drain) <= 0) { - /* We have hit the limit, so time to give up. */ - rdtp->dyntick_holdoff = jiffies; - trace_rcu_prep_idle("Begin holdoff"); - invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ + if (rcu_is_nocb_cpu(cpu)) return; - } - - /* - * Do one step of pushing the remaining RCU callbacks through - * the RCU core state machine. - */ -#ifdef CONFIG_TREE_PREEMPT_RCU - if (per_cpu(rcu_preempt_data, cpu).nxtlist) { - rcu_preempt_qs(cpu); - force_quiescent_state(&rcu_preempt_state); - } -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - if (per_cpu(rcu_sched_data, cpu).nxtlist) { - rcu_sched_qs(cpu); - force_quiescent_state(&rcu_sched_state); - } - if (per_cpu(rcu_bh_data, cpu).nxtlist) { - rcu_bh_qs(cpu); - force_quiescent_state(&rcu_bh_state); - } - - /* - * If RCU callbacks are still pending, RCU still needs this CPU. - * So try forcing the callbacks through the grace period. 
- */ - if (rcu_cpu_has_callbacks(cpu)) { - trace_rcu_prep_idle("More callbacks"); - invoke_rcu_core(); - } else { - trace_rcu_prep_idle("Callbacks drained"); + rcu_try_advance_all_cbs(); + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (cpu_has_callbacks_ready_to_invoke(rdp)) + invoke_rcu_core(); } } @@ -2015,16 +1862,13 @@ early_initcall(rcu_register_oom_notifier); static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - struct timer_list *tltp = &rdtp->idle_gp_timer; - char c; + unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap; - c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; - if (timer_pending(tltp)) - sprintf(cp, "drain=%d %c timer=%lu", - rdtp->dyntick_drain, c, tltp->expires - jiffies); - else - sprintf(cp, "drain=%d %c timer not pending", - rdtp->dyntick_drain, c); + sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", + rdtp->last_accelerate & 0xffff, jiffies & 0xffff, + ulong2long(nlpd), + rdtp->all_lazy ? 'L' : '.', + rdtp->tick_nohz_enabled_snap ? '.' : 'D'); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ @@ -2070,10 +1914,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", + printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, + rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), fast_no_hz); } @@ -2087,6 +1932,7 @@ static void print_cpu_stall_info_end(void) static void zero_cpu_stall_ticks(struct rcu_data *rdp) { rdp->ticks_this_gp = 0; + rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); } /* Increment ->ticks_this_gp for all flavors of RCU. 
*/ @@ -2165,8 +2011,49 @@ static int __init parse_rcu_nocb_poll(char *arg) } early_param("rcu_nocb_poll", parse_rcu_nocb_poll); +/* + * Do any no-CBs CPUs need another grace period? + * + * Interrupts must be disabled. If the caller does not hold the root + * rnp_node structure's ->lock, the results are advisory only. + */ +static int rcu_nocb_needs_gp(struct rcu_state *rsp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + + return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; +} + +/* + * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended + * grace period. + */ +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +{ + wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); +} + +/* + * Set the root rcu_node structure's ->need_future_gp field + * based on the sum of those of all rcu_node structures. This does + * double-count the root rcu_node structure's requests, but this + * is necessary to handle the possibility of a rcu_nocb_kthread() + * having awakened during the time that the rcu_node structures + * were being updated for the end of the previous grace period. + */ +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) +{ + rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ + init_waitqueue_head(&rnp->nocb_gp_wq[0]); + init_waitqueue_head(&rnp->nocb_gp_wq[1]); +} + /* Is the specified CPU a no-CPUs CPU? 
*/ -static bool is_nocb_cpu(int cpu) +bool rcu_is_nocb_cpu(int cpu) { if (have_rcu_nocb_mask) return cpumask_test_cpu(cpu, rcu_nocb_mask); @@ -2224,9 +2111,16 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy) { - if (!is_nocb_cpu(rdp->cpu)) + if (!rcu_is_nocb_cpu(rdp->cpu)) return 0; __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); + if (__is_kfree_rcu_offset((unsigned long)rhp->func)) + trace_rcu_kfree_callback(rdp->rsp->name, rhp, + (unsigned long)rhp->func, + rdp->qlen_lazy, rdp->qlen); + else + trace_rcu_callback(rdp->rsp->name, rhp, + rdp->qlen_lazy, rdp->qlen); return 1; } @@ -2241,7 +2135,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, long qll = rsp->qlen_lazy; /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ - if (!is_nocb_cpu(smp_processor_id())) + if (!rcu_is_nocb_cpu(smp_processor_id())) return 0; rsp->qlen = 0; rsp->qlen_lazy = 0; @@ -2265,95 +2159,36 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, } /* - * There must be at least one non-no-CBs CPU in operation at any given - * time, because no-CBs CPUs are not capable of initiating grace periods - * independently. This function therefore complains if the specified - * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to - * avoid offlining the last such CPU. (Recursion is a wonderful thing, - * but you have to have a base case!) + * If necessary, kick off a new grace period, and either way wait + * for a subsequent grace period to complete. 
*/ -static bool nocb_cpu_expendable(int cpu) +static void rcu_nocb_wait_gp(struct rcu_data *rdp) { - cpumask_var_t non_nocb_cpus; - int ret; + unsigned long c; + bool d; + unsigned long flags; + struct rcu_node *rnp = rdp->mynode; + + raw_spin_lock_irqsave(&rnp->lock, flags); + c = rcu_start_future_gp(rnp, rdp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); /* - * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, - * then offlining this CPU is harmless. Let it happen. + * Wait for the grace period. Do so interruptibly to avoid messing + * up the load average. */ - if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) - return 1; - - /* If no memory, play it safe and keep the CPU around. */ - if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) - return 0; - cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); - cpumask_clear_cpu(cpu, non_nocb_cpus); - ret = !cpumask_empty(non_nocb_cpus); - free_cpumask_var(non_nocb_cpus); - return ret; -} - -/* - * Helper structure for remote registry of RCU callbacks. - * This is needed for when a no-CBs CPU needs to start a grace period. - * If it just invokes call_rcu(), the resulting callback will be queued, - * which can result in deadlock. - */ -struct rcu_head_remote { - struct rcu_head *rhp; - call_rcu_func_t *crf; - void (*func)(struct rcu_head *rhp); -}; - -/* - * Register a callback as specified by the rcu_head_remote struct. - * This function is intended to be invoked via smp_call_function_single(). - */ -static void call_rcu_local(void *arg) -{ - struct rcu_head_remote *rhrp = - container_of(arg, struct rcu_head_remote, rhp); - - rhrp->crf(rhrp->rhp, rhrp->func); -} - -/* - * Set up an rcu_head_remote structure and the invoke call_rcu_local() - * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via - * smp_call_function_single(). 
- */ -static void invoke_crf_remote(struct rcu_head *rhp, - void (*func)(struct rcu_head *rhp), - call_rcu_func_t crf) -{ - struct rcu_head_remote rhr; - - rhr.rhp = rhp; - rhr.crf = crf; - rhr.func = func; - smp_call_function_single(0, call_rcu_local, &rhr, 1); -} - -/* - * Helper functions to be passed to wait_rcu_gp(), each of which - * invokes invoke_crf_remote() to register a callback appropriately. - */ -static void __maybe_unused -call_rcu_preempt_remote(struct rcu_head *rhp, - void (*func)(struct rcu_head *rhp)) -{ - invoke_crf_remote(rhp, func, call_rcu); -} -static void call_rcu_bh_remote(struct rcu_head *rhp, - void (*func)(struct rcu_head *rhp)) -{ - invoke_crf_remote(rhp, func, call_rcu_bh); -} -static void call_rcu_sched_remote(struct rcu_head *rhp, - void (*func)(struct rcu_head *rhp)) -{ - invoke_crf_remote(rhp, func, call_rcu_sched); + trace_rcu_future_gp(rnp, rdp, c, "StartWait"); + for (;;) { + wait_event_interruptible( + rnp->nocb_gp_wq[c & 0x1], + (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); + if (likely(d)) + break; + flush_signals(current); + trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); + } + trace_rcu_future_gp(rnp, rdp, c, "EndWait"); + smp_mb(); /* Ensure that CB invocation happens after GP end. */ } /* @@ -2390,7 +2225,7 @@ static int rcu_nocb_kthread(void *arg) cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); ACCESS_ONCE(rdp->nocb_p_count) += c; ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; - wait_rcu_gp(rdp->rsp->call_remote); + rcu_nocb_wait_gp(rdp); /* Each pass through the following loop invokes a callback. 
*/ trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); @@ -2436,36 +2271,40 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) return; for_each_cpu(cpu, rcu_nocb_mask) { rdp = per_cpu_ptr(rsp->rda, cpu); - t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); + t = kthread_run(rcu_nocb_kthread, rdp, + "rcuo%c/%d", rsp->abbr, cpu); BUG_ON(IS_ERR(t)); ACCESS_ONCE(rdp->nocb_kthread) = t; } } /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ -static void init_nocb_callback_list(struct rcu_data *rdp) +static bool init_nocb_callback_list(struct rcu_data *rdp) { if (rcu_nocb_mask == NULL || !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) - return; + return false; rdp->nxttail[RCU_NEXT_TAIL] = NULL; + return true; } -/* Initialize the ->call_remote fields in the rcu_state structures. */ -static void __init rcu_init_nocb(void) +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ + +static int rcu_nocb_needs_gp(struct rcu_state *rsp) { -#ifdef CONFIG_PREEMPT_RCU - rcu_preempt_state.call_remote = call_rcu_preempt_remote; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ - rcu_bh_state.call_remote = call_rcu_bh_remote; - rcu_sched_state.call_remote = call_rcu_sched_remote; + return 0; } -#else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +{ +} -static bool is_nocb_cpu(int cpu) +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) +{ +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) { - return false; } static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, @@ -2480,11 +2319,6 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, return 0; } -static bool nocb_cpu_expendable(int cpu) -{ - return 1; -} - static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { } @@ -2493,12 +2327,26 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) { } -static void init_nocb_callback_list(struct rcu_data *rdp) +static bool 
init_nocb_callback_list(struct rcu_data *rdp) { + return false; } -static void __init rcu_init_nocb(void) +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ + +/* + * An adaptive-ticks CPU can potentially execute in kernel mode for an + * arbitrarily long period of time with the scheduling-clock tick turned + * off. RCU will be paying attention to this CPU because it is in the + * kernel, but the CPU cannot be guaranteed to be executing the RCU state + * machine because the scheduling-clock tick has been disabled. Therefore, + * if an adaptive-ticks CPU is failing to respond to the current grace + * period and has not be idle from an RCU perspective, kick it. + */ +static void rcu_kick_nohz_cpu(int cpu) { +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_cpu(cpu)) + smp_send_reschedule(cpu); +#endif /* #ifdef CONFIG_NO_HZ_FULL */ } - -#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 0d095dc..cf6c174 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,8 +46,6 @@ #define RCU_TREE_NONCORE #include "rcutree.h" -#define ulong2long(a) (*(long *)(&(a))) - static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { @@ -97,7 +95,7 @@ static const struct file_operations rcubarrier_fops = { .open = rcubarrier_open, .read = seq_read, .llseek = no_llseek, - .release = seq_release, + .release = single_release, }; #ifdef CONFIG_RCU_BOOST @@ -208,7 +206,7 @@ static const struct file_operations rcuexp_fops = { .open = rcuexp_open, .read = seq_read, .llseek = no_llseek, - .release = seq_release, + .release = single_release, }; #ifdef CONFIG_RCU_BOOST @@ -308,7 +306,7 @@ static const struct file_operations rcuhier_fops = { .open = rcuhier_open, .read = seq_read, .llseek = no_llseek, - .release = seq_release, + .release = single_release, }; static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) @@ -350,7 +348,7 @@ static const struct file_operations rcugp_fops = 
{ .open = rcugp_open, .read = seq_read, .llseek = no_llseek, - .release = seq_release, + .release = single_release, }; static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) diff --git a/kernel/relay.c b/kernel/relay.c index 01ab081..b91488b 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -234,7 +234,6 @@ static void relay_destroy_buf(struct rchan_buf *buf) static void relay_remove_buf(struct kref *kref) { struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); - buf->chan->cb->remove_buf_file(buf->dentry); relay_destroy_buf(buf); } @@ -484,6 +483,7 @@ static void relay_close_buf(struct rchan_buf *buf) { buf->finalized = 1; del_timer_sync(&buf->timer); + buf->chan->cb->remove_buf_file(buf->dentry); kref_put(&buf->kref, relay_remove_buf); } @@ -588,7 +588,7 @@ struct rchan *relay_open(const char *base_filename, chan->version = RELAYFS_CHANNEL_VERSION; chan->n_subbufs = n_subbufs; chan->subbuf_size = subbuf_size; - chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); + chan->alloc_size = PAGE_ALIGN(subbuf_size * n_subbufs); chan->parent = parent; chan->private_data = private_data; if (base_filename) { @@ -1099,8 +1099,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf, static int subbuf_read_actor(size_t read_start, struct rchan_buf *buf, size_t avail, - read_descriptor_t *desc, - read_actor_t actor) + read_descriptor_t *desc) { void *from; int ret = 0; @@ -1121,15 +1120,13 @@ static int subbuf_read_actor(size_t read_start, typedef int (*subbuf_actor_t) (size_t read_start, struct rchan_buf *buf, size_t avail, - read_descriptor_t *desc, - read_actor_t actor); + read_descriptor_t *desc); /* * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries */ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, subbuf_actor_t subbuf_actor, - read_actor_t actor, read_descriptor_t *desc) { struct rchan_buf *buf = filp->private_data; @@ -1150,7 +1147,7 @@ static ssize_t 
relay_file_read_subbufs(struct file *filp, loff_t *ppos, break; avail = min(desc->count, avail); - ret = subbuf_actor(read_start, buf, avail, desc, actor); + ret = subbuf_actor(read_start, buf, avail, desc); if (desc->error < 0) break; @@ -1174,8 +1171,7 @@ static ssize_t relay_file_read(struct file *filp, desc.count = count; desc.arg.buf = buffer; desc.error = 0; - return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, - NULL, &desc); + return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc); } static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) diff --git a/kernel/rwsem.c b/kernel/rwsem.c index b3c6c3f..cfff143 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -126,6 +126,15 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) EXPORT_SYMBOL(_down_write_nest_lock); +void down_read_non_owner(struct rw_semaphore *sem) +{ + might_sleep(); + + __down_read(sem); +} + +EXPORT_SYMBOL(down_read_non_owner); + void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); @@ -136,6 +145,13 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_nested); +void up_read_non_owner(struct rw_semaphore *sem) +{ + __up_read(sem); +} + +EXPORT_SYMBOL(up_read_non_owner); + #endif diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index f06d249..deaf90e 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o +obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8285eb..58453b8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -512,11 +512,6 @@ static inline void init_hrtick(void) * the target CPU. 
*/ #ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) 0 -#endif - void resched_task(struct task_struct *p) { int cpu; @@ -549,7 +544,7 @@ void resched_cpu(int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy cpu for migrating timers * from an idle cpu. This is good for power-savings. @@ -587,7 +582,7 @@ unlock: * account when the CPU goes back to idle and evaluates the timer * wheel for the next timer event. */ -void wake_up_idle_cpu(int cpu) +static void wake_up_idle_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -617,20 +612,56 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } +static bool wake_up_full_nohz_cpu(int cpu) +{ + if (tick_nohz_full_cpu(cpu)) { + if (cpu != smp_processor_id() || + tick_nohz_tick_stopped()) + smp_send_reschedule(cpu); + return true; + } + + return false; +} + +void wake_up_nohz_cpu(int cpu) +{ + if (!wake_up_full_nohz_cpu(cpu)) + wake_up_idle_cpu(cpu); +} + static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); } -#else /* CONFIG_NO_HZ */ +#else /* CONFIG_NO_HZ_COMMON */ static inline bool got_nohz_idle_kick(void) { return false; } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ + +#ifdef CONFIG_NO_HZ_FULL +bool sched_can_stop_tick(void) +{ + struct rq *rq; + + rq = this_rq(); + + /* Make sure rq->nr_running update is visible after the IPI */ + smp_rmb(); + + /* More than one running task need preemption */ + if (rq->nr_running > 1) + return false; + + return true; +} +#endif /* CONFIG_NO_HZ_FULL */ void sched_avg_update(struct rq *rq) { @@ -1288,8 +1319,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { - trace_sched_wakeup(p, true); check_preempt_curr(rq, p, wake_flags); + 
trace_sched_wakeup(p, true); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -1362,7 +1393,8 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() + && !tick_nohz_full_cpu(smp_processor_id())) return; /* @@ -1379,6 +1411,7 @@ void scheduler_ipi(void) * somewhat pessimize the simple resched case. */ irq_enter(); + tick_nohz_full_check(); sched_ttwu_pending(); /* @@ -1860,6 +1893,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) kprobe_flush_task(prev); put_task_struct(prev); } + + tick_nohz_task_switch(current); } #ifdef CONFIG_SMP @@ -2123,7 +2158,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) return load >> FSHIFT; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Handle NO_HZ for the global load-average. * @@ -2349,12 +2384,12 @@ static void calc_global_nohz(void) smp_wmb(); calc_load_idx++; } -#else /* !CONFIG_NO_HZ */ +#else /* !CONFIG_NO_HZ_COMMON */ static inline long calc_load_fold_idle(void) { return 0; } static inline void calc_global_nohz(void) { } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * calc_load - update the avenrun load estimates 10 ticks after the @@ -2514,7 +2549,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading @@ -2574,7 +2609,7 @@ void update_cpu_load_nohz(void) } raw_spin_unlock(&this_rq->lock); } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * Called from scheduler_tick() @@ -2701,8 +2736,35 @@ void scheduler_tick(void) rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif + rq_last_tick_reset(rq); } +#ifdef 
CONFIG_NO_HZ_FULL +/** + * scheduler_tick_max_deferment + * + * Keep at least one tick per second when a single + * active task is running because the scheduler doesn't + * yet completely support full dynticks environment. + * + * This makes sure that uptime, CFS vruntime, load + * balancing, etc... continue to move forward, even + * with a very low granularity. + */ +u64 scheduler_tick_max_deferment(void) +{ + struct rq *rq = this_rq(); + unsigned long next, now = ACCESS_ONCE(jiffies); + + next = rq->last_sched_tick + HZ; + + if (time_before_eq(next, now)) + return 0; + + return jiffies_to_usecs(next - now) * NSEC_PER_USEC; +} +#endif + notrace unsigned long get_parent_ip(unsigned long addr) { if (in_lock_functions(addr)) { @@ -3039,11 +3101,13 @@ EXPORT_SYMBOL(preempt_schedule); asmlinkage void __sched preempt_schedule_irq(void) { struct thread_info *ti = current_thread_info(); + enum ctx_state prev_state; /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); - user_exit(); + prev_state = exception_enter(); + do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); @@ -3057,6 +3121,8 @@ asmlinkage void __sched preempt_schedule_irq(void) */ barrier(); } while (need_resched()); + + exception_exit(prev_state); } #endif /* CONFIG_PREEMPT */ @@ -4587,6 +4653,7 @@ void sched_show_task(struct task_struct *p) task_pid_nr(p), ppid, (unsigned long)task_thread_info(p)->flags); + print_worker_info(KERN_INFO, p); show_stack(p, NULL); } @@ -6204,7 +6271,7 @@ static void sched_init_numa(void) * 'level' contains the number of unique distances, excluding the * identity distance node_distance(i,i). * - * The sched_domains_nume_distance[] array includes the actual distance + * The sched_domains_numa_distance[] array includes the actual distance * numbers. */ @@ -6817,11 +6884,15 @@ int in_sched_functions(unsigned long addr) } #ifdef CONFIG_CGROUP_SCHED +/* + * Default task group. + * Every task in system belongs to this group at bootup. 
+ */ struct task_group root_task_group; LIST_HEAD(task_groups); #endif -DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); void __init sched_init(void) { @@ -6858,7 +6929,7 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CPUMASK_OFFSTACK for_each_possible_cpu(i) { - per_cpu(load_balance_tmpmask, i) = (void *)ptr; + per_cpu(load_balance_mask, i) = (void *)ptr; ptr += cpumask_size(); } #endif /* CONFIG_CPUMASK_OFFSTACK */ @@ -6884,12 +6955,6 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - root_cpuacct.cpustat = &kernel_cpustat; - root_cpuacct.cpuusage = alloc_percpu(u64); - /* Too early, not expected to fail */ - BUG_ON(!root_cpuacct.cpuusage); -#endif for_each_possible_cpu(i) { struct rq *rq; @@ -6953,9 +7018,12 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON rq->nohz_flags = 0; #endif +#ifdef CONFIG_NO_HZ_FULL + rq->last_sched_tick = 0; +#endif #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); @@ -7411,7 +7479,7 @@ unlock: return err; } -int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) { u64 rt_runtime, rt_period; @@ -7423,7 +7491,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } -long sched_group_rt_runtime(struct task_group *tg) +static long sched_group_rt_runtime(struct task_group *tg) { u64 rt_runtime_us; @@ -7435,7 +7503,7 @@ long sched_group_rt_runtime(struct task_group *tg) return rt_runtime_us; } -int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) { u64 rt_runtime, rt_period; @@ -7448,7 +7516,7 @@ int 
sched_group_set_rt_period(struct task_group *tg, long rt_period_us) return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } -long sched_group_rt_period(struct task_group *tg) +static long sched_group_rt_period(struct task_group *tg) { u64 rt_period_us; @@ -7483,7 +7551,7 @@ static int sched_rt_global_constraints(void) return ret; } -int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { /* Don't accept realtime tasks when there is no way for them to run */ if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) @@ -7991,226 +8059,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - -/* - * CPU accounting code for task groups. - * - * Based on the work by Paul Menage (menage@google.com) and Balbir Singh - * (balbir@in.ibm.com). - */ - -struct cpuacct root_cpuacct; - -/* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) -{ - struct cpuacct *ca; - - if (!cgrp->parent) - return &root_cpuacct.css; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) - goto out; - - ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) - goto out_free_ca; - - ca->cpustat = alloc_percpu(struct kernel_cpustat); - if (!ca->cpustat) - goto out_free_cpuusage; - - return &ca->css; - -out_free_cpuusage: - free_percpu(ca->cpuusage); -out_free_ca: - kfree(ca); -out: - return ERR_PTR(-ENOMEM); -} - -/* destroy an existing cpu accounting group */ -static void cpuacct_css_free(struct cgroup *cgrp) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - - free_percpu(ca->cpustat); - free_percpu(ca->cpuusage); - kfree(ca); -} - -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - u64 data; - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit read safe on 32-bit platforms. 
- */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; -#endif - - return data; -} - -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit write safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; -#endif -} - -/* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - u64 totalcpuusage = 0; - int i; - - for_each_present_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); - - return totalcpuusage; -} - -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int err = 0; - int i; - - if (reset) { - err = -EINVAL; - goto out; - } - - for_each_present_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); - -out: - return err; -} - -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) -{ - struct cpuacct *ca = cgroup_ca(cgroup); - u64 percpu; - int i; - - for_each_present_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); - seq_printf(m, "%llu ", (unsigned long long) percpu); - } - seq_printf(m, "\n"); - return 0; -} - -static const char *cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; - -static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int cpu; - s64 val = 0; - - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - cb->fill(cb, 
cpuacct_stat_desc[CPUACCT_STAT_USER], val); - - val = 0; - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; - } - - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); - - return 0; -} - -static struct cftype files[] = { - { - .name = "usage", - .read_u64 = cpuusage_read, - .write_u64 = cpuusage_write, - }, - { - .name = "usage_percpu", - .read_seq_string = cpuacct_percpu_seq_read, - }, - { - .name = "stat", - .read_map = cpuacct_stats_show, - }, - { } /* terminate */ -}; - -/* - * charge this task's execution time to its accounting group. - * - * called with rq->lock held. - */ -void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ - struct cpuacct *ca; - int cpu; - - if (unlikely(!cpuacct_subsys.active)) - return; - - cpu = task_cpu(tsk); - - rcu_read_lock(); - - ca = task_ca(tsk); - - for (; ca; ca = parent_ca(ca)) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - } - - rcu_read_unlock(); -} - -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", - .css_alloc = cpuacct_css_alloc, - .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, - .base_cftypes = files, -}; -#endif /* CONFIG_CGROUP_CPUACCT */ - void dump_cpu_task(int cpu) { pr_info("Task dump for CPU %d:\n", cpu); diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c new file mode 100644 index 0000000..dbb7e2c --- /dev/null +++ b/kernel/sched/cpuacct.c @@ -0,0 +1,296 @@ +#include <linux/cgroup.h> +#include <linux/slab.h> +#include <linux/percpu.h> +#include <linux/spinlock.h> +#include <linux/cpumask.h> +#include <linux/seq_file.h> +#include <linux/rcupdate.h> +#include <linux/kernel_stat.h> +#include <linux/err.h> + +#include "sched.h" + +/* + * CPU accounting code for task groups. 
+ * + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com). + */ + +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 __percpu *cpuusage; + struct kernel_cpustat __percpu *cpustat; +}; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + struct cpuacct, css); +} + +static inline struct cpuacct *__parent_ca(struct cpuacct *ca) +{ + return cgroup_ca(ca->css.cgroup->parent); +} + +static inline struct cpuacct *parent_ca(struct cpuacct *ca) +{ + if (!ca->css.cgroup->parent) + return NULL; + return cgroup_ca(ca->css.cgroup->parent); +} + +static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); +static struct cpuacct root_cpuacct = { + .cpustat = &kernel_cpustat, + .cpuusage = &root_cpuacct_cpuusage, +}; + +/* create a new cpu accounting group */ +static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) +{ + struct cpuacct *ca; + + if (!cgrp->parent) + return &root_cpuacct.css; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + goto out; + + ca->cpuusage = alloc_percpu(u64); + if (!ca->cpuusage) + goto out_free_ca; + + ca->cpustat = alloc_percpu(struct kernel_cpustat); + if (!ca->cpustat) + goto out_free_cpuusage; + + return &ca->css; + +out_free_cpuusage: + free_percpu(ca->cpuusage); 
+out_free_ca: + kfree(ca); +out: + return ERR_PTR(-ENOMEM); +} + +/* destroy an existing cpu accounting group */ +static void cpuacct_css_free(struct cgroup *cgrp) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + + free_percpu(ca->cpustat); + free_percpu(ca->cpuusage); + kfree(ca); +} + +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + +/* return total cpu usage (in nanoseconds) of a group */ +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + u64 totalcpuusage = 0; + int i; + + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); + + return totalcpuusage; +} + +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, + u64 reset) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int err = 0; + int i; + + if (reset) { + err = -EINVAL; + goto out; + } + + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); + +out: + return err; +} + +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + +static const char 
* const cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int cpu; + s64 val = 0; + + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_USER]; + val += kcpustat->cpustat[CPUTIME_NICE]; + } + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + + val = 0; + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_SYSTEM]; + val += kcpustat->cpustat[CPUTIME_IRQ]; + val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + } + + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + + return 0; +} + +static struct cftype files[] = { + { + .name = "usage", + .read_u64 = cpuusage_read, + .write_u64 = cpuusage_write, + }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + { + .name = "stat", + .read_map = cpuacct_stats_show, + }, + { } /* terminate */ +}; + +/* + * charge this task's execution time to its accounting group. + * + * called with rq->lock held. + */ +void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ + struct cpuacct *ca; + int cpu; + + cpu = task_cpu(tsk); + + rcu_read_lock(); + + ca = task_ca(tsk); + + while (true) { + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + *cpuusage += cputime; + + ca = parent_ca(ca); + if (!ca) + break; + } + + rcu_read_unlock(); +} + +/* + * Add user/system time to cpuacct. + * + * Note: it's the caller that updates the account of the root cgroup. 
+ */ +void cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ + struct kernel_cpustat *kcpustat; + struct cpuacct *ca; + + rcu_read_lock(); + ca = task_ca(p); + while (ca != &root_cpuacct) { + kcpustat = this_cpu_ptr(ca->cpustat); + kcpustat->cpustat[index] += val; + ca = __parent_ca(ca); + } + rcu_read_unlock(); +} + +struct cgroup_subsys cpuacct_subsys = { + .name = "cpuacct", + .css_alloc = cpuacct_css_alloc, + .css_free = cpuacct_css_free, + .subsys_id = cpuacct_subsys_id, + .base_cftypes = files, + .early_init = 1, +}; diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h new file mode 100644 index 0000000..ed60562 --- /dev/null +++ b/kernel/sched/cpuacct.h @@ -0,0 +1,17 @@ +#ifdef CONFIG_CGROUP_CPUACCT + +extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); +extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); + +#else + +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ +} + +static inline void +cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ +} + +#endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index e93cca9..cc2dc3e 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -115,10 +115,6 @@ static int irqtime_account_si_update(void) static inline void task_group_account_field(struct task_struct *p, int index, u64 tmp) { -#ifdef CONFIG_CGROUP_CPUACCT - struct kernel_cpustat *kcpustat; - struct cpuacct *ca; -#endif /* * Since all updates are sure to touch the root cgroup, we * get ourselves ahead and touch it first. 
If the root cgroup @@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, */ __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; -#ifdef CONFIG_CGROUP_CPUACCT - if (unlikely(!cpuacct_subsys.active)) - return; - - rcu_read_lock(); - ca = task_ca(p); - while (ca && (ca != &root_cpuacct)) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += tmp; - ca = parent_ca(ca); - } - rcu_read_unlock(); -#endif + cpuacct_account_field(p, index, tmp); } /* @@ -388,7 +372,84 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ struct rq *rq) {} #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +/* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + +#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +void vtime_task_switch(struct task_struct *prev) +{ + if (!vtime_accounting_enabled()) + return; + + if (is_idle_task(prev)) + vtime_account_idle(prev); + else + vtime_account_system(prev); + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + vtime_account_user(prev); +#endif + arch_vtime_task_switch(prev); +} +#endif + +/* + * Archs that account the whole time spent in the idle task + * (outside irq) as idle time can rely on this and just implement + * vtime_account_system() and vtime_account_idle(). Archs that + * have other meaning of the idle time (s390 only includes the + * time spent by the CPU when it's in low power mode) must override + * vtime_account(). + */ +#ifndef __ARCH_HAS_VTIME_ACCOUNT +void vtime_account_irq_enter(struct task_struct *tsk) +{ + if (!vtime_accounting_enabled()) + return; + + if (!in_interrupt()) { + /* + * If we interrupted user, context_tracking_in_user() + * is 1 because the context tracking don't hook + * on irq entry/exit. This way we know if + * we need to flush user time on kernel entry. 
+ */ + if (context_tracking_in_user()) { + vtime_account_user(tsk); + return; + } + + if (is_idle_task(tsk)) { + vtime_account_idle(tsk); + return; + } + } + vtime_account_system(tsk); +} +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); +#endif /* __ARCH_HAS_VTIME_ACCOUNT */ +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + *ut = p->utime; + *st = p->stime; +} + +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; +} +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ /* * Account a single tick of cpu time. * @p: the process that the cpu time gets accounted to @@ -443,96 +504,50 @@ void account_idle_ticks(unsigned long ticks) account_idle_time(jiffies_to_cputime(ticks)); } -#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ - -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - *ut = p->utime; - *st = p->stime; -} - -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - - *ut = cputime.utime; - *st = cputime.stime; -} - -#ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) -{ - if (!vtime_accounting_enabled()) - return; - - if (is_idle_task(prev)) - vtime_account_idle(prev); - else - vtime_account_system(prev); - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - vtime_account_user(prev); -#endif - arch_vtime_task_switch(prev); -} -#endif /* - * Archs that account the whole time spent in the idle task - * (outside irq) as idle time can rely on this and just implement - * vtime_account_system() and vtime_account_idle(). 
Archs that - * have other meaning of the idle time (s390 only includes the - * time spent by the CPU when it's in low power mode) must override - * vtime_account(). + * Perform (stime * rtime) / total, but avoid multiplication overflow by + * loosing precision when the numbers are big. */ -#ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_account_irq_enter(struct task_struct *tsk) +static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) { - if (!vtime_accounting_enabled()) - return; + u64 scaled; - if (!in_interrupt()) { - /* - * If we interrupted user, context_tracking_in_user() - * is 1 because the context tracking don't hook - * on irq entry/exit. This way we know if - * we need to flush user time on kernel entry. - */ - if (context_tracking_in_user()) { - vtime_account_user(tsk); - return; + for (;;) { + /* Make sure "rtime" is the bigger of stime/rtime */ + if (stime > rtime) { + u64 tmp = rtime; rtime = stime; stime = tmp; } - if (is_idle_task(tsk)) { - vtime_account_idle(tsk); - return; - } - } - vtime_account_system(tsk); -} -EXPORT_SYMBOL_GPL(vtime_account_irq_enter); -#endif /* __ARCH_HAS_VTIME_ACCOUNT */ + /* Make sure 'total' fits in 32 bits */ + if (total >> 32) + goto drop_precision; -#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ + /* Does rtime (and thus stime) fit in 32 bits? */ + if (!(rtime >> 32)) + break; -static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total) -{ - u64 temp = (__force u64) rtime; + /* Can we just balance rtime/stime rather than dropping bits? 
*/ + if (stime >> 31) + goto drop_precision; - temp *= (__force u64) stime; + /* We can grow stime and shrink rtime and try to make them both fit */ + stime <<= 1; + rtime >>= 1; + continue; - if (sizeof(cputime_t) == 4) - temp = div_u64(temp, (__force u32) total); - else - temp = div64_u64(temp, (__force u64) total); +drop_precision: + /* We drop from rtime, it has more bits than stime */ + rtime >>= 1; + total >>= 1; + } - return (__force cputime_t) temp; + /* + * Make sure gcc understands that this is a 32x32->64 multiply, + * followed by a 64/32->64 divide. + */ + scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); + return (__force cputime_t) scaled; } /* @@ -543,7 +558,13 @@ static void cputime_adjust(struct task_cputime *curr, struct cputime *prev, cputime_t *ut, cputime_t *st) { - cputime_t rtime, stime, total; + cputime_t rtime, stime, utime, total; + + if (vtime_accounting_enabled()) { + *ut = curr->utime; + *st = curr->stime; + return; + } stime = curr->stime; total = stime + curr->utime; @@ -560,10 +581,22 @@ static void cputime_adjust(struct task_cputime *curr, */ rtime = nsecs_to_cputime(curr->sum_exec_runtime); - if (total) - stime = scale_stime(stime, rtime, total); - else + /* + * Update userspace visible utime/stime values only if actual execution + * time is bigger than already exported. Note that can happen, that we + * provided bigger values due to scaling inaccuracy on big numbers. + */ + if (prev->stime + prev->utime >= rtime) + goto out; + + if (total) { + stime = scale_stime((__force u64)stime, + (__force u64)rtime, (__force u64)total); + utime = rtime - stime; + } else { stime = rtime; + utime = 0; + } /* * If the tick based count grows faster than the scheduler one, @@ -571,8 +604,9 @@ static void cputime_adjust(struct task_cputime *curr, * Let's enforce monotonicity. 
*/ prev->stime = max(prev->stime, stime); - prev->utime = max(prev->utime, rtime - prev->stime); + prev->utime = max(prev->utime, utime); +out: *ut = prev->utime; *st = prev->stime; } @@ -597,7 +631,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime thread_group_cputime(p, &cputime); cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } -#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN static unsigned long long vtime_delta(struct task_struct *tsk) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a33e59..c61a614 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -431,13 +431,13 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); * Scheduling class tree data structure manipulation methods: */ -static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) +static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) { - s64 delta = (s64)(vruntime - min_vruntime); + s64 delta = (s64)(vruntime - max_vruntime); if (delta > 0) - min_vruntime = vruntime; + max_vruntime = vruntime; - return min_vruntime; + return max_vruntime; } static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) @@ -473,6 +473,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) vruntime = min_vruntime(vruntime, se->vruntime); } + /* ensure we never gain time by being placed backwards. */ cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); #ifndef CONFIG_64BIT smp_wmb(); @@ -652,7 +653,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) } /* - * We calculate the vruntime slice of a to be inserted task + * We calculate the vruntime slice of a to-be-inserted task. * * vs = s/w */ @@ -1562,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); } /* migrations, e.g. 
sleep=0 leave decay_count == 0 */ } + +/* + * Update the rq's load with the elapsed running time before entering + * idle. if the last scheduled task is not a CFS task, idle_enter will + * be the only way to update the runnable statistic. + */ +void idle_enter_fair(struct rq *this_rq) +{ + update_rq_runnable_avg(this_rq, 1); +} + +/* + * Update the rq's load with the elapsed idle time before a task is + * scheduled. if the newly scheduled task is not a CFS task, idle_exit will + * be the only way to update the runnable statistic. + */ +void idle_exit_fair(struct rq *this_rq) +{ + update_rq_runnable_avg(this_rq, 0); +} + #else static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) {} @@ -3874,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) int tsk_cache_hot = 0; /* * We do not migrate tasks that are: - * 1) running (obviously), or + * 1) throttled_lb_pair, or * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. + * 3) running (obviously), or + * 4) are cache-hot on their current CPU. 
*/ + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + return 0; + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { - int new_dst_cpu; + int cpu; schedstat_inc(p, se.statistics.nr_failed_migrations_affine); @@ -3894,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) return 0; - new_dst_cpu = cpumask_first_and(env->dst_grpmask, - tsk_cpus_allowed(p)); - if (new_dst_cpu < nr_cpu_ids) { - env->flags |= LBF_SOME_PINNED; - env->new_dst_cpu = new_dst_cpu; + /* Prevent to re-select dst_cpu via env's cpus */ + for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + env->flags |= LBF_SOME_PINNED; + env->new_dst_cpu = cpu; + break; + } } + return 0; } @@ -3920,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { -#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } -#endif + return 1; } - if (tsk_cache_hot) { - schedstat_inc(p, se.statistics.nr_failed_migrations_hot); - return 0; - } - return 1; + schedstat_inc(p, se.statistics.nr_failed_migrations_hot); + return 0; } /* @@ -3948,9 +3974,6 @@ static int move_one_task(struct lb_env *env) struct task_struct *p, *n; list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { - if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) - continue; - if (!can_migrate_task(p, env)) continue; @@ -4002,7 +4025,7 @@ static int move_tasks(struct lb_env *env) break; } - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + if (!can_migrate_task(p, env)) goto next; load = task_h_load(p); @@ -4013,9 +4036,6 @@ static int move_tasks(struct lb_env *env) if ((load / 2) > 
env->imbalance) goto next; - if (!can_migrate_task(p, env)) - goto next; - move_task(p, env); pulled++; env->imbalance -= load; @@ -4245,7 +4265,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) { return SCHED_POWER_SCALE; } @@ -4255,7 +4275,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) return default_scale_freq_power(sd, cpu); } -unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; unsigned long smt_gain = sd->smt_gain; @@ -4270,7 +4290,7 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) return default_scale_smt_power(sd, cpu); } -unsigned long scale_rt_power(int cpu) +static unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); u64 total, available, age_stamp, avg; @@ -4960,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, #define MAX_PINNED_INTERVAL 512 /* Working cpumask for load_balance and load_balance_newidle. 
*/ -DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static int need_active_balance(struct lb_env *env) { @@ -4991,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *balance) { int ld_moved, cur_ld_moved, active_balance = 0; - int lb_iterations, max_lb_iterations; struct sched_group *group; struct rq *busiest; unsigned long flags; - struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + struct cpumask *cpus = __get_cpu_var(load_balance_mask); struct lb_env env = { .sd = sd, @@ -5007,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq, .cpus = cpus, }; + /* + * For NEWLY_IDLE load_balancing, we don't need to consider + * other cpus in our group + */ + if (idle == CPU_NEWLY_IDLE) + env.dst_grpmask = NULL; + cpumask_copy(cpus, cpu_active_mask); - max_lb_iterations = cpumask_weight(env.dst_grpmask); schedstat_inc(sd, lb_count[idle]); @@ -5034,7 +5059,6 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); ld_moved = 0; - lb_iterations = 1; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -5061,17 +5085,17 @@ more_balance: double_rq_unlock(env.dst_rq, busiest); local_irq_restore(flags); - if (env.flags & LBF_NEED_BREAK) { - env.flags &= ~LBF_NEED_BREAK; - goto more_balance; - } - /* * some other cpu did the load balance for us. */ if (cur_ld_moved && env.dst_cpu != smp_processor_id()) resched_cpu(env.dst_cpu); + if (env.flags & LBF_NEED_BREAK) { + env.flags &= ~LBF_NEED_BREAK; + goto more_balance; + } + /* * Revisit (affine) tasks on src_cpu that couldn't be moved to * us and move them to an alternate dst_cpu in our sched_group @@ -5091,14 +5115,17 @@ more_balance: * moreover subsequent load balance cycles should correct the * excess load moved. 
*/ - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && - lb_iterations++ < max_lb_iterations) { + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_SOME_PINNED; env.loop = 0; env.loop_break = sched_nr_migrate_break; + + /* Prevent to re-select dst_cpu via env's cpus */ + cpumask_clear_cpu(env.dst_cpu, env.cpus); + /* * Go back to "more_balance" rather than "redo" since we * need to continue with same src_cpu. @@ -5219,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq) if (this_rq->avg_idle < sysctl_sched_migration_cost) return; - update_rq_runnable_avg(this_rq, 1); - /* * Drop the rq->lock, but keep IRQ/preempt disabled. */ @@ -5330,7 +5355,7 @@ out_unlock: return 0; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details * - When one of the busy CPUs notice that there may be an idle rebalancing @@ -5395,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void) struct sched_domain *sd; int cpu = smp_processor_id(); - if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - clear_bit(NOHZ_IDLE, nohz_flags(cpu)); - rcu_read_lock(); - for_each_domain(cpu, sd) + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + + if (!sd || !sd->nohz_idle) + goto unlock; + sd->nohz_idle = 0; + + for (; sd; sd = sd->parent) atomic_inc(&sd->groups->sgp->nr_busy_cpus); +unlock: rcu_read_unlock(); } @@ -5410,13 +5438,16 @@ void set_cpu_sd_state_idle(void) struct sched_domain *sd; int cpu = smp_processor_id(); - if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - set_bit(NOHZ_IDLE, nohz_flags(cpu)); - rcu_read_lock(); - for_each_domain(cpu, sd) + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + + if (!sd || sd->nohz_idle) + goto unlock; + sd->nohz_idle = 1; + + for (; sd; sd = sd->parent) atomic_dec(&sd->groups->sgp->nr_busy_cpus); +unlock: rcu_read_unlock(); } @@ -5468,7 +5499,7 @@ void update_max_interval(void) * It 
checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * - * Balancing parameters are set up in arch_init_sched_domains. + * Balancing parameters are set up in init_sched_domains. */ static void rebalance_domains(int cpu, enum cpu_idle_type idle) { @@ -5506,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(cpu, rq, sd, idle, &balance)) { /* - * We've pulled tasks over so either we're no - * longer idle. + * The LBF_SOME_PINNED logic could have changed + * env->dst_cpu, so we can't know our idle + * state even if we migrated tasks. Update it. */ - idle = CPU_NOT_IDLE; + idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; } sd->last_balance = jiffies; } @@ -5540,9 +5572,9 @@ out: rq->next_balance = next_balance; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* - * In CONFIG_NO_HZ case, the idle balance kickee will do the + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. 
*/ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) @@ -5685,7 +5717,7 @@ void trigger_load_balance(struct rq *rq, int cpu) if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) nohz_balancer_kick(cpu); #endif @@ -6155,7 +6187,7 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_SMP open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); cpu_notifier(sched_ilb_notifier, 0); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b6baf37..d8da010 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -13,6 +13,17 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } + +static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) +{ + idle_exit_fair(rq); + rq_last_tick_reset(rq); +} + +static void post_schedule_idle(struct rq *rq) +{ + idle_enter_fair(rq); +} #endif /* CONFIG_SMP */ /* * Idle tasks are unconditionally rescheduled: @@ -25,6 +36,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); +#ifdef CONFIG_SMP + /* Trigger the post schedule to do an idle_enter for CFS */ + rq->post_schedule = 1; +#endif return rq->idle; } @@ -86,6 +101,8 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, + .pre_schedule = pre_schedule_idle, + .post_schedule = post_schedule_idle, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cc03cfd..ce39224d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,8 
+5,10 @@ #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/stop_machine.h> +#include <linux/tick.h> #include "cpupri.h" +#include "cpuacct.h" extern __read_mostly int scheduler_running; @@ -33,6 +35,31 @@ extern __read_mostly int scheduler_running; */ #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) +/* + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * hierarchies, especially on larger systems. This is not a user-visible change + * and does not change the user-interface for setting shares/weights. + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution + * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the + * increased costs. + */ +#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ +# define SCHED_LOAD_RESOLUTION 10 +# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) +# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) +#else +# define SCHED_LOAD_RESOLUTION 0 +# define scale_load(w) (w) +# define scale_load_down(w) (w) +#endif + +#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) +#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) + #define NICE_0_LOAD SCHED_LOAD_SCALE #define NICE_0_SHIFT SCHED_LOAD_SHIFT @@ -154,11 +181,6 @@ struct task_group { #define MAX_SHARES (1UL << 18) #endif -/* Default task group. - * Every task in system belong to this group at bootup. 
- */ -extern struct task_group root_task_group; - typedef int (*tg_visitor)(struct task_group *, void *); extern int walk_tg_tree_from(struct task_group *from, @@ -196,6 +218,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent); +extern struct task_group *sched_create_group(struct task_group *parent); +extern void sched_online_group(struct task_group *tg, + struct task_group *parent); +extern void sched_destroy_group(struct task_group *tg); +extern void sched_offline_group(struct task_group *tg); + +extern void sched_move_task(struct task_struct *tsk); + +#ifdef CONFIG_FAIR_GROUP_SCHED +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +#endif + #else /* CONFIG_CGROUP_SCHED */ struct cfs_bandwidth { }; @@ -372,10 +406,13 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp; unsigned long nohz_flags; #endif +#ifdef CONFIG_NO_HZ_FULL + unsigned long last_sched_tick; +#endif int skip_clock_update; /* capture load from *all* tasks on this cpu: */ @@ -547,6 +584,62 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_id); +struct sched_group_power { + atomic_t ref; + /* + * CPU power of this group, SCHED_LOAD_SCALE being max power for a + * single CPU. + */ + unsigned int power, power_orig; + unsigned long next_update; + /* + * Number of busy cpus in this group. + */ + atomic_t nr_busy_cpus; + + unsigned long cpumask[0]; /* iteration mask */ +}; + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; + + unsigned int group_weight; + struct sched_group_power *sgp; + + /* + * The CPUs this group covers. + * + * NOTE: this field is variable length. 
(Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + */ + unsigned long cpumask[0]; +}; + +static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +{ + return to_cpumask(sg->cpumask); +} + +/* + * cpumask masking which cpus in the group are allowed to iterate up the domain + * tree. + */ +static inline struct cpumask *sched_group_mask(struct sched_group *sg) +{ + return to_cpumask(sg->sgp->cpumask); +} + +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_cpus(group)); +} + extern int group_balance_cpu(struct sched_group *sg); #endif /* CONFIG_SMP */ @@ -784,6 +877,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ +/* + * wake flags + */ +#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* child wakeup after fork */ +#define WF_MIGRATED 0x4 /* internal use, task got migrated */ static inline void update_load_add(struct load_weight *lw, unsigned long inc) { @@ -856,14 +955,61 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -/* Time spent by the tasks of the cpu accounting group executing in ... */ -enum cpuacct_stat_index { - CPUACCT_STAT_USER, /* ... user mode */ - CPUACCT_STAT_SYSTEM, /* ... 
kernel mode */ +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_HEAD 2 +#ifdef CONFIG_SMP +#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#else +#define ENQUEUE_WAKING 0 +#endif - CPUACCT_STAT_NSTATS, -}; +#define DEQUEUE_SLEEP 1 +struct sched_class { + const struct sched_class *next; + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); + + void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); + + struct task_struct * (*pick_next_task) (struct rq *rq); + void (*put_prev_task) (struct rq *rq, struct task_struct *p); + +#ifdef CONFIG_SMP + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + void (*migrate_task_rq)(struct task_struct *p, int next_cpu); + + void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); + void (*post_schedule) (struct rq *this_rq); + void (*task_waking) (struct task_struct *task); + void (*task_woken) (struct rq *this_rq, struct task_struct *task); + + void (*set_cpus_allowed)(struct task_struct *p, + const struct cpumask *newmask); + + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); +#endif + + void (*set_curr_task) (struct rq *rq); + void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); + void (*task_fork) (struct task_struct *p); + + void (*switched_from) (struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + int oldprio); + + unsigned int (*get_rr_interval) (struct rq *rq, + struct task_struct *task); + +#ifdef CONFIG_FAIR_GROUP_SCHED + void (*task_move_group) (struct task_struct *p, int on_rq); +#endif +}; #define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ @@ 
-877,9 +1023,23 @@ extern const struct sched_class idle_sched_class; #ifdef CONFIG_SMP +extern void update_group_power(struct sched_domain *sd, int cpu); + extern void trigger_load_balance(struct rq *rq, int cpu); extern void idle_balance(int this_cpu, struct rq *this_rq); +/* + * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg + * becomes useful in lb + */ +#if defined(CONFIG_FAIR_GROUP_SCHED) +extern void idle_enter_fair(struct rq *this_rq); +extern void idle_exit_fair(struct rq *this_rq); +#else +static inline void idle_enter_fair(struct rq *this_rq) {} +static inline void idle_exit_fair(struct rq *this_rq) {} +#endif + #else /* CONFIG_SMP */ static inline void idle_balance(int cpu, struct rq *rq) @@ -891,7 +1051,6 @@ static inline void idle_balance(int cpu, struct rq *rq) extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); -extern void update_group_power(struct sched_domain *sd, int cpu); extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); @@ -904,45 +1063,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime extern void update_idle_cpu_load(struct rq *this_rq); -#ifdef CONFIG_CGROUP_CPUACCT -#include <linux/cgroup.h> -/* track cpu usage of a group of tasks and its child groups */ -struct cpuacct { - struct cgroup_subsys_state css; - /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 __percpu *cpuusage; - struct kernel_cpustat __percpu *cpustat; -}; - -extern struct cgroup_subsys cpuacct_subsys; -extern struct cpuacct root_cpuacct; - -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), - struct cpuacct, css); -} - -/* return cpu accounting group to which this 
task belongs */ -static inline struct cpuacct *task_ca(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), - struct cpuacct, css); -} - -static inline struct cpuacct *parent_ca(struct cpuacct *ca) -{ - if (!ca || !ca->css.cgroup->parent) - return NULL; - return cgroup_ca(ca->css.cgroup->parent); -} - -extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -#else -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} -#endif - #ifdef CONFIG_PARAVIRT static inline u64 steal_ticks(u64 steal) { @@ -956,6 +1076,16 @@ static inline u64 steal_ticks(u64 steal) static inline void inc_nr_running(struct rq *rq) { rq->nr_running++; + +#ifdef CONFIG_NO_HZ_FULL + if (rq->nr_running == 2) { + if (tick_nohz_full_cpu(rq->cpu)) { + /* Order rq->nr_running write against the IPI */ + smp_wmb(); + smp_send_reschedule(rq->cpu); + } + } +#endif } static inline void dec_nr_running(struct rq *rq) @@ -963,6 +1093,13 @@ static inline void dec_nr_running(struct rq *rq) rq->nr_running--; } +static inline void rq_last_tick_reset(struct rq *rq) +{ +#ifdef CONFIG_NO_HZ_FULL + rq->last_sched_tick = jiffies; +#endif +} + extern void update_rq_clock(struct rq *rq); extern void activate_task(struct rq *rq, struct task_struct *p, int flags); @@ -1183,11 +1320,10 @@ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); extern void account_cfs_bandwidth_used(int enabled, int was_enabled); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON enum rq_nohz_flag_bits { NOHZ_TICK_STOPPED, NOHZ_BALANCE_KICK, - NOHZ_IDLE, }; #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index e036eda..da98af3 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -130,16 +130,11 @@ static int schedstat_open(struct inode *inode, struct file *file) return seq_open(file, &schedstat_sops); } -static int schedstat_release(struct inode *inode, struct file *file) -{ - return 0; -}; - static 
const struct file_operations proc_schedstat_operations = { .open = schedstat_open, .read = seq_read, .llseek = seq_lseek, - .release = schedstat_release, + .release = seq_release, }; static int __init proc_schedstat_init(void) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5af44b5..b7a1004 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -160,6 +160,8 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) case BPF_S_ALU_AND_X: case BPF_S_ALU_OR_K: case BPF_S_ALU_OR_X: + case BPF_S_ALU_XOR_K: + case BPF_S_ALU_XOR_X: case BPF_S_ALU_LSH_K: case BPF_S_ALU_LSH_X: case BPF_S_ALU_RSH_K: diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 4567fc0..6815171 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -193,7 +193,7 @@ EXPORT_SYMBOL(up); struct semaphore_waiter { struct list_head list; struct task_struct *task; - int up; + bool up; }; /* @@ -209,12 +209,12 @@ static inline int __sched __down_common(struct semaphore *sem, long state, list_add_tail(&waiter.list, &sem->wait_list); waiter.task = task; - waiter.up = 0; + waiter.up = false; for (;;) { if (signal_pending_state(state, task)) goto interrupted; - if (timeout <= 0) + if (unlikely(timeout <= 0)) goto timed_out; __set_task_state(task, state); raw_spin_unlock_irq(&sem->lock); @@ -258,6 +258,6 @@ static noinline void __sched __up(struct semaphore *sem) struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, struct semaphore_waiter, list); list_del(&waiter->list); - waiter->up = 1; + waiter->up = true; wake_up_process(waiter->task); } diff --git a/kernel/signal.c b/kernel/signal.c index 598dc06..113411b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -32,6 +32,7 @@ #include <linux/user_namespace.h> #include <linux/uprobes.h> #include <linux/compat.h> +#include <linux/cn_proc.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -854,12 +855,14 @@ static void ptrace_trap_notify(struct task_struct *t) * Returns true if the signal 
should be actually delivered, otherwise * it should be dropped. */ -static int prepare_signal(int sig, struct task_struct *p, bool force) +static bool prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; - if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) { + if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { + if (signal->flags & SIGNAL_GROUP_COREDUMP) + return sig == SIGKILL; /* * The process is in the middle of dying, nothing to do. */ @@ -1160,8 +1163,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, static void print_fatal_signal(int signr) { struct pt_regs *regs = signal_pt_regs(); - printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n", - current->comm, task_pid_nr(current), signr); + printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr); #if defined(__i386__) && !defined(__arch_um__) printk(KERN_INFO "code at %08lx: ", regs->ip); @@ -2350,6 +2352,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) print_fatal_signal(info->si_signo); + proc_coredump_connector(current); /* * If it was able to dump core, this kills all * other threads in the group and synchronizes with diff --git a/kernel/smp.c b/kernel/smp.c index 8e451f3..4dba0f7 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -100,16 +100,16 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. 
*/ -static void csd_lock_wait(struct call_single_data *data) +static void csd_lock_wait(struct call_single_data *csd) { - while (data->flags & CSD_FLAG_LOCK) + while (csd->flags & CSD_FLAG_LOCK) cpu_relax(); } -static void csd_lock(struct call_single_data *data) +static void csd_lock(struct call_single_data *csd) { - csd_lock_wait(data); - data->flags = CSD_FLAG_LOCK; + csd_lock_wait(csd); + csd->flags |= CSD_FLAG_LOCK; /* * prevent CPU from reordering the above assignment @@ -119,16 +119,16 @@ static void csd_lock(struct call_single_data *data) smp_mb(); } -static void csd_unlock(struct call_single_data *data) +static void csd_unlock(struct call_single_data *csd) { - WARN_ON(!(data->flags & CSD_FLAG_LOCK)); + WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: */ smp_mb(); - data->flags &= ~CSD_FLAG_LOCK; + csd->flags &= ~CSD_FLAG_LOCK; } /* @@ -137,7 +137,7 @@ static void csd_unlock(struct call_single_data *data) * ->func, ->info, and ->flags set. 
*/ static -void generic_exec_single(int cpu, struct call_single_data *data, int wait) +void generic_exec_single(int cpu, struct call_single_data *csd, int wait) { struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); unsigned long flags; @@ -145,7 +145,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) raw_spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); - list_add_tail(&data->list, &dst->list); + list_add_tail(&csd->list, &dst->list); raw_spin_unlock_irqrestore(&dst->lock, flags); /* @@ -163,7 +163,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) arch_send_call_function_single_ipi(cpu); if (wait) - csd_lock_wait(data); + csd_lock_wait(csd); } /* @@ -173,7 +173,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) void generic_smp_call_function_single_interrupt(void) { struct call_single_queue *q = &__get_cpu_var(call_single_queue); - unsigned int data_flags; LIST_HEAD(list); /* @@ -186,25 +185,26 @@ void generic_smp_call_function_single_interrupt(void) raw_spin_unlock(&q->lock); while (!list_empty(&list)) { - struct call_single_data *data; + struct call_single_data *csd; + unsigned int csd_flags; - data = list_entry(list.next, struct call_single_data, list); - list_del(&data->list); + csd = list_entry(list.next, struct call_single_data, list); + list_del(&csd->list); /* - * 'data' can be invalid after this call if flags == 0 + * 'csd' can be invalid after this call if flags == 0 * (when called through generic_exec_single()), * so save them away before making the call: */ - data_flags = data->flags; + csd_flags = csd->flags; - data->func(data->info); + csd->func(csd->info); /* * Unlocked CSDs are valid through generic_exec_single(): */ - if (data_flags & CSD_FLAG_LOCK) - csd_unlock(data); + if (csd_flags & CSD_FLAG_LOCK) + csd_unlock(csd); } } @@ -249,16 +249,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, 
local_irq_restore(flags); } else { if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *data = &d; + struct call_single_data *csd = &d; if (!wait) - data = &__get_cpu_var(csd_data); + csd = &__get_cpu_var(csd_data); - csd_lock(data); + csd_lock(csd); - data->func = func; - data->info = info; - generic_exec_single(cpu, data, wait); + csd->func = func; + csd->info = info; + generic_exec_single(cpu, csd, wait); } else { err = -ENXIO; /* CPU not online */ } @@ -325,7 +325,7 @@ EXPORT_SYMBOL_GPL(smp_call_function_any); * pre-allocated data structure. Useful for embedding @data inside * other structures, for instance. */ -void __smp_call_function_single(int cpu, struct call_single_data *data, +void __smp_call_function_single(int cpu, struct call_single_data *csd, int wait) { unsigned int this_cpu; @@ -343,11 +343,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, if (cpu == this_cpu) { local_irq_save(flags); - data->func(data->info); + csd->func(csd->info); local_irq_restore(flags); } else { - csd_lock(data); - generic_exec_single(cpu, data, wait); + csd_lock(csd); + generic_exec_single(cpu, csd, wait); } put_cpu(); } @@ -369,7 +369,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { - struct call_function_data *data; + struct call_function_data *cfd; int cpu, next_cpu, this_cpu = smp_processor_id(); /* @@ -401,24 +401,24 @@ void smp_call_function_many(const struct cpumask *mask, return; } - data = &__get_cpu_var(cfd_data); + cfd = &__get_cpu_var(cfd_data); - cpumask_and(data->cpumask, mask, cpu_online_mask); - cpumask_clear_cpu(this_cpu, data->cpumask); + cpumask_and(cfd->cpumask, mask, cpu_online_mask); + cpumask_clear_cpu(this_cpu, cfd->cpumask); /* Some callers race with other cpus changing the passed mask */ - if (unlikely(!cpumask_weight(data->cpumask))) + if 
(unlikely(!cpumask_weight(cfd->cpumask))) return; /* - * After we put an entry into the list, data->cpumask - * may be cleared again when another CPU sends another IPI for - * a SMP function call, so data->cpumask will be zero. + * After we put an entry into the list, cfd->cpumask may be cleared + * again when another CPU sends another IPI for a SMP function call, so + * cfd->cpumask will be zero. */ - cpumask_copy(data->cpumask_ipi, data->cpumask); + cpumask_copy(cfd->cpumask_ipi, cfd->cpumask); - for_each_cpu(cpu, data->cpumask) { - struct call_single_data *csd = per_cpu_ptr(data->csd, cpu); + for_each_cpu(cpu, cfd->cpumask) { + struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); unsigned long flags; @@ -433,12 +433,13 @@ void smp_call_function_many(const struct cpumask *mask, } /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi_mask(data->cpumask_ipi); + arch_send_call_function_ipi_mask(cfd->cpumask_ipi); if (wait) { - for_each_cpu(cpu, data->cpumask) { - struct call_single_data *csd = - per_cpu_ptr(data->csd, cpu); + for_each_cpu(cpu, cfd->cpumask) { + struct call_single_data *csd; + + csd = per_cpu_ptr(cfd->csd, cpu); csd_lock_wait(csd); } } diff --git a/kernel/softirq.c b/kernel/softirq.c index 14d7758..b5197dc 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -329,6 +329,19 @@ static inline void invoke_softirq(void) wakeup_softirqd(); } +static inline void tick_irq_exit(void) +{ +#ifdef CONFIG_NO_HZ_COMMON + int cpu = smp_processor_id(); + + /* Make sure that timer wheel updates are propagated */ + if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { + if (!in_interrupt()) + tick_nohz_irq_exit(); + } +#endif +} + /* * Exit an interrupt context. 
Process softirqs if needed and possible: */ @@ -346,11 +359,7 @@ void irq_exit(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); -#ifdef CONFIG_NO_HZ - /* Make sure that timer wheel updates are propagated */ - if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) - tick_nohz_irq_exit(); -#endif + tick_irq_exit(); rcu_irq_exit(); } @@ -620,8 +629,7 @@ static void remote_softirq_receive(void *data) unsigned long flags; int softirq; - softirq = cp->priv; - + softirq = *(int *)cp->info; local_irq_save(flags); __local_trigger(cp, softirq); local_irq_restore(flags); @@ -631,9 +639,8 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir { if (cpu_online(cpu)) { cp->func = remote_softirq_receive; - cp->info = cp; + cp->info = &softirq; cp->flags = 0; - cp->priv = softirq; __smp_call_function_single(cpu, cp, 0); return 0; diff --git a/kernel/sys.c b/kernel/sys.c index 0da73cf..b95d3c7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -49,6 +49,11 @@ #include <linux/user_namespace.h> #include <linux/binfmts.h> +#include <linux/sched.h> +#include <linux/rcupdate.h> +#include <linux/uidgid.h> +#include <linux/cred.h> + #include <linux/kmsg_dump.h> /* Move somewhere else to avoid recompiling? */ #include <generated/utsrelease.h> @@ -1044,6 +1049,67 @@ change_okay: return old_fsgid; } +/** + * sys_getpid - return the thread group id of the current process + * + * Note, despite the name, this returns the tgid not the pid. The tgid and + * the pid are identical unless CLONE_THREAD was specified on clone() in + * which case the tgid is the same in all threads of the same group. + * + * This is SMP safe as current->tgid does not change. + */ +SYSCALL_DEFINE0(getpid) +{ + return task_tgid_vnr(current); +} + +/* Thread ID - the internal kernel "pid" */ +SYSCALL_DEFINE0(gettid) +{ + return task_pid_vnr(current); +} + +/* + * Accessing ->real_parent is not SMP-safe, it could + * change from under us. 
However, we can use a stale + * value of ->real_parent under rcu_read_lock(), see + * release_task()->call_rcu(delayed_put_task_struct). + */ +SYSCALL_DEFINE0(getppid) +{ + int pid; + + rcu_read_lock(); + pid = task_tgid_vnr(rcu_dereference(current->real_parent)); + rcu_read_unlock(); + + return pid; +} + +SYSCALL_DEFINE0(getuid) +{ + /* Only we change this so SMP safe */ + return from_kuid_munged(current_user_ns(), current_uid()); +} + +SYSCALL_DEFINE0(geteuid) +{ + /* Only we change this so SMP safe */ + return from_kuid_munged(current_user_ns(), current_euid()); +} + +SYSCALL_DEFINE0(getgid) +{ + /* Only we change this so SMP safe */ + return from_kgid_munged(current_user_ns(), current_gid()); +} + +SYSCALL_DEFINE0(getegid) +{ + /* Only we change this so SMP safe */ + return from_kgid_munged(current_user_ns(), current_egid()); +} + void do_sys_times(struct tms *tms) { cputime_t tgutime, tgstime, cutime, cstime; @@ -1785,13 +1851,26 @@ SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) return getrusage(current, who, ru); } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru) +{ + struct rusage r; + + if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && + who != RUSAGE_THREAD) + return -EINVAL; + + k_getrusage(current, who, &r); + return put_compat_rusage(&r, ru); +} +#endif + SYSCALL_DEFINE1(umask, int, mask) { mask = xchg(&current->fs->umask, mask & S_IRWXUGO); return mask; } -#ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct fd exe; @@ -1985,17 +2064,12 @@ out: return error; } +#ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) { return put_user(me->clear_child_tid, tid_addr); } - -#else /* CONFIG_CHECKPOINT_RESTORE */ -static int prctl_set_mm(int opt, unsigned long addr, - unsigned long arg4, unsigned long arg5) -{ - return -EINVAL; -} +#else static int prctl_get_tid_address(struct
task_struct *me, int __user **tid_addr) { return -EINVAL; @@ -2245,3 +2319,148 @@ int orderly_poweroff(bool force) return 0; } EXPORT_SYMBOL_GPL(orderly_poweroff); + +/** + * do_sysinfo - fill in sysinfo struct + * @info: pointer to buffer to fill + */ +static int do_sysinfo(struct sysinfo *info) +{ + unsigned long mem_total, sav_total; + unsigned int mem_unit, bitcount; + struct timespec tp; + + memset(info, 0, sizeof(struct sysinfo)); + + ktime_get_ts(&tp); + monotonic_to_bootbased(&tp); + info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + + get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); + + info->procs = nr_threads; + + si_meminfo(info); + si_swapinfo(info); + + /* + * If the sum of all the available memory (i.e. ram + swap) + * is less than can be stored in a 32 bit unsigned long then + * we can be binary compatible with 2.2.x kernels. If not, + * well, in that case 2.2.x was broken anyways... + * + * -Erik Andersen <andersee@debian.org> + */ + + mem_total = info->totalram + info->totalswap; + if (mem_total < info->totalram || mem_total < info->totalswap) + goto out; + bitcount = 0; + mem_unit = info->mem_unit; + while (mem_unit > 1) { + bitcount++; + mem_unit >>= 1; + sav_total = mem_total; + mem_total <<= 1; + if (mem_total < sav_total) + goto out; + } + + /* + * If mem_total did not overflow, multiply all memory values by + * info->mem_unit and set it to 1. This leaves things compatible + * with 2.2.x, and also retains compatibility with earlier 2.4.x + * kernels... 
+ */ + + info->mem_unit = 1; + info->totalram <<= bitcount; + info->freeram <<= bitcount; + info->sharedram <<= bitcount; + info->bufferram <<= bitcount; + info->totalswap <<= bitcount; + info->freeswap <<= bitcount; + info->totalhigh <<= bitcount; + info->freehigh <<= bitcount; + +out: + return 0; +} + +SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) +{ + struct sysinfo val; + + do_sysinfo(&val); + + if (copy_to_user(info, &val, sizeof(struct sysinfo))) + return -EFAULT; + + return 0; +} + +#ifdef CONFIG_COMPAT +struct compat_sysinfo { + s32 uptime; + u32 loads[3]; + u32 totalram; + u32 freeram; + u32 sharedram; + u32 bufferram; + u32 totalswap; + u32 freeswap; + u16 procs; + u16 pad; + u32 totalhigh; + u32 freehigh; + u32 mem_unit; + char _f[20-2*sizeof(u32)-sizeof(int)]; +}; + +COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) +{ + struct sysinfo s; + + do_sysinfo(&s); + + /* Check to see if any memory value is too large for 32-bit and scale + * down if needed + */ + if ((s.totalram >> 32) || (s.totalswap >> 32)) { + int bitcount = 0; + + while (s.mem_unit < PAGE_SIZE) { + s.mem_unit <<= 1; + bitcount++; + } + + s.totalram >>= bitcount; + s.freeram >>= bitcount; + s.sharedram >>= bitcount; + s.bufferram >>= bitcount; + s.totalswap >>= bitcount; + s.freeswap >>= bitcount; + s.totalhigh >>= bitcount; + s.freehigh >>= bitcount; + } + + if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) || + __put_user(s.uptime, &info->uptime) || + __put_user(s.loads[0], &info->loads[0]) || + __put_user(s.loads[1], &info->loads[1]) || + __put_user(s.loads[2], &info->loads[2]) || + __put_user(s.totalram, &info->totalram) || + __put_user(s.freeram, &info->freeram) || + __put_user(s.sharedram, &info->sharedram) || + __put_user(s.bufferram, &info->bufferram) || + __put_user(s.totalswap, &info->totalswap) || + __put_user(s.freeswap, &info->freeswap) || + __put_user(s.procs, &info->procs) || + __put_user(s.totalhigh, &info->totalhigh) || + 
__put_user(s.freehigh, &info->freehigh) || + __put_user(s.mem_unit, &info->mem_unit)) + return -EFAULT; + + return 0; +} +#endif /* CONFIG_COMPAT */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 395084d..7078052 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -20,6 +20,7 @@ cond_syscall(sys_quotactl); cond_syscall(sys32_quotactl); cond_syscall(sys_acct); cond_syscall(sys_lookup_dcookie); +cond_syscall(compat_sys_lookup_dcookie); cond_syscall(sys_swapon); cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); @@ -155,7 +156,7 @@ cond_syscall(compat_sys_process_vm_writev); cond_syscall(sys_pciconfig_read); cond_syscall(sys_pciconfig_write); cond_syscall(sys_pciconfig_iobase); -cond_syscall(sys32_ipc); +cond_syscall(compat_sys_s390_ipc); cond_syscall(ppc_rtas); cond_syscall(sys_spu_run); cond_syscall(sys_spu_create); @@ -199,6 +200,7 @@ cond_syscall(sys_perf_event_open); /* fanotify! */ cond_syscall(sys_fanotify_init); cond_syscall(sys_fanotify_mark); +cond_syscall(compat_sys_fanotify_mark); /* open by handle */ cond_syscall(sys_name_to_handle_at); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index ebf7235..aea4a9e 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -15,6 +15,7 @@ #include <linux/netdevice.h> #include <linux/kernel.h> #include <linux/slab.h> +#include <linux/compat.h> #ifdef CONFIG_SYSCTL_SYSCALL @@ -1447,7 +1448,6 @@ SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) #ifdef CONFIG_COMPAT -#include <asm/compat.h> struct compat_sysctl_args { compat_uptr_t name; @@ -1459,7 +1459,7 @@ struct compat_sysctl_args { compat_ulong_t __unused[4]; }; -asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args) +COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args) { struct compat_sysctl_args tmp; compat_size_t __user *compat_oldlenp; diff --git a/kernel/time.c b/kernel/time.c index f8342a4..d3617db 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -138,13 +138,14 @@ int 
persistent_clock_is_local; */ static inline void warp_clock(void) { - struct timespec adjust; + if (sys_tz.tz_minuteswest != 0) { + struct timespec adjust; - adjust = current_kernel_time(); - if (sys_tz.tz_minuteswest != 0) persistent_clock_is_local = 1; - adjust.tv_sec += sys_tz.tz_minuteswest * 60; - do_settimeofday(&adjust); + adjust.tv_sec = sys_tz.tz_minuteswest * 60; + adjust.tv_nsec = 0; + timekeeping_inject_offset(&adjust); + } } /* diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 24510d8..e4c07b0 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -64,20 +64,88 @@ config GENERIC_CMOS_UPDATE if GENERIC_CLOCKEVENTS menu "Timers subsystem" -# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is +# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices # are supported independ of this. config TICK_ONESHOT bool -config NO_HZ - bool "Tickless System (Dynamic Ticks)" +config NO_HZ_COMMON + bool depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT + +choice + prompt "Timer tick handling" + default NO_HZ_IDLE if NO_HZ + +config HZ_PERIODIC + bool "Periodic timer ticks (constant rate, no dynticks)" + help + This option keeps the tick running periodically at a constant + rate, even when the CPU doesn't need it. + +config NO_HZ_IDLE + bool "Idle dynticks system (tickless idle)" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select NO_HZ_COMMON + help + This option enables a tickless idle system: timer interrupts + will only trigger on an as-needed basis when the system is idle. + This is usually interesting for energy saving. + + Most of the time you want to say Y here. 
+ +config NO_HZ_FULL + bool "Full dynticks system (tickless)" + # NO_HZ_COMMON dependency + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + # We need at least one periodic CPU for timekeeping + depends on SMP + # RCU_USER_QS dependency + depends on HAVE_CONTEXT_TRACKING + # VIRT_CPU_ACCOUNTING_GEN dependency + depends on 64BIT + select NO_HZ_COMMON + select RCU_USER_QS + select RCU_NOCB_CPU + select VIRT_CPU_ACCOUNTING_GEN + select CONTEXT_TRACKING_FORCE + select IRQ_WORK + help + Adaptively try to shutdown the tick whenever possible, even when + the CPU is running tasks. Typically this requires running a single + task on the CPU. Chances for running tickless are maximized when + the task mostly runs in userspace and has few kernel activity. + + You need to fill up the nohz_full boot parameter with the + desired range of dynticks CPUs. + + This is implemented at the expense of some overhead in user <-> kernel + transitions: syscalls, exceptions and interrupts. Even when it's + dynamically off. + + Say N. + +endchoice + +config NO_HZ_FULL_ALL + bool "Full dynticks system on all CPUs by default" + depends on NO_HZ_FULL + help + If the user doesn't pass the nohz_full boot option to + define the range of full dynticks CPUs, consider that all + CPUs in the system are full dynticks by default. + Note the boot CPU will still be kept outside the range to + handle the timekeeping duty. + +config NO_HZ + bool "Old Idle dynticks config" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS help - This option enables a tickless system: timer interrupts will - only trigger on an as-needed basis both when the system is - busy and when the system is idle. + This is the old config entry that enables dynticks idle. + We keep it around for a little while to enforce backward + compatibility with older config files. 
config HIGH_RES_TIMERS bool "High Resolution Timer Support" diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 072bb06..12ff13a 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -18,13 +18,14 @@ #include <linux/rtc.h> #include "tick-internal.h" +#include "ntp_internal.h" /* * NTP timekeeping variables: + * + * Note: All of the NTP state is protected by the timekeeping locks. */ -DEFINE_RAW_SPINLOCK(ntp_lock); - /* USER_HZ period (usecs): */ unsigned long tick_usec = TICK_USEC; @@ -53,9 +54,6 @@ static int time_state = TIME_OK; /* clock status bits: */ static int time_status = STA_UNSYNC; -/* TAI offset (secs): */ -static long time_tai; - /* time adjustment (nsecs): */ static s64 time_offset; @@ -134,8 +132,6 @@ static inline void pps_reset_freq_interval(void) /** * pps_clear - Clears the PPS state variables - * - * Must be called while holding a write on the ntp_lock */ static inline void pps_clear(void) { @@ -150,8 +146,6 @@ static inline void pps_clear(void) /* Decrease pps_valid to indicate that another second has passed since * the last PPS signal. When it reaches 0, indicate that PPS signal is * missing. 
- * - * Must be called while holding a write on the ntp_lock */ static inline void pps_dec_valid(void) { @@ -346,10 +340,6 @@ static void ntp_update_offset(long offset) */ void ntp_clear(void) { - unsigned long flags; - - raw_spin_lock_irqsave(&ntp_lock, flags); - time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; @@ -362,20 +352,12 @@ void ntp_clear(void) /* Clear PPS state variables */ pps_clear(); - raw_spin_unlock_irqrestore(&ntp_lock, flags); - } u64 ntp_tick_length(void) { - unsigned long flags; - s64 ret; - - raw_spin_lock_irqsave(&ntp_lock, flags); - ret = tick_length; - raw_spin_unlock_irqrestore(&ntp_lock, flags); - return ret; + return tick_length; } @@ -393,9 +375,6 @@ int second_overflow(unsigned long secs) { s64 delta; int leap = 0; - unsigned long flags; - - raw_spin_lock_irqsave(&ntp_lock, flags); /* * Leap second processing. If in leap-insert state at the end of the @@ -415,7 +394,6 @@ int second_overflow(unsigned long secs) else if (secs % 86400 == 0) { leap = -1; time_state = TIME_OOP; - time_tai++; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); } @@ -425,7 +403,6 @@ int second_overflow(unsigned long secs) time_state = TIME_OK; else if ((secs + 1) % 86400 == 0) { leap = 1; - time_tai--; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); @@ -479,8 +456,6 @@ int second_overflow(unsigned long secs) time_adjust = 0; out: - raw_spin_unlock_irqrestore(&ntp_lock, flags); - return leap; } @@ -575,11 +550,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) time_status |= txc->status & ~STA_RONLY; } -/* - * Called with ntp_lock held, so we can access and modify - * all the global NTP state: - */ -static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) + +static inline void process_adjtimex_modes(struct timex *txc, + struct timespec *ts, + s32 *time_tai) { if (txc->modes & ADJ_STATUS) 
process_adj_status(txc, ts); @@ -613,7 +587,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts } if (txc->modes & ADJ_TAI && txc->constant > 0) - time_tai = txc->constant; + *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) ntp_update_offset(txc->offset); @@ -625,16 +599,13 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts ntp_update_frequency(); } -/* - * adjtimex mainly allows reading (and writing, if superuser) of - * kernel time-keeping variables. used by xntpd. + + +/** + * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -int do_adjtimex(struct timex *txc) +int ntp_validate_timex(struct timex *txc) { - struct timespec ts; - int result; - - /* Validate the data before disabling interrupts */ if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) @@ -646,7 +617,6 @@ int do_adjtimex(struct timex *txc) /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; - /* * if the quartz is off by more than 10% then * something is VERY wrong! @@ -657,22 +627,20 @@ int do_adjtimex(struct timex *txc) return -EINVAL; } - if (txc->modes & ADJ_SETOFFSET) { - struct timespec delta; - delta.tv_sec = txc->time.tv_sec; - delta.tv_nsec = txc->time.tv_usec; - if (!capable(CAP_SYS_TIME)) - return -EPERM; - if (!(txc->modes & ADJ_NANO)) - delta.tv_nsec *= 1000; - result = timekeeping_inject_offset(&delta); - if (result) - return result; - } + if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) + return -EPERM; - getnstimeofday(&ts); + return 0; +} - raw_spin_lock_irq(&ntp_lock); + +/* + * adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. 
+ */ +int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) +{ + int result; if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -687,7 +655,7 @@ int do_adjtimex(struct timex *txc) /* If there are input parameters, then process them: */ if (txc->modes) - process_adjtimex_modes(txc, &ts); + process_adjtimex_modes(txc, ts, time_tai); txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); @@ -709,15 +677,13 @@ int do_adjtimex(struct timex *txc) txc->precision = 1; txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; txc->tick = tick_usec; - txc->tai = time_tai; + txc->tai = *time_tai; /* fill PPS status fields */ pps_fill_timex(txc); - raw_spin_unlock_irq(&ntp_lock); - - txc->time.tv_sec = ts.tv_sec; - txc->time.tv_usec = ts.tv_nsec; + txc->time.tv_sec = ts->tv_sec; + txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; @@ -894,7 +860,7 @@ static void hardpps_update_phase(long error) } /* - * hardpps() - discipline CPU clock oscillator to external PPS signal + * __hardpps() - discipline CPU clock oscillator to external PPS signal * * This routine is called at each PPS signal arrival in order to * discipline the CPU clock oscillator to the PPS signal. It takes two @@ -905,15 +871,13 @@ static void hardpps_update_phase(long error) * This code is based on David Mills's reference nanokernel * implementation. It was mostly rewritten but keeps the same idea. 
*/ -void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) { struct pps_normtime pts_norm, freq_norm; unsigned long flags; pts_norm = pps_normalize_ts(*phase_ts); - raw_spin_lock_irqsave(&ntp_lock, flags); - /* clear the error bits, they will be set again if needed */ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); @@ -925,7 +889,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) * just start the frequency interval */ if (unlikely(pps_fbase.tv_sec == 0)) { pps_fbase = *raw_ts; - raw_spin_unlock_irqrestore(&ntp_lock, flags); return; } @@ -940,7 +903,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - raw_spin_unlock_irqrestore(&ntp_lock, flags); pr_err("hardpps: PPSJITTER: bad pulse\n"); return; } @@ -957,10 +919,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) hardpps_update_phase(pts_norm.nsec); - raw_spin_unlock_irqrestore(&ntp_lock, flags); } -EXPORT_SYMBOL(hardpps); - #endif /* CONFIG_NTP_PPS */ static int __init ntp_tick_adj_setup(char *str) diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h new file mode 100644 index 0000000..1950cb4 --- /dev/null +++ b/kernel/time/ntp_internal.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_NTP_INTERNAL_H +#define _LINUX_NTP_INTERNAL_H + +extern void ntp_init(void); +extern void ntp_clear(void); +/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. 
*/ +extern u64 ntp_tick_length(void); +extern int second_overflow(unsigned long secs); +extern int ntp_validate_timex(struct timex *); +extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); +extern void __hardpps(const struct timespec *, const struct timespec *); +#endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 7f32fe0..206bbfb 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -28,9 +28,8 @@ */ static struct tick_device tick_broadcast_device; -/* FIXME: Use cpumask_var_t. */ -static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); -static DECLARE_BITMAP(tmpmask, NR_CPUS); +static cpumask_var_t tick_broadcast_mask; +static cpumask_var_t tmpmask; static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; @@ -50,7 +49,7 @@ struct tick_device *tick_get_broadcast_device(void) struct cpumask *tick_get_broadcast_mask(void) { - return to_cpumask(tick_broadcast_mask); + return tick_broadcast_mask; } /* @@ -67,6 +66,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) */ int tick_check_broadcast_device(struct clock_event_device *dev) { + struct clock_event_device *cur = tick_broadcast_device.evtdev; + if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || (tick_broadcast_device.evtdev && tick_broadcast_device.evtdev->rating >= dev->rating) || @@ -74,9 +75,21 @@ int tick_check_broadcast_device(struct clock_event_device *dev) return 0; clockevents_exchange_device(tick_broadcast_device.evtdev, dev); + if (cur) + cur->event_handler = clockevents_handle_noop; tick_broadcast_device.evtdev = dev; - if (!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(dev); + /* + * Inform all cpus about this. 
We might be in a situation + * where we did not switch to oneshot mode because the per cpu + * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack + * of a oneshot capable broadcast device. Without that + * notification the systems stays stuck in periodic mode + * forever. + */ + if (dev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_clock_notify(); return 1; } @@ -124,7 +137,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) if (!tick_device_is_functional(dev)) { dev->event_handler = tick_handle_periodic; tick_device_setup_broadcast_func(dev); - cpumask_set_cpu(cpu, tick_get_broadcast_mask()); + cpumask_set_cpu(cpu, tick_broadcast_mask); tick_broadcast_start_periodic(tick_broadcast_device.evtdev); ret = 1; } else { @@ -135,7 +148,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) */ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { int cpu = smp_processor_id(); - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_mask); tick_broadcast_clear_oneshot(cpu); } else { tick_device_setup_broadcast_func(dev); @@ -199,9 +212,8 @@ static void tick_do_periodic_broadcast(void) { raw_spin_lock(&tick_broadcast_lock); - cpumask_and(to_cpumask(tmpmask), - cpu_online_mask, tick_get_broadcast_mask()); - tick_do_broadcast(to_cpumask(tmpmask)); + cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); + tick_do_broadcast(tmpmask); raw_spin_unlock(&tick_broadcast_lock); } @@ -264,13 +276,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason) if (!tick_device_is_functional(dev)) goto out; - bc_stopped = cpumask_empty(tick_get_broadcast_mask()); + bc_stopped = cpumask_empty(tick_broadcast_mask); switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { - cpumask_set_cpu(cpu, tick_get_broadcast_mask()); + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { if 
(tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); @@ -280,8 +291,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: if (!tick_broadcast_force && - cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); @@ -289,7 +299,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) break; } - if (cpumask_empty(tick_get_broadcast_mask())) { + if (cpumask_empty(tick_broadcast_mask)) { if (!bc_stopped) clockevents_shutdown(bc); } else if (bc_stopped) { @@ -338,10 +348,10 @@ void tick_shutdown_broadcast(unsigned int *cpup) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; - cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_mask); if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { - if (bc && cpumask_empty(tick_get_broadcast_mask())) + if (bc && cpumask_empty(tick_broadcast_mask)) clockevents_shutdown(bc); } @@ -377,13 +387,13 @@ int tick_resume_broadcast(void) switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: - if (!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(bc); broadcast = cpumask_test_cpu(smp_processor_id(), - tick_get_broadcast_mask()); + tick_broadcast_mask); break; case TICKDEV_MODE_ONESHOT: - if (!cpumask_empty(tick_get_broadcast_mask())) + if (!cpumask_empty(tick_broadcast_mask)) broadcast = tick_resume_broadcast_oneshot(bc); break; } @@ -396,25 +406,58 @@ int tick_resume_broadcast(void) #ifdef CONFIG_TICK_ONESHOT -/* FIXME: use cpumask_var_t. 
*/ -static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); +static cpumask_var_t tick_broadcast_oneshot_mask; +static cpumask_var_t tick_broadcast_pending_mask; +static cpumask_var_t tick_broadcast_force_mask; /* * Exposed for debugging: see timer_list.c */ struct cpumask *tick_get_broadcast_oneshot_mask(void) { - return to_cpumask(tick_broadcast_oneshot_mask); + return tick_broadcast_oneshot_mask; } -static int tick_broadcast_set_event(ktime_t expires, int force) +/* + * Called before going idle with interrupts disabled. Checks whether a + * broadcast event from the other core is about to happen. We detected + * that in tick_broadcast_oneshot_control(). The callsite can use this + * to avoid a deep idle transition as we are about to get the + * broadcast IPI right away. + */ +int tick_check_broadcast_expired(void) { - struct clock_event_device *bc = tick_broadcast_device.evtdev; + return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); +} + +/* + * Set broadcast interrupt affinity + */ +static void tick_broadcast_set_affinity(struct clock_event_device *bc, + const struct cpumask *cpumask) +{ + if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ)) + return; + + if (cpumask_equal(bc->cpumask, cpumask)) + return; + + bc->cpumask = cpumask; + irq_set_affinity(bc->irq, bc->cpumask); +} + +static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, + ktime_t expires, int force) +{ + int ret; if (bc->mode != CLOCK_EVT_MODE_ONESHOT) clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - return clockevents_program_event(bc, expires, force); + ret = clockevents_program_event(bc, expires, force); + if (!ret) + tick_broadcast_set_affinity(bc, cpumask_of(cpu)); + return ret; } int tick_resume_broadcast_oneshot(struct clock_event_device *bc) @@ -429,7 +472,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) */ void tick_check_oneshot_broadcast(int cpu) { - if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { + if 
(cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { struct tick_device *td = &per_cpu(tick_cpu_device, cpu); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); @@ -443,27 +486,39 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) { struct tick_device *td; ktime_t now, next_event; - int cpu; + int cpu, next_cpu = 0; raw_spin_lock(&tick_broadcast_lock); again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; - cpumask_clear(to_cpumask(tmpmask)); + cpumask_clear(tmpmask); now = ktime_get(); /* Find all expired events */ - for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { + for_each_cpu(cpu, tick_broadcast_oneshot_mask) { td = &per_cpu(tick_cpu_device, cpu); - if (td->evtdev->next_event.tv64 <= now.tv64) - cpumask_set_cpu(cpu, to_cpumask(tmpmask)); - else if (td->evtdev->next_event.tv64 < next_event.tv64) + if (td->evtdev->next_event.tv64 <= now.tv64) { + cpumask_set_cpu(cpu, tmpmask); + /* + * Mark the remote cpu in the pending mask, so + * it can avoid reprogramming the cpu local + * timer in tick_broadcast_oneshot_control(). + */ + cpumask_set_cpu(cpu, tick_broadcast_pending_mask); + } else if (td->evtdev->next_event.tv64 < next_event.tv64) { next_event.tv64 = td->evtdev->next_event.tv64; + next_cpu = cpu; + } } + /* Take care of enforced broadcast requests */ + cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); + cpumask_clear(tick_broadcast_force_mask); + /* * Wakeup the cpus which have an expired event. */ - tick_do_broadcast(to_cpumask(tmpmask)); + tick_do_broadcast(tmpmask); /* * Two reasons for reprogram: @@ -480,7 +535,7 @@ again: * Rearm the broadcast device. 
If event expired, * repeat the above */ - if (tick_broadcast_set_event(next_event, 0)) + if (tick_broadcast_set_event(dev, next_cpu, next_event, 0)) goto again; } raw_spin_unlock(&tick_broadcast_lock); @@ -495,6 +550,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) struct clock_event_device *bc, *dev; struct tick_device *td; unsigned long flags; + ktime_t now; int cpu; /* @@ -519,21 +575,84 @@ void tick_broadcast_oneshot_control(unsigned long reason) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { - if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { - cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); + WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); - if (dev->next_event.tv64 < bc->next_event.tv64) - tick_broadcast_set_event(dev->next_event, 1); + /* + * We only reprogram the broadcast timer if we + * did not mark ourself in the force mask and + * if the cpu local event is earlier than the + * broadcast event. If the current CPU is in + * the force mask, then we are going to be + * woken by the IPI right away. + */ + if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && + dev->next_event.tv64 < bc->next_event.tv64) + tick_broadcast_set_event(bc, cpu, dev->next_event, 1); } } else { - if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { - cpumask_clear_cpu(cpu, - tick_get_broadcast_oneshot_mask()); + if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - if (dev->next_event.tv64 != KTIME_MAX) - tick_program_event(dev->next_event, 1); + if (dev->next_event.tv64 == KTIME_MAX) + goto out; + /* + * The cpu which was handling the broadcast + * timer marked this cpu in the broadcast + * pending mask and fired the broadcast + * IPI. 
So we are going to handle the expired + * event anyway via the broadcast IPI + * handler. No need to reprogram the timer + * with an already expired event. + */ + if (cpumask_test_and_clear_cpu(cpu, + tick_broadcast_pending_mask)) + goto out; + + /* + * If the pending bit is not set, then we are + * either the CPU handling the broadcast + * interrupt or we got woken by something else. + * + * We are not longer in the broadcast mask, so + * if the cpu local expiry time is already + * reached, we would reprogram the cpu local + * timer with an already expired event. + * + * This can lead to a ping-pong when we return + * to idle and therefor rearm the broadcast + * timer before the cpu local timer was able + * to fire. This happens because the forced + * reprogramming makes sure that the event + * will happen in the future and depending on + * the min_delta setting this might be far + * enough out that the ping-pong starts. + * + * If the cpu local next_event has expired + * then we know that the broadcast timer + * next_event has expired as well and + * broadcast is about to be handled. So we + * avoid reprogramming and enforce that the + * broadcast handler, which did not run yet, + * will invoke the cpu local handler. + * + * We cannot call the handler directly from + * here, because we might be in a NOHZ phase + * and we did not go through the irq_enter() + * nohz fixups. + */ + now = ktime_get(); + if (dev->next_event.tv64 <= now.tv64) { + cpumask_set_cpu(cpu, tick_broadcast_force_mask); + goto out; + } + /* + * We got woken by something else. Reprogram + * the cpu local timer device. 
+ */ + tick_program_event(dev->next_event, 1); } } +out: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -544,7 +663,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) */ static void tick_broadcast_clear_oneshot(int cpu) { - cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } static void tick_broadcast_init_next_event(struct cpumask *mask, @@ -574,7 +693,8 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) bc->event_handler = tick_handle_oneshot_broadcast; /* Take the do_timer update */ - tick_do_timer_cpu = cpu; + if (!tick_nohz_full_cpu(cpu)) + tick_do_timer_cpu = cpu; /* * We must be careful here. There might be other CPUs @@ -582,17 +702,16 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) * oneshot_mask bits for those and program the * broadcast device to fire. */ - cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); - cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); - cpumask_or(tick_get_broadcast_oneshot_mask(), - tick_get_broadcast_oneshot_mask(), - to_cpumask(tmpmask)); + cpumask_copy(tmpmask, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tmpmask); + cpumask_or(tick_broadcast_oneshot_mask, + tick_broadcast_oneshot_mask, tmpmask); - if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { + if (was_periodic && !cpumask_empty(tmpmask)) { clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - tick_broadcast_init_next_event(to_cpumask(tmpmask), + tick_broadcast_init_next_event(tmpmask, tick_next_period); - tick_broadcast_set_event(tick_next_period, 1); + tick_broadcast_set_event(bc, cpu, tick_next_period, 1); } else bc->next_event.tv64 = KTIME_MAX; } else { @@ -640,7 +759,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) * Clear the broadcast mask flag for the dead cpu, but do not * stop the broadcast device! 
*/ - cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -664,3 +783,14 @@ bool tick_broadcast_oneshot_available(void) } #endif + +void __init tick_broadcast_init(void) +{ + alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); + alloc_cpumask_var(&tmpmask, GFP_NOWAIT); +#ifdef CONFIG_TICK_ONESHOT + alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); + alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); + alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); +#endif +} diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b1600a6..5d3fb10 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -163,7 +163,10 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - tick_do_timer_cpu = cpu; + if (!tick_nohz_full_cpu(cpu)) + tick_do_timer_cpu = cpu; + else + tick_do_timer_cpu = TICK_DO_TIMER_NONE; tick_next_period = ktime_get(); tick_period = ktime_set(0, NSEC_PER_SEC / HZ); } @@ -323,6 +326,7 @@ static void tick_shutdown(unsigned int *cpup) */ dev->mode = CLOCK_EVT_MODE_UNUSED; clockevents_exchange_device(dev, NULL); + dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; } raw_spin_unlock_irqrestore(&tick_device_lock, flags); @@ -416,4 +420,5 @@ static struct notifier_block tick_notifier = { void __init tick_init(void) { clockevents_register_notifier(&tick_notifier); + tick_broadcast_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index cf3e59e..f0299ea 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,6 +4,8 @@ #include <linux/hrtimer.h> #include <linux/tick.h> +extern seqlock_t jiffies_lock; + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 @@ -94,7 +96,7 @@ extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); 
extern void tick_shutdown_broadcast(unsigned int *cpup); extern void tick_suspend_broadcast(void); extern int tick_resume_broadcast(void); - +extern void tick_broadcast_init(void); extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); @@ -119,6 +121,7 @@ static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } static inline void tick_shutdown_broadcast(unsigned int *cpup) { } static inline void tick_suspend_broadcast(void) { } static inline int tick_resume_broadcast(void) { return 0; } +static inline void tick_broadcast_init(void) { } /* * Set the periodic handler in non broadcast mode diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a19a399..bc67d42 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -21,11 +21,15 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/irq_work.h> +#include <linux/posix-timers.h> +#include <linux/perf_event.h> #include <asm/irq_regs.h> #include "tick-internal.h" +#include <trace/events/timer.h> + /* * Per cpu nohz control structure */ @@ -104,7 +108,7 @@ static void tick_sched_do_timer(ktime_t now) { int cpu = smp_processor_id(); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the cpu in charge went @@ -112,7 +116,8 @@ static void tick_sched_do_timer(ktime_t now) * this duty, then the jiffies update is still serialized by * jiffies_lock. 
*/ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) + && !tick_nohz_full_cpu(cpu)) tick_do_timer_cpu = cpu; #endif @@ -123,7 +128,7 @@ static void tick_sched_do_timer(ktime_t now) static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long @@ -142,10 +147,226 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) profile_tick(CPU_PROFILING); } +#ifdef CONFIG_NO_HZ_FULL +static cpumask_var_t nohz_full_mask; +bool have_nohz_full_mask; + +static bool can_stop_full_tick(void) +{ + WARN_ON_ONCE(!irqs_disabled()); + + if (!sched_can_stop_tick()) { + trace_tick_stop(0, "more than 1 task in runqueue\n"); + return false; + } + + if (!posix_cpu_timers_can_stop_tick(current)) { + trace_tick_stop(0, "posix timers running\n"); + return false; + } + + if (!perf_event_can_stop_tick()) { + trace_tick_stop(0, "perf events running\n"); + return false; + } + + /* sched_clock_tick() needs us? */ +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + /* + * TODO: kick full dynticks CPUs when + * sched_clock_stable is set. + */ + if (!sched_clock_stable) { + trace_tick_stop(0, "unstable sched clock\n"); + return false; + } +#endif + + return true; +} + +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); + +/* + * Re-evaluate the need for the tick on the current CPU + * and restart it if necessary. 
+ */ +void tick_nohz_full_check(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + if (tick_nohz_full_cpu(smp_processor_id())) { + if (ts->tick_stopped && !is_idle_task(current)) { + if (!can_stop_full_tick()) + tick_nohz_restart_sched_tick(ts, ktime_get()); + } + } +} + +static void nohz_full_kick_work_func(struct irq_work *work) +{ + tick_nohz_full_check(); +} + +static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { + .func = nohz_full_kick_work_func, +}; + +/* + * Kick the current CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick(void) +{ + if (tick_nohz_full_cpu(smp_processor_id())) + irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); +} + +static void nohz_full_kick_ipi(void *info) +{ + tick_nohz_full_check(); +} + +/* + * Kick all full dynticks CPUs in order to force these to re-evaluate + * their dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick_all(void) +{ + if (!have_nohz_full_mask) + return; + + preempt_disable(); + smp_call_function_many(nohz_full_mask, + nohz_full_kick_ipi, NULL, false); + preempt_enable(); +} + +/* + * Re-evaluate the need for the tick as we switch the current task. + * It might need the tick due to per task/process properties: + * perf events, posix cpu timers, ... + */ +void tick_nohz_task_switch(struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + + if (!tick_nohz_full_cpu(smp_processor_id())) + goto out; + + if (tick_nohz_tick_stopped() && !can_stop_full_tick()) + tick_nohz_full_kick(); + +out: + local_irq_restore(flags); +} + +int tick_nohz_full_cpu(int cpu) +{ + if (!have_nohz_full_mask) + return 0; + + return cpumask_test_cpu(cpu, nohz_full_mask); +} + +/* Parse the boot-time nohz CPU list from the kernel parameters. 
*/ +static int __init tick_nohz_full_setup(char *str) +{ + int cpu; + + alloc_bootmem_cpumask_var(&nohz_full_mask); + if (cpulist_parse(str, nohz_full_mask) < 0) { + pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); + return 1; + } + + cpu = smp_processor_id(); + if (cpumask_test_cpu(cpu, nohz_full_mask)) { + pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); + cpumask_clear_cpu(cpu, nohz_full_mask); + } + have_nohz_full_mask = true; + + return 1; +} +__setup("nohz_full=", tick_nohz_full_setup); + +static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + /* + * If we handle the timekeeping duty for full dynticks CPUs, + * we can't safely shutdown that CPU. + */ + if (have_nohz_full_mask && tick_do_timer_cpu == cpu) + return -EINVAL; + break; + } + return NOTIFY_OK; +} + +/* + * Worst case string length in chunks of CPU range seems 2 steps + * separations: 0,2,4,6,... 
+ * This is NR_CPUS + sizeof('\0') + */ +static char __initdata nohz_full_buf[NR_CPUS + 1]; + +static int tick_nohz_init_all(void) +{ + int err = -1; + +#ifdef CONFIG_NO_HZ_FULL_ALL + if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { + pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); + return err; + } + err = 0; + cpumask_setall(nohz_full_mask); + cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); + have_nohz_full_mask = true; +#endif + return err; +} + +void __init tick_nohz_init(void) +{ + int cpu; + + if (!have_nohz_full_mask) { + if (tick_nohz_init_all() < 0) + return; + } + + cpu_notifier(tick_nohz_cpu_down_callback, 0); + + /* Make sure full dynticks CPU are also RCU nocbs */ + for_each_cpu(cpu, nohz_full_mask) { + if (!rcu_is_nocb_cpu(cpu)) { + pr_warning("NO_HZ: CPU %d is not RCU nocb: " + "cleared from nohz_full range", cpu); + cpumask_clear_cpu(cpu, nohz_full_mask); + } + } + + cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); + pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); +} +#else +#define have_nohz_full_mask (0) +#endif + /* * NOHZ - aka dynamic tick functionality */ -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? 
*/ @@ -345,11 +566,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, delta_jiffies = rcu_delta_jiffies; } } + /* - * Do not stop the tick, if we are only one off - * or if the cpu is required for rcu + * Do not stop the tick, if we are only one off (or less) + * or if the cpu is required for RCU: */ - if (!ts->tick_stopped && delta_jiffies == 1) + if (!ts->tick_stopped && delta_jiffies <= 1) goto out; /* Schedule the tick, if we are at least one jiffie off */ @@ -378,6 +600,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, time_delta = KTIME_MAX; } +#ifdef CONFIG_NO_HZ_FULL + if (!ts->inidle) { + time_delta = min(time_delta, + scheduler_tick_max_deferment()); + } +#endif + /* * calculate the expiry time for the next timer wheel * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals @@ -421,6 +650,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; + trace_tick_stop(1, " "); } /* @@ -457,6 +687,24 @@ out: return ret; } +static void tick_nohz_full_stop_tick(struct tick_sched *ts) +{ +#ifdef CONFIG_NO_HZ_FULL + int cpu = smp_processor_id(); + + if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) + return; + + if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) + return; + + if (!can_stop_full_tick()) + return; + + tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); +#endif +} + static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { /* @@ -482,13 +730,28 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (ratelimit < 10 && (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - (unsigned int) local_softirq_pending()); + pr_warn("NOHZ: local_softirq_pending %02x\n", + (unsigned int) local_softirq_pending()); ratelimit++; } return false; } + if (have_nohz_full_mask) { + /* + * Keep the tick alive to guarantee timekeeping progression + * if there 
are full dynticks CPUs around + */ + if (tick_do_timer_cpu == cpu) + return false; + /* + * Boot safety: make sure the timekeeping duty has been + * assigned before entering dyntick-idle mode, + */ + if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) + return false; + } + return true; } @@ -568,12 +831,13 @@ void tick_nohz_irq_exit(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); - if (!ts->inidle) - return; - - /* Cancel the timer because CPU already waken up from the C-states*/ - menu_hrtimer_cancel(); - __tick_nohz_idle_enter(ts); + if (ts->inidle) { + /* Cancel the timer because CPU already waken up from the C-states*/ + menu_hrtimer_cancel(); + __tick_nohz_idle_enter(ts); + } else { + tick_nohz_full_stop_tick(ts); + } } /** @@ -802,7 +1066,7 @@ static inline void tick_check_nohz(int cpu) static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_check_nohz(int cpu) { } -#endif /* NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter to notify about the possible interruption of idle() @@ -887,14 +1151,14 @@ void tick_setup_sched_timer(void) now = ktime_get(); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON if (tick_nohz_enabled) ts->nohz_mode = NOHZ_MODE_HIGHRES; #endif } #endif /* HIGH_RES_TIMERS */ -#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9a0bc98..98cd470 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -23,8 +23,13 @@ #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> +#include "tick-internal.h" +#include "ntp_internal.h" static struct timekeeper timekeeper; +static DEFINE_RAW_SPINLOCK(timekeeper_lock); +static seqcount_t timekeeper_seq; +static struct timekeeper shadow_timekeeper; /* flag for if timekeeping is suspended */ 
int __read_mostly timekeeping_suspended; @@ -67,6 +72,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) tk->wall_to_monotonic = wtm; set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); tk->offs_real = timespec_to_ktime(tmp); + tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0)); } static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) @@ -96,7 +102,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) old_clock = tk->clock; tk->clock = clock; - clock->cycle_last = clock->read(clock); + tk->cycle_last = clock->cycle_last = clock->read(clock); /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; @@ -201,8 +207,6 @@ static void update_pvclock_gtod(struct timekeeper *tk) /** * pvclock_gtod_register_notifier - register a pvclock timedata update listener - * - * Must hold write on timekeeper.lock */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { @@ -210,11 +214,10 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) unsigned long flags; int ret; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); - /* update timekeeping data */ update_pvclock_gtod(tk); - write_sequnlock_irqrestore(&tk->lock, flags); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -223,25 +226,22 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); /** * pvclock_gtod_unregister_notifier - unregister a pvclock * timedata update listener - * - * Must hold write on timekeeper.lock */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { - struct timekeeper *tk = &timekeeper; unsigned long flags; int ret; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); - write_sequnlock_irqrestore(&tk->lock, flags); + 
raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); -/* must hold write on timekeeper.lock */ -static void timekeeping_update(struct timekeeper *tk, bool clearntp) +/* must hold timekeeper_lock */ +static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) { if (clearntp) { tk->ntp_error = 0; @@ -249,6 +249,9 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) } update_vsyscall(tk); update_pvclock_gtod(tk); + + if (mirror) + memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); } /** @@ -267,7 +270,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) clock = tk->clock; cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - clock->cycle_last = cycle_now; + tk->cycle_last = clock->cycle_last = cycle_now; tk->xtime_nsec += cycle_delta * tk->mult; @@ -294,12 +297,12 @@ int __getnstimeofday(struct timespec *ts) s64 nsecs = 0; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -335,11 +338,11 @@ ktime_t ktime_get(void) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. 
@@ -366,12 +369,12 @@ void ktime_get_ts(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; @@ -379,6 +382,50 @@ void ktime_get_ts(struct timespec *ts) } EXPORT_SYMBOL_GPL(ktime_get_ts); + +/** + * timekeeping_clocktai - Returns the TAI time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. + */ +void timekeeping_clocktai(struct timespec *ts) +{ + struct timekeeper *tk = &timekeeper; + unsigned long seq; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&timekeeper_seq); + + ts->tv_sec = tk->xtime_sec + tk->tai_offset; + nsecs = timekeeping_get_ns(tk); + + } while (read_seqcount_retry(&timekeeper_seq, seq)); + + ts->tv_nsec = 0; + timespec_add_ns(ts, nsecs); + +} +EXPORT_SYMBOL(timekeeping_clocktai); + + +/** + * ktime_get_clocktai - Returns the TAI time of day in a ktime + * + * Returns the time of day in a ktime. 
+ */ +ktime_t ktime_get_clocktai(void) +{ + struct timespec ts; + + timekeeping_clocktai(&ts); + return timespec_to_ktime(ts); +} +EXPORT_SYMBOL(ktime_get_clocktai); + #ifdef CONFIG_NTP_PPS /** @@ -399,7 +446,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); *ts_raw = tk->raw_time; ts_real->tv_sec = tk->xtime_sec; @@ -408,7 +455,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw = timekeeping_get_ns_raw(tk); nsecs_real = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -448,7 +495,8 @@ int do_settimeofday(const struct timespec *tv) if (!timespec_valid_strict(tv)) return -EINVAL; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -460,9 +508,10 @@ int do_settimeofday(const struct timespec *tv) tk_set_xtime(tk, tv); - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -487,7 +536,8 @@ int timekeeping_inject_offset(struct timespec *ts) if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); @@ -502,9 +552,10 @@ int timekeeping_inject_offset(struct timespec *ts) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ - timekeeping_update(tk, true); 
+ timekeeping_update(tk, true, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -513,6 +564,52 @@ error: /* even if we error out, we forwarded the time, so call update */ } EXPORT_SYMBOL(timekeeping_inject_offset); + +/** + * timekeeping_get_tai_offset - Returns current TAI offset from UTC + * + */ +s32 timekeeping_get_tai_offset(void) +{ + struct timekeeper *tk = &timekeeper; + unsigned int seq; + s32 ret; + + do { + seq = read_seqcount_begin(&timekeeper_seq); + ret = tk->tai_offset; + } while (read_seqcount_retry(&timekeeper_seq, seq)); + + return ret; +} + +/** + * __timekeeping_set_tai_offset - Lock free worker function + * + */ +static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) +{ + tk->tai_offset = tai_offset; + tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); +} + +/** + * timekeeping_set_tai_offset - Sets the current TAI offset from UTC + * + */ +void timekeeping_set_tai_offset(s32 tai_offset) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + __timekeeping_set_tai_offset(tk, tai_offset); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + clock_was_set(); +} + /** * change_clocksource - Swaps clocksources if a new one is available * @@ -526,7 +623,8 @@ static int change_clocksource(void *data) new = (struct clocksource *) data; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); if (!new->enable || new->enable(new) == 0) { @@ -535,9 +633,10 @@ static int change_clocksource(void *data) if (old->disable) old->disable(old); } - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); - 
write_sequnlock_irqrestore(&tk->lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return 0; } @@ -587,11 +686,11 @@ void getrawmonotonic(struct timespec *ts) s64 nsecs; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); nsecs = timekeeping_get_ns_raw(tk); *ts = tk->raw_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); timespec_add_ns(ts, nsecs); } @@ -607,11 +706,11 @@ int timekeeping_valid_for_hres(void) int ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return ret; } @@ -626,11 +725,11 @@ u64 timekeeping_max_deferment(void) u64 ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); ret = tk->clock->max_idle_ns; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return ret; } @@ -693,11 +792,10 @@ void __init timekeeping_init(void) boot.tv_nsec = 0; } - seqlock_init(&tk->lock); - + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); ntp_init(); - write_seqlock_irqsave(&tk->lock, flags); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); @@ -716,7 +814,10 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); - write_sequnlock_irqrestore(&tk->lock, flags); + memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began */ @@ -764,15 +865,17 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (has_persistent_clock()) return; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); 
+ write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); __timekeeping_inject_sleeptime(tk, delta); - timekeeping_update(tk, true); + timekeeping_update(tk, true, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -788,26 +891,72 @@ void timekeeping_inject_sleeptime(struct timespec *delta) static void timekeeping_resume(void) { struct timekeeper *tk = &timekeeper; + struct clocksource *clock = tk->clock; unsigned long flags; - struct timespec ts; + struct timespec ts_new, ts_delta; + cycle_t cycle_now, cycle_delta; + bool suspendtime_found = false; - read_persistent_clock(&ts); + read_persistent_clock(&ts_new); clockevents_resume(); clocksource_resume(); - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + + /* + * After system resumes, we need to calculate the suspended time and + * compensate it for the OS time. There are 3 sources that could be + * used: Nonstop clocksource during suspend, persistent clock and rtc + * device. + * + * One specific platform may have 1 or 2 or all of them, and the + * preference will be: + * suspend-nonstop clocksource -> persistent clock -> rtc + * The less preferred source will only be tried if there is no better + * usable source. The rtc part is handled separately in rtc core code. 
+ */ + cycle_now = clock->read(clock); + if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && + cycle_now > clock->cycle_last) { + u64 num, max = ULLONG_MAX; + u32 mult = clock->mult; + u32 shift = clock->shift; + s64 nsec = 0; + + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { - ts = timespec_sub(ts, timekeeping_suspend_time); - __timekeeping_inject_sleeptime(tk, &ts); + /* + * "cycle_delta * mutl" may cause 64 bits overflow, if the + * suspended time is too long. In that case we need do the + * 64 bits math carefully + */ + do_div(max, mult); + if (cycle_delta > max) { + num = div64_u64(cycle_delta, max); + nsec = (((u64) max * mult) >> shift) * num; + cycle_delta -= num * max; + } + nsec += ((u64) cycle_delta * mult) >> shift; + + ts_delta = ns_to_timespec(nsec); + suspendtime_found = true; + } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { + ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); + suspendtime_found = true; } - /* re-base the last cycle value */ - tk->clock->cycle_last = tk->clock->read(tk->clock); + + if (suspendtime_found) + __timekeeping_inject_sleeptime(tk, &ts_delta); + + /* Re-base the last cycle value */ + tk->cycle_last = clock->cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; - timekeeping_update(tk, false); - write_sequnlock_irqrestore(&tk->lock, flags); + timekeeping_update(tk, false, true); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); @@ -826,7 +975,8 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); timekeeping_forward_now(tk); timekeeping_suspended = 1; @@ -849,7 +999,8 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = 
timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&tk->lock, flags); + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -1099,6 +1250,8 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts)); + __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); + clock_was_set_delayed(); } } @@ -1116,15 +1269,16 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, u32 shift) { + cycle_t interval = tk->cycle_interval << shift; u64 raw_nsecs; /* If the offset is smaller then a shifted interval, do nothing */ - if (offset < tk->cycle_interval<<shift) + if (offset < interval) return offset; /* Accumulate one shifted interval */ - offset -= tk->cycle_interval << shift; - tk->clock->cycle_last += tk->cycle_interval << shift; + offset -= interval; + tk->cycle_last += interval; tk->xtime_nsec += tk->xtime_interval << shift; accumulate_nsecs_to_secs(tk); @@ -1181,27 +1335,28 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) static void update_wall_time(void) { struct clocksource *clock; - struct timekeeper *tk = &timekeeper; + struct timekeeper *real_tk = &timekeeper; + struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; unsigned long flags; - write_seqlock_irqsave(&tk->lock, flags); + raw_spin_lock_irqsave(&timekeeper_lock, flags); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) goto out; - clock = tk->clock; + clock = real_tk->clock; #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET - offset = tk->cycle_interval; + offset = real_tk->cycle_interval; #else offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif /* Check if there's really nothing to do */ - if (offset < 
tk->cycle_interval) + if (offset < real_tk->cycle_interval) goto out; /* @@ -1238,11 +1393,24 @@ static void update_wall_time(void) */ accumulate_nsecs_to_secs(tk); - timekeeping_update(tk, false); - + write_seqcount_begin(&timekeeper_seq); + /* Update clock->cycle_last with the new value */ + clock->cycle_last = tk->cycle_last; + /* + * Update the real timekeeper. + * + * We could avoid this memcpy by switching pointers, but that + * requires changes to all other timekeeper usage sites as + * well, i.e. move the timekeeper pointer getter into the + * spinlocked/seqcount protected sections. And we trade this + * memcpy under the timekeeper_seq against one before we start + * updating. + */ + memcpy(real_tk, tk, sizeof(*tk)); + timekeeping_update(real_tk, false, false); + write_seqcount_end(&timekeeper_seq); out: - write_sequnlock_irqrestore(&tk->lock, flags); - + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /** @@ -1289,13 +1457,13 @@ void get_monotonic_boottime(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); ts->tv_sec += tomono.tv_sec + sleep.tv_sec; ts->tv_nsec = 0; @@ -1354,10 +1522,10 @@ struct timespec current_kernel_time(void) unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); now = tk_xtime(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return now; } @@ -1370,11 +1538,11 @@ struct timespec get_monotonic_coarse(void) unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); now = tk_xtime(tk); mono = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while 
(read_seqcount_retry(&timekeeper_seq, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1405,11 +1573,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); *xtim = tk_xtime(tk); *wtom = tk->wall_to_monotonic; *sleep = tk->total_sleep_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1421,7 +1589,8 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, * Returns current monotonic time and updates the offsets * Called from hrtimer_interupt() or retrigger_next_event() */ -ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, + ktime_t *offs_tai) { struct timekeeper *tk = &timekeeper; ktime_t now; @@ -1429,14 +1598,15 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) u64 secs, nsecs; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); secs = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; - } while (read_seqretry(&tk->lock, seq)); + *offs_tai = tk->offs_tai; + } while (read_seqcount_retry(&timekeeper_seq, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); now = ktime_sub(now, *offs_real); @@ -1454,15 +1624,79 @@ ktime_t ktime_get_monotonic_offset(void) struct timespec wtom; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqcount_begin(&timekeeper_seq); wtom = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqcount_retry(&timekeeper_seq, seq)); return timespec_to_ktime(wtom); } EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); /** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + */ +int do_adjtimex(struct timex *txc) +{ + struct 
timekeeper *tk = &timekeeper; + unsigned long flags; + struct timespec ts; + s32 orig_tai, tai; + int ret; + + /* Validate the data before disabling interrupts */ + ret = ntp_validate_timex(txc); + if (ret) + return ret; + + if (txc->modes & ADJ_SETOFFSET) { + struct timespec delta; + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + ret = timekeeping_inject_offset(&delta); + if (ret) + return ret; + } + + getnstimeofday(&ts); + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + + orig_tai = tai = tk->tai_offset; + ret = __do_adjtimex(txc, &ts, &tai); + + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tk, tai); + clock_was_set_delayed(); + } + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + return ret; +} + +#ifdef CONFIG_NTP_PPS +/** + * hardpps() - Accessor function to NTP __hardpps function + */ +void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&timekeeper_seq); + + __hardpps(phase_ts, raw_ts); + + write_seqcount_end(&timekeeper_seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} +EXPORT_SYMBOL(hardpps); +#endif + +/** * xtime_update() - advances the timekeeping infrastructure * @ticks: number of ticks, that have elapsed since the last call. 
* diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index af5a7e9..3bdf283 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -20,6 +20,13 @@ #include <asm/uaccess.h> + +struct timer_list_iter { + int cpu; + bool second_pass; + u64 now; +}; + typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); @@ -133,7 +140,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - SEQ_printf(m, "\n"); SEQ_printf(m, "cpu: %d\n", cpu); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { SEQ_printf(m, " clock %d:\n", i); @@ -187,6 +193,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #undef P #undef P_ns + SEQ_printf(m, "\n"); } #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -195,7 +202,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; - SEQ_printf(m, "\n"); SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); if (cpu < 0) SEQ_printf(m, "Broadcast device\n"); @@ -230,12 +236,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) print_name_offset(m, dev->event_handler); SEQ_printf(m, "\n"); SEQ_printf(m, " retries: %lu\n", dev->retries); + SEQ_printf(m, "\n"); } -static void timer_list_show_tickdevices(struct seq_file *m) +static void timer_list_show_tickdevices_header(struct seq_file *m) { - int cpu; - #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", @@ -246,47 +251,104 @@ static void timer_list_show_tickdevices(struct seq_file *m) #endif SEQ_printf(m, "\n"); #endif - for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu), cpu); - SEQ_printf(m, "\n"); } -#else -static void timer_list_show_tickdevices(struct seq_file *m) { } #endif +static inline void timer_list_header(struct seq_file *m, u64 
now) +{ + SEQ_printf(m, "Timer List Version: v0.7\n"); + SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); + SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + SEQ_printf(m, "\n"); +} + static int timer_list_show(struct seq_file *m, void *v) { + struct timer_list_iter *iter = v; + u64 now = ktime_to_ns(ktime_get()); + + if (iter->cpu == -1 && !iter->second_pass) + timer_list_header(m, now); + else if (!iter->second_pass) + print_cpu(m, iter->cpu, iter->now); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + else if (iter->cpu == -1 && iter->second_pass) + timer_list_show_tickdevices_header(m); + else + print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); +#endif + return 0; +} + +void sysrq_timer_list_show(void) +{ u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.7\n"); - SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); - SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + timer_list_header(NULL, now); for_each_online_cpu(cpu) - print_cpu(m, cpu, now); + print_cpu(NULL, cpu, now); - SEQ_printf(m, "\n"); - timer_list_show_tickdevices(m); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + timer_list_show_tickdevices_header(NULL); + for_each_online_cpu(cpu) + print_tickdevice(NULL, tick_get_device(cpu), cpu); +#endif + return; +} - return 0; +static void *timer_list_start(struct seq_file *file, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + + if (!*offset) { + iter->cpu = -1; + iter->now = ktime_to_ns(ktime_get()); + } else if (iter->cpu >= nr_cpu_ids) { +#ifdef CONFIG_GENERIC_CLOCKEVENTS + if (!iter->second_pass) { + iter->cpu = -1; + iter->second_pass = true; + } else + return NULL; +#else + return NULL; +#endif + } + return iter; } -void sysrq_timer_list_show(void) +static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); + 
++*offset; + return timer_list_start(file, offset); +} + +static void timer_list_stop(struct seq_file *seq, void *v) { - timer_list_show(NULL, NULL); } +static const struct seq_operations timer_list_sops = { + .start = timer_list_start, + .next = timer_list_next, + .stop = timer_list_stop, + .show = timer_list_show, +}; + static int timer_list_open(struct inode *inode, struct file *filp) { - return single_open(filp, timer_list_show, NULL); + return seq_open_private(filp, &timer_list_sops, + sizeof(struct timer_list_iter)); } static const struct file_operations timer_list_fops = { .open = timer_list_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = seq_release_private, }; static int __init init_timer_list_procfs(void) diff --git a/kernel/timer.c b/kernel/timer.c index dbf7a78..a860bba 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1,7 +1,7 @@ /* * linux/kernel/timer.c * - * Kernel internal timers, basic process system calls + * Kernel internal timers * * Copyright (C) 1991, 1992 Linus Torvalds * @@ -41,6 +41,7 @@ #include <linux/sched.h> #include <linux/sched/sysctl.h> #include <linux/slab.h> +#include <linux/compat.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -738,7 +739,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, cpu = smp_processor_id(); -#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) +#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) cpu = get_nohz_timer_target(); #endif @@ -930,14 +931,14 @@ void add_timer_on(struct timer_list *timer, int cpu) debug_activate(timer, timer->expires); internal_add_timer(base, timer); /* - * Check whether the other CPU is idle and needs to be - * triggered to reevaluate the timer wheel when nohz is - * active. We are protected against the other CPU fiddling + * Check whether the other CPU is in dynticks mode and needs + * to be triggered to reevaluate the timer wheel. 
+ * We are protected against the other CPU fiddling * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to idle can not evaluate - * the timer wheel. + * makes sure that a CPU on the way to stop its tick can not + * evaluate the timer wheel. */ - wake_up_idle_cpu(cpu); + wake_up_nohz_cpu(cpu); spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); @@ -1188,7 +1189,7 @@ static inline void __run_timers(struct tvec_base *base) spin_unlock_irq(&base->lock); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Find out when the next timer event is due to happen. This * is used on S/390 to stop all activity when a CPU is idle. @@ -1395,61 +1396,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds) #endif -/** - * sys_getpid - return the thread group id of the current process - * - * Note, despite the name, this returns the tgid not the pid. The tgid and - * the pid are identical unless CLONE_THREAD was specified on clone() in - * which case the tgid is the same in all threads of the same group. - * - * This is SMP safe as current->tgid does not change. - */ -SYSCALL_DEFINE0(getpid) -{ - return task_tgid_vnr(current); -} - -/* - * Accessing ->real_parent is not SMP-safe, it could - * change from under us. However, we can use a stale - * value of ->real_parent under rcu_read_lock(), see - * release_task()->call_rcu(delayed_put_task_struct). 
- */ -SYSCALL_DEFINE0(getppid) -{ - int pid; - - rcu_read_lock(); - pid = task_tgid_vnr(rcu_dereference(current->real_parent)); - rcu_read_unlock(); - - return pid; -} - -SYSCALL_DEFINE0(getuid) -{ - /* Only we change this so SMP safe */ - return from_kuid_munged(current_user_ns(), current_uid()); -} - -SYSCALL_DEFINE0(geteuid) -{ - /* Only we change this so SMP safe */ - return from_kuid_munged(current_user_ns(), current_euid()); -} - -SYSCALL_DEFINE0(getgid) -{ - /* Only we change this so SMP safe */ - return from_kgid_munged(current_user_ns(), current_gid()); -} - -SYSCALL_DEFINE0(getegid) -{ - /* Only we change this so SMP safe */ - return from_kgid_munged(current_user_ns(), current_egid()); -} - static void process_timeout(unsigned long __data) { wake_up_process((struct task_struct *)__data); @@ -1557,91 +1503,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); -/* Thread ID - the internal kernel "pid" */ -SYSCALL_DEFINE0(gettid) -{ - return task_pid_vnr(current); -} - -/** - * do_sysinfo - fill in sysinfo struct - * @info: pointer to buffer to fill - */ -int do_sysinfo(struct sysinfo *info) -{ - unsigned long mem_total, sav_total; - unsigned int mem_unit, bitcount; - struct timespec tp; - - memset(info, 0, sizeof(struct sysinfo)); - - ktime_get_ts(&tp); - monotonic_to_bootbased(&tp); - info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - - get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); - - info->procs = nr_threads; - - si_meminfo(info); - si_swapinfo(info); - - /* - * If the sum of all the available memory (i.e. ram + swap) - * is less than can be stored in a 32 bit unsigned long then - * we can be binary compatible with 2.2.x kernels. If not, - * well, in that case 2.2.x was broken anyways... 
- * - * -Erik Andersen <andersee@debian.org> - */ - - mem_total = info->totalram + info->totalswap; - if (mem_total < info->totalram || mem_total < info->totalswap) - goto out; - bitcount = 0; - mem_unit = info->mem_unit; - while (mem_unit > 1) { - bitcount++; - mem_unit >>= 1; - sav_total = mem_total; - mem_total <<= 1; - if (mem_total < sav_total) - goto out; - } - - /* - * If mem_total did not overflow, multiply all memory values by - * info->mem_unit and set it to 1. This leaves things compatible - * with 2.2.x, and also retains compatibility with earlier 2.4.x - * kernels... - */ - - info->mem_unit = 1; - info->totalram <<= bitcount; - info->freeram <<= bitcount; - info->sharedram <<= bitcount; - info->bufferram <<= bitcount; - info->totalswap <<= bitcount; - info->freeswap <<= bitcount; - info->totalhigh <<= bitcount; - info->freehigh <<= bitcount; - -out: - return 0; -} - -SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) -{ - struct sysinfo val; - - do_sysinfo(&val); - - if (copy_to_user(info, &val, sizeof(struct sysinfo))) - return -EFAULT; - - return 0; -} - static int __cpuinit init_timers_cpu(int cpu) { int j; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ed58a32..b8b8560 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1808,6 +1808,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) rwbs[i] = '\0'; } +EXPORT_SYMBOL_GPL(blk_fill_rwbs); #endif /* CONFIG_EVENT_TRACING */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9e01458..711ca7d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -109,11 +109,6 @@ struct kretprobe_trace_entry_head { unsigned long ret_ip; }; -struct uprobe_trace_entry_head { - struct trace_entry ent; - unsigned long ip; -}; - /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. 
These are: diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8dad2a9..32494fb0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -28,6 +28,18 @@ #define UPROBE_EVENT_SYSTEM "uprobes" +struct uprobe_trace_entry_head { + struct trace_entry ent; + unsigned long vaddr[]; +}; + +#define SIZEOF_TRACE_ENTRY(is_return) \ + (sizeof(struct uprobe_trace_entry_head) + \ + sizeof(unsigned long) * (is_return ? 2 : 1)) + +#define DATAOF_TRACE_ENTRY(entry, is_return) \ + ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return)) + struct trace_uprobe_filter { rwlock_t rwlock; int nr_systemwide; @@ -64,6 +76,8 @@ static DEFINE_MUTEX(uprobe_lock); static LIST_HEAD(uprobe_list); static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static int uretprobe_dispatcher(struct uprobe_consumer *con, + unsigned long func, struct pt_regs *regs); static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) { @@ -77,11 +91,16 @@ static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter) return !filter->nr_systemwide && list_empty(&filter->perf_events); } +static inline bool is_ret_probe(struct trace_uprobe *tu) +{ + return tu->consumer.ret_handler != NULL; +} + /* * Allocate new trace_uprobe and initialize it (including uprobes). 
*/ static struct trace_uprobe * -alloc_trace_uprobe(const char *group, const char *event, int nargs) +alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) { struct trace_uprobe *tu; @@ -106,6 +125,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) INIT_LIST_HEAD(&tu->list); tu->consumer.handler = uprobe_dispatcher; + if (is_ret) + tu->consumer.ret_handler = uretprobe_dispatcher; init_trace_uprobe_filter(&tu->filter); return tu; @@ -180,7 +201,7 @@ end: /* * Argument syntax: - * - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS] + * - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS] * * - Remove uprobe: -:[GRP/]EVENT */ @@ -192,20 +213,23 @@ static int create_trace_uprobe(int argc, char **argv) char buf[MAX_EVENT_NAME_LEN]; struct path path; unsigned long offset; - bool is_delete; + bool is_delete, is_return; int i, ret; inode = NULL; ret = 0; is_delete = false; + is_return = false; event = NULL; group = NULL; /* argc must be >= 1 */ if (argv[0][0] == '-') is_delete = true; + else if (argv[0][0] == 'r') + is_return = true; else if (argv[0][0] != 'p') { - pr_info("Probe definition must be started with 'p' or '-'.\n"); + pr_info("Probe definition must be started with 'p', 'r' or '-'.\n"); return -EINVAL; } @@ -303,7 +327,7 @@ static int create_trace_uprobe(int argc, char **argv) kfree(tail); } - tu = alloc_trace_uprobe(group, event, argc); + tu = alloc_trace_uprobe(group, event, argc, is_return); if (IS_ERR(tu)) { pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); ret = PTR_ERR(tu); @@ -414,9 +438,10 @@ static void probes_seq_stop(struct seq_file *m, void *v) static int probes_seq_show(struct seq_file *m, void *v) { struct trace_uprobe *tu = v; + char c = is_ret_probe(tu) ? 
'r' : 'p'; int i; - seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name); + seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name); seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); for (i = 0; i < tu->nr_args; i++) @@ -485,65 +510,81 @@ static const struct file_operations uprobe_profile_ops = { .release = seq_release, }; -/* uprobe handler */ -static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +static void uprobe_trace_print(struct trace_uprobe *tu, + unsigned long func, struct pt_regs *regs) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; - u8 *data; - int size, i, pc; - unsigned long irq_flags; + void *data; + int size, i; struct ftrace_event_call *call = &tu->call; - local_save_flags(irq_flags); - pc = preempt_count(); - - size = sizeof(*entry) + tu->size; - + size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size, irq_flags, pc); + size + tu->size, 0, 0); if (!event) - return 0; + return; entry = ring_buffer_event_data(event); - entry->ip = instruction_pointer(task_pt_regs(current)); - data = (u8 *)&entry[1]; + if (is_ret_probe(tu)) { + entry->vaddr[0] = func; + entry->vaddr[1] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + entry->vaddr[0] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, false); + } + for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); if (!filter_current_check_discard(buffer, call, entry, event)) - trace_buffer_unlock_commit(buffer, event, irq_flags, pc); + trace_buffer_unlock_commit(buffer, event, 0, 0); +} +/* uprobe handler */ +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +{ + if (!is_ret_probe(tu)) + uprobe_trace_print(tu, 0, regs); return 0; } +static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, + struct 
pt_regs *regs) +{ + uprobe_trace_print(tu, func, regs); +} + /* Event entry printers */ static enum print_line_t print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) { - struct uprobe_trace_entry_head *field; + struct uprobe_trace_entry_head *entry; struct trace_seq *s = &iter->seq; struct trace_uprobe *tu; u8 *data; int i; - field = (struct uprobe_trace_entry_head *)iter->ent; + entry = (struct uprobe_trace_entry_head *)iter->ent; tu = container_of(event, struct trace_uprobe, call.event); - if (!trace_seq_printf(s, "%s: (", tu->call.name)) - goto partial; - - if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) - goto partial; - - if (!trace_seq_puts(s, ")")) - goto partial; + if (is_ret_probe(tu)) { + if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name, + entry->vaddr[1], entry->vaddr[0])) + goto partial; + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, + entry->vaddr[0])) + goto partial; + data = DATAOF_TRACE_ENTRY(entry, false); + } - data = (u8 *)&field[1]; for (i = 0; i < tu->nr_args; i++) { if (!tu->args[i].type->print(s, tu->args[i].name, - data + tu->args[i].offset, field)) + data + tu->args[i].offset, entry)) goto partial; } @@ -595,16 +636,23 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag) static int uprobe_event_define_fields(struct ftrace_event_call *event_call) { - int ret, i; + int ret, i, size; struct uprobe_trace_entry_head field; - struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data; + struct trace_uprobe *tu = event_call->data; - DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + if (is_ret_probe(tu)) { + DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0); + size = SIZEOF_TRACE_ENTRY(true); + } else { + DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); + size = SIZEOF_TRACE_ENTRY(false); + } /* Set argument 
names as fields */ for (i = 0; i < tu->nr_args; i++) { ret = trace_define_field(event_call, tu->args[i].type->fmttype, tu->args[i].name, - sizeof(field) + tu->args[i].offset, + size + tu->args[i].offset, tu->args[i].type->size, tu->args[i].type->is_signed, FILTER_OTHER); @@ -622,8 +670,13 @@ static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len) int i; int pos = 0; - fmt = "(%lx)"; - arg = "REC->" FIELD_STRING_IP; + if (is_ret_probe(tu)) { + fmt = "(%lx <- %lx)"; + arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + } else { + fmt = "(%lx)"; + arg = "REC->" FIELD_STRING_IP; + } /* When len=0, we just calculate the needed length */ @@ -752,49 +805,68 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, return ret; } -/* uprobe profile handler */ -static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +static void uprobe_perf_print(struct trace_uprobe *tu, + unsigned long func, struct pt_regs *regs) { struct ftrace_event_call *call = &tu->call; struct uprobe_trace_entry_head *entry; struct hlist_head *head; - u8 *data; - int size, __size, i; - int rctx; + void *data; + int size, rctx, i; - if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) - return UPROBE_HANDLER_REMOVE; - - __size = sizeof(*entry) + tu->size; - size = ALIGN(__size + sizeof(u32), sizeof(u64)); - size -= sizeof(u32); + size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) - return 0; + return; preempt_disable(); + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + goto out; entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); if (!entry) goto out; - entry->ip = instruction_pointer(task_pt_regs(current)); - data = (u8 *)&entry[1]; + if (is_ret_probe(tu)) { + entry->vaddr[0] = func; + entry->vaddr[1] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, true); 
+ } else { + entry->vaddr[0] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, false); + } + for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); - head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL); - + perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); out: preempt_enable(); +} + +/* uprobe profile handler */ +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +{ + if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) + return UPROBE_HANDLER_REMOVE; + + if (!is_ret_probe(tu)) + uprobe_perf_print(tu, 0, regs); return 0; } + +static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, + struct pt_regs *regs) +{ + uprobe_perf_print(tu, func, regs); +} #endif /* CONFIG_PERF_EVENTS */ static int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) { - struct trace_uprobe *tu = (struct trace_uprobe *)event->data; + struct trace_uprobe *tu = event->data; switch (type) { case TRACE_REG_REGISTER: @@ -843,6 +915,23 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) return ret; } +static int uretprobe_dispatcher(struct uprobe_consumer *con, + unsigned long func, struct pt_regs *regs) +{ + struct trace_uprobe *tu; + + tu = container_of(con, struct trace_uprobe, consumer); + + if (tu->flags & TP_FLAG_TRACE) + uretprobe_trace_func(tu, func, regs); + +#ifdef CONFIG_PERF_EVENTS + if (tu->flags & TP_FLAG_PROFILE) + uretprobe_perf_func(tu, func, regs); +#endif + return 0; +} + static struct trace_event_functions uprobe_funcs = { .trace = print_uprobe_event }; diff --git a/kernel/uid16.c b/kernel/uid16.c index d7948eb..f6c83d7 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -18,67 +18,43 @@ SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { - long ret = sys_chown(filename, low2highuid(user), 
low2highgid(group)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, filename, user, group); - return ret; + return sys_chown(filename, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { - long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, filename, user, group); - return ret; + return sys_lchown(filename, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) { - long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, fd, user, group); - return ret; + return sys_fchown(fd, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) { - long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, rgid, egid); - return ret; + return sys_setregid(low2highgid(rgid), low2highgid(egid)); } SYSCALL_DEFINE1(setgid16, old_gid_t, gid) { - long ret = sys_setgid(low2highgid(gid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, gid); - return ret; + return sys_setgid(low2highgid(gid)); } SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) { - long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, ruid, euid); - return ret; + return sys_setreuid(low2highuid(ruid), low2highuid(euid)); } SYSCALL_DEFINE1(setuid16, old_uid_t, uid) { - long ret = sys_setuid(low2highuid(uid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, uid); - return ret; + return sys_setuid(low2highuid(uid)); } SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) { - long ret = sys_setresuid(low2highuid(ruid), 
low2highuid(euid), + return sys_setresuid(low2highuid(ruid), low2highuid(euid), low2highuid(suid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, ruid, euid, suid); - return ret; } SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) @@ -100,11 +76,8 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) { - long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), + return sys_setresgid(low2highgid(rgid), low2highgid(egid), low2highgid(sgid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, rgid, egid, sgid); - return ret; } @@ -127,18 +100,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) { - long ret = sys_setfsuid(low2highuid(uid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, uid); - return ret; + return sys_setfsuid(low2highuid(uid)); } SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) { - long ret = sys_setfsgid(low2highgid(gid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, gid); - return ret; + return sys_setfsgid(low2highgid(gid)); } static int groups16_to_user(old_gid_t __user *grouplist, diff --git a/kernel/user.c b/kernel/user.c index 8e635a1..69b4c3d 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -16,7 +16,7 @@ #include <linux/interrupt.h> #include <linux/export.h> #include <linux/user_namespace.h> -#include <linux/proc_fs.h> +#include <linux/proc_ns.h> /* * userns count is 1 for root user, 1 for init_uts_ns, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e134d8f..d8c30db 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,7 +9,7 @@ #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/user_namespace.h> -#include <linux/proc_fs.h> +#include <linux/proc_ns.h> 
#include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> diff --git a/kernel/utsname.c b/kernel/utsname.c index a47fc5d..2fc8576 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -15,7 +15,7 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/user_namespace.h> -#include <linux/proc_fs.h> +#include <linux/proc_ns.h> static struct uts_namespace *create_uts_ns(void) { diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4a94467..05039e3 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -517,6 +517,11 @@ int proc_dowatchdog(struct ctl_table *table, int write, return ret; set_sample_period(); + /* + * Watchdog threads shouldn't be enabled if they are + * disabled. The 'watchdog_disabled' variable check in + * watchdog_*_all_cpus() function takes care of this. + */ if (watchdog_enabled && watchdog_thresh) watchdog_enable_all_cpus(); else diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 154aa12..4aa9f5b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -46,6 +46,7 @@ #include <linux/rculist.h> #include <linux/nodemask.h> #include <linux/moduleparam.h> +#include <linux/uaccess.h> #include "workqueue_internal.h" @@ -2197,6 +2198,7 @@ __acquires(&pool->lock) worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; + worker->desc_valid = false; pwq_dec_nr_in_flight(pwq, work_color); } @@ -4365,6 +4367,83 @@ unsigned int work_busy(struct work_struct *work) } EXPORT_SYMBOL_GPL(work_busy); +/** + * set_worker_desc - set description for the current work item + * @fmt: printf-style format string + * @...: arguments for the format string + * + * This function can be called by a running work function to describe what + * the work item is about. If the worker task gets dumped, this + * information will be printed out together to help debugging. The + * description can be at most WORKER_DESC_LEN including the trailing '\0'. + */ +void set_worker_desc(const char *fmt, ...) 
+{ + struct worker *worker = current_wq_worker(); + va_list args; + + if (worker) { + va_start(args, fmt); + vsnprintf(worker->desc, sizeof(worker->desc), fmt, args); + va_end(args); + worker->desc_valid = true; + } +} + +/** + * print_worker_info - print out worker information and description + * @log_lvl: the log level to use when printing + * @task: target task + * + * If @task is a worker and currently executing a work item, print out the + * name of the workqueue being serviced and worker description set with + * set_worker_desc() by the currently executing work item. + * + * This function can be safely called on any task as long as the + * task_struct itself is accessible. While safe, this function isn't + * synchronized and may print out mixed-up or garbage output of limited length. + */ +void print_worker_info(const char *log_lvl, struct task_struct *task) +{ + work_func_t *fn = NULL; + char name[WQ_NAME_LEN] = { }; + char desc[WORKER_DESC_LEN] = { }; + struct pool_workqueue *pwq = NULL; + struct workqueue_struct *wq = NULL; + bool desc_valid = false; + struct worker *worker; + + if (!(task->flags & PF_WQ_WORKER)) + return; + + /* + * This function is called without any synchronization and @task + * could be in any state. Be careful with dereferences. + */ + worker = probe_kthread_data(task); + + /* + * Carefully copy the associated workqueue's workfn and name. Keep + * the original last '\0' in case the original contains garbage.
+ */ + probe_kernel_read(&fn, &worker->current_func, sizeof(fn)); + probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq)); + probe_kernel_read(&wq, &pwq->wq, sizeof(wq)); + probe_kernel_read(name, wq->name, sizeof(name) - 1); + + /* copy worker description */ + probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid)); + if (desc_valid) + probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); + + if (fn || name[0] || desc[0]) { + printk("%sWorkqueue: %s %pf", log_lvl, name, fn); + if (desc[0]) + pr_cont(" (%s)", desc); + pr_cont("\n"); + } +} + /* * CPU hotplug. * diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 84ab6e1..ad83c96 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -29,15 +29,25 @@ struct worker { struct work_struct *current_work; /* L: work being processed */ work_func_t current_func; /* L: current_work's fn */ struct pool_workqueue *current_pwq; /* L: current_work's pwq */ + bool desc_valid; /* ->desc is valid */ struct list_head scheduled; /* L: scheduled works */ + + /* 64 bytes boundary on 64bit, 32 on 32bit */ + struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* I: the associated pool */ /* L: for rescuers */ - /* 64 bytes boundary on 64bit, 32 on 32bit */ + unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ + /* + * Opaque string set with set_worker_desc(). Printed out with task + * dump for debugging - WARN, BUG, panic or sysrq. + */ + char desc[WORKER_DESC_LEN]; + /* used only by rescuers to point to the target workqueue */ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ }; |