diff options
Diffstat (limited to 'kernel')
49 files changed, 2249 insertions, 905 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 0026cf5..dc5c775 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -105,7 +105,7 @@ targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") + filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;") targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) diff --git a/kernel/acct.c b/kernel/acct.c index a1844f1..b4c667d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -59,6 +59,7 @@ #include <asm/div64.h> #include <linux/blkdev.h> /* sector_div */ #include <linux/pid_namespace.h> +#include <linux/fs_pin.h> /* * These constants control the amount of freespace that suspend and @@ -75,172 +76,190 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *); +static void do_acct_process(struct bsd_acct_struct *acct); -/* - * This structure is used so that all the data protected by lock - * can be placed in the same cache line as the lock. This primes - * the cache line to have the data after getting the lock. - */ struct bsd_acct_struct { + struct fs_pin pin; + struct mutex lock; int active; unsigned long needcheck; struct file *file; struct pid_namespace *ns; - struct list_head list; + struct work_struct work; + struct completion done; }; -static DEFINE_SPINLOCK(acct_lock); -static LIST_HEAD(acct_list); - /* * Check the amount of free space and suspend/resume accordingly. */ -static int check_free_space(struct bsd_acct_struct *acct, struct file *file) +static int check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - int res; - int act; - u64 resume; - u64 suspend; - - spin_lock(&acct_lock); - res = acct->active; - if (!file || time_is_before_jiffies(acct->needcheck)) + + if (time_is_before_jiffies(acct->needcheck)) goto out; - spin_unlock(&acct_lock); /* May block */ - if (vfs_statfs(&file->f_path, &sbuf)) - return res; - suspend = sbuf.f_blocks * SUSPEND; - resume = sbuf.f_blocks * RESUME; - - do_div(suspend, 100); - do_div(resume, 100); - - if (sbuf.f_bavail <= suspend) - act = -1; - else if (sbuf.f_bavail >= resume) - act = 1; - else - act = 0; - - /* - * If some joker switched acct->file under us we'ld better be - * silent and _not_ touch anything. - */ - spin_lock(&acct_lock); - if (file != acct->file) { - if (act) - res = act > 0; + if (vfs_statfs(&acct->file->f_path, &sbuf)) goto out; - } if (acct->active) { - if (act < 0) { + u64 suspend = sbuf.f_blocks * SUSPEND; + do_div(suspend, 100); + if (sbuf.f_bavail <= suspend) { acct->active = 0; - printk(KERN_INFO "Process accounting paused\n"); + pr_info("Process accounting paused\n"); } } else { - if (act > 0) { + u64 resume = sbuf.f_blocks * RESUME; + do_div(resume, 100); + if (sbuf.f_bavail >= resume) { acct->active = 1; - printk(KERN_INFO "Process accounting resumed\n"); + pr_info("Process accounting resumed\n"); } } acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; - res = acct->active; out: - spin_unlock(&acct_lock); + return acct->active; +} + +static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) +{ + struct bsd_acct_struct *res; +again: + smp_rmb(); + rcu_read_lock(); + res = ACCESS_ONCE(ns->bacct); + if (!res) { + rcu_read_unlock(); + return NULL; + } + if (!atomic_long_inc_not_zero(&res->pin.count)) { + rcu_read_unlock(); + cpu_relax(); + goto again; + } + rcu_read_unlock(); + mutex_lock(&res->lock); + if (!res->ns) { + mutex_unlock(&res->lock); + pin_put(&res->pin); + goto again; + } return res; } -/* - * Close the old accounting file (if currently open) and then replace - * it with file (if non-NULL). - * - * NOTE: acct_lock MUST be held on entry and exit. - */ -static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, - struct pid_namespace *ns) +static void close_work(struct work_struct *work) +{ + struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); + struct file *file = acct->file; + if (file->f_op->flush) + file->f_op->flush(file, NULL); + __fput_sync(file); + complete(&acct->done); +} + +static void acct_kill(struct bsd_acct_struct *acct, + struct bsd_acct_struct *new) { - struct file *old_acct = NULL; - struct pid_namespace *old_ns = NULL; - - if (acct->file) { - old_acct = acct->file; - old_ns = acct->ns; - acct->active = 0; - acct->file = NULL; + if (acct) { + struct pid_namespace *ns = acct->ns; + do_acct_process(acct); + INIT_WORK(&acct->work, close_work); + init_completion(&acct->done); + schedule_work(&acct->work); + wait_for_completion(&acct->done); + pin_remove(&acct->pin); + ns->bacct = new; acct->ns = NULL; - list_del(&acct->list); + atomic_long_dec(&acct->pin.count); + mutex_unlock(&acct->lock); + pin_put(&acct->pin); } - if (file) { - acct->file = file; - acct->ns = ns; - acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; - acct->active = 1; - list_add(&acct->list, &acct_list); - } - if (old_acct) { - mnt_unpin(old_acct->f_path.mnt); - spin_unlock(&acct_lock); - do_acct_process(acct, old_ns, old_acct); - filp_close(old_acct, NULL); - spin_lock(&acct_lock); +} + +static void acct_pin_kill(struct fs_pin *pin) +{ + struct bsd_acct_struct *acct; + acct = container_of(pin, struct bsd_acct_struct, pin); + mutex_lock(&acct->lock); + if (!acct->ns) { + mutex_unlock(&acct->lock); + pin_put(pin); + acct = NULL; } + acct_kill(acct, NULL); } static int acct_on(struct filename *pathname) { struct file *file; - struct vfsmount *mnt; - struct pid_namespace *ns; - struct bsd_acct_struct *acct = NULL; + struct vfsmount *mnt, *internal; + struct pid_namespace *ns = task_active_pid_ns(current); + struct bsd_acct_struct *acct, *old; + int err; + + acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); + if (!acct) + return -ENOMEM; /* Difference from BSD - they don't do O_APPEND */ file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); - if (IS_ERR(file)) + if (IS_ERR(file)) { + kfree(acct); return PTR_ERR(file); + } if (!S_ISREG(file_inode(file)->i_mode)) { + kfree(acct); filp_close(file, NULL); return -EACCES; } if (!file->f_op->write) { + kfree(acct); filp_close(file, NULL); return -EIO; } - - ns = task_active_pid_ns(current); - if (ns->bacct == NULL) { - acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); - if (acct == NULL) { - filp_close(file, NULL); - return -ENOMEM; - } + internal = mnt_clone_internal(&file->f_path); + if (IS_ERR(internal)) { + kfree(acct); + filp_close(file, NULL); + return PTR_ERR(internal); } - - spin_lock(&acct_lock); - if (ns->bacct == NULL) { - ns->bacct = acct; - acct = NULL; + err = mnt_want_write(internal); + if (err) { + mntput(internal); + kfree(acct); + filp_close(file, NULL); + return err; } - mnt = file->f_path.mnt; - mnt_pin(mnt); - acct_file_reopen(ns->bacct, file, ns); - spin_unlock(&acct_lock); - - mntput(mnt); /* it's pinned, now give up active reference */ - kfree(acct); - + file->f_path.mnt = internal; + + atomic_long_set(&acct->pin.count, 1); + acct->pin.kill = acct_pin_kill; + acct->file = file; + acct->needcheck = jiffies; + acct->ns = ns; + mutex_init(&acct->lock); + mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ + pin_insert(&acct->pin, mnt); + + old = acct_get(ns); + if (old) + acct_kill(old, acct); + else + ns->bacct = acct; + mutex_unlock(&acct->lock); + mnt_drop_write(mnt); + mntput(mnt); return 0; } +static DEFINE_MUTEX(acct_on_mutex); + /** * sys_acct - enable/disable process accounting * @name: file name for accounting records or NULL to shutdown accounting @@ -261,80 +280,23 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (name) { struct filename *tmp = getname(name); + if (IS_ERR(tmp)) return PTR_ERR(tmp); + mutex_lock(&acct_on_mutex); error = acct_on(tmp); + mutex_unlock(&acct_on_mutex); putname(tmp); } else { - struct bsd_acct_struct *acct; - - acct = task_active_pid_ns(current)->bacct; - if (acct == NULL) - return 0; - - spin_lock(&acct_lock); - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); + acct_kill(acct_get(task_active_pid_ns(current)), NULL); } return error; } -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @m: vfsmount being shut down - * - * If the accounting is turned on for a file in the subtree pointed to - * to by m, turn accounting off. Done when m is about to die. - */ -void acct_auto_close_mnt(struct vfsmount *m) -{ - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.mnt == m) { - acct_file_reopen(acct, NULL, NULL); - goto restart; - } - spin_unlock(&acct_lock); -} - -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @sb: super block for the filesystem - * - * If the accounting is turned on for a file in the filesystem pointed - * to by sb, turn accounting off. - */ -void acct_auto_close(struct super_block *sb) -{ - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.dentry->d_sb == sb) { - acct_file_reopen(acct, NULL, NULL); - goto restart; - } - spin_unlock(&acct_lock); -} - void acct_exit_ns(struct pid_namespace *ns) { - struct bsd_acct_struct *acct = ns->bacct; - - if (acct == NULL) - return; - - spin_lock(&acct_lock); - if (acct->file != NULL) - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); - - kfree(acct); + acct_kill(acct_get(ns), NULL); } /* @@ -376,7 +338,7 @@ static comp_t encode_comp_t(unsigned long value) return exp; } -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* * encode an u64 into a comp2_t (24 bits) * @@ -389,7 +351,7 @@ static comp_t encode_comp_t(unsigned long value) #define MANTSIZE2 20 /* 20 bit mantissa. */ #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ -#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ +#define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */ static comp2_t encode_comp2_t(u64 value) { @@ -420,7 +382,7 @@ static comp2_t encode_comp2_t(u64 value) } #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 /* * encode an u64 into a 32 bit IEEE float */ @@ -429,8 +391,9 @@ static u32 encode_float(u64 value) unsigned exp = 190; unsigned u; - if (value==0) return 0; - while ((s64)value > 0){ + if (value == 0) + return 0; + while ((s64)value > 0) { value <<= 1; exp--; } @@ -448,116 +411,112 @@ static u32 encode_float(u64 value) * do_exit() or when switching to a different output file. */ -/* - * do_acct_process does all actual work. Caller holds the reference to file. - */ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *file) +static void fill_ac(acct_t *ac) { struct pacct_struct *pacct = ¤t->signal->pacct; - acct_t ac; - mm_segment_t fs; - unsigned long flim; u64 elapsed, run_time; struct tty_struct *tty; - const struct cred *orig_cred; - - /* Perform file operations on behalf of whoever enabled accounting */ - orig_cred = override_creds(file->f_cred); - - /* - * First check to see if there is enough free_space to continue - * the process accounting system. - */ - if (!check_free_space(acct, file)) - goto out; /* * Fill the accounting struct with the needed info as recorded * by the different kernel functions. */ - memset(&ac, 0, sizeof(acct_t)); + memset(ac, 0, sizeof(acct_t)); - ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; - strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); + ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER; + strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm)); /* calculate run_time in nsec*/ run_time = ktime_get_ns(); run_time -= current->group_leader->start_time; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); -#if ACCT_VERSION==3 - ac.ac_etime = encode_float(elapsed); +#if ACCT_VERSION == 3 + ac->ac_etime = encode_float(elapsed); #else - ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? - (unsigned long) elapsed : (unsigned long) -1l); + ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? + (unsigned long) elapsed : (unsigned long) -1l); #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); - ac.ac_etime_hi = etime >> 16; - ac.ac_etime_lo = (u16) etime; + + ac->ac_etime_hi = etime >> 16; + ac->ac_etime_lo = (u16) etime; } #endif do_div(elapsed, AHZ); - ac.ac_btime = get_seconds() - elapsed; + ac->ac_btime = get_seconds() - elapsed; +#if ACCT_VERSION==2 + ac->ac_ahz = AHZ; +#endif + + spin_lock_irq(¤t->sighand->siglock); + tty = current->signal->tty; /* Safe as we hold the siglock */ + ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; + ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); + ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); + ac->ac_flag = pacct->ac_flag; + ac->ac_mem = encode_comp_t(pacct->ac_mem); + ac->ac_minflt = encode_comp_t(pacct->ac_minflt); + ac->ac_majflt = encode_comp_t(pacct->ac_majflt); + ac->ac_exitcode = pacct->ac_exitcode; + spin_unlock_irq(¤t->sighand->siglock); +} +/* + * do_acct_process does all actual work. Caller holds the reference to file. + */ +static void do_acct_process(struct bsd_acct_struct *acct) +{ + acct_t ac; + unsigned long flim; + const struct cred *orig_cred; + struct pid_namespace *ns = acct->ns; + struct file *file = acct->file; + + /* + * Accounting records are not subject to resource limits. + */ + flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + /* Perform file operations on behalf of whoever enabled accounting */ + orig_cred = override_creds(file->f_cred); + + /* + * First check to see if there is enough free_space to continue + * the process accounting system. + */ + if (!check_free_space(acct)) + goto out; + + fill_ac(&ac); /* we really need to bite the bullet and change layout */ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); -#if ACCT_VERSION==2 - ac.ac_ahz = AHZ; -#endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; ac.ac_gid16 = ac.ac_gid; #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac.ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); rcu_read_unlock(); #endif - - spin_lock_irq(¤t->sighand->siglock); - tty = current->signal->tty; /* Safe as we hold the siglock */ - ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; - ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); - ac.ac_flag = pacct->ac_flag; - ac.ac_mem = encode_comp_t(pacct->ac_mem); - ac.ac_minflt = encode_comp_t(pacct->ac_minflt); - ac.ac_majflt = encode_comp_t(pacct->ac_majflt); - ac.ac_exitcode = pacct->ac_exitcode; - spin_unlock_irq(¤t->sighand->siglock); - ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ - ac.ac_rw = encode_comp_t(ac.ac_io / 1024); - ac.ac_swaps = encode_comp_t(0); - /* * Get freeze protection. If the fs is frozen, just skip the write * as we could deadlock the system otherwise. */ - if (!file_start_write_trylock(file)) - goto out; - /* - * Kernel segment override to datasegment and write it - * to the accounting file. - */ - fs = get_fs(); - set_fs(KERNEL_DS); - /* - * Accounting records are not subject to resource limits. - */ - flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - file->f_op->write(file, (char *)&ac, - sizeof(acct_t), &file->f_pos); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; - set_fs(fs); - file_end_write(file); + if (file_start_write_trylock(file)) { + /* it's been opened O_APPEND, so position is irrelevant */ + loff_t pos = 0; + __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos); + file_end_write(file); + } out: + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; revert_creds(orig_cred); } @@ -574,6 +533,7 @@ void acct_collect(long exitcode, int group_dead) if (group_dead && current->mm) { struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); vma = current->mm->mmap; while (vma) { @@ -605,34 +565,20 @@ void acct_collect(long exitcode, int group_dead) spin_unlock_irq(¤t->sighand->siglock); } -static void acct_process_in_ns(struct pid_namespace *ns) +static void slow_acct_process(struct pid_namespace *ns) { - struct file *file = NULL; - struct bsd_acct_struct *acct; - - acct = ns->bacct; - /* - * accelerate the common fastpath: - */ - if (!acct || !acct->file) - return; - - spin_lock(&acct_lock); - file = acct->file; - if (unlikely(!file)) { - spin_unlock(&acct_lock); - return; + for ( ; ns; ns = ns->parent) { + struct bsd_acct_struct *acct = acct_get(ns); + if (acct) { + do_acct_process(acct); + mutex_unlock(&acct->lock); + pin_put(&acct->pin); + } } - get_file(file); - spin_unlock(&acct_lock); - - do_acct_process(acct, ns, file); - fput(file); } /** - * acct_process - now just a wrapper around acct_process_in_ns, - * which in turn is a wrapper around do_acct_process. + * acct_process * * handles process accounting for an exiting task */ @@ -645,6 +591,10 @@ void acct_process(void) * alive and holds its namespace, which in turn holds * its parent. */ - for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) - acct_process_in_ns(ns); + for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) { + if (ns->bacct) + break; + } + if (unlikely(ns)) + slow_acct_process(ns); } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 8e9bc9c..c447cd9 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -106,7 +106,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count) if (unlikely(!entry)) return NULL; - fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); + fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL); if (unlikely(!fields)) { kfree(entry); return NULL; @@ -160,7 +160,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES]; int __init audit_register_class(int class, unsigned *list) { - __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); + __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL); if (!p) return -ENOMEM; while (*list != ~0U) { diff --git a/kernel/bounds.c b/kernel/bounds.c index 9fd4246..e1d1d195 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -9,7 +9,6 @@ #include <linux/page-flags.h> #include <linux/mmzone.h> #include <linux/kbuild.h> -#include <linux/page_cgroup.h> #include <linux/log2.h> #include <linux/spinlock_types.h> @@ -18,7 +17,6 @@ void foo(void) /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); - DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); #ifdef CONFIG_SMP DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7dc8788..3a73f99 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1035,6 +1035,11 @@ static void cgroup_get(struct cgroup *cgrp) css_get(&cgrp->self); } +static bool cgroup_tryget(struct cgroup *cgrp) +{ + return css_tryget(&cgrp->self); +} + static void cgroup_put(struct cgroup *cgrp) { css_put(&cgrp->self); @@ -1147,7 +1152,8 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) * protection against removal. Ensure @cgrp stays accessible and * break the active_ref protection. */ - cgroup_get(cgrp); + if (!cgroup_tryget(cgrp)) + return NULL; kernfs_break_active_protection(kn); mutex_lock(&cgroup_mutex); @@ -3271,8 +3277,17 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; - for (cft = cfts; cft && cft->name[0] != '\0'; cft++) - cft->flags |= __CFTYPE_NOT_ON_DFL; + /* + * If legacy_flies_on_dfl, we want to show the legacy files on the + * dfl hierarchy but iff the target subsystem hasn't been updated + * for the dfl hierarchy yet. + */ + if (!cgroup_legacy_files_on_dfl || + ss->dfl_cftypes != ss->legacy_cftypes) { + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_NOT_ON_DFL; + } + return cgroup_add_cftypes(ss, cfts); } @@ -3970,7 +3985,6 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, l = cgroup_pidlist_find_create(cgrp, type); if (!l) { - mutex_unlock(&cgrp->pidlist_mutex); pidlist_free(array); return -ENOMEM; } @@ -4387,6 +4401,15 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; + + /* + * There are two control paths which try to determine + * cgroup from dentry without going through kernfs - + * cgroupstats_build() and css_tryget_online_from_dir(). + * Those are supported by RCU protecting clearing of + * cgrp->kn->priv backpointer. + */ + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); } mutex_unlock(&cgroup_mutex); @@ -4543,6 +4566,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, struct cftype *base_files; int ssid, ret; + /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. + */ + if (strchr(name, '\n')) + return -EINVAL; + parent = cgroup_kn_lock_live(parent_kn); if (!parent) return -ENODEV; @@ -4820,16 +4848,6 @@ static int cgroup_rmdir(struct kernfs_node *kn) cgroup_kn_unlock(kn); - /* - * There are two control paths which try to determine cgroup from - * dentry without going through kernfs - cgroupstats_build() and - * css_tryget_online_from_dir(). Those are supported by RCU - * protecting clearing of cgrp->kn->priv backpointer, which should - * happen after all files under it have been removed. - */ - if (!ret) - RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL); - cgroup_put(cgrp); return ret; } @@ -5416,7 +5434,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, /* * This path doesn't originate from kernfs and @kn could already * have been or be removed at any point. @kn->priv is RCU - * protected for this access. See cgroup_rmdir() for details. + * protected for this access. See css_release_work_fn() for details. */ cgrp = rcu_dereference(kn->priv); if (cgrp) diff --git a/kernel/compat.c b/kernel/compat.c index 633394f..ebb3c36 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -226,7 +226,7 @@ static long compat_nanosleep_restart(struct restart_block *restart) ret = hrtimer_nanosleep_restart(restart); set_fs(oldfs); - if (ret) { + if (ret == -ERESTART_RESTARTBLOCK) { rmtp = restart->nanosleep.compat_rmtp; if (rmtp && compat_put_timespec(&rmt, rmtp)) @@ -256,7 +256,26 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); set_fs(oldfs); - if (ret) { + /* + * hrtimer_nanosleep() can only return 0 or + * -ERESTART_RESTARTBLOCK here because: + * + * - we call it with HRTIMER_MODE_REL and therefor exclude the + * -ERESTARTNOHAND return path. + * + * - we supply the rmtp argument from the task stack (due to + * the necessary compat conversion. So the update cannot + * fail, which excludes the -EFAULT return path as well. If + * it fails nevertheless we have a bigger problem and wont + * reach this place anymore. + * + * - if the return value is 0, we do not have to update rmtp + * because there is no remaining time. + * + * We check for -ERESTART_RESTARTBLOCK nevertheless if the + * core implementation decides to return random nonsense. + */ + if (ret == -ERESTART_RESTARTBLOCK) { struct restart_block *restart = ¤t_thread_info()->restart_block; @@ -266,7 +285,6 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, if (rmtp && compat_put_timespec(&rmt, rmtp)) return -EFAULT; } - return ret; } diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config new file mode 100644 index 0000000..c2de56a --- /dev/null +++ b/kernel/configs/tiny.config @@ -0,0 +1,4 @@ +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_KERNEL_XZ=y +CONFIG_OPTIMIZE_INLINING=y +CONFIG_SLOB=y diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 22874d7..52cb04c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -365,13 +365,14 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, struct task_struct *tsk) { if (is_spread_page(cs)) - tsk->flags |= PF_SPREAD_PAGE; + task_set_spread_page(tsk); else - tsk->flags &= ~PF_SPREAD_PAGE; + task_clear_spread_page(tsk); + if (is_spread_slab(cs)) - tsk->flags |= PF_SPREAD_SLAB; + task_set_spread_slab(tsk); else - tsk->flags &= ~PF_SPREAD_SLAB; + task_clear_spread_slab(tsk); } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cf24b3..963bf13 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -41,6 +41,7 @@ #include <linux/cgroup.h> #include <linux/module.h> #include <linux/mman.h> +#include <linux/compat.h> #include "internal.h" @@ -1523,6 +1524,11 @@ retry: */ if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; goto retry; } @@ -1966,6 +1972,11 @@ retry: */ if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; goto retry; } @@ -3717,6 +3728,26 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return 0; } +#ifdef CONFIG_COMPAT +static long perf_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + switch (_IOC_NR(cmd)) { + case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): + case _IOC_NR(PERF_EVENT_IOC_ID): + /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ + if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { + cmd &= ~IOCSIZE_MASK; + cmd |= sizeof(void *) << IOCSIZE_SHIFT; + } + break; + } + return perf_ioctl(file, cmd, arg); +} +#else +# define perf_compat_ioctl NULL +#endif + int perf_event_task_enable(void) { struct perf_event *event; @@ -4222,7 +4253,7 @@ static const struct file_operations perf_fops = { .read = perf_read, .poll = perf_poll, .unlocked_ioctl = perf_ioctl, - .compat_ioctl = perf_ioctl, + .compat_ioctl = perf_compat_ioctl, .mmap = perf_mmap, .fasync = perf_fasync, }; @@ -7917,8 +7948,10 @@ int perf_event_init_task(struct task_struct *child) for_each_task_context_nr(ctxn) { ret = perf_event_init_context(child, ctxn); - if (ret) + if (ret) { + perf_event_free_task(child); return ret; + } } return 0; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6f3254e..1d0af8a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, /* For mmu_notifiers */ const unsigned long mmun_start = addr; const unsigned long mmun_end = addr + PAGE_SIZE; + struct mem_cgroup *memcg; + + err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); + if (err) + return err; /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(page); @@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); + mem_cgroup_commit_charge(kpage, memcg, false); + lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { dec_mm_counter(mm, MM_FILEPAGES); @@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: + mem_cgroup_cancel_charge(kpage, memcg); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; @@ -315,18 +323,11 @@ retry: if (!new_page) goto put_old; - if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) - goto put_new; - __SetPageUptodate(new_page); copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = __replace_page(vma, vaddr, old_page, new_page); - if (ret) - mem_cgroup_uncharge_page(new_page); - -put_new: page_cache_release(new_page); put_old: put_page(old_page); diff --git a/kernel/exit.c b/kernel/exit.c index e5c4668..32c58f7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -59,7 +59,7 @@ #include <asm/pgtable.h> #include <asm/mmu_context.h> -static void exit_mm(struct task_struct * tsk); +static void exit_mm(struct task_struct *tsk); static void __unhash_process(struct task_struct *p, bool group_dead) { @@ -151,7 +151,7 @@ static void __exit_signal(struct task_struct *tsk) spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk,TIF_SIGPENDING); + clear_tsk_thread_flag(tsk, TIF_SIGPENDING); if (group_dead) { flush_sigqueue(&sig->shared_pending); tty_kref_put(tty); @@ -168,7 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) } -void release_task(struct task_struct * p) +void release_task(struct task_struct *p) { struct task_struct *leader; int zap_leader; @@ -192,7 +192,8 @@ repeat: */ zap_leader = 0; leader = p->group_leader; - if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + if (leader != p && thread_group_empty(leader) + && leader->exit_state == EXIT_ZOMBIE) { /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, @@ -241,7 +242,8 @@ struct pid *session_of_pgrp(struct pid *pgrp) * * "I ask you, have you ever known what it is to be an orphan?" */ -static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) +static int will_become_orphaned_pgrp(struct pid *pgrp, + struct task_struct *ignored_task) { struct task_struct *p; @@ -294,9 +296,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) struct task_struct *ignored_task = tsk; if (!parent) - /* exit: our father is in a different pgrp than - * we are and we were the only connection outside. - */ + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than @@ -405,7 +407,7 @@ assign_new_owner: * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct * tsk) +static void exit_mm(struct task_struct *tsk) { struct mm_struct *mm = tsk->mm; struct core_state *core_state; @@ -425,6 +427,7 @@ static void exit_mm(struct task_struct * tsk) core_state = mm->core_state; if (core_state) { struct core_thread self; + up_read(&mm->mmap_sem); self.task = tsk; @@ -455,6 +458,7 @@ static void exit_mm(struct task_struct * tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); + clear_thread_flag(TIF_MEMDIE); } /* @@ -565,6 +569,7 @@ static void forget_original_parent(struct task_struct *father) list_for_each_entry_safe(p, n, &father->children, sibling) { struct task_struct *t = p; + do { t->real_parent = reaper; if (t->parent == father) { @@ -598,7 +603,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* * This does two things: * - * A. Make init inherit all the child processes + * A. Make init inherit all the child processes * B. Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) @@ -648,9 +653,8 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - printk(KERN_WARNING "%s (%d) used greatest stack depth: " - "%lu bytes left\n", - current->comm, task_pid_nr(current), free); + pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", + current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); @@ -691,8 +695,7 @@ void do_exit(long code) * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { - printk(KERN_ALERT - "Fixing recursive fault but reboot is needed!\n"); + pr_alert("Fixing recursive fault but reboot is needed!\n"); /* * We can do this unlocked here. The futex code uses * this flag just to verify whether the pi state @@ -716,9 +719,9 @@ void do_exit(long code) raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) - printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ @@ -836,7 +839,6 @@ void do_exit(long code) for (;;) cpu_relax(); /* For when BUG is null */ } - EXPORT_SYMBOL_GPL(do_exit); void complete_and_exit(struct completion *comp, long code) @@ -846,7 +848,6 @@ void complete_and_exit(struct completion *comp, long code) do_exit(code); } - EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) @@ -869,6 +870,7 @@ do_group_exit(int exit_code) exit_code = sig->group_exit_code; else if (!thread_group_empty(current)) { struct sighand_struct *const sighand = current->sighand; + spin_lock_irq(&sighand->siglock); if (signal_group_exit(sig)) /* Another thread got here before we took the lock. */ @@ -1033,9 +1035,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * as other threads in the parent group can be right * here reaping other children at the same time. * - * We use thread_group_cputime_adjusted() to get times for the thread - * group, which consolidates times for all threads in the - * group including the group leader. + * We use thread_group_cputime_adjusted() to get times for + * the thread group, which consolidates times for all threads + * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); @@ -1417,6 +1419,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); + if (ret) return ret; } @@ -1430,6 +1433,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); + if (ret) return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index fbd3497..a91e47d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -374,12 +374,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - mm->locked_vm = 0; - mm->mmap = NULL; - mm->vmacache_seqnum = 0; - mm->map_count = 0; - cpumask_clear(mm_cpumask(mm)); - mm->mm_rb = RB_ROOT; + mm->total_vm = oldmm->total_vm; + mm->shared_vm = oldmm->shared_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; @@ -430,7 +429,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) atomic_dec(&inode->i_writecount); mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; + atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ if (unlikely(tmp->vm_flags & VM_NONLINEAR)) @@ -536,19 +535,37 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + mm->owner = p; +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { + mm->mmap = NULL; + mm->mm_rb = RB_ROOT; + mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); + mm->map_count = 0; + mm->locked_vm = 0; + mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mmu_notifier_mm_init(mm); clear_tlb_flush_pending(mm); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + mm->pmd_huge_pte = NULL; +#endif if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; @@ -558,11 +575,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->def_flags = 0; } - if (likely(!mm_alloc_pgd(mm))) { - mmu_notifier_mm_init(mm); - return mm; - } + if (mm_alloc_pgd(mm)) + goto fail_nopgd; + + if (init_new_context(p, mm)) + goto fail_nocontext; + return mm; + +fail_nocontext: + mm_free_pgd(mm); +fail_nopgd: free_mm(mm); return NULL; } @@ -596,7 +619,6 @@ struct mm_struct *mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - mm_init_cpumask(mm); return mm_init(mm, current); } @@ -828,17 +850,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); - mm_init_cpumask(mm); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - mm->pmd_huge_pte = NULL; -#endif if (!mm_init(mm, tsk)) goto fail_nomem; - if (init_new_context(tsk, mm)) - goto fail_nocontext; - dup_mm_exe_file(oldmm, mm); err = dup_mmap(mm, oldmm); @@ -860,15 +875,6 @@ free_pt: fail_nomem: return NULL; - -fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return NULL; } static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) @@ -1099,7 +1105,7 @@ static void copy_seccomp(struct task_struct *p) * needed because this new task is not yet running and cannot * be racing exec. */ - BUG_ON(!spin_is_locked(¤t->sighand->siglock)); + assert_spin_locked(¤t->sighand->siglock); /* Ref-count the new filter user, and assign it. */ get_seccomp_filter(current); @@ -1140,13 +1146,6 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } -#ifdef CONFIG_MEMCG -void mm_init_owner(struct mm_struct *mm, struct task_struct *p) -{ - mm->owner = p; -} -#endif /* CONFIG_MEMCG */ - /* * Initialize POSIX timer handling for a single task. */ @@ -1346,10 +1345,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -#ifdef CONFIG_MEMCG - p->memcg_batch.do_batch = 0; - p->memcg_batch.memcg = NULL; -#endif #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; @@ -1365,8 +1360,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_policy; retval = audit_alloc(p); if (retval) - goto bad_fork_cleanup_policy; + goto bad_fork_cleanup_perf; /* copy all the process information */ + shm_init_task(p); retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_audit; @@ -1570,8 +1566,9 @@ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_audit: audit_free(p); -bad_fork_cleanup_policy: +bad_fork_cleanup_perf: perf_event_free_task(p); +bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: @@ -1918,6 +1915,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ exit_sem(current); } + if (unshare_flags & CLONE_NEWIPC) { + /* Orphan segments in old ns (see sem above). */ + exit_shm(current); + shm_init_task(current); + } if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); diff --git a/kernel/futex.c b/kernel/futex.c index d3a9d94..815d7af 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2592,6 +2592,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * shared futexes. We need to compare the keys: */ if (match_futex(&q.key, &key2)) { + queue_unlock(hb); ret = -EINVAL; goto out_put_keys; } diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 15ff01a..edf67c4 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -784,8 +784,7 @@ static __init int gcov_fs_init(void) err_remove: pr_err("init failed\n"); - if (root_node.dentry) - debugfs_remove(root_node.dentry); + debugfs_remove(root_node.dentry); return rc; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a2b28a2..6223fab 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -517,6 +517,7 @@ out: chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } +EXPORT_SYMBOL_GPL(handle_fasteoi_irq); /** * handle_edge_irq - edge type IRQ handler diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index cb0cf37..ae51670 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address, address += symbol_offset; name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) - return sprintf(buffer, "0x%lx", address); + return sprintf(buffer, "0x%lx", address - symbol_offset); if (name != buffer) strcpy(buffer, name); diff --git a/kernel/kcmp.c b/kernel/kcmp.c index e30ac0f..0aa69ea 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -44,11 +44,12 @@ static long kptr_obfuscate(long v, int type) */ static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) { - long ret; + long t1, t2; - ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); + t1 = kptr_obfuscate((long)v1, type); + t2 = kptr_obfuscate((long)v2, type); - return (ret < 0) | ((ret > 0) << 1); + return (t1 < t2) | ((t1 > t2) << 1); } /* The caller must have pinned the task */ diff --git a/kernel/kexec.c b/kernel/kexec.c index 4b8f0c9..2bee072 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> @@ -40,6 +42,9 @@ #include <asm/io.h> #include <asm/sections.h> +#include <crypto/hash.h> +#include <crypto/sha.h> + /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; @@ -52,6 +57,17 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; +/* + * Declare these symbols weak so that if architecture provides a purgatory, + * these will be overridden. + */ +char __weak kexec_purgatory[0]; +size_t __weak kexec_purgatory_size = 0; + +#ifdef CONFIG_KEXEC_FILE +static int kexec_calculate_store_digests(struct kimage *image); +#endif + /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", @@ -125,45 +141,27 @@ static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long dest); -static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int copy_user_segment_list(struct kimage *image, + unsigned long nr_segments, + struct kexec_segment __user *segments) { + int ret; size_t segment_bytes; - struct kimage *image; - unsigned long i; - int result; - - /* Allocate a controlling structure */ - result = -ENOMEM; - image = kzalloc(sizeof(*image), GFP_KERNEL); - if (!image) - goto out; - - image->head = 0; - image->entry = &image->head; - image->last_entry = &image->head; - image->control_page = ~0; /* By default this does not apply */ - image->start = entry; - image->type = KEXEC_TYPE_DEFAULT; - - /* Initialize the list of control pages */ - INIT_LIST_HEAD(&image->control_pages); - - /* Initialize the list of destination pages */ - INIT_LIST_HEAD(&image->dest_pages); - - /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unuseable_pages); /* Read in the segments */ image->nr_segments = nr_segments; segment_bytes = nr_segments * sizeof(*segments); - result = copy_from_user(image->segment, segments, segment_bytes); - if (result) { - result = -EFAULT; - goto out; - } + ret = copy_from_user(image->segment, segments, segment_bytes); + if (ret) + ret = -EFAULT; + + return ret; +} + +static int sanity_check_segment_list(struct kimage *image) +{ + int result, i; + unsigned long nr_segments = image->nr_segments; /* * Verify we have good destination addresses. The caller is @@ -185,9 +183,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - goto out; + return result; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - goto out; + return result; } /* Verify our destination addresses do not overlap. @@ -208,7 +206,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) - goto out; + return result; } } @@ -220,131 +218,406 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, result = -EINVAL; for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) - goto out; + return result; } - result = 0; -out: - if (result == 0) - *rimage = image; - else - kfree(image); + /* + * Verify we have good destination addresses. Normally + * the caller is responsible for making certain we don't + * attempt to load the new image into invalid or reserved + * areas of RAM. But crash kernels are preloaded into a + * reserved area of ram. We must ensure the addresses + * are in the reserved area otherwise preloading the + * kernel could corrupt things. + */ - return result; + if (image->type == KEXEC_TYPE_CRASH) { + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + /* Ensure we are within the crash kernel limits */ + if ((mstart < crashk_res.start) || + (mend > crashk_res.end)) + return result; + } + } + return 0; +} + +static struct kimage *do_kimage_alloc_init(void) +{ + struct kimage *image; + + /* Allocate a controlling structure */ + image = kzalloc(sizeof(*image), GFP_KERNEL); + if (!image) + return NULL; + + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + image->control_page = ~0; /* By default this does not apply */ + image->type = KEXEC_TYPE_DEFAULT; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unusable pages */ + INIT_LIST_HEAD(&image->unusable_pages); + + return image; } static void kimage_free_page_list(struct list_head *list); -static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment __user *segments, + unsigned long flags) { - int result; + int ret; struct kimage *image; + bool kexec_on_panic = flags & KEXEC_ON_CRASH; + + if (kexec_on_panic) { + /* Verify we have a valid entry point */ + if ((entry < crashk_res.start) || (entry > crashk_res.end)) + return -EADDRNOTAVAIL; + } /* Allocate and initialize a controlling structure */ - image = NULL; - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) - goto out; + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->start = entry; + + ret = copy_user_segment_list(image, nr_segments, segments); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_image; + + /* Enable the special crash kernel control page allocation policy. */ + if (kexec_on_panic) { + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be * counted as destination pages. */ - result = -ENOMEM; + ret = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_image; } - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - pr_err("Could not allocate swap buffer\n"); - goto out_free; + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err("Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; return 0; - -out_free: +out_free_control_pages: kimage_free_page_list(&image->control_pages); +out_free_image: kfree(image); -out: - return result; + return ret; } -static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +#ifdef CONFIG_KEXEC_FILE +static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) { - int result; - struct kimage *image; - unsigned long i; + struct fd f = fdget(fd); + int ret; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; - image = NULL; - /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) { - result = -EADDRNOTAVAIL; + if (!f.file) + return -EBADF; + + ret = vfs_getattr(&f.file->f_path, &stat); + if (ret) + goto out; + + if (stat.size > INT_MAX) { + ret = -EFBIG; goto out; } - /* Allocate and initialize a controlling structure */ - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) + /* Don't hand 0 to vmalloc, it whines. */ + if (stat.size == 0) { + ret = -EINVAL; goto out; + } - /* Enable the special crash kernel control page - * allocation policy. - */ - image->control_page = crashk_res.start; - image->type = KEXEC_TYPE_CRASH; + *buf = vmalloc(stat.size); + if (!*buf) { + ret = -ENOMEM; + goto out; + } - /* - * Verify we have good destination addresses. Normally - * the caller is responsible for making certain we don't - * attempt to load the new image into invalid or reserved - * areas of RAM. But crash kernels are preloaded into a - * reserved area of ram. We must ensure the addresses - * are in the reserved area otherwise preloading the - * kernel could corrupt things. - */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(*buf); + ret = bytes; + goto out; + } - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || (mend > crashk_res.end)) - goto out_free; + if (bytes == 0) + break; + pos += bytes; + } + + if (pos != stat.size) { + ret = -EBADF; + vfree(*buf); + goto out; } + *buf_len = pos; +out: + fdput(f); + return ret; +} + +/* Architectures can provide this probe function */ +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -ENOEXEC; +} + +void * __weak arch_kexec_kernel_image_load(struct kimage *image) +{ + return ERR_PTR(-ENOEXEC); +} + +void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) +{ +} + +int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -EKEYREJECTED; +} + +/* Apply relocations of type RELA */ +int __weak +arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("RELA relocation unsupported.\n"); + return -ENOEXEC; +} + +/* Apply relocations of type REL */ +int __weak +arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("REL relocation unsupported.\n"); + return -ENOEXEC; +} + +/* + * Free up memory used by kernel, initrd, and comand line. This is temporary + * memory allocation which is not needed any more after these buffers have + * been loaded into separate segments and have been copied elsewhere. + */ +static void kimage_file_post_load_cleanup(struct kimage *image) +{ + struct purgatory_info *pi = &image->purgatory_info; + + vfree(image->kernel_buf); + image->kernel_buf = NULL; + + vfree(image->initrd_buf); + image->initrd_buf = NULL; + + kfree(image->cmdline_buf); + image->cmdline_buf = NULL; + + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + + vfree(pi->sechdrs); + pi->sechdrs = NULL; + + /* See if architecture has anything to cleanup post load */ + arch_kimage_file_post_load_cleanup(image); + /* - * Find a location for the control code buffer, and add - * the vector of segments so that it's pages will also be - * counted as destination pages. + * Above call should have called into bootloader to free up + * any data stored in kimage->image_loader_data. It should + * be ok now to free it up. */ - result = -ENOMEM; + kfree(image->image_loader_data); + image->image_loader_data = NULL; +} + +/* + * In file mode list of segments is prepared by kernel. Copy relevant + * data from user space, do error checking, prepare segment list + */ +static int +kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, + const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned flags) +{ + int ret = 0; + void *ldata; + + ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, + &image->kernel_buf_len); + if (ret) + return ret; + + /* Call arch image probe handlers */ + ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, + image->kernel_buf_len); + + if (ret) + goto out; + +#ifdef CONFIG_KEXEC_VERIFY_SIG + ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); + if (ret) { + pr_debug("kernel signature verification failed.\n"); + goto out; + } + pr_debug("kernel signature verification successful.\n"); +#endif + /* It is possible that there no initramfs is being loaded */ + if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { + ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, + &image->initrd_buf_len); + if (ret) + goto out; + } + + if (cmdline_len) { + image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); + if (!image->cmdline_buf) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(image->cmdline_buf, cmdline_ptr, + cmdline_len); + if (ret) { + ret = -EFAULT; + goto out; + } + + image->cmdline_buf_len = cmdline_len; + + /* command line should be a string with last byte null */ + if (image->cmdline_buf[cmdline_len - 1] != '\0') { + ret = -EINVAL; + goto out; + } + } + + /* Call arch image load handlers */ + ldata = arch_kexec_kernel_image_load(image); + + if (IS_ERR(ldata)) { + ret = PTR_ERR(ldata); + goto out; + } + + image->image_loader_data = ldata; +out: + /* In case of error, free up all allocated memory in this function */ + if (ret) + kimage_file_post_load_cleanup(image); + return ret; +} + +static int +kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, + int initrd_fd, const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned long flags) +{ + int ret; + struct kimage *image; + bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; + + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->file_mode = 1; + + if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. */ + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } + + ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, + cmdline_ptr, cmdline_len, flags); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_post_load_bufs; + + ret = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_post_load_bufs; + } + + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err(KERN_ERR "Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; return 0; - -out_free: +out_free_control_pages: + kimage_free_page_list(&image->control_pages); +out_free_post_load_bufs: + kimage_file_post_load_cleanup(image); +out_free_image: kfree(image); -out: - return result; + return ret; } +#else /* CONFIG_KEXEC_FILE */ +static inline void kimage_file_post_load_cleanup(struct kimage *image) { } +#endif /* CONFIG_KEXEC_FILE */ static int kimage_is_destination_range(struct kimage *image, unsigned long start, @@ -609,7 +882,7 @@ static void kimage_free_extra_pages(struct kimage *image) kimage_free_page_list(&image->dest_pages); /* Walk through and free any unusable pages I have cached */ - kimage_free_page_list(&image->unuseable_pages); + kimage_free_page_list(&image->unusable_pages); } static void kimage_terminate(struct kimage *image) @@ -663,6 +936,14 @@ static void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + + /* + * Free up any temporary buffers allocated. This might hit if + * error occurred much later after buffer allocation. + */ + if (image->file_mode) + kimage_file_post_load_cleanup(image); + kfree(image); } @@ -732,7 +1013,7 @@ static struct page *kimage_alloc_page(struct kimage *image, /* If the page cannot be used file it away */ if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { - list_add(&page->lru, &image->unuseable_pages); + list_add(&page->lru, &image->unusable_pages); continue; } addr = page_to_pfn(page) << PAGE_SHIFT; @@ -791,10 +1072,14 @@ static int kimage_load_normal_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -826,7 +1111,11 @@ static int kimage_load_normal_segment(struct kimage *image, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); - result = copy_from_user(ptr, buf, uchunk); + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kunmap(page); if (result) { result = -EFAULT; @@ -834,7 +1123,10 @@ static int kimage_load_normal_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -851,10 +1143,14 @@ static int kimage_load_crash_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -877,7 +1173,12 @@ static int kimage_load_crash_segment(struct kimage *image, /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } - result = copy_from_user(ptr, buf, uchunk); + + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); kunmap(page); if (result) { @@ -886,7 +1187,10 @@ static int kimage_load_crash_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -986,16 +1290,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, /* Loading another kernel to reboot into */ if ((flags & KEXEC_ON_CRASH) == 0) - result = kimage_normal_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); /* Loading another kernel to switch to if this one crashes */ else if (flags & KEXEC_ON_CRASH) { /* Free any current crash dump kernel before * we corrupt it. */ kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_crash_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); crash_map_reserved_pages(); } if (result) @@ -1077,6 +1381,85 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, } #endif +#ifdef CONFIG_KEXEC_FILE +SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, + unsigned long, cmdline_len, const char __user *, cmdline_ptr, + unsigned long, flags) +{ + int ret = 0, i; + struct kimage **dest_image, *image; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + return -EPERM; + + /* Make sure we have a legal set of flags */ + if (flags != (flags & KEXEC_FILE_FLAGS)) + return -EINVAL; + + image = NULL; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + dest_image = &kexec_image; + if (flags & KEXEC_FILE_ON_CRASH) + dest_image = &kexec_crash_image; + + if (flags & KEXEC_FILE_UNLOAD) + goto exchange; + + /* + * In case of crash, new kernel gets loaded in reserved region. It is + * same memory where old crash kernel might be loaded. Free any + * current crash dump kernel before we corrupt it. + */ + if (flags & KEXEC_FILE_ON_CRASH) + kimage_free(xchg(&kexec_crash_image, NULL)); + + ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, + cmdline_len, flags); + if (ret) + goto out; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + ret = kexec_calculate_store_digests(image); + if (ret) + goto out; + + for (i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", + i, ksegment->buf, ksegment->bufsz, ksegment->mem, + ksegment->memsz); + + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* + * Free up any temporary buffers allocated which are not needed + * after image has been loaded + */ + kimage_file_post_load_cleanup(image); +exchange: + image = xchg(dest_image, image); +out: + mutex_unlock(&kexec_mutex); + kimage_free(image); + return ret; +} + +#endif /* CONFIG_KEXEC_FILE */ + void crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load @@ -1632,6 +2015,685 @@ static int __init crash_save_vmcoreinfo_init(void) subsys_initcall(crash_save_vmcoreinfo_init); +#ifdef CONFIG_KEXEC_FILE +static int __kexec_add_segment(struct kimage *image, char *buf, + unsigned long bufsz, unsigned long mem, + unsigned long memsz) +{ + struct kexec_segment *ksegment; + + ksegment = &image->segment[image->nr_segments]; + ksegment->kbuf = buf; + ksegment->bufsz = bufsz; + ksegment->mem = mem; + ksegment->memsz = memsz; + image->nr_segments++; + + return 0; +} + +static int locate_mem_hole_top_down(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_end = min(end, kbuf->buf_max); + temp_start = temp_end - kbuf->memsz; + + do { + /* align down start */ + temp_start = temp_start & (~(kbuf->buf_align - 1)); + + if (temp_start < start || temp_start < kbuf->buf_min) + return 0; + + temp_end = temp_start + kbuf->memsz - 1; + + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_start = max(start, kbuf->buf_min); + + do { + temp_start = ALIGN(temp_start, kbuf->buf_align); + temp_end = temp_start + kbuf->memsz - 1; + + if (temp_end > end || temp_end > kbuf->buf_max) + return 0; + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +{ + struct kexec_buf *kbuf = (struct kexec_buf *)arg; + unsigned long sz = end - start + 1; + + /* Returning 0 will take to next memory range */ + if (sz < kbuf->memsz) + return 0; + + if (end < kbuf->buf_min || start > kbuf->buf_max) + return 0; + + /* + * Allocate memory top down with-in ram range. Otherwise bottom up + * allocation. + */ + if (kbuf->top_down) + return locate_mem_hole_top_down(start, end, kbuf); + return locate_mem_hole_bottom_up(start, end, kbuf); +} + +/* + * Helper function for placing a buffer in a kexec segment. This assumes + * that kexec_mutex is held. + */ +int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, + unsigned long memsz, unsigned long buf_align, + unsigned long buf_min, unsigned long buf_max, + bool top_down, unsigned long *load_addr) +{ + + struct kexec_segment *ksegment; + struct kexec_buf buf, *kbuf; + int ret; + + /* Currently adding segment this way is allowed only in file mode */ + if (!image->file_mode) + return -EINVAL; + + if (image->nr_segments >= KEXEC_SEGMENT_MAX) + return -EINVAL; + + /* + * Make sure we are not trying to add buffer after allocating + * control pages. All segments need to be placed first before + * any control pages are allocated. As control page allocation + * logic goes through list of segments to make sure there are + * no destination overlaps. + */ + if (!list_empty(&image->control_pages)) { + WARN_ON(1); + return -EINVAL; + } + + memset(&buf, 0, sizeof(struct kexec_buf)); + kbuf = &buf; + kbuf->image = image; + kbuf->buffer = buffer; + kbuf->bufsz = bufsz; + + kbuf->memsz = ALIGN(memsz, PAGE_SIZE); + kbuf->buf_align = max(buf_align, PAGE_SIZE); + kbuf->buf_min = buf_min; + kbuf->buf_max = buf_max; + kbuf->top_down = top_down; + + /* Walk the RAM ranges and allocate a suitable range for the buffer */ + if (image->type == KEXEC_TYPE_CRASH) + ret = walk_iomem_res("Crash kernel", + IORESOURCE_MEM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); + else + ret = walk_system_ram_res(0, -1, kbuf, + locate_mem_hole_callback); + if (ret != 1) { + /* A suitable memory range could not be found for buffer */ + return -EADDRNOTAVAIL; + } + + /* Found a suitable memory range */ + ksegment = &image->segment[image->nr_segments - 1]; + *load_addr = ksegment->mem; + return 0; +} + +/* Calculate and store the digest of segments */ +static int kexec_calculate_store_digests(struct kimage *image) +{ + struct crypto_shash *tfm; + struct shash_desc *desc; + int ret = 0, i, j, zero_buf_sz, sha_region_sz; + size_t desc_size, nullsz; + char *digest; + void *zero_buf; + struct kexec_sha_region *sha_regions; + struct purgatory_info *pi = &image->purgatory_info; + + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); + zero_buf_sz = PAGE_SIZE; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out; + } + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + desc = kzalloc(desc_size, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto out_free_tfm; + } + + sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); + sha_regions = vzalloc(sha_region_sz); + if (!sha_regions) + goto out_free_desc; + + desc->tfm = tfm; + desc->flags = 0; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto out_free_sha_regions; + + digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!digest) { + ret = -ENOMEM; + goto out_free_sha_regions; + } + + for (j = i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + /* + * Skip purgatory as it will be modified once we put digest + * info in purgatory. + */ + if (ksegment->kbuf == pi->purgatory_buf) + continue; + + ret = crypto_shash_update(desc, ksegment->kbuf, + ksegment->bufsz); + if (ret) + break; + + /* + * Assume rest of the buffer is filled with zero and + * update digest accordingly. + */ + nullsz = ksegment->memsz - ksegment->bufsz; + while (nullsz) { + unsigned long bytes = nullsz; + + if (bytes > zero_buf_sz) + bytes = zero_buf_sz; + ret = crypto_shash_update(desc, zero_buf, bytes); + if (ret) + break; + nullsz -= bytes; + } + + if (ret) + break; + + sha_regions[j].start = ksegment->mem; + sha_regions[j].len = ksegment->memsz; + j++; + } + + if (!ret) { + ret = crypto_shash_final(desc, digest); + if (ret) + goto out_free_digest; + ret = kexec_purgatory_get_set_symbol(image, "sha_regions", + sha_regions, sha_region_sz, 0); + if (ret) + goto out_free_digest; + + ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); + if (ret) + goto out_free_digest; + } + +out_free_digest: + kfree(digest); +out_free_sha_regions: + vfree(sha_regions); +out_free_desc: + kfree(desc); +out_free_tfm: + kfree(tfm); +out: + return ret; +} + +/* Actually load purgatory. Lot of code taken from kexec-tools */ +static int __kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down) +{ + struct purgatory_info *pi = &image->purgatory_info; + unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; + unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; + unsigned char *buf_addr, *src; + int i, ret = 0, entry_sidx = -1; + const Elf_Shdr *sechdrs_c; + Elf_Shdr *sechdrs = NULL; + void *purgatory_buf = NULL; + + /* + * sechdrs_c points to section headers in purgatory and are read + * only. No modifications allowed. + */ + sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; + + /* + * We can not modify sechdrs_c[] and its fields. It is read only. + * Copy it over to a local copy where one can store some temporary + * data and free it at the end. We need to modify ->sh_addr and + * ->sh_offset fields to keep track of permanent and temporary + * locations of sections. + */ + sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + if (!sechdrs) + return -ENOMEM; + + memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + + /* + * We seem to have multiple copies of sections. First copy is which + * is embedded in kernel in read only section. Some of these sections + * will be copied to a temporary buffer and relocated. And these + * sections will finally be copied to their final destination at + * segment load time. + * + * Use ->sh_offset to reflect section address in memory. It will + * point to original read only copy if section is not allocatable. + * Otherwise it will point to temporary copy which will be relocated. + * + * Use ->sh_addr to contain final address of the section where it + * will go during execution time. + */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_NOBITS) + continue; + + sechdrs[i].sh_offset = (unsigned long)pi->ehdr + + sechdrs[i].sh_offset; + } + + /* + * Identify entry point section and make entry relative to section + * start. + */ + entry = pi->ehdr->e_entry; + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) + continue; + + /* Make entry section relative */ + if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && + ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > + pi->ehdr->e_entry)) { + entry_sidx = i; + entry -= sechdrs[i].sh_addr; + break; + } + } + + /* Determine how much memory is needed to load relocatable object. */ + buf_align = 1; + bss_align = 1; + buf_sz = 0; + bss_sz = 0; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + if (buf_align < align) + buf_align = align; + buf_sz = ALIGN(buf_sz, align); + buf_sz += sechdrs[i].sh_size; + } else { + /* bss section */ + if (bss_align < align) + bss_align = align; + bss_sz = ALIGN(bss_sz, align); + bss_sz += sechdrs[i].sh_size; + } + } + + /* Determine the bss padding required to align bss properly */ + bss_pad = 0; + if (buf_sz & (bss_align - 1)) + bss_pad = bss_align - (buf_sz & (bss_align - 1)); + + memsz = buf_sz + bss_pad + bss_sz; + + /* Allocate buffer for purgatory */ + purgatory_buf = vzalloc(buf_sz); + if (!purgatory_buf) { + ret = -ENOMEM; + goto out; + } + + if (buf_align < bss_align) + buf_align = bss_align; + + /* Add buffer to segment list */ + ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, + buf_align, min, max, top_down, + &pi->purgatory_load_addr); + if (ret) + goto out; + + /* Load SHF_ALLOC sections */ + buf_addr = purgatory_buf; + load_addr = curr_load_addr = pi->purgatory_load_addr; + bss_addr = load_addr + buf_sz + bss_pad; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + curr_load_addr = ALIGN(curr_load_addr, align); + offset = curr_load_addr - load_addr; + /* We already modifed ->sh_offset to keep src addr */ + src = (char *) sechdrs[i].sh_offset; + memcpy(buf_addr + offset, src, sechdrs[i].sh_size); + + /* Store load address and source address of section */ + sechdrs[i].sh_addr = curr_load_addr; + + /* + * This section got copied to temporary buffer. Update + * ->sh_offset accordingly. + */ + sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); + + /* Advance to the next address */ + curr_load_addr += sechdrs[i].sh_size; + } else { + bss_addr = ALIGN(bss_addr, align); + sechdrs[i].sh_addr = bss_addr; + bss_addr += sechdrs[i].sh_size; + } + } + + /* Update entry point based on load address of text section */ + if (entry_sidx >= 0) + entry += sechdrs[entry_sidx].sh_addr; + + /* Make kernel jump to purgatory after shutdown */ + image->start = entry; + + /* Used later to get/set symbol values */ + pi->sechdrs = sechdrs; + + /* + * Used later to identify which section is purgatory and skip it + * from checksumming. + */ + pi->purgatory_buf = purgatory_buf; + return ret; +out: + vfree(sechdrs); + vfree(purgatory_buf); + return ret; +} + +static int kexec_apply_relocations(struct kimage *image) +{ + int i, ret; + struct purgatory_info *pi = &image->purgatory_info; + Elf_Shdr *sechdrs = pi->sechdrs; + + /* Apply relocations */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + Elf_Shdr *section, *symtab; + + if (sechdrs[i].sh_type != SHT_RELA && + sechdrs[i].sh_type != SHT_REL) + continue; + + /* + * For section of type SHT_RELA/SHT_REL, + * ->sh_link contains section header index of associated + * symbol table. And ->sh_info contains section header + * index of section to which relocations apply. + */ + if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || + sechdrs[i].sh_link >= pi->ehdr->e_shnum) + return -ENOEXEC; + + section = &sechdrs[sechdrs[i].sh_info]; + symtab = &sechdrs[sechdrs[i].sh_link]; + + if (!(section->sh_flags & SHF_ALLOC)) + continue; + + /* + * symtab->sh_link contain section header index of associated + * string table. + */ + if (symtab->sh_link >= pi->ehdr->e_shnum) + /* Invalid section number? */ + continue; + + /* + * Respective archicture needs to provide support for applying + * relocations of type SHT_RELA/SHT_REL. + */ + if (sechdrs[i].sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi->ehdr, + sechdrs, i); + else if (sechdrs[i].sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi->ehdr, + sechdrs, i); + if (ret) + return ret; + } + + return 0; +} + +/* Load relocatable purgatory object and relocate it appropriately */ +int kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down, + unsigned long *load_addr) +{ + struct purgatory_info *pi = &image->purgatory_info; + int ret; + + if (kexec_purgatory_size <= 0) + return -EINVAL; + + if (kexec_purgatory_size < sizeof(Elf_Ehdr)) + return -ENOEXEC; + + pi->ehdr = (Elf_Ehdr *)kexec_purgatory; + + if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 + || pi->ehdr->e_type != ET_REL + || !elf_check_arch(pi->ehdr) + || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (pi->ehdr->e_shoff >= kexec_purgatory_size + || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > + kexec_purgatory_size - pi->ehdr->e_shoff)) + return -ENOEXEC; + + ret = __kexec_load_purgatory(image, min, max, top_down); + if (ret) + return ret; + + ret = kexec_apply_relocations(image); + if (ret) + goto out; + + *load_addr = pi->purgatory_load_addr; + return 0; +out: + vfree(pi->sechdrs); + vfree(pi->purgatory_buf); + return ret; +} + +static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) +{ + Elf_Sym *syms; + Elf_Shdr *sechdrs; + Elf_Ehdr *ehdr; + int i, k; + const char *strtab; + + if (!pi->sechdrs || !pi->ehdr) + return NULL; + + sechdrs = pi->sechdrs; + ehdr = pi->ehdr; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_SYMTAB) + continue; + + if (sechdrs[i].sh_link >= ehdr->e_shnum) + /* Invalid strtab section number */ + continue; + strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (Elf_Sym *)sechdrs[i].sh_offset; + + /* Go through symbols for a match */ + for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { + if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) + continue; + + if (strcmp(strtab + syms[k].st_name, name) != 0) + continue; + + if (syms[k].st_shndx == SHN_UNDEF || + syms[k].st_shndx >= ehdr->e_shnum) { + pr_debug("Symbol: %s has bad section index %d.\n", + name, syms[k].st_shndx); + return NULL; + } + + /* Found the symbol we are looking for */ + return &syms[k]; + } + } + + return NULL; +} + +void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) +{ + struct purgatory_info *pi = &image->purgatory_info; + Elf_Sym *sym; + Elf_Shdr *sechdr; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return ERR_PTR(-EINVAL); + + sechdr = &pi->sechdrs[sym->st_shndx]; + + /* + * Returns the address where symbol will finally be loaded after + * kexec_load_segment() + */ + return (void *)(sechdr->sh_addr + sym->st_value); +} + +/* + * Get or set value of a symbol. If "get_value" is true, symbol value is + * returned in buf otherwise symbol value is set based on value in buf. + */ +int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, + void *buf, unsigned int size, bool get_value) +{ + Elf_Sym *sym; + Elf_Shdr *sechdrs; + struct purgatory_info *pi = &image->purgatory_info; + char *sym_buf; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return -EINVAL; + + if (sym->st_size != size) { + pr_err("symbol %s size mismatch: expected %lu actual %u\n", + name, (unsigned long)sym->st_size, size); + return -EINVAL; + } + + sechdrs = pi->sechdrs; + + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + pr_err("symbol %s is in a bss section. Cannot %s\n", name, + get_value ? "get" : "set"); + return -EINVAL; + } + + sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + + sym->st_value; + + if (get_value) + memcpy((void *)buf, sym_buf, size); + else + memcpy((void *)sym_buf, buf, size); + + return 0; +} +#endif /* CONFIG_KEXEC_FILE */ + /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 734e9a7..3995f54 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1778,7 +1778,18 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) unsigned long hash, flags = 0; struct kretprobe_instance *ri; - /*TODO: consider to only swap the RA after the last pre_handler fired */ + /* + * To avoid deadlocks, prohibit return probing in NMI contexts, + * just skip the probe and increase the (inexact) 'nmissed' + * statistical counter, so that the user is informed that + * something happened: + */ + if (unlikely(in_nmi())) { + rp->nmissed++; + return 0; + } + + /* TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); raw_spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { diff --git a/kernel/module.c b/kernel/module.c index ae79ce6..8a0dc91 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -135,7 +135,7 @@ static int param_set_bool_enable_only(const char *val, } static const struct kernel_param_ops param_ops_bool_enable_only = { - .flags = KERNEL_PARAM_FL_NOARG, + .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool_enable_only, .get = param_get_bool, }; @@ -3304,6 +3304,11 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_lock(&module_mutex); module_bug_cleanup(mod); mutex_unlock(&module_mutex); + + /* we can't deallocate the module until we clear memory protection */ + unset_module_init_ro_nx(mod); + unset_module_core_ro_nx(mod); + ddebug_cleanup: dynamic_debug_remove(info->debug); synchronize_sched(); @@ -3381,6 +3386,8 @@ static inline int within(unsigned long addr, void *start, unsigned long size) */ static inline int is_arm_mapping_symbol(const char *str) { + if (str[0] == '.' && str[1] == 'L') + return true; return str[0] == '$' && strchr("atd", str[1]) && (str[2] == '\0' || str[2] == '.'); } @@ -3444,8 +3451,7 @@ const char *module_address_lookup(unsigned long addr, list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { + if (within_module(addr, mod)) { if (modname) *modname = mod->name; ret = get_ksymbol(mod, addr, size, offset); @@ -3469,8 +3475,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { + if (within_module(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, NULL, NULL); @@ -3495,8 +3500,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { + if (within_module(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, size, offset); @@ -3760,8 +3764,7 @@ struct module *__module_address(unsigned long addr) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_core(addr, mod) - || within_module_init(addr, mod)) + if (within_module(addr, mod)) return mod; } return NULL; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 8e78110..ef42d0a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) might_sleep(); + task_lock(p); ns = p->nsproxy; + p->nsproxy = new; + task_unlock(p); - rcu_assign_pointer(p->nsproxy, new); - - if (ns && atomic_dec_and_test(&ns->count)) { - /* - * wait for others to get what they want from this nsproxy. - * - * cannot release this nsproxy via the call_rcu() since - * put_mnt_ns() will want to sleep - */ - synchronize_rcu(); + if (ns && atomic_dec_and_test(&ns->count)) free_nsproxy(ns); - } } void exit_task_namespaces(struct task_struct *p) diff --git a/kernel/panic.c b/kernel/panic.c index 62e16ce..d09dc5c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -224,6 +224,7 @@ static const struct tnt tnts[] = { { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, { TAINT_OOT_MODULE, 'O', ' ' }, { TAINT_UNSIGNED_MODULE, 'E', ' ' }, + { TAINT_SOFTLOCKUP, 'L', ' ' }, }; /** diff --git a/kernel/params.c b/kernel/params.c index 34f5270..041b589 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -83,6 +83,15 @@ bool parameq(const char *a, const char *b) return parameqn(a, b, strlen(a)+1); } +static void param_check_unsafe(const struct kernel_param *kp) +{ + if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { + pr_warn("Setting dangerous option %s - tainting kernel\n", + kp->name); + add_taint(TAINT_USER, LOCKDEP_STILL_OK); + } +} + static int parse_one(char *param, char *val, const char *doing, @@ -104,11 +113,12 @@ static int parse_one(char *param, return 0; /* No one handled NULL, so do it here. */ if (!val && - !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG)) + !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG)) return -EINVAL; pr_debug("handling %s with %p\n", param, params[i].ops->set); mutex_lock(¶m_lock); + param_check_unsafe(¶ms[i]); err = params[i].ops->set(val, ¶ms[i]); mutex_unlock(¶m_lock); return err; @@ -318,7 +328,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp) EXPORT_SYMBOL(param_get_bool); struct kernel_param_ops param_ops_bool = { - .flags = KERNEL_PARAM_FL_NOARG, + .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool, .get = param_get_bool, }; @@ -369,7 +379,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp) EXPORT_SYMBOL(param_set_bint); struct kernel_param_ops param_ops_bint = { - .flags = KERNEL_PARAM_FL_NOARG, + .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bint, .get = param_get_int, }; @@ -552,6 +562,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr, return -EPERM; mutex_lock(¶m_lock); + param_check_unsafe(attribute->param); err = attribute->param->ops->set(buf, attribute->param); mutex_unlock(¶m_lock); if (!err) diff --git a/kernel/power/power.h b/kernel/power/power.h index 5d49dca..2df883a 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -179,6 +179,7 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, #ifdef CONFIG_SUSPEND /* kernel/power/suspend.c */ +extern const char *pm_labels[]; extern const char *pm_states[]; extern int suspend_devices_and_enter(suspend_state_t state); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4fc5c32..f1604d8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -725,14 +725,6 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) clear_bit(bit, addr); } -static void memory_bm_clear_current(struct memory_bitmap *bm) -{ - int bit; - - bit = max(bm->cur.node_bit - 1, 0); - clear_bit(bit, bm->cur.node->data); -} - static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; @@ -954,6 +946,25 @@ static void mark_nosave_pages(struct memory_bitmap *bm) } } +static bool is_nosave_page(unsigned long pfn) +{ + struct nosave_region *region; + + list_for_each_entry(region, &nosave_regions, list) { + if (pfn >= region->start_pfn && pfn < region->end_pfn) { + pr_err("PM: %#010llx in e820 nosave region: " + "[mem %#010llx-%#010llx]\n", + (unsigned long long) pfn << PAGE_SHIFT, + (unsigned long long) region->start_pfn << PAGE_SHIFT, + ((unsigned long long) region->end_pfn << PAGE_SHIFT) + - 1); + return true; + } + } + + return false; +} + /** * create_basic_memory_bitmaps - create bitmaps needed for marking page * frames that should not be saved and free page frames. The pointers @@ -1322,35 +1333,23 @@ static struct memory_bitmap copy_bm; void swsusp_free(void) { - unsigned long fb_pfn, fr_pfn; - - memory_bm_position_reset(forbidden_pages_map); - memory_bm_position_reset(free_pages_map); - -loop: - fr_pfn = memory_bm_next_pfn(free_pages_map); - fb_pfn = memory_bm_next_pfn(forbidden_pages_map); - - /* - * Find the next bit set in both bitmaps. This is guaranteed to - * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP. - */ - do { - if (fb_pfn < fr_pfn) - fb_pfn = memory_bm_next_pfn(forbidden_pages_map); - if (fr_pfn < fb_pfn) - fr_pfn = memory_bm_next_pfn(free_pages_map); - } while (fb_pfn != fr_pfn); - - if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { - struct page *page = pfn_to_page(fr_pfn); + struct zone *zone; + unsigned long pfn, max_zone_pfn; - memory_bm_clear_current(forbidden_pages_map); - memory_bm_clear_current(free_pages_map); - __free_page(page); - goto loop; + for_each_populated_zone(zone) { + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + + if (swsusp_page_is_forbidden(page) && + swsusp_page_is_free(page)) { + swsusp_unset_page_forbidden(page); + swsusp_unset_page_free(page); + __free_page(page); + } + } } - nr_copy_pages = 0; nr_meta_pages = 0; restore_pblist = NULL; @@ -2015,7 +2014,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) do { pfn = memory_bm_next_pfn(bm); if (likely(pfn != BM_END_OF_MAP)) { - if (likely(pfn_valid(pfn))) + if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn)) swsusp_set_page_free(pfn_to_page(pfn)); else return -EFAULT; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6dadb25..18c6219 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -31,7 +31,7 @@ #include "power.h" -static const char *pm_labels[] = { "mem", "standby", "freeze", }; +const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; const char *pm_states[PM_SUSPEND_MAX]; static const struct platform_suspend_ops *suspend_ops; diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 2f524928..bd91bc1 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -129,20 +129,20 @@ static int __init has_wakealarm(struct device *dev, const void *data) * at startup time. They're normally disabled, for faster boot and because * we can't know which states really work on this particular system. */ -static suspend_state_t test_state __initdata = PM_SUSPEND_ON; +static const char *test_state_label __initdata; static char warn_bad_state[] __initdata = KERN_WARNING "PM: can't test '%s' suspend state\n"; static int __init setup_test_suspend(char *value) { - suspend_state_t i; + int i; /* "=mem" ==> "mem" */ value++; - for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) - if (!strcmp(pm_states[i], value)) { - test_state = i; + for (i = 0; pm_labels[i]; i++) + if (!strcmp(pm_labels[i], value)) { + test_state_label = pm_labels[i]; return 0; } @@ -158,13 +158,21 @@ static int __init test_suspend(void) struct rtc_device *rtc = NULL; struct device *dev; + suspend_state_t test_state; /* PM is initialized by now; is that state testable? */ - if (test_state == PM_SUSPEND_ON) - goto done; - if (!pm_states[test_state]) { - printk(warn_bad_state, pm_states[test_state]); - goto done; + if (!test_state_label) + return 0; + + for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) { + const char *state_label = pm_states[test_state]; + + if (state_label && !strcmp(test_state_label, state_label)) + break; + } + if (test_state == PM_SUSPEND_MAX) { + printk(warn_bad_state, test_state_label); + return 0; } /* RTCs have initialized by now too ... can we use one? */ @@ -173,13 +181,12 @@ static int __init test_suspend(void) rtc = rtc_class_open(dev_name(dev)); if (!rtc) { printk(warn_no_rtc); - goto done; + return 0; } /* go for it */ test_wakealarm(rtc, test_state); rtc_class_close(rtc); -done: return 0; } late_initcall(test_suspend); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 13e839d..1ce7706 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -45,6 +45,7 @@ #include <linux/poll.h> #include <linux/irq_work.h> #include <linux/utsname.h> +#include <linux/ctype.h> #include <asm/uaccess.h> @@ -56,7 +57,7 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ - DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ + MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; @@ -113,9 +114,9 @@ static int __down_trylock_console_sem(unsigned long ip) * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ - * hold it are racing, but it helps tracking those weird code - * path in the console code where we end up in places I want - * locked without the console sempahore held + * hold it and are racing, but it helps tracking those weird code + * paths in the console code where we end up in places I want + * locked without the console sempahore held). */ static int console_locked, console_suspended; @@ -146,8 +147,8 @@ static int console_may_schedule; * the overall length of the record. * * The heads to the first and last entry in the buffer, as well as the - * sequence numbers of these both entries are maintained when messages - * are stored.. + * sequence numbers of these entries are maintained when messages are + * stored. * * If the heads indicate available messages, the length in the header * tells the start next message. A length == 0 for the next message @@ -257,7 +258,7 @@ static u64 clear_seq; static u32 clear_idx; #define PREFIX_MAX 32 -#define LOG_LINE_MAX 1024 - PREFIX_MAX +#define LOG_LINE_MAX (1024 - PREFIX_MAX) /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) @@ -266,10 +267,23 @@ static u32 clear_idx; #define LOG_ALIGN __alignof__(struct printk_log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; +/* Return log buffer address */ +char *log_buf_addr_get(void) +{ + return log_buf; +} + +/* Return log buffer size */ +u32 log_buf_len_get(void) +{ + return log_buf_len; +} + /* human readable text of the record */ static char *log_text(const struct printk_log *msg) { @@ -344,7 +358,7 @@ static int log_make_free_space(u32 msg_size) while (log_first_seq < log_next_seq) { if (logbuf_has_space(msg_size, false)) return 0; - /* drop old messages until we have enough continuous space */ + /* drop old messages until we have enough contiguous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } @@ -453,11 +467,7 @@ static int log_store(int facility, int level, return msg->text_len; } -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif +int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { @@ -828,34 +838,74 @@ void log_buf_kexec_setup(void) /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; -/* save requested log_buf_len since it's too early to process it */ -static int __init log_buf_len_setup(char *str) +/* we practice scaling the ring buffer by powers of 2 */ +static void __init log_buf_len_update(unsigned size) { - unsigned size = memparse(str, &str); - if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = size; +} + +/* save requested log_buf_len since it's too early to process it */ +static int __init log_buf_len_setup(char *str) +{ + unsigned size = memparse(str, &str); + + log_buf_len_update(size); return 0; } early_param("log_buf_len", log_buf_len_setup); +static void __init log_buf_add_cpu(void) +{ + unsigned int cpu_extra; + + /* + * archs should set up cpu_possible_bits properly with + * set_cpu_possible() after setup_arch() but just in + * case lets ensure this is valid. + */ + if (num_possible_cpus() == 1) + return; + + cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; + + /* by default this will only continue through for large > 64 CPUs */ + if (cpu_extra <= __LOG_BUF_LEN / 2) + return; + + pr_info("log_buf_len individual max cpu contribution: %d bytes\n", + __LOG_CPU_MAX_BUF_LEN); + pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", + cpu_extra); + pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); + + log_buf_len_update(cpu_extra + __LOG_BUF_LEN); +} + void __init setup_log_buf(int early) { unsigned long flags; char *new_log_buf; int free; + if (log_buf != __log_buf) + return; + + if (!early && !new_log_buf_len) + log_buf_add_cpu(); + if (!new_log_buf_len) return; if (early) { new_log_buf = - memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); + memblock_virt_alloc(new_log_buf_len, LOG_ALIGN); } else { - new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); + new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, + LOG_ALIGN); } if (unlikely(!new_log_buf)) { @@ -872,7 +922,7 @@ void __init setup_log_buf(int early) memcpy(log_buf, __log_buf, __LOG_BUF_LEN); raw_spin_unlock_irqrestore(&logbuf_lock, flags); - pr_info("log_buf_len: %d\n", log_buf_len); + pr_info("log_buf_len: %d bytes\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", free, (free * 100) / __LOG_BUF_LEN); } @@ -881,7 +931,7 @@ static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { - ignore_loglevel = 1; + ignore_loglevel = true; pr_info("debug: ignoring loglevel setting.\n"); return 0; @@ -947,11 +997,7 @@ static inline void boot_delay_msec(int level) } #endif -#if defined(CONFIG_PRINTK_TIME) -static bool printk_time = 1; -#else -static bool printk_time; -#endif +static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_time(u64 ts, char *buf) @@ -1310,7 +1356,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) * for pending data, not the size; return the count of * records, not the length. */ - error = log_next_idx - syslog_idx; + error = log_next_seq - syslog_seq; } else { u64 seq = syslog_seq; u32 idx = syslog_idx; @@ -1416,10 +1462,9 @@ static int have_callable_console(void) /* * Can we actually use the console at this time on this cpu? * - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up. */ static inline int can_use_console(unsigned int cpu) { @@ -1432,8 +1477,10 @@ static inline int can_use_console(unsigned int cpu) * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. */ -static int console_trylock_for_printk(unsigned int cpu) +static int console_trylock_for_printk(void) { + unsigned int cpu = smp_processor_id(); + if (!console_trylock()) return 0; /* @@ -1476,7 +1523,7 @@ static struct cont { struct task_struct *owner; /* task of first print*/ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ - u8 facility; /* log level of first message */ + u8 facility; /* log facility of first message */ enum log_flags flags; /* prefix, newline flags */ bool flushed:1; /* buffer sealed and committed */ } cont; @@ -1608,7 +1655,8 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = 1; - goto out_restore_irqs; + local_irq_restore(flags); + return 0; } zap_locks(); } @@ -1617,15 +1665,15 @@ asmlinkage int vprintk_emit(int facility, int level, raw_spin_lock(&logbuf_lock); logbuf_cpu = this_cpu; - if (recursion_bug) { + if (unlikely(recursion_bug)) { static const char recursion_msg[] = "BUG: recent printk recursion!"; recursion_bug = 0; - text_len = strlen(recursion_msg); /* emit KERN_CRIT message */ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, recursion_msg, text_len); + NULL, 0, recursion_msg, + strlen(recursion_msg)); } /* @@ -1716,21 +1764,30 @@ asmlinkage int vprintk_emit(int facility, int level, logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); + lockdep_on(); + local_irq_restore(flags); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { + lockdep_off(); + /* + * Disable preemption to avoid being preempted while holding + * console_sem which would prevent anyone from printing to + * console + */ + preempt_disable(); + /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock_for_printk(this_cpu)) + if (console_trylock_for_printk()) console_unlock(); + preempt_enable(); + lockdep_on(); } - lockdep_on(); -out_restore_irqs: - local_irq_restore(flags); return printed_len; } EXPORT_SYMBOL(vprintk_emit); @@ -1802,7 +1859,7 @@ EXPORT_SYMBOL(printk); #define LOG_LINE_MAX 0 #define PREFIX_MAX 0 -#define LOG_LINE_MAX 0 + static u64 syslog_seq; static u32 syslog_idx; static u64 console_seq; @@ -1881,11 +1938,12 @@ static int __add_preferred_console(char *name, int idx, char *options, return 0; } /* - * Set up a list of consoles. Called from init/main.c + * Set up a console. Called via do_early_param() in init/main.c + * for each "console=" parameter in the boot command line. */ static int __init console_setup(char *str) { - char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ + char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */ char *s, *options, *brl_options = NULL; int idx; @@ -1902,7 +1960,8 @@ static int __init console_setup(char *str) strncpy(buf, str, sizeof(buf) - 1); } buf[sizeof(buf) - 1] = 0; - if ((options = strchr(str, ',')) != NULL) + options = strchr(str, ','); + if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) @@ -1911,7 +1970,7 @@ static int __init console_setup(char *str) strcpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) - if ((*s >= '0' && *s <= '9') || *s == ',') + if (isdigit(*s) || *s == ',') break; idx = simple_strtoul(s, NULL, 10); *s = 0; @@ -1950,7 +2009,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha i++, c++) if (strcmp(c->name, name) == 0 && c->index == idx) { strlcpy(c->name, name_new, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; c->options = options; c->index = idx_new; return i; @@ -1959,12 +2017,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha return -1; } -bool console_suspend_enabled = 1; +bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) { - console_suspend_enabled = 0; + console_suspend_enabled = false; return 1; } __setup("no_console_suspend", console_suspend_disable); @@ -2045,8 +2103,8 @@ EXPORT_SYMBOL(console_lock); /** * console_trylock - try to lock the console system for exclusive use. * - * Tried to acquire a lock which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. + * Try to acquire a lock which guarantees that the caller has exclusive + * access to the console system and the console_drivers list. * * returns 1 on success, and 0 on failure to acquire the lock. */ @@ -2618,14 +2676,13 @@ EXPORT_SYMBOL(__printk_ratelimit); bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { - if (*caller_jiffies == 0 - || !time_in_range(jiffies, *caller_jiffies, - *caller_jiffies - + msecs_to_jiffies(interval_msecs))) { - *caller_jiffies = jiffies; - return true; - } - return false; + unsigned long elapsed = jiffies - *caller_jiffies; + + if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) + return false; + + *caller_jiffies = jiffies; + return true; } EXPORT_SYMBOL(printk_timed_ratelimit); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 71e64c7..6a86eb7 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -358,7 +358,7 @@ struct rcu_data { struct rcu_head **nocb_gp_tail; long nocb_gp_count; long nocb_gp_count_lazy; - bool nocb_leader_wake; /* Is the nocb leader thread awake? */ + bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ struct rcu_data *nocb_next_follower; /* Next follower in wakeup chain. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 00dc411..a7997e2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2074,9 +2074,9 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) return; - if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) { + if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior xchg orders against prior callback enqueue. */ - ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true; + ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false; wake_up(&rdp_leader->nocb_wq); } } @@ -2253,7 +2253,7 @@ wait_again: if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); wait_event_interruptible(my_rdp->nocb_wq, - ACCESS_ONCE(my_rdp->nocb_leader_wake)); + !ACCESS_ONCE(my_rdp->nocb_leader_sleep)); /* Memory barrier handled by smp_mb() calls below and repoll. */ } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ @@ -2292,12 +2292,12 @@ wait_again: schedule_timeout_interruptible(1); /* Rescan in case we were a victim of memory ordering. */ - my_rdp->nocb_leader_wake = false; - smp_mb(); /* Ensure _wake false before scan. */ + my_rdp->nocb_leader_sleep = true; + smp_mb(); /* Ensure _sleep true before scan. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) if (ACCESS_ONCE(rdp->nocb_head)) { /* Found CB, so short-circuit next wait. */ - my_rdp->nocb_leader_wake = true; + my_rdp->nocb_leader_sleep = false; break; } goto wait_again; @@ -2307,17 +2307,17 @@ wait_again: rcu_nocb_wait_gp(my_rdp); /* - * We left ->nocb_leader_wake set to reduce cache thrashing. - * We clear it now, but recheck for new callbacks while + * We left ->nocb_leader_sleep unset to reduce cache thrashing. + * We set it now, but recheck for new callbacks while * traversing our follower list. */ - my_rdp->nocb_leader_wake = false; - smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */ + my_rdp->nocb_leader_sleep = true; + smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */ /* Each pass through the following loop wakes a follower, if needed. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { if (ACCESS_ONCE(rdp->nocb_head)) - my_rdp->nocb_leader_wake = true; /* No need to wait. */ + my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ if (!rdp->nocb_gp_head) continue; /* No CBs, so no need to wake follower. */ diff --git a/kernel/resource.c b/kernel/resource.c index 3c2237a..60c5a38 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock); static struct resource *bootmem_resource_free; static DEFINE_SPINLOCK(bootmem_resource_lock); -static void *r_next(struct seq_file *m, void *v, loff_t *pos) +static struct resource *next_resource(struct resource *p, bool sibling_only) { - struct resource *p = v; - (*pos)++; + /* Caller wants to traverse through siblings only */ + if (sibling_only) + return p->sibling; + if (p->child) return p->child; while (!p->sibling && p->parent) @@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) return p->sibling; } +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct resource *p = v; + (*pos)++; + return (void *)next_resource(p, false); +} + #ifdef CONFIG_PROC_FS enum { MAX_IORES_LEVEL = 5 }; @@ -322,16 +331,19 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* - * Finds the lowest memory reosurce exists within [res->start.res->end) + * Finds the lowest iomem reosurce exists with-in [res->start.res->end) * the caller must specify res->start, res->end, res->flags and "name". * If found, returns 0, res is overwritten, if not found, returns -1. + * This walks through whole tree and not just first level children + * until and unless first_level_children_only is true. */ -static int find_next_system_ram(struct resource *res, char *name) +static int find_next_iomem_res(struct resource *res, char *name, + bool first_level_children_only) { resource_size_t start, end; struct resource *p; + bool sibling_only = false; BUG_ON(!res); @@ -339,9 +351,12 @@ static int find_next_system_ram(struct resource *res, char *name) end = res->end; BUG_ON(start >= end); + if (first_level_children_only) + sibling_only = true; + read_lock(&resource_lock); - for (p = iomem_resource.child; p ; p = p->sibling) { - /* system ram is just marked as IORESOURCE_MEM */ + + for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { if (p->flags != res->flags) continue; if (name && strcmp(p->name, name)) @@ -353,6 +368,7 @@ static int find_next_system_ram(struct resource *res, char *name) if ((p->end >= start) && (p->start < end)) break; } + read_unlock(&resource_lock); if (!p) return -1; @@ -365,6 +381,70 @@ static int find_next_system_ram(struct resource *res, char *name) } /* + * Walks through iomem resources and calls func() with matching resource + * ranges. This walks through whole tree and not just first level children. + * All the memory ranges which overlap start,end and also match flags and + * name are valid candidates. + * + * @name: name of resource + * @flags: resource flags + * @start: start addr + * @end: end addr + */ +int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, + void *arg, int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = flags; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, name, false))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +/* + * This function calls callback against all memory range of "System RAM" + * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. + * Now, this function is only for "System RAM". This function deals with + * full ranges and not pfn. If resources are not pfn aligned, dealing + * with pfn can truncate ranges. + */ +int walk_system_ram_res(u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, "System RAM", true))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) + +/* * This function calls callback against all memory range of "System RAM" * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. * Now, this function is only for "System RAM". @@ -382,7 +462,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_system_ram(&res, "System RAM") >= 0)) { + (find_next_iomem_res(&res, "System RAM", true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1211575..ec1a286 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2393,6 +2393,13 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } +void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) +{ + struct rq *this = this_rq(); + *nr_waiters = atomic_read(&this->nr_iowait); + *load = this->cpu_load[0]; +} + #ifdef CONFIG_SMP /* diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index 16f5a30..8ecd552 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -8,13 +8,6 @@ #include "sched.h" -unsigned long this_cpu_load(void) -{ - struct rq *this = this_rq(); - return this->cpu_load[0]; -} - - /* * Global load-average calculations * diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 25b0043..44eb005 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -203,7 +203,7 @@ static u32 seccomp_run_filters(int syscall) static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) { - BUG_ON(!spin_is_locked(¤t->sighand->siglock)); + assert_spin_locked(¤t->sighand->siglock); if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) return false; @@ -214,7 +214,7 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) static inline void seccomp_assign_mode(struct task_struct *task, unsigned long seccomp_mode) { - BUG_ON(!spin_is_locked(&task->sighand->siglock)); + assert_spin_locked(&task->sighand->siglock); task->seccomp.mode = seccomp_mode; /* @@ -253,7 +253,7 @@ static inline pid_t seccomp_can_sync_threads(void) struct task_struct *thread, *caller; BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); - BUG_ON(!spin_is_locked(¤t->sighand->siglock)); + assert_spin_locked(¤t->sighand->siglock); /* Validate all threads being eligible for synchronization. */ caller = current; @@ -294,7 +294,7 @@ static inline void seccomp_sync_threads(void) struct task_struct *thread, *caller; BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); - BUG_ON(!spin_is_locked(¤t->sighand->siglock)); + assert_spin_locked(¤t->sighand->siglock); /* Synchronize all threads. */ caller = current; @@ -464,7 +464,7 @@ static long seccomp_attach_filter(unsigned int flags, unsigned long total_insns; struct seccomp_filter *walker; - BUG_ON(!spin_is_locked(¤t->sighand->siglock)); + assert_spin_locked(¤t->sighand->siglock); /* Validate resulting filter length. */ total_insns = filter->prog->len; diff --git a/kernel/signal.c b/kernel/signal.c index 40b76e3..8f0876f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2170,8 +2170,7 @@ static int ptrace_signal(int signr, siginfo_t *info) return signr; } -int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, - struct pt_regs *regs, void *cookie) +int get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; @@ -2241,13 +2240,13 @@ relock: goto relock; } - signr = dequeue_signal(current, ¤t->blocked, info); + signr = dequeue_signal(current, ¤t->blocked, &ksig->info); if (!signr) break; /* will return 0 */ if (unlikely(current->ptrace) && signr != SIGKILL) { - signr = ptrace_signal(signr, info); + signr = ptrace_signal(signr, &ksig->info); if (!signr) continue; } @@ -2255,13 +2254,13 @@ relock: ka = &sighand->action[signr-1]; /* Trace actually delivered signals. */ - trace_signal_deliver(signr, info, ka); + trace_signal_deliver(signr, &ksig->info, ka); if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { /* Run the handler. */ - *return_ka = *ka; + ksig->ka = *ka; if (ka->sa.sa_flags & SA_ONESHOT) ka->sa.sa_handler = SIG_DFL; @@ -2311,7 +2310,7 @@ relock: spin_lock_irq(&sighand->siglock); } - if (likely(do_signal_stop(info->si_signo))) { + if (likely(do_signal_stop(ksig->info.si_signo))) { /* It released the siglock. */ goto relock; } @@ -2332,7 +2331,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) - print_fatal_signal(info->si_signo); + print_fatal_signal(ksig->info.si_signo); proc_coredump_connector(current); /* * If it was able to dump core, this kills all @@ -2342,34 +2341,32 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump(info); + do_coredump(&ksig->info); } /* * Death signals, no core dump. */ - do_group_exit(info->si_signo); + do_group_exit(ksig->info.si_signo); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); - return signr; + + ksig->sig = signr; + return ksig->sig > 0; } /** * signal_delivered - - * @sig: number of signal being delivered - * @info: siginfo_t of signal being delivered - * @ka: sigaction setting that chose the handler - * @regs: user register state + * @ksig: kernel signal struct * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has successfully been - * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask + * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask * is always blocked, and the signal itself is blocked unless %SA_NODEFER - * is set in @ka->sa.sa_flags. Tracing is notified. + * is set in @ksig->ka.sa.sa_flags. Tracing is notified. */ -void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, - struct pt_regs *regs, int stepping) +static void signal_delivered(struct ksignal *ksig, int stepping) { sigset_t blocked; @@ -2379,11 +2376,11 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, simply clear the restore sigmask flag. */ clear_restore_sigmask(); - sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, sig); + sigorsets(&blocked, ¤t->blocked, &ksig->ka.sa.sa_mask); + if (!(ksig->ka.sa.sa_flags & SA_NODEFER)) + sigaddset(&blocked, ksig->sig); set_current_blocked(&blocked); - tracehook_signal_handler(sig, info, ka, regs, stepping); + tracehook_signal_handler(stepping); } void signal_setup_done(int failed, struct ksignal *ksig, int stepping) @@ -2391,8 +2388,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping) if (failed) force_sigsegv(ksig->sig, current); else - signal_delivered(ksig->sig, &ksig->info, &ksig->ka, - signal_pt_regs(), stepping); + signal_delivered(ksig, stepping); } /* diff --git a/kernel/smp.c b/kernel/smp.c index 487653b..aff8aa1 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -670,7 +670,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), if (cond_func(cpu, info)) { ret = smp_call_function_single(cpu, func, info, wait); - WARN_ON_ONCE(!ret); + WARN_ON_ONCE(ret); } preempt_enable(); } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2904a21..d4709d4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -25,6 +25,7 @@ cond_syscall(sys_swapon); cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); cond_syscall(compat_sys_kexec_load); +cond_syscall(sys_kexec_file_load); cond_syscall(sys_init_module); cond_syscall(sys_finit_module); cond_syscall(sys_delete_module); @@ -155,6 +156,9 @@ cond_syscall(sys_process_vm_writev); cond_syscall(compat_sys_process_vm_readv); cond_syscall(compat_sys_process_vm_writev); cond_syscall(sys_uselib); +cond_syscall(sys_fadvise64); +cond_syscall(sys_fadvise64_64); +cond_syscall(sys_madvise); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); @@ -197,6 +201,7 @@ cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); +cond_syscall(sys_memfd_create); /* performance counters: */ cond_syscall(sys_perf_event_open); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75b22e2..75875a7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1240,8 +1240,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #ifdef CONFIG_NUMA { @@ -1250,8 +1249,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #endif { @@ -1274,8 +1272,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_overcommit_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #endif { diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index e4ba9a5..9a4f750 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -390,7 +390,6 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, - { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 12d6ebbf..0dbab6d 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -14,6 +14,8 @@ * the GNU General Public License for more details. */ +#define pr_fmt(fmt) "Kprobe smoke test: " fmt + #include <linux/kernel.h> #include <linux/kprobes.h> #include <linux/random.h> @@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, { if (preh_val != (rand1 / div_factor)) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler\n"); + pr_err("incorrect value in post_handler\n"); } posth_val = preh_val + div_factor; } @@ -59,8 +60,7 @@ static int test_kprobe(void) ret = register_kprobe(&kp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobe returned %d\n", ret); + pr_err("register_kprobe returned %d\n", ret); return ret; } @@ -68,14 +68,12 @@ static int test_kprobe(void) unregister_kprobe(&kp); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); + pr_err("kprobe pre_handler not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); + pr_err("kprobe post_handler not called\n"); handler_errors++; } @@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, { if (preh_val != (rand1 / div_factor) + 1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler2\n"); + pr_err("incorrect value in post_handler2\n"); } posth_val = preh_val + div_factor; } @@ -120,8 +117,7 @@ static int test_kprobes(void) kp.flags = 0; ret = register_kprobes(kps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobes returned %d\n", ret); + pr_err("register_kprobes returned %d\n", ret); return ret; } @@ -130,14 +126,12 @@ static int test_kprobes(void) ret = target(rand1); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); + pr_err("kprobe pre_handler not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); + pr_err("kprobe post_handler not called\n"); handler_errors++; } @@ -146,14 +140,12 @@ static int test_kprobes(void) ret = target2(rand1); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler2 not called\n"); + pr_err("kprobe pre_handler2 not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler2 not called\n"); + pr_err("kprobe post_handler2 not called\n"); handler_errors++; } @@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value) { if (value != rand1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in jprobe handler\n"); + pr_err("incorrect value in jprobe handler\n"); } jph_val = rand1; @@ -186,16 +177,14 @@ static int test_jprobe(void) ret = register_jprobe(&jp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobe returned %d\n", ret); + pr_err("register_jprobe returned %d\n", ret); return ret; } ret = target(rand1); unregister_jprobe(&jp); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); + pr_err("jprobe handler not called\n"); handler_errors++; } @@ -217,24 +206,21 @@ static int test_jprobes(void) jp.kp.flags = 0; ret = register_jprobes(jps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobes returned %d\n", ret); + pr_err("register_jprobes returned %d\n", ret); return ret; } jph_val = 0; ret = target(rand1); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); + pr_err("jprobe handler not called\n"); handler_errors++; } jph_val = 0; ret = target2(rand1); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler2 not called\n"); + pr_err("jprobe handler2 not called\n"); handler_errors++; } unregister_jprobes(jps, 2); @@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) if (ret != (rand1 / div_factor)) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler\n"); + pr_err("incorrect value in kretprobe handler\n"); } if (krph_val == 0) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); + pr_err("call to kretprobe entry handler failed\n"); } krph_val = rand1; @@ -281,16 +265,14 @@ static int test_kretprobe(void) ret = register_kretprobe(&rp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); + pr_err("register_kretprobe returned %d\n", ret); return ret; } ret = target(rand1); unregister_kretprobe(&rp); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); + pr_err("kretprobe handler not called\n"); handler_errors++; } @@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) if (ret != (rand1 / div_factor) + 1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler2\n"); + pr_err("incorrect value in kretprobe handler2\n"); } if (krph_val == 0) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); + pr_err("call to kretprobe entry handler failed\n"); } krph_val = rand1; @@ -332,24 +312,21 @@ static int test_kretprobes(void) rp.kp.flags = 0; ret = register_kretprobes(rps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); + pr_err("register_kretprobe returned %d\n", ret); return ret; } krph_val = 0; ret = target(rand1); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); + pr_err("kretprobe handler not called\n"); handler_errors++; } krph_val = 0; ret = target2(rand1); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler2 not called\n"); + pr_err("kretprobe handler2 not called\n"); handler_errors++; } unregister_kretprobes(rps, 2); @@ -368,7 +345,7 @@ int init_test_probes(void) rand1 = prandom_u32(); } while (rand1 <= div_factor); - printk(KERN_INFO "Kprobe smoke test started\n"); + pr_info("started\n"); num_tests++; ret = test_kprobe(); if (ret < 0) @@ -402,13 +379,11 @@ int init_test_probes(void) #endif /* CONFIG_KRETPROBES */ if (errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " - "%d tests failed\n", errors, num_tests); + pr_err("BUG: %d out of %d tests failed\n", errors, num_tests); else if (handler_errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " - "running handlers\n", handler_errors); + pr_err("BUG: %d error(s) running handlers\n", handler_errors); else - printk(KERN_INFO "Kprobe smoke test passed successfully\n"); + pr_info("passed successfully\n"); return 0; } diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 4aec4a4..a7077d3 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -464,18 +464,26 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, ktime_t now) { + unsigned long flags; struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer); - if (posix_timer_event(ptr, 0) != 0) - ptr->it_overrun++; + enum alarmtimer_restart result = ALARMTIMER_NORESTART; + + spin_lock_irqsave(&ptr->it_lock, flags); + if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { + if (posix_timer_event(ptr, 0) != 0) + ptr->it_overrun++; + } /* Re-add periodic timers */ if (ptr->it.alarm.interval.tv64) { ptr->it_overrun += alarm_forward(alarm, now, ptr->it.alarm.interval); - return ALARMTIMER_RESTART; + result = ALARMTIMER_RESTART; } - return ALARMTIMER_NORESTART; + spin_unlock_irqrestore(&ptr->it_lock, flags); + + return result; } /** @@ -541,18 +549,22 @@ static int alarm_timer_create(struct k_itimer *new_timer) * @new_timer: k_itimer pointer * @cur_setting: itimerspec data to fill * - * Copies the itimerspec data out from the k_itimer + * Copies out the current itimerspec data */ static void alarm_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) { - memset(cur_setting, 0, sizeof(struct itimerspec)); + ktime_t relative_expiry_time = + alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); + + if (ktime_to_ns(relative_expiry_time) > 0) { + cur_setting->it_value = ktime_to_timespec(relative_expiry_time); + } else { + cur_setting->it_value.tv_sec = 0; + cur_setting->it_value.tv_nsec = 0; + } - cur_setting->it_interval = - ktime_to_timespec(timr->it.alarm.interval); - cur_setting->it_value = - ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires); - return; + cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval); } /** diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99aa6ee..f654a8a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -225,6 +225,20 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { }; /* + * Kick this CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), + * is NMI safe. + */ +void tick_nohz_full_kick(void) +{ + if (!tick_nohz_full_cpu(smp_processor_id())) + return; + + irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); +} + +/* * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. */ diff --git a/kernel/time/time.c b/kernel/time/time.c index f0294ba..a9ae20f 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -559,17 +559,20 @@ EXPORT_SYMBOL(usecs_to_jiffies); * that a remainder subtract here would not do the right thing as the * resolution values don't fall on second boundries. I.e. the line: * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * Note that due to the small error in the multiplier here, this + * rounding is incorrect for sufficiently large values of tv_nsec, but + * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're + * OK. * * Rather, we just shift the bits off the right. * * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec * value to a scaled second value. */ -unsigned long -timespec_to_jiffies(const struct timespec *value) +static unsigned long +__timespec_to_jiffies(unsigned long sec, long nsec) { - unsigned long sec = value->tv_sec; - long nsec = value->tv_nsec + TICK_NSEC - 1; + nsec = nsec + TICK_NSEC - 1; if (sec >= MAX_SEC_IN_JIFFIES){ sec = MAX_SEC_IN_JIFFIES; @@ -580,6 +583,13 @@ timespec_to_jiffies(const struct timespec *value) (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } + +unsigned long +timespec_to_jiffies(const struct timespec *value) +{ + return __timespec_to_jiffies(value->tv_sec, value->tv_nsec); +} + EXPORT_SYMBOL(timespec_to_jiffies); void @@ -596,31 +606,27 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) } EXPORT_SYMBOL(jiffies_to_timespec); -/* Same for "timeval" - * - * Well, almost. The problem here is that the real system resolution is - * in nanoseconds and the value being converted is in micro seconds. - * Also for some machines (those that use HZ = 1024, in-particular), - * there is a LARGE error in the tick size in microseconds. - - * The solution we use is to do the rounding AFTER we convert the - * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. - * Instruction wise, this should cost only an additional add with carry - * instruction above the way it was done above. +/* + * We could use a similar algorithm to timespec_to_jiffies (with a + * different multiplier for usec instead of nsec). But this has a + * problem with rounding: we can't exactly add TICK_NSEC - 1 to the + * usec value, since it's not necessarily integral. + * + * We could instead round in the intermediate scaled representation + * (i.e. in units of 1/2^(large scale) jiffies) but that's also + * perilous: the scaling introduces a small positive error, which + * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1 + * units to the intermediate before shifting) leads to accidental + * overflow and overestimates. + * + * At the cost of one additional multiplication by a constant, just + * use the timespec implementation. */ unsigned long timeval_to_jiffies(const struct timeval *value) { - unsigned long sec = value->tv_sec; - long usec = value->tv_usec; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - usec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> - (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + return __timespec_to_jiffies(value->tv_sec, + value->tv_usec * NSEC_PER_USEC); } EXPORT_SYMBOL(timeval_to_jiffies); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f36b028..ec1791f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -338,10 +338,11 @@ EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); static inline void update_vsyscall(struct timekeeper *tk) { - struct timespec xt; + struct timespec xt, wm; xt = timespec64_to_timespec(tk_xtime(tk)); - update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult, + wm = timespec64_to_timespec(tk->wall_to_monotonic); + update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, tk->tkr.cycle_last); } @@ -441,11 +442,12 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) tk->ntp_error = 0; ntp_clear(); } - update_vsyscall(tk); - update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); tk_update_ktime_data(tk); + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + if (action & TK_MIRROR) memcpy(&shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1654b12..5916a8e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -65,15 +65,21 @@ #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) #ifdef CONFIG_DYNAMIC_FTRACE -#define INIT_REGEX_LOCK(opsname) \ - .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock), +#define INIT_OPS_HASH(opsname) \ + .func_hash = &opsname.local_hash, \ + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), +#define ASSIGN_OPS_HASH(opsname, val) \ + .func_hash = val, \ + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), #else -#define INIT_REGEX_LOCK(opsname) +#define INIT_OPS_HASH(opsname) +#define ASSIGN_OPS_HASH(opsname, val) #endif static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, + INIT_OPS_HASH(ftrace_list_end) }; /* ftrace_enabled is a method to turn ftrace on or off */ @@ -140,7 +146,8 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) { #ifdef CONFIG_DYNAMIC_FTRACE if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { - mutex_init(&ops->regex_lock); + mutex_init(&ops->local_hash.regex_lock); + ops->func_hash = &ops->local_hash; ops->flags |= FTRACE_OPS_FL_INITIALIZED; } #endif @@ -899,7 +906,7 @@ static void unregister_ftrace_profiler(void) static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(ftrace_profile_ops) + INIT_OPS_HASH(ftrace_profile_ops) }; static int register_ftrace_profiler(void) @@ -1081,11 +1088,12 @@ static const struct ftrace_hash empty_hash = { #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) static struct ftrace_ops global_ops = { - .func = ftrace_stub, - .notrace_hash = EMPTY_HASH, - .filter_hash = EMPTY_HASH, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(global_ops) + .func = ftrace_stub, + .local_hash.notrace_hash = EMPTY_HASH, + .local_hash.filter_hash = EMPTY_HASH, + INIT_OPS_HASH(global_ops) + .flags = FTRACE_OPS_FL_RECURSION_SAFE | + FTRACE_OPS_FL_INITIALIZED, }; struct ftrace_page { @@ -1226,8 +1234,8 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) void ftrace_free_filter(struct ftrace_ops *ops) { ftrace_ops_init(ops); - free_ftrace_hash(ops->filter_hash); - free_ftrace_hash(ops->notrace_hash); + free_ftrace_hash(ops->func_hash->filter_hash); + free_ftrace_hash(ops->func_hash->notrace_hash); } static struct ftrace_hash *alloc_ftrace_hash(int size_bits) @@ -1288,9 +1296,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) } static void -ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); +ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash); static void -ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); +ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); static int ftrace_hash_move(struct ftrace_ops *ops, int enable, @@ -1342,13 +1350,13 @@ update: * Remove the current set, update the hash and add * them back. */ - ftrace_hash_rec_disable(ops, enable); + ftrace_hash_rec_disable_modify(ops, enable); old_hash = *dst; rcu_assign_pointer(*dst, new_hash); free_ftrace_hash_rcu(old_hash); - ftrace_hash_rec_enable(ops, enable); + ftrace_hash_rec_enable_modify(ops, enable); return 0; } @@ -1382,8 +1390,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 0; #endif - filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); - notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); + filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); + notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); if ((ftrace_hash_empty(filter_hash) || ftrace_lookup_ip(filter_hash, ip)) && @@ -1503,25 +1511,38 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) static void ftrace_remove_tramp(struct ftrace_ops *ops, struct dyn_ftrace *rec) { - struct ftrace_func_entry *entry; - - entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip); - if (!entry) + /* If TRAMP is not set, no ops should have a trampoline for this */ + if (!(rec->flags & FTRACE_FL_TRAMP)) return; + rec->flags &= ~FTRACE_FL_TRAMP; + + if ((!ftrace_hash_empty(ops->func_hash->filter_hash) && + !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) || + ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) + return; /* * The tramp_hash entry will be removed at time * of update. */ ops->nr_trampolines--; - rec->flags &= ~FTRACE_FL_TRAMP; } -static void ftrace_clear_tramps(struct dyn_ftrace *rec) +static void ftrace_clear_tramps(struct dyn_ftrace *rec, struct ftrace_ops *ops) { struct ftrace_ops *op; + /* If TRAMP is not set, no ops should have a trampoline for this */ + if (!(rec->flags & FTRACE_FL_TRAMP)) + return; + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* + * This function is called to clear other tramps + * not the one that is being updated. + */ + if (op == ops) + continue; if (op->nr_trampolines) ftrace_remove_tramp(op, rec); } while_for_each_ftrace_op(op); @@ -1554,14 +1575,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * gets inversed. */ if (filter_hash) { - hash = ops->filter_hash; - other_hash = ops->notrace_hash; + hash = ops->func_hash->filter_hash; + other_hash = ops->func_hash->notrace_hash; if (ftrace_hash_empty(hash)) all = 1; } else { inc = !inc; - hash = ops->notrace_hash; - other_hash = ops->filter_hash; + hash = ops->func_hash->notrace_hash; + other_hash = ops->func_hash->filter_hash; /* * If the notrace hash has no items, * then there's nothing to do. @@ -1622,13 +1643,10 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, /* * If we are adding another function callback * to this function, and the previous had a - * trampoline used, then we need to go back to - * the default trampoline. + * custom trampoline in use, then we need to go + * back to the default trampoline. */ - rec->flags &= ~FTRACE_FL_TRAMP; - - /* remove trampolines from any ops for this rec */ - ftrace_clear_tramps(rec); + ftrace_clear_tramps(rec, ops); } /* @@ -1682,6 +1700,41 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, __ftrace_hash_rec_update(ops, filter_hash, 1); } +static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, + int filter_hash, int inc) +{ + struct ftrace_ops *op; + + __ftrace_hash_rec_update(ops, filter_hash, inc); + + if (ops->func_hash != &global_ops.local_hash) + return; + + /* + * If the ops shares the global_ops hash, then we need to update + * all ops that are enabled and use this hash. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* Already done */ + if (op == ops) + continue; + if (op->func_hash == &global_ops.local_hash) + __ftrace_hash_rec_update(op, filter_hash, inc); + } while_for_each_ftrace_op(op); +} + +static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, + int filter_hash) +{ + ftrace_hash_rec_update_modify(ops, filter_hash, 0); +} + +static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, + int filter_hash) +{ + ftrace_hash_rec_update_modify(ops, filter_hash, 1); +} + static void print_ip_ins(const char *fmt, unsigned char *p) { int i; @@ -1896,8 +1949,8 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) if (rec->flags & FTRACE_FL_TRAMP) { ops = ftrace_find_tramp_ops_new(rec); if (FTRACE_WARN_ON(!ops || !ops->trampoline)) { - pr_warning("Bad trampoline accounting at: %p (%pS)\n", - (void *)rec->ip, (void *)rec->ip); + pr_warn("Bad trampoline accounting at: %p (%pS) (%lx)\n", + (void *)rec->ip, (void *)rec->ip, rec->flags); /* Ftrace is shutting down, return anything */ return (unsigned long)FTRACE_ADDR; } @@ -1964,7 +2017,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return ftrace_make_call(rec, ftrace_addr); case FTRACE_UPDATE_MAKE_NOP: - return ftrace_make_nop(NULL, rec, ftrace_addr); + return ftrace_make_nop(NULL, rec, ftrace_old_addr); case FTRACE_UPDATE_MODIFY_CALL: return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); @@ -2227,7 +2280,10 @@ static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops) } while_for_each_ftrace_rec(); /* The number of recs in the hash must match nr_trampolines */ - FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines); + if (FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines)) + pr_warn("count=%ld trampolines=%d\n", + ops->tramp_hash->count, + ops->nr_trampolines); return 0; } @@ -2436,8 +2492,8 @@ static inline int ops_traces_mod(struct ftrace_ops *ops) * Filter_hash being empty will default to trace module. * But notrace hash requires a test of individual module functions. */ - return ftrace_hash_empty(ops->filter_hash) && - ftrace_hash_empty(ops->notrace_hash); + return ftrace_hash_empty(ops->func_hash->filter_hash) && + ftrace_hash_empty(ops->func_hash->notrace_hash); } /* @@ -2459,12 +2515,12 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) return 0; /* The function must be in the filter */ - if (!ftrace_hash_empty(ops->filter_hash) && - !ftrace_lookup_ip(ops->filter_hash, rec->ip)) + if (!ftrace_hash_empty(ops->func_hash->filter_hash) && + !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) return 0; /* If in notrace hash, we ignore it too */ - if (ftrace_lookup_ip(ops->notrace_hash, rec->ip)) + if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) return 0; return 1; @@ -2785,10 +2841,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } else { rec = &iter->pg->records[iter->idx++]; if (((iter->flags & FTRACE_ITER_FILTER) && - !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || + !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && - !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || + !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) || ((iter->flags & FTRACE_ITER_ENABLED) && !(rec->flags & FTRACE_FL_ENABLED))) { @@ -2837,9 +2893,9 @@ static void *t_start(struct seq_file *m, loff_t *pos) * functions are enabled. */ if ((iter->flags & FTRACE_ITER_FILTER && - ftrace_hash_empty(ops->filter_hash)) || + ftrace_hash_empty(ops->func_hash->filter_hash)) || (iter->flags & FTRACE_ITER_NOTRACE && - ftrace_hash_empty(ops->notrace_hash))) { + ftrace_hash_empty(ops->func_hash->notrace_hash))) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; @@ -3001,12 +3057,12 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, iter->ops = ops; iter->flags = flag; - mutex_lock(&ops->regex_lock); + mutex_lock(&ops->func_hash->regex_lock); if (flag & FTRACE_ITER_NOTRACE) - hash = ops->notrace_hash; + hash = ops->func_hash->notrace_hash; else - hash = ops->filter_hash; + hash = ops->func_hash->filter_hash; if (file->f_mode & FMODE_WRITE) { const int size_bits = FTRACE_HASH_DEFAULT_BITS; @@ -3041,7 +3097,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, file->private_data = iter; out_unlock: - mutex_unlock(&ops->regex_lock); + mutex_unlock(&ops->func_hash->regex_lock); return ret; } @@ -3279,7 +3335,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly = { .func = function_trace_probe_call, .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(trace_probe_ops) + INIT_OPS_HASH(trace_probe_ops) }; static int ftrace_probe_registered; @@ -3342,7 +3398,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_probe *entry; - struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; struct ftrace_hash *hash; struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -3359,7 +3415,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, if (WARN_ON(not)) return -EINVAL; - mutex_lock(&trace_probe_ops.regex_lock); + mutex_lock(&trace_probe_ops.func_hash->regex_lock); hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) { @@ -3428,7 +3484,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, out_unlock: mutex_unlock(&ftrace_lock); out: - mutex_unlock(&trace_probe_ops.regex_lock); + mutex_unlock(&trace_probe_ops.func_hash->regex_lock); free_ftrace_hash(hash); return count; @@ -3446,7 +3502,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, struct ftrace_func_entry *rec_entry; struct ftrace_func_probe *entry; struct ftrace_func_probe *p; - struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; struct list_head free_list; struct ftrace_hash *hash; struct hlist_node *tmp; @@ -3468,7 +3524,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, return; } - mutex_lock(&trace_probe_ops.regex_lock); + mutex_lock(&trace_probe_ops.func_hash->regex_lock); hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) @@ -3521,7 +3577,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, mutex_unlock(&ftrace_lock); out_unlock: - mutex_unlock(&trace_probe_ops.regex_lock); + mutex_unlock(&trace_probe_ops.func_hash->regex_lock); free_ftrace_hash(hash); } @@ -3717,12 +3773,12 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, if (unlikely(ftrace_disabled)) return -ENODEV; - mutex_lock(&ops->regex_lock); + mutex_lock(&ops->func_hash->regex_lock); if (enable) - orig_hash = &ops->filter_hash; + orig_hash = &ops->func_hash->filter_hash; else - orig_hash = &ops->notrace_hash; + orig_hash = &ops->func_hash->notrace_hash; if (reset) hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); @@ -3752,7 +3808,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_unlock(&ftrace_lock); out_regex_unlock: - mutex_unlock(&ops->regex_lock); + mutex_unlock(&ops->func_hash->regex_lock); free_ftrace_hash(hash); return ret; @@ -3975,15 +4031,15 @@ int ftrace_regex_release(struct inode *inode, struct file *file) trace_parser_put(parser); - mutex_lock(&iter->ops->regex_lock); + mutex_lock(&iter->ops->func_hash->regex_lock); if (file->f_mode & FMODE_WRITE) { filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); if (filter_hash) - orig_hash = &iter->ops->filter_hash; + orig_hash = &iter->ops->func_hash->filter_hash; else - orig_hash = &iter->ops->notrace_hash; + orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); ret = ftrace_hash_move(iter->ops, filter_hash, @@ -3994,7 +4050,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) mutex_unlock(&ftrace_lock); } - mutex_unlock(&iter->ops->regex_lock); + mutex_unlock(&iter->ops->func_hash->regex_lock); free_ftrace_hash(iter->hash); kfree(iter); @@ -4611,7 +4667,6 @@ void __init ftrace_init(void) static struct ftrace_ops global_ops = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(global_ops) }; static int __init ftrace_nodyn_init(void) @@ -4713,7 +4768,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops control_ops = { .func = ftrace_ops_control_func, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(control_ops) + INIT_OPS_HASH(control_ops) }; static inline void @@ -5145,6 +5200,17 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER +static struct ftrace_ops graph_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | + FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_STUB, +#ifdef FTRACE_GRAPH_TRAMP_ADDR + .trampoline = FTRACE_GRAPH_TRAMP_ADDR, +#endif + ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) +}; + static int ftrace_graph_active; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) @@ -5307,12 +5373,28 @@ static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) */ static void update_function_graph_func(void) { - if (ftrace_ops_list == &ftrace_list_end || - (ftrace_ops_list == &global_ops && - global_ops.next == &ftrace_list_end)) - ftrace_graph_entry = __ftrace_graph_entry; - else + struct ftrace_ops *op; + bool do_test = false; + + /* + * The graph and global ops share the same set of functions + * to test. If any other ops is on the list, then + * the graph tracing needs to test if its the function + * it should call. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op != &global_ops && op != &graph_ops && + op != &ftrace_list_end) { + do_test = true; + /* in double loop, break out with goto */ + goto out; + } + } while_for_each_ftrace_op(op); + out: + if (do_test) ftrace_graph_entry = ftrace_graph_entry_test; + else + ftrace_graph_entry = __ftrace_graph_entry; } static struct notifier_block ftrace_suspend_notifier = { @@ -5353,16 +5435,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_graph_entry = ftrace_graph_entry_test; update_function_graph_func(); - /* Function graph doesn't use the .func field of global_ops */ - global_ops.flags |= FTRACE_OPS_FL_STUB; - -#ifdef CONFIG_DYNAMIC_FTRACE - /* Optimize function graph calling (if implemented by arch) */ - if (FTRACE_GRAPH_TRAMP_ADDR != 0) - global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR; -#endif - - ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); + ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); @@ -5380,12 +5453,7 @@ void unregister_ftrace_graph(void) ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); - global_ops.flags &= ~FTRACE_OPS_FL_STUB; -#ifdef CONFIG_DYNAMIC_FTRACE - if (FTRACE_GRAPH_TRAMP_ADDR != 0) - global_ops.trampoline = 0; -#endif + ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 925f629..2d75c94 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -626,8 +626,22 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, work = &cpu_buffer->irq_work; } - work->waiters_pending = true; poll_wait(filp, &work->waiters, poll_table); + work->waiters_pending = true; + /* + * There's a tight race between setting the waiters_pending and + * checking if the ring buffer is empty. Once the waiters_pending bit + * is set, the next event will wake the task up, but we can get stuck + * if there's only a single event in. + * + * FIXME: Ideally, we need a memory barrier on the writer side as well, + * but adding a memory barrier to all events will cause too much of a + * performance hit in the fast path. We only need a memory barrier when + * the buffer goes from empty to having content. But as this race is + * extremely small, and it's not a problem if another event comes in, we + * will fix it later. + */ + smp_mb(); if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) @@ -1968,7 +1982,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) /** * rb_update_event - update event type and data - * @event: the even to update + * @event: the event to update * @type: the type of event * @length: the size of the event field in the ring buffer * @@ -3341,21 +3355,16 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* Iterator usage is expected to have record disabled */ - if (list_empty(&cpu_buffer->reader_page->list)) { - iter->head_page = rb_set_head_page(cpu_buffer); - if (unlikely(!iter->head_page)) - return; - iter->head = iter->head_page->read; - } else { - iter->head_page = cpu_buffer->reader_page; - iter->head = cpu_buffer->reader_page->read; - } + iter->head_page = cpu_buffer->reader_page; + iter->head = cpu_buffer->reader_page->read; + + iter->cache_reader_page = iter->head_page; + iter->cache_read = cpu_buffer->read; + if (iter->head) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->page->time_stamp; - iter->cache_reader_page = cpu_buffer->reader_page; - iter->cache_read = cpu_buffer->read; } /** @@ -3748,12 +3757,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) return NULL; /* - * We repeat when a time extend is encountered. - * Since the time extend is always attached to a data event, - * we should never loop more than once. - * (We never hit the following condition more than twice). + * We repeat when a time extend is encountered or we hit + * the end of the page. Since the time extend is always attached + * to a data event, we should never loop more than three times. + * Once for going to next page, once on time extend, and + * finally once to get the event. + * (We never hit the following condition more than thrice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index fcc0256..aa312b0 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -526,21 +526,21 @@ static void m_stop(struct seq_file *seq, void *v) return; } -struct seq_operations proc_uid_seq_operations = { +const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; -struct seq_operations proc_gid_seq_operations = { +const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; -struct seq_operations proc_projid_seq_operations = { +const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, diff --git a/kernel/utsname.c b/kernel/utsname.c index fd39312..883aaaa 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task) struct uts_namespace *ns = NULL; struct nsproxy *nsproxy; - rcu_read_lock(); - nsproxy = task_nsproxy(task); + task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->uts_ns; get_uts_ns(ns); } - rcu_read_unlock(); + task_unlock(task); return ns; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c3319bd..a8d6914 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -260,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event, return; if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + panic("Watchdog detected hard LOCKUP on cpu %d", + this_cpu); else - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", + this_cpu); __this_cpu_write(hard_watchdog_warn, true); return; @@ -345,7 +347,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) } } - printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); @@ -366,6 +368,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) smp_mb__after_atomic(); } + add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); @@ -484,7 +487,7 @@ static int watchdog_nmi_enable(unsigned int cpu) if (PTR_ERR(event) == -EOPNOTSUPP) pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); else if (PTR_ERR(event) == -ENOENT) - pr_warning("disabled (cpu%i): hardware events not enabled\n", + pr_warn("disabled (cpu%i): hardware events not enabled\n", cpu); else pr_err("disabled (cpu%i): unable to create perf event: %ld\n", |