summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/acct.c484
-rw-r--r--kernel/auditfilter.c4
-rw-r--r--kernel/bounds.c2
-rw-r--r--kernel/events/uprobes.c15
-rw-r--r--kernel/exit.c50
-rw-r--r--kernel/fork.c81
-rw-r--r--kernel/gcov/fs.c3
-rw-r--r--kernel/kallsyms.c2
-rw-r--r--kernel/kexec.c1291
-rw-r--r--kernel/module.c14
-rw-r--r--kernel/nsproxy.c15
-rw-r--r--kernel/panic.c1
-rw-r--r--kernel/printk/printk.c169
-rw-r--r--kernel/resource.c101
-rw-r--r--kernel/seccomp.c10
-rw-r--r--kernel/signal.c46
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--kernel/sysctl.c9
-rw-r--r--kernel/test_kprobes.c87
-rw-r--r--kernel/time/timekeeping.c5
-rw-r--r--kernel/trace/ring_buffer.c31
-rw-r--r--kernel/user_namespace.c6
-rw-r--r--kernel/utsname.c6
-rw-r--r--kernel/watchdog.c11
26 files changed, 1778 insertions, 671 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0026cf5..dc5c775 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -105,7 +105,7 @@ targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
- filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
+ filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;")
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(call filechk,ikconfiggz)
diff --git a/kernel/acct.c b/kernel/acct.c
index a1844f1..b4c667d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -59,6 +59,7 @@
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
+#include <linux/fs_pin.h>
/*
* These constants control the amount of freespace that suspend and
@@ -75,172 +76,190 @@ int acct_parm[3] = {4, 2, 30};
/*
* External references and all of the globals.
*/
-static void do_acct_process(struct bsd_acct_struct *acct,
- struct pid_namespace *ns, struct file *);
+static void do_acct_process(struct bsd_acct_struct *acct);
-/*
- * This structure is used so that all the data protected by lock
- * can be placed in the same cache line as the lock. This primes
- * the cache line to have the data after getting the lock.
- */
struct bsd_acct_struct {
+ struct fs_pin pin;
+ struct mutex lock;
int active;
unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
- struct list_head list;
+ struct work_struct work;
+ struct completion done;
};
-static DEFINE_SPINLOCK(acct_lock);
-static LIST_HEAD(acct_list);
-
/*
* Check the amount of free space and suspend/resume accordingly.
*/
-static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
+static int check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
- int res;
- int act;
- u64 resume;
- u64 suspend;
-
- spin_lock(&acct_lock);
- res = acct->active;
- if (!file || time_is_before_jiffies(acct->needcheck))
+
+ if (time_is_before_jiffies(acct->needcheck))
goto out;
- spin_unlock(&acct_lock);
/* May block */
- if (vfs_statfs(&file->f_path, &sbuf))
- return res;
- suspend = sbuf.f_blocks * SUSPEND;
- resume = sbuf.f_blocks * RESUME;
-
- do_div(suspend, 100);
- do_div(resume, 100);
-
- if (sbuf.f_bavail <= suspend)
- act = -1;
- else if (sbuf.f_bavail >= resume)
- act = 1;
- else
- act = 0;
-
- /*
- * If some joker switched acct->file under us we'ld better be
- * silent and _not_ touch anything.
- */
- spin_lock(&acct_lock);
- if (file != acct->file) {
- if (act)
- res = act > 0;
+ if (vfs_statfs(&acct->file->f_path, &sbuf))
goto out;
- }
if (acct->active) {
- if (act < 0) {
+ u64 suspend = sbuf.f_blocks * SUSPEND;
+ do_div(suspend, 100);
+ if (sbuf.f_bavail <= suspend) {
acct->active = 0;
- printk(KERN_INFO "Process accounting paused\n");
+ pr_info("Process accounting paused\n");
}
} else {
- if (act > 0) {
+ u64 resume = sbuf.f_blocks * RESUME;
+ do_div(resume, 100);
+ if (sbuf.f_bavail >= resume) {
acct->active = 1;
- printk(KERN_INFO "Process accounting resumed\n");
+ pr_info("Process accounting resumed\n");
}
}
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
- res = acct->active;
out:
- spin_unlock(&acct_lock);
+ return acct->active;
+}
+
+static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
+{
+ struct bsd_acct_struct *res;
+again:
+ smp_rmb();
+ rcu_read_lock();
+ res = ACCESS_ONCE(ns->bacct);
+ if (!res) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ if (!atomic_long_inc_not_zero(&res->pin.count)) {
+ rcu_read_unlock();
+ cpu_relax();
+ goto again;
+ }
+ rcu_read_unlock();
+ mutex_lock(&res->lock);
+ if (!res->ns) {
+ mutex_unlock(&res->lock);
+ pin_put(&res->pin);
+ goto again;
+ }
return res;
}
-/*
- * Close the old accounting file (if currently open) and then replace
- * it with file (if non-NULL).
- *
- * NOTE: acct_lock MUST be held on entry and exit.
- */
-static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
- struct pid_namespace *ns)
+static void close_work(struct work_struct *work)
+{
+ struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
+ struct file *file = acct->file;
+ if (file->f_op->flush)
+ file->f_op->flush(file, NULL);
+ __fput_sync(file);
+ complete(&acct->done);
+}
+
+static void acct_kill(struct bsd_acct_struct *acct,
+ struct bsd_acct_struct *new)
{
- struct file *old_acct = NULL;
- struct pid_namespace *old_ns = NULL;
-
- if (acct->file) {
- old_acct = acct->file;
- old_ns = acct->ns;
- acct->active = 0;
- acct->file = NULL;
+ if (acct) {
+ struct pid_namespace *ns = acct->ns;
+ do_acct_process(acct);
+ INIT_WORK(&acct->work, close_work);
+ init_completion(&acct->done);
+ schedule_work(&acct->work);
+ wait_for_completion(&acct->done);
+ pin_remove(&acct->pin);
+ ns->bacct = new;
acct->ns = NULL;
- list_del(&acct->list);
+ atomic_long_dec(&acct->pin.count);
+ mutex_unlock(&acct->lock);
+ pin_put(&acct->pin);
}
- if (file) {
- acct->file = file;
- acct->ns = ns;
- acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
- acct->active = 1;
- list_add(&acct->list, &acct_list);
- }
- if (old_acct) {
- mnt_unpin(old_acct->f_path.mnt);
- spin_unlock(&acct_lock);
- do_acct_process(acct, old_ns, old_acct);
- filp_close(old_acct, NULL);
- spin_lock(&acct_lock);
+}
+
+static void acct_pin_kill(struct fs_pin *pin)
+{
+ struct bsd_acct_struct *acct;
+ acct = container_of(pin, struct bsd_acct_struct, pin);
+ mutex_lock(&acct->lock);
+ if (!acct->ns) {
+ mutex_unlock(&acct->lock);
+ pin_put(pin);
+ acct = NULL;
}
+ acct_kill(acct, NULL);
}
static int acct_on(struct filename *pathname)
{
struct file *file;
- struct vfsmount *mnt;
- struct pid_namespace *ns;
- struct bsd_acct_struct *acct = NULL;
+ struct vfsmount *mnt, *internal;
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ struct bsd_acct_struct *acct, *old;
+ int err;
+
+ acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+ if (!acct)
+ return -ENOMEM;
/* Difference from BSD - they don't do O_APPEND */
file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
- if (IS_ERR(file))
+ if (IS_ERR(file)) {
+ kfree(acct);
return PTR_ERR(file);
+ }
if (!S_ISREG(file_inode(file)->i_mode)) {
+ kfree(acct);
filp_close(file, NULL);
return -EACCES;
}
if (!file->f_op->write) {
+ kfree(acct);
filp_close(file, NULL);
return -EIO;
}
-
- ns = task_active_pid_ns(current);
- if (ns->bacct == NULL) {
- acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
- if (acct == NULL) {
- filp_close(file, NULL);
- return -ENOMEM;
- }
+ internal = mnt_clone_internal(&file->f_path);
+ if (IS_ERR(internal)) {
+ kfree(acct);
+ filp_close(file, NULL);
+ return PTR_ERR(internal);
}
-
- spin_lock(&acct_lock);
- if (ns->bacct == NULL) {
- ns->bacct = acct;
- acct = NULL;
+ err = mnt_want_write(internal);
+ if (err) {
+ mntput(internal);
+ kfree(acct);
+ filp_close(file, NULL);
+ return err;
}
-
mnt = file->f_path.mnt;
- mnt_pin(mnt);
- acct_file_reopen(ns->bacct, file, ns);
- spin_unlock(&acct_lock);
-
- mntput(mnt); /* it's pinned, now give up active reference */
- kfree(acct);
-
+ file->f_path.mnt = internal;
+
+ atomic_long_set(&acct->pin.count, 1);
+ acct->pin.kill = acct_pin_kill;
+ acct->file = file;
+ acct->needcheck = jiffies;
+ acct->ns = ns;
+ mutex_init(&acct->lock);
+ mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
+ pin_insert(&acct->pin, mnt);
+
+ old = acct_get(ns);
+ if (old)
+ acct_kill(old, acct);
+ else
+ ns->bacct = acct;
+ mutex_unlock(&acct->lock);
+ mnt_drop_write(mnt);
+ mntput(mnt);
return 0;
}
+static DEFINE_MUTEX(acct_on_mutex);
+
/**
* sys_acct - enable/disable process accounting
* @name: file name for accounting records or NULL to shutdown accounting
@@ -261,80 +280,23 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
if (name) {
struct filename *tmp = getname(name);
+
if (IS_ERR(tmp))
return PTR_ERR(tmp);
+ mutex_lock(&acct_on_mutex);
error = acct_on(tmp);
+ mutex_unlock(&acct_on_mutex);
putname(tmp);
} else {
- struct bsd_acct_struct *acct;
-
- acct = task_active_pid_ns(current)->bacct;
- if (acct == NULL)
- return 0;
-
- spin_lock(&acct_lock);
- acct_file_reopen(acct, NULL, NULL);
- spin_unlock(&acct_lock);
+ acct_kill(acct_get(task_active_pid_ns(current)), NULL);
}
return error;
}
-/**
- * acct_auto_close - turn off a filesystem's accounting if it is on
- * @m: vfsmount being shut down
- *
- * If the accounting is turned on for a file in the subtree pointed to
- * to by m, turn accounting off. Done when m is about to die.
- */
-void acct_auto_close_mnt(struct vfsmount *m)
-{
- struct bsd_acct_struct *acct;
-
- spin_lock(&acct_lock);
-restart:
- list_for_each_entry(acct, &acct_list, list)
- if (acct->file && acct->file->f_path.mnt == m) {
- acct_file_reopen(acct, NULL, NULL);
- goto restart;
- }
- spin_unlock(&acct_lock);
-}
-
-/**
- * acct_auto_close - turn off a filesystem's accounting if it is on
- * @sb: super block for the filesystem
- *
- * If the accounting is turned on for a file in the filesystem pointed
- * to by sb, turn accounting off.
- */
-void acct_auto_close(struct super_block *sb)
-{
- struct bsd_acct_struct *acct;
-
- spin_lock(&acct_lock);
-restart:
- list_for_each_entry(acct, &acct_list, list)
- if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
- acct_file_reopen(acct, NULL, NULL);
- goto restart;
- }
- spin_unlock(&acct_lock);
-}
-
void acct_exit_ns(struct pid_namespace *ns)
{
- struct bsd_acct_struct *acct = ns->bacct;
-
- if (acct == NULL)
- return;
-
- spin_lock(&acct_lock);
- if (acct->file != NULL)
- acct_file_reopen(acct, NULL, NULL);
- spin_unlock(&acct_lock);
-
- kfree(acct);
+ acct_kill(acct_get(ns), NULL);
}
/*
@@ -376,7 +338,7 @@ static comp_t encode_comp_t(unsigned long value)
return exp;
}
-#if ACCT_VERSION==1 || ACCT_VERSION==2
+#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/*
* encode an u64 into a comp2_t (24 bits)
*
@@ -389,7 +351,7 @@ static comp_t encode_comp_t(unsigned long value)
#define MANTSIZE2 20 /* 20 bit mantissa. */
#define EXPSIZE2 5 /* 5 bit base 2 exponent. */
#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
-#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */
+#define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */
static comp2_t encode_comp2_t(u64 value)
{
@@ -420,7 +382,7 @@ static comp2_t encode_comp2_t(u64 value)
}
#endif
-#if ACCT_VERSION==3
+#if ACCT_VERSION == 3
/*
* encode an u64 into a 32 bit IEEE float
*/
@@ -429,8 +391,9 @@ static u32 encode_float(u64 value)
unsigned exp = 190;
unsigned u;
- if (value==0) return 0;
- while ((s64)value > 0){
+ if (value == 0)
+ return 0;
+ while ((s64)value > 0) {
value <<= 1;
exp--;
}
@@ -448,116 +411,112 @@ static u32 encode_float(u64 value)
* do_exit() or when switching to a different output file.
*/
-/*
- * do_acct_process does all actual work. Caller holds the reference to file.
- */
-static void do_acct_process(struct bsd_acct_struct *acct,
- struct pid_namespace *ns, struct file *file)
+static void fill_ac(acct_t *ac)
{
struct pacct_struct *pacct = &current->signal->pacct;
- acct_t ac;
- mm_segment_t fs;
- unsigned long flim;
u64 elapsed, run_time;
struct tty_struct *tty;
- const struct cred *orig_cred;
-
- /* Perform file operations on behalf of whoever enabled accounting */
- orig_cred = override_creds(file->f_cred);
-
- /*
- * First check to see if there is enough free_space to continue
- * the process accounting system.
- */
- if (!check_free_space(acct, file))
- goto out;
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
*/
- memset(&ac, 0, sizeof(acct_t));
+ memset(ac, 0, sizeof(acct_t));
- ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
- strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
+ ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
+ strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
/* calculate run_time in nsec*/
run_time = ktime_get_ns();
run_time -= current->group_leader->start_time;
/* convert nsec -> AHZ */
elapsed = nsec_to_AHZ(run_time);
-#if ACCT_VERSION==3
- ac.ac_etime = encode_float(elapsed);
+#if ACCT_VERSION == 3
+ ac->ac_etime = encode_float(elapsed);
#else
- ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
- (unsigned long) elapsed : (unsigned long) -1l);
+ ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
+ (unsigned long) elapsed : (unsigned long) -1l);
#endif
-#if ACCT_VERSION==1 || ACCT_VERSION==2
+#if ACCT_VERSION == 1 || ACCT_VERSION == 2
{
/* new enlarged etime field */
comp2_t etime = encode_comp2_t(elapsed);
- ac.ac_etime_hi = etime >> 16;
- ac.ac_etime_lo = (u16) etime;
+
+ ac->ac_etime_hi = etime >> 16;
+ ac->ac_etime_lo = (u16) etime;
}
#endif
do_div(elapsed, AHZ);
- ac.ac_btime = get_seconds() - elapsed;
+ ac->ac_btime = get_seconds() - elapsed;
+#if ACCT_VERSION==2
+ ac->ac_ahz = AHZ;
+#endif
+
+ spin_lock_irq(&current->sighand->siglock);
+ tty = current->signal->tty; /* Safe as we hold the siglock */
+ ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
+ ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+ ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+ ac->ac_flag = pacct->ac_flag;
+ ac->ac_mem = encode_comp_t(pacct->ac_mem);
+ ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
+ ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
+ ac->ac_exitcode = pacct->ac_exitcode;
+ spin_unlock_irq(&current->sighand->siglock);
+}
+/*
+ * do_acct_process does all actual work. Caller holds the reference to file.
+ */
+static void do_acct_process(struct bsd_acct_struct *acct)
+{
+ acct_t ac;
+ unsigned long flim;
+ const struct cred *orig_cred;
+ struct pid_namespace *ns = acct->ns;
+ struct file *file = acct->file;
+
+ /*
+ * Accounting records are not subject to resource limits.
+ */
+ flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ /* Perform file operations on behalf of whoever enabled accounting */
+ orig_cred = override_creds(file->f_cred);
+
+ /*
+ * First check to see if there is enough free_space to continue
+ * the process accounting system.
+ */
+ if (!check_free_space(acct))
+ goto out;
+
+ fill_ac(&ac);
/* we really need to bite the bullet and change layout */
ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
-#if ACCT_VERSION==2
- ac.ac_ahz = AHZ;
-#endif
-#if ACCT_VERSION==1 || ACCT_VERSION==2
+#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/* backward-compatible 16 bit fields */
ac.ac_uid16 = ac.ac_uid;
ac.ac_gid16 = ac.ac_gid;
#endif
-#if ACCT_VERSION==3
+#if ACCT_VERSION == 3
ac.ac_pid = task_tgid_nr_ns(current, ns);
rcu_read_lock();
ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
rcu_read_unlock();
#endif
-
- spin_lock_irq(&current->sighand->siglock);
- tty = current->signal->tty; /* Safe as we hold the siglock */
- ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
- ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
- ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
- ac.ac_flag = pacct->ac_flag;
- ac.ac_mem = encode_comp_t(pacct->ac_mem);
- ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
- ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
- ac.ac_exitcode = pacct->ac_exitcode;
- spin_unlock_irq(&current->sighand->siglock);
- ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
- ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
- ac.ac_swaps = encode_comp_t(0);
-
/*
* Get freeze protection. If the fs is frozen, just skip the write
* as we could deadlock the system otherwise.
*/
- if (!file_start_write_trylock(file))
- goto out;
- /*
- * Kernel segment override to datasegment and write it
- * to the accounting file.
- */
- fs = get_fs();
- set_fs(KERNEL_DS);
- /*
- * Accounting records are not subject to resource limits.
- */
- flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- file->f_op->write(file, (char *)&ac,
- sizeof(acct_t), &file->f_pos);
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
- set_fs(fs);
- file_end_write(file);
+ if (file_start_write_trylock(file)) {
+ /* it's been opened O_APPEND, so position is irrelevant */
+ loff_t pos = 0;
+ __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
+ file_end_write(file);
+ }
out:
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
revert_creds(orig_cred);
}
@@ -574,6 +533,7 @@ void acct_collect(long exitcode, int group_dead)
if (group_dead && current->mm) {
struct vm_area_struct *vma;
+
down_read(&current->mm->mmap_sem);
vma = current->mm->mmap;
while (vma) {
@@ -605,34 +565,20 @@ void acct_collect(long exitcode, int group_dead)
spin_unlock_irq(&current->sighand->siglock);
}
-static void acct_process_in_ns(struct pid_namespace *ns)
+static void slow_acct_process(struct pid_namespace *ns)
{
- struct file *file = NULL;
- struct bsd_acct_struct *acct;
-
- acct = ns->bacct;
- /*
- * accelerate the common fastpath:
- */
- if (!acct || !acct->file)
- return;
-
- spin_lock(&acct_lock);
- file = acct->file;
- if (unlikely(!file)) {
- spin_unlock(&acct_lock);
- return;
+ for ( ; ns; ns = ns->parent) {
+ struct bsd_acct_struct *acct = acct_get(ns);
+ if (acct) {
+ do_acct_process(acct);
+ mutex_unlock(&acct->lock);
+ pin_put(&acct->pin);
+ }
}
- get_file(file);
- spin_unlock(&acct_lock);
-
- do_acct_process(acct, ns, file);
- fput(file);
}
/**
- * acct_process - now just a wrapper around acct_process_in_ns,
- * which in turn is a wrapper around do_acct_process.
+ * acct_process
*
* handles process accounting for an exiting task
*/
@@ -645,6 +591,10 @@ void acct_process(void)
* alive and holds its namespace, which in turn holds
* its parent.
*/
- for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
- acct_process_in_ns(ns);
+ for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
+ if (ns->bacct)
+ break;
+ }
+ if (unlikely(ns))
+ slow_acct_process(ns);
}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 8e9bc9c..c447cd9 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -106,7 +106,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count)
if (unlikely(!entry))
return NULL;
- fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL);
+ fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL);
if (unlikely(!fields)) {
kfree(entry);
return NULL;
@@ -160,7 +160,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES];
int __init audit_register_class(int class, unsigned *list)
{
- __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL);
+ __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL);
if (!p)
return -ENOMEM;
while (*list != ~0U) {
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9fd4246..e1d1d195 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,7 +9,6 @@
#include <linux/page-flags.h>
#include <linux/mmzone.h>
#include <linux/kbuild.h>
-#include <linux/page_cgroup.h>
#include <linux/log2.h>
#include <linux/spinlock_types.h>
@@ -18,7 +17,6 @@ void foo(void)
/* The enum constants to put into include/generated/bounds.h */
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
- DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
#ifdef CONFIG_SMP
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6f3254e..1d0af8a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
/* For mmu_notifiers */
const unsigned long mmun_start = addr;
const unsigned long mmun_end = addr + PAGE_SIZE;
+ struct mem_cgroup *memcg;
+
+ err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+ if (err)
+ return err;
/* For try_to_free_swap() and munlock_vma_page() below */
lock_page(page);
@@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
get_page(kpage);
page_add_new_anon_rmap(kpage, vma, addr);
+ mem_cgroup_commit_charge(kpage, memcg, false);
+ lru_cache_add_active_or_unevictable(kpage, vma);
if (!PageAnon(page)) {
dec_mm_counter(mm, MM_FILEPAGES);
@@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
+ mem_cgroup_cancel_charge(kpage, memcg);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
@@ -315,18 +323,11 @@ retry:
if (!new_page)
goto put_old;
- if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
- goto put_new;
-
__SetPageUptodate(new_page);
copy_highpage(new_page, old_page);
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
ret = __replace_page(vma, vaddr, old_page, new_page);
- if (ret)
- mem_cgroup_uncharge_page(new_page);
-
-put_new:
page_cache_release(new_page);
put_old:
put_page(old_page);
diff --git a/kernel/exit.c b/kernel/exit.c
index e5c4668..32c58f7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -59,7 +59,7 @@
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
-static void exit_mm(struct task_struct * tsk);
+static void exit_mm(struct task_struct *tsk);
static void __unhash_process(struct task_struct *p, bool group_dead)
{
@@ -151,7 +151,7 @@ static void __exit_signal(struct task_struct *tsk)
spin_unlock(&sighand->siglock);
__cleanup_sighand(sighand);
- clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
+ clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
if (group_dead) {
flush_sigqueue(&sig->shared_pending);
tty_kref_put(tty);
@@ -168,7 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
}
-void release_task(struct task_struct * p)
+void release_task(struct task_struct *p)
{
struct task_struct *leader;
int zap_leader;
@@ -192,7 +192,8 @@ repeat:
*/
zap_leader = 0;
leader = p->group_leader;
- if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
+ if (leader != p && thread_group_empty(leader)
+ && leader->exit_state == EXIT_ZOMBIE) {
/*
* If we were the last child thread and the leader has
* exited already, and the leader's parent ignores SIGCHLD,
@@ -241,7 +242,8 @@ struct pid *session_of_pgrp(struct pid *pgrp)
*
* "I ask you, have you ever known what it is to be an orphan?"
*/
-static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
+static int will_become_orphaned_pgrp(struct pid *pgrp,
+ struct task_struct *ignored_task)
{
struct task_struct *p;
@@ -294,9 +296,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
struct task_struct *ignored_task = tsk;
if (!parent)
- /* exit: our father is in a different pgrp than
- * we are and we were the only connection outside.
- */
+ /* exit: our father is in a different pgrp than
+ * we are and we were the only connection outside.
+ */
parent = tsk->real_parent;
else
/* reparent: our child is in a different pgrp than
@@ -405,7 +407,7 @@ assign_new_owner:
* Turn us into a lazy TLB process if we
* aren't already..
*/
-static void exit_mm(struct task_struct * tsk)
+static void exit_mm(struct task_struct *tsk)
{
struct mm_struct *mm = tsk->mm;
struct core_state *core_state;
@@ -425,6 +427,7 @@ static void exit_mm(struct task_struct * tsk)
core_state = mm->core_state;
if (core_state) {
struct core_thread self;
+
up_read(&mm->mmap_sem);
self.task = tsk;
@@ -455,6 +458,7 @@ static void exit_mm(struct task_struct * tsk)
task_unlock(tsk);
mm_update_next_owner(mm);
mmput(mm);
+ clear_thread_flag(TIF_MEMDIE);
}
/*
@@ -565,6 +569,7 @@ static void forget_original_parent(struct task_struct *father)
list_for_each_entry_safe(p, n, &father->children, sibling) {
struct task_struct *t = p;
+
do {
t->real_parent = reaper;
if (t->parent == father) {
@@ -598,7 +603,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
/*
* This does two things:
*
- * A. Make init inherit all the child processes
+ * A. Make init inherit all the child processes
* B. Check to see if any process groups have become orphaned
* as a result of our exiting, and if they have any stopped
* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
@@ -648,9 +653,8 @@ static void check_stack_usage(void)
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
- printk(KERN_WARNING "%s (%d) used greatest stack depth: "
- "%lu bytes left\n",
- current->comm, task_pid_nr(current), free);
+ pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
+ current->comm, task_pid_nr(current), free);
lowest_to_date = free;
}
spin_unlock(&low_water_lock);
@@ -691,8 +695,7 @@ void do_exit(long code)
* leave this task alone and wait for reboot.
*/
if (unlikely(tsk->flags & PF_EXITING)) {
- printk(KERN_ALERT
- "Fixing recursive fault but reboot is needed!\n");
+ pr_alert("Fixing recursive fault but reboot is needed!\n");
/*
* We can do this unlocked here. The futex code uses
* this flag just to verify whether the pi state
@@ -716,9 +719,9 @@ void do_exit(long code)
raw_spin_unlock_wait(&tsk->pi_lock);
if (unlikely(in_atomic()))
- printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
acct_update_integrals(tsk);
/* sync mm's RSS info before statistics gathering */
@@ -836,7 +839,6 @@ void do_exit(long code)
for (;;)
cpu_relax(); /* For when BUG is null */
}
-
EXPORT_SYMBOL_GPL(do_exit);
void complete_and_exit(struct completion *comp, long code)
@@ -846,7 +848,6 @@ void complete_and_exit(struct completion *comp, long code)
do_exit(code);
}
-
EXPORT_SYMBOL(complete_and_exit);
SYSCALL_DEFINE1(exit, int, error_code)
@@ -869,6 +870,7 @@ do_group_exit(int exit_code)
exit_code = sig->group_exit_code;
else if (!thread_group_empty(current)) {
struct sighand_struct *const sighand = current->sighand;
+
spin_lock_irq(&sighand->siglock);
if (signal_group_exit(sig))
/* Another thread got here before we took the lock. */
@@ -1033,9 +1035,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* as other threads in the parent group can be right
* here reaping other children at the same time.
*
- * We use thread_group_cputime_adjusted() to get times for the thread
- * group, which consolidates times for all threads in the
- * group including the group leader.
+ * We use thread_group_cputime_adjusted() to get times for
+ * the thread group, which consolidates times for all threads
+ * in the group including the group leader.
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
spin_lock_irq(&p->real_parent->sighand->siglock);
@@ -1417,6 +1419,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
list_for_each_entry(p, &tsk->children, sibling) {
int ret = wait_consider_task(wo, 0, p);
+
if (ret)
return ret;
}
@@ -1430,6 +1433,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
int ret = wait_consider_task(wo, 1, p);
+
if (ret)
return ret;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index fbd3497..0cf9cdb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -374,12 +374,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
*/
down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
- mm->locked_vm = 0;
- mm->mmap = NULL;
- mm->vmacache_seqnum = 0;
- mm->map_count = 0;
- cpumask_clear(mm_cpumask(mm));
- mm->mm_rb = RB_ROOT;
+ mm->total_vm = oldmm->total_vm;
+ mm->shared_vm = oldmm->shared_vm;
+ mm->exec_vm = oldmm->exec_vm;
+ mm->stack_vm = oldmm->stack_vm;
+
rb_link = &mm->mm_rb.rb_node;
rb_parent = NULL;
pprev = &mm->mmap;
@@ -430,7 +429,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
atomic_dec(&inode->i_writecount);
mutex_lock(&mapping->i_mmap_mutex);
if (tmp->vm_flags & VM_SHARED)
- mapping->i_mmap_writable++;
+ atomic_inc(&mapping->i_mmap_writable);
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
if (unlikely(tmp->vm_flags & VM_NONLINEAR))
@@ -536,19 +535,37 @@ static void mm_init_aio(struct mm_struct *mm)
#endif
}
+static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+#ifdef CONFIG_MEMCG
+ mm->owner = p;
+#endif
+}
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
{
+ mm->mmap = NULL;
+ mm->mm_rb = RB_ROOT;
+ mm->vmacache_seqnum = 0;
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
INIT_LIST_HEAD(&mm->mmlist);
mm->core_state = NULL;
atomic_long_set(&mm->nr_ptes, 0);
+ mm->map_count = 0;
+ mm->locked_vm = 0;
+ mm->pinned_vm = 0;
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
+ mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ mmu_notifier_mm_init(mm);
clear_tlb_flush_pending(mm);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ mm->pmd_huge_pte = NULL;
+#endif
if (current->mm) {
mm->flags = current->mm->flags & MMF_INIT_MASK;
@@ -558,11 +575,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
mm->def_flags = 0;
}
- if (likely(!mm_alloc_pgd(mm))) {
- mmu_notifier_mm_init(mm);
- return mm;
- }
+ if (mm_alloc_pgd(mm))
+ goto fail_nopgd;
+
+ if (init_new_context(p, mm))
+ goto fail_nocontext;
+
+ return mm;
+fail_nocontext:
+ mm_free_pgd(mm);
+fail_nopgd:
free_mm(mm);
return NULL;
}
@@ -596,7 +619,6 @@ struct mm_struct *mm_alloc(void)
return NULL;
memset(mm, 0, sizeof(*mm));
- mm_init_cpumask(mm);
return mm_init(mm, current);
}
@@ -828,17 +850,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm));
- mm_init_cpumask(mm);
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
- mm->pmd_huge_pte = NULL;
-#endif
if (!mm_init(mm, tsk))
goto fail_nomem;
- if (init_new_context(tsk, mm))
- goto fail_nocontext;
-
dup_mm_exe_file(oldmm, mm);
err = dup_mmap(mm, oldmm);
@@ -860,15 +875,6 @@ free_pt:
fail_nomem:
return NULL;
-
-fail_nocontext:
- /*
- * If init_new_context() failed, we cannot use mmput() to free the mm
- * because it calls destroy_context()
- */
- mm_free_pgd(mm);
- free_mm(mm);
- return NULL;
}
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
@@ -1099,7 +1105,7 @@ static void copy_seccomp(struct task_struct *p)
* needed because this new task is not yet running and cannot
* be racing exec.
*/
- BUG_ON(!spin_is_locked(&current->sighand->siglock));
+ assert_spin_locked(&current->sighand->siglock);
/* Ref-count the new filter user, and assign it. */
get_seccomp_filter(current);
@@ -1140,13 +1146,6 @@ static void rt_mutex_init_task(struct task_struct *p)
#endif
}
-#ifdef CONFIG_MEMCG
-void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
-{
- mm->owner = p;
-}
-#endif /* CONFIG_MEMCG */
-
/*
* Initialize POSIX timer handling for a single task.
*/
@@ -1346,10 +1345,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
-#ifdef CONFIG_MEMCG
- p->memcg_batch.do_batch = 0;
- p->memcg_batch.memcg = NULL;
-#endif
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
p->sequential_io_avg = 0;
@@ -1367,6 +1362,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (retval)
goto bad_fork_cleanup_policy;
/* copy all the process information */
+ shm_init_task(p);
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_audit;
@@ -1918,6 +1914,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
*/
exit_sem(current);
}
+ if (unshare_flags & CLONE_NEWIPC) {
+ /* Orphan segments in old ns (see sem above). */
+ exit_shm(current);
+ shm_init_task(current);
+ }
if (new_nsproxy)
switch_task_namespaces(current, new_nsproxy);
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 15ff01a..edf67c4 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -784,8 +784,7 @@ static __init int gcov_fs_init(void)
err_remove:
pr_err("init failed\n");
- if (root_node.dentry)
- debugfs_remove(root_node.dentry);
+ debugfs_remove(root_node.dentry);
return rc;
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index cb0cf37..ae51670 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address,
address += symbol_offset;
name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
if (!name)
- return sprintf(buffer, "0x%lx", address);
+ return sprintf(buffer, "0x%lx", address - symbol_offset);
if (name != buffer)
strcpy(buffer, name);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4b8f0c9..0b49a0a 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -6,6 +6,8 @@
* Version 2. See the file COPYING for more details.
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
@@ -40,6 +42,9 @@
#include <asm/io.h>
#include <asm/sections.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
@@ -52,6 +57,15 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;
+/*
+ * Declare these symbols weak so that if architecture provides a purgatory,
+ * these will be overridden.
+ */
+char __weak kexec_purgatory[0];
+size_t __weak kexec_purgatory_size = 0;
+
+static int kexec_calculate_store_digests(struct kimage *image);
+
/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
.name = "Crash kernel",
@@ -125,45 +139,27 @@ static struct page *kimage_alloc_page(struct kimage *image,
gfp_t gfp_mask,
unsigned long dest);
-static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
+static int copy_user_segment_list(struct kimage *image,
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments)
{
+ int ret;
size_t segment_bytes;
- struct kimage *image;
- unsigned long i;
- int result;
-
- /* Allocate a controlling structure */
- result = -ENOMEM;
- image = kzalloc(sizeof(*image), GFP_KERNEL);
- if (!image)
- goto out;
-
- image->head = 0;
- image->entry = &image->head;
- image->last_entry = &image->head;
- image->control_page = ~0; /* By default this does not apply */
- image->start = entry;
- image->type = KEXEC_TYPE_DEFAULT;
-
- /* Initialize the list of control pages */
- INIT_LIST_HEAD(&image->control_pages);
-
- /* Initialize the list of destination pages */
- INIT_LIST_HEAD(&image->dest_pages);
-
- /* Initialize the list of unusable pages */
- INIT_LIST_HEAD(&image->unuseable_pages);
/* Read in the segments */
image->nr_segments = nr_segments;
segment_bytes = nr_segments * sizeof(*segments);
- result = copy_from_user(image->segment, segments, segment_bytes);
- if (result) {
- result = -EFAULT;
- goto out;
- }
+ ret = copy_from_user(image->segment, segments, segment_bytes);
+ if (ret)
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sanity_check_segment_list(struct kimage *image)
+{
+ int result, i;
+ unsigned long nr_segments = image->nr_segments;
/*
* Verify we have good destination addresses. The caller is
@@ -185,9 +181,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz;
if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
- goto out;
+ return result;
if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
- goto out;
+ return result;
}
/* Verify our destination addresses do not overlap.
@@ -208,7 +204,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
pend = pstart + image->segment[j].memsz;
/* Do the segments overlap ? */
if ((mend > pstart) && (mstart < pend))
- goto out;
+ return result;
}
}
@@ -220,130 +216,401 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
result = -EINVAL;
for (i = 0; i < nr_segments; i++) {
if (image->segment[i].bufsz > image->segment[i].memsz)
- goto out;
+ return result;
}
- result = 0;
-out:
- if (result == 0)
- *rimage = image;
- else
- kfree(image);
+ /*
+ * Verify we have good destination addresses. Normally
+ * the caller is responsible for making certain we don't
+ * attempt to load the new image into invalid or reserved
+ * areas of RAM. But crash kernels are preloaded into a
+ * reserved area of ram. We must ensure the addresses
+ * are in the reserved area otherwise preloading the
+ * kernel could corrupt things.
+ */
- return result;
+ if (image->type == KEXEC_TYPE_CRASH) {
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ /* Ensure we are within the crash kernel limits */
+ if ((mstart < crashk_res.start) ||
+ (mend > crashk_res.end))
+ return result;
+ }
+ }
+ return 0;
+}
+
+static struct kimage *do_kimage_alloc_init(void)
+{
+ struct kimage *image;
+
+ /* Allocate a controlling structure */
+ image = kzalloc(sizeof(*image), GFP_KERNEL);
+ if (!image)
+ return NULL;
+
+ image->head = 0;
+ image->entry = &image->head;
+ image->last_entry = &image->head;
+ image->control_page = ~0; /* By default this does not apply */
+ image->type = KEXEC_TYPE_DEFAULT;
+
+ /* Initialize the list of control pages */
+ INIT_LIST_HEAD(&image->control_pages);
+
+ /* Initialize the list of destination pages */
+ INIT_LIST_HEAD(&image->dest_pages);
+
+ /* Initialize the list of unusable pages */
+ INIT_LIST_HEAD(&image->unusable_pages);
+
+ return image;
}
static void kimage_free_page_list(struct list_head *list);
-static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
+static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments,
+ unsigned long flags)
{
- int result;
+ int ret;
struct kimage *image;
+ bool kexec_on_panic = flags & KEXEC_ON_CRASH;
+
+ if (kexec_on_panic) {
+ /* Verify we have a valid entry point */
+ if ((entry < crashk_res.start) || (entry > crashk_res.end))
+ return -EADDRNOTAVAIL;
+ }
/* Allocate and initialize a controlling structure */
- image = NULL;
- result = do_kimage_alloc(&image, entry, nr_segments, segments);
- if (result)
- goto out;
+ image = do_kimage_alloc_init();
+ if (!image)
+ return -ENOMEM;
+
+ image->start = entry;
+
+ ret = copy_user_segment_list(image, nr_segments, segments);
+ if (ret)
+ goto out_free_image;
+
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_image;
+
+ /* Enable the special crash kernel control page allocation policy. */
+ if (kexec_on_panic) {
+ image->control_page = crashk_res.start;
+ image->type = KEXEC_TYPE_CRASH;
+ }
/*
* Find a location for the control code buffer, and add it
* the vector of segments so that it's pages will also be
* counted as destination pages.
*/
- result = -ENOMEM;
+ ret = -ENOMEM;
image->control_code_page = kimage_alloc_control_pages(image,
get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
pr_err("Could not allocate control_code_buffer\n");
- goto out_free;
+ goto out_free_image;
}
- image->swap_page = kimage_alloc_control_pages(image, 0);
- if (!image->swap_page) {
- pr_err("Could not allocate swap buffer\n");
- goto out_free;
+ if (!kexec_on_panic) {
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ pr_err("Could not allocate swap buffer\n");
+ goto out_free_control_pages;
+ }
}
*rimage = image;
return 0;
-
-out_free:
+out_free_control_pages:
kimage_free_page_list(&image->control_pages);
+out_free_image:
kfree(image);
-out:
- return result;
+ return ret;
}
-static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
+static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
{
- int result;
- struct kimage *image;
- unsigned long i;
+ struct fd f = fdget(fd);
+ int ret;
+ struct kstat stat;
+ loff_t pos;
+ ssize_t bytes = 0;
- image = NULL;
- /* Verify we have a valid entry point */
- if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
- result = -EADDRNOTAVAIL;
+ if (!f.file)
+ return -EBADF;
+
+ ret = vfs_getattr(&f.file->f_path, &stat);
+ if (ret)
+ goto out;
+
+ if (stat.size > INT_MAX) {
+ ret = -EFBIG;
goto out;
}
- /* Allocate and initialize a controlling structure */
- result = do_kimage_alloc(&image, entry, nr_segments, segments);
- if (result)
+ /* Don't hand 0 to vmalloc, it whines. */
+ if (stat.size == 0) {
+ ret = -EINVAL;
goto out;
+ }
- /* Enable the special crash kernel control page
- * allocation policy.
- */
- image->control_page = crashk_res.start;
- image->type = KEXEC_TYPE_CRASH;
+ *buf = vmalloc(stat.size);
+ if (!*buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
- /*
- * Verify we have good destination addresses. Normally
- * the caller is responsible for making certain we don't
- * attempt to load the new image into invalid or reserved
- * areas of RAM. But crash kernels are preloaded into a
- * reserved area of ram. We must ensure the addresses
- * are in the reserved area otherwise preloading the
- * kernel could corrupt things.
- */
- result = -EADDRNOTAVAIL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
+ pos = 0;
+ while (pos < stat.size) {
+ bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
+ stat.size - pos);
+ if (bytes < 0) {
+ vfree(*buf);
+ ret = bytes;
+ goto out;
+ }
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz - 1;
- /* Ensure we are within the crash kernel limits */
- if ((mstart < crashk_res.start) || (mend > crashk_res.end))
- goto out_free;
+ if (bytes == 0)
+ break;
+ pos += bytes;
}
+ if (pos != stat.size) {
+ ret = -EBADF;
+ vfree(*buf);
+ goto out;
+ }
+
+ *buf_len = pos;
+out:
+ fdput(f);
+ return ret;
+}
+
+/* Architectures can provide this probe function */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -ENOEXEC;
+}
+
+void * __weak arch_kexec_kernel_image_load(struct kimage *image)
+{
+ return ERR_PTR(-ENOEXEC);
+}
+
+void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+}
+
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -EKEYREJECTED;
+}
+
+/* Apply relocations of type RELA */
+int __weak
+arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("RELA relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/* Apply relocations of type REL */
+int __weak
+arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("REL relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/*
+ * Free up memory used by kernel, initrd, and comand line. This is temporary
+ * memory allocation which is not needed any more after these buffers have
+ * been loaded into separate segments and have been copied elsewhere.
+ */
+static void kimage_file_post_load_cleanup(struct kimage *image)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ vfree(image->kernel_buf);
+ image->kernel_buf = NULL;
+
+ vfree(image->initrd_buf);
+ image->initrd_buf = NULL;
+
+ kfree(image->cmdline_buf);
+ image->cmdline_buf = NULL;
+
+ vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
+
+ vfree(pi->sechdrs);
+ pi->sechdrs = NULL;
+
+ /* See if architecture has anything to cleanup post load */
+ arch_kimage_file_post_load_cleanup(image);
+
/*
- * Find a location for the control code buffer, and add
- * the vector of segments so that it's pages will also be
- * counted as destination pages.
+ * Above call should have called into bootloader to free up
+ * any data stored in kimage->image_loader_data. It should
+ * be ok now to free it up.
*/
- result = -ENOMEM;
+ kfree(image->image_loader_data);
+ image->image_loader_data = NULL;
+}
+
+/*
+ * In file mode list of segments is prepared by kernel. Copy relevant
+ * data from user space, do error checking, prepare segment list
+ */
+static int
+kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
+ const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned flags)
+{
+ int ret = 0;
+ void *ldata;
+
+ ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
+ &image->kernel_buf_len);
+ if (ret)
+ return ret;
+
+ /* Call arch image probe handlers */
+ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
+ image->kernel_buf_len);
+
+ if (ret)
+ goto out;
+
+#ifdef CONFIG_KEXEC_VERIFY_SIG
+ ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+ image->kernel_buf_len);
+ if (ret) {
+ pr_debug("kernel signature verification failed.\n");
+ goto out;
+ }
+ pr_debug("kernel signature verification successful.\n");
+#endif
+ /* It is possible that there no initramfs is being loaded */
+ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
+ ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
+ &image->initrd_buf_len);
+ if (ret)
+ goto out;
+ }
+
+ if (cmdline_len) {
+ image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
+ if (!image->cmdline_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
+ cmdline_len);
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ image->cmdline_buf_len = cmdline_len;
+
+ /* command line should be a string with last byte null */
+ if (image->cmdline_buf[cmdline_len - 1] != '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* Call arch image load handlers */
+ ldata = arch_kexec_kernel_image_load(image);
+
+ if (IS_ERR(ldata)) {
+ ret = PTR_ERR(ldata);
+ goto out;
+ }
+
+ image->image_loader_data = ldata;
+out:
+ /* In case of error, free up all allocated memory in this function */
+ if (ret)
+ kimage_file_post_load_cleanup(image);
+ return ret;
+}
+
+static int
+kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
+ int initrd_fd, const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned long flags)
+{
+ int ret;
+ struct kimage *image;
+ bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
+
+ image = do_kimage_alloc_init();
+ if (!image)
+ return -ENOMEM;
+
+ image->file_mode = 1;
+
+ if (kexec_on_panic) {
+ /* Enable special crash kernel control page alloc policy. */
+ image->control_page = crashk_res.start;
+ image->type = KEXEC_TYPE_CRASH;
+ }
+
+ ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
+ cmdline_ptr, cmdline_len, flags);
+ if (ret)
+ goto out_free_image;
+
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_post_load_bufs;
+
+ ret = -ENOMEM;
image->control_code_page = kimage_alloc_control_pages(image,
get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
pr_err("Could not allocate control_code_buffer\n");
- goto out_free;
+ goto out_free_post_load_bufs;
+ }
+
+ if (!kexec_on_panic) {
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ pr_err(KERN_ERR "Could not allocate swap buffer\n");
+ goto out_free_control_pages;
+ }
}
*rimage = image;
return 0;
-
-out_free:
+out_free_control_pages:
+ kimage_free_page_list(&image->control_pages);
+out_free_post_load_bufs:
+ kimage_file_post_load_cleanup(image);
+out_free_image:
kfree(image);
-out:
- return result;
+ return ret;
}
static int kimage_is_destination_range(struct kimage *image,
@@ -609,7 +876,7 @@ static void kimage_free_extra_pages(struct kimage *image)
kimage_free_page_list(&image->dest_pages);
/* Walk through and free any unusable pages I have cached */
- kimage_free_page_list(&image->unuseable_pages);
+ kimage_free_page_list(&image->unusable_pages);
}
static void kimage_terminate(struct kimage *image)
@@ -663,6 +930,14 @@ static void kimage_free(struct kimage *image)
/* Free the kexec control pages... */
kimage_free_page_list(&image->control_pages);
+
+ /*
+ * Free up any temporary buffers allocated. This might hit if
+ * error occurred much later after buffer allocation.
+ */
+ if (image->file_mode)
+ kimage_file_post_load_cleanup(image);
+
kfree(image);
}
@@ -732,7 +1007,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
/* If the page cannot be used file it away */
if (page_to_pfn(page) >
(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
- list_add(&page->lru, &image->unuseable_pages);
+ list_add(&page->lru, &image->unusable_pages);
continue;
}
addr = page_to_pfn(page) << PAGE_SHIFT;
@@ -791,10 +1066,14 @@ static int kimage_load_normal_segment(struct kimage *image,
unsigned long maddr;
size_t ubytes, mbytes;
int result;
- unsigned char __user *buf;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
result = 0;
- buf = segment->buf;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
ubytes = segment->bufsz;
mbytes = segment->memsz;
maddr = segment->mem;
@@ -826,7 +1105,11 @@ static int kimage_load_normal_segment(struct kimage *image,
PAGE_SIZE - (maddr & ~PAGE_MASK));
uchunk = min(ubytes, mchunk);
- result = copy_from_user(ptr, buf, uchunk);
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
kunmap(page);
if (result) {
result = -EFAULT;
@@ -834,7 +1117,10 @@ static int kimage_load_normal_segment(struct kimage *image,
}
ubytes -= uchunk;
maddr += mchunk;
- buf += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
mbytes -= mchunk;
}
out:
@@ -851,10 +1137,14 @@ static int kimage_load_crash_segment(struct kimage *image,
unsigned long maddr;
size_t ubytes, mbytes;
int result;
- unsigned char __user *buf;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
result = 0;
- buf = segment->buf;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
ubytes = segment->bufsz;
mbytes = segment->memsz;
maddr = segment->mem;
@@ -877,7 +1167,12 @@ static int kimage_load_crash_segment(struct kimage *image,
/* Zero the trailing part of the page */
memset(ptr + uchunk, 0, mchunk - uchunk);
}
- result = copy_from_user(ptr, buf, uchunk);
+
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
kunmap(page);
if (result) {
@@ -886,7 +1181,10 @@ static int kimage_load_crash_segment(struct kimage *image,
}
ubytes -= uchunk;
maddr += mchunk;
- buf += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
mbytes -= mchunk;
}
out:
@@ -986,16 +1284,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
/* Loading another kernel to reboot into */
if ((flags & KEXEC_ON_CRASH) == 0)
- result = kimage_normal_alloc(&image, entry,
- nr_segments, segments);
+ result = kimage_alloc_init(&image, entry, nr_segments,
+ segments, flags);
/* Loading another kernel to switch to if this one crashes */
else if (flags & KEXEC_ON_CRASH) {
/* Free any current crash dump kernel before
* we corrupt it.
*/
kimage_free(xchg(&kexec_crash_image, NULL));
- result = kimage_crash_alloc(&image, entry,
- nr_segments, segments);
+ result = kimage_alloc_init(&image, entry, nr_segments,
+ segments, flags);
crash_map_reserved_pages();
}
if (result)
@@ -1077,6 +1375,82 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
}
#endif
+SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
+ unsigned long, cmdline_len, const char __user *, cmdline_ptr,
+ unsigned long, flags)
+{
+ int ret = 0, i;
+ struct kimage **dest_image, *image;
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ return -EPERM;
+
+ /* Make sure we have a legal set of flags */
+ if (flags != (flags & KEXEC_FILE_FLAGS))
+ return -EINVAL;
+
+ image = NULL;
+
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+
+ dest_image = &kexec_image;
+ if (flags & KEXEC_FILE_ON_CRASH)
+ dest_image = &kexec_crash_image;
+
+ if (flags & KEXEC_FILE_UNLOAD)
+ goto exchange;
+
+ /*
+ * In case of crash, new kernel gets loaded in reserved region. It is
+ * same memory where old crash kernel might be loaded. Free any
+ * current crash dump kernel before we corrupt it.
+ */
+ if (flags & KEXEC_FILE_ON_CRASH)
+ kimage_free(xchg(&kexec_crash_image, NULL));
+
+ ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
+ cmdline_len, flags);
+ if (ret)
+ goto out;
+
+ ret = machine_kexec_prepare(image);
+ if (ret)
+ goto out;
+
+ ret = kexec_calculate_store_digests(image);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+ i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+ ksegment->memsz);
+
+ ret = kimage_load_segment(image, &image->segment[i]);
+ if (ret)
+ goto out;
+ }
+
+ kimage_terminate(image);
+
+ /*
+ * Free up any temporary buffers allocated which are not needed
+ * after image has been loaded
+ */
+ kimage_file_post_load_cleanup(image);
+exchange:
+ image = xchg(dest_image, image);
+out:
+ mutex_unlock(&kexec_mutex);
+ kimage_free(image);
+ return ret;
+}
+
void crash_kexec(struct pt_regs *regs)
{
/* Take the kexec_mutex here to prevent sys_kexec_load
@@ -1632,6 +2006,683 @@ static int __init crash_save_vmcoreinfo_init(void)
subsys_initcall(crash_save_vmcoreinfo_init);
+static int __kexec_add_segment(struct kimage *image, char *buf,
+ unsigned long bufsz, unsigned long mem,
+ unsigned long memsz)
+{
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[image->nr_segments];
+ ksegment->kbuf = buf;
+ ksegment->bufsz = bufsz;
+ ksegment->mem = mem;
+ ksegment->memsz = memsz;
+ image->nr_segments++;
+
+ return 0;
+}
+
+static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
+ struct kexec_buf *kbuf)
+{
+ struct kimage *image = kbuf->image;
+ unsigned long temp_start, temp_end;
+
+ temp_end = min(end, kbuf->buf_max);
+ temp_start = temp_end - kbuf->memsz;
+
+ do {
+ /* align down start */
+ temp_start = temp_start & (~(kbuf->buf_align - 1));
+
+ if (temp_start < start || temp_start < kbuf->buf_min)
+ return 0;
+
+ temp_end = temp_start + kbuf->memsz - 1;
+
+ /*
+ * Make sure this does not conflict with any of existing
+ * segments
+ */
+ if (kimage_is_destination_range(image, temp_start, temp_end)) {
+ temp_start = temp_start - PAGE_SIZE;
+ continue;
+ }
+
+ /* We found a suitable memory range */
+ break;
+ } while (1);
+
+ /* If we are here, we found a suitable memory range */
+ __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start,
+ kbuf->memsz);
+
+ /* Success, stop navigating through remaining System RAM ranges */
+ return 1;
+}
+
+static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
+ struct kexec_buf *kbuf)
+{
+ struct kimage *image = kbuf->image;
+ unsigned long temp_start, temp_end;
+
+ temp_start = max(start, kbuf->buf_min);
+
+ do {
+ temp_start = ALIGN(temp_start, kbuf->buf_align);
+ temp_end = temp_start + kbuf->memsz - 1;
+
+ if (temp_end > end || temp_end > kbuf->buf_max)
+ return 0;
+ /*
+ * Make sure this does not conflict with any of existing
+ * segments
+ */
+ if (kimage_is_destination_range(image, temp_start, temp_end)) {
+ temp_start = temp_start + PAGE_SIZE;
+ continue;
+ }
+
+ /* We found a suitable memory range */
+ break;
+ } while (1);
+
+ /* If we are here, we found a suitable memory range */
+ __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start,
+ kbuf->memsz);
+
+ /* Success, stop navigating through remaining System RAM ranges */
+ return 1;
+}
+
+static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+{
+ struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+ unsigned long sz = end - start + 1;
+
+ /* Returning 0 will take to next memory range */
+ if (sz < kbuf->memsz)
+ return 0;
+
+ if (end < kbuf->buf_min || start > kbuf->buf_max)
+ return 0;
+
+ /*
+ * Allocate memory top down with-in ram range. Otherwise bottom up
+ * allocation.
+ */
+ if (kbuf->top_down)
+ return locate_mem_hole_top_down(start, end, kbuf);
+ return locate_mem_hole_bottom_up(start, end, kbuf);
+}
+
+/*
+ * Helper function for placing a buffer in a kexec segment. This assumes
+ * that kexec_mutex is held.
+ */
+int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
+ unsigned long memsz, unsigned long buf_align,
+ unsigned long buf_min, unsigned long buf_max,
+ bool top_down, unsigned long *load_addr)
+{
+
+ struct kexec_segment *ksegment;
+ struct kexec_buf buf, *kbuf;
+ int ret;
+
+ /* Currently adding segment this way is allowed only in file mode */
+ if (!image->file_mode)
+ return -EINVAL;
+
+ if (image->nr_segments >= KEXEC_SEGMENT_MAX)
+ return -EINVAL;
+
+ /*
+ * Make sure we are not trying to add buffer after allocating
+ * control pages. All segments need to be placed first before
+ * any control pages are allocated. As control page allocation
+ * logic goes through list of segments to make sure there are
+ * no destination overlaps.
+ */
+ if (!list_empty(&image->control_pages)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ memset(&buf, 0, sizeof(struct kexec_buf));
+ kbuf = &buf;
+ kbuf->image = image;
+ kbuf->buffer = buffer;
+ kbuf->bufsz = bufsz;
+
+ kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
+ kbuf->buf_align = max(buf_align, PAGE_SIZE);
+ kbuf->buf_min = buf_min;
+ kbuf->buf_max = buf_max;
+ kbuf->top_down = top_down;
+
+ /* Walk the RAM ranges and allocate a suitable range for the buffer */
+ if (image->type == KEXEC_TYPE_CRASH)
+ ret = walk_iomem_res("Crash kernel",
+ IORESOURCE_MEM | IORESOURCE_BUSY,
+ crashk_res.start, crashk_res.end, kbuf,
+ locate_mem_hole_callback);
+ else
+ ret = walk_system_ram_res(0, -1, kbuf,
+ locate_mem_hole_callback);
+ if (ret != 1) {
+ /* A suitable memory range could not be found for buffer */
+ return -EADDRNOTAVAIL;
+ }
+
+ /* Found a suitable memory range */
+ ksegment = &image->segment[image->nr_segments - 1];
+ *load_addr = ksegment->mem;
+ return 0;
+}
+
+/* Calculate and store the digest of segments */
+static int kexec_calculate_store_digests(struct kimage *image)
+{
+ struct crypto_shash *tfm;
+ struct shash_desc *desc;
+ int ret = 0, i, j, zero_buf_sz, sha_region_sz;
+ size_t desc_size, nullsz;
+ char *digest;
+ void *zero_buf;
+ struct kexec_sha_region *sha_regions;
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
+ zero_buf_sz = PAGE_SIZE;
+
+ tfm = crypto_alloc_shash("sha256", 0, 0);
+ if (IS_ERR(tfm)) {
+ ret = PTR_ERR(tfm);
+ goto out;
+ }
+
+ desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+ desc = kzalloc(desc_size, GFP_KERNEL);
+ if (!desc) {
+ ret = -ENOMEM;
+ goto out_free_tfm;
+ }
+
+ sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
+ sha_regions = vzalloc(sha_region_sz);
+ if (!sha_regions)
+ goto out_free_desc;
+
+ desc->tfm = tfm;
+ desc->flags = 0;
+
+ ret = crypto_shash_init(desc);
+ if (ret < 0)
+ goto out_free_sha_regions;
+
+ digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+ if (!digest) {
+ ret = -ENOMEM;
+ goto out_free_sha_regions;
+ }
+
+ for (j = i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ /*
+ * Skip purgatory as it will be modified once we put digest
+ * info in purgatory.
+ */
+ if (ksegment->kbuf == pi->purgatory_buf)
+ continue;
+
+ ret = crypto_shash_update(desc, ksegment->kbuf,
+ ksegment->bufsz);
+ if (ret)
+ break;
+
+ /*
+ * Assume rest of the buffer is filled with zero and
+ * update digest accordingly.
+ */
+ nullsz = ksegment->memsz - ksegment->bufsz;
+ while (nullsz) {
+ unsigned long bytes = nullsz;
+
+ if (bytes > zero_buf_sz)
+ bytes = zero_buf_sz;
+ ret = crypto_shash_update(desc, zero_buf, bytes);
+ if (ret)
+ break;
+ nullsz -= bytes;
+ }
+
+ if (ret)
+ break;
+
+ sha_regions[j].start = ksegment->mem;
+ sha_regions[j].len = ksegment->memsz;
+ j++;
+ }
+
+ if (!ret) {
+ ret = crypto_shash_final(desc, digest);
+ if (ret)
+ goto out_free_digest;
+ ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
+ sha_regions, sha_region_sz, 0);
+ if (ret)
+ goto out_free_digest;
+
+ ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
+ digest, SHA256_DIGEST_SIZE, 0);
+ if (ret)
+ goto out_free_digest;
+ }
+
+out_free_digest:
+ kfree(digest);
+out_free_sha_regions:
+ vfree(sha_regions);
+out_free_desc:
+ kfree(desc);
+out_free_tfm:
+ kfree(tfm);
+out:
+ return ret;
+}
+
+/* Actually load purgatory. Lot of code taken from kexec-tools */
+static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
+ unsigned long max, int top_down)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
+ unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
+ unsigned char *buf_addr, *src;
+ int i, ret = 0, entry_sidx = -1;
+ const Elf_Shdr *sechdrs_c;
+ Elf_Shdr *sechdrs = NULL;
+ void *purgatory_buf = NULL;
+
+ /*
+ * sechdrs_c points to section headers in purgatory and are read
+ * only. No modifications allowed.
+ */
+ sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
+
+ /*
+ * We can not modify sechdrs_c[] and its fields. It is read only.
+ * Copy it over to a local copy where one can store some temporary
+ * data and free it at the end. We need to modify ->sh_addr and
+ * ->sh_offset fields to keep track of permanent and temporary
+ * locations of sections.
+ */
+ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ if (!sechdrs)
+ return -ENOMEM;
+
+ memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+
+ /*
+ * We seem to have multiple copies of sections. First copy is which
+ * is embedded in kernel in read only section. Some of these sections
+ * will be copied to a temporary buffer and relocated. And these
+ * sections will finally be copied to their final destination at
+ * segment load time.
+ *
+ * Use ->sh_offset to reflect section address in memory. It will
+ * point to original read only copy if section is not allocatable.
+ * Otherwise it will point to temporary copy which will be relocated.
+ *
+ * Use ->sh_addr to contain final address of the section where it
+ * will go during execution time.
+ */
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type == SHT_NOBITS)
+ continue;
+
+ sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
+ sechdrs[i].sh_offset;
+ }
+
+ /*
+ * Identify entry point section and make entry relative to section
+ * start.
+ */
+ entry = pi->ehdr->e_entry;
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
+ continue;
+
+ /* Make entry section relative */
+ if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
+ ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
+ pi->ehdr->e_entry)) {
+ entry_sidx = i;
+ entry -= sechdrs[i].sh_addr;
+ break;
+ }
+ }
+
+ /* Determine how much memory is needed to load relocatable object. */
+ buf_align = 1;
+ bss_align = 1;
+ buf_sz = 0;
+ bss_sz = 0;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ if (buf_align < align)
+ buf_align = align;
+ buf_sz = ALIGN(buf_sz, align);
+ buf_sz += sechdrs[i].sh_size;
+ } else {
+ /* bss section */
+ if (bss_align < align)
+ bss_align = align;
+ bss_sz = ALIGN(bss_sz, align);
+ bss_sz += sechdrs[i].sh_size;
+ }
+ }
+
+ /* Determine the bss padding required to align bss properly */
+ bss_pad = 0;
+ if (buf_sz & (bss_align - 1))
+ bss_pad = bss_align - (buf_sz & (bss_align - 1));
+
+ memsz = buf_sz + bss_pad + bss_sz;
+
+ /* Allocate buffer for purgatory */
+ purgatory_buf = vzalloc(buf_sz);
+ if (!purgatory_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (buf_align < bss_align)
+ buf_align = bss_align;
+
+ /* Add buffer to segment list */
+ ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
+ buf_align, min, max, top_down,
+ &pi->purgatory_load_addr);
+ if (ret)
+ goto out;
+
+ /* Load SHF_ALLOC sections */
+ buf_addr = purgatory_buf;
+ load_addr = curr_load_addr = pi->purgatory_load_addr;
+ bss_addr = load_addr + buf_sz + bss_pad;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ curr_load_addr = ALIGN(curr_load_addr, align);
+ offset = curr_load_addr - load_addr;
+ /* We already modifed ->sh_offset to keep src addr */
+ src = (char *) sechdrs[i].sh_offset;
+ memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
+
+ /* Store load address and source address of section */
+ sechdrs[i].sh_addr = curr_load_addr;
+
+ /*
+ * This section got copied to temporary buffer. Update
+ * ->sh_offset accordingly.
+ */
+ sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
+
+ /* Advance to the next address */
+ curr_load_addr += sechdrs[i].sh_size;
+ } else {
+ bss_addr = ALIGN(bss_addr, align);
+ sechdrs[i].sh_addr = bss_addr;
+ bss_addr += sechdrs[i].sh_size;
+ }
+ }
+
+ /* Update entry point based on load address of text section */
+ if (entry_sidx >= 0)
+ entry += sechdrs[entry_sidx].sh_addr;
+
+ /* Make kernel jump to purgatory after shutdown */
+ image->start = entry;
+
+ /* Used later to get/set symbol values */
+ pi->sechdrs = sechdrs;
+
+ /*
+ * Used later to identify which section is purgatory and skip it
+ * from checksumming.
+ */
+ pi->purgatory_buf = purgatory_buf;
+ return ret;
+out:
+ vfree(sechdrs);
+ vfree(purgatory_buf);
+ return ret;
+}
+
+static int kexec_apply_relocations(struct kimage *image)
+{
+ int i, ret;
+ struct purgatory_info *pi = &image->purgatory_info;
+ Elf_Shdr *sechdrs = pi->sechdrs;
+
+ /* Apply relocations */
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ Elf_Shdr *section, *symtab;
+
+ if (sechdrs[i].sh_type != SHT_RELA &&
+ sechdrs[i].sh_type != SHT_REL)
+ continue;
+
+ /*
+ * For section of type SHT_RELA/SHT_REL,
+ * ->sh_link contains section header index of associated
+ * symbol table. And ->sh_info contains section header
+ * index of section to which relocations apply.
+ */
+ if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
+ sechdrs[i].sh_link >= pi->ehdr->e_shnum)
+ return -ENOEXEC;
+
+ section = &sechdrs[sechdrs[i].sh_info];
+ symtab = &sechdrs[sechdrs[i].sh_link];
+
+ if (!(section->sh_flags & SHF_ALLOC))
+ continue;
+
+ /*
+ * symtab->sh_link contain section header index of associated
+ * string table.
+ */
+ if (symtab->sh_link >= pi->ehdr->e_shnum)
+ /* Invalid section number? */
+ continue;
+
+ /*
+ * Respective archicture needs to provide support for applying
+ * relocations of type SHT_RELA/SHT_REL.
+ */
+ if (sechdrs[i].sh_type == SHT_RELA)
+ ret = arch_kexec_apply_relocations_add(pi->ehdr,
+ sechdrs, i);
+ else if (sechdrs[i].sh_type == SHT_REL)
+ ret = arch_kexec_apply_relocations(pi->ehdr,
+ sechdrs, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Load relocatable purgatory object and relocate it appropriately */
+int kexec_load_purgatory(struct kimage *image, unsigned long min,
+ unsigned long max, int top_down,
+ unsigned long *load_addr)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ int ret;
+
+ if (kexec_purgatory_size <= 0)
+ return -EINVAL;
+
+ if (kexec_purgatory_size < sizeof(Elf_Ehdr))
+ return -ENOEXEC;
+
+ pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
+
+ if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
+ || pi->ehdr->e_type != ET_REL
+ || !elf_check_arch(pi->ehdr)
+ || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
+ return -ENOEXEC;
+
+ if (pi->ehdr->e_shoff >= kexec_purgatory_size
+ || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
+ kexec_purgatory_size - pi->ehdr->e_shoff))
+ return -ENOEXEC;
+
+ ret = __kexec_load_purgatory(image, min, max, top_down);
+ if (ret)
+ return ret;
+
+ ret = kexec_apply_relocations(image);
+ if (ret)
+ goto out;
+
+ *load_addr = pi->purgatory_load_addr;
+ return 0;
+out:
+ vfree(pi->sechdrs);
+ vfree(pi->purgatory_buf);
+ return ret;
+}
+
+static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+ const char *name)
+{
+ Elf_Sym *syms;
+ Elf_Shdr *sechdrs;
+ Elf_Ehdr *ehdr;
+ int i, k;
+ const char *strtab;
+
+ if (!pi->sechdrs || !pi->ehdr)
+ return NULL;
+
+ sechdrs = pi->sechdrs;
+ ehdr = pi->ehdr;
+
+ for (i = 0; i < ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type != SHT_SYMTAB)
+ continue;
+
+ if (sechdrs[i].sh_link >= ehdr->e_shnum)
+ /* Invalid strtab section number */
+ continue;
+ strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
+ syms = (Elf_Sym *)sechdrs[i].sh_offset;
+
+ /* Go through symbols for a match */
+ for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
+ if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
+ continue;
+
+ if (strcmp(strtab + syms[k].st_name, name) != 0)
+ continue;
+
+ if (syms[k].st_shndx == SHN_UNDEF ||
+ syms[k].st_shndx >= ehdr->e_shnum) {
+ pr_debug("Symbol: %s has bad section index %d.\n",
+ name, syms[k].st_shndx);
+ return NULL;
+ }
+
+ /* Found the symbol we are looking for */
+ return &syms[k];
+ }
+ }
+
+ return NULL;
+}
+
+void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ Elf_Sym *sym;
+ Elf_Shdr *sechdr;
+
+ sym = kexec_purgatory_find_symbol(pi, name);
+ if (!sym)
+ return ERR_PTR(-EINVAL);
+
+ sechdr = &pi->sechdrs[sym->st_shndx];
+
+ /*
+ * Returns the address where symbol will finally be loaded after
+ * kexec_load_segment()
+ */
+ return (void *)(sechdr->sh_addr + sym->st_value);
+}
+
+/*
+ * Get or set value of a symbol. If "get_value" is true, symbol value is
+ * returned in buf otherwise symbol value is set based on value in buf.
+ */
+int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
+ void *buf, unsigned int size, bool get_value)
+{
+ Elf_Sym *sym;
+ Elf_Shdr *sechdrs;
+ struct purgatory_info *pi = &image->purgatory_info;
+ char *sym_buf;
+
+ sym = kexec_purgatory_find_symbol(pi, name);
+ if (!sym)
+ return -EINVAL;
+
+ if (sym->st_size != size) {
+ pr_err("symbol %s size mismatch: expected %lu actual %u\n",
+ name, (unsigned long)sym->st_size, size);
+ return -EINVAL;
+ }
+
+ sechdrs = pi->sechdrs;
+
+ if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ pr_err("symbol %s is in a bss section. Cannot %s\n", name,
+ get_value ? "get" : "set");
+ return -EINVAL;
+ }
+
+ sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
+ sym->st_value;
+
+ if (get_value)
+ memcpy((void *)buf, sym_buf, size);
+ else
+ memcpy((void *)sym_buf, buf, size);
+
+ return 0;
+}
+
/*
* Move into place and start executing a preloaded standalone
* executable. If nothing was preloaded return an error.
diff --git a/kernel/module.c b/kernel/module.c
index ae79ce6..6f69463 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3381,6 +3381,8 @@ static inline int within(unsigned long addr, void *start, unsigned long size)
*/
static inline int is_arm_mapping_symbol(const char *str)
{
+ if (str[0] == '.' && str[1] == 'L')
+ return true;
return str[0] == '$' && strchr("atd", str[1])
&& (str[2] == '\0' || str[2] == '.');
}
@@ -3444,8 +3446,7 @@ const char *module_address_lookup(unsigned long addr,
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (within_module_init(addr, mod) ||
- within_module_core(addr, mod)) {
+ if (within_module(addr, mod)) {
if (modname)
*modname = mod->name;
ret = get_ksymbol(mod, addr, size, offset);
@@ -3469,8 +3470,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (within_module_init(addr, mod) ||
- within_module_core(addr, mod)) {
+ if (within_module(addr, mod)) {
const char *sym;
sym = get_ksymbol(mod, addr, NULL, NULL);
@@ -3495,8 +3495,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (within_module_init(addr, mod) ||
- within_module_core(addr, mod)) {
+ if (within_module(addr, mod)) {
const char *sym;
sym = get_ksymbol(mod, addr, size, offset);
@@ -3760,8 +3759,7 @@ struct module *__module_address(unsigned long addr)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (within_module_core(addr, mod)
- || within_module_init(addr, mod))
+ if (within_module(addr, mod))
return mod;
}
return NULL;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 8e78110..ef42d0a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
might_sleep();
+ task_lock(p);
ns = p->nsproxy;
+ p->nsproxy = new;
+ task_unlock(p);
- rcu_assign_pointer(p->nsproxy, new);
-
- if (ns && atomic_dec_and_test(&ns->count)) {
- /*
- * wait for others to get what they want from this nsproxy.
- *
- * cannot release this nsproxy via the call_rcu() since
- * put_mnt_ns() will want to sleep
- */
- synchronize_rcu();
+ if (ns && atomic_dec_and_test(&ns->count))
free_nsproxy(ns);
- }
}
void exit_task_namespaces(struct task_struct *p)
diff --git a/kernel/panic.c b/kernel/panic.c
index 62e16ce..d09dc5c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -224,6 +224,7 @@ static const struct tnt tnts[] = {
{ TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
{ TAINT_OOT_MODULE, 'O', ' ' },
{ TAINT_UNSIGNED_MODULE, 'E', ' ' },
+ { TAINT_SOFTLOCKUP, 'L', ' ' },
};
/**
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 13e839d..e04c455 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -45,6 +45,7 @@
#include <linux/poll.h>
#include <linux/irq_work.h>
#include <linux/utsname.h>
+#include <linux/ctype.h>
#include <asm/uaccess.h>
@@ -56,7 +57,7 @@
int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
- DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
+ MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
};
@@ -113,9 +114,9 @@ static int __down_trylock_console_sem(unsigned long ip)
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
* definitely not the perfect debug tool (we don't know if _WE_
- * hold it are racing, but it helps tracking those weird code
- * path in the console code where we end up in places I want
- * locked without the console sempahore held
+ * hold it and are racing, but it helps tracking those weird code
+ * paths in the console code where we end up in places I want
+ * locked without the console sempahore held).
*/
static int console_locked, console_suspended;
@@ -146,8 +147,8 @@ static int console_may_schedule;
* the overall length of the record.
*
* The heads to the first and last entry in the buffer, as well as the
- * sequence numbers of these both entries are maintained when messages
- * are stored..
+ * sequence numbers of these entries are maintained when messages are
+ * stored.
*
* If the heads indicate available messages, the length in the header
* tells the start next message. A length == 0 for the next message
@@ -257,7 +258,7 @@ static u64 clear_seq;
static u32 clear_idx;
#define PREFIX_MAX 32
-#define LOG_LINE_MAX 1024 - PREFIX_MAX
+#define LOG_LINE_MAX (1024 - PREFIX_MAX)
/* record buffer */
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
@@ -266,10 +267,23 @@ static u32 clear_idx;
#define LOG_ALIGN __alignof__(struct printk_log)
#endif
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
+#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
+/* Return log buffer address */
+char *log_buf_addr_get(void)
+{
+ return log_buf;
+}
+
+/* Return log buffer size */
+u32 log_buf_len_get(void)
+{
+ return log_buf_len;
+}
+
/* human readable text of the record */
static char *log_text(const struct printk_log *msg)
{
@@ -344,7 +358,7 @@ static int log_make_free_space(u32 msg_size)
while (log_first_seq < log_next_seq) {
if (logbuf_has_space(msg_size, false))
return 0;
- /* drop old messages until we have enough continuous space */
+ /* drop old messages until we have enough contiguous space */
log_first_idx = log_next(log_first_idx);
log_first_seq++;
}
@@ -453,11 +467,7 @@ static int log_store(int facility, int level,
return msg->text_len;
}
-#ifdef CONFIG_SECURITY_DMESG_RESTRICT
-int dmesg_restrict = 1;
-#else
-int dmesg_restrict;
-#endif
+int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
static int syslog_action_restricted(int type)
{
@@ -828,34 +838,74 @@ void log_buf_kexec_setup(void)
/* requested log_buf_len from kernel cmdline */
static unsigned long __initdata new_log_buf_len;
-/* save requested log_buf_len since it's too early to process it */
-static int __init log_buf_len_setup(char *str)
+/* we practice scaling the ring buffer by powers of 2 */
+static void __init log_buf_len_update(unsigned size)
{
- unsigned size = memparse(str, &str);
-
if (size)
size = roundup_pow_of_two(size);
if (size > log_buf_len)
new_log_buf_len = size;
+}
+
+/* save requested log_buf_len since it's too early to process it */
+static int __init log_buf_len_setup(char *str)
+{
+ unsigned size = memparse(str, &str);
+
+ log_buf_len_update(size);
return 0;
}
early_param("log_buf_len", log_buf_len_setup);
+static void __init log_buf_add_cpu(void)
+{
+ unsigned int cpu_extra;
+
+ /*
+ * archs should set up cpu_possible_bits properly with
+ * set_cpu_possible() after setup_arch() but just in
+ * case lets ensure this is valid.
+ */
+ if (num_possible_cpus() == 1)
+ return;
+
+ cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN;
+
+ /* by default this will only continue through for large > 64 CPUs */
+ if (cpu_extra <= __LOG_BUF_LEN / 2)
+ return;
+
+ pr_info("log_buf_len individual max cpu contribution: %d bytes\n",
+ __LOG_CPU_MAX_BUF_LEN);
+ pr_info("log_buf_len total cpu_extra contributions: %d bytes\n",
+ cpu_extra);
+ pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN);
+
+ log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
+}
+
void __init setup_log_buf(int early)
{
unsigned long flags;
char *new_log_buf;
int free;
+ if (log_buf != __log_buf)
+ return;
+
+ if (!early && !new_log_buf_len)
+ log_buf_add_cpu();
+
if (!new_log_buf_len)
return;
if (early) {
new_log_buf =
- memblock_virt_alloc(new_log_buf_len, PAGE_SIZE);
+ memblock_virt_alloc(new_log_buf_len, LOG_ALIGN);
} else {
- new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0);
+ new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len,
+ LOG_ALIGN);
}
if (unlikely(!new_log_buf)) {
@@ -872,7 +922,7 @@ void __init setup_log_buf(int early)
memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
- pr_info("log_buf_len: %d\n", log_buf_len);
+ pr_info("log_buf_len: %d bytes\n", log_buf_len);
pr_info("early log buf free: %d(%d%%)\n",
free, (free * 100) / __LOG_BUF_LEN);
}
@@ -881,7 +931,7 @@ static bool __read_mostly ignore_loglevel;
static int __init ignore_loglevel_setup(char *str)
{
- ignore_loglevel = 1;
+ ignore_loglevel = true;
pr_info("debug: ignoring loglevel setting.\n");
return 0;
@@ -947,11 +997,7 @@ static inline void boot_delay_msec(int level)
}
#endif
-#if defined(CONFIG_PRINTK_TIME)
-static bool printk_time = 1;
-#else
-static bool printk_time;
-#endif
+static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
static size_t print_time(u64 ts, char *buf)
@@ -1310,7 +1356,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
* for pending data, not the size; return the count of
* records, not the length.
*/
- error = log_next_idx - syslog_idx;
+ error = log_next_seq - syslog_seq;
} else {
u64 seq = syslog_seq;
u32 idx = syslog_idx;
@@ -1416,10 +1462,9 @@ static int have_callable_console(void)
/*
* Can we actually use the console at this time on this cpu?
*
- * Console drivers may assume that per-cpu resources have
- * been allocated. So unless they're explicitly marked as
- * being able to cope (CON_ANYTIME) don't call them until
- * this CPU is officially up.
+ * Console drivers may assume that per-cpu resources have been allocated. So
+ * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
+ * call them until this CPU is officially up.
*/
static inline int can_use_console(unsigned int cpu)
{
@@ -1432,8 +1477,10 @@ static inline int can_use_console(unsigned int cpu)
* console_lock held, and 'console_locked' set) if it
* is successful, false otherwise.
*/
-static int console_trylock_for_printk(unsigned int cpu)
+static int console_trylock_for_printk(void)
{
+ unsigned int cpu = smp_processor_id();
+
if (!console_trylock())
return 0;
/*
@@ -1476,7 +1523,7 @@ static struct cont {
struct task_struct *owner; /* task of first print*/
u64 ts_nsec; /* time of first print */
u8 level; /* log level of first message */
- u8 facility; /* log level of first message */
+ u8 facility; /* log facility of first message */
enum log_flags flags; /* prefix, newline flags */
bool flushed:1; /* buffer sealed and committed */
} cont;
@@ -1608,7 +1655,8 @@ asmlinkage int vprintk_emit(int facility, int level,
*/
if (!oops_in_progress && !lockdep_recursing(current)) {
recursion_bug = 1;
- goto out_restore_irqs;
+ local_irq_restore(flags);
+ return 0;
}
zap_locks();
}
@@ -1716,21 +1764,30 @@ asmlinkage int vprintk_emit(int facility, int level,
logbuf_cpu = UINT_MAX;
raw_spin_unlock(&logbuf_lock);
+ lockdep_on();
+ local_irq_restore(flags);
/* If called from the scheduler, we can not call up(). */
if (!in_sched) {
+ lockdep_off();
+ /*
+ * Disable preemption to avoid being preempted while holding
+ * console_sem which would prevent anyone from printing to
+ * console
+ */
+ preempt_disable();
+
/*
* Try to acquire and then immediately release the console
* semaphore. The release will print out buffers and wake up
* /dev/kmsg and syslog() users.
*/
- if (console_trylock_for_printk(this_cpu))
+ if (console_trylock_for_printk())
console_unlock();
+ preempt_enable();
+ lockdep_on();
}
- lockdep_on();
-out_restore_irqs:
- local_irq_restore(flags);
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
@@ -1802,7 +1859,7 @@ EXPORT_SYMBOL(printk);
#define LOG_LINE_MAX 0
#define PREFIX_MAX 0
-#define LOG_LINE_MAX 0
+
static u64 syslog_seq;
static u32 syslog_idx;
static u64 console_seq;
@@ -1881,11 +1938,12 @@ static int __add_preferred_console(char *name, int idx, char *options,
return 0;
}
/*
- * Set up a list of consoles. Called from init/main.c
+ * Set up a console. Called via do_early_param() in init/main.c
+ * for each "console=" parameter in the boot command line.
*/
static int __init console_setup(char *str)
{
- char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */
+ char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */
char *s, *options, *brl_options = NULL;
int idx;
@@ -1902,7 +1960,8 @@ static int __init console_setup(char *str)
strncpy(buf, str, sizeof(buf) - 1);
}
buf[sizeof(buf) - 1] = 0;
- if ((options = strchr(str, ',')) != NULL)
+ options = strchr(str, ',');
+ if (options)
*(options++) = 0;
#ifdef __sparc__
if (!strcmp(str, "ttya"))
@@ -1911,7 +1970,7 @@ static int __init console_setup(char *str)
strcpy(buf, "ttyS1");
#endif
for (s = buf; *s; s++)
- if ((*s >= '0' && *s <= '9') || *s == ',')
+ if (isdigit(*s) || *s == ',')
break;
idx = simple_strtoul(s, NULL, 10);
*s = 0;
@@ -1950,7 +2009,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
i++, c++)
if (strcmp(c->name, name) == 0 && c->index == idx) {
strlcpy(c->name, name_new, sizeof(c->name));
- c->name[sizeof(c->name) - 1] = 0;
c->options = options;
c->index = idx_new;
return i;
@@ -1959,12 +2017,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
return -1;
}
-bool console_suspend_enabled = 1;
+bool console_suspend_enabled = true;
EXPORT_SYMBOL(console_suspend_enabled);
static int __init console_suspend_disable(char *str)
{
- console_suspend_enabled = 0;
+ console_suspend_enabled = false;
return 1;
}
__setup("no_console_suspend", console_suspend_disable);
@@ -2045,8 +2103,8 @@ EXPORT_SYMBOL(console_lock);
/**
* console_trylock - try to lock the console system for exclusive use.
*
- * Tried to acquire a lock which guarantees that the caller has
- * exclusive access to the console system and the console_drivers list.
+ * Try to acquire a lock which guarantees that the caller has exclusive
+ * access to the console system and the console_drivers list.
*
* returns 1 on success, and 0 on failure to acquire the lock.
*/
@@ -2618,14 +2676,13 @@ EXPORT_SYMBOL(__printk_ratelimit);
bool printk_timed_ratelimit(unsigned long *caller_jiffies,
unsigned int interval_msecs)
{
- if (*caller_jiffies == 0
- || !time_in_range(jiffies, *caller_jiffies,
- *caller_jiffies
- + msecs_to_jiffies(interval_msecs))) {
- *caller_jiffies = jiffies;
- return true;
- }
- return false;
+ unsigned long elapsed = jiffies - *caller_jiffies;
+
+ if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs))
+ return false;
+
+ *caller_jiffies = jiffies;
+ return true;
}
EXPORT_SYMBOL(printk_timed_ratelimit);
diff --git a/kernel/resource.c b/kernel/resource.c
index 3c2237a..da14b8d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock);
static struct resource *bootmem_resource_free;
static DEFINE_SPINLOCK(bootmem_resource_lock);
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+static struct resource *next_resource(struct resource *p, bool sibling_only)
{
- struct resource *p = v;
- (*pos)++;
+ /* Caller wants to traverse through siblings only */
+ if (sibling_only)
+ return p->sibling;
+
if (p->child)
return p->child;
while (!p->sibling && p->parent)
@@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
return p->sibling;
}
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct resource *p = v;
+ (*pos)++;
+ return (void *)next_resource(p, false);
+}
+
#ifdef CONFIG_PROC_FS
enum { MAX_IORES_LEVEL = 5 };
@@ -322,16 +331,19 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
-#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
/*
- * Finds the lowest memory reosurce exists within [res->start.res->end)
+ * Finds the lowest iomem reosurce exists with-in [res->start.res->end)
* the caller must specify res->start, res->end, res->flags and "name".
* If found, returns 0, res is overwritten, if not found, returns -1.
+ * This walks through whole tree and not just first level children
+ * until and unless first_level_children_only is true.
*/
-static int find_next_system_ram(struct resource *res, char *name)
+static int find_next_iomem_res(struct resource *res, char *name,
+ bool first_level_children_only)
{
resource_size_t start, end;
struct resource *p;
+ bool sibling_only = false;
BUG_ON(!res);
@@ -340,8 +352,14 @@ static int find_next_system_ram(struct resource *res, char *name)
BUG_ON(start >= end);
read_lock(&resource_lock);
- for (p = iomem_resource.child; p ; p = p->sibling) {
- /* system ram is just marked as IORESOURCE_MEM */
+
+ if (first_level_children_only) {
+ p = iomem_resource.child;
+ sibling_only = true;
+ } else
+ p = &iomem_resource;
+
+ while ((p = next_resource(p, sibling_only))) {
if (p->flags != res->flags)
continue;
if (name && strcmp(p->name, name))
@@ -353,6 +371,7 @@ static int find_next_system_ram(struct resource *res, char *name)
if ((p->end >= start) && (p->start < end))
break;
}
+
read_unlock(&resource_lock);
if (!p)
return -1;
@@ -365,6 +384,70 @@ static int find_next_system_ram(struct resource *res, char *name)
}
/*
+ * Walks through iomem resources and calls func() with matching resource
+ * ranges. This walks through whole tree and not just first level children.
+ * All the memory ranges which overlap start,end and also match flags and
+ * name are valid candidates.
+ *
+ * @name: name of resource
+ * @flags: resource flags
+ * @start: start addr
+ * @end: end addr
+ */
+int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
+ void *arg, int (*func)(u64, u64, void *))
+{
+ struct resource res;
+ u64 orig_end;
+ int ret = -1;
+
+ res.start = start;
+ res.end = end;
+ res.flags = flags;
+ orig_end = res.end;
+ while ((res.start < res.end) &&
+ (!find_next_iomem_res(&res, name, false))) {
+ ret = (*func)(res.start, res.end, arg);
+ if (ret)
+ break;
+ res.start = res.end + 1;
+ res.end = orig_end;
+ }
+ return ret;
+}
+
+/*
+ * This function calls callback against all memory range of "System RAM"
+ * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
+ * Now, this function is only for "System RAM". This function deals with
+ * full ranges and not pfn. If resources are not pfn aligned, dealing
+ * with pfn can truncate ranges.
+ */
+int walk_system_ram_res(u64 start, u64 end, void *arg,
+ int (*func)(u64, u64, void *))
+{
+ struct resource res;
+ u64 orig_end;
+ int ret = -1;
+
+ res.start = start;
+ res.end = end;
+ res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ orig_end = res.end;
+ while ((res.start < res.end) &&
+ (!find_next_iomem_res(&res, "System RAM", true))) {
+ ret = (*func)(res.start, res.end, arg);
+ if (ret)
+ break;
+ res.start = res.end + 1;
+ res.end = orig_end;
+ }
+ return ret;
+}
+
+#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
+
+/*
* This function calls callback against all memory range of "System RAM"
* which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
* Now, this function is only for "System RAM".
@@ -382,7 +465,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
orig_end = res.end;
while ((res.start < res.end) &&
- (find_next_system_ram(&res, "System RAM") >= 0)) {
+ (find_next_iomem_res(&res, "System RAM", true) >= 0)) {
pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
end_pfn = (res.end + 1) >> PAGE_SHIFT;
if (end_pfn > pfn)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 25b0043..44eb005 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -203,7 +203,7 @@ static u32 seccomp_run_filters(int syscall)
static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
{
- BUG_ON(!spin_is_locked(&current->sighand->siglock));
+ assert_spin_locked(&current->sighand->siglock);
if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
return false;
@@ -214,7 +214,7 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
static inline void seccomp_assign_mode(struct task_struct *task,
unsigned long seccomp_mode)
{
- BUG_ON(!spin_is_locked(&task->sighand->siglock));
+ assert_spin_locked(&task->sighand->siglock);
task->seccomp.mode = seccomp_mode;
/*
@@ -253,7 +253,7 @@ static inline pid_t seccomp_can_sync_threads(void)
struct task_struct *thread, *caller;
BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
- BUG_ON(!spin_is_locked(&current->sighand->siglock));
+ assert_spin_locked(&current->sighand->siglock);
/* Validate all threads being eligible for synchronization. */
caller = current;
@@ -294,7 +294,7 @@ static inline void seccomp_sync_threads(void)
struct task_struct *thread, *caller;
BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
- BUG_ON(!spin_is_locked(&current->sighand->siglock));
+ assert_spin_locked(&current->sighand->siglock);
/* Synchronize all threads. */
caller = current;
@@ -464,7 +464,7 @@ static long seccomp_attach_filter(unsigned int flags,
unsigned long total_insns;
struct seccomp_filter *walker;
- BUG_ON(!spin_is_locked(&current->sighand->siglock));
+ assert_spin_locked(&current->sighand->siglock);
/* Validate resulting filter length. */
total_insns = filter->prog->len;
diff --git a/kernel/signal.c b/kernel/signal.c
index 40b76e3..8f0876f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2170,8 +2170,7 @@ static int ptrace_signal(int signr, siginfo_t *info)
return signr;
}
-int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
- struct pt_regs *regs, void *cookie)
+int get_signal(struct ksignal *ksig)
{
struct sighand_struct *sighand = current->sighand;
struct signal_struct *signal = current->signal;
@@ -2241,13 +2240,13 @@ relock:
goto relock;
}
- signr = dequeue_signal(current, &current->blocked, info);
+ signr = dequeue_signal(current, &current->blocked, &ksig->info);
if (!signr)
break; /* will return 0 */
if (unlikely(current->ptrace) && signr != SIGKILL) {
- signr = ptrace_signal(signr, info);
+ signr = ptrace_signal(signr, &ksig->info);
if (!signr)
continue;
}
@@ -2255,13 +2254,13 @@ relock:
ka = &sighand->action[signr-1];
/* Trace actually delivered signals. */
- trace_signal_deliver(signr, info, ka);
+ trace_signal_deliver(signr, &ksig->info, ka);
if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
continue;
if (ka->sa.sa_handler != SIG_DFL) {
/* Run the handler. */
- *return_ka = *ka;
+ ksig->ka = *ka;
if (ka->sa.sa_flags & SA_ONESHOT)
ka->sa.sa_handler = SIG_DFL;
@@ -2311,7 +2310,7 @@ relock:
spin_lock_irq(&sighand->siglock);
}
- if (likely(do_signal_stop(info->si_signo))) {
+ if (likely(do_signal_stop(ksig->info.si_signo))) {
/* It released the siglock. */
goto relock;
}
@@ -2332,7 +2331,7 @@ relock:
if (sig_kernel_coredump(signr)) {
if (print_fatal_signals)
- print_fatal_signal(info->si_signo);
+ print_fatal_signal(ksig->info.si_signo);
proc_coredump_connector(current);
/*
* If it was able to dump core, this kills all
@@ -2342,34 +2341,32 @@ relock:
* first and our do_group_exit call below will use
* that value and ignore the one we pass it.
*/
- do_coredump(info);
+ do_coredump(&ksig->info);
}
/*
* Death signals, no core dump.
*/
- do_group_exit(info->si_signo);
+ do_group_exit(ksig->info.si_signo);
/* NOTREACHED */
}
spin_unlock_irq(&sighand->siglock);
- return signr;
+
+ ksig->sig = signr;
+ return ksig->sig > 0;
}
/**
* signal_delivered -
- * @sig: number of signal being delivered
- * @info: siginfo_t of signal being delivered
- * @ka: sigaction setting that chose the handler
- * @regs: user register state
+ * @ksig: kernel signal struct
* @stepping: nonzero if debugger single-step or block-step in use
*
* This function should be called when a signal has successfully been
- * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask
+ * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask
* is always blocked, and the signal itself is blocked unless %SA_NODEFER
- * is set in @ka->sa.sa_flags. Tracing is notified.
+ * is set in @ksig->ka.sa.sa_flags. Tracing is notified.
*/
-void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
- struct pt_regs *regs, int stepping)
+static void signal_delivered(struct ksignal *ksig, int stepping)
{
sigset_t blocked;
@@ -2379,11 +2376,11 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
simply clear the restore sigmask flag. */
clear_restore_sigmask();
- sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
- if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&blocked, sig);
+ sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask);
+ if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
+ sigaddset(&blocked, ksig->sig);
set_current_blocked(&blocked);
- tracehook_signal_handler(sig, info, ka, regs, stepping);
+ tracehook_signal_handler(stepping);
}
void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
@@ -2391,8 +2388,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
if (failed)
force_sigsegv(ksig->sig, current);
else
- signal_delivered(ksig->sig, &ksig->info, &ksig->ka,
- signal_pt_regs(), stepping);
+ signal_delivered(ksig, stepping);
}
/*
diff --git a/kernel/smp.c b/kernel/smp.c
index 487653b..aff8aa1 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -670,7 +670,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
if (cond_func(cpu, info)) {
ret = smp_call_function_single(cpu, func,
info, wait);
- WARN_ON_ONCE(!ret);
+ WARN_ON_ONCE(ret);
}
preempt_enable();
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 2904a21..391d4dd 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,6 +25,7 @@ cond_syscall(sys_swapon);
cond_syscall(sys_swapoff);
cond_syscall(sys_kexec_load);
cond_syscall(compat_sys_kexec_load);
+cond_syscall(sys_kexec_file_load);
cond_syscall(sys_init_module);
cond_syscall(sys_finit_module);
cond_syscall(sys_delete_module);
@@ -197,6 +198,7 @@ cond_syscall(compat_sys_timerfd_settime);
cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
+cond_syscall(sys_memfd_create);
/* performance counters: */
cond_syscall(sys_perf_event_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 75b22e2..75875a7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1240,8 +1240,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = hugetlb_sysctl_handler,
- .extra1 = (void *)&hugetlb_zero,
- .extra2 = (void *)&hugetlb_infinity,
+ .extra1 = &zero,
},
#ifdef CONFIG_NUMA
{
@@ -1250,8 +1249,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = &hugetlb_mempolicy_sysctl_handler,
- .extra1 = (void *)&hugetlb_zero,
- .extra2 = (void *)&hugetlb_infinity,
+ .extra1 = &zero,
},
#endif
{
@@ -1274,8 +1272,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = hugetlb_overcommit_handler,
- .extra1 = (void *)&hugetlb_zero,
- .extra2 = (void *)&hugetlb_infinity,
+ .extra1 = &zero,
},
#endif
{
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 12d6ebbf..0dbab6d 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -14,6 +14,8 @@
* the GNU General Public License for more details.
*/
+#define pr_fmt(fmt) "Kprobe smoke test: " fmt
+
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/random.h>
@@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
{
if (preh_val != (rand1 / div_factor)) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in post_handler\n");
+ pr_err("incorrect value in post_handler\n");
}
posth_val = preh_val + div_factor;
}
@@ -59,8 +60,7 @@ static int test_kprobe(void)
ret = register_kprobe(&kp);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_kprobe returned %d\n", ret);
+ pr_err("register_kprobe returned %d\n", ret);
return ret;
}
@@ -68,14 +68,12 @@ static int test_kprobe(void)
unregister_kprobe(&kp);
if (preh_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe pre_handler not called\n");
+ pr_err("kprobe pre_handler not called\n");
handler_errors++;
}
if (posth_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe post_handler not called\n");
+ pr_err("kprobe post_handler not called\n");
handler_errors++;
}
@@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
{
if (preh_val != (rand1 / div_factor) + 1) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in post_handler2\n");
+ pr_err("incorrect value in post_handler2\n");
}
posth_val = preh_val + div_factor;
}
@@ -120,8 +117,7 @@ static int test_kprobes(void)
kp.flags = 0;
ret = register_kprobes(kps, 2);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_kprobes returned %d\n", ret);
+ pr_err("register_kprobes returned %d\n", ret);
return ret;
}
@@ -130,14 +126,12 @@ static int test_kprobes(void)
ret = target(rand1);
if (preh_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe pre_handler not called\n");
+ pr_err("kprobe pre_handler not called\n");
handler_errors++;
}
if (posth_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe post_handler not called\n");
+ pr_err("kprobe post_handler not called\n");
handler_errors++;
}
@@ -146,14 +140,12 @@ static int test_kprobes(void)
ret = target2(rand1);
if (preh_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe pre_handler2 not called\n");
+ pr_err("kprobe pre_handler2 not called\n");
handler_errors++;
}
if (posth_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe post_handler2 not called\n");
+ pr_err("kprobe post_handler2 not called\n");
handler_errors++;
}
@@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value)
{
if (value != rand1) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in jprobe handler\n");
+ pr_err("incorrect value in jprobe handler\n");
}
jph_val = rand1;
@@ -186,16 +177,14 @@ static int test_jprobe(void)
ret = register_jprobe(&jp);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_jprobe returned %d\n", ret);
+ pr_err("register_jprobe returned %d\n", ret);
return ret;
}
ret = target(rand1);
unregister_jprobe(&jp);
if (jph_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "jprobe handler not called\n");
+ pr_err("jprobe handler not called\n");
handler_errors++;
}
@@ -217,24 +206,21 @@ static int test_jprobes(void)
jp.kp.flags = 0;
ret = register_jprobes(jps, 2);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_jprobes returned %d\n", ret);
+ pr_err("register_jprobes returned %d\n", ret);
return ret;
}
jph_val = 0;
ret = target(rand1);
if (jph_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "jprobe handler not called\n");
+ pr_err("jprobe handler not called\n");
handler_errors++;
}
jph_val = 0;
ret = target2(rand1);
if (jph_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "jprobe handler2 not called\n");
+ pr_err("jprobe handler2 not called\n");
handler_errors++;
}
unregister_jprobes(jps, 2);
@@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
if (ret != (rand1 / div_factor)) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in kretprobe handler\n");
+ pr_err("incorrect value in kretprobe handler\n");
}
if (krph_val == 0) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "call to kretprobe entry handler failed\n");
+ pr_err("call to kretprobe entry handler failed\n");
}
krph_val = rand1;
@@ -281,16 +265,14 @@ static int test_kretprobe(void)
ret = register_kretprobe(&rp);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_kretprobe returned %d\n", ret);
+ pr_err("register_kretprobe returned %d\n", ret);
return ret;
}
ret = target(rand1);
unregister_kretprobe(&rp);
if (krph_val != rand1) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kretprobe handler not called\n");
+ pr_err("kretprobe handler not called\n");
handler_errors++;
}
@@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
if (ret != (rand1 / div_factor) + 1) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in kretprobe handler2\n");
+ pr_err("incorrect value in kretprobe handler2\n");
}
if (krph_val == 0) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "call to kretprobe entry handler failed\n");
+ pr_err("call to kretprobe entry handler failed\n");
}
krph_val = rand1;
@@ -332,24 +312,21 @@ static int test_kretprobes(void)
rp.kp.flags = 0;
ret = register_kretprobes(rps, 2);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_kretprobe returned %d\n", ret);
+ pr_err("register_kretprobe returned %d\n", ret);
return ret;
}
krph_val = 0;
ret = target(rand1);
if (krph_val != rand1) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kretprobe handler not called\n");
+ pr_err("kretprobe handler not called\n");
handler_errors++;
}
krph_val = 0;
ret = target2(rand1);
if (krph_val != rand1) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kretprobe handler2 not called\n");
+ pr_err("kretprobe handler2 not called\n");
handler_errors++;
}
unregister_kretprobes(rps, 2);
@@ -368,7 +345,7 @@ int init_test_probes(void)
rand1 = prandom_u32();
} while (rand1 <= div_factor);
- printk(KERN_INFO "Kprobe smoke test started\n");
+ pr_info("started\n");
num_tests++;
ret = test_kprobe();
if (ret < 0)
@@ -402,13 +379,11 @@ int init_test_probes(void)
#endif /* CONFIG_KRETPROBES */
if (errors)
- printk(KERN_ERR "BUG: Kprobe smoke test: %d out of "
- "%d tests failed\n", errors, num_tests);
+ pr_err("BUG: %d out of %d tests failed\n", errors, num_tests);
else if (handler_errors)
- printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) "
- "running handlers\n", handler_errors);
+ pr_err("BUG: %d error(s) running handlers\n", handler_errors);
else
- printk(KERN_INFO "Kprobe smoke test passed successfully\n");
+ pr_info("passed successfully\n");
return 0;
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f36b028..fb4a9c2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -338,10 +338,11 @@ EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
static inline void update_vsyscall(struct timekeeper *tk)
{
- struct timespec xt;
+ struct timespec xt, wm;
xt = timespec64_to_timespec(tk_xtime(tk));
- update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult,
+ wm = timespec64_to_timespec(tk->wall_to_monotonic);
+ update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult,
tk->tkr.cycle_last);
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 925f629..afb04b9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1968,7 +1968,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
/**
* rb_update_event - update event type and data
- * @event: the even to update
+ * @event: the event to update
* @type: the type of event
* @length: the size of the event field in the ring buffer
*
@@ -3341,21 +3341,16 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
/* Iterator usage is expected to have record disabled */
- if (list_empty(&cpu_buffer->reader_page->list)) {
- iter->head_page = rb_set_head_page(cpu_buffer);
- if (unlikely(!iter->head_page))
- return;
- iter->head = iter->head_page->read;
- } else {
- iter->head_page = cpu_buffer->reader_page;
- iter->head = cpu_buffer->reader_page->read;
- }
+ iter->head_page = cpu_buffer->reader_page;
+ iter->head = cpu_buffer->reader_page->read;
+
+ iter->cache_reader_page = iter->head_page;
+ iter->cache_read = iter->head;
+
if (iter->head)
iter->read_stamp = cpu_buffer->read_stamp;
else
iter->read_stamp = iter->head_page->page->time_stamp;
- iter->cache_reader_page = cpu_buffer->reader_page;
- iter->cache_read = cpu_buffer->read;
}
/**
@@ -3748,12 +3743,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
return NULL;
/*
- * We repeat when a time extend is encountered.
- * Since the time extend is always attached to a data event,
- * we should never loop more than once.
- * (We never hit the following condition more than twice).
+ * We repeat when a time extend is encountered or we hit
+ * the end of the page. Since the time extend is always attached
+ * to a data event, we should never loop more than three times.
+ * Once for going to next page, once on time extend, and
+ * finally once to get the event.
+ * (We never hit the following condition more than thrice).
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3))
return NULL;
if (rb_per_cpu_empty(cpu_buffer))
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index fcc0256..aa312b0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -526,21 +526,21 @@ static void m_stop(struct seq_file *seq, void *v)
return;
}
-struct seq_operations proc_uid_seq_operations = {
+const struct seq_operations proc_uid_seq_operations = {
.start = uid_m_start,
.stop = m_stop,
.next = m_next,
.show = uid_m_show,
};
-struct seq_operations proc_gid_seq_operations = {
+const struct seq_operations proc_gid_seq_operations = {
.start = gid_m_start,
.stop = m_stop,
.next = m_next,
.show = gid_m_show,
};
-struct seq_operations proc_projid_seq_operations = {
+const struct seq_operations proc_projid_seq_operations = {
.start = projid_m_start,
.stop = m_stop,
.next = m_next,
diff --git a/kernel/utsname.c b/kernel/utsname.c
index fd39312..883aaaa 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task)
struct uts_namespace *ns = NULL;
struct nsproxy *nsproxy;
- rcu_read_lock();
- nsproxy = task_nsproxy(task);
+ task_lock(task);
+ nsproxy = task->nsproxy;
if (nsproxy) {
ns = nsproxy->uts_ns;
get_uts_ns(ns);
}
- rcu_read_unlock();
+ task_unlock(task);
return ns;
}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c3319bd..a8d6914 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -260,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event,
return;
if (hardlockup_panic)
- panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ panic("Watchdog detected hard LOCKUP on cpu %d",
+ this_cpu);
else
- WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
+ this_cpu);
__this_cpu_write(hard_watchdog_warn, true);
return;
@@ -345,7 +347,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
}
}
- printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
+ pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
print_modules();
@@ -366,6 +368,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
smp_mb__after_atomic();
}
+ add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
if (softlockup_panic)
panic("softlockup: hung tasks");
__this_cpu_write(soft_watchdog_warn, true);
@@ -484,7 +487,7 @@ static int watchdog_nmi_enable(unsigned int cpu)
if (PTR_ERR(event) == -EOPNOTSUPP)
pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
else if (PTR_ERR(event) == -ENOENT)
- pr_warning("disabled (cpu%i): hardware events not enabled\n",
+ pr_warn("disabled (cpu%i): hardware events not enabled\n",
cpu);
else
pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
OpenPOWER on IntegriCloud