summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c94
-rw-r--r--kernel/audit.h17
-rw-r--r--kernel/auditsc.c176
-rw-r--r--kernel/kexec.c23
-rw-r--r--kernel/module.c9
-rw-r--r--kernel/power/suspend.c43
-rw-r--r--kernel/ptrace.c1
-rw-r--r--kernel/sched/idle.c16
-rw-r--r--kernel/seccomp.c4
-rw-r--r--kernel/signal.c4
-rw-r--r--kernel/time/tick-common.c50
-rw-r--r--kernel/time/timekeeping.c48
-rw-r--r--kernel/time/timekeeping.h2
13 files changed, 249 insertions, 238 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 33738ef..e6c10d1a 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -76,10 +76,11 @@ int acct_parm[3] = {4, 2, 30};
/*
* External references and all of the globals.
*/
-static void do_acct_process(struct bsd_acct_struct *acct);
struct bsd_acct_struct {
struct fs_pin pin;
+ atomic_long_t count;
+ struct rcu_head rcu;
struct mutex lock;
int active;
unsigned long needcheck;
@@ -89,6 +90,8 @@ struct bsd_acct_struct {
struct completion done;
};
+static void do_acct_process(struct bsd_acct_struct *acct);
+
/*
* Check the amount of free space and suspend/resume accordingly.
*/
@@ -124,32 +127,56 @@ out:
return acct->active;
}
+static void acct_put(struct bsd_acct_struct *p)
+{
+ if (atomic_long_dec_and_test(&p->count))
+ kfree_rcu(p, rcu);
+}
+
+static inline struct bsd_acct_struct *to_acct(struct fs_pin *p)
+{
+ return p ? container_of(p, struct bsd_acct_struct, pin) : NULL;
+}
+
static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
{
struct bsd_acct_struct *res;
again:
smp_rmb();
rcu_read_lock();
- res = ACCESS_ONCE(ns->bacct);
+ res = to_acct(ACCESS_ONCE(ns->bacct));
if (!res) {
rcu_read_unlock();
return NULL;
}
- if (!atomic_long_inc_not_zero(&res->pin.count)) {
+ if (!atomic_long_inc_not_zero(&res->count)) {
rcu_read_unlock();
cpu_relax();
goto again;
}
rcu_read_unlock();
mutex_lock(&res->lock);
- if (!res->ns) {
+ if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
mutex_unlock(&res->lock);
- pin_put(&res->pin);
+ acct_put(res);
goto again;
}
return res;
}
+static void acct_pin_kill(struct fs_pin *pin)
+{
+ struct bsd_acct_struct *acct = to_acct(pin);
+ mutex_lock(&acct->lock);
+ do_acct_process(acct);
+ schedule_work(&acct->work);
+ wait_for_completion(&acct->done);
+ cmpxchg(&acct->ns->bacct, pin, NULL);
+ mutex_unlock(&acct->lock);
+ pin_remove(pin);
+ acct_put(acct);
+}
+
static void close_work(struct work_struct *work)
{
struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
@@ -160,44 +187,13 @@ static void close_work(struct work_struct *work)
complete(&acct->done);
}
-static void acct_kill(struct bsd_acct_struct *acct,
- struct bsd_acct_struct *new)
-{
- if (acct) {
- struct pid_namespace *ns = acct->ns;
- do_acct_process(acct);
- INIT_WORK(&acct->work, close_work);
- init_completion(&acct->done);
- schedule_work(&acct->work);
- wait_for_completion(&acct->done);
- pin_remove(&acct->pin);
- ns->bacct = new;
- acct->ns = NULL;
- atomic_long_dec(&acct->pin.count);
- mutex_unlock(&acct->lock);
- pin_put(&acct->pin);
- }
-}
-
-static void acct_pin_kill(struct fs_pin *pin)
-{
- struct bsd_acct_struct *acct;
- acct = container_of(pin, struct bsd_acct_struct, pin);
- mutex_lock(&acct->lock);
- if (!acct->ns) {
- mutex_unlock(&acct->lock);
- pin_put(pin);
- acct = NULL;
- }
- acct_kill(acct, NULL);
-}
-
static int acct_on(struct filename *pathname)
{
struct file *file;
struct vfsmount *mnt, *internal;
struct pid_namespace *ns = task_active_pid_ns(current);
- struct bsd_acct_struct *acct, *old;
+ struct bsd_acct_struct *acct;
+ struct fs_pin *old;
int err;
acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
@@ -238,21 +234,21 @@ static int acct_on(struct filename *pathname)
mnt = file->f_path.mnt;
file->f_path.mnt = internal;
- atomic_long_set(&acct->pin.count, 1);
- acct->pin.kill = acct_pin_kill;
+ atomic_long_set(&acct->count, 1);
+ init_fs_pin(&acct->pin, acct_pin_kill);
acct->file = file;
acct->needcheck = jiffies;
acct->ns = ns;
mutex_init(&acct->lock);
+ INIT_WORK(&acct->work, close_work);
+ init_completion(&acct->done);
mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
pin_insert(&acct->pin, mnt);
- old = acct_get(ns);
- if (old)
- acct_kill(old, acct);
- else
- ns->bacct = acct;
+ rcu_read_lock();
+ old = xchg(&ns->bacct, &acct->pin);
mutex_unlock(&acct->lock);
+ pin_kill(old);
mnt_drop_write(mnt);
mntput(mnt);
return 0;
@@ -288,7 +284,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
mutex_unlock(&acct_on_mutex);
putname(tmp);
} else {
- acct_kill(acct_get(task_active_pid_ns(current)), NULL);
+ rcu_read_lock();
+ pin_kill(task_active_pid_ns(current)->bacct);
}
return error;
@@ -296,7 +293,8 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
void acct_exit_ns(struct pid_namespace *ns)
{
- acct_kill(acct_get(ns), NULL);
+ rcu_read_lock();
+ pin_kill(ns->bacct);
}
/*
@@ -576,7 +574,7 @@ static void slow_acct_process(struct pid_namespace *ns)
if (acct) {
do_acct_process(acct);
mutex_unlock(&acct->lock);
- pin_put(&acct->pin);
+ acct_put(acct);
}
}
}
diff --git a/kernel/audit.h b/kernel/audit.h
index 3cdffad..1caa0d3 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -24,12 +24,6 @@
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
-/* 0 = no checking
- 1 = put_count checking
- 2 = verbose put_count checking
-*/
-#define AUDIT_DEBUG 0
-
/* AUDIT_NAMES is the number of slots we reserve in the audit_context
* for saving names from getname(). If we get more names we will allocate
* a name dynamically and also add those to the list anchored by names_list. */
@@ -74,9 +68,8 @@ struct audit_cap_data {
};
};
-/* When fs/namei.c:getname() is called, we store the pointer in name and
- * we don't let putname() free it (instead we free all of the saved
- * pointers at syscall exit time).
+/* When fs/namei.c:getname() is called, we store the pointer in name and bump
+ * the refcnt in the associated filename struct.
*
* Further, in fs/namei.c:path_lookup() we store the inode and device.
*/
@@ -86,7 +79,6 @@ struct audit_names {
struct filename *name;
int name_len; /* number of chars to log */
bool hidden; /* don't log this record */
- bool name_put; /* call __putname()? */
unsigned long ino;
dev_t dev;
@@ -208,11 +200,6 @@ struct audit_context {
};
int fds[2];
struct audit_proctitle proctitle;
-
-#if AUDIT_DEBUG
- int put_count;
- int ino_count;
-#endif
};
extern u32 audit_ever_enabled;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 072566d..dc4ae70 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -866,33 +866,10 @@ static inline void audit_free_names(struct audit_context *context)
{
struct audit_names *n, *next;
-#if AUDIT_DEBUG == 2
- if (context->put_count + context->ino_count != context->name_count) {
- int i = 0;
-
- pr_err("%s:%d(:%d): major=%d in_syscall=%d"
- " name_count=%d put_count=%d ino_count=%d"
- " [NOT freeing]\n", __FILE__, __LINE__,
- context->serial, context->major, context->in_syscall,
- context->name_count, context->put_count,
- context->ino_count);
- list_for_each_entry(n, &context->names_list, list) {
- pr_err("names[%d] = %p = %s\n", i++, n->name,
- n->name->name ?: "(null)");
- }
- dump_stack();
- return;
- }
-#endif
-#if AUDIT_DEBUG
- context->put_count = 0;
- context->ino_count = 0;
-#endif
-
list_for_each_entry_safe(n, next, &context->names_list, list) {
list_del(&n->list);
- if (n->name && n->name_put)
- final_putname(n->name);
+ if (n->name)
+ putname(n->name);
if (n->should_free)
kfree(n);
}
@@ -1711,9 +1688,6 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
list_add_tail(&aname->list, &context->names_list);
context->name_count++;
-#if AUDIT_DEBUG
- context->ino_count++;
-#endif
return aname;
}
@@ -1734,8 +1708,10 @@ __audit_reusename(const __user char *uptr)
list_for_each_entry(n, &context->names_list, list) {
if (!n->name)
continue;
- if (n->name->uptr == uptr)
+ if (n->name->uptr == uptr) {
+ n->name->refcnt++;
return n->name;
+ }
}
return NULL;
}
@@ -1752,19 +1728,8 @@ void __audit_getname(struct filename *name)
struct audit_context *context = current->audit_context;
struct audit_names *n;
- if (!context->in_syscall) {
-#if AUDIT_DEBUG == 2
- pr_err("%s:%d(:%d): ignoring getname(%p)\n",
- __FILE__, __LINE__, context->serial, name);
- dump_stack();
-#endif
+ if (!context->in_syscall)
return;
- }
-
-#if AUDIT_DEBUG
- /* The filename _must_ have a populated ->name */
- BUG_ON(!name->name);
-#endif
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
if (!n)
@@ -1772,56 +1737,13 @@ void __audit_getname(struct filename *name)
n->name = name;
n->name_len = AUDIT_NAME_FULL;
- n->name_put = true;
name->aname = n;
+ name->refcnt++;
if (!context->pwd.dentry)
get_fs_pwd(current->fs, &context->pwd);
}
-/* audit_putname - intercept a putname request
- * @name: name to intercept and delay for putname
- *
- * If we have stored the name from getname in the audit context,
- * then we delay the putname until syscall exit.
- * Called from include/linux/fs.h:putname().
- */
-void audit_putname(struct filename *name)
-{
- struct audit_context *context = current->audit_context;
-
- BUG_ON(!context);
- if (!name->aname || !context->in_syscall) {
-#if AUDIT_DEBUG == 2
- pr_err("%s:%d(:%d): final_putname(%p)\n",
- __FILE__, __LINE__, context->serial, name);
- if (context->name_count) {
- struct audit_names *n;
- int i = 0;
-
- list_for_each_entry(n, &context->names_list, list)
- pr_err("name[%d] = %p = %s\n", i++, n->name,
- n->name->name ?: "(null)");
- }
-#endif
- final_putname(name);
- }
-#if AUDIT_DEBUG
- else {
- ++context->put_count;
- if (context->put_count > context->name_count) {
- pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)"
- " name_count=%d put_count=%d\n",
- __FILE__, __LINE__,
- context->serial, context->major,
- context->in_syscall, name->name,
- context->name_count, context->put_count);
- dump_stack();
- }
- }
-#endif
-}
-
/**
* __audit_inode - store the inode and device from a lookup
* @name: name being audited
@@ -1842,10 +1764,6 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
if (!name)
goto out_alloc;
-#if AUDIT_DEBUG
- /* The struct filename _must_ have a populated ->name */
- BUG_ON(!name->name);
-#endif
/*
* If we have a pointer to an audit_names entry already, then we can
* just use it directly if the type is correct.
@@ -1863,7 +1781,17 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
}
list_for_each_entry_reverse(n, &context->names_list, list) {
- if (!n->name || strcmp(n->name->name, name->name))
+ if (n->ino) {
+ /* valid inode number, use that for the comparison */
+ if (n->ino != inode->i_ino ||
+ n->dev != inode->i_sb->s_dev)
+ continue;
+ } else if (n->name) {
+ /* inode number has not been set, check the name */
+ if (strcmp(n->name->name, name->name))
+ continue;
+ } else
+ /* no inode and no name (?!) ... this is odd ... */
continue;
/* match the correct record type */
@@ -1882,44 +1810,11 @@ out_alloc:
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
if (!n)
return;
- /* unfortunately, while we may have a path name to record with the
- * inode, we can't always rely on the string lasting until the end of
- * the syscall so we need to create our own copy, it may fail due to
- * memory allocation issues, but we do our best */
if (name) {
- /* we can't use getname_kernel() due to size limits */
- size_t len = strlen(name->name) + 1;
- struct filename *new = __getname();
-
- if (unlikely(!new))
- goto out;
-
- if (len <= (PATH_MAX - sizeof(*new))) {
- new->name = (char *)(new) + sizeof(*new);
- new->separate = false;
- } else if (len <= PATH_MAX) {
- /* this looks odd, but is due to final_putname() */
- struct filename *new2;
-
- new2 = kmalloc(sizeof(*new2), GFP_KERNEL);
- if (unlikely(!new2)) {
- __putname(new);
- goto out;
- }
- new2->name = (char *)new;
- new2->separate = true;
- new = new2;
- } else {
- /* we should never get here, but let's be safe */
- __putname(new);
- goto out;
- }
- strlcpy((char *)new->name, name->name, len);
- new->uptr = NULL;
- new->aname = n;
- n->name = new;
- n->name_put = true;
+ n->name = name;
+ name->refcnt++;
}
+
out:
if (parent) {
n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
@@ -1970,11 +1865,16 @@ void __audit_inode_child(const struct inode *parent,
/* look for a parent entry first */
list_for_each_entry(n, &context->names_list, list) {
- if (!n->name || n->type != AUDIT_TYPE_PARENT)
+ if (!n->name ||
+ (n->type != AUDIT_TYPE_PARENT &&
+ n->type != AUDIT_TYPE_UNKNOWN))
continue;
- if (n->ino == parent->i_ino &&
- !audit_compare_dname_path(dname, n->name->name, n->name_len)) {
+ if (n->ino == parent->i_ino && n->dev == parent->i_sb->s_dev &&
+ !audit_compare_dname_path(dname,
+ n->name->name, n->name_len)) {
+ if (n->type == AUDIT_TYPE_UNKNOWN)
+ n->type = AUDIT_TYPE_PARENT;
found_parent = n;
break;
}
@@ -1983,11 +1883,8 @@ void __audit_inode_child(const struct inode *parent,
/* is there a matching child entry? */
list_for_each_entry(n, &context->names_list, list) {
/* can only match entries that have a name */
- if (!n->name || n->type != type)
- continue;
-
- /* if we found a parent, make sure this one is a child of it */
- if (found_parent && (n->name != found_parent->name))
+ if (!n->name ||
+ (n->type != type && n->type != AUDIT_TYPE_UNKNOWN))
continue;
if (!strcmp(dname, n->name->name) ||
@@ -1995,6 +1892,8 @@ void __audit_inode_child(const struct inode *parent,
found_parent ?
found_parent->name_len :
AUDIT_NAME_FULL)) {
+ if (n->type == AUDIT_TYPE_UNKNOWN)
+ n->type = type;
found_child = n;
break;
}
@@ -2019,10 +1918,10 @@ void __audit_inode_child(const struct inode *parent,
if (found_parent) {
found_child->name = found_parent->name;
found_child->name_len = AUDIT_NAME_FULL;
- /* don't call __putname() */
- found_child->name_put = false;
+ found_child->name->refcnt++;
}
}
+
if (inode)
audit_copy_inode(found_child, dentry, inode);
else
@@ -2405,7 +2304,6 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
struct audit_aux_data_bprm_fcaps *ax;
struct audit_context *context = current->audit_context;
struct cpu_vfs_cap_data vcaps;
- struct dentry *dentry;
ax = kmalloc(sizeof(*ax), GFP_KERNEL);
if (!ax)
@@ -2415,9 +2313,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- dentry = dget(bprm->file->f_path.dentry);
- get_vfs_caps_from_disk(dentry, &vcaps);
- dput(dentry);
+ get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
ax->fcap.inheritable = vcaps.inheritable;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c852776..38c25b1 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -444,7 +444,7 @@ arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
}
/*
- * Free up memory used by kernel, initrd, and comand line. This is temporary
+ * Free up memory used by kernel, initrd, and command line. This is temporary
* memory allocation which is not needed any more after these buffers have
* been loaded into separate segments and have been copied elsewhere.
*/
@@ -856,8 +856,6 @@ static int kimage_set_destination(struct kimage *image,
destination &= PAGE_MASK;
result = kimage_add_entry(image, destination | IND_DESTINATION);
- if (result == 0)
- image->destination = destination;
return result;
}
@@ -869,8 +867,6 @@ static int kimage_add_page(struct kimage *image, unsigned long page)
page &= PAGE_MASK;
result = kimage_add_entry(image, page | IND_SOURCE);
- if (result == 0)
- image->destination += PAGE_SIZE;
return result;
}
@@ -1288,19 +1284,22 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
if (nr_segments > 0) {
unsigned long i;
- /* Loading another kernel to reboot into */
- if ((flags & KEXEC_ON_CRASH) == 0)
- result = kimage_alloc_init(&image, entry, nr_segments,
- segments, flags);
- /* Loading another kernel to switch to if this one crashes */
- else if (flags & KEXEC_ON_CRASH) {
- /* Free any current crash dump kernel before
+ if (flags & KEXEC_ON_CRASH) {
+ /*
+ * Loading another kernel to switch to if this one
+ * crashes. Free any current crash dump kernel before
* we corrupt it.
*/
+
kimage_free(xchg(&kexec_crash_image, NULL));
result = kimage_alloc_init(&image, entry, nr_segments,
segments, flags);
crash_map_reserved_pages();
+ } else {
+ /* Loading another kernel to reboot into. */
+
+ result = kimage_alloc_init(&image, entry, nr_segments,
+ segments, flags);
}
if (result)
goto out;
diff --git a/kernel/module.c b/kernel/module.c
index 8426ad4..b34813f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3025,8 +3025,13 @@ static void do_free_init(struct rcu_head *head)
kfree(m);
}
-/* This is where the real work happens */
-static int do_init_module(struct module *mod)
+/*
+ * This is where the real work happens.
+ *
+ * Keep it uninlined to provide a reliable breakpoint target, e.g. for the gdb
+ * helper command 'lx-symbols'.
+ */
+static noinline int do_init_module(struct module *mod)
{
int ret = 0;
struct mod_initfree *freeinit;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c347e3c..b7d6b3a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -37,7 +37,9 @@ const char *pm_states[PM_SUSPEND_MAX];
static const struct platform_suspend_ops *suspend_ops;
static const struct platform_freeze_ops *freeze_ops;
static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
-static bool suspend_freeze_wake;
+
+enum freeze_state __read_mostly suspend_freeze_state;
+static DEFINE_SPINLOCK(suspend_freeze_lock);
void freeze_set_ops(const struct platform_freeze_ops *ops)
{
@@ -48,22 +50,49 @@ void freeze_set_ops(const struct platform_freeze_ops *ops)
static void freeze_begin(void)
{
- suspend_freeze_wake = false;
+ suspend_freeze_state = FREEZE_STATE_NONE;
}
static void freeze_enter(void)
{
- cpuidle_use_deepest_state(true);
+ spin_lock_irq(&suspend_freeze_lock);
+ if (pm_wakeup_pending())
+ goto out;
+
+ suspend_freeze_state = FREEZE_STATE_ENTER;
+ spin_unlock_irq(&suspend_freeze_lock);
+
+ get_online_cpus();
cpuidle_resume();
- wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+
+ /* Push all the CPUs into the idle loop. */
+ wake_up_all_idle_cpus();
+ pr_debug("PM: suspend-to-idle\n");
+ /* Make the current CPU wait so it can enter the idle loop too. */
+ wait_event(suspend_freeze_wait_head,
+ suspend_freeze_state == FREEZE_STATE_WAKE);
+ pr_debug("PM: resume from suspend-to-idle\n");
+
cpuidle_pause();
- cpuidle_use_deepest_state(false);
+ put_online_cpus();
+
+ spin_lock_irq(&suspend_freeze_lock);
+
+ out:
+ suspend_freeze_state = FREEZE_STATE_NONE;
+ spin_unlock_irq(&suspend_freeze_lock);
}
void freeze_wake(void)
{
- suspend_freeze_wake = true;
- wake_up(&suspend_freeze_wait_head);
+ unsigned long flags;
+
+ spin_lock_irqsave(&suspend_freeze_lock, flags);
+ if (suspend_freeze_state > FREEZE_STATE_NONE) {
+ suspend_freeze_state = FREEZE_STATE_WAKE;
+ wake_up(&suspend_freeze_wait_head);
+ }
+ spin_unlock_irqrestore(&suspend_freeze_lock, flags);
}
EXPORT_SYMBOL_GPL(freeze_wake);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1eb9d90..227fec3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1077,7 +1077,6 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
}
#if defined CONFIG_COMPAT
-#include <linux/compat.h>
int compat_ptrace_request(struct task_struct *child, compat_long_t request,
compat_ulong_t addr, compat_ulong_t data)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index aaf1c1d..94b2d7b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -7,6 +7,7 @@
#include <linux/tick.h>
#include <linux/mm.h>
#include <linux/stackprotector.h>
+#include <linux/suspend.h>
#include <asm/tlb.h>
@@ -105,6 +106,21 @@ static void cpuidle_idle_call(void)
rcu_idle_enter();
/*
+ * Suspend-to-idle ("freeze") is a system state in which all user space
+ * has been frozen, all I/O devices have been suspended and the only
+ * activity happens here and in iterrupts (if any). In that case bypass
+ * the cpuidle governor and go stratight for the deepest idle state
+ * available. Possibly also suspend the local tick and the entire
+ * timekeeping to prevent timer interrupts from kicking us out of idle
+ * until a proper wakeup interrupt happens.
+ */
+ if (idle_should_freeze()) {
+ cpuidle_enter_freeze();
+ local_irq_enable();
+ goto exit_idle;
+ }
+
+ /*
* Ask the cpuidle framework to choose a convenient idle state.
* Fall back to the default arch idle method on errors.
*/
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 4ef9687..4f44028 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -629,7 +629,9 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
switch (action) {
case SECCOMP_RET_ERRNO:
- /* Set the low-order 16-bits as a errno. */
+ /* Set low-order bits as an errno, capped at MAX_ERRNO. */
+ if (data > MAX_ERRNO)
+ data = MAX_ERRNO;
syscall_set_return_value(current, task_pt_regs(current),
-data, 0);
goto skip;
diff --git a/kernel/signal.c b/kernel/signal.c
index 33a5275..a390499 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3550,7 +3550,7 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
SYSCALL_DEFINE0(pause)
{
while (!signal_pending(current)) {
- current->state = TASK_INTERRUPTIBLE;
+ __set_current_state(TASK_INTERRUPTIBLE);
schedule();
}
return -ERESTARTNOHAND;
@@ -3563,7 +3563,7 @@ int sigsuspend(sigset_t *set)
current->saved_sigmask = current->blocked;
set_current_blocked(set);
- current->state = TASK_INTERRUPTIBLE;
+ __set_current_state(TASK_INTERRUPTIBLE);
schedule();
set_restore_sigmask();
return -ERESTARTNOHAND;
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 7efeedf..f7c5155 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -394,6 +394,56 @@ void tick_resume(void)
}
}
+static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
+static unsigned int tick_freeze_depth;
+
+/**
+ * tick_freeze - Suspend the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the last online CPU executing the function and if so,
+ * suspend timekeeping. Otherwise suspend the local tick.
+ *
+ * Call with interrupts disabled. Must be balanced with %tick_unfreeze().
+ * Interrupts must not be enabled before the subsequent %tick_unfreeze().
+ */
+void tick_freeze(void)
+{
+ raw_spin_lock(&tick_freeze_lock);
+
+ tick_freeze_depth++;
+ if (tick_freeze_depth == num_online_cpus()) {
+ timekeeping_suspend();
+ } else {
+ tick_suspend();
+ tick_suspend_broadcast();
+ }
+
+ raw_spin_unlock(&tick_freeze_lock);
+}
+
+/**
+ * tick_unfreeze - Resume the local tick and (possibly) timekeeping.
+ *
+ * Check if this is the first CPU executing the function and if so, resume
+ * timekeeping. Otherwise resume the local tick.
+ *
+ * Call with interrupts disabled. Must be balanced with %tick_freeze().
+ * Interrupts must not be enabled after the preceding %tick_freeze().
+ */
+void tick_unfreeze(void)
+{
+ raw_spin_lock(&tick_freeze_lock);
+
+ if (tick_freeze_depth == num_online_cpus())
+ timekeeping_resume();
+ else
+ tick_resume();
+
+ tick_freeze_depth--;
+
+ raw_spin_unlock(&tick_freeze_lock);
+}
+
/**
* tick_init - initialize the tick control
*/
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b124af2..91db941 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -230,9 +230,7 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
- * @tk: The timekeeper from which we take the update
- * @tkf: The fast timekeeper to update
- * @tbase: The time base for the fast timekeeper (mono/raw)
+ * @tkr: Timekeeping readout base from which we take the update
*
* We want to use this from any context including NMI and tracing /
* instrumenting the timekeeping code itself.
@@ -244,11 +242,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
* smp_wmb(); <- Ensure that the last base[1] update is visible
* tkf->seq++;
* smp_wmb(); <- Ensure that the seqcount update is visible
- * update(tkf->base[0], tk);
+ * update(tkf->base[0], tkr);
* smp_wmb(); <- Ensure that the base[0] update is visible
* tkf->seq++;
* smp_wmb(); <- Ensure that the seqcount update is visible
- * update(tkf->base[1], tk);
+ * update(tkf->base[1], tkr);
*
* The reader side does:
*
@@ -269,7 +267,7 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
* slightly wrong timestamp (a few nanoseconds). See
* @ktime_get_mono_fast_ns.
*/
-static void update_fast_timekeeper(struct timekeeper *tk)
+static void update_fast_timekeeper(struct tk_read_base *tkr)
{
struct tk_read_base *base = tk_fast_mono.base;
@@ -277,7 +275,7 @@ static void update_fast_timekeeper(struct timekeeper *tk)
raw_write_seqcount_latch(&tk_fast_mono.seq);
/* Update base[0] */
- memcpy(base, &tk->tkr, sizeof(*base));
+ memcpy(base, tkr, sizeof(*base));
/* Force readers back to base[0] */
raw_write_seqcount_latch(&tk_fast_mono.seq);
@@ -334,6 +332,35 @@ u64 notrace ktime_get_mono_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
+/* Suspend-time cycles value for halted fast timekeeper. */
+static cycle_t cycles_at_suspend;
+
+static cycle_t dummy_clock_read(struct clocksource *cs)
+{
+ return cycles_at_suspend;
+}
+
+/**
+ * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
+ * @tk: Timekeeper to snapshot.
+ *
+ * It generally is unsafe to access the clocksource after timekeeping has been
+ * suspended, so take a snapshot of the readout base of @tk and use it as the
+ * fast timekeeper's readout base while suspended. It will return the same
+ * number of cycles every time until timekeeping is resumed at which time the
+ * proper readout base for the fast timekeeper will be restored automatically.
+ */
+static void halt_fast_timekeeper(struct timekeeper *tk)
+{
+ static struct tk_read_base tkr_dummy;
+ struct tk_read_base *tkr = &tk->tkr;
+
+ memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
+ cycles_at_suspend = tkr->read(tkr->clock);
+ tkr_dummy.read = dummy_clock_read;
+ update_fast_timekeeper(&tkr_dummy);
+}
+
#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
static inline void update_vsyscall(struct timekeeper *tk)
@@ -462,7 +489,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
memcpy(&shadow_timekeeper, &tk_core.timekeeper,
sizeof(tk_core.timekeeper));
- update_fast_timekeeper(tk);
+ update_fast_timekeeper(&tk->tkr);
}
/**
@@ -1170,7 +1197,7 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
* xtime/wall_to_monotonic/jiffies/etc are
* still managed by arch specific suspend/resume code.
*/
-static void timekeeping_resume(void)
+void timekeeping_resume(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct clocksource *clock = tk->tkr.clock;
@@ -1251,7 +1278,7 @@ static void timekeeping_resume(void)
hrtimers_resume();
}
-static int timekeeping_suspend(void)
+int timekeeping_suspend(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long flags;
@@ -1296,6 +1323,7 @@ static int timekeeping_suspend(void)
}
timekeeping_update(tk, TK_MIRROR);
+ halt_fast_timekeeper(tk);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index adc1fc9..1d91416 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -16,5 +16,7 @@ extern int timekeeping_inject_offset(struct timespec *ts);
extern s32 timekeeping_get_tai_offset(void);
extern void timekeeping_set_tai_offset(s32 tai_offset);
extern void timekeeping_clocktai(struct timespec *ts);
+extern int timekeeping_suspend(void);
+extern void timekeeping_resume(void);
#endif
OpenPOWER on IntegriCloud